//===- SLPVectorizer.cpp - A bottom up SLP Vectorizer ---------------------===//
//
//                     The LLVM Compiler Infrastructure
//
// This file is distributed under the University of Illinois Open Source
// License. See LICENSE.TXT for details.
//
//===----------------------------------------------------------------------===//
// This pass implements the Bottom Up SLP vectorizer. It detects consecutive
// stores that can be put together into vector-stores. Next, it attempts to
// construct a vectorizable tree using the use-def chains. If a profitable tree
// was found, the SLP vectorizer performs vectorization on the tree.
//
// The pass is inspired by the work described in the paper:
//  "Loop-Aware SLP in GCC" by Ira Rosen, Dorit Nuzman, Ayal Zaks.
//
//===----------------------------------------------------------------------===//
#include "llvm/Transforms/Vectorize.h"
#include "llvm/ADT/MapVector.h"
#include "llvm/ADT/PostOrderIterator.h"
#include "llvm/ADT/SetVector.h"
#include "llvm/ADT/Optional.h"
#include "llvm/ADT/Statistic.h"
#include "llvm/Analysis/AliasAnalysis.h"
#include "llvm/Analysis/AssumptionCache.h"
#include "llvm/Analysis/CodeMetrics.h"
#include "llvm/Analysis/LoopInfo.h"
#include "llvm/Analysis/ScalarEvolution.h"
#include "llvm/Analysis/ScalarEvolutionExpressions.h"
#include "llvm/Analysis/TargetTransformInfo.h"
#include "llvm/Analysis/ValueTracking.h"
#include "llvm/IR/DataLayout.h"
#include "llvm/IR/Dominators.h"
#include "llvm/IR/IRBuilder.h"
#include "llvm/IR/Instructions.h"
#include "llvm/IR/IntrinsicInst.h"
#include "llvm/IR/Module.h"
#include "llvm/IR/NoFolder.h"
#include "llvm/IR/Type.h"
#include "llvm/IR/Value.h"
#include "llvm/IR/Verifier.h"
#include "llvm/Pass.h"
#include "llvm/Support/CommandLine.h"
#include "llvm/Support/Debug.h"
#include "llvm/Support/raw_ostream.h"
#include "llvm/Transforms/Utils/VectorUtils.h"
#include <algorithm>
#include <map>
#include <memory>

using namespace llvm;

#define SV_NAME "slp-vectorizer"
#define DEBUG_TYPE "SLP"

STATISTIC(NumVectorInstructions, "Number of vector instructions generated");

static cl::opt<int>
    SLPCostThreshold("slp-threshold", cl::init(0), cl::Hidden,
                     cl::desc("Only vectorize if you gain more than this "
                              "number "));

static cl::opt<bool>
ShouldVectorizeHor("slp-vectorize-hor", cl::init(false), cl::Hidden,
                   cl::desc("Attempt to vectorize horizontal reductions"));

static cl::opt<bool> ShouldStartVectorizeHorAtStore(
    "slp-vectorize-hor-store", cl::init(false), cl::Hidden,
    cl::desc(
        "Attempt to vectorize horizontal reductions feeding into a store"));

namespace {

static const unsigned MinVecRegSize = 128;

static const unsigned RecursionMaxDepth = 12;

// Limit the number of alias checks. The limit is chosen so that
// it has no negative effect on the llvm benchmarks.
static const unsigned AliasedCheckLimit = 10;

// Another limit for the alias checks: The maximum distance between load/store
// instructions where alias checks are done.
// This limit is useful for very large basic blocks.
static const unsigned MaxMemDepDistance = 160;
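
// Illustrative example (not taken from a specific test): given four
// consecutive scalar stores such as
//   store float %a0, float* %p0
//   store float %a1, float* %p1      ; %p1 == %p0 + 1
//   store float %a2, float* %p2      ; %p2 == %p0 + 2
//   store float %a3, float* %p3      ; %p3 == %p0 + 3
// the pass builds a tree over the stored values and, when the cost model
// agrees, emits a single
//   store <4 x float> %vec, <4 x float>* %vp
// in their place.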

/// \returns the parent basic block if all of the instructions in \p VL
/// are in the same block or null otherwise.
static BasicBlock *getSameBlock(ArrayRef<Value *> VL) {
  Instruction *I0 = dyn_cast<Instruction>(VL[0]);
  if (!I0)
    return nullptr;
  BasicBlock *BB = I0->getParent();
  for (int i = 1, e = VL.size(); i < e; i++) {
    Instruction *I = dyn_cast<Instruction>(VL[i]);
    if (!I)
      return nullptr;

    if (BB != I->getParent())
      return nullptr;
  }
  return BB;
}

/// \returns True if all of the values in \p VL are constants.
static bool allConstant(ArrayRef<Value *> VL) {
  for (unsigned i = 0, e = VL.size(); i < e; ++i)
    if (!isa<Constant>(VL[i]))
      return false;
  return true;
}

/// \returns True if all of the values in \p VL are identical.
static bool isSplat(ArrayRef<Value *> VL) {
  for (unsigned i = 1, e = VL.size(); i < e; ++i)
    if (VL[i] != VL[0])
      return false;
  return true;
}

/// \returns Opcode that can be clubbed with \p Op to create an alternate
/// sequence which can later be merged as a ShuffleVector instruction.
static unsigned getAltOpcode(unsigned Op) {
  switch (Op) {
  case Instruction::FAdd:
    return Instruction::FSub;
  case Instruction::FSub:
    return Instruction::FAdd;
  case Instruction::Add:
    return Instruction::Sub;
  case Instruction::Sub:
    return Instruction::Add;
  default:
    return 0;
  }
}

/// \returns bool representing if Opcode \p Op can be part
/// of an alternate sequence which can later be merged as
/// a ShuffleVector instruction.
static bool canCombineAsAltInst(unsigned Op) {
  if (Op == Instruction::FAdd || Op == Instruction::FSub ||
      Op == Instruction::Sub || Op == Instruction::Add)
    return true;
  return false;
}

/// \returns ShuffleVector instruction if instructions in \p VL have an
/// alternating fadd,fsub / fsub,fadd / add,sub / sub,add sequence
/// (i.e. e.g. opcodes of fadd,fsub,fadd,fsub...).
static unsigned isAltInst(ArrayRef<Value *> VL) {
  Instruction *I0 = dyn_cast<Instruction>(VL[0]);
  unsigned Opcode = I0->getOpcode();
  unsigned AltOpcode = getAltOpcode(Opcode);
  for (int i = 1, e = VL.size(); i < e; i++) {
    Instruction *I = dyn_cast<Instruction>(VL[i]);
    if (!I || I->getOpcode() != ((i & 1) ? AltOpcode : Opcode))
      return 0;
  }
  return Instruction::ShuffleVector;
}
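
// For example, the bundle {fadd, fsub, fadd, fsub} has no single common
// opcode, but it matches the alternating pattern checked above, so
// getSameOpcode() below reports it as Instruction::ShuffleVector.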

/// \returns The opcode if all of the Instructions in \p VL have the same
/// opcode, or zero.
static unsigned getSameOpcode(ArrayRef<Value *> VL) {
  Instruction *I0 = dyn_cast<Instruction>(VL[0]);
  if (!I0)
    return 0;
  unsigned Opcode = I0->getOpcode();
  for (int i = 1, e = VL.size(); i < e; i++) {
    Instruction *I = dyn_cast<Instruction>(VL[i]);
    if (!I || Opcode != I->getOpcode()) {
      if (canCombineAsAltInst(Opcode) && i == 1)
        return isAltInst(VL);
      return 0;
    }
  }
  return Opcode;
}

/// Get the intersection (logical and) of all of the potential IR flags
/// of each scalar operation (VL) that will be converted into a vector (I).
/// Flag set: NSW, NUW, exact, and all of fast-math.
static void propagateIRFlags(Value *I, ArrayRef<Value *> VL) {
  if (auto *VecOp = dyn_cast<BinaryOperator>(I)) {
    if (auto *Intersection = dyn_cast<BinaryOperator>(VL[0])) {
      // Intersection is initialized to the 0th scalar,
      // so start counting from index '1'.
      for (int i = 1, e = VL.size(); i < e; ++i) {
        if (auto *Scalar = dyn_cast<BinaryOperator>(VL[i]))
          Intersection->andIRFlags(Scalar);
      }
      VecOp->copyIRFlags(Intersection);
    }
  }
}

/// \returns \p I after propagating metadata from \p VL.
static Instruction *propagateMetadata(Instruction *I, ArrayRef<Value *> VL) {
  Instruction *I0 = cast<Instruction>(VL[0]);
  SmallVector<std::pair<unsigned, MDNode *>, 4> Metadata;
  I0->getAllMetadataOtherThanDebugLoc(Metadata);

  for (unsigned i = 0, n = Metadata.size(); i != n; ++i) {
    unsigned Kind = Metadata[i].first;
    MDNode *MD = Metadata[i].second;

    for (int i = 1, e = VL.size(); MD && i != e; i++) {
      Instruction *I = cast<Instruction>(VL[i]);
      MDNode *IMD = I->getMetadata(Kind);

      switch (Kind) {
      default:
        MD = nullptr; // Remove unknown metadata
        break;
      case LLVMContext::MD_tbaa:
        MD = MDNode::getMostGenericTBAA(MD, IMD);
        break;
      case LLVMContext::MD_alias_scope:
        MD = MDNode::getMostGenericAliasScope(MD, IMD);
        break;
      case LLVMContext::MD_noalias:
        MD = MDNode::intersect(MD, IMD);
        break;
      case LLVMContext::MD_fpmath:
        MD = MDNode::getMostGenericFPMath(MD, IMD);
        break;
      }
    }
    I->setMetadata(Kind, MD);
  }
  return I;
}

/// \returns The type that all of the values in \p VL have or null if there
/// are different types.
static Type* getSameType(ArrayRef<Value *> VL) {
  Type *Ty = VL[0]->getType();
  for (int i = 1, e = VL.size(); i < e; i++)
    if (VL[i]->getType() != Ty)
      return nullptr;

  return Ty;
}

/// \returns True if the ExtractElement instructions in VL can be vectorized
/// to use the original vector.
static bool CanReuseExtract(ArrayRef<Value *> VL) {
  assert(Instruction::ExtractElement == getSameOpcode(VL) && "Invalid opcode");
  // Check if all of the extracts come from the same vector and from the
  // correct offset.
  Value *VL0 = VL[0];
  ExtractElementInst *E0 = cast<ExtractElementInst>(VL0);
  Value *Vec = E0->getOperand(0);

  // We have to extract from the same vector type.
  unsigned NElts = Vec->getType()->getVectorNumElements();

  if (NElts != VL.size())
    return false;

  // Check that all of the indices extract from the correct offset.
  ConstantInt *CI = dyn_cast<ConstantInt>(E0->getOperand(1));
  if (!CI || CI->getZExtValue())
    return false;

  for (unsigned i = 1, e = VL.size(); i < e; ++i) {
    ExtractElementInst *E = cast<ExtractElementInst>(VL[i]);
    ConstantInt *CI = dyn_cast<ConstantInt>(E->getOperand(1));

    if (!CI || CI->getZExtValue() != i || E->getOperand(0) != Vec)
      return false;
  }

  return true;
}

/// \returns True if an in-tree use also needs an extract. This refers to a
/// possible scalar operand in a vectorized instruction.
static bool InTreeUserNeedToExtract(Value *Scalar, Instruction *UserInst,
                                    TargetLibraryInfo *TLI) {

  unsigned Opcode = UserInst->getOpcode();
  switch (Opcode) {
  case Instruction::Load: {
    LoadInst *LI = cast<LoadInst>(UserInst);
    return (LI->getPointerOperand() == Scalar);
  }
  case Instruction::Store: {
    StoreInst *SI = cast<StoreInst>(UserInst);
    return (SI->getPointerOperand() == Scalar);
  }
  case Instruction::Call: {
    CallInst *CI = cast<CallInst>(UserInst);
    Intrinsic::ID ID = getIntrinsicIDForCall(CI, TLI);
    if (hasVectorInstrinsicScalarOpd(ID, 1)) {
      return (CI->getArgOperand(1) == Scalar);
    }
  }
  default:
    return false;
  }
}

/// \returns the AA location that is being accessed by the instruction.
static AliasAnalysis::Location getLocation(Instruction *I, AliasAnalysis *AA) {
  if (StoreInst *SI = dyn_cast<StoreInst>(I))
    return AA->getLocation(SI);
  if (LoadInst *LI = dyn_cast<LoadInst>(I))
    return AA->getLocation(LI);
  return AliasAnalysis::Location();
}

/// \returns True if the instruction is not a volatile or atomic load/store.
static bool isSimple(Instruction *I) {
  if (LoadInst *LI = dyn_cast<LoadInst>(I))
    return LI->isSimple();
  if (StoreInst *SI = dyn_cast<StoreInst>(I))
    return SI->isSimple();
  if (MemIntrinsic *MI = dyn_cast<MemIntrinsic>(I))
    return !MI->isVolatile();
  return true;
}

/// Bottom Up SLP Vectorizer.
class BoUpSLP {
public:
  typedef SmallVector<Value *, 8> ValueList;
  typedef SmallVector<Instruction *, 16> InstrList;
  typedef SmallPtrSet<Value *, 16> ValueSet;
  typedef SmallVector<StoreInst *, 8> StoreList;

  BoUpSLP(Function *Func, ScalarEvolution *Se, const DataLayout *Dl,
          TargetTransformInfo *Tti, TargetLibraryInfo *TLi, AliasAnalysis *Aa,
          LoopInfo *Li, DominatorTree *Dt, AssumptionCache *AC)
      : NumLoadsWantToKeepOrder(0), NumLoadsWantToChangeOrder(0), F(Func),
        SE(Se), DL(Dl), TTI(Tti), TLI(TLi), AA(Aa), LI(Li), DT(Dt),
        Builder(Se->getContext()) {
    CodeMetrics::collectEphemeralValues(F, AC, EphValues);
  }

  /// \brief Vectorize the tree that starts with the elements in \p VL.
  /// Returns the vectorized root.
  Value *vectorizeTree();

  /// \returns the cost incurred by unwanted spills and fills, caused by
  /// holding live values over call sites.
  int getSpillCost();

  /// \returns the vectorization cost of the subtree that starts at \p VL.
  /// A negative number means that this is profitable.
  int getTreeCost();

  /// Construct a vectorizable tree that starts at \p Roots, ignoring users for
  /// the purpose of scheduling and extraction in the \p UserIgnoreLst.
  void buildTree(ArrayRef<Value *> Roots,
                 ArrayRef<Value *> UserIgnoreLst = None);

  /// Clear the internal data structures that are created by 'buildTree'.
  void deleteTree() {
    VectorizableTree.clear();
    ScalarToTreeEntry.clear();
    MustGather.clear();
    ExternalUses.clear();
    NumLoadsWantToKeepOrder = 0;
    NumLoadsWantToChangeOrder = 0;
    for (auto &Iter : BlocksSchedules) {
      BlockScheduling *BS = Iter.second.get();
      BS->clear();
    }
  }

  /// \returns true if the memory operations A and B are consecutive.
  bool isConsecutiveAccess(Value *A, Value *B);

  /// \brief Perform LICM and CSE on the newly generated gather sequences.
  void optimizeGatherSequence();

  /// \returns true if it is beneficial to reverse the vector order.
  bool shouldReorder() const {
    return NumLoadsWantToChangeOrder > NumLoadsWantToKeepOrder;
  }

private:
  struct TreeEntry;

  /// \returns the cost of the vectorizable entry.
  int getEntryCost(TreeEntry *E);

  /// This is the recursive part of buildTree.
  void buildTree_rec(ArrayRef<Value *> Roots, unsigned Depth);

  /// Vectorize a single entry in the tree.
  Value *vectorizeTree(TreeEntry *E);

  /// Vectorize a single entry in the tree, starting in \p VL.
  Value *vectorizeTree(ArrayRef<Value *> VL);

  /// \returns the pointer to the vectorized value if \p VL is already
  /// vectorized, or NULL. This can happen in cycles.
  Value *alreadyVectorized(ArrayRef<Value *> VL) const;

  /// \brief Take the pointer operand from the Load/Store instruction.
  /// \returns NULL if this is not a valid Load/Store instruction.
  static Value *getPointerOperand(Value *I);

  /// \brief Take the address space operand from the Load/Store instruction.
  /// \returns -1 if this is not a valid Load/Store instruction.
  static unsigned getAddressSpaceOperand(Value *I);

  /// \returns the scalarization cost for this type. Scalarization in this
  /// context means the creation of vectors from a group of scalars.
  int getGatherCost(Type *Ty);

  /// \returns the scalarization cost for this list of values. Assuming that
  /// this subtree gets vectorized, we may need to extract the values from the
  /// roots. This method calculates the cost of extracting the values.
  int getGatherCost(ArrayRef<Value *> VL);

  /// \brief Set the Builder insert point to one after the last instruction in
  /// the bundle.
  void setInsertPointAfterBundle(ArrayRef<Value *> VL);

  /// \returns a vector from a collection of scalars in \p VL.
  Value *Gather(ArrayRef<Value *> VL, VectorType *Ty);

  /// \returns whether the VectorizableTree is fully vectorizable and will
  /// be beneficial even if the tree height is tiny.
  bool isFullyVectorizableTinyTree();

  /// \brief Reorder commutative operands in an alt shuffle if they result in
  /// vectorized code.
  void reorderAltShuffleOperands(ArrayRef<Value *> VL,
                                 SmallVectorImpl<Value *> &Left,
                                 SmallVectorImpl<Value *> &Right);
  /// \brief Reorder commutative operands to get a better probability of
  /// generating vectorized code.
  void reorderInputsAccordingToOpcode(ArrayRef<Value *> VL,
                                      SmallVectorImpl<Value *> &Left,
                                      SmallVectorImpl<Value *> &Right);
  struct TreeEntry {
    TreeEntry() : Scalars(), VectorizedValue(nullptr), NeedToGather(0) {}

    /// \returns true if the scalars in VL are equal to this entry.
    bool isSame(ArrayRef<Value *> VL) const {
      assert(VL.size() == Scalars.size() && "Invalid size");
      return std::equal(VL.begin(), VL.end(), Scalars.begin());
    }

    /// A vector of scalars.
    ValueList Scalars;

    /// The Scalars are vectorized into this value. It is initialized to Null.
    Value *VectorizedValue;

    /// Do we need to gather this sequence ?
    bool NeedToGather;
  };

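  // Illustrative example: vectorizing the bundle {a[0]+b[0], a[1]+b[1]}
  // produces one TreeEntry for the two additions and, recursively, entries
  // for their operand bundles ({a[0], a[1]} and {b[0], b[1]}). Entries that
  // cannot be vectorized are marked NeedToGather and are later materialized
  // with insertelement sequences (see Gather() and getGatherCost()).
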
  /// Create a new VectorizableTree entry.
  TreeEntry *newTreeEntry(ArrayRef<Value *> VL, bool Vectorized) {
    VectorizableTree.push_back(TreeEntry());
    int idx = VectorizableTree.size() - 1;
    TreeEntry *Last = &VectorizableTree[idx];
    Last->Scalars.insert(Last->Scalars.begin(), VL.begin(), VL.end());
    Last->NeedToGather = !Vectorized;
    if (Vectorized) {
      for (int i = 0, e = VL.size(); i != e; ++i) {
        assert(!ScalarToTreeEntry.count(VL[i]) && "Scalar already in tree!");
        ScalarToTreeEntry[VL[i]] = idx;
      }
    } else {
      MustGather.insert(VL.begin(), VL.end());
    }
    return Last;
  }

  /// -- Vectorization State --
  /// Holds all of the tree entries.
  std::vector<TreeEntry> VectorizableTree;

  /// Maps a specific scalar to its tree entry.
  SmallDenseMap<Value*, int> ScalarToTreeEntry;

  /// A list of scalars that we found that we need to keep as scalars.
  ValueSet MustGather;

  /// This POD struct describes one external user in the vectorized tree.
  struct ExternalUser {
    ExternalUser(Value *S, llvm::User *U, int L)
        : Scalar(S), User(U), Lane(L) {}
    // Which scalar in our function.
    Value *Scalar;
    // Which user that uses the scalar.
    llvm::User *User;
    // Which lane does the scalar belong to.
    int Lane;
  };
  typedef SmallVector<ExternalUser, 16> UserList;

  /// Checks if two instructions may access the same memory.
  ///
  /// \p Loc1 is the location of \p Inst1. It is passed explicitly because it
  /// is invariant in the calling loop.
  bool isAliased(const AliasAnalysis::Location &Loc1, Instruction *Inst1,
                 Instruction *Inst2) {

    // First check if the result is already in the cache.
    AliasCacheKey key = std::make_pair(Inst1, Inst2);
    Optional<bool> &result = AliasCache[key];
    if (result.hasValue()) {
      return result.getValue();
    }
    AliasAnalysis::Location Loc2 = getLocation(Inst2, AA);
    bool aliased = true;
    if (Loc1.Ptr && Loc2.Ptr && isSimple(Inst1) && isSimple(Inst2)) {
      // Do the alias check.
      aliased = AA->alias(Loc1, Loc2);
    }
    // Store the result in the cache.
    result = aliased;
    return aliased;
  }

  typedef std::pair<Instruction *, Instruction *> AliasCacheKey;

  /// Cache for alias results.
  /// TODO: consider moving this to the AliasAnalysis itself.
  DenseMap<AliasCacheKey, Optional<bool>> AliasCache;

  /// Removes an instruction from its block and eventually deletes it.
  /// It's like Instruction::eraseFromParent() except that the actual deletion
  /// is delayed until BoUpSLP is destructed.
  /// This is required to ensure that there are no incorrect collisions in the
  /// AliasCache, which can happen if a new instruction is allocated at the
  /// same address as a previously deleted instruction.
  void eraseInstruction(Instruction *I) {
    I->removeFromParent();
    I->dropAllReferences();
    DeletedInstructions.push_back(std::unique_ptr<Instruction>(I));
  }

  /// Temporary store for deleted instructions. Instructions will be deleted
  /// eventually when the BoUpSLP is destructed.
  SmallVector<std::unique_ptr<Instruction>, 8> DeletedInstructions;

  /// A list of values that need to be extracted out of the tree.
  /// This list holds pairs of (Internal Scalar : External User).
  UserList ExternalUses;

  /// Values used only by @llvm.assume calls.
  SmallPtrSet<const Value *, 32> EphValues;

  /// Holds all of the instructions that we gathered.
  SetVector<Instruction *> GatherSeq;
  /// A list of blocks that we are going to CSE.
  SetVector<BasicBlock *> CSEBlocks;

  /// Contains all scheduling relevant data for an instruction.
  /// A ScheduleData either represents a single instruction or a member of an
  /// instruction bundle (= a group of instructions which is combined into a
  /// vector instruction).
  struct ScheduleData {

    // The initial value for the dependency counters. It means that the
    // dependencies are not calculated yet.
    enum { InvalidDeps = -1 };

    ScheduleData()
        : Inst(nullptr), FirstInBundle(nullptr), NextInBundle(nullptr),
          NextLoadStore(nullptr), SchedulingRegionID(0), SchedulingPriority(0),
          Dependencies(InvalidDeps), UnscheduledDeps(InvalidDeps),
          UnscheduledDepsInBundle(InvalidDeps), IsScheduled(false) {}

    void init(int BlockSchedulingRegionID) {
      FirstInBundle = this;
      NextInBundle = nullptr;
      NextLoadStore = nullptr;
      IsScheduled = false;
      SchedulingRegionID = BlockSchedulingRegionID;
      UnscheduledDepsInBundle = UnscheduledDeps;
      clearDependencies();
    }

    /// Returns true if the dependency information has been calculated.
    bool hasValidDependencies() const { return Dependencies != InvalidDeps; }

    /// Returns true for single instructions and for bundle representatives
    /// (= the head of a bundle).
    bool isSchedulingEntity() const { return FirstInBundle == this; }

    /// Returns true if it represents an instruction bundle and not only a
    /// single instruction.
    bool isPartOfBundle() const {
      return NextInBundle != nullptr || FirstInBundle != this;
    }

    /// Returns true if it is ready for scheduling, i.e. it has no more
    /// unscheduled depending instructions/bundles.
    bool isReady() const {
      assert(isSchedulingEntity() &&
             "can't consider non-scheduling entity for ready list");
      return UnscheduledDepsInBundle == 0 && !IsScheduled;
    }

    /// Modifies the number of unscheduled dependencies, also updating it for
    /// the whole bundle.
    int incrementUnscheduledDeps(int Incr) {
      UnscheduledDeps += Incr;
      return FirstInBundle->UnscheduledDepsInBundle += Incr;
    }

    /// Sets the number of unscheduled dependencies to the number of
    /// dependencies.
    void resetUnscheduledDeps() {
      incrementUnscheduledDeps(Dependencies - UnscheduledDeps);
    }

    /// Clears all dependency information.
    void clearDependencies() {
      Dependencies = InvalidDeps;
      resetUnscheduledDeps();
      MemoryDependencies.clear();
    }

    void dump(raw_ostream &os) const {
      if (!isSchedulingEntity()) {
        os << "/ " << *Inst;
      } else if (NextInBundle) {
        os << '[' << *Inst;
        ScheduleData *SD = NextInBundle;
        while (SD) {
          os << ';' << *SD->Inst;
          SD = SD->NextInBundle;
        }
        os << ']';
      } else {
        os << *Inst;
      }
    }

    Instruction *Inst;

    /// Points to the head in an instruction bundle (and always to this for
    /// single instructions).
    ScheduleData *FirstInBundle;

    /// Singly linked list of all instructions in a bundle. Null if it is a
    /// single instruction.
    ScheduleData *NextInBundle;

    /// Singly linked list of all memory instructions (e.g. load, store, call)
    /// in the block - until the end of the scheduling region.
    ScheduleData *NextLoadStore;
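
    // For illustration: if the bundle {I0, I1, I2} is scheduled as one unit,
    // the ScheduleData of I0, I1 and I2 all have FirstInBundle pointing at
    // I0's ScheduleData, and NextInBundle forms the chain
    // I0 -> I1 -> I2 -> null. A standalone instruction is its own
    // FirstInBundle and has a null NextInBundle.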

    /// The dependent memory instructions.
    /// This list is derived on demand in calculateDependencies().
    SmallVector<ScheduleData *, 4> MemoryDependencies;

    /// This ScheduleData is in the current scheduling region if this matches
    /// the current SchedulingRegionID of BlockScheduling.
    int SchedulingRegionID;

    /// Used for getting a "good" final ordering of instructions.
    int SchedulingPriority;

    /// The number of dependencies. Consists of the number of users of the
    /// instruction plus the number of dependent memory instructions (if any).
    /// This value is calculated on demand.
    /// If InvalidDeps, the number of dependencies is not calculated yet.
    int Dependencies;

    /// The number of dependencies minus the number of dependencies of scheduled
    /// instructions. As soon as this is zero, the instruction/bundle gets ready
    /// for scheduling.
    /// Note that this is negative as long as Dependencies is not calculated.
    int UnscheduledDeps;

    /// The sum of UnscheduledDeps in a bundle. Equal to UnscheduledDeps for
    /// single instructions.
    int UnscheduledDepsInBundle;

    /// True if this instruction is scheduled (or considered as scheduled in the
    /// dry-run).
    bool IsScheduled;
  };

#ifndef NDEBUG
  friend raw_ostream &operator<<(raw_ostream &os,
                                 const BoUpSLP::ScheduleData &SD);
#endif

  /// Contains all scheduling data for a basic block.
  ///
  struct BlockScheduling {

    BlockScheduling(BasicBlock *BB)
        : BB(BB), ChunkSize(BB->size()), ChunkPos(ChunkSize),
          ScheduleStart(nullptr), ScheduleEnd(nullptr),
          FirstLoadStoreInRegion(nullptr), LastLoadStoreInRegion(nullptr),
          // Make sure that the initial SchedulingRegionID is greater than the
          // initial SchedulingRegionID in ScheduleData (which is 0).
          SchedulingRegionID(1) {}

    void clear() {
      ReadyInsts.clear();
      ScheduleStart = nullptr;
      ScheduleEnd = nullptr;
      FirstLoadStoreInRegion = nullptr;
      LastLoadStoreInRegion = nullptr;

      // Make a new scheduling region, i.e. all existing ScheduleData is not
      // in the new region yet.
      ++SchedulingRegionID;
    }

    ScheduleData *getScheduleData(Value *V) {
      ScheduleData *SD = ScheduleDataMap[V];
      if (SD && SD->SchedulingRegionID == SchedulingRegionID)
        return SD;
      return nullptr;
    }

    bool isInSchedulingRegion(ScheduleData *SD) {
      return SD->SchedulingRegionID == SchedulingRegionID;
    }

    /// Marks an instruction as scheduled and puts all dependent ready
    /// instructions into the ready-list.
    template <typename ReadyListType>
    void schedule(ScheduleData *SD, ReadyListType &ReadyList) {
      SD->IsScheduled = true;
      DEBUG(dbgs() << "SLP: schedule " << *SD << "\n");

      ScheduleData *BundleMember = SD;
      while (BundleMember) {
        // Handle the def-use chain dependencies.
        for (Use &U : BundleMember->Inst->operands()) {
          ScheduleData *OpDef = getScheduleData(U.get());
          if (OpDef && OpDef->hasValidDependencies() &&
              OpDef->incrementUnscheduledDeps(-1) == 0) {
            // There are no more unscheduled dependencies after decrementing,
            // so we can put the dependent instruction into the ready list.
            ScheduleData *DepBundle = OpDef->FirstInBundle;
            assert(!DepBundle->IsScheduled &&
                   "already scheduled bundle gets ready");
            ReadyList.insert(DepBundle);
            DEBUG(dbgs() << "SLP: gets ready (def): " << *DepBundle << "\n");
          }
        }
        // Handle the memory dependencies.
        for (ScheduleData *MemoryDepSD : BundleMember->MemoryDependencies) {
          if (MemoryDepSD->incrementUnscheduledDeps(-1) == 0) {
            // There are no more unscheduled dependencies after decrementing,
            // so we can put the dependent instruction into the ready list.
            ScheduleData *DepBundle = MemoryDepSD->FirstInBundle;
            assert(!DepBundle->IsScheduled &&
                   "already scheduled bundle gets ready");
            ReadyList.insert(DepBundle);
            DEBUG(dbgs() << "SLP: gets ready (mem): " << *DepBundle << "\n");
          }
        }
        BundleMember = BundleMember->NextInBundle;
      }
    }

    /// Put all instructions into the ReadyList which are ready for scheduling.
    template <typename ReadyListType>
    void initialFillReadyList(ReadyListType &ReadyList) {
      for (auto *I = ScheduleStart; I != ScheduleEnd; I = I->getNextNode()) {
        ScheduleData *SD = getScheduleData(I);
        if (SD->isSchedulingEntity() && SD->isReady()) {
          ReadyList.insert(SD);
          DEBUG(dbgs() << "SLP: initially in ready list: " << *I << "\n");
        }
      }
    }

    /// Checks if a bundle of instructions can be scheduled, i.e. has no
    /// cyclic dependencies. This is only a dry-run, no instructions are
    /// actually moved at this stage.
    bool tryScheduleBundle(ArrayRef<Value *> VL, BoUpSLP *SLP);

    /// Un-bundles a group of instructions.
    void cancelScheduling(ArrayRef<Value *> VL);

    /// Extends the scheduling region so that V is inside the region.
    void extendSchedulingRegion(Value *V);

    /// Initialize the ScheduleData structures for new instructions in the
    /// scheduling region.
    void initScheduleData(Instruction *FromI, Instruction *ToI,
                          ScheduleData *PrevLoadStore,
                          ScheduleData *NextLoadStore);

    /// Updates the dependency information of a bundle and of all instructions/
    /// bundles which depend on the original bundle.
    void calculateDependencies(ScheduleData *SD, bool InsertInReadyList,
                               BoUpSLP *SLP);

    /// Sets all instructions in the scheduling region to un-scheduled.
    void resetSchedule();

    BasicBlock *BB;

    /// Simple memory allocation for ScheduleData.
    std::vector<std::unique_ptr<ScheduleData[]>> ScheduleDataChunks;

    /// The size of a ScheduleData array in ScheduleDataChunks.
    int ChunkSize;

    /// The allocator position in the current chunk, which is the last entry
    /// of ScheduleDataChunks.
    int ChunkPos;

    /// Attaches ScheduleData to Instruction.
    /// Note that the mapping survives during all vectorization iterations, i.e.
    /// ScheduleData structures are recycled.
    DenseMap<Value *, ScheduleData *> ScheduleDataMap;

    struct ReadyList : SmallVector<ScheduleData *, 8> {
      void insert(ScheduleData *SD) { push_back(SD); }
    };

    /// The ready-list for scheduling (only used for the dry-run).
    ReadyList ReadyInsts;

    /// The first instruction of the scheduling region.
    Instruction *ScheduleStart;

    /// The first instruction _after_ the scheduling region.
    Instruction *ScheduleEnd;

    /// The first memory accessing instruction in the scheduling region
    /// (can be null).
    ScheduleData *FirstLoadStoreInRegion;

    /// The last memory accessing instruction in the scheduling region
    /// (can be null).
    ScheduleData *LastLoadStoreInRegion;

    /// The ID of the scheduling region. For a new vectorization iteration this
    /// is incremented which "removes" all ScheduleData from the region.
    int SchedulingRegionID;
  };

  /// Attaches the BlockScheduling structures to basic blocks.
  MapVector<BasicBlock *, std::unique_ptr<BlockScheduling>> BlocksSchedules;

  /// Performs the "real" scheduling. Done before vectorization is actually
  /// performed in a basic block.
  void scheduleBlock(BlockScheduling *BS);

  /// List of users to ignore during scheduling and that don't need extracting.
  ArrayRef<Value *> UserIgnoreList;

  // Number of load-bundles, which contain consecutive loads.
  int NumLoadsWantToKeepOrder;

  // Number of load-bundles of size 2, which are consecutive loads if reversed.
  int NumLoadsWantToChangeOrder;

  // Analysis and block reference.
  Function *F;
  ScalarEvolution *SE;
  const DataLayout *DL;
  TargetTransformInfo *TTI;
  TargetLibraryInfo *TLI;
  AliasAnalysis *AA;
  LoopInfo *LI;
  DominatorTree *DT;
  /// Instruction builder to construct the vectorized tree.
  IRBuilder<> Builder;
};

#ifndef NDEBUG
raw_ostream &operator<<(raw_ostream &os, const BoUpSLP::ScheduleData &SD) {
  SD.dump(os);
  return os;
}
#endif

void BoUpSLP::buildTree(ArrayRef<Value *> Roots,
                        ArrayRef<Value *> UserIgnoreLst) {
  deleteTree();
  UserIgnoreList = UserIgnoreLst;
  if (!getSameType(Roots))
    return;
  buildTree_rec(Roots, 0);

  // Collect the values that we need to extract from the tree.
  for (int EIdx = 0, EE = VectorizableTree.size(); EIdx < EE; ++EIdx) {
    TreeEntry *Entry = &VectorizableTree[EIdx];

    // For each lane:
    for (int Lane = 0, LE = Entry->Scalars.size(); Lane != LE; ++Lane) {
      Value *Scalar = Entry->Scalars[Lane];

      // No need to handle users of gathered values.
      if (Entry->NeedToGather)
        continue;

      for (User *U : Scalar->users()) {
        DEBUG(dbgs() << "SLP: Checking user:" << *U << ".\n");

        Instruction *UserInst = dyn_cast<Instruction>(U);
        if (!UserInst)
          continue;

        // Skip in-tree scalars that become vectors.
        if (ScalarToTreeEntry.count(U)) {
          int Idx = ScalarToTreeEntry[U];
          TreeEntry *UseEntry = &VectorizableTree[Idx];
          Value *UseScalar = UseEntry->Scalars[0];
          // Some in-tree scalars will remain as scalar in vectorized
          // instructions. If that is the case, the one in Lane 0 will
          // be used.
          if (UseScalar != U ||
              !InTreeUserNeedToExtract(Scalar, UserInst, TLI)) {
            DEBUG(dbgs() << "SLP: \tInternal user will be removed:" << *U
                         << ".\n");
            assert(!VectorizableTree[Idx].NeedToGather && "Bad state");
            continue;
          }
        }

        // Ignore users in the user ignore list.
        if (std::find(UserIgnoreList.begin(), UserIgnoreList.end(), UserInst) !=
            UserIgnoreList.end())
          continue;

        DEBUG(dbgs() << "SLP: Need to extract:" << *U << " from lane " <<
              Lane << " from " << *Scalar << ".\n");
        ExternalUses.push_back(ExternalUser(Scalar, U, Lane));
      }
    }
  }
}


void BoUpSLP::buildTree_rec(ArrayRef<Value *> VL, unsigned Depth) {
  bool SameTy = getSameType(VL); (void)SameTy;
  bool isAltShuffle = false;
  assert(SameTy && "Invalid types!");

  if (Depth == RecursionMaxDepth) {
    DEBUG(dbgs() << "SLP: Gathering due to max recursion depth.\n");
    newTreeEntry(VL, false);
    return;
  }

  // Don't handle vectors.
  if (VL[0]->getType()->isVectorTy()) {
    DEBUG(dbgs() << "SLP: Gathering due to vector type.\n");
    newTreeEntry(VL, false);
    return;
  }

  if (StoreInst *SI = dyn_cast<StoreInst>(VL[0]))
    if (SI->getValueOperand()->getType()->isVectorTy()) {
      DEBUG(dbgs() << "SLP: Gathering due to store vector type.\n");
      newTreeEntry(VL, false);
      return;
    }
  unsigned Opcode = getSameOpcode(VL);

  // Check that this shuffle vector refers to the alternate
  // sequence of opcodes.
  if (Opcode == Instruction::ShuffleVector) {
    Instruction *I0 = dyn_cast<Instruction>(VL[0]);
    unsigned Op = I0->getOpcode();
    if (Op != Instruction::ShuffleVector)
      isAltShuffle = true;
  }

  // If all of the operands are identical or constant we have a simple solution.
  if (allConstant(VL) || isSplat(VL) || !getSameBlock(VL) || !Opcode) {
    DEBUG(dbgs() << "SLP: Gathering due to C,S,B,O. \n");
    newTreeEntry(VL, false);
    return;
  }

  // We now know that this is a vector of instructions of the same type from
  // the same block.

  // Don't vectorize ephemeral values.
  for (unsigned i = 0, e = VL.size(); i != e; ++i) {
    if (EphValues.count(VL[i])) {
      DEBUG(dbgs() << "SLP: The instruction (" << *VL[i] <<
            ") is ephemeral.\n");
      newTreeEntry(VL, false);
      return;
    }
  }

  // Check if this is a duplicate of another entry.
  if (ScalarToTreeEntry.count(VL[0])) {
    int Idx = ScalarToTreeEntry[VL[0]];
    TreeEntry *E = &VectorizableTree[Idx];
    for (unsigned i = 0, e = VL.size(); i != e; ++i) {
      DEBUG(dbgs() << "SLP: \tChecking bundle: " << *VL[i] << ".\n");
      if (E->Scalars[i] != VL[i]) {
        DEBUG(dbgs() << "SLP: Gathering due to partial overlap.\n");
        newTreeEntry(VL, false);
        return;
      }
    }
    DEBUG(dbgs() << "SLP: Perfect diamond merge at " << *VL[0] << ".\n");
    return;
  }

  // Check that none of the instructions in the bundle are already in the tree.
  for (unsigned i = 0, e = VL.size(); i != e; ++i) {
    if (ScalarToTreeEntry.count(VL[i])) {
      DEBUG(dbgs() << "SLP: The instruction (" << *VL[i] <<
            ") is already in tree.\n");
      newTreeEntry(VL, false);
      return;
    }
  }

  // If any of the scalars is marked as a value that needs to stay scalar then
  // we need to gather the scalars.
  for (unsigned i = 0, e = VL.size(); i != e; ++i) {
    if (MustGather.count(VL[i])) {
      DEBUG(dbgs() << "SLP: Gathering due to gathered scalar.\n");
      newTreeEntry(VL, false);
      return;
    }
  }

  // Check that all of the users of the scalars that we want to vectorize are
  // schedulable.
  Instruction *VL0 = cast<Instruction>(VL[0]);
  BasicBlock *BB = cast<Instruction>(VL0)->getParent();

  if (!DT->isReachableFromEntry(BB)) {
    // Don't go into unreachable blocks. They may contain instructions with
    // dependency cycles which confuse the final scheduling.
    DEBUG(dbgs() << "SLP: bundle in unreachable block.\n");
    newTreeEntry(VL, false);
    return;
  }

  // Check that every instruction appears once in this bundle.
  for (unsigned i = 0, e = VL.size(); i < e; ++i)
    for (unsigned j = i+1; j < e; ++j)
      if (VL[i] == VL[j]) {
        DEBUG(dbgs() << "SLP: Scalar used twice in bundle.\n");
        newTreeEntry(VL, false);
        return;
      }

  auto &BSRef = BlocksSchedules[BB];
  if (!BSRef) {
    BSRef = llvm::make_unique<BlockScheduling>(BB);
  }
  BlockScheduling &BS = *BSRef.get();

  if (!BS.tryScheduleBundle(VL, this)) {
    DEBUG(dbgs() << "SLP: We are not able to schedule this bundle!\n");
    BS.cancelScheduling(VL);
    newTreeEntry(VL, false);
    return;
  }
  DEBUG(dbgs() << "SLP: We are able to schedule this bundle.\n");

  switch (Opcode) {
  case Instruction::PHI: {
    PHINode *PH = dyn_cast<PHINode>(VL0);

    // Check for terminator values (e.g. invoke).
    for (unsigned j = 0; j < VL.size(); ++j)
      for (unsigned i = 0, e = PH->getNumIncomingValues(); i < e; ++i) {
        TerminatorInst *Term = dyn_cast<TerminatorInst>(
            cast<PHINode>(VL[j])->getIncomingValueForBlock(
                PH->getIncomingBlock(i)));
        if (Term) {
          DEBUG(dbgs() << "SLP: Need to swizzle PHINodes (TerminatorInst use).\n");
          BS.cancelScheduling(VL);
          newTreeEntry(VL, false);
          return;
        }
      }

    newTreeEntry(VL, true);
    DEBUG(dbgs() << "SLP: added a vector of PHINodes.\n");

    for (unsigned i = 0, e = PH->getNumIncomingValues(); i < e; ++i) {
      ValueList Operands;
      // Prepare the operand vector.
      for (unsigned j = 0; j < VL.size(); ++j)
        Operands.push_back(cast<PHINode>(VL[j])->getIncomingValueForBlock(
            PH->getIncomingBlock(i)));

      buildTree_rec(Operands, Depth + 1);
    }
    return;
  }
  case Instruction::ExtractElement: {
    bool Reuse = CanReuseExtract(VL);
    if (Reuse) {
      DEBUG(dbgs() << "SLP: Reusing extract sequence.\n");
    } else {
      BS.cancelScheduling(VL);
    }
    newTreeEntry(VL, Reuse);
    return;
  }
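  // Loads are vectorized only when every access is simple (non-volatile,
  // non-atomic) and consecutive with the next one. A two-element bundle that
  // is consecutive in reverse order is counted in NumLoadsWantToChangeOrder
  // so the caller can retry with the bundle reversed (see shouldReorder()).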
  case Instruction::Load: {
    // Check if the loads are consecutive or if we need to swizzle them.
    for (unsigned i = 0, e = VL.size() - 1; i < e; ++i) {
      LoadInst *L = cast<LoadInst>(VL[i]);
      if (!L->isSimple()) {
        BS.cancelScheduling(VL);
        newTreeEntry(VL, false);
        DEBUG(dbgs() << "SLP: Gathering non-simple loads.\n");
        return;
      }
      if (!isConsecutiveAccess(VL[i], VL[i + 1])) {
        if (VL.size() == 2 && isConsecutiveAccess(VL[1], VL[0])) {
          ++NumLoadsWantToChangeOrder;
        }
        BS.cancelScheduling(VL);
        newTreeEntry(VL, false);
        DEBUG(dbgs() << "SLP: Gathering non-consecutive loads.\n");
        return;
      }
    }
    ++NumLoadsWantToKeepOrder;
    newTreeEntry(VL, true);
    DEBUG(dbgs() << "SLP: added a vector of loads.\n");
    return;
  }
  case Instruction::ZExt:
  case Instruction::SExt:
  case Instruction::FPToUI:
  case Instruction::FPToSI:
  case Instruction::FPExt:
  case Instruction::PtrToInt:
  case Instruction::IntToPtr:
  case Instruction::SIToFP:
  case Instruction::UIToFP:
  case Instruction::Trunc:
  case Instruction::FPTrunc:
  case Instruction::BitCast: {
    Type *SrcTy = VL0->getOperand(0)->getType();
    for (unsigned i = 0; i < VL.size(); ++i) {
      Type *Ty = cast<Instruction>(VL[i])->getOperand(0)->getType();
      if (Ty != SrcTy || Ty->isAggregateType() || Ty->isVectorTy()) {
        BS.cancelScheduling(VL);
        newTreeEntry(VL, false);
        DEBUG(dbgs() << "SLP: Gathering casts with different src types.\n");
        return;
      }
    }
    newTreeEntry(VL, true);
    DEBUG(dbgs() << "SLP: added a vector of casts.\n");

    for (unsigned i = 0, e = VL0->getNumOperands(); i < e; ++i) {
      ValueList Operands;
      // Prepare the operand vector.
      for (unsigned j = 0; j < VL.size(); ++j)
        Operands.push_back(cast<Instruction>(VL[j])->getOperand(i));

      buildTree_rec(Operands, Depth+1);
    }
    return;
  }
  case Instruction::ICmp:
  case Instruction::FCmp: {
    // Check that all of the compares have the same predicate.
    CmpInst::Predicate P0 = cast<CmpInst>(VL0)->getPredicate();
    Type *ComparedTy = cast<Instruction>(VL[0])->getOperand(0)->getType();
    for (unsigned i = 1, e = VL.size(); i < e; ++i) {
      CmpInst *Cmp = cast<CmpInst>(VL[i]);
      if (Cmp->getPredicate() != P0 ||
          Cmp->getOperand(0)->getType() != ComparedTy) {
        BS.cancelScheduling(VL);
        newTreeEntry(VL, false);
        DEBUG(dbgs() << "SLP: Gathering cmp with different predicate.\n");
        return;
      }
    }

    newTreeEntry(VL, true);
    DEBUG(dbgs() << "SLP: added a vector of compares.\n");

    for (unsigned i = 0, e = VL0->getNumOperands(); i < e; ++i) {
      ValueList Operands;
      // Prepare the operand vector.
      for (unsigned j = 0; j < VL.size(); ++j)
        Operands.push_back(cast<Instruction>(VL[j])->getOperand(i));

      buildTree_rec(Operands, Depth+1);
    }
    return;
  }
  case Instruction::Select:
  case Instruction::Add:
  case Instruction::FAdd:
  case Instruction::Sub:
  case Instruction::FSub:
  case Instruction::Mul:
  case Instruction::FMul:
  case Instruction::UDiv:
  case Instruction::SDiv:
  case Instruction::FDiv:
  case Instruction::URem:
  case Instruction::SRem:
  case Instruction::FRem:
  case Instruction::Shl:
  case Instruction::LShr:
  case Instruction::AShr:
  case Instruction::And:
  case Instruction::Or:
  case Instruction::Xor: {
    newTreeEntry(VL, true);
    DEBUG(dbgs() << "SLP: added a vector of bin op.\n");

    // Sort operands of the instructions so that each side is more likely to
    // have the same opcode.
    if (isa<BinaryOperator>(VL0) && VL0->isCommutative()) {
      ValueList Left, Right;
      reorderInputsAccordingToOpcode(VL, Left, Right);
      buildTree_rec(Left, Depth + 1);
      buildTree_rec(Right, Depth + 1);
      return;
    }

    for (unsigned i = 0, e = VL0->getNumOperands(); i < e; ++i) {
      ValueList Operands;
      // Prepare the operand vector.
      for (unsigned j = 0; j < VL.size(); ++j)
        Operands.push_back(cast<Instruction>(VL[j])->getOperand(i));

      buildTree_rec(Operands, Depth+1);
    }
    return;
  }
  case Instruction::GetElementPtr: {
    // We don't combine GEPs with complicated (nested) indexing.
    for (unsigned j = 0; j < VL.size(); ++j) {
      if (cast<Instruction>(VL[j])->getNumOperands() != 2) {
        DEBUG(dbgs() << "SLP: not-vectorizable GEP (nested indexes).\n");
        BS.cancelScheduling(VL);
        newTreeEntry(VL, false);
        return;
      }
    }

    // We can't combine several GEPs into one vector if they operate on
    // different types.
    Type *Ty0 = cast<Instruction>(VL0)->getOperand(0)->getType();
    for (unsigned j = 0; j < VL.size(); ++j) {
      Type *CurTy = cast<Instruction>(VL[j])->getOperand(0)->getType();
      if (Ty0 != CurTy) {
        DEBUG(dbgs() << "SLP: not-vectorizable GEP (different types).\n");
        BS.cancelScheduling(VL);
        newTreeEntry(VL, false);
        return;
      }
    }

    // We don't combine GEPs with non-constant indexes.
    for (unsigned j = 0; j < VL.size(); ++j) {
      auto Op = cast<Instruction>(VL[j])->getOperand(1);
      if (!isa<ConstantInt>(Op)) {
        DEBUG(
            dbgs() << "SLP: not-vectorizable GEP (non-constant indexes).\n");
        BS.cancelScheduling(VL);
        newTreeEntry(VL, false);
        return;
      }
    }

    newTreeEntry(VL, true);
    DEBUG(dbgs() << "SLP: added a vector of GEPs.\n");
    for (unsigned i = 0, e = 2; i < e; ++i) {
      ValueList Operands;
      // Prepare the operand vector.
      for (unsigned j = 0; j < VL.size(); ++j)
        Operands.push_back(cast<Instruction>(VL[j])->getOperand(i));

      buildTree_rec(Operands, Depth + 1);
    }
    return;
  }
  case Instruction::Store: {
    // Check if the stores are consecutive or if we need to swizzle them.
    for (unsigned i = 0, e = VL.size() - 1; i < e; ++i)
      if (!isConsecutiveAccess(VL[i], VL[i + 1])) {
        BS.cancelScheduling(VL);
        newTreeEntry(VL, false);
        DEBUG(dbgs() << "SLP: Non-consecutive store.\n");
        return;
      }

    newTreeEntry(VL, true);
    DEBUG(dbgs() << "SLP: added a vector of stores.\n");

    ValueList Operands;
    for (unsigned j = 0; j < VL.size(); ++j)
      Operands.push_back(cast<Instruction>(VL[j])->getOperand(0));

    buildTree_rec(Operands, Depth + 1);
    return;
  }
  case Instruction::Call: {
    // Check if the calls are all to the same vectorizable intrinsic.
    CallInst *CI = cast<CallInst>(VL[0]);
    // Check if this is an Intrinsic call or something that can be
    // represented by an intrinsic call.
    Intrinsic::ID ID = getIntrinsicIDForCall(CI, TLI);
    if (!isTriviallyVectorizable(ID)) {
      BS.cancelScheduling(VL);
      newTreeEntry(VL, false);
      DEBUG(dbgs() << "SLP: Non-vectorizable call.\n");
      return;
    }
    Function *Int = CI->getCalledFunction();
    Value *A1I = nullptr;
    if (hasVectorInstrinsicScalarOpd(ID, 1))
      A1I = CI->getArgOperand(1);
    for (unsigned i = 1, e = VL.size(); i != e; ++i) {
      CallInst *CI2 = dyn_cast<CallInst>(VL[i]);
      if (!CI2 || CI2->getCalledFunction() != Int ||
          getIntrinsicIDForCall(CI2, TLI) != ID) {
        BS.cancelScheduling(VL);
        newTreeEntry(VL, false);
        DEBUG(dbgs() << "SLP: mismatched calls:" << *CI << "!=" << *VL[i]
                     << "\n");
        return;
      }
      // ctlz, cttz and powi are special intrinsics whose second argument
      // should be the same in order for them to be vectorized.
      if (hasVectorInstrinsicScalarOpd(ID, 1)) {
        Value *A1J = CI2->getArgOperand(1);
        if (A1I != A1J) {
          BS.cancelScheduling(VL);
          newTreeEntry(VL, false);
          DEBUG(dbgs() << "SLP: mismatched arguments in call:" << *CI
                       << " argument " << A1I << "!=" << A1J << "\n");
          return;
        }
      }
    }

    newTreeEntry(VL, true);
    for (unsigned i = 0, e = CI->getNumArgOperands(); i != e; ++i) {
      ValueList Operands;
      // Prepare the operand vector.
      for (unsigned j = 0; j < VL.size(); ++j) {
        CallInst *CI2 = dyn_cast<CallInst>(VL[j]);
        Operands.push_back(CI2->getArgOperand(i));
      }
      buildTree_rec(Operands, Depth + 1);
    }
    return;
  }
  case Instruction::ShuffleVector: {
    // If this is not an alternating sequence of opcodes like add-sub
    // then do not vectorize this instruction.
    if (!isAltShuffle) {
      BS.cancelScheduling(VL);
      newTreeEntry(VL, false);
      DEBUG(dbgs() << "SLP: ShuffleVector are not vectorized.\n");
      return;
    }
    newTreeEntry(VL, true);
    DEBUG(dbgs() << "SLP: added a ShuffleVector op.\n");

    // Reorder operands if reordering would enable vectorization.
    if (isa<BinaryOperator>(VL0)) {
      ValueList Left, Right;
      reorderAltShuffleOperands(VL, Left, Right);
      buildTree_rec(Left, Depth + 1);
      buildTree_rec(Right, Depth + 1);
      return;
    }

    for (unsigned i = 0, e = VL0->getNumOperands(); i < e; ++i) {
      ValueList Operands;
      // Prepare the operand vector.
      for (unsigned j = 0; j < VL.size(); ++j)
        Operands.push_back(cast<Instruction>(VL[j])->getOperand(i));

      buildTree_rec(Operands, Depth + 1);
    }
    return;
  }
  default:
    BS.cancelScheduling(VL);
    newTreeEntry(VL, false);
    DEBUG(dbgs() << "SLP: Gathering unknown instruction.\n");
    return;
  }
}

int BoUpSLP::getEntryCost(TreeEntry *E) {
  ArrayRef<Value*> VL = E->Scalars;

  Type *ScalarTy = VL[0]->getType();
  if (StoreInst *SI = dyn_cast<StoreInst>(VL[0]))
    ScalarTy = SI->getValueOperand()->getType();
  VectorType *VecTy = VectorType::get(ScalarTy, VL.size());

  if (E->NeedToGather) {
    if (allConstant(VL))
      return 0;
    if (isSplat(VL)) {
      return TTI->getShuffleCost(TargetTransformInfo::SK_Broadcast, VecTy, 0);
    }
    return getGatherCost(E->Scalars);
  }
  unsigned Opcode = getSameOpcode(VL);
  assert(Opcode && getSameType(VL) && getSameBlock(VL) && "Invalid VL");
  Instruction *VL0 = cast<Instruction>(VL[0]);
  switch (Opcode) {
  case Instruction::PHI: {
    return 0;
  }
  case Instruction::ExtractElement: {
    if (CanReuseExtract(VL)) {
      int DeadCost = 0;
      for (unsigned i = 0, e = VL.size(); i < e; ++i) {
        ExtractElementInst *E = cast<ExtractElementInst>(VL[i]);
        if (E->hasOneUse())
          // Take credit for instruction that will become dead.
          DeadCost +=
              TTI->getVectorInstrCost(Instruction::ExtractElement, VecTy, i);
      }
      return -DeadCost;
    }
    return getGatherCost(VecTy);
  }
  case Instruction::ZExt:
  case Instruction::SExt:
  case Instruction::FPToUI:
  case Instruction::FPToSI:
  case Instruction::FPExt:
  case Instruction::PtrToInt:
  case Instruction::IntToPtr:
  case Instruction::SIToFP:
  case Instruction::UIToFP:
  case Instruction::Trunc:
  case Instruction::FPTrunc:
  case Instruction::BitCast: {
    Type *SrcTy = VL0->getOperand(0)->getType();

    // Calculate the cost of this instruction.
    int ScalarCost = VL.size() * TTI->getCastInstrCost(VL0->getOpcode(),
                                                       VL0->getType(), SrcTy);

    VectorType *SrcVecTy = VectorType::get(SrcTy, VL.size());
    int VecCost = TTI->getCastInstrCost(VL0->getOpcode(), VecTy, SrcVecTy);
    return VecCost - ScalarCost;
  }
  case Instruction::FCmp:
  case Instruction::ICmp:
  case Instruction::Select:
  case Instruction::Add:
  case Instruction::FAdd:
  case Instruction::Sub:
  case Instruction::FSub:
  case Instruction::Mul:
  case Instruction::FMul:
  case Instruction::UDiv:
  case Instruction::SDiv:
  case Instruction::FDiv:
  case Instruction::URem:
  case Instruction::SRem:
  case Instruction::FRem:
  case Instruction::Shl:
  case Instruction::LShr:
  case Instruction::AShr:
  case Instruction::And:
  case Instruction::Or:
  case Instruction::Xor: {
    // Calculate the cost of this instruction.
    int ScalarCost = 0;
    int VecCost = 0;
    if (Opcode == Instruction::FCmp || Opcode == Instruction::ICmp ||
        Opcode == Instruction::Select) {
      VectorType *MaskTy = VectorType::get(Builder.getInt1Ty(), VL.size());
      ScalarCost = VecTy->getNumElements() *
          TTI->getCmpSelInstrCost(Opcode, ScalarTy, Builder.getInt1Ty());
      VecCost = TTI->getCmpSelInstrCost(Opcode, VecTy, MaskTy);
    } else {
      // Certain instructions can be cheaper to vectorize if they have a
      // constant second vector operand.
      TargetTransformInfo::OperandValueKind Op1VK =
          TargetTransformInfo::OK_AnyValue;
      TargetTransformInfo::OperandValueKind Op2VK =
          TargetTransformInfo::OK_UniformConstantValue;
      TargetTransformInfo::OperandValueProperties Op1VP =
          TargetTransformInfo::OP_None;
      TargetTransformInfo::OperandValueProperties Op2VP =
          TargetTransformInfo::OP_None;

      // If all operands are exactly the same ConstantInt then set the
      // operand kind to OK_UniformConstantValue.
      // If instead not all operands are constants, then set the operand kind
      // to OK_AnyValue. If all operands are constants but not the same,
      // then set the operand kind to OK_NonUniformConstantValue.
      ConstantInt *CInt = nullptr;
      for (unsigned i = 0; i < VL.size(); ++i) {
        const Instruction *I = cast<Instruction>(VL[i]);
        if (!isa<ConstantInt>(I->getOperand(1))) {
          Op2VK = TargetTransformInfo::OK_AnyValue;
          break;
        }
        if (i == 0) {
          CInt = cast<ConstantInt>(I->getOperand(1));
          continue;
        }
        if (Op2VK == TargetTransformInfo::OK_UniformConstantValue &&
            CInt != cast<ConstantInt>(I->getOperand(1)))
          Op2VK = TargetTransformInfo::OK_NonUniformConstantValue;
      }
      // FIXME: Currently cost model modification for division by a power of
      // 2 is handled only for X86. Add support for other targets.
      if (Op2VK == TargetTransformInfo::OK_UniformConstantValue && CInt &&
          CInt->getValue().isPowerOf2())
        Op2VP = TargetTransformInfo::OP_PowerOf2;

      ScalarCost = VecTy->getNumElements() *
          TTI->getArithmeticInstrCost(Opcode, ScalarTy, Op1VK, Op2VK,
                                      Op1VP, Op2VP);
      VecCost = TTI->getArithmeticInstrCost(Opcode, VecTy, Op1VK, Op2VK,
                                            Op1VP, Op2VP);
    }
    return VecCost - ScalarCost;
  }
  case Instruction::GetElementPtr: {
    TargetTransformInfo::OperandValueKind Op1VK =
        TargetTransformInfo::OK_AnyValue;
    TargetTransformInfo::OperandValueKind Op2VK =
        TargetTransformInfo::OK_UniformConstantValue;

    int ScalarCost =
        VecTy->getNumElements() *
        TTI->getArithmeticInstrCost(Instruction::Add, ScalarTy, Op1VK, Op2VK);
    int VecCost =
        TTI->getArithmeticInstrCost(Instruction::Add, VecTy, Op1VK, Op2VK);

    return VecCost - ScalarCost;
  }
  case Instruction::Load: {
    // Cost of wide load - cost of scalar loads.
    int ScalarLdCost = VecTy->getNumElements() *
        TTI->getMemoryOpCost(Instruction::Load, ScalarTy, 1, 0);
    int VecLdCost = TTI->getMemoryOpCost(Instruction::Load, VecTy, 1, 0);
    return VecLdCost - ScalarLdCost;
  }
  case Instruction::Store: {
    // We know that we can merge the stores. Calculate the cost.
    int ScalarStCost = VecTy->getNumElements() *
        TTI->getMemoryOpCost(Instruction::Store, ScalarTy, 1, 0);
    int VecStCost = TTI->getMemoryOpCost(Instruction::Store, VecTy, 1, 0);
    return VecStCost - ScalarStCost;
  }
  case Instruction::Call: {
    CallInst *CI = cast<CallInst>(VL0);
    Intrinsic::ID ID = getIntrinsicIDForCall(CI, TLI);

    // Calculate the cost of the scalar and vector calls.
    SmallVector<Type*, 4> ScalarTys, VecTys;
    for (unsigned op = 0, opc = CI->getNumArgOperands(); op != opc; ++op) {
      ScalarTys.push_back(CI->getArgOperand(op)->getType());
      VecTys.push_back(VectorType::get(CI->getArgOperand(op)->getType(),
                                       VecTy->getNumElements()));
    }

    int ScalarCallCost = VecTy->getNumElements() *
        TTI->getIntrinsicInstrCost(ID, ScalarTy, ScalarTys);

    int VecCallCost = TTI->getIntrinsicInstrCost(ID, VecTy, VecTys);

    DEBUG(dbgs() << "SLP: Call cost " << VecCallCost - ScalarCallCost
          << " (" << VecCallCost << "-" << ScalarCallCost << ")"
          << " for " << *CI << "\n");

    return VecCallCost - ScalarCallCost;
  }
  case Instruction::ShuffleVector: {
    TargetTransformInfo::OperandValueKind Op1VK =
        TargetTransformInfo::OK_AnyValue;
    TargetTransformInfo::OperandValueKind Op2VK =
        TargetTransformInfo::OK_AnyValue;
    int ScalarCost = 0;
    int VecCost = 0;
    for (unsigned i = 0; i < VL.size(); ++i) {
      Instruction *I = cast<Instruction>(VL[i]);
      if (!I)
        break;
      ScalarCost +=
          TTI->getArithmeticInstrCost(I->getOpcode(), ScalarTy, Op1VK, Op2VK);
    }
    // VecCost is equal to the sum of the cost of creating 2 vectors
    // and the cost of creating the shuffle.
    Instruction *I0 = cast<Instruction>(VL[0]);
    VecCost =
        TTI->getArithmeticInstrCost(I0->getOpcode(), VecTy, Op1VK, Op2VK);
    Instruction *I1 = cast<Instruction>(VL[1]);
    VecCost +=
        TTI->getArithmeticInstrCost(I1->getOpcode(), VecTy, Op1VK, Op2VK);
    VecCost +=
        TTI->getShuffleCost(TargetTransformInfo::SK_Alternate, VecTy, 0);
    return VecCost - ScalarCost;
  }
  default:
    llvm_unreachable("Unknown instruction");
  }
}

bool BoUpSLP::isFullyVectorizableTinyTree() {
  DEBUG(dbgs() << "SLP: Check whether the tree with height " <<
        VectorizableTree.size() << " is fully vectorizable .\n");

  // We only handle trees of height 2.
  if (VectorizableTree.size() != 2)
    return false;

  // Handle splat stores.
  if (!VectorizableTree[0].NeedToGather && isSplat(VectorizableTree[1].Scalars))
    return true;

  // Gathering cost would be too much for tiny trees.
  if (VectorizableTree[0].NeedToGather || VectorizableTree[1].NeedToGather)
    return false;

  return true;
}

int BoUpSLP::getSpillCost() {
  // Walk from the bottom of the tree to the top, tracking which values are
  // live. When we see a call instruction that is not part of our tree,
  // query TTI to see if there is a cost to keeping values live over it
  // (for example, if spills and fills are required).
  unsigned BundleWidth = VectorizableTree.front().Scalars.size();
  int Cost = 0;

  SmallPtrSet<Instruction*, 4> LiveValues;
  Instruction *PrevInst = nullptr;

  for (unsigned N = 0; N < VectorizableTree.size(); ++N) {
    Instruction *Inst = dyn_cast<Instruction>(VectorizableTree[N].Scalars[0]);
    if (!Inst)
      continue;

    if (!PrevInst) {
      PrevInst = Inst;
      continue;
    }

    DEBUG(
      dbgs() << "SLP: #LV: " << LiveValues.size();
      for (auto *X : LiveValues)
        dbgs() << " " << X->getName();
      dbgs() << ", Looking at ";
      Inst->dump();
      );

    // Update LiveValues.
1668 LiveValues.erase(PrevInst); 1669 for (auto &J : PrevInst->operands()) { 1670 if (isa<Instruction>(&*J) && ScalarToTreeEntry.count(&*J)) 1671 LiveValues.insert(cast<Instruction>(&*J)); 1672 } 1673 1674 // Now find the sequence of instructions between PrevInst and Inst. 1675 BasicBlock::reverse_iterator InstIt(Inst), PrevInstIt(PrevInst); 1676 --PrevInstIt; 1677 while (InstIt != PrevInstIt) { 1678 if (PrevInstIt == PrevInst->getParent()->rend()) { 1679 PrevInstIt = Inst->getParent()->rbegin(); 1680 continue; 1681 } 1682 1683 if (isa<CallInst>(&*PrevInstIt) && &*PrevInstIt != PrevInst) { 1684 SmallVector<Type*, 4> V; 1685 for (auto *II : LiveValues) 1686 V.push_back(VectorType::get(II->getType(), BundleWidth)); 1687 Cost += TTI->getCostOfKeepingLiveOverCall(V); 1688 } 1689 1690 ++PrevInstIt; 1691 } 1692 1693 PrevInst = Inst; 1694 } 1695 1696 DEBUG(dbgs() << "SLP: SpillCost=" << Cost << "\n"); 1697 return Cost; 1698 } 1699 1700 int BoUpSLP::getTreeCost() { 1701 int Cost = 0; 1702 DEBUG(dbgs() << "SLP: Calculating cost for tree of size " << 1703 VectorizableTree.size() << ".\n"); 1704 1705 // We only vectorize tiny trees if it is fully vectorizable. 1706 if (VectorizableTree.size() < 3 && !isFullyVectorizableTinyTree()) { 1707 if (VectorizableTree.empty()) { 1708 assert(!ExternalUses.size() && "We should not have any external users"); 1709 } 1710 return INT_MAX; 1711 } 1712 1713 unsigned BundleWidth = VectorizableTree[0].Scalars.size(); 1714 1715 for (unsigned i = 0, e = VectorizableTree.size(); i != e; ++i) { 1716 int C = getEntryCost(&VectorizableTree[i]); 1717 DEBUG(dbgs() << "SLP: Adding cost " << C << " for bundle that starts with " 1718 << *VectorizableTree[i].Scalars[0] << " .\n"); 1719 Cost += C; 1720 } 1721 1722 SmallSet<Value *, 16> ExtractCostCalculated; 1723 int ExtractCost = 0; 1724 for (UserList::iterator I = ExternalUses.begin(), E = ExternalUses.end(); 1725 I != E; ++I) { 1726 // We only add extract cost once for the same scalar. 1727 if (!ExtractCostCalculated.insert(I->Scalar).second) 1728 continue; 1729 1730 // Uses by ephemeral values are free (because the ephemeral value will be 1731 // removed prior to code generation, and so the extraction will be 1732 // removed as well). 1733 if (EphValues.count(I->User)) 1734 continue; 1735 1736 VectorType *VecTy = VectorType::get(I->Scalar->getType(), BundleWidth); 1737 ExtractCost += TTI->getVectorInstrCost(Instruction::ExtractElement, VecTy, 1738 I->Lane); 1739 } 1740 1741 Cost += getSpillCost(); 1742 1743 DEBUG(dbgs() << "SLP: Total Cost " << Cost + ExtractCost<< ".\n"); 1744 return Cost + ExtractCost; 1745 } 1746 1747 int BoUpSLP::getGatherCost(Type *Ty) { 1748 int Cost = 0; 1749 for (unsigned i = 0, e = cast<VectorType>(Ty)->getNumElements(); i < e; ++i) 1750 Cost += TTI->getVectorInstrCost(Instruction::InsertElement, Ty, i); 1751 return Cost; 1752 } 1753 1754 int BoUpSLP::getGatherCost(ArrayRef<Value *> VL) { 1755 // Find the type of the operands in VL. 1756 Type *ScalarTy = VL[0]->getType(); 1757 if (StoreInst *SI = dyn_cast<StoreInst>(VL[0])) 1758 ScalarTy = SI->getValueOperand()->getType(); 1759 VectorType *VecTy = VectorType::get(ScalarTy, VL.size()); 1760 // Find the cost of inserting/extracting values from the vector. 
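// E.g. gathering a bundle of 4 scalars is modeled as the cost of 4
// insertelement operations into the <4 x ScalarTy> vector (see
// getGatherCost(Type *) above).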
1761 return getGatherCost(VecTy); 1762 } 1763 1764 Value *BoUpSLP::getPointerOperand(Value *I) { 1765 if (LoadInst *LI = dyn_cast<LoadInst>(I)) 1766 return LI->getPointerOperand(); 1767 if (StoreInst *SI = dyn_cast<StoreInst>(I)) 1768 return SI->getPointerOperand(); 1769 return nullptr; 1770 } 1771 1772 unsigned BoUpSLP::getAddressSpaceOperand(Value *I) { 1773 if (LoadInst *L = dyn_cast<LoadInst>(I)) 1774 return L->getPointerAddressSpace(); 1775 if (StoreInst *S = dyn_cast<StoreInst>(I)) 1776 return S->getPointerAddressSpace(); 1777 return -1; 1778 } 1779 1780 bool BoUpSLP::isConsecutiveAccess(Value *A, Value *B) { 1781 Value *PtrA = getPointerOperand(A); 1782 Value *PtrB = getPointerOperand(B); 1783 unsigned ASA = getAddressSpaceOperand(A); 1784 unsigned ASB = getAddressSpaceOperand(B); 1785 1786 // Check that the address spaces match and that the pointers are valid. 1787 if (!PtrA || !PtrB || (ASA != ASB)) 1788 return false; 1789 1790 // Make sure that A and B are different pointers of the same type. 1791 if (PtrA == PtrB || PtrA->getType() != PtrB->getType()) 1792 return false; 1793 1794 unsigned PtrBitWidth = DL->getPointerSizeInBits(ASA); 1795 Type *Ty = cast<PointerType>(PtrA->getType())->getElementType(); 1796 APInt Size(PtrBitWidth, DL->getTypeStoreSize(Ty)); 1797 1798 APInt OffsetA(PtrBitWidth, 0), OffsetB(PtrBitWidth, 0); 1799 PtrA = PtrA->stripAndAccumulateInBoundsConstantOffsets(*DL, OffsetA); 1800 PtrB = PtrB->stripAndAccumulateInBoundsConstantOffsets(*DL, OffsetB); 1801 1802 APInt OffsetDelta = OffsetB - OffsetA; 1803 1804 // Check if they are based on the same pointer. That makes the offsets 1805 // sufficient. 1806 if (PtrA == PtrB) 1807 return OffsetDelta == Size; 1808 1809 // Compute the necessary base pointer delta to have the necessary final delta 1810 // equal to the size. 1811 APInt BaseDelta = Size - OffsetDelta; 1812 1813 // Otherwise compute the distance with SCEV between the base pointers. 1814 const SCEV *PtrSCEVA = SE->getSCEV(PtrA); 1815 const SCEV *PtrSCEVB = SE->getSCEV(PtrB); 1816 const SCEV *C = SE->getConstant(BaseDelta); 1817 const SCEV *X = SE->getAddExpr(PtrSCEVA, C); 1818 return X == PtrSCEVB; 1819 } 1820 1821 // Reorder commutative operations in alternate shuffle if the resulting vectors 1822 // are consecutive loads. This would allow us to vectorize the tree. 1823 // If we have something like- 1824 // load a[0] - load b[0] 1825 // load b[1] + load a[1] 1826 // load a[2] - load b[2] 1827 // load a[3] + load b[3] 1828 // Reordering the second load b[1] load a[1] would allow us to vectorize this 1829 // code. 1830 void BoUpSLP::reorderAltShuffleOperands(ArrayRef<Value *> VL, 1831 SmallVectorImpl<Value *> &Left, 1832 SmallVectorImpl<Value *> &Right) { 1833 1834 // Push left and right operands of binary operation into Left and Right 1835 for (unsigned i = 0, e = VL.size(); i < e; ++i) { 1836 Left.push_back(cast<Instruction>(VL[i])->getOperand(0)); 1837 Right.push_back(cast<Instruction>(VL[i])->getOperand(1)); 1838 } 1839 1840 // Reorder if we have a commutative operation and consecutive access 1841 // are on either side of the alternate instructions. 
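// Concretely, for the example above we start with
//   Left  = { a[0], b[1], a[2], a[3] }
//   Right = { b[0], a[1], b[2], b[3] }
// and lane 1 is the commutative 'add', so swapping its operands gives
// consecutive loads a[0..3] on the left and b[0..3] on the right.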
1842 for (unsigned j = 0; j < VL.size() - 1; ++j) {
1843 if (LoadInst *L = dyn_cast<LoadInst>(Left[j])) {
1844 if (LoadInst *L1 = dyn_cast<LoadInst>(Right[j + 1])) {
1845 Instruction *VL1 = cast<Instruction>(VL[j]);
1846 Instruction *VL2 = cast<Instruction>(VL[j + 1]);
1847 if (isConsecutiveAccess(L, L1) && VL1->isCommutative()) {
1848 std::swap(Left[j], Right[j]);
1849 continue;
1850 } else if (isConsecutiveAccess(L, L1) && VL2->isCommutative()) {
1851 std::swap(Left[j + 1], Right[j + 1]);
1852 continue;
1853 }
1854 // else unchanged
1855 }
1856 }
1857 if (LoadInst *L = dyn_cast<LoadInst>(Right[j])) {
1858 if (LoadInst *L1 = dyn_cast<LoadInst>(Left[j + 1])) {
1859 Instruction *VL1 = cast<Instruction>(VL[j]);
1860 Instruction *VL2 = cast<Instruction>(VL[j + 1]);
1861 if (isConsecutiveAccess(L, L1) && VL1->isCommutative()) {
1862 std::swap(Left[j], Right[j]);
1863 continue;
1864 } else if (isConsecutiveAccess(L, L1) && VL2->isCommutative()) {
1865 std::swap(Left[j + 1], Right[j + 1]);
1866 continue;
1867 }
1868 // else unchanged
1869 }
1870 }
1871 }
1872 }
1873
1874 void BoUpSLP::reorderInputsAccordingToOpcode(ArrayRef<Value *> VL,
1875 SmallVectorImpl<Value *> &Left,
1876 SmallVectorImpl<Value *> &Right) {
1877
1878 SmallVector<Value *, 16> OrigLeft, OrigRight;
1879
1880 bool AllSameOpcodeLeft = true;
1881 bool AllSameOpcodeRight = true;
1882 for (unsigned i = 0, e = VL.size(); i != e; ++i) {
1883 Instruction *I = cast<Instruction>(VL[i]);
1884 Value *VLeft = I->getOperand(0);
1885 Value *VRight = I->getOperand(1);
1886
1887 OrigLeft.push_back(VLeft);
1888 OrigRight.push_back(VRight);
1889
1890 Instruction *ILeft = dyn_cast<Instruction>(VLeft);
1891 Instruction *IRight = dyn_cast<Instruction>(VRight);
1892
1893 // Check whether all operands on one side have the same opcode. In this case
1894 // we want to preserve the original order and not make things worse by
1895 // reordering.
1896 if (i && AllSameOpcodeLeft && ILeft) {
1897 if (Instruction *PLeft = dyn_cast<Instruction>(OrigLeft[i - 1])) {
1898 if (PLeft->getOpcode() != ILeft->getOpcode())
1899 AllSameOpcodeLeft = false;
1900 } else
1901 AllSameOpcodeLeft = false;
1902 }
1903 if (i && AllSameOpcodeRight && IRight) {
1904 if (Instruction *PRight = dyn_cast<Instruction>(OrigRight[i - 1])) {
1905 if (PRight->getOpcode() != IRight->getOpcode())
1906 AllSameOpcodeRight = false;
1907 } else
1908 AllSameOpcodeRight = false;
1909 }
1910
1911 // Sort two opcodes. In the code below we try to preserve the ability to use
1912 // broadcast of values instead of individual inserts.
1913 // vl1 = load
1914 // vl2 = phi
1915 // vr1 = load
1916 // vr2 = vr1
1917 // = vl1 x vr1
1918 // = vl2 x vr2
1919 // If we just sorted according to opcode we would leave the first line intact
1920 // but we would swap vl2 with vr2 because opcode(phi) > opcode(load).
1921 // = vl1 x vr1
1922 // = vr2 x vl2
1923 // Because vr2 and vr1 are from the same load we lose the opportunity of a
1924 // broadcast for the packed right side in the backend: we have [vr1, vl2]
1925 // instead of [vr1, vr2=vr1].
1926 if (ILeft && IRight) {
1927 if (!i && ILeft->getOpcode() > IRight->getOpcode()) {
1928 Left.push_back(IRight);
1929 Right.push_back(ILeft);
1930 } else if (i && ILeft->getOpcode() > IRight->getOpcode() &&
1931 Right[i - 1] != IRight) {
1932 // Try not to destroy a broadcast for no apparent benefit.
1933 Left.push_back(IRight); 1934 Right.push_back(ILeft); 1935 } else if (i && ILeft->getOpcode() == IRight->getOpcode() && 1936 Right[i - 1] == ILeft) { 1937 // Try preserve broadcasts. 1938 Left.push_back(IRight); 1939 Right.push_back(ILeft); 1940 } else if (i && ILeft->getOpcode() == IRight->getOpcode() && 1941 Left[i - 1] == IRight) { 1942 // Try preserve broadcasts. 1943 Left.push_back(IRight); 1944 Right.push_back(ILeft); 1945 } else { 1946 Left.push_back(ILeft); 1947 Right.push_back(IRight); 1948 } 1949 continue; 1950 } 1951 // One opcode, put the instruction on the right. 1952 if (ILeft) { 1953 Left.push_back(VRight); 1954 Right.push_back(ILeft); 1955 continue; 1956 } 1957 Left.push_back(VLeft); 1958 Right.push_back(VRight); 1959 } 1960 1961 bool LeftBroadcast = isSplat(Left); 1962 bool RightBroadcast = isSplat(Right); 1963 1964 // If operands end up being broadcast return this operand order. 1965 if (LeftBroadcast || RightBroadcast) 1966 return; 1967 1968 // Don't reorder if the operands where good to begin. 1969 if (AllSameOpcodeRight || AllSameOpcodeLeft) { 1970 Left = OrigLeft; 1971 Right = OrigRight; 1972 } 1973 1974 // Finally check if we can get longer vectorizable chain by reordering 1975 // without breaking the good operand order detected above. 1976 // E.g. If we have something like- 1977 // load a[0] load b[0] 1978 // load b[1] load a[1] 1979 // load a[2] load b[2] 1980 // load a[3] load b[3] 1981 // Reordering the second load b[1] load a[1] would allow us to vectorize 1982 // this code and we still retain AllSameOpcode property. 1983 // FIXME: This load reordering might break AllSameOpcode in some rare cases 1984 // such as- 1985 // add a[0],c[0] load b[0] 1986 // add a[1],c[2] load b[1] 1987 // b[2] load b[2] 1988 // add a[3],c[3] load b[3] 1989 for (unsigned j = 0; j < VL.size() - 1; ++j) { 1990 if (LoadInst *L = dyn_cast<LoadInst>(Left[j])) { 1991 if (LoadInst *L1 = dyn_cast<LoadInst>(Right[j + 1])) { 1992 if (isConsecutiveAccess(L, L1)) { 1993 std::swap(Left[j + 1], Right[j + 1]); 1994 continue; 1995 } 1996 } 1997 } 1998 if (LoadInst *L = dyn_cast<LoadInst>(Right[j])) { 1999 if (LoadInst *L1 = dyn_cast<LoadInst>(Left[j + 1])) { 2000 if (isConsecutiveAccess(L, L1)) { 2001 std::swap(Left[j + 1], Right[j + 1]); 2002 continue; 2003 } 2004 } 2005 } 2006 // else unchanged 2007 } 2008 } 2009 2010 void BoUpSLP::setInsertPointAfterBundle(ArrayRef<Value *> VL) { 2011 Instruction *VL0 = cast<Instruction>(VL[0]); 2012 BasicBlock::iterator NextInst = VL0; 2013 ++NextInst; 2014 Builder.SetInsertPoint(VL0->getParent(), NextInst); 2015 Builder.SetCurrentDebugLocation(VL0->getDebugLoc()); 2016 } 2017 2018 Value *BoUpSLP::Gather(ArrayRef<Value *> VL, VectorType *Ty) { 2019 Value *Vec = UndefValue::get(Ty); 2020 // Generate the 'InsertElement' instruction. 2021 for (unsigned i = 0; i < Ty->getNumElements(); ++i) { 2022 Vec = Builder.CreateInsertElement(Vec, VL[i], Builder.getInt32(i)); 2023 if (Instruction *Insrt = dyn_cast<Instruction>(Vec)) { 2024 GatherSeq.insert(Insrt); 2025 CSEBlocks.insert(Insrt->getParent()); 2026 2027 // Add to our 'need-to-extract' list. 2028 if (ScalarToTreeEntry.count(VL[i])) { 2029 int Idx = ScalarToTreeEntry[VL[i]]; 2030 TreeEntry *E = &VectorizableTree[Idx]; 2031 // Find which lane we need to extract. 2032 int FoundLane = -1; 2033 for (unsigned Lane = 0, LE = VL.size(); Lane != LE; ++Lane) { 2034 // Is this the lane of the scalar that we are looking for ? 
2035 if (E->Scalars[Lane] == VL[i]) { 2036 FoundLane = Lane; 2037 break; 2038 } 2039 } 2040 assert(FoundLane >= 0 && "Could not find the correct lane"); 2041 ExternalUses.push_back(ExternalUser(VL[i], Insrt, FoundLane)); 2042 } 2043 } 2044 } 2045 2046 return Vec; 2047 } 2048 2049 Value *BoUpSLP::alreadyVectorized(ArrayRef<Value *> VL) const { 2050 SmallDenseMap<Value*, int>::const_iterator Entry 2051 = ScalarToTreeEntry.find(VL[0]); 2052 if (Entry != ScalarToTreeEntry.end()) { 2053 int Idx = Entry->second; 2054 const TreeEntry *En = &VectorizableTree[Idx]; 2055 if (En->isSame(VL) && En->VectorizedValue) 2056 return En->VectorizedValue; 2057 } 2058 return nullptr; 2059 } 2060 2061 Value *BoUpSLP::vectorizeTree(ArrayRef<Value *> VL) { 2062 if (ScalarToTreeEntry.count(VL[0])) { 2063 int Idx = ScalarToTreeEntry[VL[0]]; 2064 TreeEntry *E = &VectorizableTree[Idx]; 2065 if (E->isSame(VL)) 2066 return vectorizeTree(E); 2067 } 2068 2069 Type *ScalarTy = VL[0]->getType(); 2070 if (StoreInst *SI = dyn_cast<StoreInst>(VL[0])) 2071 ScalarTy = SI->getValueOperand()->getType(); 2072 VectorType *VecTy = VectorType::get(ScalarTy, VL.size()); 2073 2074 return Gather(VL, VecTy); 2075 } 2076 2077 Value *BoUpSLP::vectorizeTree(TreeEntry *E) { 2078 IRBuilder<>::InsertPointGuard Guard(Builder); 2079 2080 if (E->VectorizedValue) { 2081 DEBUG(dbgs() << "SLP: Diamond merged for " << *E->Scalars[0] << ".\n"); 2082 return E->VectorizedValue; 2083 } 2084 2085 Instruction *VL0 = cast<Instruction>(E->Scalars[0]); 2086 Type *ScalarTy = VL0->getType(); 2087 if (StoreInst *SI = dyn_cast<StoreInst>(VL0)) 2088 ScalarTy = SI->getValueOperand()->getType(); 2089 VectorType *VecTy = VectorType::get(ScalarTy, E->Scalars.size()); 2090 2091 if (E->NeedToGather) { 2092 setInsertPointAfterBundle(E->Scalars); 2093 return Gather(E->Scalars, VecTy); 2094 } 2095 2096 unsigned Opcode = getSameOpcode(E->Scalars); 2097 2098 switch (Opcode) { 2099 case Instruction::PHI: { 2100 PHINode *PH = dyn_cast<PHINode>(VL0); 2101 Builder.SetInsertPoint(PH->getParent()->getFirstNonPHI()); 2102 Builder.SetCurrentDebugLocation(PH->getDebugLoc()); 2103 PHINode *NewPhi = Builder.CreatePHI(VecTy, PH->getNumIncomingValues()); 2104 E->VectorizedValue = NewPhi; 2105 2106 // PHINodes may have multiple entries from the same block. We want to 2107 // visit every block once. 2108 SmallSet<BasicBlock*, 4> VisitedBBs; 2109 2110 for (unsigned i = 0, e = PH->getNumIncomingValues(); i < e; ++i) { 2111 ValueList Operands; 2112 BasicBlock *IBB = PH->getIncomingBlock(i); 2113 2114 if (!VisitedBBs.insert(IBB).second) { 2115 NewPhi->addIncoming(NewPhi->getIncomingValueForBlock(IBB), IBB); 2116 continue; 2117 } 2118 2119 // Prepare the operand vector. 
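// For this incoming block we collect the scalar incoming value of every
// PHI in the bundle; vectorizing that list yields the vector incoming
// value for the new PHI.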
2120 for (unsigned j = 0; j < E->Scalars.size(); ++j) 2121 Operands.push_back(cast<PHINode>(E->Scalars[j])-> 2122 getIncomingValueForBlock(IBB)); 2123 2124 Builder.SetInsertPoint(IBB->getTerminator()); 2125 Builder.SetCurrentDebugLocation(PH->getDebugLoc()); 2126 Value *Vec = vectorizeTree(Operands); 2127 NewPhi->addIncoming(Vec, IBB); 2128 } 2129 2130 assert(NewPhi->getNumIncomingValues() == PH->getNumIncomingValues() && 2131 "Invalid number of incoming values"); 2132 return NewPhi; 2133 } 2134 2135 case Instruction::ExtractElement: { 2136 if (CanReuseExtract(E->Scalars)) { 2137 Value *V = VL0->getOperand(0); 2138 E->VectorizedValue = V; 2139 return V; 2140 } 2141 return Gather(E->Scalars, VecTy); 2142 } 2143 case Instruction::ZExt: 2144 case Instruction::SExt: 2145 case Instruction::FPToUI: 2146 case Instruction::FPToSI: 2147 case Instruction::FPExt: 2148 case Instruction::PtrToInt: 2149 case Instruction::IntToPtr: 2150 case Instruction::SIToFP: 2151 case Instruction::UIToFP: 2152 case Instruction::Trunc: 2153 case Instruction::FPTrunc: 2154 case Instruction::BitCast: { 2155 ValueList INVL; 2156 for (int i = 0, e = E->Scalars.size(); i < e; ++i) 2157 INVL.push_back(cast<Instruction>(E->Scalars[i])->getOperand(0)); 2158 2159 setInsertPointAfterBundle(E->Scalars); 2160 2161 Value *InVec = vectorizeTree(INVL); 2162 2163 if (Value *V = alreadyVectorized(E->Scalars)) 2164 return V; 2165 2166 CastInst *CI = dyn_cast<CastInst>(VL0); 2167 Value *V = Builder.CreateCast(CI->getOpcode(), InVec, VecTy); 2168 E->VectorizedValue = V; 2169 ++NumVectorInstructions; 2170 return V; 2171 } 2172 case Instruction::FCmp: 2173 case Instruction::ICmp: { 2174 ValueList LHSV, RHSV; 2175 for (int i = 0, e = E->Scalars.size(); i < e; ++i) { 2176 LHSV.push_back(cast<Instruction>(E->Scalars[i])->getOperand(0)); 2177 RHSV.push_back(cast<Instruction>(E->Scalars[i])->getOperand(1)); 2178 } 2179 2180 setInsertPointAfterBundle(E->Scalars); 2181 2182 Value *L = vectorizeTree(LHSV); 2183 Value *R = vectorizeTree(RHSV); 2184 2185 if (Value *V = alreadyVectorized(E->Scalars)) 2186 return V; 2187 2188 CmpInst::Predicate P0 = dyn_cast<CmpInst>(VL0)->getPredicate(); 2189 Value *V; 2190 if (Opcode == Instruction::FCmp) 2191 V = Builder.CreateFCmp(P0, L, R); 2192 else 2193 V = Builder.CreateICmp(P0, L, R); 2194 2195 E->VectorizedValue = V; 2196 ++NumVectorInstructions; 2197 return V; 2198 } 2199 case Instruction::Select: { 2200 ValueList TrueVec, FalseVec, CondVec; 2201 for (int i = 0, e = E->Scalars.size(); i < e; ++i) { 2202 CondVec.push_back(cast<Instruction>(E->Scalars[i])->getOperand(0)); 2203 TrueVec.push_back(cast<Instruction>(E->Scalars[i])->getOperand(1)); 2204 FalseVec.push_back(cast<Instruction>(E->Scalars[i])->getOperand(2)); 2205 } 2206 2207 setInsertPointAfterBundle(E->Scalars); 2208 2209 Value *Cond = vectorizeTree(CondVec); 2210 Value *True = vectorizeTree(TrueVec); 2211 Value *False = vectorizeTree(FalseVec); 2212 2213 if (Value *V = alreadyVectorized(E->Scalars)) 2214 return V; 2215 2216 Value *V = Builder.CreateSelect(Cond, True, False); 2217 E->VectorizedValue = V; 2218 ++NumVectorInstructions; 2219 return V; 2220 } 2221 case Instruction::Add: 2222 case Instruction::FAdd: 2223 case Instruction::Sub: 2224 case Instruction::FSub: 2225 case Instruction::Mul: 2226 case Instruction::FMul: 2227 case Instruction::UDiv: 2228 case Instruction::SDiv: 2229 case Instruction::FDiv: 2230 case Instruction::URem: 2231 case Instruction::SRem: 2232 case Instruction::FRem: 2233 case Instruction::Shl: 2234 case Instruction::LShr: 
2235 case Instruction::AShr: 2236 case Instruction::And: 2237 case Instruction::Or: 2238 case Instruction::Xor: { 2239 ValueList LHSVL, RHSVL; 2240 if (isa<BinaryOperator>(VL0) && VL0->isCommutative()) 2241 reorderInputsAccordingToOpcode(E->Scalars, LHSVL, RHSVL); 2242 else 2243 for (int i = 0, e = E->Scalars.size(); i < e; ++i) { 2244 LHSVL.push_back(cast<Instruction>(E->Scalars[i])->getOperand(0)); 2245 RHSVL.push_back(cast<Instruction>(E->Scalars[i])->getOperand(1)); 2246 } 2247 2248 setInsertPointAfterBundle(E->Scalars); 2249 2250 Value *LHS = vectorizeTree(LHSVL); 2251 Value *RHS = vectorizeTree(RHSVL); 2252 2253 if (LHS == RHS && isa<Instruction>(LHS)) { 2254 assert((VL0->getOperand(0) == VL0->getOperand(1)) && "Invalid order"); 2255 } 2256 2257 if (Value *V = alreadyVectorized(E->Scalars)) 2258 return V; 2259 2260 BinaryOperator *BinOp = cast<BinaryOperator>(VL0); 2261 Value *V = Builder.CreateBinOp(BinOp->getOpcode(), LHS, RHS); 2262 E->VectorizedValue = V; 2263 propagateIRFlags(E->VectorizedValue, E->Scalars); 2264 ++NumVectorInstructions; 2265 2266 if (Instruction *I = dyn_cast<Instruction>(V)) 2267 return propagateMetadata(I, E->Scalars); 2268 2269 return V; 2270 } 2271 case Instruction::Load: { 2272 // Loads are inserted at the head of the tree because we don't want to 2273 // sink them all the way down past store instructions. 2274 setInsertPointAfterBundle(E->Scalars); 2275 2276 LoadInst *LI = cast<LoadInst>(VL0); 2277 Type *ScalarLoadTy = LI->getType(); 2278 unsigned AS = LI->getPointerAddressSpace(); 2279 2280 Value *VecPtr = Builder.CreateBitCast(LI->getPointerOperand(), 2281 VecTy->getPointerTo(AS)); 2282 2283 // The pointer operand uses an in-tree scalar so we add the new BitCast to 2284 // ExternalUses list to make sure that an extract will be generated in the 2285 // future. 2286 if (ScalarToTreeEntry.count(LI->getPointerOperand())) 2287 ExternalUses.push_back( 2288 ExternalUser(LI->getPointerOperand(), cast<User>(VecPtr), 0)); 2289 2290 unsigned Alignment = LI->getAlignment(); 2291 LI = Builder.CreateLoad(VecPtr); 2292 if (!Alignment) 2293 Alignment = DL->getABITypeAlignment(ScalarLoadTy); 2294 LI->setAlignment(Alignment); 2295 E->VectorizedValue = LI; 2296 ++NumVectorInstructions; 2297 return propagateMetadata(LI, E->Scalars); 2298 } 2299 case Instruction::Store: { 2300 StoreInst *SI = cast<StoreInst>(VL0); 2301 unsigned Alignment = SI->getAlignment(); 2302 unsigned AS = SI->getPointerAddressSpace(); 2303 2304 ValueList ValueOp; 2305 for (int i = 0, e = E->Scalars.size(); i < e; ++i) 2306 ValueOp.push_back(cast<StoreInst>(E->Scalars[i])->getValueOperand()); 2307 2308 setInsertPointAfterBundle(E->Scalars); 2309 2310 Value *VecValue = vectorizeTree(ValueOp); 2311 Value *VecPtr = Builder.CreateBitCast(SI->getPointerOperand(), 2312 VecTy->getPointerTo(AS)); 2313 StoreInst *S = Builder.CreateStore(VecValue, VecPtr); 2314 2315 // The pointer operand uses an in-tree scalar so we add the new BitCast to 2316 // ExternalUses list to make sure that an extract will be generated in the 2317 // future. 
2318 if (ScalarToTreeEntry.count(SI->getPointerOperand())) 2319 ExternalUses.push_back( 2320 ExternalUser(SI->getPointerOperand(), cast<User>(VecPtr), 0)); 2321 2322 if (!Alignment) 2323 Alignment = DL->getABITypeAlignment(SI->getValueOperand()->getType()); 2324 S->setAlignment(Alignment); 2325 E->VectorizedValue = S; 2326 ++NumVectorInstructions; 2327 return propagateMetadata(S, E->Scalars); 2328 } 2329 case Instruction::GetElementPtr: { 2330 setInsertPointAfterBundle(E->Scalars); 2331 2332 ValueList Op0VL; 2333 for (int i = 0, e = E->Scalars.size(); i < e; ++i) 2334 Op0VL.push_back(cast<GetElementPtrInst>(E->Scalars[i])->getOperand(0)); 2335 2336 Value *Op0 = vectorizeTree(Op0VL); 2337 2338 std::vector<Value *> OpVecs; 2339 for (int j = 1, e = cast<GetElementPtrInst>(VL0)->getNumOperands(); j < e; 2340 ++j) { 2341 ValueList OpVL; 2342 for (int i = 0, e = E->Scalars.size(); i < e; ++i) 2343 OpVL.push_back(cast<GetElementPtrInst>(E->Scalars[i])->getOperand(j)); 2344 2345 Value *OpVec = vectorizeTree(OpVL); 2346 OpVecs.push_back(OpVec); 2347 } 2348 2349 Value *V = Builder.CreateGEP(Op0, OpVecs); 2350 E->VectorizedValue = V; 2351 ++NumVectorInstructions; 2352 2353 if (Instruction *I = dyn_cast<Instruction>(V)) 2354 return propagateMetadata(I, E->Scalars); 2355 2356 return V; 2357 } 2358 case Instruction::Call: { 2359 CallInst *CI = cast<CallInst>(VL0); 2360 setInsertPointAfterBundle(E->Scalars); 2361 Function *FI; 2362 Intrinsic::ID IID = Intrinsic::not_intrinsic; 2363 Value *ScalarArg = nullptr; 2364 if (CI && (FI = CI->getCalledFunction())) { 2365 IID = (Intrinsic::ID) FI->getIntrinsicID(); 2366 } 2367 std::vector<Value *> OpVecs; 2368 for (int j = 0, e = CI->getNumArgOperands(); j < e; ++j) { 2369 ValueList OpVL; 2370 // ctlz,cttz and powi are special intrinsics whose second argument is 2371 // a scalar. This argument should not be vectorized. 2372 if (hasVectorInstrinsicScalarOpd(IID, 1) && j == 1) { 2373 CallInst *CEI = cast<CallInst>(E->Scalars[0]); 2374 ScalarArg = CEI->getArgOperand(j); 2375 OpVecs.push_back(CEI->getArgOperand(j)); 2376 continue; 2377 } 2378 for (int i = 0, e = E->Scalars.size(); i < e; ++i) { 2379 CallInst *CEI = cast<CallInst>(E->Scalars[i]); 2380 OpVL.push_back(CEI->getArgOperand(j)); 2381 } 2382 2383 Value *OpVec = vectorizeTree(OpVL); 2384 DEBUG(dbgs() << "SLP: OpVec[" << j << "]: " << *OpVec << "\n"); 2385 OpVecs.push_back(OpVec); 2386 } 2387 2388 Module *M = F->getParent(); 2389 Intrinsic::ID ID = getIntrinsicIDForCall(CI, TLI); 2390 Type *Tys[] = { VectorType::get(CI->getType(), E->Scalars.size()) }; 2391 Function *CF = Intrinsic::getDeclaration(M, ID, Tys); 2392 Value *V = Builder.CreateCall(CF, OpVecs); 2393 2394 // The scalar argument uses an in-tree scalar so we add the new vectorized 2395 // call to ExternalUses list to make sure that an extract will be 2396 // generated in the future. 
2397 if (ScalarArg && ScalarToTreeEntry.count(ScalarArg)) 2398 ExternalUses.push_back(ExternalUser(ScalarArg, cast<User>(V), 0)); 2399 2400 E->VectorizedValue = V; 2401 ++NumVectorInstructions; 2402 return V; 2403 } 2404 case Instruction::ShuffleVector: { 2405 ValueList LHSVL, RHSVL; 2406 assert(isa<BinaryOperator>(VL0) && "Invalid Shuffle Vector Operand"); 2407 reorderAltShuffleOperands(E->Scalars, LHSVL, RHSVL); 2408 setInsertPointAfterBundle(E->Scalars); 2409 2410 Value *LHS = vectorizeTree(LHSVL); 2411 Value *RHS = vectorizeTree(RHSVL); 2412 2413 if (Value *V = alreadyVectorized(E->Scalars)) 2414 return V; 2415 2416 // Create a vector of LHS op1 RHS 2417 BinaryOperator *BinOp0 = cast<BinaryOperator>(VL0); 2418 Value *V0 = Builder.CreateBinOp(BinOp0->getOpcode(), LHS, RHS); 2419 2420 // Create a vector of LHS op2 RHS 2421 Instruction *VL1 = cast<Instruction>(E->Scalars[1]); 2422 BinaryOperator *BinOp1 = cast<BinaryOperator>(VL1); 2423 Value *V1 = Builder.CreateBinOp(BinOp1->getOpcode(), LHS, RHS); 2424 2425 // Create shuffle to take alternate operations from the vector. 2426 // Also, gather up odd and even scalar ops to propagate IR flags to 2427 // each vector operation. 2428 ValueList OddScalars, EvenScalars; 2429 unsigned e = E->Scalars.size(); 2430 SmallVector<Constant *, 8> Mask(e); 2431 for (unsigned i = 0; i < e; ++i) { 2432 if (i & 1) { 2433 Mask[i] = Builder.getInt32(e + i); 2434 OddScalars.push_back(E->Scalars[i]); 2435 } else { 2436 Mask[i] = Builder.getInt32(i); 2437 EvenScalars.push_back(E->Scalars[i]); 2438 } 2439 } 2440 2441 Value *ShuffleMask = ConstantVector::get(Mask); 2442 propagateIRFlags(V0, EvenScalars); 2443 propagateIRFlags(V1, OddScalars); 2444 2445 Value *V = Builder.CreateShuffleVector(V0, V1, ShuffleMask); 2446 E->VectorizedValue = V; 2447 ++NumVectorInstructions; 2448 if (Instruction *I = dyn_cast<Instruction>(V)) 2449 return propagateMetadata(I, E->Scalars); 2450 2451 return V; 2452 } 2453 default: 2454 llvm_unreachable("unknown inst"); 2455 } 2456 return nullptr; 2457 } 2458 2459 Value *BoUpSLP::vectorizeTree() { 2460 2461 // All blocks must be scheduled before any instructions are inserted. 2462 for (auto &BSIter : BlocksSchedules) { 2463 scheduleBlock(BSIter.second.get()); 2464 } 2465 2466 Builder.SetInsertPoint(F->getEntryBlock().begin()); 2467 vectorizeTree(&VectorizableTree[0]); 2468 2469 DEBUG(dbgs() << "SLP: Extracting " << ExternalUses.size() << " values .\n"); 2470 2471 // Extract all of the elements with the external uses. 2472 for (UserList::iterator it = ExternalUses.begin(), e = ExternalUses.end(); 2473 it != e; ++it) { 2474 Value *Scalar = it->Scalar; 2475 llvm::User *User = it->User; 2476 2477 // Skip users that we already RAUW. This happens when one instruction 2478 // has multiple uses of the same value. 2479 if (std::find(Scalar->user_begin(), Scalar->user_end(), User) == 2480 Scalar->user_end()) 2481 continue; 2482 assert(ScalarToTreeEntry.count(Scalar) && "Invalid scalar"); 2483 2484 int Idx = ScalarToTreeEntry[Scalar]; 2485 TreeEntry *E = &VectorizableTree[Idx]; 2486 assert(!E->NeedToGather && "Extracting from a gather list"); 2487 2488 Value *Vec = E->VectorizedValue; 2489 assert(Vec && "Can't find vectorizable value"); 2490 2491 Value *Lane = Builder.getInt32(it->Lane); 2492 // Generate extracts for out-of-tree users. 2493 // Find the insertion point for the extractelement lane. 
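// A PHI user wants the extract in the matching predecessor block (before
// its terminator), any other instruction user gets the extract right before
// itself, and if the vectorized value is not an instruction (e.g. it was
// folded to a constant) we extract in the function entry block.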
2494 if (isa<Instruction>(Vec)){ 2495 if (PHINode *PH = dyn_cast<PHINode>(User)) { 2496 for (int i = 0, e = PH->getNumIncomingValues(); i != e; ++i) { 2497 if (PH->getIncomingValue(i) == Scalar) { 2498 Builder.SetInsertPoint(PH->getIncomingBlock(i)->getTerminator()); 2499 Value *Ex = Builder.CreateExtractElement(Vec, Lane); 2500 CSEBlocks.insert(PH->getIncomingBlock(i)); 2501 PH->setOperand(i, Ex); 2502 } 2503 } 2504 } else { 2505 Builder.SetInsertPoint(cast<Instruction>(User)); 2506 Value *Ex = Builder.CreateExtractElement(Vec, Lane); 2507 CSEBlocks.insert(cast<Instruction>(User)->getParent()); 2508 User->replaceUsesOfWith(Scalar, Ex); 2509 } 2510 } else { 2511 Builder.SetInsertPoint(F->getEntryBlock().begin()); 2512 Value *Ex = Builder.CreateExtractElement(Vec, Lane); 2513 CSEBlocks.insert(&F->getEntryBlock()); 2514 User->replaceUsesOfWith(Scalar, Ex); 2515 } 2516 2517 DEBUG(dbgs() << "SLP: Replaced:" << *User << ".\n"); 2518 } 2519 2520 // For each vectorized value: 2521 for (int EIdx = 0, EE = VectorizableTree.size(); EIdx < EE; ++EIdx) { 2522 TreeEntry *Entry = &VectorizableTree[EIdx]; 2523 2524 // For each lane: 2525 for (int Lane = 0, LE = Entry->Scalars.size(); Lane != LE; ++Lane) { 2526 Value *Scalar = Entry->Scalars[Lane]; 2527 // No need to handle users of gathered values. 2528 if (Entry->NeedToGather) 2529 continue; 2530 2531 assert(Entry->VectorizedValue && "Can't find vectorizable value"); 2532 2533 Type *Ty = Scalar->getType(); 2534 if (!Ty->isVoidTy()) { 2535 #ifndef NDEBUG 2536 for (User *U : Scalar->users()) { 2537 DEBUG(dbgs() << "SLP: \tvalidating user:" << *U << ".\n"); 2538 2539 assert((ScalarToTreeEntry.count(U) || 2540 // It is legal to replace users in the ignorelist by undef. 2541 (std::find(UserIgnoreList.begin(), UserIgnoreList.end(), U) != 2542 UserIgnoreList.end())) && 2543 "Replacing out-of-tree value with undef"); 2544 } 2545 #endif 2546 Value *Undef = UndefValue::get(Ty); 2547 Scalar->replaceAllUsesWith(Undef); 2548 } 2549 DEBUG(dbgs() << "SLP: \tErasing scalar:" << *Scalar << ".\n"); 2550 eraseInstruction(cast<Instruction>(Scalar)); 2551 } 2552 } 2553 2554 Builder.ClearInsertionPoint(); 2555 2556 return VectorizableTree[0].VectorizedValue; 2557 } 2558 2559 void BoUpSLP::optimizeGatherSequence() { 2560 DEBUG(dbgs() << "SLP: Optimizing " << GatherSeq.size() 2561 << " gather sequences instructions.\n"); 2562 // LICM InsertElementInst sequences. 2563 for (SetVector<Instruction *>::iterator it = GatherSeq.begin(), 2564 e = GatherSeq.end(); it != e; ++it) { 2565 InsertElementInst *Insert = dyn_cast<InsertElementInst>(*it); 2566 2567 if (!Insert) 2568 continue; 2569 2570 // Check if this block is inside a loop. 2571 Loop *L = LI->getLoopFor(Insert->getParent()); 2572 if (!L) 2573 continue; 2574 2575 // Check if it has a preheader. 2576 BasicBlock *PreHeader = L->getLoopPreheader(); 2577 if (!PreHeader) 2578 continue; 2579 2580 // If the vector or the element that we insert into it are 2581 // instructions that are defined in this basic block then we can't 2582 // hoist this instruction. 2583 Instruction *CurrVec = dyn_cast<Instruction>(Insert->getOperand(0)); 2584 Instruction *NewElem = dyn_cast<Instruction>(Insert->getOperand(1)); 2585 if (CurrVec && L->contains(CurrVec)) 2586 continue; 2587 if (NewElem && L->contains(NewElem)) 2588 continue; 2589 2590 // We can hoist this instruction. Move it to the pre-header. 2591 Insert->moveBefore(PreHeader->getTerminator()); 2592 } 2593 2594 // Make a list of all reachable blocks in our CSE queue. 
2595 SmallVector<const DomTreeNode *, 8> CSEWorkList; 2596 CSEWorkList.reserve(CSEBlocks.size()); 2597 for (BasicBlock *BB : CSEBlocks) 2598 if (DomTreeNode *N = DT->getNode(BB)) { 2599 assert(DT->isReachableFromEntry(N)); 2600 CSEWorkList.push_back(N); 2601 } 2602 2603 // Sort blocks by domination. This ensures we visit a block after all blocks 2604 // dominating it are visited. 2605 std::stable_sort(CSEWorkList.begin(), CSEWorkList.end(), 2606 [this](const DomTreeNode *A, const DomTreeNode *B) { 2607 return DT->properlyDominates(A, B); 2608 }); 2609 2610 // Perform O(N^2) search over the gather sequences and merge identical 2611 // instructions. TODO: We can further optimize this scan if we split the 2612 // instructions into different buckets based on the insert lane. 2613 SmallVector<Instruction *, 16> Visited; 2614 for (auto I = CSEWorkList.begin(), E = CSEWorkList.end(); I != E; ++I) { 2615 assert((I == CSEWorkList.begin() || !DT->dominates(*I, *std::prev(I))) && 2616 "Worklist not sorted properly!"); 2617 BasicBlock *BB = (*I)->getBlock(); 2618 // For all instructions in blocks containing gather sequences: 2619 for (BasicBlock::iterator it = BB->begin(), e = BB->end(); it != e;) { 2620 Instruction *In = it++; 2621 if (!isa<InsertElementInst>(In) && !isa<ExtractElementInst>(In)) 2622 continue; 2623 2624 // Check if we can replace this instruction with any of the 2625 // visited instructions. 2626 for (SmallVectorImpl<Instruction *>::iterator v = Visited.begin(), 2627 ve = Visited.end(); 2628 v != ve; ++v) { 2629 if (In->isIdenticalTo(*v) && 2630 DT->dominates((*v)->getParent(), In->getParent())) { 2631 In->replaceAllUsesWith(*v); 2632 eraseInstruction(In); 2633 In = nullptr; 2634 break; 2635 } 2636 } 2637 if (In) { 2638 assert(std::find(Visited.begin(), Visited.end(), In) == Visited.end()); 2639 Visited.push_back(In); 2640 } 2641 } 2642 } 2643 CSEBlocks.clear(); 2644 GatherSeq.clear(); 2645 } 2646 2647 // Groups the instructions to a bundle (which is then a single scheduling entity) 2648 // and schedules instructions until the bundle gets ready. 2649 bool BoUpSLP::BlockScheduling::tryScheduleBundle(ArrayRef<Value *> VL, 2650 BoUpSLP *SLP) { 2651 if (isa<PHINode>(VL[0])) 2652 return true; 2653 2654 // Initialize the instruction bundle. 2655 Instruction *OldScheduleEnd = ScheduleEnd; 2656 ScheduleData *PrevInBundle = nullptr; 2657 ScheduleData *Bundle = nullptr; 2658 bool ReSchedule = false; 2659 DEBUG(dbgs() << "SLP: bundle: " << *VL[0] << "\n"); 2660 for (Value *V : VL) { 2661 extendSchedulingRegion(V); 2662 ScheduleData *BundleMember = getScheduleData(V); 2663 assert(BundleMember && 2664 "no ScheduleData for bundle member (maybe not in same basic block)"); 2665 if (BundleMember->IsScheduled) { 2666 // A bundle member was scheduled as single instruction before and now 2667 // needs to be scheduled as part of the bundle. We just get rid of the 2668 // existing schedule. 2669 DEBUG(dbgs() << "SLP: reset schedule because " << *BundleMember 2670 << " was already scheduled\n"); 2671 ReSchedule = true; 2672 } 2673 assert(BundleMember->isSchedulingEntity() && 2674 "bundle member already part of other bundle"); 2675 if (PrevInBundle) { 2676 PrevInBundle->NextInBundle = BundleMember; 2677 } else { 2678 Bundle = BundleMember; 2679 } 2680 BundleMember->UnscheduledDepsInBundle = 0; 2681 Bundle->UnscheduledDepsInBundle += BundleMember->UnscheduledDeps; 2682 2683 // Group the instructions to a bundle. 
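// Every member ends up pointing at the same FirstInBundle and the members
// are chained via NextInBundle, so the whole bundle is handled as a single
// scheduling entity from here on.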
2684 BundleMember->FirstInBundle = Bundle; 2685 PrevInBundle = BundleMember; 2686 } 2687 if (ScheduleEnd != OldScheduleEnd) { 2688 // The scheduling region got new instructions at the lower end (or it is a 2689 // new region for the first bundle). This makes it necessary to 2690 // recalculate all dependencies. 2691 // It is seldom that this needs to be done a second time after adding the 2692 // initial bundle to the region. 2693 for (auto *I = ScheduleStart; I != ScheduleEnd; I = I->getNextNode()) { 2694 ScheduleData *SD = getScheduleData(I); 2695 SD->clearDependencies(); 2696 } 2697 ReSchedule = true; 2698 } 2699 if (ReSchedule) { 2700 resetSchedule(); 2701 initialFillReadyList(ReadyInsts); 2702 } 2703 2704 DEBUG(dbgs() << "SLP: try schedule bundle " << *Bundle << " in block " 2705 << BB->getName() << "\n"); 2706 2707 calculateDependencies(Bundle, true, SLP); 2708 2709 // Now try to schedule the new bundle. As soon as the bundle is "ready" it 2710 // means that there are no cyclic dependencies and we can schedule it. 2711 // Note that's important that we don't "schedule" the bundle yet (see 2712 // cancelScheduling). 2713 while (!Bundle->isReady() && !ReadyInsts.empty()) { 2714 2715 ScheduleData *pickedSD = ReadyInsts.back(); 2716 ReadyInsts.pop_back(); 2717 2718 if (pickedSD->isSchedulingEntity() && pickedSD->isReady()) { 2719 schedule(pickedSD, ReadyInsts); 2720 } 2721 } 2722 return Bundle->isReady(); 2723 } 2724 2725 void BoUpSLP::BlockScheduling::cancelScheduling(ArrayRef<Value *> VL) { 2726 if (isa<PHINode>(VL[0])) 2727 return; 2728 2729 ScheduleData *Bundle = getScheduleData(VL[0]); 2730 DEBUG(dbgs() << "SLP: cancel scheduling of " << *Bundle << "\n"); 2731 assert(!Bundle->IsScheduled && 2732 "Can't cancel bundle which is already scheduled"); 2733 assert(Bundle->isSchedulingEntity() && Bundle->isPartOfBundle() && 2734 "tried to unbundle something which is not a bundle"); 2735 2736 // Un-bundle: make single instructions out of the bundle. 2737 ScheduleData *BundleMember = Bundle; 2738 while (BundleMember) { 2739 assert(BundleMember->FirstInBundle == Bundle && "corrupt bundle links"); 2740 BundleMember->FirstInBundle = BundleMember; 2741 ScheduleData *Next = BundleMember->NextInBundle; 2742 BundleMember->NextInBundle = nullptr; 2743 BundleMember->UnscheduledDepsInBundle = BundleMember->UnscheduledDeps; 2744 if (BundleMember->UnscheduledDepsInBundle == 0) { 2745 ReadyInsts.insert(BundleMember); 2746 } 2747 BundleMember = Next; 2748 } 2749 } 2750 2751 void BoUpSLP::BlockScheduling::extendSchedulingRegion(Value *V) { 2752 if (getScheduleData(V)) 2753 return; 2754 Instruction *I = dyn_cast<Instruction>(V); 2755 assert(I && "bundle member must be an instruction"); 2756 assert(!isa<PHINode>(I) && "phi nodes don't need to be scheduled"); 2757 if (!ScheduleStart) { 2758 // It's the first instruction in the new region. 2759 initScheduleData(I, I->getNextNode(), nullptr, nullptr); 2760 ScheduleStart = I; 2761 ScheduleEnd = I->getNextNode(); 2762 assert(ScheduleEnd && "tried to vectorize a TerminatorInst?"); 2763 DEBUG(dbgs() << "SLP: initialize schedule region to " << *I << "\n"); 2764 return; 2765 } 2766 // Search up and down at the same time, because we don't know if the new 2767 // instruction is above or below the existing scheduling region. 
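// We advance one step in each direction per iteration; whichever side
// reaches I first tells us whether to grow the region upwards (new
// ScheduleStart) or downwards (new ScheduleEnd).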
2768 BasicBlock::reverse_iterator UpIter(ScheduleStart); 2769 BasicBlock::reverse_iterator UpperEnd = BB->rend(); 2770 BasicBlock::iterator DownIter(ScheduleEnd); 2771 BasicBlock::iterator LowerEnd = BB->end(); 2772 for (;;) { 2773 if (UpIter != UpperEnd) { 2774 if (&*UpIter == I) { 2775 initScheduleData(I, ScheduleStart, nullptr, FirstLoadStoreInRegion); 2776 ScheduleStart = I; 2777 DEBUG(dbgs() << "SLP: extend schedule region start to " << *I << "\n"); 2778 return; 2779 } 2780 UpIter++; 2781 } 2782 if (DownIter != LowerEnd) { 2783 if (&*DownIter == I) { 2784 initScheduleData(ScheduleEnd, I->getNextNode(), LastLoadStoreInRegion, 2785 nullptr); 2786 ScheduleEnd = I->getNextNode(); 2787 assert(ScheduleEnd && "tried to vectorize a TerminatorInst?"); 2788 DEBUG(dbgs() << "SLP: extend schedule region end to " << *I << "\n"); 2789 return; 2790 } 2791 DownIter++; 2792 } 2793 assert((UpIter != UpperEnd || DownIter != LowerEnd) && 2794 "instruction not found in block"); 2795 } 2796 } 2797 2798 void BoUpSLP::BlockScheduling::initScheduleData(Instruction *FromI, 2799 Instruction *ToI, 2800 ScheduleData *PrevLoadStore, 2801 ScheduleData *NextLoadStore) { 2802 ScheduleData *CurrentLoadStore = PrevLoadStore; 2803 for (Instruction *I = FromI; I != ToI; I = I->getNextNode()) { 2804 ScheduleData *SD = ScheduleDataMap[I]; 2805 if (!SD) { 2806 // Allocate a new ScheduleData for the instruction. 2807 if (ChunkPos >= ChunkSize) { 2808 ScheduleDataChunks.push_back( 2809 llvm::make_unique<ScheduleData[]>(ChunkSize)); 2810 ChunkPos = 0; 2811 } 2812 SD = &(ScheduleDataChunks.back()[ChunkPos++]); 2813 ScheduleDataMap[I] = SD; 2814 SD->Inst = I; 2815 } 2816 assert(!isInSchedulingRegion(SD) && 2817 "new ScheduleData already in scheduling region"); 2818 SD->init(SchedulingRegionID); 2819 2820 if (I->mayReadOrWriteMemory()) { 2821 // Update the linked list of memory accessing instructions. 2822 if (CurrentLoadStore) { 2823 CurrentLoadStore->NextLoadStore = SD; 2824 } else { 2825 FirstLoadStoreInRegion = SD; 2826 } 2827 CurrentLoadStore = SD; 2828 } 2829 } 2830 if (NextLoadStore) { 2831 if (CurrentLoadStore) 2832 CurrentLoadStore->NextLoadStore = NextLoadStore; 2833 } else { 2834 LastLoadStoreInRegion = CurrentLoadStore; 2835 } 2836 } 2837 2838 void BoUpSLP::BlockScheduling::calculateDependencies(ScheduleData *SD, 2839 bool InsertInReadyList, 2840 BoUpSLP *SLP) { 2841 assert(SD->isSchedulingEntity()); 2842 2843 SmallVector<ScheduleData *, 10> WorkList; 2844 WorkList.push_back(SD); 2845 2846 while (!WorkList.empty()) { 2847 ScheduleData *SD = WorkList.back(); 2848 WorkList.pop_back(); 2849 2850 ScheduleData *BundleMember = SD; 2851 while (BundleMember) { 2852 assert(isInSchedulingRegion(BundleMember)); 2853 if (!BundleMember->hasValidDependencies()) { 2854 2855 DEBUG(dbgs() << "SLP: update deps of " << *BundleMember << "\n"); 2856 BundleMember->Dependencies = 0; 2857 BundleMember->resetUnscheduledDeps(); 2858 2859 // Handle def-use chain dependencies. 2860 for (User *U : BundleMember->Inst->users()) { 2861 if (isa<Instruction>(U)) { 2862 ScheduleData *UseSD = getScheduleData(U); 2863 if (UseSD && isInSchedulingRegion(UseSD->FirstInBundle)) { 2864 BundleMember->Dependencies++; 2865 ScheduleData *DestBundle = UseSD->FirstInBundle; 2866 if (!DestBundle->IsScheduled) { 2867 BundleMember->incrementUnscheduledDeps(1); 2868 } 2869 if (!DestBundle->hasValidDependencies()) { 2870 WorkList.push_back(DestBundle); 2871 } 2872 } 2873 } else { 2874 // I'm not sure if this can ever happen. But we need to be safe. 
2875 // This lets the instruction/bundle never be scheduled and eventually
2876 // disables vectorization.
2877 BundleMember->Dependencies++;
2878 BundleMember->incrementUnscheduledDeps(1);
2879 }
2880 }
2881
2882 // Handle the memory dependencies.
2883 ScheduleData *DepDest = BundleMember->NextLoadStore;
2884 if (DepDest) {
2885 Instruction *SrcInst = BundleMember->Inst;
2886 AliasAnalysis::Location SrcLoc = getLocation(SrcInst, SLP->AA);
2887 bool SrcMayWrite = BundleMember->Inst->mayWriteToMemory();
2888 unsigned numAliased = 0;
2889 unsigned DistToSrc = 1;
2890
2891 while (DepDest) {
2892 assert(isInSchedulingRegion(DepDest));
2893
2894 // We have two limits to reduce the complexity:
2895 // 1) AliasedCheckLimit: It's a small limit to reduce calls to
2896 // SLP->isAliased (which is the expensive part in this loop).
2897 // 2) MaxMemDepDistance: It's for very large blocks and it aborts
2898 // the whole loop (even if the loop is fast, it's quadratic).
2899 // It's important for the loop break condition (see below) to
2900 // check this limit even between two read-only instructions.
2901 if (DistToSrc >= MaxMemDepDistance ||
2902 ((SrcMayWrite || DepDest->Inst->mayWriteToMemory()) &&
2903 (numAliased >= AliasedCheckLimit ||
2904 SLP->isAliased(SrcLoc, SrcInst, DepDest->Inst)))) {
2905
2906 // We increment the counter only if the locations are aliased
2907 // (instead of counting all alias checks). This gives a better
2908 // balance between reduced runtime and accurate dependencies.
2909 numAliased++;
2910
2911 DepDest->MemoryDependencies.push_back(BundleMember);
2912 BundleMember->Dependencies++;
2913 ScheduleData *DestBundle = DepDest->FirstInBundle;
2914 if (!DestBundle->IsScheduled) {
2915 BundleMember->incrementUnscheduledDeps(1);
2916 }
2917 if (!DestBundle->hasValidDependencies()) {
2918 WorkList.push_back(DestBundle);
2919 }
2920 }
2921 DepDest = DepDest->NextLoadStore;
2922
2923 // Example, explaining the loop break condition: Let's assume our
2924 // starting instruction is i0 and MaxMemDepDistance = 3.
2925 //
2926 // +--------v--v--v
2927 // i0,i1,i2,i3,i4,i5,i6,i7,i8
2928 // +--------^--^--^
2929 //
2930 // MaxMemDepDistance lets us stop alias-checking at i3 and we add
2931 // dependencies from i0 to i3,i4,.. (even if they are not aliased).
2932 // Previously we already added dependencies from i3 to i6,i7,i8
2933 // (because of MaxMemDepDistance). As we added a dependency from
2934 // i0 to i3, we have transitive dependencies from i0 to i6,i7,i8
2935 // and we can abort this loop at i6.
2936 if (DistToSrc >= 2 * MaxMemDepDistance) 2937 break; 2938 DistToSrc++; 2939 } 2940 } 2941 } 2942 BundleMember = BundleMember->NextInBundle; 2943 } 2944 if (InsertInReadyList && SD->isReady()) { 2945 ReadyInsts.push_back(SD); 2946 DEBUG(dbgs() << "SLP: gets ready on update: " << *SD->Inst << "\n"); 2947 } 2948 } 2949 } 2950 2951 void BoUpSLP::BlockScheduling::resetSchedule() { 2952 assert(ScheduleStart && 2953 "tried to reset schedule on block which has not been scheduled"); 2954 for (Instruction *I = ScheduleStart; I != ScheduleEnd; I = I->getNextNode()) { 2955 ScheduleData *SD = getScheduleData(I); 2956 assert(isInSchedulingRegion(SD)); 2957 SD->IsScheduled = false; 2958 SD->resetUnscheduledDeps(); 2959 } 2960 ReadyInsts.clear(); 2961 } 2962 2963 void BoUpSLP::scheduleBlock(BlockScheduling *BS) { 2964 2965 if (!BS->ScheduleStart) 2966 return; 2967 2968 DEBUG(dbgs() << "SLP: schedule block " << BS->BB->getName() << "\n"); 2969 2970 BS->resetSchedule(); 2971 2972 // For the real scheduling we use a more sophisticated ready-list: it is 2973 // sorted by the original instruction location. This lets the final schedule 2974 // be as close as possible to the original instruction order. 2975 struct ScheduleDataCompare { 2976 bool operator()(ScheduleData *SD1, ScheduleData *SD2) { 2977 return SD2->SchedulingPriority < SD1->SchedulingPriority; 2978 } 2979 }; 2980 std::set<ScheduleData *, ScheduleDataCompare> ReadyInsts; 2981 2982 // Ensure that all depencency data is updated and fill the ready-list with 2983 // initial instructions. 2984 int Idx = 0; 2985 int NumToSchedule = 0; 2986 for (auto *I = BS->ScheduleStart; I != BS->ScheduleEnd; 2987 I = I->getNextNode()) { 2988 ScheduleData *SD = BS->getScheduleData(I); 2989 assert( 2990 SD->isPartOfBundle() == (ScalarToTreeEntry.count(SD->Inst) != 0) && 2991 "scheduler and vectorizer have different opinion on what is a bundle"); 2992 SD->FirstInBundle->SchedulingPriority = Idx++; 2993 if (SD->isSchedulingEntity()) { 2994 BS->calculateDependencies(SD, false, this); 2995 NumToSchedule++; 2996 } 2997 } 2998 BS->initialFillReadyList(ReadyInsts); 2999 3000 Instruction *LastScheduledInst = BS->ScheduleEnd; 3001 3002 // Do the "real" scheduling. 3003 while (!ReadyInsts.empty()) { 3004 ScheduleData *picked = *ReadyInsts.begin(); 3005 ReadyInsts.erase(ReadyInsts.begin()); 3006 3007 // Move the scheduled instruction(s) to their dedicated places, if not 3008 // there yet. 3009 ScheduleData *BundleMember = picked; 3010 while (BundleMember) { 3011 Instruction *pickedInst = BundleMember->Inst; 3012 if (LastScheduledInst->getNextNode() != pickedInst) { 3013 BS->BB->getInstList().remove(pickedInst); 3014 BS->BB->getInstList().insert(LastScheduledInst, pickedInst); 3015 } 3016 LastScheduledInst = pickedInst; 3017 BundleMember = BundleMember->NextInBundle; 3018 } 3019 3020 BS->schedule(picked, ReadyInsts); 3021 NumToSchedule--; 3022 } 3023 assert(NumToSchedule == 0 && "could not schedule all instructions"); 3024 3025 // Avoid duplicate scheduling of the block. 3026 BS->ScheduleStart = nullptr; 3027 } 3028 3029 /// The SLPVectorizer Pass. 
3030 struct SLPVectorizer : public FunctionPass { 3031 typedef SmallVector<StoreInst *, 8> StoreList; 3032 typedef MapVector<Value *, StoreList> StoreListMap; 3033 3034 /// Pass identification, replacement for typeid 3035 static char ID; 3036 3037 explicit SLPVectorizer() : FunctionPass(ID) { 3038 initializeSLPVectorizerPass(*PassRegistry::getPassRegistry()); 3039 } 3040 3041 ScalarEvolution *SE; 3042 const DataLayout *DL; 3043 TargetTransformInfo *TTI; 3044 TargetLibraryInfo *TLI; 3045 AliasAnalysis *AA; 3046 LoopInfo *LI; 3047 DominatorTree *DT; 3048 AssumptionCache *AC; 3049 3050 bool runOnFunction(Function &F) override { 3051 if (skipOptnoneFunction(F)) 3052 return false; 3053 3054 SE = &getAnalysis<ScalarEvolution>(); 3055 DataLayoutPass *DLP = getAnalysisIfAvailable<DataLayoutPass>(); 3056 DL = DLP ? &DLP->getDataLayout() : nullptr; 3057 TTI = &getAnalysis<TargetTransformInfoWrapperPass>().getTTI(F); 3058 auto *TLIP = getAnalysisIfAvailable<TargetLibraryInfoWrapperPass>(); 3059 TLI = TLIP ? &TLIP->getTLI() : nullptr; 3060 AA = &getAnalysis<AliasAnalysis>(); 3061 LI = &getAnalysis<LoopInfoWrapperPass>().getLoopInfo(); 3062 DT = &getAnalysis<DominatorTreeWrapperPass>().getDomTree(); 3063 AC = &getAnalysis<AssumptionCacheTracker>().getAssumptionCache(F); 3064 3065 StoreRefs.clear(); 3066 bool Changed = false; 3067 3068 // If the target claims to have no vector registers don't attempt 3069 // vectorization. 3070 if (!TTI->getNumberOfRegisters(true)) 3071 return false; 3072 3073 // Must have DataLayout. We can't require it because some tests run w/o 3074 // triple. 3075 if (!DL) 3076 return false; 3077 3078 // Don't vectorize when the attribute NoImplicitFloat is used. 3079 if (F.hasFnAttribute(Attribute::NoImplicitFloat)) 3080 return false; 3081 3082 DEBUG(dbgs() << "SLP: Analyzing blocks in " << F.getName() << ".\n"); 3083 3084 // Use the bottom up slp vectorizer to construct chains that start with 3085 // store instructions. 3086 BoUpSLP R(&F, SE, DL, TTI, TLI, AA, LI, DT, AC); 3087 3088 // A general note: the vectorizer must use BoUpSLP::eraseInstruction() to 3089 // delete instructions. 3090 3091 // Scan the blocks in the function in post order. 3092 for (po_iterator<BasicBlock*> it = po_begin(&F.getEntryBlock()), 3093 e = po_end(&F.getEntryBlock()); it != e; ++it) { 3094 BasicBlock *BB = *it; 3095 // Vectorize trees that end at stores. 3096 if (unsigned count = collectStores(BB, R)) { 3097 (void)count; 3098 DEBUG(dbgs() << "SLP: Found " << count << " stores to vectorize.\n"); 3099 Changed |= vectorizeStoreChains(R); 3100 } 3101 3102 // Vectorize trees that end at reductions. 3103 Changed |= vectorizeChainsInBlock(BB, R); 3104 } 3105 3106 if (Changed) { 3107 R.optimizeGatherSequence(); 3108 DEBUG(dbgs() << "SLP: vectorized \"" << F.getName() << "\"\n"); 3109 DEBUG(verifyFunction(F)); 3110 } 3111 return Changed; 3112 } 3113 3114 void getAnalysisUsage(AnalysisUsage &AU) const override { 3115 FunctionPass::getAnalysisUsage(AU); 3116 AU.addRequired<AssumptionCacheTracker>(); 3117 AU.addRequired<ScalarEvolution>(); 3118 AU.addRequired<AliasAnalysis>(); 3119 AU.addRequired<TargetTransformInfoWrapperPass>(); 3120 AU.addRequired<LoopInfoWrapperPass>(); 3121 AU.addRequired<DominatorTreeWrapperPass>(); 3122 AU.addPreserved<LoopInfoWrapperPass>(); 3123 AU.addPreserved<DominatorTreeWrapperPass>(); 3124 AU.setPreservesCFG(); 3125 } 3126 3127 private: 3128 3129 /// \brief Collect memory references and sort them according to their base 3130 /// object. 
We sort the stores to their base objects to reduce the cost of the 3131 /// quadratic search on the stores. TODO: We can further reduce this cost 3132 /// if we flush the chain creation every time we run into a memory barrier. 3133 unsigned collectStores(BasicBlock *BB, BoUpSLP &R); 3134 3135 /// \brief Try to vectorize a chain that starts at two arithmetic instrs. 3136 bool tryToVectorizePair(Value *A, Value *B, BoUpSLP &R); 3137 3138 /// \brief Try to vectorize a list of operands. 3139 /// \@param BuildVector A list of users to ignore for the purpose of 3140 /// scheduling and that don't need extracting. 3141 /// \returns true if a value was vectorized. 3142 bool tryToVectorizeList(ArrayRef<Value *> VL, BoUpSLP &R, 3143 ArrayRef<Value *> BuildVector = None, 3144 bool allowReorder = false); 3145 3146 /// \brief Try to vectorize a chain that may start at the operands of \V; 3147 bool tryToVectorize(BinaryOperator *V, BoUpSLP &R); 3148 3149 /// \brief Vectorize the stores that were collected in StoreRefs. 3150 bool vectorizeStoreChains(BoUpSLP &R); 3151 3152 /// \brief Scan the basic block and look for patterns that are likely to start 3153 /// a vectorization chain. 3154 bool vectorizeChainsInBlock(BasicBlock *BB, BoUpSLP &R); 3155 3156 bool vectorizeStoreChain(ArrayRef<Value *> Chain, int CostThreshold, 3157 BoUpSLP &R); 3158 3159 bool vectorizeStores(ArrayRef<StoreInst *> Stores, int costThreshold, 3160 BoUpSLP &R); 3161 private: 3162 StoreListMap StoreRefs; 3163 }; 3164 3165 /// \brief Check that the Values in the slice in VL array are still existent in 3166 /// the WeakVH array. 3167 /// Vectorization of part of the VL array may cause later values in the VL array 3168 /// to become invalid. We track when this has happened in the WeakVH array. 3169 static bool hasValueBeenRAUWed(ArrayRef<Value *> &VL, 3170 SmallVectorImpl<WeakVH> &VH, 3171 unsigned SliceBegin, 3172 unsigned SliceSize) { 3173 for (unsigned i = SliceBegin; i < SliceBegin + SliceSize; ++i) 3174 if (VH[i] != VL[i]) 3175 return true; 3176 3177 return false; 3178 } 3179 3180 bool SLPVectorizer::vectorizeStoreChain(ArrayRef<Value *> Chain, 3181 int CostThreshold, BoUpSLP &R) { 3182 unsigned ChainLen = Chain.size(); 3183 DEBUG(dbgs() << "SLP: Analyzing a store chain of length " << ChainLen 3184 << "\n"); 3185 Type *StoreTy = cast<StoreInst>(Chain[0])->getValueOperand()->getType(); 3186 unsigned Sz = DL->getTypeSizeInBits(StoreTy); 3187 unsigned VF = MinVecRegSize / Sz; 3188 3189 if (!isPowerOf2_32(Sz) || VF < 2) 3190 return false; 3191 3192 // Keep track of values that were deleted by vectorizing in the loop below. 3193 SmallVector<WeakVH, 8> TrackValues(Chain.begin(), Chain.end()); 3194 3195 bool Changed = false; 3196 // Look for profitable vectorizable trees at all offsets, starting at zero. 3197 for (unsigned i = 0, e = ChainLen; i < e; ++i) { 3198 if (i + VF > e) 3199 break; 3200 3201 // Check that a previous iteration of this loop did not delete the Value. 3202 if (hasValueBeenRAUWed(Chain, TrackValues, i, VF)) 3203 continue; 3204 3205 DEBUG(dbgs() << "SLP: Analyzing " << VF << " stores at offset " << i 3206 << "\n"); 3207 ArrayRef<Value *> Operands = Chain.slice(i, VF); 3208 3209 R.buildTree(Operands); 3210 3211 int Cost = R.getTreeCost(); 3212 3213 DEBUG(dbgs() << "SLP: Found cost=" << Cost << " for VF=" << VF << "\n"); 3214 if (Cost < CostThreshold) { 3215 DEBUG(dbgs() << "SLP: Decided to vectorize cost=" << Cost << "\n"); 3216 R.vectorizeTree(); 3217 3218 // Move to the next bundle. 
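// We just vectorized Chain[i, i + VF); together with the loop increment
// this advances i to the first store after the vectorized bundle.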
3219 i += VF - 1; 3220 Changed = true; 3221 } 3222 } 3223 3224 return Changed; 3225 } 3226 3227 bool SLPVectorizer::vectorizeStores(ArrayRef<StoreInst *> Stores, 3228 int costThreshold, BoUpSLP &R) { 3229 SetVector<Value *> Heads, Tails; 3230 SmallDenseMap<Value *, Value *> ConsecutiveChain; 3231 3232 // We may run into multiple chains that merge into a single chain. We mark the 3233 // stores that we vectorized so that we don't visit the same store twice. 3234 BoUpSLP::ValueSet VectorizedStores; 3235 bool Changed = false; 3236 3237 // Do a quadratic search on all of the given stores and find 3238 // all of the pairs of stores that follow each other. 3239 for (unsigned i = 0, e = Stores.size(); i < e; ++i) { 3240 for (unsigned j = 0; j < e; ++j) { 3241 if (i == j) 3242 continue; 3243 3244 if (R.isConsecutiveAccess(Stores[i], Stores[j])) { 3245 Tails.insert(Stores[j]); 3246 Heads.insert(Stores[i]); 3247 ConsecutiveChain[Stores[i]] = Stores[j]; 3248 } 3249 } 3250 } 3251 3252 // For stores that start but don't end a link in the chain: 3253 for (SetVector<Value *>::iterator it = Heads.begin(), e = Heads.end(); 3254 it != e; ++it) { 3255 if (Tails.count(*it)) 3256 continue; 3257 3258 // We found a store instr that starts a chain. Now follow the chain and try 3259 // to vectorize it. 3260 BoUpSLP::ValueList Operands; 3261 Value *I = *it; 3262 // Collect the chain into a list. 3263 while (Tails.count(I) || Heads.count(I)) { 3264 if (VectorizedStores.count(I)) 3265 break; 3266 Operands.push_back(I); 3267 // Move to the next value in the chain. 3268 I = ConsecutiveChain[I]; 3269 } 3270 3271 bool Vectorized = vectorizeStoreChain(Operands, costThreshold, R); 3272 3273 // Mark the vectorized stores so that we don't vectorize them again. 3274 if (Vectorized) 3275 VectorizedStores.insert(Operands.begin(), Operands.end()); 3276 Changed |= Vectorized; 3277 } 3278 3279 return Changed; 3280 } 3281 3282 3283 unsigned SLPVectorizer::collectStores(BasicBlock *BB, BoUpSLP &R) { 3284 unsigned count = 0; 3285 StoreRefs.clear(); 3286 for (BasicBlock::iterator it = BB->begin(), e = BB->end(); it != e; ++it) { 3287 StoreInst *SI = dyn_cast<StoreInst>(it); 3288 if (!SI) 3289 continue; 3290 3291 // Don't touch volatile stores. 3292 if (!SI->isSimple()) 3293 continue; 3294 3295 // Check that the pointer points to scalars. 3296 Type *Ty = SI->getValueOperand()->getType(); 3297 if (Ty->isAggregateType() || Ty->isVectorTy()) 3298 continue; 3299 3300 // Find the base pointer. 3301 Value *Ptr = GetUnderlyingObject(SI->getPointerOperand(), DL); 3302 3303 // Save the store locations. 3304 StoreRefs[Ptr].push_back(SI); 3305 count++; 3306 } 3307 return count; 3308 } 3309 3310 bool SLPVectorizer::tryToVectorizePair(Value *A, Value *B, BoUpSLP &R) { 3311 if (!A || !B) 3312 return false; 3313 Value *VL[] = { A, B }; 3314 return tryToVectorizeList(VL, R, None, true); 3315 } 3316 3317 bool SLPVectorizer::tryToVectorizeList(ArrayRef<Value *> VL, BoUpSLP &R, 3318 ArrayRef<Value *> BuildVector, 3319 bool allowReorder) { 3320 if (VL.size() < 2) 3321 return false; 3322 3323 DEBUG(dbgs() << "SLP: Vectorizing a list of length = " << VL.size() << ".\n"); 3324 3325 // Check that all of the parts are scalar instructions of the same type. 
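// E.g. a list of scalar 'add i32' instructions is acceptable; any value of
// vector or aggregate type, or an instruction whose opcode differs from
// VL[0], rejects the whole list.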
  Instruction *I0 = dyn_cast<Instruction>(VL[0]);
  if (!I0)
    return false;

  unsigned Opcode0 = I0->getOpcode();

  Type *Ty0 = I0->getType();
  unsigned Sz = DL->getTypeSizeInBits(Ty0);
  unsigned VF = MinVecRegSize / Sz;

  for (int i = 0, e = VL.size(); i < e; ++i) {
    Type *Ty = VL[i]->getType();
    if (Ty->isAggregateType() || Ty->isVectorTy())
      return false;
    Instruction *Inst = dyn_cast<Instruction>(VL[i]);
    if (!Inst || Inst->getOpcode() != Opcode0)
      return false;
  }

  bool Changed = false;

  // Keep track of values that were deleted by vectorizing in the loop below.
  SmallVector<WeakVH, 8> TrackValues(VL.begin(), VL.end());

  for (unsigned i = 0, e = VL.size(); i < e; ++i) {
    unsigned OpsWidth = 0;

    if (i + VF > e)
      OpsWidth = e - i;
    else
      OpsWidth = VF;

    if (!isPowerOf2_32(OpsWidth) || OpsWidth < 2)
      break;

    // Check that a previous iteration of this loop did not delete the Value.
    if (hasValueBeenRAUWed(VL, TrackValues, i, OpsWidth))
      continue;

    DEBUG(dbgs() << "SLP: Analyzing " << OpsWidth << " operations "
                 << "\n");
    ArrayRef<Value *> Ops = VL.slice(i, OpsWidth);

    ArrayRef<Value *> BuildVectorSlice;
    if (!BuildVector.empty())
      BuildVectorSlice = BuildVector.slice(i, OpsWidth);

    R.buildTree(Ops, BuildVectorSlice);
    // TODO: check if we can allow reordering for cases other than
    // tryToVectorizePair().
    if (allowReorder && R.shouldReorder()) {
      assert(Ops.size() == 2);
      assert(BuildVectorSlice.empty());
      Value *ReorderedOps[] = { Ops[1], Ops[0] };
      R.buildTree(ReorderedOps, None);
    }
    int Cost = R.getTreeCost();

    if (Cost < -SLPCostThreshold) {
      DEBUG(dbgs() << "SLP: Vectorizing list at cost:" << Cost << ".\n");
      Value *VectorizedRoot = R.vectorizeTree();

      // Reconstruct the build vector by extracting the vectorized root. This
      // way we handle the case where some elements of the vector are
      // undefined.
      //  (return (insertelt <4 x i32> (insertelt undef (opd0) 0) (opd1) 2))
      if (!BuildVectorSlice.empty()) {
        // The insert point is the last build vector instruction. The
        // vectorized root will precede it. This guarantees that we get an
        // instruction. The vectorized tree could have been constant folded.
        Instruction *InsertAfter = cast<Instruction>(BuildVectorSlice.back());
        unsigned VecIdx = 0;
        for (auto &V : BuildVectorSlice) {
          IRBuilder<true, NoFolder> Builder(
              ++BasicBlock::iterator(InsertAfter));
          InsertElementInst *IE = cast<InsertElementInst>(V);
          Instruction *Extract = cast<Instruction>(Builder.CreateExtractElement(
              VectorizedRoot, Builder.getInt32(VecIdx++)));
          IE->setOperand(1, Extract);
          IE->removeFromParent();
          IE->insertAfter(Extract);
          InsertAfter = IE;
        }
      }
      // Move to the next bundle.
      i += VF - 1;
      Changed = true;
    }
  }

  return Changed;
}

bool SLPVectorizer::tryToVectorize(BinaryOperator *V, BoUpSLP &R) {
  if (!V)
    return false;

  // Try to vectorize V.
  if (tryToVectorizePair(V->getOperand(0), V->getOperand(1), R))
    return true;

  BinaryOperator *A = dyn_cast<BinaryOperator>(V->getOperand(0));
  BinaryOperator *B = dyn_cast<BinaryOperator>(V->getOperand(1));
  // Try to skip B.
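  // E.g. for A + (B0 + B1), where B = B0 + B1 has a single use, try to pair A
  // with B0 and then with B1.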
  if (B && B->hasOneUse()) {
    BinaryOperator *B0 = dyn_cast<BinaryOperator>(B->getOperand(0));
    BinaryOperator *B1 = dyn_cast<BinaryOperator>(B->getOperand(1));
    if (tryToVectorizePair(A, B0, R)) {
      return true;
    }
    if (tryToVectorizePair(A, B1, R)) {
      return true;
    }
  }

  // Try to skip A.
  if (A && A->hasOneUse()) {
    BinaryOperator *A0 = dyn_cast<BinaryOperator>(A->getOperand(0));
    BinaryOperator *A1 = dyn_cast<BinaryOperator>(A->getOperand(1));
    if (tryToVectorizePair(A0, B, R)) {
      return true;
    }
    if (tryToVectorizePair(A1, B, R)) {
      return true;
    }
  }
  return false;
}

/// \brief Generate a shuffle mask to be used in a reduction tree.
///
/// \param VecLen The length of the vector to be reduced.
/// \param NumEltsToRdx The number of elements that should be reduced in the
///        vector.
/// \param IsPairwise Whether the reduction is a pairwise or splitting
///        reduction. A pairwise reduction will generate a mask of
///        <0,2,...> or <1,3,..> while a splitting reduction will generate
///        <2,3, undef,undef> for a vector of 4 and NumElts = 2.
/// \param IsLeft True will generate a mask of even elements, odd otherwise.
static Value *createRdxShuffleMask(unsigned VecLen, unsigned NumEltsToRdx,
                                   bool IsPairwise, bool IsLeft,
                                   IRBuilder<> &Builder) {
  assert((IsPairwise || !IsLeft) && "Don't support a <0,1,undef,...> mask");

  SmallVector<Constant *, 32> ShuffleMask(
      VecLen, UndefValue::get(Builder.getInt32Ty()));

  if (IsPairwise)
    // Build a mask of 0, 2, ... (left) or 1, 3, ... (right).
    for (unsigned i = 0; i != NumEltsToRdx; ++i)
      ShuffleMask[i] = Builder.getInt32(2 * i + !IsLeft);
  else
    // Move the upper half of the vector to the lower half.
    for (unsigned i = 0; i != NumEltsToRdx; ++i)
      ShuffleMask[i] = Builder.getInt32(NumEltsToRdx + i);

  return ConstantVector::get(ShuffleMask);
}

/// Model horizontal reductions.
///
/// A horizontal reduction is a tree of reduction operations (currently add and
/// fadd) that has operations that can be put into a vector as its leaves.
/// For example, this tree:
///
/// mul mul mul mul
///  \  /    \  /
///   +       +
///    \     /
///       +
/// This tree has "mul" as its reduced values and "+" as its reduction
/// operations. A reduction might feed into a store or a binary operation
/// feeding a phi.
///  ...
///  \  /
///   +
///   |
///  phi +=
///
///  Or:
///  ...
///  \  /
///   +
///   |
///  *p =
///
class HorizontalReduction {
  SmallVector<Value *, 16> ReductionOps;
  SmallVector<Value *, 32> ReducedVals;

  BinaryOperator *ReductionRoot;
  PHINode *ReductionPHI;

  /// The opcode of the reduction.
  unsigned ReductionOpcode;
  /// The opcode of the values we perform a reduction on.
  unsigned ReducedValueOpcode;
  /// The width of one full horizontal reduction operation.
  unsigned ReduxWidth;
  /// Should we model this reduction as a pairwise reduction tree or a tree
  /// that splits the vector in halves and adds those halves.
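  /// For a 4-wide reduction, the pairwise form combines elements <0,2,...>
  /// with <1,3,...>, while the splitting form adds the upper half of the
  /// vector onto the lower half; see createRdxShuffleMask.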
  bool IsPairwiseReduction;

public:
  HorizontalReduction()
      : ReductionRoot(nullptr), ReductionPHI(nullptr), ReductionOpcode(0),
        ReducedValueOpcode(0), ReduxWidth(0), IsPairwiseReduction(false) {}

  /// \brief Try to find a reduction tree.
  bool matchAssociativeReduction(PHINode *Phi, BinaryOperator *B,
                                 const DataLayout *DL) {
    assert((!Phi ||
            std::find(Phi->op_begin(), Phi->op_end(), B) != Phi->op_end()) &&
           "The phi needs to use the binary operator");

    // We could have an initial reduction that is not an add.
    //  r *= v1 + v2 + v3 + v4
    // In such a case start looking for a tree rooted in the first '+'.
    if (Phi) {
      if (B->getOperand(0) == Phi) {
        Phi = nullptr;
        B = dyn_cast<BinaryOperator>(B->getOperand(1));
      } else if (B->getOperand(1) == Phi) {
        Phi = nullptr;
        B = dyn_cast<BinaryOperator>(B->getOperand(0));
      }
    }

    if (!B)
      return false;

    Type *Ty = B->getType();
    if (Ty->isVectorTy())
      return false;

    ReductionOpcode = B->getOpcode();
    ReducedValueOpcode = 0;
    ReduxWidth = MinVecRegSize / DL->getTypeSizeInBits(Ty);
    ReductionRoot = B;
    ReductionPHI = Phi;

    if (ReduxWidth < 4)
      return false;

    // We currently only support adds.
    if (ReductionOpcode != Instruction::Add &&
        ReductionOpcode != Instruction::FAdd)
      return false;

    // Post-order traverse the reduction tree starting at B. We only handle
    // true trees containing only binary operators.
    SmallVector<std::pair<BinaryOperator *, unsigned>, 32> Stack;
    Stack.push_back(std::make_pair(B, 0));
    while (!Stack.empty()) {
      BinaryOperator *TreeN = Stack.back().first;
      unsigned EdgeToVist = Stack.back().second++;
      bool IsReducedValue = TreeN->getOpcode() != ReductionOpcode;

      // Only handle trees in the current basic block.
      if (TreeN->getParent() != B->getParent())
        return false;

      // Each tree node needs to have one user except for the ultimate
      // reduction.
      if (!TreeN->hasOneUse() && TreeN != B)
        return false;

      // Postorder visit.
      if (EdgeToVist == 2 || IsReducedValue) {
        if (IsReducedValue) {
          // Make sure that the opcodes of the operations that we are going to
          // reduce match.
          if (!ReducedValueOpcode)
            ReducedValueOpcode = TreeN->getOpcode();
          else if (ReducedValueOpcode != TreeN->getOpcode())
            return false;
          ReducedVals.push_back(TreeN);
        } else {
          // We need to be able to reassociate the adds.
          if (!TreeN->isAssociative())
            return false;
          ReductionOps.push_back(TreeN);
        }
        // Retract.
        Stack.pop_back();
        continue;
      }

      // Visit left or right.
      Value *NextV = TreeN->getOperand(EdgeToVist);
      BinaryOperator *Next = dyn_cast<BinaryOperator>(NextV);
      if (Next)
        Stack.push_back(std::make_pair(Next, 0));
      else if (NextV != Phi)
        return false;
    }
    return true;
  }

  /// \brief Attempt to vectorize the tree found by
  /// matchAssociativeReduction.
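  ///
  /// The reduced values are vectorized in chunks of ReduxWidth; each chunk is
  /// reduced to a scalar with emitReduction, and the per-chunk results (plus
  /// any leftover scalars) are combined with the reduction opcode.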
  bool tryToReduce(BoUpSLP &V, TargetTransformInfo *TTI) {
    if (ReducedVals.empty())
      return false;

    unsigned NumReducedVals = ReducedVals.size();
    if (NumReducedVals < ReduxWidth)
      return false;

    Value *VectorizedTree = nullptr;
    IRBuilder<> Builder(ReductionRoot);
    FastMathFlags Unsafe;
    Unsafe.setUnsafeAlgebra();
    Builder.SetFastMathFlags(Unsafe);
    unsigned i = 0;

    for (; i < NumReducedVals - ReduxWidth + 1; i += ReduxWidth) {
      V.buildTree(makeArrayRef(&ReducedVals[i], ReduxWidth), ReductionOps);

      // Estimate cost.
      int Cost = V.getTreeCost() + getReductionCost(TTI, ReducedVals[i]);
      if (Cost >= -SLPCostThreshold)
        break;

      DEBUG(dbgs() << "SLP: Vectorizing horizontal reduction at cost:" << Cost
                   << ". (HorRdx)\n");

      // Vectorize a tree.
      DebugLoc Loc = cast<Instruction>(ReducedVals[i])->getDebugLoc();
      Value *VectorizedRoot = V.vectorizeTree();

      // Emit a reduction.
      Value *ReducedSubTree = emitReduction(VectorizedRoot, Builder);
      if (VectorizedTree) {
        Builder.SetCurrentDebugLocation(Loc);
        VectorizedTree = createBinOp(Builder, ReductionOpcode, VectorizedTree,
                                     ReducedSubTree, "bin.rdx");
      } else
        VectorizedTree = ReducedSubTree;
    }

    if (VectorizedTree) {
      // Finish the reduction.
      for (; i < NumReducedVals; ++i) {
        Builder.SetCurrentDebugLocation(
            cast<Instruction>(ReducedVals[i])->getDebugLoc());
        VectorizedTree = createBinOp(Builder, ReductionOpcode, VectorizedTree,
                                     ReducedVals[i]);
      }
      // Update users.
      if (ReductionPHI) {
        assert(ReductionRoot && "Need a reduction operation");
        ReductionRoot->setOperand(0, VectorizedTree);
        ReductionRoot->setOperand(1, ReductionPHI);
      } else
        ReductionRoot->replaceAllUsesWith(VectorizedTree);
    }
    return VectorizedTree != nullptr;
  }

private:

  /// \brief Calculate the cost of a reduction.
  int getReductionCost(TargetTransformInfo *TTI, Value *FirstReducedVal) {
    Type *ScalarTy = FirstReducedVal->getType();
    Type *VecTy = VectorType::get(ScalarTy, ReduxWidth);

    int PairwiseRdxCost = TTI->getReductionCost(ReductionOpcode, VecTy, true);
    int SplittingRdxCost = TTI->getReductionCost(ReductionOpcode, VecTy, false);

    IsPairwiseReduction = PairwiseRdxCost < SplittingRdxCost;
    int VecReduxCost = IsPairwiseReduction ? PairwiseRdxCost : SplittingRdxCost;

    int ScalarReduxCost =
        ReduxWidth * TTI->getArithmeticInstrCost(ReductionOpcode, VecTy);

    DEBUG(dbgs() << "SLP: Adding cost " << VecReduxCost - ScalarReduxCost
                 << " for reduction that starts with " << *FirstReducedVal
                 << " (It is a "
                 << (IsPairwiseReduction ? "pairwise" : "splitting")
                 << " reduction)\n");

    return VecReduxCost - ScalarReduxCost;
  }

  static Value *createBinOp(IRBuilder<> &Builder, unsigned Opcode, Value *L,
                            Value *R, const Twine &Name = "") {
    if (Opcode == Instruction::FAdd)
      return Builder.CreateFAdd(L, R, Name);
    return Builder.CreateBinOp((Instruction::BinaryOps)Opcode, L, R, Name);
  }

  /// \brief Emit a horizontal reduction of the vectorized value.
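  ///
  /// As an illustrative sketch, a splitting reduction of a <4 x float> value
  /// %v is emitted roughly as:
  ///   %s0 = shufflevector %v, undef, <2, 3, undef, undef>   ; rdx.shuf
  ///   %r0 = fadd %v, %s0                                    ; bin.rdx
  ///   %s1 = shufflevector %r0, undef, <1, undef, undef, undef>
  ///   %r1 = fadd %r0, %s1
  ///   %res = extractelement %r1, i32 0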
  Value *emitReduction(Value *VectorizedValue, IRBuilder<> &Builder) {
    assert(VectorizedValue && "Need to have a vectorized tree node");
    assert(isPowerOf2_32(ReduxWidth) &&
           "We only handle power-of-two reductions for now");

    Value *TmpVec = VectorizedValue;
    for (unsigned i = ReduxWidth / 2; i != 0; i >>= 1) {
      if (IsPairwiseReduction) {
        Value *LeftMask =
            createRdxShuffleMask(ReduxWidth, i, true, true, Builder);
        Value *RightMask =
            createRdxShuffleMask(ReduxWidth, i, true, false, Builder);

        Value *LeftShuf = Builder.CreateShuffleVector(
            TmpVec, UndefValue::get(TmpVec->getType()), LeftMask, "rdx.shuf.l");
        Value *RightShuf = Builder.CreateShuffleVector(
            TmpVec, UndefValue::get(TmpVec->getType()), RightMask,
            "rdx.shuf.r");
        TmpVec = createBinOp(Builder, ReductionOpcode, LeftShuf, RightShuf,
                             "bin.rdx");
      } else {
        Value *UpperHalf =
            createRdxShuffleMask(ReduxWidth, i, false, false, Builder);
        Value *Shuf = Builder.CreateShuffleVector(
            TmpVec, UndefValue::get(TmpVec->getType()), UpperHalf, "rdx.shuf");
        TmpVec = createBinOp(Builder, ReductionOpcode, TmpVec, Shuf, "bin.rdx");
      }
    }

    // The result is in the first element of the vector.
    return Builder.CreateExtractElement(TmpVec, Builder.getInt32(0));
  }
};

/// \brief Recognize construction of vectors like
///  %ra = insertelement <4 x float> undef, float %s0, i32 0
///  %rb = insertelement <4 x float> %ra, float %s1, i32 1
///  %rc = insertelement <4 x float> %rb, float %s2, i32 2
///  %rd = insertelement <4 x float> %rc, float %s3, i32 3
///
/// Returns true if it matches.
///
static bool findBuildVector(InsertElementInst *FirstInsertElem,
                            SmallVectorImpl<Value *> &BuildVector,
                            SmallVectorImpl<Value *> &BuildVectorOpds) {
  if (!isa<UndefValue>(FirstInsertElem->getOperand(0)))
    return false;

  InsertElementInst *IE = FirstInsertElem;
  while (true) {
    BuildVector.push_back(IE);
    BuildVectorOpds.push_back(IE->getOperand(1));

    if (IE->use_empty())
      return false;

    InsertElementInst *NextUse = dyn_cast<InsertElementInst>(IE->user_back());
    if (!NextUse)
      return true;

    // If this isn't the final use, make sure the next insertelement is the
    // only use. It's OK if the final constructed vector is used multiple
    // times.
    if (!IE->hasOneUse())
      return false;

    IE = NextUse;
  }

  return false;
}

static bool PhiTypeSorterFunc(Value *V, Value *V2) {
  return V->getType() < V2->getType();
}

bool SLPVectorizer::vectorizeChainsInBlock(BasicBlock *BB, BoUpSLP &R) {
  bool Changed = false;
  SmallVector<Value *, 4> Incoming;
  SmallSet<Value *, 16> VisitedInstrs;

  bool HaveVectorizedPhiNodes = true;
  while (HaveVectorizedPhiNodes) {
    HaveVectorizedPhiNodes = false;

    // Collect the incoming values from the PHIs.
    Incoming.clear();
    for (BasicBlock::iterator instr = BB->begin(), ie = BB->end(); instr != ie;
         ++instr) {
      PHINode *P = dyn_cast<PHINode>(instr);
      if (!P)
        break;

      if (!VisitedInstrs.count(P))
        Incoming.push_back(P);
    }

    // Sort by type.
    std::stable_sort(Incoming.begin(), Incoming.end(), PhiTypeSorterFunc);

    // Try to vectorize elements based on their type.
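    // Incoming is sorted by type, so PHIs of the same type form contiguous
    // runs; each run is handed to tryToVectorizeList as one candidate bundle.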
    for (SmallVector<Value *, 4>::iterator IncIt = Incoming.begin(),
                                           E = Incoming.end();
         IncIt != E;) {

      // Look for the next elements with the same type.
      SmallVector<Value *, 4>::iterator SameTypeIt = IncIt;
      while (SameTypeIt != E &&
             (*SameTypeIt)->getType() == (*IncIt)->getType()) {
        VisitedInstrs.insert(*SameTypeIt);
        ++SameTypeIt;
      }

      // Try to vectorize them.
      unsigned NumElts = (SameTypeIt - IncIt);
      DEBUG(errs() << "SLP: Trying to vectorize starting at PHIs (" << NumElts
                   << ")\n");
      if (NumElts > 1 && tryToVectorizeList(makeArrayRef(IncIt, NumElts), R)) {
        // Success: start over because instructions might have been changed.
        HaveVectorizedPhiNodes = true;
        Changed = true;
        break;
      }

      // Start over at the next instruction of a different type (or the end).
      IncIt = SameTypeIt;
    }
  }

  VisitedInstrs.clear();

  for (BasicBlock::iterator it = BB->begin(), e = BB->end(); it != e; it++) {
    // We may go through BB multiple times so skip the ones we have already
    // checked.
    if (!VisitedInstrs.insert(it).second)
      continue;

    if (isa<DbgInfoIntrinsic>(it))
      continue;

    // Try to vectorize reductions that use PHINodes.
    if (PHINode *P = dyn_cast<PHINode>(it)) {
      // Check that the PHI is a reduction PHI.
      if (P->getNumIncomingValues() != 2)
        return Changed;
      Value *Rdx =
          (P->getIncomingBlock(0) == BB
               ? (P->getIncomingValue(0))
               : (P->getIncomingBlock(1) == BB ? P->getIncomingValue(1)
                                               : nullptr));
      // Check if this is a Binary Operator.
      BinaryOperator *BI = dyn_cast_or_null<BinaryOperator>(Rdx);
      if (!BI)
        continue;

      // Try to match and vectorize a horizontal reduction.
      HorizontalReduction HorRdx;
      if (ShouldVectorizeHor &&
          HorRdx.matchAssociativeReduction(P, BI, DL) &&
          HorRdx.tryToReduce(R, TTI)) {
        Changed = true;
        it = BB->begin();
        e = BB->end();
        continue;
      }

      Value *Inst = BI->getOperand(0);
      if (Inst == P)
        Inst = BI->getOperand(1);

      if (tryToVectorize(dyn_cast<BinaryOperator>(Inst), R)) {
        // We would like to start over since some instructions are deleted
        // and the iterator may become invalid.
        Changed = true;
        it = BB->begin();
        e = BB->end();
        continue;
      }

      continue;
    }

    // Try to vectorize horizontal reductions feeding into a store.
    if (ShouldStartVectorizeHorAtStore)
      if (StoreInst *SI = dyn_cast<StoreInst>(it))
        if (BinaryOperator *BinOp =
                dyn_cast<BinaryOperator>(SI->getValueOperand())) {
          HorizontalReduction HorRdx;
          if (((HorRdx.matchAssociativeReduction(nullptr, BinOp, DL) &&
                HorRdx.tryToReduce(R, TTI)) ||
               tryToVectorize(BinOp, R))) {
            Changed = true;
            it = BB->begin();
            e = BB->end();
            continue;
          }
        }

    // Try to vectorize horizontal reductions feeding into a return.
    if (ReturnInst *RI = dyn_cast<ReturnInst>(it))
      if (RI->getNumOperands() != 0)
        if (BinaryOperator *BinOp =
                dyn_cast<BinaryOperator>(RI->getOperand(0))) {
          DEBUG(dbgs() << "SLP: Found a return to vectorize.\n");
          if (tryToVectorizePair(BinOp->getOperand(0),
                                 BinOp->getOperand(1), R)) {
            Changed = true;
            it = BB->begin();
            e = BB->end();
            continue;
          }
        }

    // Try to vectorize trees that start at compare instructions.
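    // E.g. for 'icmp eq i32 %x, %y', first try to pair %x with %y, and then
    // the operands of %x and %y if either is itself a binary operator.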
    if (CmpInst *CI = dyn_cast<CmpInst>(it)) {
      if (tryToVectorizePair(CI->getOperand(0), CI->getOperand(1), R)) {
        Changed = true;
        // We would like to start over since some instructions are deleted
        // and the iterator may become invalid.
        it = BB->begin();
        e = BB->end();
        continue;
      }

      for (int i = 0; i < 2; ++i) {
        if (BinaryOperator *BI = dyn_cast<BinaryOperator>(CI->getOperand(i))) {
          if (tryToVectorizePair(BI->getOperand(0), BI->getOperand(1), R)) {
            Changed = true;
            // We would like to start over since some instructions are deleted
            // and the iterator may become invalid.
            it = BB->begin();
            e = BB->end();
            break;
          }
        }
      }
      continue;
    }

    // Try to vectorize trees that start at insertelement instructions.
    if (InsertElementInst *FirstInsertElem = dyn_cast<InsertElementInst>(it)) {
      SmallVector<Value *, 16> BuildVector;
      SmallVector<Value *, 16> BuildVectorOpds;
      if (!findBuildVector(FirstInsertElem, BuildVector, BuildVectorOpds))
        continue;

      // Vectorize starting with the build vector operands ignoring the
      // BuildVector instructions for the purpose of scheduling and user
      // extraction.
      if (tryToVectorizeList(BuildVectorOpds, R, BuildVector)) {
        Changed = true;
        it = BB->begin();
        e = BB->end();
      }

      continue;
    }
  }

  return Changed;
}

bool SLPVectorizer::vectorizeStoreChains(BoUpSLP &R) {
  bool Changed = false;
  // Attempt to sort and vectorize each of the store-groups.
  for (StoreListMap::iterator it = StoreRefs.begin(), e = StoreRefs.end();
       it != e; ++it) {
    if (it->second.size() < 2)
      continue;

    DEBUG(dbgs() << "SLP: Analyzing a store chain of length "
                 << it->second.size() << ".\n");

    // Process the stores in chunks of 16.
    for (unsigned CI = 0, CE = it->second.size(); CI < CE; CI += 16) {
      unsigned Len = std::min<unsigned>(CE - CI, 16);
      Changed |= vectorizeStores(makeArrayRef(&it->second[CI], Len),
                                 -SLPCostThreshold, R);
    }
  }
  return Changed;
}

} // end anonymous namespace

char SLPVectorizer::ID = 0;
static const char lv_name[] = "SLP Vectorizer";
INITIALIZE_PASS_BEGIN(SLPVectorizer, SV_NAME, lv_name, false, false)
INITIALIZE_AG_DEPENDENCY(AliasAnalysis)
INITIALIZE_PASS_DEPENDENCY(TargetTransformInfoWrapperPass)
INITIALIZE_PASS_DEPENDENCY(AssumptionCacheTracker)
INITIALIZE_PASS_DEPENDENCY(ScalarEvolution)
INITIALIZE_PASS_DEPENDENCY(LoopSimplify)
INITIALIZE_PASS_END(SLPVectorizer, SV_NAME, lv_name, false, false)

namespace llvm {
Pass *createSLPVectorizerPass() { return new SLPVectorizer(); }
}