1 //===- SLPVectorizer.cpp - A bottom up SLP Vectorizer ---------------------===// 2 // 3 // The LLVM Compiler Infrastructure 4 // 5 // This file is distributed under the University of Illinois Open Source 6 // License. See LICENSE.TXT for details. 7 // 8 //===----------------------------------------------------------------------===// 9 // This pass implements the Bottom Up SLP vectorizer. It detects consecutive 10 // stores that can be put together into vector-stores. Next, it attempts to 11 // construct vectorizable tree using the use-def chains. If a profitable tree 12 // was found, the SLP vectorizer performs vectorization on the tree. 13 // 14 // The pass is inspired by the work described in the paper: 15 // "Loop-Aware SLP in GCC" by Ira Rosen, Dorit Nuzman, Ayal Zaks. 16 // 17 //===----------------------------------------------------------------------===// 18 #include "llvm/Transforms/Vectorize.h" 19 #include "llvm/ADT/MapVector.h" 20 #include "llvm/ADT/PostOrderIterator.h" 21 #include "llvm/ADT/SetVector.h" 22 #include "llvm/ADT/Optional.h" 23 #include "llvm/ADT/Statistic.h" 24 #include "llvm/Analysis/AliasAnalysis.h" 25 #include "llvm/Analysis/AssumptionCache.h" 26 #include "llvm/Analysis/CodeMetrics.h" 27 #include "llvm/Analysis/LoopInfo.h" 28 #include "llvm/Analysis/ScalarEvolution.h" 29 #include "llvm/Analysis/ScalarEvolutionExpressions.h" 30 #include "llvm/Analysis/TargetTransformInfo.h" 31 #include "llvm/Analysis/ValueTracking.h" 32 #include "llvm/IR/DataLayout.h" 33 #include "llvm/IR/Dominators.h" 34 #include "llvm/IR/IRBuilder.h" 35 #include "llvm/IR/Instructions.h" 36 #include "llvm/IR/IntrinsicInst.h" 37 #include "llvm/IR/Module.h" 38 #include "llvm/IR/NoFolder.h" 39 #include "llvm/IR/Type.h" 40 #include "llvm/IR/Value.h" 41 #include "llvm/IR/Verifier.h" 42 #include "llvm/Pass.h" 43 #include "llvm/Support/CommandLine.h" 44 #include "llvm/Support/Debug.h" 45 #include "llvm/Support/raw_ostream.h" 46 #include "llvm/Transforms/Utils/VectorUtils.h" 47 #include <algorithm> 48 #include <map> 49 #include <memory> 50 51 using namespace llvm; 52 53 #define SV_NAME "slp-vectorizer" 54 #define DEBUG_TYPE "SLP" 55 56 STATISTIC(NumVectorInstructions, "Number of vector instructions generated"); 57 58 static cl::opt<int> 59 SLPCostThreshold("slp-threshold", cl::init(0), cl::Hidden, 60 cl::desc("Only vectorize if you gain more than this " 61 "number ")); 62 63 static cl::opt<bool> 64 ShouldVectorizeHor("slp-vectorize-hor", cl::init(false), cl::Hidden, 65 cl::desc("Attempt to vectorize horizontal reductions")); 66 67 static cl::opt<bool> ShouldStartVectorizeHorAtStore( 68 "slp-vectorize-hor-store", cl::init(false), cl::Hidden, 69 cl::desc( 70 "Attempt to vectorize horizontal reductions feeding into a store")); 71 72 namespace { 73 74 static const unsigned MinVecRegSize = 128; 75 76 static const unsigned RecursionMaxDepth = 12; 77 78 // Limit the number of alias checks. The limit is chosen so that 79 // it has no negative effect on the llvm benchmarks. 80 static const unsigned AliasedCheckLimit = 10; 81 82 // Another limit for the alias checks: The maximum distance between load/store 83 // instructions where alias checks are done. 84 // This limit is useful for very large basic blocks. 85 static const unsigned MaxMemDepDistance = 160; 86 87 /// \brief Predicate for the element types that the SLP vectorizer supports. 88 /// 89 /// The most important thing to filter here are types which are invalid in LLVM 90 /// vectors. 
We also filter target specific types which have absolutely no 91 /// meaningful vectorization path such as x86_fp80 and ppc_f128. This just 92 /// avoids spending time checking the cost model and realizing that they will 93 /// be inevitably scalarized. 94 static bool isValidElementType(Type *Ty) { 95 return VectorType::isValidElementType(Ty) && !Ty->isX86_FP80Ty() && 96 !Ty->isPPC_FP128Ty(); 97 } 98 99 /// \returns the parent basic block if all of the instructions in \p VL 100 /// are in the same block or null otherwise. 101 static BasicBlock *getSameBlock(ArrayRef<Value *> VL) { 102 Instruction *I0 = dyn_cast<Instruction>(VL[0]); 103 if (!I0) 104 return nullptr; 105 BasicBlock *BB = I0->getParent(); 106 for (int i = 1, e = VL.size(); i < e; i++) { 107 Instruction *I = dyn_cast<Instruction>(VL[i]); 108 if (!I) 109 return nullptr; 110 111 if (BB != I->getParent()) 112 return nullptr; 113 } 114 return BB; 115 } 116 117 /// \returns True if all of the values in \p VL are constants. 118 static bool allConstant(ArrayRef<Value *> VL) { 119 for (unsigned i = 0, e = VL.size(); i < e; ++i) 120 if (!isa<Constant>(VL[i])) 121 return false; 122 return true; 123 } 124 125 /// \returns True if all of the values in \p VL are identical. 126 static bool isSplat(ArrayRef<Value *> VL) { 127 for (unsigned i = 1, e = VL.size(); i < e; ++i) 128 if (VL[i] != VL[0]) 129 return false; 130 return true; 131 } 132 133 ///\returns Opcode that can be clubbed with \p Op to create an alternate 134 /// sequence which can later be merged as a ShuffleVector instruction. 135 static unsigned getAltOpcode(unsigned Op) { 136 switch (Op) { 137 case Instruction::FAdd: 138 return Instruction::FSub; 139 case Instruction::FSub: 140 return Instruction::FAdd; 141 case Instruction::Add: 142 return Instruction::Sub; 143 case Instruction::Sub: 144 return Instruction::Add; 145 default: 146 return 0; 147 } 148 } 149 150 ///\returns bool representing if Opcode \p Op can be part 151 /// of an alternate sequence which can later be merged as 152 /// a ShuffleVector instruction. 153 static bool canCombineAsAltInst(unsigned Op) { 154 if (Op == Instruction::FAdd || Op == Instruction::FSub || 155 Op == Instruction::Sub || Op == Instruction::Add) 156 return true; 157 return false; 158 } 159 160 /// \returns ShuffleVector instruction if intructions in \p VL have 161 /// alternate fadd,fsub / fsub,fadd/add,sub/sub,add sequence. 162 /// (i.e. e.g. opcodes of fadd,fsub,fadd,fsub...) 163 static unsigned isAltInst(ArrayRef<Value *> VL) { 164 Instruction *I0 = dyn_cast<Instruction>(VL[0]); 165 unsigned Opcode = I0->getOpcode(); 166 unsigned AltOpcode = getAltOpcode(Opcode); 167 for (int i = 1, e = VL.size(); i < e; i++) { 168 Instruction *I = dyn_cast<Instruction>(VL[i]); 169 if (!I || I->getOpcode() != ((i & 1) ? AltOpcode : Opcode)) 170 return 0; 171 } 172 return Instruction::ShuffleVector; 173 } 174 175 /// \returns The opcode if all of the Instructions in \p VL have the same 176 /// opcode, or zero. 
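/// For illustration: {add, add, add, add} yields Instruction::Add, an
/// alternating {fadd, fsub, fadd, fsub} bundle yields
/// Instruction::ShuffleVector (via isAltInst above), and a mixed bundle such
/// as {add, mul} yields 0.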
177 static unsigned getSameOpcode(ArrayRef<Value *> VL) { 178 Instruction *I0 = dyn_cast<Instruction>(VL[0]); 179 if (!I0) 180 return 0; 181 unsigned Opcode = I0->getOpcode(); 182 for (int i = 1, e = VL.size(); i < e; i++) { 183 Instruction *I = dyn_cast<Instruction>(VL[i]); 184 if (!I || Opcode != I->getOpcode()) { 185 if (canCombineAsAltInst(Opcode) && i == 1) 186 return isAltInst(VL); 187 return 0; 188 } 189 } 190 return Opcode; 191 } 192 193 /// Get the intersection (logical and) of all of the potential IR flags 194 /// of each scalar operation (VL) that will be converted into a vector (I). 195 /// Flag set: NSW, NUW, exact, and all of fast-math. 196 static void propagateIRFlags(Value *I, ArrayRef<Value *> VL) { 197 if (auto *VecOp = dyn_cast<BinaryOperator>(I)) { 198 if (auto *Intersection = dyn_cast<BinaryOperator>(VL[0])) { 199 // Intersection is initialized to the 0th scalar, 200 // so start counting from index '1'. 201 for (int i = 1, e = VL.size(); i < e; ++i) { 202 if (auto *Scalar = dyn_cast<BinaryOperator>(VL[i])) 203 Intersection->andIRFlags(Scalar); 204 } 205 VecOp->copyIRFlags(Intersection); 206 } 207 } 208 } 209 210 /// \returns \p I after propagating metadata from \p VL. 211 static Instruction *propagateMetadata(Instruction *I, ArrayRef<Value *> VL) { 212 Instruction *I0 = cast<Instruction>(VL[0]); 213 SmallVector<std::pair<unsigned, MDNode *>, 4> Metadata; 214 I0->getAllMetadataOtherThanDebugLoc(Metadata); 215 216 for (unsigned i = 0, n = Metadata.size(); i != n; ++i) { 217 unsigned Kind = Metadata[i].first; 218 MDNode *MD = Metadata[i].second; 219 220 for (int i = 1, e = VL.size(); MD && i != e; i++) { 221 Instruction *I = cast<Instruction>(VL[i]); 222 MDNode *IMD = I->getMetadata(Kind); 223 224 switch (Kind) { 225 default: 226 MD = nullptr; // Remove unknown metadata 227 break; 228 case LLVMContext::MD_tbaa: 229 MD = MDNode::getMostGenericTBAA(MD, IMD); 230 break; 231 case LLVMContext::MD_alias_scope: 232 MD = MDNode::getMostGenericAliasScope(MD, IMD); 233 break; 234 case LLVMContext::MD_noalias: 235 MD = MDNode::intersect(MD, IMD); 236 break; 237 case LLVMContext::MD_fpmath: 238 MD = MDNode::getMostGenericFPMath(MD, IMD); 239 break; 240 } 241 } 242 I->setMetadata(Kind, MD); 243 } 244 return I; 245 } 246 247 /// \returns The type that all of the values in \p VL have or null if there 248 /// are different types. 249 static Type* getSameType(ArrayRef<Value *> VL) { 250 Type *Ty = VL[0]->getType(); 251 for (int i = 1, e = VL.size(); i < e; i++) 252 if (VL[i]->getType() != Ty) 253 return nullptr; 254 255 return Ty; 256 } 257 258 /// \returns True if the ExtractElement instructions in VL can be vectorized 259 /// to use the original vector. 260 static bool CanReuseExtract(ArrayRef<Value *> VL) { 261 assert(Instruction::ExtractElement == getSameOpcode(VL) && "Invalid opcode"); 262 // Check if all of the extracts come from the same vector and from the 263 // correct offset. 264 Value *VL0 = VL[0]; 265 ExtractElementInst *E0 = cast<ExtractElementInst>(VL0); 266 Value *Vec = E0->getOperand(0); 267 268 // We have to extract from the same vector type. 269 unsigned NElts = Vec->getType()->getVectorNumElements(); 270 271 if (NElts != VL.size()) 272 return false; 273 274 // Check that all of the indices extract from the correct offset. 
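  // For illustration (IR sketch, value names hypothetical), a reusable
  // sequence looks like:
  //   %e0 = extractelement <4 x float> %vec, i32 0
  //   %e1 = extractelement <4 x float> %vec, i32 1
  //   %e2 = extractelement <4 x float> %vec, i32 2
  //   %e3 = extractelement <4 x float> %vec, i32 3
  // i.e. lane i of VL must extract index i from one common source vector.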
275 ConstantInt *CI = dyn_cast<ConstantInt>(E0->getOperand(1)); 276 if (!CI || CI->getZExtValue()) 277 return false; 278 279 for (unsigned i = 1, e = VL.size(); i < e; ++i) { 280 ExtractElementInst *E = cast<ExtractElementInst>(VL[i]); 281 ConstantInt *CI = dyn_cast<ConstantInt>(E->getOperand(1)); 282 283 if (!CI || CI->getZExtValue() != i || E->getOperand(0) != Vec) 284 return false; 285 } 286 287 return true; 288 } 289 290 /// \returns True if in-tree use also needs extract. This refers to 291 /// possible scalar operand in vectorized instruction. 292 static bool InTreeUserNeedToExtract(Value *Scalar, Instruction *UserInst, 293 TargetLibraryInfo *TLI) { 294 295 unsigned Opcode = UserInst->getOpcode(); 296 switch (Opcode) { 297 case Instruction::Load: { 298 LoadInst *LI = cast<LoadInst>(UserInst); 299 return (LI->getPointerOperand() == Scalar); 300 } 301 case Instruction::Store: { 302 StoreInst *SI = cast<StoreInst>(UserInst); 303 return (SI->getPointerOperand() == Scalar); 304 } 305 case Instruction::Call: { 306 CallInst *CI = cast<CallInst>(UserInst); 307 Intrinsic::ID ID = getIntrinsicIDForCall(CI, TLI); 308 if (hasVectorInstrinsicScalarOpd(ID, 1)) { 309 return (CI->getArgOperand(1) == Scalar); 310 } 311 } 312 default: 313 return false; 314 } 315 } 316 317 /// \returns the AA location that is being access by the instruction. 318 static AliasAnalysis::Location getLocation(Instruction *I, AliasAnalysis *AA) { 319 if (StoreInst *SI = dyn_cast<StoreInst>(I)) 320 return AA->getLocation(SI); 321 if (LoadInst *LI = dyn_cast<LoadInst>(I)) 322 return AA->getLocation(LI); 323 return AliasAnalysis::Location(); 324 } 325 326 /// \returns True if the instruction is not a volatile or atomic load/store. 327 static bool isSimple(Instruction *I) { 328 if (LoadInst *LI = dyn_cast<LoadInst>(I)) 329 return LI->isSimple(); 330 if (StoreInst *SI = dyn_cast<StoreInst>(I)) 331 return SI->isSimple(); 332 if (MemIntrinsic *MI = dyn_cast<MemIntrinsic>(I)) 333 return !MI->isVolatile(); 334 return true; 335 } 336 337 /// Bottom Up SLP Vectorizer. 338 class BoUpSLP { 339 public: 340 typedef SmallVector<Value *, 8> ValueList; 341 typedef SmallVector<Instruction *, 16> InstrList; 342 typedef SmallPtrSet<Value *, 16> ValueSet; 343 typedef SmallVector<StoreInst *, 8> StoreList; 344 345 BoUpSLP(Function *Func, ScalarEvolution *Se, const DataLayout *Dl, 346 TargetTransformInfo *Tti, TargetLibraryInfo *TLi, AliasAnalysis *Aa, 347 LoopInfo *Li, DominatorTree *Dt, AssumptionCache *AC) 348 : NumLoadsWantToKeepOrder(0), NumLoadsWantToChangeOrder(0), F(Func), 349 SE(Se), DL(Dl), TTI(Tti), TLI(TLi), AA(Aa), LI(Li), DT(Dt), 350 Builder(Se->getContext()) { 351 CodeMetrics::collectEphemeralValues(F, AC, EphValues); 352 } 353 354 /// \brief Vectorize the tree that starts with the elements in \p VL. 355 /// Returns the vectorized root. 356 Value *vectorizeTree(); 357 358 /// \returns the cost incurred by unwanted spills and fills, caused by 359 /// holding live values over call sites. 360 int getSpillCost(); 361 362 /// \returns the vectorization cost of the subtree that starts at \p VL. 363 /// A negative number means that this is profitable. 364 int getTreeCost(); 365 366 /// Construct a vectorizable tree that starts at \p Roots, ignoring users for 367 /// the purpose of scheduling and extraction in the \p UserIgnoreLst. 368 void buildTree(ArrayRef<Value *> Roots, 369 ArrayRef<Value *> UserIgnoreLst = None); 370 371 /// Clear the internal data structures that are created by 'buildTree'. 
  void deleteTree() {
    VectorizableTree.clear();
    ScalarToTreeEntry.clear();
    MustGather.clear();
    ExternalUses.clear();
    NumLoadsWantToKeepOrder = 0;
    NumLoadsWantToChangeOrder = 0;
    for (auto &Iter : BlocksSchedules) {
      BlockScheduling *BS = Iter.second.get();
      BS->clear();
    }
  }

  /// \returns true if the memory operations A and B are consecutive.
  bool isConsecutiveAccess(Value *A, Value *B);

  /// \brief Perform LICM and CSE on the newly generated gather sequences.
  void optimizeGatherSequence();

  /// \returns true if it is beneficial to reverse the vector order.
  bool shouldReorder() const {
    return NumLoadsWantToChangeOrder > NumLoadsWantToKeepOrder;
  }

private:
  struct TreeEntry;

  /// \returns the cost of the vectorizable entry.
  int getEntryCost(TreeEntry *E);

  /// This is the recursive part of buildTree.
  void buildTree_rec(ArrayRef<Value *> Roots, unsigned Depth);

  /// Vectorize a single entry in the tree.
  Value *vectorizeTree(TreeEntry *E);

  /// Vectorize a single entry in the tree, starting in \p VL.
  Value *vectorizeTree(ArrayRef<Value *> VL);

  /// \returns the pointer to the vectorized value if \p VL is already
  /// vectorized, or NULL. This may happen in cycles.
  Value *alreadyVectorized(ArrayRef<Value *> VL) const;

  /// \brief Take the pointer operand from the Load/Store instruction.
  /// \returns NULL if this is not a valid Load/Store instruction.
  static Value *getPointerOperand(Value *I);

  /// \brief Take the address space operand from the Load/Store instruction.
  /// \returns -1 if this is not a valid Load/Store instruction.
  static unsigned getAddressSpaceOperand(Value *I);

  /// \returns the scalarization cost for this type. Scalarization in this
  /// context means the creation of vectors from a group of scalars.
  int getGatherCost(Type *Ty);

  /// \returns the scalarization cost for this list of values. Assuming that
  /// this subtree gets vectorized, we may need to extract the values from the
  /// roots. This method calculates the cost of extracting the values.
  int getGatherCost(ArrayRef<Value *> VL);

  /// \brief Set the Builder insert point to one after the last instruction in
  /// the bundle.
  void setInsertPointAfterBundle(ArrayRef<Value *> VL);

  /// \returns a vector from a collection of scalars in \p VL.
  Value *Gather(ArrayRef<Value *> VL, VectorType *Ty);

  /// \returns whether the VectorizableTree is fully vectorizable and will
  /// be beneficial even if the tree height is tiny.
  bool isFullyVectorizableTinyTree();

  /// \brief Reorder commutative operands in alt shuffle if they result in
  /// vectorized code.
  void reorderAltShuffleOperands(ArrayRef<Value *> VL,
                                 SmallVectorImpl<Value *> &Left,
                                 SmallVectorImpl<Value *> &Right);

  /// \brief Reorder commutative operands to get a better probability of
  /// generating vectorized code.
  void reorderInputsAccordingToOpcode(ArrayRef<Value *> VL,
                                      SmallVectorImpl<Value *> &Left,
                                      SmallVectorImpl<Value *> &Right);

  struct TreeEntry {
    TreeEntry() : Scalars(), VectorizedValue(nullptr),
                  NeedToGather(0) {}

    /// \returns true if the scalars in VL are equal to this entry.
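    /// For illustration (value names hypothetical): with Scalars = {%a, %b},
    /// isSame({%a, %b}) is true, while isSame({%b, %a}) is false because the
    /// lane order matters.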
458 bool isSame(ArrayRef<Value *> VL) const { 459 assert(VL.size() == Scalars.size() && "Invalid size"); 460 return std::equal(VL.begin(), VL.end(), Scalars.begin()); 461 } 462 463 /// A vector of scalars. 464 ValueList Scalars; 465 466 /// The Scalars are vectorized into this value. It is initialized to Null. 467 Value *VectorizedValue; 468 469 /// Do we need to gather this sequence ? 470 bool NeedToGather; 471 }; 472 473 /// Create a new VectorizableTree entry. 474 TreeEntry *newTreeEntry(ArrayRef<Value *> VL, bool Vectorized) { 475 VectorizableTree.push_back(TreeEntry()); 476 int idx = VectorizableTree.size() - 1; 477 TreeEntry *Last = &VectorizableTree[idx]; 478 Last->Scalars.insert(Last->Scalars.begin(), VL.begin(), VL.end()); 479 Last->NeedToGather = !Vectorized; 480 if (Vectorized) { 481 for (int i = 0, e = VL.size(); i != e; ++i) { 482 assert(!ScalarToTreeEntry.count(VL[i]) && "Scalar already in tree!"); 483 ScalarToTreeEntry[VL[i]] = idx; 484 } 485 } else { 486 MustGather.insert(VL.begin(), VL.end()); 487 } 488 return Last; 489 } 490 491 /// -- Vectorization State -- 492 /// Holds all of the tree entries. 493 std::vector<TreeEntry> VectorizableTree; 494 495 /// Maps a specific scalar to its tree entry. 496 SmallDenseMap<Value*, int> ScalarToTreeEntry; 497 498 /// A list of scalars that we found that we need to keep as scalars. 499 ValueSet MustGather; 500 501 /// This POD struct describes one external user in the vectorized tree. 502 struct ExternalUser { 503 ExternalUser (Value *S, llvm::User *U, int L) : 504 Scalar(S), User(U), Lane(L){}; 505 // Which scalar in our function. 506 Value *Scalar; 507 // Which user that uses the scalar. 508 llvm::User *User; 509 // Which lane does the scalar belong to. 510 int Lane; 511 }; 512 typedef SmallVector<ExternalUser, 16> UserList; 513 514 /// Checks if two instructions may access the same memory. 515 /// 516 /// \p Loc1 is the location of \p Inst1. It is passed explicitly because it 517 /// is invariant in the calling loop. 518 bool isAliased(const AliasAnalysis::Location &Loc1, Instruction *Inst1, 519 Instruction *Inst2) { 520 521 // First check if the result is already in the cache. 522 AliasCacheKey key = std::make_pair(Inst1, Inst2); 523 Optional<bool> &result = AliasCache[key]; 524 if (result.hasValue()) { 525 return result.getValue(); 526 } 527 AliasAnalysis::Location Loc2 = getLocation(Inst2, AA); 528 bool aliased = true; 529 if (Loc1.Ptr && Loc2.Ptr && isSimple(Inst1) && isSimple(Inst2)) { 530 // Do the alias check. 531 aliased = AA->alias(Loc1, Loc2); 532 } 533 // Store the result in the cache. 534 result = aliased; 535 return aliased; 536 } 537 538 typedef std::pair<Instruction *, Instruction *> AliasCacheKey; 539 540 /// Cache for alias results. 541 /// TODO: consider moving this to the AliasAnalysis itself. 542 DenseMap<AliasCacheKey, Optional<bool>> AliasCache; 543 544 /// Removes an instruction from its block and eventually deletes it. 545 /// It's like Instruction::eraseFromParent() except that the actual deletion 546 /// is delayed until BoUpSLP is destructed. 547 /// This is required to ensure that there are no incorrect collisions in the 548 /// AliasCache, which can happen if a new instruction is allocated at the 549 /// same address as a previously deleted instruction. 550 void eraseInstruction(Instruction *I) { 551 I->removeFromParent(); 552 I->dropAllReferences(); 553 DeletedInstructions.push_back(std::unique_ptr<Instruction>(I)); 554 } 555 556 /// Temporary store for deleted instructions. 
Instructions will be deleted 557 /// eventually when the BoUpSLP is destructed. 558 SmallVector<std::unique_ptr<Instruction>, 8> DeletedInstructions; 559 560 /// A list of values that need to extracted out of the tree. 561 /// This list holds pairs of (Internal Scalar : External User). 562 UserList ExternalUses; 563 564 /// Values used only by @llvm.assume calls. 565 SmallPtrSet<const Value *, 32> EphValues; 566 567 /// Holds all of the instructions that we gathered. 568 SetVector<Instruction *> GatherSeq; 569 /// A list of blocks that we are going to CSE. 570 SetVector<BasicBlock *> CSEBlocks; 571 572 /// Contains all scheduling relevant data for an instruction. 573 /// A ScheduleData either represents a single instruction or a member of an 574 /// instruction bundle (= a group of instructions which is combined into a 575 /// vector instruction). 576 struct ScheduleData { 577 578 // The initial value for the dependency counters. It means that the 579 // dependencies are not calculated yet. 580 enum { InvalidDeps = -1 }; 581 582 ScheduleData() 583 : Inst(nullptr), FirstInBundle(nullptr), NextInBundle(nullptr), 584 NextLoadStore(nullptr), SchedulingRegionID(0), SchedulingPriority(0), 585 Dependencies(InvalidDeps), UnscheduledDeps(InvalidDeps), 586 UnscheduledDepsInBundle(InvalidDeps), IsScheduled(false) {} 587 588 void init(int BlockSchedulingRegionID) { 589 FirstInBundle = this; 590 NextInBundle = nullptr; 591 NextLoadStore = nullptr; 592 IsScheduled = false; 593 SchedulingRegionID = BlockSchedulingRegionID; 594 UnscheduledDepsInBundle = UnscheduledDeps; 595 clearDependencies(); 596 } 597 598 /// Returns true if the dependency information has been calculated. 599 bool hasValidDependencies() const { return Dependencies != InvalidDeps; } 600 601 /// Returns true for single instructions and for bundle representatives 602 /// (= the head of a bundle). 603 bool isSchedulingEntity() const { return FirstInBundle == this; } 604 605 /// Returns true if it represents an instruction bundle and not only a 606 /// single instruction. 607 bool isPartOfBundle() const { 608 return NextInBundle != nullptr || FirstInBundle != this; 609 } 610 611 /// Returns true if it is ready for scheduling, i.e. it has no more 612 /// unscheduled depending instructions/bundles. 613 bool isReady() const { 614 assert(isSchedulingEntity() && 615 "can't consider non-scheduling entity for ready list"); 616 return UnscheduledDepsInBundle == 0 && !IsScheduled; 617 } 618 619 /// Modifies the number of unscheduled dependencies, also updating it for 620 /// the whole bundle. 621 int incrementUnscheduledDeps(int Incr) { 622 UnscheduledDeps += Incr; 623 return FirstInBundle->UnscheduledDepsInBundle += Incr; 624 } 625 626 /// Sets the number of unscheduled dependencies to the number of 627 /// dependencies. 628 void resetUnscheduledDeps() { 629 incrementUnscheduledDeps(Dependencies - UnscheduledDeps); 630 } 631 632 /// Clears all dependency information. 
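    /// After this call hasValidDependencies() returns false again, so the
    /// counters have to be recomputed by calculateDependencies() before the
    /// bundle can be scheduled.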
633 void clearDependencies() { 634 Dependencies = InvalidDeps; 635 resetUnscheduledDeps(); 636 MemoryDependencies.clear(); 637 } 638 639 void dump(raw_ostream &os) const { 640 if (!isSchedulingEntity()) { 641 os << "/ " << *Inst; 642 } else if (NextInBundle) { 643 os << '[' << *Inst; 644 ScheduleData *SD = NextInBundle; 645 while (SD) { 646 os << ';' << *SD->Inst; 647 SD = SD->NextInBundle; 648 } 649 os << ']'; 650 } else { 651 os << *Inst; 652 } 653 } 654 655 Instruction *Inst; 656 657 /// Points to the head in an instruction bundle (and always to this for 658 /// single instructions). 659 ScheduleData *FirstInBundle; 660 661 /// Single linked list of all instructions in a bundle. Null if it is a 662 /// single instruction. 663 ScheduleData *NextInBundle; 664 665 /// Single linked list of all memory instructions (e.g. load, store, call) 666 /// in the block - until the end of the scheduling region. 667 ScheduleData *NextLoadStore; 668 669 /// The dependent memory instructions. 670 /// This list is derived on demand in calculateDependencies(). 671 SmallVector<ScheduleData *, 4> MemoryDependencies; 672 673 /// This ScheduleData is in the current scheduling region if this matches 674 /// the current SchedulingRegionID of BlockScheduling. 675 int SchedulingRegionID; 676 677 /// Used for getting a "good" final ordering of instructions. 678 int SchedulingPriority; 679 680 /// The number of dependencies. Constitutes of the number of users of the 681 /// instruction plus the number of dependent memory instructions (if any). 682 /// This value is calculated on demand. 683 /// If InvalidDeps, the number of dependencies is not calculated yet. 684 /// 685 int Dependencies; 686 687 /// The number of dependencies minus the number of dependencies of scheduled 688 /// instructions. As soon as this is zero, the instruction/bundle gets ready 689 /// for scheduling. 690 /// Note that this is negative as long as Dependencies is not calculated. 691 int UnscheduledDeps; 692 693 /// The sum of UnscheduledDeps in a bundle. Equals to UnscheduledDeps for 694 /// single instructions. 695 int UnscheduledDepsInBundle; 696 697 /// True if this instruction is scheduled (or considered as scheduled in the 698 /// dry-run). 699 bool IsScheduled; 700 }; 701 702 #ifndef NDEBUG 703 friend raw_ostream &operator<<(raw_ostream &os, 704 const BoUpSLP::ScheduleData &SD); 705 #endif 706 707 /// Contains all scheduling data for a basic block. 708 /// 709 struct BlockScheduling { 710 711 BlockScheduling(BasicBlock *BB) 712 : BB(BB), ChunkSize(BB->size()), ChunkPos(ChunkSize), 713 ScheduleStart(nullptr), ScheduleEnd(nullptr), 714 FirstLoadStoreInRegion(nullptr), LastLoadStoreInRegion(nullptr), 715 // Make sure that the initial SchedulingRegionID is greater than the 716 // initial SchedulingRegionID in ScheduleData (which is 0). 717 SchedulingRegionID(1) {} 718 719 void clear() { 720 ReadyInsts.clear(); 721 ScheduleStart = nullptr; 722 ScheduleEnd = nullptr; 723 FirstLoadStoreInRegion = nullptr; 724 LastLoadStoreInRegion = nullptr; 725 726 // Make a new scheduling region, i.e. all existing ScheduleData is not 727 // in the new region yet. 
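      // Incrementing the region ID is sufficient: getScheduleData() only
      // returns entries whose SchedulingRegionID matches the current one, so
      // the old ScheduleData is effectively invalidated without touching the
      // map.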
728 ++SchedulingRegionID; 729 } 730 731 ScheduleData *getScheduleData(Value *V) { 732 ScheduleData *SD = ScheduleDataMap[V]; 733 if (SD && SD->SchedulingRegionID == SchedulingRegionID) 734 return SD; 735 return nullptr; 736 } 737 738 bool isInSchedulingRegion(ScheduleData *SD) { 739 return SD->SchedulingRegionID == SchedulingRegionID; 740 } 741 742 /// Marks an instruction as scheduled and puts all dependent ready 743 /// instructions into the ready-list. 744 template <typename ReadyListType> 745 void schedule(ScheduleData *SD, ReadyListType &ReadyList) { 746 SD->IsScheduled = true; 747 DEBUG(dbgs() << "SLP: schedule " << *SD << "\n"); 748 749 ScheduleData *BundleMember = SD; 750 while (BundleMember) { 751 // Handle the def-use chain dependencies. 752 for (Use &U : BundleMember->Inst->operands()) { 753 ScheduleData *OpDef = getScheduleData(U.get()); 754 if (OpDef && OpDef->hasValidDependencies() && 755 OpDef->incrementUnscheduledDeps(-1) == 0) { 756 // There are no more unscheduled dependencies after decrementing, 757 // so we can put the dependent instruction into the ready list. 758 ScheduleData *DepBundle = OpDef->FirstInBundle; 759 assert(!DepBundle->IsScheduled && 760 "already scheduled bundle gets ready"); 761 ReadyList.insert(DepBundle); 762 DEBUG(dbgs() << "SLP: gets ready (def): " << *DepBundle << "\n"); 763 } 764 } 765 // Handle the memory dependencies. 766 for (ScheduleData *MemoryDepSD : BundleMember->MemoryDependencies) { 767 if (MemoryDepSD->incrementUnscheduledDeps(-1) == 0) { 768 // There are no more unscheduled dependencies after decrementing, 769 // so we can put the dependent instruction into the ready list. 770 ScheduleData *DepBundle = MemoryDepSD->FirstInBundle; 771 assert(!DepBundle->IsScheduled && 772 "already scheduled bundle gets ready"); 773 ReadyList.insert(DepBundle); 774 DEBUG(dbgs() << "SLP: gets ready (mem): " << *DepBundle << "\n"); 775 } 776 } 777 BundleMember = BundleMember->NextInBundle; 778 } 779 } 780 781 /// Put all instructions into the ReadyList which are ready for scheduling. 782 template <typename ReadyListType> 783 void initialFillReadyList(ReadyListType &ReadyList) { 784 for (auto *I = ScheduleStart; I != ScheduleEnd; I = I->getNextNode()) { 785 ScheduleData *SD = getScheduleData(I); 786 if (SD->isSchedulingEntity() && SD->isReady()) { 787 ReadyList.insert(SD); 788 DEBUG(dbgs() << "SLP: initially in ready list: " << *I << "\n"); 789 } 790 } 791 } 792 793 /// Checks if a bundle of instructions can be scheduled, i.e. has no 794 /// cyclic dependencies. This is only a dry-run, no instructions are 795 /// actually moved at this stage. 796 bool tryScheduleBundle(ArrayRef<Value *> VL, BoUpSLP *SLP); 797 798 /// Un-bundles a group of instructions. 799 void cancelScheduling(ArrayRef<Value *> VL); 800 801 /// Extends the scheduling region so that V is inside the region. 802 void extendSchedulingRegion(Value *V); 803 804 /// Initialize the ScheduleData structures for new instructions in the 805 /// scheduling region. 806 void initScheduleData(Instruction *FromI, Instruction *ToI, 807 ScheduleData *PrevLoadStore, 808 ScheduleData *NextLoadStore); 809 810 /// Updates the dependency information of a bundle and of all instructions/ 811 /// bundles which depend on the original bundle. 812 void calculateDependencies(ScheduleData *SD, bool InsertInReadyList, 813 BoUpSLP *SLP); 814 815 /// Sets all instruction in the scheduling region to un-scheduled. 816 void resetSchedule(); 817 818 BasicBlock *BB; 819 820 /// Simple memory allocation for ScheduleData. 
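    /// (Presumably ScheduleData objects are handed out from the last chunk
    /// until ChunkPos reaches ChunkSize, at which point a new chunk is
    /// allocated; see ChunkSize and ChunkPos below.)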
821 std::vector<std::unique_ptr<ScheduleData[]>> ScheduleDataChunks; 822 823 /// The size of a ScheduleData array in ScheduleDataChunks. 824 int ChunkSize; 825 826 /// The allocator position in the current chunk, which is the last entry 827 /// of ScheduleDataChunks. 828 int ChunkPos; 829 830 /// Attaches ScheduleData to Instruction. 831 /// Note that the mapping survives during all vectorization iterations, i.e. 832 /// ScheduleData structures are recycled. 833 DenseMap<Value *, ScheduleData *> ScheduleDataMap; 834 835 struct ReadyList : SmallVector<ScheduleData *, 8> { 836 void insert(ScheduleData *SD) { push_back(SD); } 837 }; 838 839 /// The ready-list for scheduling (only used for the dry-run). 840 ReadyList ReadyInsts; 841 842 /// The first instruction of the scheduling region. 843 Instruction *ScheduleStart; 844 845 /// The first instruction _after_ the scheduling region. 846 Instruction *ScheduleEnd; 847 848 /// The first memory accessing instruction in the scheduling region 849 /// (can be null). 850 ScheduleData *FirstLoadStoreInRegion; 851 852 /// The last memory accessing instruction in the scheduling region 853 /// (can be null). 854 ScheduleData *LastLoadStoreInRegion; 855 856 /// The ID of the scheduling region. For a new vectorization iteration this 857 /// is incremented which "removes" all ScheduleData from the region. 858 int SchedulingRegionID; 859 }; 860 861 /// Attaches the BlockScheduling structures to basic blocks. 862 MapVector<BasicBlock *, std::unique_ptr<BlockScheduling>> BlocksSchedules; 863 864 /// Performs the "real" scheduling. Done before vectorization is actually 865 /// performed in a basic block. 866 void scheduleBlock(BlockScheduling *BS); 867 868 /// List of users to ignore during scheduling and that don't need extracting. 869 ArrayRef<Value *> UserIgnoreList; 870 871 // Number of load-bundles, which contain consecutive loads. 872 int NumLoadsWantToKeepOrder; 873 874 // Number of load-bundles of size 2, which are consecutive loads if reversed. 875 int NumLoadsWantToChangeOrder; 876 877 // Analysis and block reference. 878 Function *F; 879 ScalarEvolution *SE; 880 const DataLayout *DL; 881 TargetTransformInfo *TTI; 882 TargetLibraryInfo *TLI; 883 AliasAnalysis *AA; 884 LoopInfo *LI; 885 DominatorTree *DT; 886 /// Instruction builder to construct the vectorized tree. 887 IRBuilder<> Builder; 888 }; 889 890 #ifndef NDEBUG 891 raw_ostream &operator<<(raw_ostream &os, const BoUpSLP::ScheduleData &SD) { 892 SD.dump(os); 893 return os; 894 } 895 #endif 896 897 void BoUpSLP::buildTree(ArrayRef<Value *> Roots, 898 ArrayRef<Value *> UserIgnoreLst) { 899 deleteTree(); 900 UserIgnoreList = UserIgnoreLst; 901 if (!getSameType(Roots)) 902 return; 903 buildTree_rec(Roots, 0); 904 905 // Collect the values that we need to extract from the tree. 906 for (int EIdx = 0, EE = VectorizableTree.size(); EIdx < EE; ++EIdx) { 907 TreeEntry *Entry = &VectorizableTree[EIdx]; 908 909 // For each lane: 910 for (int Lane = 0, LE = Entry->Scalars.size(); Lane != LE; ++Lane) { 911 Value *Scalar = Entry->Scalars[Lane]; 912 913 // No need to handle users of gathered values. 
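      // Gathered scalars stay in place and are only read when building the
      // vector, so their external users can keep using the original scalar
      // value and no extract is required.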
914 if (Entry->NeedToGather) 915 continue; 916 917 for (User *U : Scalar->users()) { 918 DEBUG(dbgs() << "SLP: Checking user:" << *U << ".\n"); 919 920 Instruction *UserInst = dyn_cast<Instruction>(U); 921 if (!UserInst) 922 continue; 923 924 // Skip in-tree scalars that become vectors 925 if (ScalarToTreeEntry.count(U)) { 926 int Idx = ScalarToTreeEntry[U]; 927 TreeEntry *UseEntry = &VectorizableTree[Idx]; 928 Value *UseScalar = UseEntry->Scalars[0]; 929 // Some in-tree scalars will remain as scalar in vectorized 930 // instructions. If that is the case, the one in Lane 0 will 931 // be used. 932 if (UseScalar != U || 933 !InTreeUserNeedToExtract(Scalar, UserInst, TLI)) { 934 DEBUG(dbgs() << "SLP: \tInternal user will be removed:" << *U 935 << ".\n"); 936 assert(!VectorizableTree[Idx].NeedToGather && "Bad state"); 937 continue; 938 } 939 } 940 941 // Ignore users in the user ignore list. 942 if (std::find(UserIgnoreList.begin(), UserIgnoreList.end(), UserInst) != 943 UserIgnoreList.end()) 944 continue; 945 946 DEBUG(dbgs() << "SLP: Need to extract:" << *U << " from lane " << 947 Lane << " from " << *Scalar << ".\n"); 948 ExternalUses.push_back(ExternalUser(Scalar, U, Lane)); 949 } 950 } 951 } 952 } 953 954 955 void BoUpSLP::buildTree_rec(ArrayRef<Value *> VL, unsigned Depth) { 956 bool SameTy = getSameType(VL); (void)SameTy; 957 bool isAltShuffle = false; 958 assert(SameTy && "Invalid types!"); 959 960 if (Depth == RecursionMaxDepth) { 961 DEBUG(dbgs() << "SLP: Gathering due to max recursion depth.\n"); 962 newTreeEntry(VL, false); 963 return; 964 } 965 966 // Don't handle vectors. 967 if (VL[0]->getType()->isVectorTy()) { 968 DEBUG(dbgs() << "SLP: Gathering due to vector type.\n"); 969 newTreeEntry(VL, false); 970 return; 971 } 972 973 if (StoreInst *SI = dyn_cast<StoreInst>(VL[0])) 974 if (SI->getValueOperand()->getType()->isVectorTy()) { 975 DEBUG(dbgs() << "SLP: Gathering due to store vector type.\n"); 976 newTreeEntry(VL, false); 977 return; 978 } 979 unsigned Opcode = getSameOpcode(VL); 980 981 // Check that this shuffle vector refers to the alternate 982 // sequence of opcodes. 983 if (Opcode == Instruction::ShuffleVector) { 984 Instruction *I0 = dyn_cast<Instruction>(VL[0]); 985 unsigned Op = I0->getOpcode(); 986 if (Op != Instruction::ShuffleVector) 987 isAltShuffle = true; 988 } 989 990 // If all of the operands are identical or constant we have a simple solution. 991 if (allConstant(VL) || isSplat(VL) || !getSameBlock(VL) || !Opcode) { 992 DEBUG(dbgs() << "SLP: Gathering due to C,S,B,O. \n"); 993 newTreeEntry(VL, false); 994 return; 995 } 996 997 // We now know that this is a vector of instructions of the same type from 998 // the same block. 999 1000 // Don't vectorize ephemeral values. 1001 for (unsigned i = 0, e = VL.size(); i != e; ++i) { 1002 if (EphValues.count(VL[i])) { 1003 DEBUG(dbgs() << "SLP: The instruction (" << *VL[i] << 1004 ") is ephemeral.\n"); 1005 newTreeEntry(VL, false); 1006 return; 1007 } 1008 } 1009 1010 // Check if this is a duplicate of another entry. 
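  // For illustration: if the bundle {%a, %b} was already built for one use
  // chain and the same {%a, %b} is reached again through another operand,
  // the existing entry is reused (the "perfect diamond merge" below).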
1011 if (ScalarToTreeEntry.count(VL[0])) { 1012 int Idx = ScalarToTreeEntry[VL[0]]; 1013 TreeEntry *E = &VectorizableTree[Idx]; 1014 for (unsigned i = 0, e = VL.size(); i != e; ++i) { 1015 DEBUG(dbgs() << "SLP: \tChecking bundle: " << *VL[i] << ".\n"); 1016 if (E->Scalars[i] != VL[i]) { 1017 DEBUG(dbgs() << "SLP: Gathering due to partial overlap.\n"); 1018 newTreeEntry(VL, false); 1019 return; 1020 } 1021 } 1022 DEBUG(dbgs() << "SLP: Perfect diamond merge at " << *VL[0] << ".\n"); 1023 return; 1024 } 1025 1026 // Check that none of the instructions in the bundle are already in the tree. 1027 for (unsigned i = 0, e = VL.size(); i != e; ++i) { 1028 if (ScalarToTreeEntry.count(VL[i])) { 1029 DEBUG(dbgs() << "SLP: The instruction (" << *VL[i] << 1030 ") is already in tree.\n"); 1031 newTreeEntry(VL, false); 1032 return; 1033 } 1034 } 1035 1036 // If any of the scalars is marked as a value that needs to stay scalar then 1037 // we need to gather the scalars. 1038 for (unsigned i = 0, e = VL.size(); i != e; ++i) { 1039 if (MustGather.count(VL[i])) { 1040 DEBUG(dbgs() << "SLP: Gathering due to gathered scalar.\n"); 1041 newTreeEntry(VL, false); 1042 return; 1043 } 1044 } 1045 1046 // Check that all of the users of the scalars that we want to vectorize are 1047 // schedulable. 1048 Instruction *VL0 = cast<Instruction>(VL[0]); 1049 BasicBlock *BB = cast<Instruction>(VL0)->getParent(); 1050 1051 if (!DT->isReachableFromEntry(BB)) { 1052 // Don't go into unreachable blocks. They may contain instructions with 1053 // dependency cycles which confuse the final scheduling. 1054 DEBUG(dbgs() << "SLP: bundle in unreachable block.\n"); 1055 newTreeEntry(VL, false); 1056 return; 1057 } 1058 1059 // Check that every instructions appears once in this bundle. 1060 for (unsigned i = 0, e = VL.size(); i < e; ++i) 1061 for (unsigned j = i+1; j < e; ++j) 1062 if (VL[i] == VL[j]) { 1063 DEBUG(dbgs() << "SLP: Scalar used twice in bundle.\n"); 1064 newTreeEntry(VL, false); 1065 return; 1066 } 1067 1068 auto &BSRef = BlocksSchedules[BB]; 1069 if (!BSRef) { 1070 BSRef = llvm::make_unique<BlockScheduling>(BB); 1071 } 1072 BlockScheduling &BS = *BSRef.get(); 1073 1074 if (!BS.tryScheduleBundle(VL, this)) { 1075 DEBUG(dbgs() << "SLP: We are not able to schedule this bundle!\n"); 1076 BS.cancelScheduling(VL); 1077 newTreeEntry(VL, false); 1078 return; 1079 } 1080 DEBUG(dbgs() << "SLP: We are able to schedule this bundle.\n"); 1081 1082 switch (Opcode) { 1083 case Instruction::PHI: { 1084 PHINode *PH = dyn_cast<PHINode>(VL0); 1085 1086 // Check for terminator values (e.g. invoke). 1087 for (unsigned j = 0; j < VL.size(); ++j) 1088 for (unsigned i = 0, e = PH->getNumIncomingValues(); i < e; ++i) { 1089 TerminatorInst *Term = dyn_cast<TerminatorInst>( 1090 cast<PHINode>(VL[j])->getIncomingValueForBlock(PH->getIncomingBlock(i))); 1091 if (Term) { 1092 DEBUG(dbgs() << "SLP: Need to swizzle PHINodes (TerminatorInst use).\n"); 1093 BS.cancelScheduling(VL); 1094 newTreeEntry(VL, false); 1095 return; 1096 } 1097 } 1098 1099 newTreeEntry(VL, true); 1100 DEBUG(dbgs() << "SLP: added a vector of PHINodes.\n"); 1101 1102 for (unsigned i = 0, e = PH->getNumIncomingValues(); i < e; ++i) { 1103 ValueList Operands; 1104 // Prepare the operand vector. 
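      // For incoming block i, the values flowing in from that block in every
      // PHI of the bundle form one new operand bundle.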
1105 for (unsigned j = 0; j < VL.size(); ++j) 1106 Operands.push_back(cast<PHINode>(VL[j])->getIncomingValueForBlock( 1107 PH->getIncomingBlock(i))); 1108 1109 buildTree_rec(Operands, Depth + 1); 1110 } 1111 return; 1112 } 1113 case Instruction::ExtractElement: { 1114 bool Reuse = CanReuseExtract(VL); 1115 if (Reuse) { 1116 DEBUG(dbgs() << "SLP: Reusing extract sequence.\n"); 1117 } else { 1118 BS.cancelScheduling(VL); 1119 } 1120 newTreeEntry(VL, Reuse); 1121 return; 1122 } 1123 case Instruction::Load: { 1124 // Check if the loads are consecutive or of we need to swizzle them. 1125 for (unsigned i = 0, e = VL.size() - 1; i < e; ++i) { 1126 LoadInst *L = cast<LoadInst>(VL[i]); 1127 if (!L->isSimple()) { 1128 BS.cancelScheduling(VL); 1129 newTreeEntry(VL, false); 1130 DEBUG(dbgs() << "SLP: Gathering non-simple loads.\n"); 1131 return; 1132 } 1133 if (!isConsecutiveAccess(VL[i], VL[i + 1])) { 1134 if (VL.size() == 2 && isConsecutiveAccess(VL[1], VL[0])) { 1135 ++NumLoadsWantToChangeOrder; 1136 } 1137 BS.cancelScheduling(VL); 1138 newTreeEntry(VL, false); 1139 DEBUG(dbgs() << "SLP: Gathering non-consecutive loads.\n"); 1140 return; 1141 } 1142 } 1143 ++NumLoadsWantToKeepOrder; 1144 newTreeEntry(VL, true); 1145 DEBUG(dbgs() << "SLP: added a vector of loads.\n"); 1146 return; 1147 } 1148 case Instruction::ZExt: 1149 case Instruction::SExt: 1150 case Instruction::FPToUI: 1151 case Instruction::FPToSI: 1152 case Instruction::FPExt: 1153 case Instruction::PtrToInt: 1154 case Instruction::IntToPtr: 1155 case Instruction::SIToFP: 1156 case Instruction::UIToFP: 1157 case Instruction::Trunc: 1158 case Instruction::FPTrunc: 1159 case Instruction::BitCast: { 1160 Type *SrcTy = VL0->getOperand(0)->getType(); 1161 for (unsigned i = 0; i < VL.size(); ++i) { 1162 Type *Ty = cast<Instruction>(VL[i])->getOperand(0)->getType(); 1163 if (Ty != SrcTy || !isValidElementType(Ty)) { 1164 BS.cancelScheduling(VL); 1165 newTreeEntry(VL, false); 1166 DEBUG(dbgs() << "SLP: Gathering casts with different src types.\n"); 1167 return; 1168 } 1169 } 1170 newTreeEntry(VL, true); 1171 DEBUG(dbgs() << "SLP: added a vector of casts.\n"); 1172 1173 for (unsigned i = 0, e = VL0->getNumOperands(); i < e; ++i) { 1174 ValueList Operands; 1175 // Prepare the operand vector. 1176 for (unsigned j = 0; j < VL.size(); ++j) 1177 Operands.push_back(cast<Instruction>(VL[j])->getOperand(i)); 1178 1179 buildTree_rec(Operands, Depth+1); 1180 } 1181 return; 1182 } 1183 case Instruction::ICmp: 1184 case Instruction::FCmp: { 1185 // Check that all of the compares have the same predicate. 1186 CmpInst::Predicate P0 = dyn_cast<CmpInst>(VL0)->getPredicate(); 1187 Type *ComparedTy = cast<Instruction>(VL[0])->getOperand(0)->getType(); 1188 for (unsigned i = 1, e = VL.size(); i < e; ++i) { 1189 CmpInst *Cmp = cast<CmpInst>(VL[i]); 1190 if (Cmp->getPredicate() != P0 || 1191 Cmp->getOperand(0)->getType() != ComparedTy) { 1192 BS.cancelScheduling(VL); 1193 newTreeEntry(VL, false); 1194 DEBUG(dbgs() << "SLP: Gathering cmp with different predicate.\n"); 1195 return; 1196 } 1197 } 1198 1199 newTreeEntry(VL, true); 1200 DEBUG(dbgs() << "SLP: added a vector of compares.\n"); 1201 1202 for (unsigned i = 0, e = VL0->getNumOperands(); i < e; ++i) { 1203 ValueList Operands; 1204 // Prepare the operand vector. 
1205 for (unsigned j = 0; j < VL.size(); ++j) 1206 Operands.push_back(cast<Instruction>(VL[j])->getOperand(i)); 1207 1208 buildTree_rec(Operands, Depth+1); 1209 } 1210 return; 1211 } 1212 case Instruction::Select: 1213 case Instruction::Add: 1214 case Instruction::FAdd: 1215 case Instruction::Sub: 1216 case Instruction::FSub: 1217 case Instruction::Mul: 1218 case Instruction::FMul: 1219 case Instruction::UDiv: 1220 case Instruction::SDiv: 1221 case Instruction::FDiv: 1222 case Instruction::URem: 1223 case Instruction::SRem: 1224 case Instruction::FRem: 1225 case Instruction::Shl: 1226 case Instruction::LShr: 1227 case Instruction::AShr: 1228 case Instruction::And: 1229 case Instruction::Or: 1230 case Instruction::Xor: { 1231 newTreeEntry(VL, true); 1232 DEBUG(dbgs() << "SLP: added a vector of bin op.\n"); 1233 1234 // Sort operands of the instructions so that each side is more likely to 1235 // have the same opcode. 1236 if (isa<BinaryOperator>(VL0) && VL0->isCommutative()) { 1237 ValueList Left, Right; 1238 reorderInputsAccordingToOpcode(VL, Left, Right); 1239 buildTree_rec(Left, Depth + 1); 1240 buildTree_rec(Right, Depth + 1); 1241 return; 1242 } 1243 1244 for (unsigned i = 0, e = VL0->getNumOperands(); i < e; ++i) { 1245 ValueList Operands; 1246 // Prepare the operand vector. 1247 for (unsigned j = 0; j < VL.size(); ++j) 1248 Operands.push_back(cast<Instruction>(VL[j])->getOperand(i)); 1249 1250 buildTree_rec(Operands, Depth+1); 1251 } 1252 return; 1253 } 1254 case Instruction::GetElementPtr: { 1255 // We don't combine GEPs with complicated (nested) indexing. 1256 for (unsigned j = 0; j < VL.size(); ++j) { 1257 if (cast<Instruction>(VL[j])->getNumOperands() != 2) { 1258 DEBUG(dbgs() << "SLP: not-vectorizable GEP (nested indexes).\n"); 1259 BS.cancelScheduling(VL); 1260 newTreeEntry(VL, false); 1261 return; 1262 } 1263 } 1264 1265 // We can't combine several GEPs into one vector if they operate on 1266 // different types. 1267 Type *Ty0 = cast<Instruction>(VL0)->getOperand(0)->getType(); 1268 for (unsigned j = 0; j < VL.size(); ++j) { 1269 Type *CurTy = cast<Instruction>(VL[j])->getOperand(0)->getType(); 1270 if (Ty0 != CurTy) { 1271 DEBUG(dbgs() << "SLP: not-vectorizable GEP (different types).\n"); 1272 BS.cancelScheduling(VL); 1273 newTreeEntry(VL, false); 1274 return; 1275 } 1276 } 1277 1278 // We don't combine GEPs with non-constant indexes. 1279 for (unsigned j = 0; j < VL.size(); ++j) { 1280 auto Op = cast<Instruction>(VL[j])->getOperand(1); 1281 if (!isa<ConstantInt>(Op)) { 1282 DEBUG( 1283 dbgs() << "SLP: not-vectorizable GEP (non-constant indexes).\n"); 1284 BS.cancelScheduling(VL); 1285 newTreeEntry(VL, false); 1286 return; 1287 } 1288 } 1289 1290 newTreeEntry(VL, true); 1291 DEBUG(dbgs() << "SLP: added a vector of GEPs.\n"); 1292 for (unsigned i = 0, e = 2; i < e; ++i) { 1293 ValueList Operands; 1294 // Prepare the operand vector. 1295 for (unsigned j = 0; j < VL.size(); ++j) 1296 Operands.push_back(cast<Instruction>(VL[j])->getOperand(i)); 1297 1298 buildTree_rec(Operands, Depth + 1); 1299 } 1300 return; 1301 } 1302 case Instruction::Store: { 1303 // Check if the stores are consecutive or of we need to swizzle them. 
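    // For illustration (names hypothetical): stores to a[0], a[1], a[2], a[3]
    // in adjacent lanes can later be emitted as a single wide store.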
1304 for (unsigned i = 0, e = VL.size() - 1; i < e; ++i) 1305 if (!isConsecutiveAccess(VL[i], VL[i + 1])) { 1306 BS.cancelScheduling(VL); 1307 newTreeEntry(VL, false); 1308 DEBUG(dbgs() << "SLP: Non-consecutive store.\n"); 1309 return; 1310 } 1311 1312 newTreeEntry(VL, true); 1313 DEBUG(dbgs() << "SLP: added a vector of stores.\n"); 1314 1315 ValueList Operands; 1316 for (unsigned j = 0; j < VL.size(); ++j) 1317 Operands.push_back(cast<Instruction>(VL[j])->getOperand(0)); 1318 1319 buildTree_rec(Operands, Depth + 1); 1320 return; 1321 } 1322 case Instruction::Call: { 1323 // Check if the calls are all to the same vectorizable intrinsic. 1324 CallInst *CI = cast<CallInst>(VL[0]); 1325 // Check if this is an Intrinsic call or something that can be 1326 // represented by an intrinsic call 1327 Intrinsic::ID ID = getIntrinsicIDForCall(CI, TLI); 1328 if (!isTriviallyVectorizable(ID)) { 1329 BS.cancelScheduling(VL); 1330 newTreeEntry(VL, false); 1331 DEBUG(dbgs() << "SLP: Non-vectorizable call.\n"); 1332 return; 1333 } 1334 Function *Int = CI->getCalledFunction(); 1335 Value *A1I = nullptr; 1336 if (hasVectorInstrinsicScalarOpd(ID, 1)) 1337 A1I = CI->getArgOperand(1); 1338 for (unsigned i = 1, e = VL.size(); i != e; ++i) { 1339 CallInst *CI2 = dyn_cast<CallInst>(VL[i]); 1340 if (!CI2 || CI2->getCalledFunction() != Int || 1341 getIntrinsicIDForCall(CI2, TLI) != ID) { 1342 BS.cancelScheduling(VL); 1343 newTreeEntry(VL, false); 1344 DEBUG(dbgs() << "SLP: mismatched calls:" << *CI << "!=" << *VL[i] 1345 << "\n"); 1346 return; 1347 } 1348 // ctlz,cttz and powi are special intrinsics whose second argument 1349 // should be same in order for them to be vectorized. 1350 if (hasVectorInstrinsicScalarOpd(ID, 1)) { 1351 Value *A1J = CI2->getArgOperand(1); 1352 if (A1I != A1J) { 1353 BS.cancelScheduling(VL); 1354 newTreeEntry(VL, false); 1355 DEBUG(dbgs() << "SLP: mismatched arguments in call:" << *CI 1356 << " argument "<< A1I<<"!=" << A1J 1357 << "\n"); 1358 return; 1359 } 1360 } 1361 } 1362 1363 newTreeEntry(VL, true); 1364 for (unsigned i = 0, e = CI->getNumArgOperands(); i != e; ++i) { 1365 ValueList Operands; 1366 // Prepare the operand vector. 1367 for (unsigned j = 0; j < VL.size(); ++j) { 1368 CallInst *CI2 = dyn_cast<CallInst>(VL[j]); 1369 Operands.push_back(CI2->getArgOperand(i)); 1370 } 1371 buildTree_rec(Operands, Depth + 1); 1372 } 1373 return; 1374 } 1375 case Instruction::ShuffleVector: { 1376 // If this is not an alternate sequence of opcode like add-sub 1377 // then do not vectorize this instruction. 1378 if (!isAltShuffle) { 1379 BS.cancelScheduling(VL); 1380 newTreeEntry(VL, false); 1381 DEBUG(dbgs() << "SLP: ShuffleVector are not vectorized.\n"); 1382 return; 1383 } 1384 newTreeEntry(VL, true); 1385 DEBUG(dbgs() << "SLP: added a ShuffleVector op.\n"); 1386 1387 // Reorder operands if reordering would enable vectorization. 1388 if (isa<BinaryOperator>(VL0)) { 1389 ValueList Left, Right; 1390 reorderAltShuffleOperands(VL, Left, Right); 1391 buildTree_rec(Left, Depth + 1); 1392 buildTree_rec(Right, Depth + 1); 1393 return; 1394 } 1395 1396 for (unsigned i = 0, e = VL0->getNumOperands(); i < e; ++i) { 1397 ValueList Operands; 1398 // Prepare the operand vector. 
1399 for (unsigned j = 0; j < VL.size(); ++j) 1400 Operands.push_back(cast<Instruction>(VL[j])->getOperand(i)); 1401 1402 buildTree_rec(Operands, Depth + 1); 1403 } 1404 return; 1405 } 1406 default: 1407 BS.cancelScheduling(VL); 1408 newTreeEntry(VL, false); 1409 DEBUG(dbgs() << "SLP: Gathering unknown instruction.\n"); 1410 return; 1411 } 1412 } 1413 1414 int BoUpSLP::getEntryCost(TreeEntry *E) { 1415 ArrayRef<Value*> VL = E->Scalars; 1416 1417 Type *ScalarTy = VL[0]->getType(); 1418 if (StoreInst *SI = dyn_cast<StoreInst>(VL[0])) 1419 ScalarTy = SI->getValueOperand()->getType(); 1420 VectorType *VecTy = VectorType::get(ScalarTy, VL.size()); 1421 1422 if (E->NeedToGather) { 1423 if (allConstant(VL)) 1424 return 0; 1425 if (isSplat(VL)) { 1426 return TTI->getShuffleCost(TargetTransformInfo::SK_Broadcast, VecTy, 0); 1427 } 1428 return getGatherCost(E->Scalars); 1429 } 1430 unsigned Opcode = getSameOpcode(VL); 1431 assert(Opcode && getSameType(VL) && getSameBlock(VL) && "Invalid VL"); 1432 Instruction *VL0 = cast<Instruction>(VL[0]); 1433 switch (Opcode) { 1434 case Instruction::PHI: { 1435 return 0; 1436 } 1437 case Instruction::ExtractElement: { 1438 if (CanReuseExtract(VL)) { 1439 int DeadCost = 0; 1440 for (unsigned i = 0, e = VL.size(); i < e; ++i) { 1441 ExtractElementInst *E = cast<ExtractElementInst>(VL[i]); 1442 if (E->hasOneUse()) 1443 // Take credit for instruction that will become dead. 1444 DeadCost += 1445 TTI->getVectorInstrCost(Instruction::ExtractElement, VecTy, i); 1446 } 1447 return -DeadCost; 1448 } 1449 return getGatherCost(VecTy); 1450 } 1451 case Instruction::ZExt: 1452 case Instruction::SExt: 1453 case Instruction::FPToUI: 1454 case Instruction::FPToSI: 1455 case Instruction::FPExt: 1456 case Instruction::PtrToInt: 1457 case Instruction::IntToPtr: 1458 case Instruction::SIToFP: 1459 case Instruction::UIToFP: 1460 case Instruction::Trunc: 1461 case Instruction::FPTrunc: 1462 case Instruction::BitCast: { 1463 Type *SrcTy = VL0->getOperand(0)->getType(); 1464 1465 // Calculate the cost of this instruction. 1466 int ScalarCost = VL.size() * TTI->getCastInstrCost(VL0->getOpcode(), 1467 VL0->getType(), SrcTy); 1468 1469 VectorType *SrcVecTy = VectorType::get(SrcTy, VL.size()); 1470 int VecCost = TTI->getCastInstrCost(VL0->getOpcode(), VecTy, SrcVecTy); 1471 return VecCost - ScalarCost; 1472 } 1473 case Instruction::FCmp: 1474 case Instruction::ICmp: 1475 case Instruction::Select: 1476 case Instruction::Add: 1477 case Instruction::FAdd: 1478 case Instruction::Sub: 1479 case Instruction::FSub: 1480 case Instruction::Mul: 1481 case Instruction::FMul: 1482 case Instruction::UDiv: 1483 case Instruction::SDiv: 1484 case Instruction::FDiv: 1485 case Instruction::URem: 1486 case Instruction::SRem: 1487 case Instruction::FRem: 1488 case Instruction::Shl: 1489 case Instruction::LShr: 1490 case Instruction::AShr: 1491 case Instruction::And: 1492 case Instruction::Or: 1493 case Instruction::Xor: { 1494 // Calculate the cost of this instruction. 1495 int ScalarCost = 0; 1496 int VecCost = 0; 1497 if (Opcode == Instruction::FCmp || Opcode == Instruction::ICmp || 1498 Opcode == Instruction::Select) { 1499 VectorType *MaskTy = VectorType::get(Builder.getInt1Ty(), VL.size()); 1500 ScalarCost = VecTy->getNumElements() * 1501 TTI->getCmpSelInstrCost(Opcode, ScalarTy, Builder.getInt1Ty()); 1502 VecCost = TTI->getCmpSelInstrCost(Opcode, VecTy, MaskTy); 1503 } else { 1504 // Certain instructions can be cheaper to vectorize if they have a 1505 // constant second vector operand. 
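      // For example, a shift or division where every lane uses the same
      // constant (in particular a power of two, see OP_PowerOf2 below) can
      // map to a cheaper vector instruction on some targets.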
1506 TargetTransformInfo::OperandValueKind Op1VK = 1507 TargetTransformInfo::OK_AnyValue; 1508 TargetTransformInfo::OperandValueKind Op2VK = 1509 TargetTransformInfo::OK_UniformConstantValue; 1510 TargetTransformInfo::OperandValueProperties Op1VP = 1511 TargetTransformInfo::OP_None; 1512 TargetTransformInfo::OperandValueProperties Op2VP = 1513 TargetTransformInfo::OP_None; 1514 1515 // If all operands are exactly the same ConstantInt then set the 1516 // operand kind to OK_UniformConstantValue. 1517 // If instead not all operands are constants, then set the operand kind 1518 // to OK_AnyValue. If all operands are constants but not the same, 1519 // then set the operand kind to OK_NonUniformConstantValue. 1520 ConstantInt *CInt = nullptr; 1521 for (unsigned i = 0; i < VL.size(); ++i) { 1522 const Instruction *I = cast<Instruction>(VL[i]); 1523 if (!isa<ConstantInt>(I->getOperand(1))) { 1524 Op2VK = TargetTransformInfo::OK_AnyValue; 1525 break; 1526 } 1527 if (i == 0) { 1528 CInt = cast<ConstantInt>(I->getOperand(1)); 1529 continue; 1530 } 1531 if (Op2VK == TargetTransformInfo::OK_UniformConstantValue && 1532 CInt != cast<ConstantInt>(I->getOperand(1))) 1533 Op2VK = TargetTransformInfo::OK_NonUniformConstantValue; 1534 } 1535 // FIXME: Currently cost of model modification for division by 1536 // power of 2 is handled only for X86. Add support for other targets. 1537 if (Op2VK == TargetTransformInfo::OK_UniformConstantValue && CInt && 1538 CInt->getValue().isPowerOf2()) 1539 Op2VP = TargetTransformInfo::OP_PowerOf2; 1540 1541 ScalarCost = VecTy->getNumElements() * 1542 TTI->getArithmeticInstrCost(Opcode, ScalarTy, Op1VK, Op2VK, 1543 Op1VP, Op2VP); 1544 VecCost = TTI->getArithmeticInstrCost(Opcode, VecTy, Op1VK, Op2VK, 1545 Op1VP, Op2VP); 1546 } 1547 return VecCost - ScalarCost; 1548 } 1549 case Instruction::GetElementPtr: { 1550 TargetTransformInfo::OperandValueKind Op1VK = 1551 TargetTransformInfo::OK_AnyValue; 1552 TargetTransformInfo::OperandValueKind Op2VK = 1553 TargetTransformInfo::OK_UniformConstantValue; 1554 1555 int ScalarCost = 1556 VecTy->getNumElements() * 1557 TTI->getArithmeticInstrCost(Instruction::Add, ScalarTy, Op1VK, Op2VK); 1558 int VecCost = 1559 TTI->getArithmeticInstrCost(Instruction::Add, VecTy, Op1VK, Op2VK); 1560 1561 return VecCost - ScalarCost; 1562 } 1563 case Instruction::Load: { 1564 // Cost of wide load - cost of scalar loads. 1565 int ScalarLdCost = VecTy->getNumElements() * 1566 TTI->getMemoryOpCost(Instruction::Load, ScalarTy, 1, 0); 1567 int VecLdCost = TTI->getMemoryOpCost(Instruction::Load, VecTy, 1, 0); 1568 return VecLdCost - ScalarLdCost; 1569 } 1570 case Instruction::Store: { 1571 // We know that we can merge the stores. Calculate the cost. 1572 int ScalarStCost = VecTy->getNumElements() * 1573 TTI->getMemoryOpCost(Instruction::Store, ScalarTy, 1, 0); 1574 int VecStCost = TTI->getMemoryOpCost(Instruction::Store, VecTy, 1, 0); 1575 return VecStCost - ScalarStCost; 1576 } 1577 case Instruction::Call: { 1578 CallInst *CI = cast<CallInst>(VL0); 1579 Intrinsic::ID ID = getIntrinsicIDForCall(CI, TLI); 1580 1581 // Calculate the cost of the scalar and vector calls. 
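    // The scalar cost is the bundle width times the cost of one scalar
    // intrinsic call; the vector cost is a single call of the same intrinsic
    // on the widened argument types.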
1582 SmallVector<Type*, 4> ScalarTys, VecTys; 1583 for (unsigned op = 0, opc = CI->getNumArgOperands(); op!= opc; ++op) { 1584 ScalarTys.push_back(CI->getArgOperand(op)->getType()); 1585 VecTys.push_back(VectorType::get(CI->getArgOperand(op)->getType(), 1586 VecTy->getNumElements())); 1587 } 1588 1589 int ScalarCallCost = VecTy->getNumElements() * 1590 TTI->getIntrinsicInstrCost(ID, ScalarTy, ScalarTys); 1591 1592 int VecCallCost = TTI->getIntrinsicInstrCost(ID, VecTy, VecTys); 1593 1594 DEBUG(dbgs() << "SLP: Call cost "<< VecCallCost - ScalarCallCost 1595 << " (" << VecCallCost << "-" << ScalarCallCost << ")" 1596 << " for " << *CI << "\n"); 1597 1598 return VecCallCost - ScalarCallCost; 1599 } 1600 case Instruction::ShuffleVector: { 1601 TargetTransformInfo::OperandValueKind Op1VK = 1602 TargetTransformInfo::OK_AnyValue; 1603 TargetTransformInfo::OperandValueKind Op2VK = 1604 TargetTransformInfo::OK_AnyValue; 1605 int ScalarCost = 0; 1606 int VecCost = 0; 1607 for (unsigned i = 0; i < VL.size(); ++i) { 1608 Instruction *I = cast<Instruction>(VL[i]); 1609 if (!I) 1610 break; 1611 ScalarCost += 1612 TTI->getArithmeticInstrCost(I->getOpcode(), ScalarTy, Op1VK, Op2VK); 1613 } 1614 // VecCost is equal to sum of the cost of creating 2 vectors 1615 // and the cost of creating shuffle. 1616 Instruction *I0 = cast<Instruction>(VL[0]); 1617 VecCost = 1618 TTI->getArithmeticInstrCost(I0->getOpcode(), VecTy, Op1VK, Op2VK); 1619 Instruction *I1 = cast<Instruction>(VL[1]); 1620 VecCost += 1621 TTI->getArithmeticInstrCost(I1->getOpcode(), VecTy, Op1VK, Op2VK); 1622 VecCost += 1623 TTI->getShuffleCost(TargetTransformInfo::SK_Alternate, VecTy, 0); 1624 return VecCost - ScalarCost; 1625 } 1626 default: 1627 llvm_unreachable("Unknown instruction"); 1628 } 1629 } 1630 1631 bool BoUpSLP::isFullyVectorizableTinyTree() { 1632 DEBUG(dbgs() << "SLP: Check whether the tree with height " << 1633 VectorizableTree.size() << " is fully vectorizable .\n"); 1634 1635 // We only handle trees of height 2. 1636 if (VectorizableTree.size() != 2) 1637 return false; 1638 1639 // Handle splat stores. 1640 if (!VectorizableTree[0].NeedToGather && isSplat(VectorizableTree[1].Scalars)) 1641 return true; 1642 1643 // Gathering cost would be too much for tiny trees. 1644 if (VectorizableTree[0].NeedToGather || VectorizableTree[1].NeedToGather) 1645 return false; 1646 1647 return true; 1648 } 1649 1650 int BoUpSLP::getSpillCost() { 1651 // Walk from the bottom of the tree to the top, tracking which values are 1652 // live. When we see a call instruction that is not part of our tree, 1653 // query TTI to see if there is a cost to keeping values live over it 1654 // (for example, if spills and fills are required). 1655 unsigned BundleWidth = VectorizableTree.front().Scalars.size(); 1656 int Cost = 0; 1657 1658 SmallPtrSet<Instruction*, 4> LiveValues; 1659 Instruction *PrevInst = nullptr; 1660 1661 for (unsigned N = 0; N < VectorizableTree.size(); ++N) { 1662 Instruction *Inst = dyn_cast<Instruction>(VectorizableTree[N].Scalars[0]); 1663 if (!Inst) 1664 continue; 1665 1666 if (!PrevInst) { 1667 PrevInst = Inst; 1668 continue; 1669 } 1670 1671 DEBUG( 1672 dbgs() << "SLP: #LV: " << LiveValues.size(); 1673 for (auto *X : LiveValues) 1674 dbgs() << " " << X->getName(); 1675 dbgs() << ", Looking at "; 1676 Inst->dump(); 1677 ); 1678 1679 // Update LiveValues. 
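    // PrevInst has now been accounted for: it leaves the live set, while its
    // in-tree instruction operands become live, since they must survive until
    // PrevInst executes.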
1680 LiveValues.erase(PrevInst); 1681 for (auto &J : PrevInst->operands()) { 1682 if (isa<Instruction>(&*J) && ScalarToTreeEntry.count(&*J)) 1683 LiveValues.insert(cast<Instruction>(&*J)); 1684 } 1685 1686 // Now find the sequence of instructions between PrevInst and Inst. 1687 BasicBlock::reverse_iterator InstIt(Inst), PrevInstIt(PrevInst); 1688 --PrevInstIt; 1689 while (InstIt != PrevInstIt) { 1690 if (PrevInstIt == PrevInst->getParent()->rend()) { 1691 PrevInstIt = Inst->getParent()->rbegin(); 1692 continue; 1693 } 1694 1695 if (isa<CallInst>(&*PrevInstIt) && &*PrevInstIt != PrevInst) { 1696 SmallVector<Type*, 4> V; 1697 for (auto *II : LiveValues) 1698 V.push_back(VectorType::get(II->getType(), BundleWidth)); 1699 Cost += TTI->getCostOfKeepingLiveOverCall(V); 1700 } 1701 1702 ++PrevInstIt; 1703 } 1704 1705 PrevInst = Inst; 1706 } 1707 1708 DEBUG(dbgs() << "SLP: SpillCost=" << Cost << "\n"); 1709 return Cost; 1710 } 1711 1712 int BoUpSLP::getTreeCost() { 1713 int Cost = 0; 1714 DEBUG(dbgs() << "SLP: Calculating cost for tree of size " << 1715 VectorizableTree.size() << ".\n"); 1716 1717 // We only vectorize tiny trees if it is fully vectorizable. 1718 if (VectorizableTree.size() < 3 && !isFullyVectorizableTinyTree()) { 1719 if (VectorizableTree.empty()) { 1720 assert(!ExternalUses.size() && "We should not have any external users"); 1721 } 1722 return INT_MAX; 1723 } 1724 1725 unsigned BundleWidth = VectorizableTree[0].Scalars.size(); 1726 1727 for (unsigned i = 0, e = VectorizableTree.size(); i != e; ++i) { 1728 int C = getEntryCost(&VectorizableTree[i]); 1729 DEBUG(dbgs() << "SLP: Adding cost " << C << " for bundle that starts with " 1730 << *VectorizableTree[i].Scalars[0] << " .\n"); 1731 Cost += C; 1732 } 1733 1734 SmallSet<Value *, 16> ExtractCostCalculated; 1735 int ExtractCost = 0; 1736 for (UserList::iterator I = ExternalUses.begin(), E = ExternalUses.end(); 1737 I != E; ++I) { 1738 // We only add extract cost once for the same scalar. 1739 if (!ExtractCostCalculated.insert(I->Scalar).second) 1740 continue; 1741 1742 // Uses by ephemeral values are free (because the ephemeral value will be 1743 // removed prior to code generation, and so the extraction will be 1744 // removed as well). 1745 if (EphValues.count(I->User)) 1746 continue; 1747 1748 VectorType *VecTy = VectorType::get(I->Scalar->getType(), BundleWidth); 1749 ExtractCost += TTI->getVectorInstrCost(Instruction::ExtractElement, VecTy, 1750 I->Lane); 1751 } 1752 1753 Cost += getSpillCost(); 1754 1755 DEBUG(dbgs() << "SLP: Total Cost " << Cost + ExtractCost<< ".\n"); 1756 return Cost + ExtractCost; 1757 } 1758 1759 int BoUpSLP::getGatherCost(Type *Ty) { 1760 int Cost = 0; 1761 for (unsigned i = 0, e = cast<VectorType>(Ty)->getNumElements(); i < e; ++i) 1762 Cost += TTI->getVectorInstrCost(Instruction::InsertElement, Ty, i); 1763 return Cost; 1764 } 1765 1766 int BoUpSLP::getGatherCost(ArrayRef<Value *> VL) { 1767 // Find the type of the operands in VL. 1768 Type *ScalarTy = VL[0]->getType(); 1769 if (StoreInst *SI = dyn_cast<StoreInst>(VL[0])) 1770 ScalarTy = SI->getValueOperand()->getType(); 1771 VectorType *VecTy = VectorType::get(ScalarTy, VL.size()); 1772 // Find the cost of inserting/extracting values from the vector. 
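// Delegate to the type-based overload above, which sums the cost of one
// insertelement per lane. E.g. gathering four i32 scalars is modeled as
// four inserts into a <4 x i32> vector.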
1773 return getGatherCost(VecTy); 1774 } 1775 1776 Value *BoUpSLP::getPointerOperand(Value *I) { 1777 if (LoadInst *LI = dyn_cast<LoadInst>(I)) 1778 return LI->getPointerOperand(); 1779 if (StoreInst *SI = dyn_cast<StoreInst>(I)) 1780 return SI->getPointerOperand(); 1781 return nullptr; 1782 } 1783 1784 unsigned BoUpSLP::getAddressSpaceOperand(Value *I) { 1785 if (LoadInst *L = dyn_cast<LoadInst>(I)) 1786 return L->getPointerAddressSpace(); 1787 if (StoreInst *S = dyn_cast<StoreInst>(I)) 1788 return S->getPointerAddressSpace(); 1789 return -1; 1790 } 1791 1792 bool BoUpSLP::isConsecutiveAccess(Value *A, Value *B) { 1793 Value *PtrA = getPointerOperand(A); 1794 Value *PtrB = getPointerOperand(B); 1795 unsigned ASA = getAddressSpaceOperand(A); 1796 unsigned ASB = getAddressSpaceOperand(B); 1797 1798 // Check that the address spaces match and that the pointers are valid. 1799 if (!PtrA || !PtrB || (ASA != ASB)) 1800 return false; 1801 1802 // Make sure that A and B are different pointers of the same type. 1803 if (PtrA == PtrB || PtrA->getType() != PtrB->getType()) 1804 return false; 1805 1806 unsigned PtrBitWidth = DL->getPointerSizeInBits(ASA); 1807 Type *Ty = cast<PointerType>(PtrA->getType())->getElementType(); 1808 APInt Size(PtrBitWidth, DL->getTypeStoreSize(Ty)); 1809 1810 APInt OffsetA(PtrBitWidth, 0), OffsetB(PtrBitWidth, 0); 1811 PtrA = PtrA->stripAndAccumulateInBoundsConstantOffsets(*DL, OffsetA); 1812 PtrB = PtrB->stripAndAccumulateInBoundsConstantOffsets(*DL, OffsetB); 1813 1814 APInt OffsetDelta = OffsetB - OffsetA; 1815 1816 // Check if they are based on the same pointer. That makes the offsets 1817 // sufficient. 1818 if (PtrA == PtrB) 1819 return OffsetDelta == Size; 1820 1821 // Compute the necessary base pointer delta to have the necessary final delta 1822 // equal to the size. 1823 APInt BaseDelta = Size - OffsetDelta; 1824 1825 // Otherwise compute the distance with SCEV between the base pointers. 1826 const SCEV *PtrSCEVA = SE->getSCEV(PtrA); 1827 const SCEV *PtrSCEVB = SE->getSCEV(PtrB); 1828 const SCEV *C = SE->getConstant(BaseDelta); 1829 const SCEV *X = SE->getAddExpr(PtrSCEVA, C); 1830 return X == PtrSCEVB; 1831 } 1832 1833 // Reorder commutative operations in alternate shuffle if the resulting vectors 1834 // are consecutive loads. This would allow us to vectorize the tree. 1835 // If we have something like- 1836 // load a[0] - load b[0] 1837 // load b[1] + load a[1] 1838 // load a[2] - load b[2] 1839 // load a[3] + load b[3] 1840 // Reordering the second load b[1] load a[1] would allow us to vectorize this 1841 // code. 1842 void BoUpSLP::reorderAltShuffleOperands(ArrayRef<Value *> VL, 1843 SmallVectorImpl<Value *> &Left, 1844 SmallVectorImpl<Value *> &Right) { 1845 1846 // Push left and right operands of binary operation into Left and Right 1847 for (unsigned i = 0, e = VL.size(); i < e; ++i) { 1848 Left.push_back(cast<Instruction>(VL[i])->getOperand(0)); 1849 Right.push_back(cast<Instruction>(VL[i])->getOperand(1)); 1850 } 1851 1852 // Reorder if we have a commutative operation and consecutive access 1853 // are on either side of the alternate instructions. 
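// Scan adjacent lanes pairwise: if a load in lane j and a load on the
// opposite side of lane j + 1 are consecutive in memory, swap the operands
// of whichever of the two lanes is commutative.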
1854 for (unsigned j = 0; j < VL.size() - 1; ++j) {
1855 if (LoadInst *L = dyn_cast<LoadInst>(Left[j])) {
1856 if (LoadInst *L1 = dyn_cast<LoadInst>(Right[j + 1])) {
1857 Instruction *VL1 = cast<Instruction>(VL[j]);
1858 Instruction *VL2 = cast<Instruction>(VL[j + 1]);
1859 if (isConsecutiveAccess(L, L1) && VL1->isCommutative()) {
1860 std::swap(Left[j], Right[j]);
1861 continue;
1862 } else if (isConsecutiveAccess(L, L1) && VL2->isCommutative()) {
1863 std::swap(Left[j + 1], Right[j + 1]);
1864 continue;
1865 }
1866 // else unchanged
1867 }
1868 }
1869 if (LoadInst *L = dyn_cast<LoadInst>(Right[j])) {
1870 if (LoadInst *L1 = dyn_cast<LoadInst>(Left[j + 1])) {
1871 Instruction *VL1 = cast<Instruction>(VL[j]);
1872 Instruction *VL2 = cast<Instruction>(VL[j + 1]);
1873 if (isConsecutiveAccess(L, L1) && VL1->isCommutative()) {
1874 std::swap(Left[j], Right[j]);
1875 continue;
1876 } else if (isConsecutiveAccess(L, L1) && VL2->isCommutative()) {
1877 std::swap(Left[j + 1], Right[j + 1]);
1878 continue;
1879 }
1880 // else unchanged
1881 }
1882 }
1883 }
1884 }
1885 
1886 void BoUpSLP::reorderInputsAccordingToOpcode(ArrayRef<Value *> VL,
1887 SmallVectorImpl<Value *> &Left,
1888 SmallVectorImpl<Value *> &Right) {
1889 
1890 SmallVector<Value *, 16> OrigLeft, OrigRight;
1891 
1892 bool AllSameOpcodeLeft = true;
1893 bool AllSameOpcodeRight = true;
1894 for (unsigned i = 0, e = VL.size(); i != e; ++i) {
1895 Instruction *I = cast<Instruction>(VL[i]);
1896 Value *VLeft = I->getOperand(0);
1897 Value *VRight = I->getOperand(1);
1898 
1899 OrigLeft.push_back(VLeft);
1900 OrigRight.push_back(VRight);
1901 
1902 Instruction *ILeft = dyn_cast<Instruction>(VLeft);
1903 Instruction *IRight = dyn_cast<Instruction>(VRight);
1904 
1905 // Check whether all operands on one side have the same opcode. In this case
1906 // we want to preserve the original order and not make things worse by
1907 // reordering.
1908 if (i && AllSameOpcodeLeft && ILeft) {
1909 if (Instruction *PLeft = dyn_cast<Instruction>(OrigLeft[i - 1])) {
1910 if (PLeft->getOpcode() != ILeft->getOpcode())
1911 AllSameOpcodeLeft = false;
1912 } else
1913 AllSameOpcodeLeft = false;
1914 }
1915 if (i && AllSameOpcodeRight && IRight) {
1916 if (Instruction *PRight = dyn_cast<Instruction>(OrigRight[i - 1])) {
1917 if (PRight->getOpcode() != IRight->getOpcode())
1918 AllSameOpcodeRight = false;
1919 } else
1920 AllSameOpcodeRight = false;
1921 }
1922 
1923 // Sort two opcodes. In the code below we try to preserve the ability to use
1924 // broadcast of values instead of individual inserts.
1925 // vl1 = load
1926 // vl2 = phi
1927 // vr1 = load
1928 // vr2 = vr1
1929 // = vl1 x vr1
1930 // = vl2 x vr2
1931 // If we just sorted according to opcode we would leave the first line
1932 // intact but we would swap vl2 with vr2 because opcode(phi) > opcode(load).
1933 // = vl1 x vr1
1934 // = vr2 x vl2
1935 // Because vr2 and vr1 are from the same load we lose the opportunity of a
1936 // broadcast for the packed right side in the backend: we have [vr1, vl2]
1937 // instead of [vr1, vr2=vr1].
1938 if (ILeft && IRight) {
1939 if (!i && ILeft->getOpcode() > IRight->getOpcode()) {
1940 Left.push_back(IRight);
1941 Right.push_back(ILeft);
1942 } else if (i && ILeft->getOpcode() > IRight->getOpcode() &&
1943 Right[i - 1] != IRight) {
1944 // Try not to destroy a broadcast for no apparent benefit.
1945 Left.push_back(IRight);
1946 Right.push_back(ILeft);
1947 } else if (i && ILeft->getOpcode() == IRight->getOpcode() &&
1948 Right[i - 1] == ILeft) {
1949 // Try to preserve broadcasts.
1950 Left.push_back(IRight);
1951 Right.push_back(ILeft);
1952 } else if (i && ILeft->getOpcode() == IRight->getOpcode() &&
1953 Left[i - 1] == IRight) {
1954 // Try to preserve broadcasts.
1955 Left.push_back(IRight);
1956 Right.push_back(ILeft);
1957 } else {
1958 Left.push_back(ILeft);
1959 Right.push_back(IRight);
1960 }
1961 continue;
1962 }
1963 // Only one operand is an instruction; put that instruction on the right.
1964 if (ILeft) {
1965 Left.push_back(VRight);
1966 Right.push_back(ILeft);
1967 continue;
1968 }
1969 Left.push_back(VLeft);
1970 Right.push_back(VRight);
1971 }
1972 
1973 bool LeftBroadcast = isSplat(Left);
1974 bool RightBroadcast = isSplat(Right);
1975 
1976 // If either operand list ends up being a broadcast, keep this operand order.
1977 if (LeftBroadcast || RightBroadcast)
1978 return;
1979 
1980 // Don't reorder if the operands were good to begin with.
1981 if (AllSameOpcodeRight || AllSameOpcodeLeft) {
1982 Left = OrigLeft;
1983 Right = OrigRight;
1984 }
1985 
1986 // Finally check if we can get a longer vectorizable chain by reordering
1987 // without breaking the good operand order detected above.
1988 // E.g. if we have something like:
1989 // load a[0] load b[0]
1990 // load b[1] load a[1]
1991 // load a[2] load b[2]
1992 // load a[3] load b[3]
1993 // Reordering the second lane (load b[1], load a[1]) would allow us to
1994 // vectorize this code and we still retain the AllSameOpcode property.
1995 // FIXME: This load reordering might break AllSameOpcode in some rare cases
1996 // such as:
1997 // add a[0],c[0] load b[0]
1998 // add a[1],c[2] load b[1]
1999 // b[2] load b[2]
2000 // add a[3],c[3] load b[3]
2001 for (unsigned j = 0; j < VL.size() - 1; ++j) {
2002 if (LoadInst *L = dyn_cast<LoadInst>(Left[j])) {
2003 if (LoadInst *L1 = dyn_cast<LoadInst>(Right[j + 1])) {
2004 if (isConsecutiveAccess(L, L1)) {
2005 std::swap(Left[j + 1], Right[j + 1]);
2006 continue;
2007 }
2008 }
2009 }
2010 if (LoadInst *L = dyn_cast<LoadInst>(Right[j])) {
2011 if (LoadInst *L1 = dyn_cast<LoadInst>(Left[j + 1])) {
2012 if (isConsecutiveAccess(L, L1)) {
2013 std::swap(Left[j + 1], Right[j + 1]);
2014 continue;
2015 }
2016 }
2017 }
2018 // else unchanged
2019 }
2020 }
2021 
2022 void BoUpSLP::setInsertPointAfterBundle(ArrayRef<Value *> VL) {
2023 Instruction *VL0 = cast<Instruction>(VL[0]);
2024 BasicBlock::iterator NextInst = VL0;
2025 ++NextInst;
2026 Builder.SetInsertPoint(VL0->getParent(), NextInst);
2027 Builder.SetCurrentDebugLocation(VL0->getDebugLoc());
2028 }
2029 
2030 Value *BoUpSLP::Gather(ArrayRef<Value *> VL, VectorType *Ty) {
2031 Value *Vec = UndefValue::get(Ty);
2032 // Generate the 'InsertElement' instruction.
2033 for (unsigned i = 0; i < Ty->getNumElements(); ++i) {
2034 Vec = Builder.CreateInsertElement(Vec, VL[i], Builder.getInt32(i));
2035 if (Instruction *Insrt = dyn_cast<Instruction>(Vec)) {
2036 GatherSeq.insert(Insrt);
2037 CSEBlocks.insert(Insrt->getParent());
2038 
2039 // Add to our 'need-to-extract' list.
2040 if (ScalarToTreeEntry.count(VL[i])) {
2041 int Idx = ScalarToTreeEntry[VL[i]];
2042 TreeEntry *E = &VectorizableTree[Idx];
2043 // Find which lane we need to extract.
2044 int FoundLane = -1;
2045 for (unsigned Lane = 0, LE = VL.size(); Lane != LE; ++Lane) {
2046 // Is this the lane of the scalar that we are looking for ?
2047 if (E->Scalars[Lane] == VL[i]) { 2048 FoundLane = Lane; 2049 break; 2050 } 2051 } 2052 assert(FoundLane >= 0 && "Could not find the correct lane"); 2053 ExternalUses.push_back(ExternalUser(VL[i], Insrt, FoundLane)); 2054 } 2055 } 2056 } 2057 2058 return Vec; 2059 } 2060 2061 Value *BoUpSLP::alreadyVectorized(ArrayRef<Value *> VL) const { 2062 SmallDenseMap<Value*, int>::const_iterator Entry 2063 = ScalarToTreeEntry.find(VL[0]); 2064 if (Entry != ScalarToTreeEntry.end()) { 2065 int Idx = Entry->second; 2066 const TreeEntry *En = &VectorizableTree[Idx]; 2067 if (En->isSame(VL) && En->VectorizedValue) 2068 return En->VectorizedValue; 2069 } 2070 return nullptr; 2071 } 2072 2073 Value *BoUpSLP::vectorizeTree(ArrayRef<Value *> VL) { 2074 if (ScalarToTreeEntry.count(VL[0])) { 2075 int Idx = ScalarToTreeEntry[VL[0]]; 2076 TreeEntry *E = &VectorizableTree[Idx]; 2077 if (E->isSame(VL)) 2078 return vectorizeTree(E); 2079 } 2080 2081 Type *ScalarTy = VL[0]->getType(); 2082 if (StoreInst *SI = dyn_cast<StoreInst>(VL[0])) 2083 ScalarTy = SI->getValueOperand()->getType(); 2084 VectorType *VecTy = VectorType::get(ScalarTy, VL.size()); 2085 2086 return Gather(VL, VecTy); 2087 } 2088 2089 Value *BoUpSLP::vectorizeTree(TreeEntry *E) { 2090 IRBuilder<>::InsertPointGuard Guard(Builder); 2091 2092 if (E->VectorizedValue) { 2093 DEBUG(dbgs() << "SLP: Diamond merged for " << *E->Scalars[0] << ".\n"); 2094 return E->VectorizedValue; 2095 } 2096 2097 Instruction *VL0 = cast<Instruction>(E->Scalars[0]); 2098 Type *ScalarTy = VL0->getType(); 2099 if (StoreInst *SI = dyn_cast<StoreInst>(VL0)) 2100 ScalarTy = SI->getValueOperand()->getType(); 2101 VectorType *VecTy = VectorType::get(ScalarTy, E->Scalars.size()); 2102 2103 if (E->NeedToGather) { 2104 setInsertPointAfterBundle(E->Scalars); 2105 return Gather(E->Scalars, VecTy); 2106 } 2107 2108 unsigned Opcode = getSameOpcode(E->Scalars); 2109 2110 switch (Opcode) { 2111 case Instruction::PHI: { 2112 PHINode *PH = dyn_cast<PHINode>(VL0); 2113 Builder.SetInsertPoint(PH->getParent()->getFirstNonPHI()); 2114 Builder.SetCurrentDebugLocation(PH->getDebugLoc()); 2115 PHINode *NewPhi = Builder.CreatePHI(VecTy, PH->getNumIncomingValues()); 2116 E->VectorizedValue = NewPhi; 2117 2118 // PHINodes may have multiple entries from the same block. We want to 2119 // visit every block once. 2120 SmallSet<BasicBlock*, 4> VisitedBBs; 2121 2122 for (unsigned i = 0, e = PH->getNumIncomingValues(); i < e; ++i) { 2123 ValueList Operands; 2124 BasicBlock *IBB = PH->getIncomingBlock(i); 2125 2126 if (!VisitedBBs.insert(IBB).second) { 2127 NewPhi->addIncoming(NewPhi->getIncomingValueForBlock(IBB), IBB); 2128 continue; 2129 } 2130 2131 // Prepare the operand vector. 
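// Collect, for every scalar PHI in the bundle, the value incoming from
// block IBB; this operand bundle is vectorized at IBB's terminator and fed
// into the new vector PHI below.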
2132 for (unsigned j = 0; j < E->Scalars.size(); ++j) 2133 Operands.push_back(cast<PHINode>(E->Scalars[j])-> 2134 getIncomingValueForBlock(IBB)); 2135 2136 Builder.SetInsertPoint(IBB->getTerminator()); 2137 Builder.SetCurrentDebugLocation(PH->getDebugLoc()); 2138 Value *Vec = vectorizeTree(Operands); 2139 NewPhi->addIncoming(Vec, IBB); 2140 } 2141 2142 assert(NewPhi->getNumIncomingValues() == PH->getNumIncomingValues() && 2143 "Invalid number of incoming values"); 2144 return NewPhi; 2145 } 2146 2147 case Instruction::ExtractElement: { 2148 if (CanReuseExtract(E->Scalars)) { 2149 Value *V = VL0->getOperand(0); 2150 E->VectorizedValue = V; 2151 return V; 2152 } 2153 return Gather(E->Scalars, VecTy); 2154 } 2155 case Instruction::ZExt: 2156 case Instruction::SExt: 2157 case Instruction::FPToUI: 2158 case Instruction::FPToSI: 2159 case Instruction::FPExt: 2160 case Instruction::PtrToInt: 2161 case Instruction::IntToPtr: 2162 case Instruction::SIToFP: 2163 case Instruction::UIToFP: 2164 case Instruction::Trunc: 2165 case Instruction::FPTrunc: 2166 case Instruction::BitCast: { 2167 ValueList INVL; 2168 for (int i = 0, e = E->Scalars.size(); i < e; ++i) 2169 INVL.push_back(cast<Instruction>(E->Scalars[i])->getOperand(0)); 2170 2171 setInsertPointAfterBundle(E->Scalars); 2172 2173 Value *InVec = vectorizeTree(INVL); 2174 2175 if (Value *V = alreadyVectorized(E->Scalars)) 2176 return V; 2177 2178 CastInst *CI = dyn_cast<CastInst>(VL0); 2179 Value *V = Builder.CreateCast(CI->getOpcode(), InVec, VecTy); 2180 E->VectorizedValue = V; 2181 ++NumVectorInstructions; 2182 return V; 2183 } 2184 case Instruction::FCmp: 2185 case Instruction::ICmp: { 2186 ValueList LHSV, RHSV; 2187 for (int i = 0, e = E->Scalars.size(); i < e; ++i) { 2188 LHSV.push_back(cast<Instruction>(E->Scalars[i])->getOperand(0)); 2189 RHSV.push_back(cast<Instruction>(E->Scalars[i])->getOperand(1)); 2190 } 2191 2192 setInsertPointAfterBundle(E->Scalars); 2193 2194 Value *L = vectorizeTree(LHSV); 2195 Value *R = vectorizeTree(RHSV); 2196 2197 if (Value *V = alreadyVectorized(E->Scalars)) 2198 return V; 2199 2200 CmpInst::Predicate P0 = dyn_cast<CmpInst>(VL0)->getPredicate(); 2201 Value *V; 2202 if (Opcode == Instruction::FCmp) 2203 V = Builder.CreateFCmp(P0, L, R); 2204 else 2205 V = Builder.CreateICmp(P0, L, R); 2206 2207 E->VectorizedValue = V; 2208 ++NumVectorInstructions; 2209 return V; 2210 } 2211 case Instruction::Select: { 2212 ValueList TrueVec, FalseVec, CondVec; 2213 for (int i = 0, e = E->Scalars.size(); i < e; ++i) { 2214 CondVec.push_back(cast<Instruction>(E->Scalars[i])->getOperand(0)); 2215 TrueVec.push_back(cast<Instruction>(E->Scalars[i])->getOperand(1)); 2216 FalseVec.push_back(cast<Instruction>(E->Scalars[i])->getOperand(2)); 2217 } 2218 2219 setInsertPointAfterBundle(E->Scalars); 2220 2221 Value *Cond = vectorizeTree(CondVec); 2222 Value *True = vectorizeTree(TrueVec); 2223 Value *False = vectorizeTree(FalseVec); 2224 2225 if (Value *V = alreadyVectorized(E->Scalars)) 2226 return V; 2227 2228 Value *V = Builder.CreateSelect(Cond, True, False); 2229 E->VectorizedValue = V; 2230 ++NumVectorInstructions; 2231 return V; 2232 } 2233 case Instruction::Add: 2234 case Instruction::FAdd: 2235 case Instruction::Sub: 2236 case Instruction::FSub: 2237 case Instruction::Mul: 2238 case Instruction::FMul: 2239 case Instruction::UDiv: 2240 case Instruction::SDiv: 2241 case Instruction::FDiv: 2242 case Instruction::URem: 2243 case Instruction::SRem: 2244 case Instruction::FRem: 2245 case Instruction::Shl: 2246 case Instruction::LShr: 
2247 case Instruction::AShr: 2248 case Instruction::And: 2249 case Instruction::Or: 2250 case Instruction::Xor: { 2251 ValueList LHSVL, RHSVL; 2252 if (isa<BinaryOperator>(VL0) && VL0->isCommutative()) 2253 reorderInputsAccordingToOpcode(E->Scalars, LHSVL, RHSVL); 2254 else 2255 for (int i = 0, e = E->Scalars.size(); i < e; ++i) { 2256 LHSVL.push_back(cast<Instruction>(E->Scalars[i])->getOperand(0)); 2257 RHSVL.push_back(cast<Instruction>(E->Scalars[i])->getOperand(1)); 2258 } 2259 2260 setInsertPointAfterBundle(E->Scalars); 2261 2262 Value *LHS = vectorizeTree(LHSVL); 2263 Value *RHS = vectorizeTree(RHSVL); 2264 2265 if (LHS == RHS && isa<Instruction>(LHS)) { 2266 assert((VL0->getOperand(0) == VL0->getOperand(1)) && "Invalid order"); 2267 } 2268 2269 if (Value *V = alreadyVectorized(E->Scalars)) 2270 return V; 2271 2272 BinaryOperator *BinOp = cast<BinaryOperator>(VL0); 2273 Value *V = Builder.CreateBinOp(BinOp->getOpcode(), LHS, RHS); 2274 E->VectorizedValue = V; 2275 propagateIRFlags(E->VectorizedValue, E->Scalars); 2276 ++NumVectorInstructions; 2277 2278 if (Instruction *I = dyn_cast<Instruction>(V)) 2279 return propagateMetadata(I, E->Scalars); 2280 2281 return V; 2282 } 2283 case Instruction::Load: { 2284 // Loads are inserted at the head of the tree because we don't want to 2285 // sink them all the way down past store instructions. 2286 setInsertPointAfterBundle(E->Scalars); 2287 2288 LoadInst *LI = cast<LoadInst>(VL0); 2289 Type *ScalarLoadTy = LI->getType(); 2290 unsigned AS = LI->getPointerAddressSpace(); 2291 2292 Value *VecPtr = Builder.CreateBitCast(LI->getPointerOperand(), 2293 VecTy->getPointerTo(AS)); 2294 2295 // The pointer operand uses an in-tree scalar so we add the new BitCast to 2296 // ExternalUses list to make sure that an extract will be generated in the 2297 // future. 2298 if (ScalarToTreeEntry.count(LI->getPointerOperand())) 2299 ExternalUses.push_back( 2300 ExternalUser(LI->getPointerOperand(), cast<User>(VecPtr), 0)); 2301 2302 unsigned Alignment = LI->getAlignment(); 2303 LI = Builder.CreateLoad(VecPtr); 2304 if (!Alignment) 2305 Alignment = DL->getABITypeAlignment(ScalarLoadTy); 2306 LI->setAlignment(Alignment); 2307 E->VectorizedValue = LI; 2308 ++NumVectorInstructions; 2309 return propagateMetadata(LI, E->Scalars); 2310 } 2311 case Instruction::Store: { 2312 StoreInst *SI = cast<StoreInst>(VL0); 2313 unsigned Alignment = SI->getAlignment(); 2314 unsigned AS = SI->getPointerAddressSpace(); 2315 2316 ValueList ValueOp; 2317 for (int i = 0, e = E->Scalars.size(); i < e; ++i) 2318 ValueOp.push_back(cast<StoreInst>(E->Scalars[i])->getValueOperand()); 2319 2320 setInsertPointAfterBundle(E->Scalars); 2321 2322 Value *VecValue = vectorizeTree(ValueOp); 2323 Value *VecPtr = Builder.CreateBitCast(SI->getPointerOperand(), 2324 VecTy->getPointerTo(AS)); 2325 StoreInst *S = Builder.CreateStore(VecValue, VecPtr); 2326 2327 // The pointer operand uses an in-tree scalar so we add the new BitCast to 2328 // ExternalUses list to make sure that an extract will be generated in the 2329 // future. 
2330 if (ScalarToTreeEntry.count(SI->getPointerOperand())) 2331 ExternalUses.push_back( 2332 ExternalUser(SI->getPointerOperand(), cast<User>(VecPtr), 0)); 2333 2334 if (!Alignment) 2335 Alignment = DL->getABITypeAlignment(SI->getValueOperand()->getType()); 2336 S->setAlignment(Alignment); 2337 E->VectorizedValue = S; 2338 ++NumVectorInstructions; 2339 return propagateMetadata(S, E->Scalars); 2340 } 2341 case Instruction::GetElementPtr: { 2342 setInsertPointAfterBundle(E->Scalars); 2343 2344 ValueList Op0VL; 2345 for (int i = 0, e = E->Scalars.size(); i < e; ++i) 2346 Op0VL.push_back(cast<GetElementPtrInst>(E->Scalars[i])->getOperand(0)); 2347 2348 Value *Op0 = vectorizeTree(Op0VL); 2349 2350 std::vector<Value *> OpVecs; 2351 for (int j = 1, e = cast<GetElementPtrInst>(VL0)->getNumOperands(); j < e; 2352 ++j) { 2353 ValueList OpVL; 2354 for (int i = 0, e = E->Scalars.size(); i < e; ++i) 2355 OpVL.push_back(cast<GetElementPtrInst>(E->Scalars[i])->getOperand(j)); 2356 2357 Value *OpVec = vectorizeTree(OpVL); 2358 OpVecs.push_back(OpVec); 2359 } 2360 2361 Value *V = Builder.CreateGEP(Op0, OpVecs); 2362 E->VectorizedValue = V; 2363 ++NumVectorInstructions; 2364 2365 if (Instruction *I = dyn_cast<Instruction>(V)) 2366 return propagateMetadata(I, E->Scalars); 2367 2368 return V; 2369 } 2370 case Instruction::Call: { 2371 CallInst *CI = cast<CallInst>(VL0); 2372 setInsertPointAfterBundle(E->Scalars); 2373 Function *FI; 2374 Intrinsic::ID IID = Intrinsic::not_intrinsic; 2375 Value *ScalarArg = nullptr; 2376 if (CI && (FI = CI->getCalledFunction())) { 2377 IID = (Intrinsic::ID) FI->getIntrinsicID(); 2378 } 2379 std::vector<Value *> OpVecs; 2380 for (int j = 0, e = CI->getNumArgOperands(); j < e; ++j) { 2381 ValueList OpVL; 2382 // ctlz,cttz and powi are special intrinsics whose second argument is 2383 // a scalar. This argument should not be vectorized. 2384 if (hasVectorInstrinsicScalarOpd(IID, 1) && j == 1) { 2385 CallInst *CEI = cast<CallInst>(E->Scalars[0]); 2386 ScalarArg = CEI->getArgOperand(j); 2387 OpVecs.push_back(CEI->getArgOperand(j)); 2388 continue; 2389 } 2390 for (int i = 0, e = E->Scalars.size(); i < e; ++i) { 2391 CallInst *CEI = cast<CallInst>(E->Scalars[i]); 2392 OpVL.push_back(CEI->getArgOperand(j)); 2393 } 2394 2395 Value *OpVec = vectorizeTree(OpVL); 2396 DEBUG(dbgs() << "SLP: OpVec[" << j << "]: " << *OpVec << "\n"); 2397 OpVecs.push_back(OpVec); 2398 } 2399 2400 Module *M = F->getParent(); 2401 Intrinsic::ID ID = getIntrinsicIDForCall(CI, TLI); 2402 Type *Tys[] = { VectorType::get(CI->getType(), E->Scalars.size()) }; 2403 Function *CF = Intrinsic::getDeclaration(M, ID, Tys); 2404 Value *V = Builder.CreateCall(CF, OpVecs); 2405 2406 // The scalar argument uses an in-tree scalar so we add the new vectorized 2407 // call to ExternalUses list to make sure that an extract will be 2408 // generated in the future. 
2409 if (ScalarArg && ScalarToTreeEntry.count(ScalarArg)) 2410 ExternalUses.push_back(ExternalUser(ScalarArg, cast<User>(V), 0)); 2411 2412 E->VectorizedValue = V; 2413 ++NumVectorInstructions; 2414 return V; 2415 } 2416 case Instruction::ShuffleVector: { 2417 ValueList LHSVL, RHSVL; 2418 assert(isa<BinaryOperator>(VL0) && "Invalid Shuffle Vector Operand"); 2419 reorderAltShuffleOperands(E->Scalars, LHSVL, RHSVL); 2420 setInsertPointAfterBundle(E->Scalars); 2421 2422 Value *LHS = vectorizeTree(LHSVL); 2423 Value *RHS = vectorizeTree(RHSVL); 2424 2425 if (Value *V = alreadyVectorized(E->Scalars)) 2426 return V; 2427 2428 // Create a vector of LHS op1 RHS 2429 BinaryOperator *BinOp0 = cast<BinaryOperator>(VL0); 2430 Value *V0 = Builder.CreateBinOp(BinOp0->getOpcode(), LHS, RHS); 2431 2432 // Create a vector of LHS op2 RHS 2433 Instruction *VL1 = cast<Instruction>(E->Scalars[1]); 2434 BinaryOperator *BinOp1 = cast<BinaryOperator>(VL1); 2435 Value *V1 = Builder.CreateBinOp(BinOp1->getOpcode(), LHS, RHS); 2436 2437 // Create shuffle to take alternate operations from the vector. 2438 // Also, gather up odd and even scalar ops to propagate IR flags to 2439 // each vector operation. 2440 ValueList OddScalars, EvenScalars; 2441 unsigned e = E->Scalars.size(); 2442 SmallVector<Constant *, 8> Mask(e); 2443 for (unsigned i = 0; i < e; ++i) { 2444 if (i & 1) { 2445 Mask[i] = Builder.getInt32(e + i); 2446 OddScalars.push_back(E->Scalars[i]); 2447 } else { 2448 Mask[i] = Builder.getInt32(i); 2449 EvenScalars.push_back(E->Scalars[i]); 2450 } 2451 } 2452 2453 Value *ShuffleMask = ConstantVector::get(Mask); 2454 propagateIRFlags(V0, EvenScalars); 2455 propagateIRFlags(V1, OddScalars); 2456 2457 Value *V = Builder.CreateShuffleVector(V0, V1, ShuffleMask); 2458 E->VectorizedValue = V; 2459 ++NumVectorInstructions; 2460 if (Instruction *I = dyn_cast<Instruction>(V)) 2461 return propagateMetadata(I, E->Scalars); 2462 2463 return V; 2464 } 2465 default: 2466 llvm_unreachable("unknown inst"); 2467 } 2468 return nullptr; 2469 } 2470 2471 Value *BoUpSLP::vectorizeTree() { 2472 2473 // All blocks must be scheduled before any instructions are inserted. 2474 for (auto &BSIter : BlocksSchedules) { 2475 scheduleBlock(BSIter.second.get()); 2476 } 2477 2478 Builder.SetInsertPoint(F->getEntryBlock().begin()); 2479 vectorizeTree(&VectorizableTree[0]); 2480 2481 DEBUG(dbgs() << "SLP: Extracting " << ExternalUses.size() << " values .\n"); 2482 2483 // Extract all of the elements with the external uses. 2484 for (UserList::iterator it = ExternalUses.begin(), e = ExternalUses.end(); 2485 it != e; ++it) { 2486 Value *Scalar = it->Scalar; 2487 llvm::User *User = it->User; 2488 2489 // Skip users that we already RAUW. This happens when one instruction 2490 // has multiple uses of the same value. 2491 if (std::find(Scalar->user_begin(), Scalar->user_end(), User) == 2492 Scalar->user_end()) 2493 continue; 2494 assert(ScalarToTreeEntry.count(Scalar) && "Invalid scalar"); 2495 2496 int Idx = ScalarToTreeEntry[Scalar]; 2497 TreeEntry *E = &VectorizableTree[Idx]; 2498 assert(!E->NeedToGather && "Extracting from a gather list"); 2499 2500 Value *Vec = E->VectorizedValue; 2501 assert(Vec && "Can't find vectorizable value"); 2502 2503 Value *Lane = Builder.getInt32(it->Lane); 2504 // Generate extracts for out-of-tree users. 2505 // Find the insertion point for the extractelement lane. 
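// Three cases are handled below: the user is a PHI (extract in the
// incoming block, before its terminator), the user is a regular
// instruction (extract right before the user), or the vectorized value is
// not an instruction at all (extract at the start of the entry block).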
2506 if (isa<Instruction>(Vec)){ 2507 if (PHINode *PH = dyn_cast<PHINode>(User)) { 2508 for (int i = 0, e = PH->getNumIncomingValues(); i != e; ++i) { 2509 if (PH->getIncomingValue(i) == Scalar) { 2510 Builder.SetInsertPoint(PH->getIncomingBlock(i)->getTerminator()); 2511 Value *Ex = Builder.CreateExtractElement(Vec, Lane); 2512 CSEBlocks.insert(PH->getIncomingBlock(i)); 2513 PH->setOperand(i, Ex); 2514 } 2515 } 2516 } else { 2517 Builder.SetInsertPoint(cast<Instruction>(User)); 2518 Value *Ex = Builder.CreateExtractElement(Vec, Lane); 2519 CSEBlocks.insert(cast<Instruction>(User)->getParent()); 2520 User->replaceUsesOfWith(Scalar, Ex); 2521 } 2522 } else { 2523 Builder.SetInsertPoint(F->getEntryBlock().begin()); 2524 Value *Ex = Builder.CreateExtractElement(Vec, Lane); 2525 CSEBlocks.insert(&F->getEntryBlock()); 2526 User->replaceUsesOfWith(Scalar, Ex); 2527 } 2528 2529 DEBUG(dbgs() << "SLP: Replaced:" << *User << ".\n"); 2530 } 2531 2532 // For each vectorized value: 2533 for (int EIdx = 0, EE = VectorizableTree.size(); EIdx < EE; ++EIdx) { 2534 TreeEntry *Entry = &VectorizableTree[EIdx]; 2535 2536 // For each lane: 2537 for (int Lane = 0, LE = Entry->Scalars.size(); Lane != LE; ++Lane) { 2538 Value *Scalar = Entry->Scalars[Lane]; 2539 // No need to handle users of gathered values. 2540 if (Entry->NeedToGather) 2541 continue; 2542 2543 assert(Entry->VectorizedValue && "Can't find vectorizable value"); 2544 2545 Type *Ty = Scalar->getType(); 2546 if (!Ty->isVoidTy()) { 2547 #ifndef NDEBUG 2548 for (User *U : Scalar->users()) { 2549 DEBUG(dbgs() << "SLP: \tvalidating user:" << *U << ".\n"); 2550 2551 assert((ScalarToTreeEntry.count(U) || 2552 // It is legal to replace users in the ignorelist by undef. 2553 (std::find(UserIgnoreList.begin(), UserIgnoreList.end(), U) != 2554 UserIgnoreList.end())) && 2555 "Replacing out-of-tree value with undef"); 2556 } 2557 #endif 2558 Value *Undef = UndefValue::get(Ty); 2559 Scalar->replaceAllUsesWith(Undef); 2560 } 2561 DEBUG(dbgs() << "SLP: \tErasing scalar:" << *Scalar << ".\n"); 2562 eraseInstruction(cast<Instruction>(Scalar)); 2563 } 2564 } 2565 2566 Builder.ClearInsertionPoint(); 2567 2568 return VectorizableTree[0].VectorizedValue; 2569 } 2570 2571 void BoUpSLP::optimizeGatherSequence() { 2572 DEBUG(dbgs() << "SLP: Optimizing " << GatherSeq.size() 2573 << " gather sequences instructions.\n"); 2574 // LICM InsertElementInst sequences. 2575 for (SetVector<Instruction *>::iterator it = GatherSeq.begin(), 2576 e = GatherSeq.end(); it != e; ++it) { 2577 InsertElementInst *Insert = dyn_cast<InsertElementInst>(*it); 2578 2579 if (!Insert) 2580 continue; 2581 2582 // Check if this block is inside a loop. 2583 Loop *L = LI->getLoopFor(Insert->getParent()); 2584 if (!L) 2585 continue; 2586 2587 // Check if it has a preheader. 2588 BasicBlock *PreHeader = L->getLoopPreheader(); 2589 if (!PreHeader) 2590 continue; 2591 2592 // If the vector or the element that we insert into it are 2593 // instructions that are defined in this basic block then we can't 2594 // hoist this instruction. 2595 Instruction *CurrVec = dyn_cast<Instruction>(Insert->getOperand(0)); 2596 Instruction *NewElem = dyn_cast<Instruction>(Insert->getOperand(1)); 2597 if (CurrVec && L->contains(CurrVec)) 2598 continue; 2599 if (NewElem && L->contains(NewElem)) 2600 continue; 2601 2602 // We can hoist this instruction. Move it to the pre-header. 2603 Insert->moveBefore(PreHeader->getTerminator()); 2604 } 2605 2606 // Make a list of all reachable blocks in our CSE queue. 
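// Blocks without a dominator tree node (unreachable code) are skipped.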
2607 SmallVector<const DomTreeNode *, 8> CSEWorkList; 2608 CSEWorkList.reserve(CSEBlocks.size()); 2609 for (BasicBlock *BB : CSEBlocks) 2610 if (DomTreeNode *N = DT->getNode(BB)) { 2611 assert(DT->isReachableFromEntry(N)); 2612 CSEWorkList.push_back(N); 2613 } 2614 2615 // Sort blocks by domination. This ensures we visit a block after all blocks 2616 // dominating it are visited. 2617 std::stable_sort(CSEWorkList.begin(), CSEWorkList.end(), 2618 [this](const DomTreeNode *A, const DomTreeNode *B) { 2619 return DT->properlyDominates(A, B); 2620 }); 2621 2622 // Perform O(N^2) search over the gather sequences and merge identical 2623 // instructions. TODO: We can further optimize this scan if we split the 2624 // instructions into different buckets based on the insert lane. 2625 SmallVector<Instruction *, 16> Visited; 2626 for (auto I = CSEWorkList.begin(), E = CSEWorkList.end(); I != E; ++I) { 2627 assert((I == CSEWorkList.begin() || !DT->dominates(*I, *std::prev(I))) && 2628 "Worklist not sorted properly!"); 2629 BasicBlock *BB = (*I)->getBlock(); 2630 // For all instructions in blocks containing gather sequences: 2631 for (BasicBlock::iterator it = BB->begin(), e = BB->end(); it != e;) { 2632 Instruction *In = it++; 2633 if (!isa<InsertElementInst>(In) && !isa<ExtractElementInst>(In)) 2634 continue; 2635 2636 // Check if we can replace this instruction with any of the 2637 // visited instructions. 2638 for (SmallVectorImpl<Instruction *>::iterator v = Visited.begin(), 2639 ve = Visited.end(); 2640 v != ve; ++v) { 2641 if (In->isIdenticalTo(*v) && 2642 DT->dominates((*v)->getParent(), In->getParent())) { 2643 In->replaceAllUsesWith(*v); 2644 eraseInstruction(In); 2645 In = nullptr; 2646 break; 2647 } 2648 } 2649 if (In) { 2650 assert(std::find(Visited.begin(), Visited.end(), In) == Visited.end()); 2651 Visited.push_back(In); 2652 } 2653 } 2654 } 2655 CSEBlocks.clear(); 2656 GatherSeq.clear(); 2657 } 2658 2659 // Groups the instructions to a bundle (which is then a single scheduling entity) 2660 // and schedules instructions until the bundle gets ready. 2661 bool BoUpSLP::BlockScheduling::tryScheduleBundle(ArrayRef<Value *> VL, 2662 BoUpSLP *SLP) { 2663 if (isa<PHINode>(VL[0])) 2664 return true; 2665 2666 // Initialize the instruction bundle. 2667 Instruction *OldScheduleEnd = ScheduleEnd; 2668 ScheduleData *PrevInBundle = nullptr; 2669 ScheduleData *Bundle = nullptr; 2670 bool ReSchedule = false; 2671 DEBUG(dbgs() << "SLP: bundle: " << *VL[0] << "\n"); 2672 for (Value *V : VL) { 2673 extendSchedulingRegion(V); 2674 ScheduleData *BundleMember = getScheduleData(V); 2675 assert(BundleMember && 2676 "no ScheduleData for bundle member (maybe not in same basic block)"); 2677 if (BundleMember->IsScheduled) { 2678 // A bundle member was scheduled as single instruction before and now 2679 // needs to be scheduled as part of the bundle. We just get rid of the 2680 // existing schedule. 2681 DEBUG(dbgs() << "SLP: reset schedule because " << *BundleMember 2682 << " was already scheduled\n"); 2683 ReSchedule = true; 2684 } 2685 assert(BundleMember->isSchedulingEntity() && 2686 "bundle member already part of other bundle"); 2687 if (PrevInBundle) { 2688 PrevInBundle->NextInBundle = BundleMember; 2689 } else { 2690 Bundle = BundleMember; 2691 } 2692 BundleMember->UnscheduledDepsInBundle = 0; 2693 Bundle->UnscheduledDepsInBundle += BundleMember->UnscheduledDeps; 2694 2695 // Group the instructions to a bundle. 
2696 BundleMember->FirstInBundle = Bundle; 2697 PrevInBundle = BundleMember; 2698 } 2699 if (ScheduleEnd != OldScheduleEnd) { 2700 // The scheduling region got new instructions at the lower end (or it is a 2701 // new region for the first bundle). This makes it necessary to 2702 // recalculate all dependencies. 2703 // It is seldom that this needs to be done a second time after adding the 2704 // initial bundle to the region. 2705 for (auto *I = ScheduleStart; I != ScheduleEnd; I = I->getNextNode()) { 2706 ScheduleData *SD = getScheduleData(I); 2707 SD->clearDependencies(); 2708 } 2709 ReSchedule = true; 2710 } 2711 if (ReSchedule) { 2712 resetSchedule(); 2713 initialFillReadyList(ReadyInsts); 2714 } 2715 2716 DEBUG(dbgs() << "SLP: try schedule bundle " << *Bundle << " in block " 2717 << BB->getName() << "\n"); 2718 2719 calculateDependencies(Bundle, true, SLP); 2720 2721 // Now try to schedule the new bundle. As soon as the bundle is "ready" it 2722 // means that there are no cyclic dependencies and we can schedule it. 2723 // Note that's important that we don't "schedule" the bundle yet (see 2724 // cancelScheduling). 2725 while (!Bundle->isReady() && !ReadyInsts.empty()) { 2726 2727 ScheduleData *pickedSD = ReadyInsts.back(); 2728 ReadyInsts.pop_back(); 2729 2730 if (pickedSD->isSchedulingEntity() && pickedSD->isReady()) { 2731 schedule(pickedSD, ReadyInsts); 2732 } 2733 } 2734 return Bundle->isReady(); 2735 } 2736 2737 void BoUpSLP::BlockScheduling::cancelScheduling(ArrayRef<Value *> VL) { 2738 if (isa<PHINode>(VL[0])) 2739 return; 2740 2741 ScheduleData *Bundle = getScheduleData(VL[0]); 2742 DEBUG(dbgs() << "SLP: cancel scheduling of " << *Bundle << "\n"); 2743 assert(!Bundle->IsScheduled && 2744 "Can't cancel bundle which is already scheduled"); 2745 assert(Bundle->isSchedulingEntity() && Bundle->isPartOfBundle() && 2746 "tried to unbundle something which is not a bundle"); 2747 2748 // Un-bundle: make single instructions out of the bundle. 2749 ScheduleData *BundleMember = Bundle; 2750 while (BundleMember) { 2751 assert(BundleMember->FirstInBundle == Bundle && "corrupt bundle links"); 2752 BundleMember->FirstInBundle = BundleMember; 2753 ScheduleData *Next = BundleMember->NextInBundle; 2754 BundleMember->NextInBundle = nullptr; 2755 BundleMember->UnscheduledDepsInBundle = BundleMember->UnscheduledDeps; 2756 if (BundleMember->UnscheduledDepsInBundle == 0) { 2757 ReadyInsts.insert(BundleMember); 2758 } 2759 BundleMember = Next; 2760 } 2761 } 2762 2763 void BoUpSLP::BlockScheduling::extendSchedulingRegion(Value *V) { 2764 if (getScheduleData(V)) 2765 return; 2766 Instruction *I = dyn_cast<Instruction>(V); 2767 assert(I && "bundle member must be an instruction"); 2768 assert(!isa<PHINode>(I) && "phi nodes don't need to be scheduled"); 2769 if (!ScheduleStart) { 2770 // It's the first instruction in the new region. 2771 initScheduleData(I, I->getNextNode(), nullptr, nullptr); 2772 ScheduleStart = I; 2773 ScheduleEnd = I->getNextNode(); 2774 assert(ScheduleEnd && "tried to vectorize a TerminatorInst?"); 2775 DEBUG(dbgs() << "SLP: initialize schedule region to " << *I << "\n"); 2776 return; 2777 } 2778 // Search up and down at the same time, because we don't know if the new 2779 // instruction is above or below the existing scheduling region. 
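// Each iteration below advances one step towards the block entry and one
// step towards the block end; whichever direction reaches I first decides
// whether ScheduleStart or ScheduleEnd is extended.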
2780 BasicBlock::reverse_iterator UpIter(ScheduleStart); 2781 BasicBlock::reverse_iterator UpperEnd = BB->rend(); 2782 BasicBlock::iterator DownIter(ScheduleEnd); 2783 BasicBlock::iterator LowerEnd = BB->end(); 2784 for (;;) { 2785 if (UpIter != UpperEnd) { 2786 if (&*UpIter == I) { 2787 initScheduleData(I, ScheduleStart, nullptr, FirstLoadStoreInRegion); 2788 ScheduleStart = I; 2789 DEBUG(dbgs() << "SLP: extend schedule region start to " << *I << "\n"); 2790 return; 2791 } 2792 UpIter++; 2793 } 2794 if (DownIter != LowerEnd) { 2795 if (&*DownIter == I) { 2796 initScheduleData(ScheduleEnd, I->getNextNode(), LastLoadStoreInRegion, 2797 nullptr); 2798 ScheduleEnd = I->getNextNode(); 2799 assert(ScheduleEnd && "tried to vectorize a TerminatorInst?"); 2800 DEBUG(dbgs() << "SLP: extend schedule region end to " << *I << "\n"); 2801 return; 2802 } 2803 DownIter++; 2804 } 2805 assert((UpIter != UpperEnd || DownIter != LowerEnd) && 2806 "instruction not found in block"); 2807 } 2808 } 2809 2810 void BoUpSLP::BlockScheduling::initScheduleData(Instruction *FromI, 2811 Instruction *ToI, 2812 ScheduleData *PrevLoadStore, 2813 ScheduleData *NextLoadStore) { 2814 ScheduleData *CurrentLoadStore = PrevLoadStore; 2815 for (Instruction *I = FromI; I != ToI; I = I->getNextNode()) { 2816 ScheduleData *SD = ScheduleDataMap[I]; 2817 if (!SD) { 2818 // Allocate a new ScheduleData for the instruction. 2819 if (ChunkPos >= ChunkSize) { 2820 ScheduleDataChunks.push_back( 2821 llvm::make_unique<ScheduleData[]>(ChunkSize)); 2822 ChunkPos = 0; 2823 } 2824 SD = &(ScheduleDataChunks.back()[ChunkPos++]); 2825 ScheduleDataMap[I] = SD; 2826 SD->Inst = I; 2827 } 2828 assert(!isInSchedulingRegion(SD) && 2829 "new ScheduleData already in scheduling region"); 2830 SD->init(SchedulingRegionID); 2831 2832 if (I->mayReadOrWriteMemory()) { 2833 // Update the linked list of memory accessing instructions. 2834 if (CurrentLoadStore) { 2835 CurrentLoadStore->NextLoadStore = SD; 2836 } else { 2837 FirstLoadStoreInRegion = SD; 2838 } 2839 CurrentLoadStore = SD; 2840 } 2841 } 2842 if (NextLoadStore) { 2843 if (CurrentLoadStore) 2844 CurrentLoadStore->NextLoadStore = NextLoadStore; 2845 } else { 2846 LastLoadStoreInRegion = CurrentLoadStore; 2847 } 2848 } 2849 2850 void BoUpSLP::BlockScheduling::calculateDependencies(ScheduleData *SD, 2851 bool InsertInReadyList, 2852 BoUpSLP *SLP) { 2853 assert(SD->isSchedulingEntity()); 2854 2855 SmallVector<ScheduleData *, 10> WorkList; 2856 WorkList.push_back(SD); 2857 2858 while (!WorkList.empty()) { 2859 ScheduleData *SD = WorkList.back(); 2860 WorkList.pop_back(); 2861 2862 ScheduleData *BundleMember = SD; 2863 while (BundleMember) { 2864 assert(isInSchedulingRegion(BundleMember)); 2865 if (!BundleMember->hasValidDependencies()) { 2866 2867 DEBUG(dbgs() << "SLP: update deps of " << *BundleMember << "\n"); 2868 BundleMember->Dependencies = 0; 2869 BundleMember->resetUnscheduledDeps(); 2870 2871 // Handle def-use chain dependencies. 2872 for (User *U : BundleMember->Inst->users()) { 2873 if (isa<Instruction>(U)) { 2874 ScheduleData *UseSD = getScheduleData(U); 2875 if (UseSD && isInSchedulingRegion(UseSD->FirstInBundle)) { 2876 BundleMember->Dependencies++; 2877 ScheduleData *DestBundle = UseSD->FirstInBundle; 2878 if (!DestBundle->IsScheduled) { 2879 BundleMember->incrementUnscheduledDeps(1); 2880 } 2881 if (!DestBundle->hasValidDependencies()) { 2882 WorkList.push_back(DestBundle); 2883 } 2884 } 2885 } else { 2886 // I'm not sure if this can ever happen. But we need to be safe. 
2887 // This keeps the instruction/bundle from ever being scheduled, which
2888 // eventually disables vectorization.
2889 BundleMember->Dependencies++;
2890 BundleMember->incrementUnscheduledDeps(1);
2891 }
2892 }
2893 
2894 // Handle the memory dependencies.
2895 ScheduleData *DepDest = BundleMember->NextLoadStore;
2896 if (DepDest) {
2897 Instruction *SrcInst = BundleMember->Inst;
2898 AliasAnalysis::Location SrcLoc = getLocation(SrcInst, SLP->AA);
2899 bool SrcMayWrite = BundleMember->Inst->mayWriteToMemory();
2900 unsigned numAliased = 0;
2901 unsigned DistToSrc = 1;
2902 
2903 while (DepDest) {
2904 assert(isInSchedulingRegion(DepDest));
2905 
2906 // We have two limits to reduce the complexity:
2907 // 1) AliasedCheckLimit: It's a small limit to reduce calls to
2908 // SLP->isAliased (which is the expensive part in this loop).
2909 // 2) MaxMemDepDistance: It's for very large blocks and it aborts
2910 // the whole loop (even if the loop is fast, it's quadratic).
2911 // It's important for the loop break condition (see below) to
2912 // check this limit even between two read-only instructions.
2913 if (DistToSrc >= MaxMemDepDistance ||
2914 ((SrcMayWrite || DepDest->Inst->mayWriteToMemory()) &&
2915 (numAliased >= AliasedCheckLimit ||
2916 SLP->isAliased(SrcLoc, SrcInst, DepDest->Inst)))) {
2917 
2918 // We increment the counter only if the locations are aliased
2919 // (instead of counting all alias checks). This gives a better
2920 // balance between reduced runtime and accurate dependencies.
2921 numAliased++;
2922 
2923 DepDest->MemoryDependencies.push_back(BundleMember);
2924 BundleMember->Dependencies++;
2925 ScheduleData *DestBundle = DepDest->FirstInBundle;
2926 if (!DestBundle->IsScheduled) {
2927 BundleMember->incrementUnscheduledDeps(1);
2928 }
2929 if (!DestBundle->hasValidDependencies()) {
2930 WorkList.push_back(DestBundle);
2931 }
2932 }
2933 DepDest = DepDest->NextLoadStore;
2934 
2935 // Example, explaining the loop break condition: Let's assume our
2936 // starting instruction is i0 and MaxMemDepDistance = 3.
2937 //
2938 // +--------v--v--v
2939 // i0,i1,i2,i3,i4,i5,i6,i7,i8
2940 // +--------^--^--^
2941 //
2942 // MaxMemDepDistance lets us stop alias-checking at i3 and we add
2943 // dependencies from i0 to i3,i4,.. (even if they are not aliased).
2944 // Previously we already added dependencies from i3 to i6,i7,i8
2945 // (because of MaxMemDepDistance). As we added a dependency from
2946 // i0 to i3, we have transitive dependencies from i0 to i6,i7,i8
2947 // and we can abort this loop at i6.
2948 if (DistToSrc >= 2 * MaxMemDepDistance) 2949 break; 2950 DistToSrc++; 2951 } 2952 } 2953 } 2954 BundleMember = BundleMember->NextInBundle; 2955 } 2956 if (InsertInReadyList && SD->isReady()) { 2957 ReadyInsts.push_back(SD); 2958 DEBUG(dbgs() << "SLP: gets ready on update: " << *SD->Inst << "\n"); 2959 } 2960 } 2961 } 2962 2963 void BoUpSLP::BlockScheduling::resetSchedule() { 2964 assert(ScheduleStart && 2965 "tried to reset schedule on block which has not been scheduled"); 2966 for (Instruction *I = ScheduleStart; I != ScheduleEnd; I = I->getNextNode()) { 2967 ScheduleData *SD = getScheduleData(I); 2968 assert(isInSchedulingRegion(SD)); 2969 SD->IsScheduled = false; 2970 SD->resetUnscheduledDeps(); 2971 } 2972 ReadyInsts.clear(); 2973 } 2974 2975 void BoUpSLP::scheduleBlock(BlockScheduling *BS) { 2976 2977 if (!BS->ScheduleStart) 2978 return; 2979 2980 DEBUG(dbgs() << "SLP: schedule block " << BS->BB->getName() << "\n"); 2981 2982 BS->resetSchedule(); 2983 2984 // For the real scheduling we use a more sophisticated ready-list: it is 2985 // sorted by the original instruction location. This lets the final schedule 2986 // be as close as possible to the original instruction order. 2987 struct ScheduleDataCompare { 2988 bool operator()(ScheduleData *SD1, ScheduleData *SD2) { 2989 return SD2->SchedulingPriority < SD1->SchedulingPriority; 2990 } 2991 }; 2992 std::set<ScheduleData *, ScheduleDataCompare> ReadyInsts; 2993 2994 // Ensure that all depencency data is updated and fill the ready-list with 2995 // initial instructions. 2996 int Idx = 0; 2997 int NumToSchedule = 0; 2998 for (auto *I = BS->ScheduleStart; I != BS->ScheduleEnd; 2999 I = I->getNextNode()) { 3000 ScheduleData *SD = BS->getScheduleData(I); 3001 assert( 3002 SD->isPartOfBundle() == (ScalarToTreeEntry.count(SD->Inst) != 0) && 3003 "scheduler and vectorizer have different opinion on what is a bundle"); 3004 SD->FirstInBundle->SchedulingPriority = Idx++; 3005 if (SD->isSchedulingEntity()) { 3006 BS->calculateDependencies(SD, false, this); 3007 NumToSchedule++; 3008 } 3009 } 3010 BS->initialFillReadyList(ReadyInsts); 3011 3012 Instruction *LastScheduledInst = BS->ScheduleEnd; 3013 3014 // Do the "real" scheduling. 3015 while (!ReadyInsts.empty()) { 3016 ScheduleData *picked = *ReadyInsts.begin(); 3017 ReadyInsts.erase(ReadyInsts.begin()); 3018 3019 // Move the scheduled instruction(s) to their dedicated places, if not 3020 // there yet. 3021 ScheduleData *BundleMember = picked; 3022 while (BundleMember) { 3023 Instruction *pickedInst = BundleMember->Inst; 3024 if (LastScheduledInst->getNextNode() != pickedInst) { 3025 BS->BB->getInstList().remove(pickedInst); 3026 BS->BB->getInstList().insert(LastScheduledInst, pickedInst); 3027 } 3028 LastScheduledInst = pickedInst; 3029 BundleMember = BundleMember->NextInBundle; 3030 } 3031 3032 BS->schedule(picked, ReadyInsts); 3033 NumToSchedule--; 3034 } 3035 assert(NumToSchedule == 0 && "could not schedule all instructions"); 3036 3037 // Avoid duplicate scheduling of the block. 3038 BS->ScheduleStart = nullptr; 3039 } 3040 3041 /// The SLPVectorizer Pass. 
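/// A function pass wrapper around BoUpSLP: it visits each function's blocks
/// in post order, collects seed stores grouped by their underlying base
/// object, and asks the bottom-up vectorizer to build and cost trees from
/// those seeds and from other in-block chains.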
3042 struct SLPVectorizer : public FunctionPass { 3043 typedef SmallVector<StoreInst *, 8> StoreList; 3044 typedef MapVector<Value *, StoreList> StoreListMap; 3045 3046 /// Pass identification, replacement for typeid 3047 static char ID; 3048 3049 explicit SLPVectorizer() : FunctionPass(ID) { 3050 initializeSLPVectorizerPass(*PassRegistry::getPassRegistry()); 3051 } 3052 3053 ScalarEvolution *SE; 3054 const DataLayout *DL; 3055 TargetTransformInfo *TTI; 3056 TargetLibraryInfo *TLI; 3057 AliasAnalysis *AA; 3058 LoopInfo *LI; 3059 DominatorTree *DT; 3060 AssumptionCache *AC; 3061 3062 bool runOnFunction(Function &F) override { 3063 if (skipOptnoneFunction(F)) 3064 return false; 3065 3066 SE = &getAnalysis<ScalarEvolution>(); 3067 DataLayoutPass *DLP = getAnalysisIfAvailable<DataLayoutPass>(); 3068 DL = DLP ? &DLP->getDataLayout() : nullptr; 3069 TTI = &getAnalysis<TargetTransformInfoWrapperPass>().getTTI(F); 3070 auto *TLIP = getAnalysisIfAvailable<TargetLibraryInfoWrapperPass>(); 3071 TLI = TLIP ? &TLIP->getTLI() : nullptr; 3072 AA = &getAnalysis<AliasAnalysis>(); 3073 LI = &getAnalysis<LoopInfoWrapperPass>().getLoopInfo(); 3074 DT = &getAnalysis<DominatorTreeWrapperPass>().getDomTree(); 3075 AC = &getAnalysis<AssumptionCacheTracker>().getAssumptionCache(F); 3076 3077 StoreRefs.clear(); 3078 bool Changed = false; 3079 3080 // If the target claims to have no vector registers don't attempt 3081 // vectorization. 3082 if (!TTI->getNumberOfRegisters(true)) 3083 return false; 3084 3085 // Must have DataLayout. We can't require it because some tests run w/o 3086 // triple. 3087 if (!DL) 3088 return false; 3089 3090 // Don't vectorize when the attribute NoImplicitFloat is used. 3091 if (F.hasFnAttribute(Attribute::NoImplicitFloat)) 3092 return false; 3093 3094 DEBUG(dbgs() << "SLP: Analyzing blocks in " << F.getName() << ".\n"); 3095 3096 // Use the bottom up slp vectorizer to construct chains that start with 3097 // store instructions. 3098 BoUpSLP R(&F, SE, DL, TTI, TLI, AA, LI, DT, AC); 3099 3100 // A general note: the vectorizer must use BoUpSLP::eraseInstruction() to 3101 // delete instructions. 3102 3103 // Scan the blocks in the function in post order. 3104 for (po_iterator<BasicBlock*> it = po_begin(&F.getEntryBlock()), 3105 e = po_end(&F.getEntryBlock()); it != e; ++it) { 3106 BasicBlock *BB = *it; 3107 // Vectorize trees that end at stores. 3108 if (unsigned count = collectStores(BB, R)) { 3109 (void)count; 3110 DEBUG(dbgs() << "SLP: Found " << count << " stores to vectorize.\n"); 3111 Changed |= vectorizeStoreChains(R); 3112 } 3113 3114 // Vectorize trees that end at reductions. 3115 Changed |= vectorizeChainsInBlock(BB, R); 3116 } 3117 3118 if (Changed) { 3119 R.optimizeGatherSequence(); 3120 DEBUG(dbgs() << "SLP: vectorized \"" << F.getName() << "\"\n"); 3121 DEBUG(verifyFunction(F)); 3122 } 3123 return Changed; 3124 } 3125 3126 void getAnalysisUsage(AnalysisUsage &AU) const override { 3127 FunctionPass::getAnalysisUsage(AU); 3128 AU.addRequired<AssumptionCacheTracker>(); 3129 AU.addRequired<ScalarEvolution>(); 3130 AU.addRequired<AliasAnalysis>(); 3131 AU.addRequired<TargetTransformInfoWrapperPass>(); 3132 AU.addRequired<LoopInfoWrapperPass>(); 3133 AU.addRequired<DominatorTreeWrapperPass>(); 3134 AU.addPreserved<LoopInfoWrapperPass>(); 3135 AU.addPreserved<DominatorTreeWrapperPass>(); 3136 AU.setPreservesCFG(); 3137 } 3138 3139 private: 3140 3141 /// \brief Collect memory references and sort them according to their base 3142 /// object. 
We sort the stores to their base objects to reduce the cost of the 3143 /// quadratic search on the stores. TODO: We can further reduce this cost 3144 /// if we flush the chain creation every time we run into a memory barrier. 3145 unsigned collectStores(BasicBlock *BB, BoUpSLP &R); 3146 3147 /// \brief Try to vectorize a chain that starts at two arithmetic instrs. 3148 bool tryToVectorizePair(Value *A, Value *B, BoUpSLP &R); 3149 3150 /// \brief Try to vectorize a list of operands. 3151 /// \@param BuildVector A list of users to ignore for the purpose of 3152 /// scheduling and that don't need extracting. 3153 /// \returns true if a value was vectorized. 3154 bool tryToVectorizeList(ArrayRef<Value *> VL, BoUpSLP &R, 3155 ArrayRef<Value *> BuildVector = None, 3156 bool allowReorder = false); 3157 3158 /// \brief Try to vectorize a chain that may start at the operands of \V; 3159 bool tryToVectorize(BinaryOperator *V, BoUpSLP &R); 3160 3161 /// \brief Vectorize the stores that were collected in StoreRefs. 3162 bool vectorizeStoreChains(BoUpSLP &R); 3163 3164 /// \brief Scan the basic block and look for patterns that are likely to start 3165 /// a vectorization chain. 3166 bool vectorizeChainsInBlock(BasicBlock *BB, BoUpSLP &R); 3167 3168 bool vectorizeStoreChain(ArrayRef<Value *> Chain, int CostThreshold, 3169 BoUpSLP &R); 3170 3171 bool vectorizeStores(ArrayRef<StoreInst *> Stores, int costThreshold, 3172 BoUpSLP &R); 3173 private: 3174 StoreListMap StoreRefs; 3175 }; 3176 3177 /// \brief Check that the Values in the slice in VL array are still existent in 3178 /// the WeakVH array. 3179 /// Vectorization of part of the VL array may cause later values in the VL array 3180 /// to become invalid. We track when this has happened in the WeakVH array. 3181 static bool hasValueBeenRAUWed(ArrayRef<Value *> &VL, 3182 SmallVectorImpl<WeakVH> &VH, 3183 unsigned SliceBegin, 3184 unsigned SliceSize) { 3185 for (unsigned i = SliceBegin; i < SliceBegin + SliceSize; ++i) 3186 if (VH[i] != VL[i]) 3187 return true; 3188 3189 return false; 3190 } 3191 3192 bool SLPVectorizer::vectorizeStoreChain(ArrayRef<Value *> Chain, 3193 int CostThreshold, BoUpSLP &R) { 3194 unsigned ChainLen = Chain.size(); 3195 DEBUG(dbgs() << "SLP: Analyzing a store chain of length " << ChainLen 3196 << "\n"); 3197 Type *StoreTy = cast<StoreInst>(Chain[0])->getValueOperand()->getType(); 3198 unsigned Sz = DL->getTypeSizeInBits(StoreTy); 3199 unsigned VF = MinVecRegSize / Sz; 3200 3201 if (!isPowerOf2_32(Sz) || VF < 2) 3202 return false; 3203 3204 // Keep track of values that were deleted by vectorizing in the loop below. 3205 SmallVector<WeakVH, 8> TrackValues(Chain.begin(), Chain.end()); 3206 3207 bool Changed = false; 3208 // Look for profitable vectorizable trees at all offsets, starting at zero. 3209 for (unsigned i = 0, e = ChainLen; i < e; ++i) { 3210 if (i + VF > e) 3211 break; 3212 3213 // Check that a previous iteration of this loop did not delete the Value. 3214 if (hasValueBeenRAUWed(Chain, TrackValues, i, VF)) 3215 continue; 3216 3217 DEBUG(dbgs() << "SLP: Analyzing " << VF << " stores at offset " << i 3218 << "\n"); 3219 ArrayRef<Value *> Operands = Chain.slice(i, VF); 3220 3221 R.buildTree(Operands); 3222 3223 int Cost = R.getTreeCost(); 3224 3225 DEBUG(dbgs() << "SLP: Found cost=" << Cost << " for VF=" << VF << "\n"); 3226 if (Cost < CostThreshold) { 3227 DEBUG(dbgs() << "SLP: Decided to vectorize cost=" << Cost << "\n"); 3228 R.vectorizeTree(); 3229 3230 // Move to the next bundle. 
3231 i += VF - 1; 3232 Changed = true; 3233 } 3234 } 3235 3236 return Changed; 3237 } 3238 3239 bool SLPVectorizer::vectorizeStores(ArrayRef<StoreInst *> Stores, 3240 int costThreshold, BoUpSLP &R) { 3241 SetVector<Value *> Heads, Tails; 3242 SmallDenseMap<Value *, Value *> ConsecutiveChain; 3243 3244 // We may run into multiple chains that merge into a single chain. We mark the 3245 // stores that we vectorized so that we don't visit the same store twice. 3246 BoUpSLP::ValueSet VectorizedStores; 3247 bool Changed = false; 3248 3249 // Do a quadratic search on all of the given stores and find 3250 // all of the pairs of stores that follow each other. 3251 for (unsigned i = 0, e = Stores.size(); i < e; ++i) { 3252 for (unsigned j = 0; j < e; ++j) { 3253 if (i == j) 3254 continue; 3255 3256 if (R.isConsecutiveAccess(Stores[i], Stores[j])) { 3257 Tails.insert(Stores[j]); 3258 Heads.insert(Stores[i]); 3259 ConsecutiveChain[Stores[i]] = Stores[j]; 3260 } 3261 } 3262 } 3263 3264 // For stores that start but don't end a link in the chain: 3265 for (SetVector<Value *>::iterator it = Heads.begin(), e = Heads.end(); 3266 it != e; ++it) { 3267 if (Tails.count(*it)) 3268 continue; 3269 3270 // We found a store instr that starts a chain. Now follow the chain and try 3271 // to vectorize it. 3272 BoUpSLP::ValueList Operands; 3273 Value *I = *it; 3274 // Collect the chain into a list. 3275 while (Tails.count(I) || Heads.count(I)) { 3276 if (VectorizedStores.count(I)) 3277 break; 3278 Operands.push_back(I); 3279 // Move to the next value in the chain. 3280 I = ConsecutiveChain[I]; 3281 } 3282 3283 bool Vectorized = vectorizeStoreChain(Operands, costThreshold, R); 3284 3285 // Mark the vectorized stores so that we don't vectorize them again. 3286 if (Vectorized) 3287 VectorizedStores.insert(Operands.begin(), Operands.end()); 3288 Changed |= Vectorized; 3289 } 3290 3291 return Changed; 3292 } 3293 3294 3295 unsigned SLPVectorizer::collectStores(BasicBlock *BB, BoUpSLP &R) { 3296 unsigned count = 0; 3297 StoreRefs.clear(); 3298 for (BasicBlock::iterator it = BB->begin(), e = BB->end(); it != e; ++it) { 3299 StoreInst *SI = dyn_cast<StoreInst>(it); 3300 if (!SI) 3301 continue; 3302 3303 // Don't touch volatile stores. 3304 if (!SI->isSimple()) 3305 continue; 3306 3307 // Check that the pointer points to scalars. 3308 Type *Ty = SI->getValueOperand()->getType(); 3309 if (!isValidElementType(Ty)) 3310 continue; 3311 3312 // Find the base pointer. 3313 Value *Ptr = GetUnderlyingObject(SI->getPointerOperand(), DL); 3314 3315 // Save the store locations. 3316 StoreRefs[Ptr].push_back(SI); 3317 count++; 3318 } 3319 return count; 3320 } 3321 3322 bool SLPVectorizer::tryToVectorizePair(Value *A, Value *B, BoUpSLP &R) { 3323 if (!A || !B) 3324 return false; 3325 Value *VL[] = { A, B }; 3326 return tryToVectorizeList(VL, R, None, true); 3327 } 3328 3329 bool SLPVectorizer::tryToVectorizeList(ArrayRef<Value *> VL, BoUpSLP &R, 3330 ArrayRef<Value *> BuildVector, 3331 bool allowReorder) { 3332 if (VL.size() < 2) 3333 return false; 3334 3335 DEBUG(dbgs() << "SLP: Vectorizing a list of length = " << VL.size() << ".\n"); 3336 3337 // Check that all of the parts are scalar instructions of the same type. 
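// The first value fixes the expected opcode and element type, and the
// vector factor is derived from the minimum vector register width
// (e.g. 128-bit registers and i32 elements give VF = 4); any mismatching
// opcode or invalid element type below aborts the attempt.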
3338 Instruction *I0 = dyn_cast<Instruction>(VL[0]);
3339 if (!I0)
3340 return false;
3341
3342 unsigned Opcode0 = I0->getOpcode();
3343
3344 Type *Ty0 = I0->getType();
3345 unsigned Sz = DL->getTypeSizeInBits(Ty0);
3346 unsigned VF = MinVecRegSize / Sz;
3347
3348 for (int i = 0, e = VL.size(); i < e; ++i) {
3349 Type *Ty = VL[i]->getType();
3350 if (!isValidElementType(Ty))
3351 return false;
3352 Instruction *Inst = dyn_cast<Instruction>(VL[i]);
3353 if (!Inst || Inst->getOpcode() != Opcode0)
3354 return false;
3355 }
3356
3357 bool Changed = false;
3358
3359 // Keep track of values that were deleted by vectorizing in the loop below.
3360 SmallVector<WeakVH, 8> TrackValues(VL.begin(), VL.end());
3361
3362 for (unsigned i = 0, e = VL.size(); i < e; ++i) {
3363 unsigned OpsWidth = 0;
3364
3365 if (i + VF > e)
3366 OpsWidth = e - i;
3367 else
3368 OpsWidth = VF;
3369
3370 if (!isPowerOf2_32(OpsWidth) || OpsWidth < 2)
3371 break;
3372
3373 // Check that a previous iteration of this loop did not delete the Value.
3374 if (hasValueBeenRAUWed(VL, TrackValues, i, OpsWidth))
3375 continue;
3376
3377 DEBUG(dbgs() << "SLP: Analyzing " << OpsWidth << " operations "
3378 << "\n");
3379 ArrayRef<Value *> Ops = VL.slice(i, OpsWidth);
3380
3381 ArrayRef<Value *> BuildVectorSlice;
3382 if (!BuildVector.empty())
3383 BuildVectorSlice = BuildVector.slice(i, OpsWidth);
3384
3385 R.buildTree(Ops, BuildVectorSlice);
3386 // TODO: check if we can allow reordering also for other cases than
3387 // tryToVectorizePair().
3388 if (allowReorder && R.shouldReorder()) {
3389 assert(Ops.size() == 2);
3390 assert(BuildVectorSlice.empty());
3391 Value *ReorderedOps[] = { Ops[1], Ops[0] };
3392 R.buildTree(ReorderedOps, None);
3393 }
3394 int Cost = R.getTreeCost();
3395
3396 if (Cost < -SLPCostThreshold) {
3397 DEBUG(dbgs() << "SLP: Vectorizing list at cost:" << Cost << ".\n");
3398 Value *VectorizedRoot = R.vectorizeTree();
3399
3400 // Reconstruct the build vector by extracting elements from the vectorized
3401 // root. This way we handle the case where some elements of the vector are
3402 // undefined, e.g. (return (insertelement <4 x i32> (insertelement undef (opd0) 0) (opd1) 2)).
3403 if (!BuildVectorSlice.empty()) {
3404 // The insert point is the last build vector instruction. The vectorized
3405 // root will precede it. This guarantees that we get an instruction. The
3406 // vectorized tree could have been constant folded.
3407 Instruction *InsertAfter = cast<Instruction>(BuildVectorSlice.back());
3408 unsigned VecIdx = 0;
3409 for (auto &V : BuildVectorSlice) {
3410 IRBuilder<true, NoFolder> Builder(
3411 ++BasicBlock::iterator(InsertAfter));
3412 InsertElementInst *IE = cast<InsertElementInst>(V);
3413 Instruction *Extract = cast<Instruction>(Builder.CreateExtractElement(
3414 VectorizedRoot, Builder.getInt32(VecIdx++)));
3415 IE->setOperand(1, Extract);
3416 IE->removeFromParent();
3417 IE->insertAfter(Extract);
3418 InsertAfter = IE;
3419 }
3420 }
3421 // Move to the next bundle.
3422 i += VF - 1;
3423 Changed = true;
3424 }
3425 }
3426
3427 return Changed;
3428 }
3429
3430 bool SLPVectorizer::tryToVectorize(BinaryOperator *V, BoUpSLP &R) {
3431 if (!V)
3432 return false;
3433
3434 // Try to vectorize V.
3435 if (tryToVectorizePair(V->getOperand(0), V->getOperand(1), R))
3436 return true;
3437
3438 BinaryOperator *A = dyn_cast<BinaryOperator>(V->getOperand(0));
3439 BinaryOperator *B = dyn_cast<BinaryOperator>(V->getOperand(1));
3440 // Try to skip B.
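// Added commentary: if V = A op B and B is a single-use binary operator, e.g.
// V = A + (C + D), also try the pairs (A, C) and (A, D); the symmetric case
// below does the same with A's operands when A is the single-use operator.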
3441 if (B && B->hasOneUse()) { 3442 BinaryOperator *B0 = dyn_cast<BinaryOperator>(B->getOperand(0)); 3443 BinaryOperator *B1 = dyn_cast<BinaryOperator>(B->getOperand(1)); 3444 if (tryToVectorizePair(A, B0, R)) { 3445 return true; 3446 } 3447 if (tryToVectorizePair(A, B1, R)) { 3448 return true; 3449 } 3450 } 3451 3452 // Try to skip A. 3453 if (A && A->hasOneUse()) { 3454 BinaryOperator *A0 = dyn_cast<BinaryOperator>(A->getOperand(0)); 3455 BinaryOperator *A1 = dyn_cast<BinaryOperator>(A->getOperand(1)); 3456 if (tryToVectorizePair(A0, B, R)) { 3457 return true; 3458 } 3459 if (tryToVectorizePair(A1, B, R)) { 3460 return true; 3461 } 3462 } 3463 return 0; 3464 } 3465 3466 /// \brief Generate a shuffle mask to be used in a reduction tree. 3467 /// 3468 /// \param VecLen The length of the vector to be reduced. 3469 /// \param NumEltsToRdx The number of elements that should be reduced in the 3470 /// vector. 3471 /// \param IsPairwise Whether the reduction is a pairwise or splitting 3472 /// reduction. A pairwise reduction will generate a mask of 3473 /// <0,2,...> or <1,3,..> while a splitting reduction will generate 3474 /// <2,3, undef,undef> for a vector of 4 and NumElts = 2. 3475 /// \param IsLeft True will generate a mask of even elements, odd otherwise. 3476 static Value *createRdxShuffleMask(unsigned VecLen, unsigned NumEltsToRdx, 3477 bool IsPairwise, bool IsLeft, 3478 IRBuilder<> &Builder) { 3479 assert((IsPairwise || !IsLeft) && "Don't support a <0,1,undef,...> mask"); 3480 3481 SmallVector<Constant *, 32> ShuffleMask( 3482 VecLen, UndefValue::get(Builder.getInt32Ty())); 3483 3484 if (IsPairwise) 3485 // Build a mask of 0, 2, ... (left) or 1, 3, ... (right). 3486 for (unsigned i = 0; i != NumEltsToRdx; ++i) 3487 ShuffleMask[i] = Builder.getInt32(2 * i + !IsLeft); 3488 else 3489 // Move the upper half of the vector to the lower half. 3490 for (unsigned i = 0; i != NumEltsToRdx; ++i) 3491 ShuffleMask[i] = Builder.getInt32(NumEltsToRdx + i); 3492 3493 return ConstantVector::get(ShuffleMask); 3494 } 3495 3496 3497 /// Model horizontal reductions. 3498 /// 3499 /// A horizontal reduction is a tree of reduction operations (currently add and 3500 /// fadd) that has operations that can be put into a vector as its leaf. 3501 /// For example, this tree: 3502 /// 3503 /// mul mul mul mul 3504 /// \ / \ / 3505 /// + + 3506 /// \ / 3507 /// + 3508 /// This tree has "mul" as its reduced values and "+" as its reduction 3509 /// operations. A reduction might be feeding into a store or a binary operation 3510 /// feeding a phi. 3511 /// ... 3512 /// \ / 3513 /// + 3514 /// | 3515 /// phi += 3516 /// 3517 /// Or: 3518 /// ... 3519 /// \ / 3520 /// + 3521 /// | 3522 /// *p = 3523 /// 3524 class HorizontalReduction { 3525 SmallVector<Value *, 16> ReductionOps; 3526 SmallVector<Value *, 32> ReducedVals; 3527 3528 BinaryOperator *ReductionRoot; 3529 PHINode *ReductionPHI; 3530 3531 /// The opcode of the reduction. 3532 unsigned ReductionOpcode; 3533 /// The opcode of the values we perform a reduction on. 3534 unsigned ReducedValueOpcode; 3535 /// The width of one full horizontal reduction operation. 3536 unsigned ReduxWidth; 3537 /// Should we model this reduction as a pairwise reduction tree or a tree that 3538 /// splits the vector in halves and adds those halves. 
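/// For illustration (added commentary, simplified IR): for a <4 x i32> value
/// %v, one pairwise step roughly computes
///   %l = shufflevector %v, undef, <0, 2, undef, undef>
///   %r = shufflevector %v, undef, <1, 3, undef, undef>
///   %p = add %l, %r
/// while one splitting step adds the upper half onto the lower half:
///   %h = shufflevector %v, undef, <2, 3, undef, undef>
///   %s = add %v, %h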
3539 bool IsPairwiseReduction;
3540
3541 public:
3542 HorizontalReduction()
3543 : ReductionRoot(nullptr), ReductionPHI(nullptr), ReductionOpcode(0),
3544 ReducedValueOpcode(0), ReduxWidth(0), IsPairwiseReduction(false) {}
3545
3546 /// \brief Try to find a reduction tree.
3547 bool matchAssociativeReduction(PHINode *Phi, BinaryOperator *B,
3548 const DataLayout *DL) {
3549 assert((!Phi ||
3550 std::find(Phi->op_begin(), Phi->op_end(), B) != Phi->op_end()) &&
3551 "This phi needs to use the binary operator");
3552
3553 // We could have an initial reduction that is not an add.
3554 // r *= v1 + v2 + v3 + v4
3555 // In such a case start looking for a tree rooted in the first '+'.
3556 if (Phi) {
3557 if (B->getOperand(0) == Phi) {
3558 Phi = nullptr;
3559 B = dyn_cast<BinaryOperator>(B->getOperand(1));
3560 } else if (B->getOperand(1) == Phi) {
3561 Phi = nullptr;
3562 B = dyn_cast<BinaryOperator>(B->getOperand(0));
3563 }
3564 }
3565
3566 if (!B)
3567 return false;
3568
3569 Type *Ty = B->getType();
3570 if (!isValidElementType(Ty))
3571 return false;
3572
3573 ReductionOpcode = B->getOpcode();
3574 ReducedValueOpcode = 0;
3575 ReduxWidth = MinVecRegSize / DL->getTypeSizeInBits(Ty);
3576 ReductionRoot = B;
3577 ReductionPHI = Phi;
3578
3579 if (ReduxWidth < 4)
3580 return false;
3581
3582 // We currently only support adds.
3583 if (ReductionOpcode != Instruction::Add &&
3584 ReductionOpcode != Instruction::FAdd)
3585 return false;
3586
3587 // Post-order traverse the reduction tree starting at B. We only handle true
3588 // trees containing only binary operators.
3589 SmallVector<std::pair<BinaryOperator *, unsigned>, 32> Stack;
3590 Stack.push_back(std::make_pair(B, 0));
3591 while (!Stack.empty()) {
3592 BinaryOperator *TreeN = Stack.back().first;
3593 unsigned EdgeToVisit = Stack.back().second++;
3594 bool IsReducedValue = TreeN->getOpcode() != ReductionOpcode;
3595
3596 // Only handle trees in the current basic block.
3597 if (TreeN->getParent() != B->getParent())
3598 return false;
3599
3600 // Each tree node needs to have one user except for the ultimate
3601 // reduction.
3602 if (!TreeN->hasOneUse() && TreeN != B)
3603 return false;
3604
3605 // Postorder visit.
3606 if (EdgeToVisit == 2 || IsReducedValue) {
3607 if (IsReducedValue) {
3608 // Make sure that the opcodes of the operations that we are going to
3609 // reduce match.
3610 if (!ReducedValueOpcode)
3611 ReducedValueOpcode = TreeN->getOpcode();
3612 else if (ReducedValueOpcode != TreeN->getOpcode())
3613 return false;
3614 ReducedVals.push_back(TreeN);
3615 } else {
3616 // We need to be able to reassociate the adds.
3617 if (!TreeN->isAssociative())
3618 return false;
3619 ReductionOps.push_back(TreeN);
3620 }
3621 // Retract.
3622 Stack.pop_back();
3623 continue;
3624 }
3625
3626 // Visit left or right.
3627 Value *NextV = TreeN->getOperand(EdgeToVisit);
3628 BinaryOperator *Next = dyn_cast<BinaryOperator>(NextV);
3629 if (Next)
3630 Stack.push_back(std::make_pair(Next, 0));
3631 else if (NextV != Phi)
3632 return false;
3633 }
3634 return true;
3635 }
3636
3637 /// \brief Attempt to vectorize the tree found by
3638 /// matchAssociativeReduction.
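///
/// Rough outline (added commentary): the reduced values are vectorized in
/// bundles of ReduxWidth; each profitable bundle is folded to a scalar by
/// emitReduction; the per-bundle scalars and any leftover reduced values are
/// chained with the reduction opcode; finally the reduction root is rewired,
/// either by restoring its PHI operand or by replacing all of its uses.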
3639 bool tryToReduce(BoUpSLP &V, TargetTransformInfo *TTI) { 3640 if (ReducedVals.empty()) 3641 return false; 3642 3643 unsigned NumReducedVals = ReducedVals.size(); 3644 if (NumReducedVals < ReduxWidth) 3645 return false; 3646 3647 Value *VectorizedTree = nullptr; 3648 IRBuilder<> Builder(ReductionRoot); 3649 FastMathFlags Unsafe; 3650 Unsafe.setUnsafeAlgebra(); 3651 Builder.SetFastMathFlags(Unsafe); 3652 unsigned i = 0; 3653 3654 for (; i < NumReducedVals - ReduxWidth + 1; i += ReduxWidth) { 3655 V.buildTree(makeArrayRef(&ReducedVals[i], ReduxWidth), ReductionOps); 3656 3657 // Estimate cost. 3658 int Cost = V.getTreeCost() + getReductionCost(TTI, ReducedVals[i]); 3659 if (Cost >= -SLPCostThreshold) 3660 break; 3661 3662 DEBUG(dbgs() << "SLP: Vectorizing horizontal reduction at cost:" << Cost 3663 << ". (HorRdx)\n"); 3664 3665 // Vectorize a tree. 3666 DebugLoc Loc = cast<Instruction>(ReducedVals[i])->getDebugLoc(); 3667 Value *VectorizedRoot = V.vectorizeTree(); 3668 3669 // Emit a reduction. 3670 Value *ReducedSubTree = emitReduction(VectorizedRoot, Builder); 3671 if (VectorizedTree) { 3672 Builder.SetCurrentDebugLocation(Loc); 3673 VectorizedTree = createBinOp(Builder, ReductionOpcode, VectorizedTree, 3674 ReducedSubTree, "bin.rdx"); 3675 } else 3676 VectorizedTree = ReducedSubTree; 3677 } 3678 3679 if (VectorizedTree) { 3680 // Finish the reduction. 3681 for (; i < NumReducedVals; ++i) { 3682 Builder.SetCurrentDebugLocation( 3683 cast<Instruction>(ReducedVals[i])->getDebugLoc()); 3684 VectorizedTree = createBinOp(Builder, ReductionOpcode, VectorizedTree, 3685 ReducedVals[i]); 3686 } 3687 // Update users. 3688 if (ReductionPHI) { 3689 assert(ReductionRoot && "Need a reduction operation"); 3690 ReductionRoot->setOperand(0, VectorizedTree); 3691 ReductionRoot->setOperand(1, ReductionPHI); 3692 } else 3693 ReductionRoot->replaceAllUsesWith(VectorizedTree); 3694 } 3695 return VectorizedTree != nullptr; 3696 } 3697 3698 private: 3699 3700 /// \brief Calcuate the cost of a reduction. 3701 int getReductionCost(TargetTransformInfo *TTI, Value *FirstReducedVal) { 3702 Type *ScalarTy = FirstReducedVal->getType(); 3703 Type *VecTy = VectorType::get(ScalarTy, ReduxWidth); 3704 3705 int PairwiseRdxCost = TTI->getReductionCost(ReductionOpcode, VecTy, true); 3706 int SplittingRdxCost = TTI->getReductionCost(ReductionOpcode, VecTy, false); 3707 3708 IsPairwiseReduction = PairwiseRdxCost < SplittingRdxCost; 3709 int VecReduxCost = IsPairwiseReduction ? PairwiseRdxCost : SplittingRdxCost; 3710 3711 int ScalarReduxCost = 3712 ReduxWidth * TTI->getArithmeticInstrCost(ReductionOpcode, VecTy); 3713 3714 DEBUG(dbgs() << "SLP: Adding cost " << VecReduxCost - ScalarReduxCost 3715 << " for reduction that starts with " << *FirstReducedVal 3716 << " (It is a " 3717 << (IsPairwiseReduction ? "pairwise" : "splitting") 3718 << " reduction)\n"); 3719 3720 return VecReduxCost - ScalarReduxCost; 3721 } 3722 3723 static Value *createBinOp(IRBuilder<> &Builder, unsigned Opcode, Value *L, 3724 Value *R, const Twine &Name = "") { 3725 if (Opcode == Instruction::FAdd) 3726 return Builder.CreateFAdd(L, R, Name); 3727 return Builder.CreateBinOp((Instruction::BinaryOps)Opcode, L, R, Name); 3728 } 3729 3730 /// \brief Emit a horizontal reduction of the vectorized value. 
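///
/// Worked example (added commentary): for ReduxWidth == 4 and a splitting add
/// reduction of <a, b, c, d>, the first step shuffles with <2, 3, undef, undef>
/// and adds, giving <a+c, b+d, ..>; the second step shuffles with
/// <1, undef, undef, undef> and adds, leaving a+b+c+d in lane 0, which is then
/// extracted as the scalar result.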
3731 Value *emitReduction(Value *VectorizedValue, IRBuilder<> &Builder) { 3732 assert(VectorizedValue && "Need to have a vectorized tree node"); 3733 assert(isPowerOf2_32(ReduxWidth) && 3734 "We only handle power-of-two reductions for now"); 3735 3736 Value *TmpVec = VectorizedValue; 3737 for (unsigned i = ReduxWidth / 2; i != 0; i >>= 1) { 3738 if (IsPairwiseReduction) { 3739 Value *LeftMask = 3740 createRdxShuffleMask(ReduxWidth, i, true, true, Builder); 3741 Value *RightMask = 3742 createRdxShuffleMask(ReduxWidth, i, true, false, Builder); 3743 3744 Value *LeftShuf = Builder.CreateShuffleVector( 3745 TmpVec, UndefValue::get(TmpVec->getType()), LeftMask, "rdx.shuf.l"); 3746 Value *RightShuf = Builder.CreateShuffleVector( 3747 TmpVec, UndefValue::get(TmpVec->getType()), (RightMask), 3748 "rdx.shuf.r"); 3749 TmpVec = createBinOp(Builder, ReductionOpcode, LeftShuf, RightShuf, 3750 "bin.rdx"); 3751 } else { 3752 Value *UpperHalf = 3753 createRdxShuffleMask(ReduxWidth, i, false, false, Builder); 3754 Value *Shuf = Builder.CreateShuffleVector( 3755 TmpVec, UndefValue::get(TmpVec->getType()), UpperHalf, "rdx.shuf"); 3756 TmpVec = createBinOp(Builder, ReductionOpcode, TmpVec, Shuf, "bin.rdx"); 3757 } 3758 } 3759 3760 // The result is in the first element of the vector. 3761 return Builder.CreateExtractElement(TmpVec, Builder.getInt32(0)); 3762 } 3763 }; 3764 3765 /// \brief Recognize construction of vectors like 3766 /// %ra = insertelement <4 x float> undef, float %s0, i32 0 3767 /// %rb = insertelement <4 x float> %ra, float %s1, i32 1 3768 /// %rc = insertelement <4 x float> %rb, float %s2, i32 2 3769 /// %rd = insertelement <4 x float> %rc, float %s3, i32 3 3770 /// 3771 /// Returns true if it matches 3772 /// 3773 static bool findBuildVector(InsertElementInst *FirstInsertElem, 3774 SmallVectorImpl<Value *> &BuildVector, 3775 SmallVectorImpl<Value *> &BuildVectorOpds) { 3776 if (!isa<UndefValue>(FirstInsertElem->getOperand(0))) 3777 return false; 3778 3779 InsertElementInst *IE = FirstInsertElem; 3780 while (true) { 3781 BuildVector.push_back(IE); 3782 BuildVectorOpds.push_back(IE->getOperand(1)); 3783 3784 if (IE->use_empty()) 3785 return false; 3786 3787 InsertElementInst *NextUse = dyn_cast<InsertElementInst>(IE->user_back()); 3788 if (!NextUse) 3789 return true; 3790 3791 // If this isn't the final use, make sure the next insertelement is the only 3792 // use. It's OK if the final constructed vector is used multiple times 3793 if (!IE->hasOneUse()) 3794 return false; 3795 3796 IE = NextUse; 3797 } 3798 3799 return false; 3800 } 3801 3802 static bool PhiTypeSorterFunc(Value *V, Value *V2) { 3803 return V->getType() < V2->getType(); 3804 } 3805 3806 bool SLPVectorizer::vectorizeChainsInBlock(BasicBlock *BB, BoUpSLP &R) { 3807 bool Changed = false; 3808 SmallVector<Value *, 4> Incoming; 3809 SmallSet<Value *, 16> VisitedInstrs; 3810 3811 bool HaveVectorizedPhiNodes = true; 3812 while (HaveVectorizedPhiNodes) { 3813 HaveVectorizedPhiNodes = false; 3814 3815 // Collect the incoming values from the PHIs. 3816 Incoming.clear(); 3817 for (BasicBlock::iterator instr = BB->begin(), ie = BB->end(); instr != ie; 3818 ++instr) { 3819 PHINode *P = dyn_cast<PHINode>(instr); 3820 if (!P) 3821 break; 3822 3823 if (!VisitedInstrs.count(P)) 3824 Incoming.push_back(P); 3825 } 3826 3827 // Sort by type. 3828 std::stable_sort(Incoming.begin(), Incoming.end(), PhiTypeSorterFunc); 3829 3830 // Try to vectorize elements base on their type. 
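// Added note: the stable sort above puts PHI nodes of the same type next to
// each other, so the loop below hands each maximal run of same-typed PHIs
// (e.g. four float-typed PHIs) to tryToVectorizeList as one candidate list.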
3831 for (SmallVector<Value *, 4>::iterator IncIt = Incoming.begin(), 3832 E = Incoming.end(); 3833 IncIt != E;) { 3834 3835 // Look for the next elements with the same type. 3836 SmallVector<Value *, 4>::iterator SameTypeIt = IncIt; 3837 while (SameTypeIt != E && 3838 (*SameTypeIt)->getType() == (*IncIt)->getType()) { 3839 VisitedInstrs.insert(*SameTypeIt); 3840 ++SameTypeIt; 3841 } 3842 3843 // Try to vectorize them. 3844 unsigned NumElts = (SameTypeIt - IncIt); 3845 DEBUG(errs() << "SLP: Trying to vectorize starting at PHIs (" << NumElts << ")\n"); 3846 if (NumElts > 1 && tryToVectorizeList(makeArrayRef(IncIt, NumElts), R)) { 3847 // Success start over because instructions might have been changed. 3848 HaveVectorizedPhiNodes = true; 3849 Changed = true; 3850 break; 3851 } 3852 3853 // Start over at the next instruction of a different type (or the end). 3854 IncIt = SameTypeIt; 3855 } 3856 } 3857 3858 VisitedInstrs.clear(); 3859 3860 for (BasicBlock::iterator it = BB->begin(), e = BB->end(); it != e; it++) { 3861 // We may go through BB multiple times so skip the one we have checked. 3862 if (!VisitedInstrs.insert(it).second) 3863 continue; 3864 3865 if (isa<DbgInfoIntrinsic>(it)) 3866 continue; 3867 3868 // Try to vectorize reductions that use PHINodes. 3869 if (PHINode *P = dyn_cast<PHINode>(it)) { 3870 // Check that the PHI is a reduction PHI. 3871 if (P->getNumIncomingValues() != 2) 3872 return Changed; 3873 Value *Rdx = 3874 (P->getIncomingBlock(0) == BB 3875 ? (P->getIncomingValue(0)) 3876 : (P->getIncomingBlock(1) == BB ? P->getIncomingValue(1) 3877 : nullptr)); 3878 // Check if this is a Binary Operator. 3879 BinaryOperator *BI = dyn_cast_or_null<BinaryOperator>(Rdx); 3880 if (!BI) 3881 continue; 3882 3883 // Try to match and vectorize a horizontal reduction. 3884 HorizontalReduction HorRdx; 3885 if (ShouldVectorizeHor && 3886 HorRdx.matchAssociativeReduction(P, BI, DL) && 3887 HorRdx.tryToReduce(R, TTI)) { 3888 Changed = true; 3889 it = BB->begin(); 3890 e = BB->end(); 3891 continue; 3892 } 3893 3894 Value *Inst = BI->getOperand(0); 3895 if (Inst == P) 3896 Inst = BI->getOperand(1); 3897 3898 if (tryToVectorize(dyn_cast<BinaryOperator>(Inst), R)) { 3899 // We would like to start over since some instructions are deleted 3900 // and the iterator may become invalid value. 3901 Changed = true; 3902 it = BB->begin(); 3903 e = BB->end(); 3904 continue; 3905 } 3906 3907 continue; 3908 } 3909 3910 // Try to vectorize horizontal reductions feeding into a store. 3911 if (ShouldStartVectorizeHorAtStore) 3912 if (StoreInst *SI = dyn_cast<StoreInst>(it)) 3913 if (BinaryOperator *BinOp = 3914 dyn_cast<BinaryOperator>(SI->getValueOperand())) { 3915 HorizontalReduction HorRdx; 3916 if (((HorRdx.matchAssociativeReduction(nullptr, BinOp, DL) && 3917 HorRdx.tryToReduce(R, TTI)) || 3918 tryToVectorize(BinOp, R))) { 3919 Changed = true; 3920 it = BB->begin(); 3921 e = BB->end(); 3922 continue; 3923 } 3924 } 3925 3926 // Try to vectorize horizontal reductions feeding into a return. 3927 if (ReturnInst *RI = dyn_cast<ReturnInst>(it)) 3928 if (RI->getNumOperands() != 0) 3929 if (BinaryOperator *BinOp = 3930 dyn_cast<BinaryOperator>(RI->getOperand(0))) { 3931 DEBUG(dbgs() << "SLP: Found a return to vectorize.\n"); 3932 if (tryToVectorizePair(BinOp->getOperand(0), 3933 BinOp->getOperand(1), R)) { 3934 Changed = true; 3935 it = BB->begin(); 3936 e = BB->end(); 3937 continue; 3938 } 3939 } 3940 3941 // Try to vectorize trees that start at compare instructions. 
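// Illustrative case (added commentary): for %c = fcmp olt double %x, %y the
// pair (%x, %y) is tried first; if that fails and either operand is itself a
// binary operator, that operand's two operands are tried as a pair as well.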
3942 if (CmpInst *CI = dyn_cast<CmpInst>(it)) { 3943 if (tryToVectorizePair(CI->getOperand(0), CI->getOperand(1), R)) { 3944 Changed = true; 3945 // We would like to start over since some instructions are deleted 3946 // and the iterator may become invalid value. 3947 it = BB->begin(); 3948 e = BB->end(); 3949 continue; 3950 } 3951 3952 for (int i = 0; i < 2; ++i) { 3953 if (BinaryOperator *BI = dyn_cast<BinaryOperator>(CI->getOperand(i))) { 3954 if (tryToVectorizePair(BI->getOperand(0), BI->getOperand(1), R)) { 3955 Changed = true; 3956 // We would like to start over since some instructions are deleted 3957 // and the iterator may become invalid value. 3958 it = BB->begin(); 3959 e = BB->end(); 3960 break; 3961 } 3962 } 3963 } 3964 continue; 3965 } 3966 3967 // Try to vectorize trees that start at insertelement instructions. 3968 if (InsertElementInst *FirstInsertElem = dyn_cast<InsertElementInst>(it)) { 3969 SmallVector<Value *, 16> BuildVector; 3970 SmallVector<Value *, 16> BuildVectorOpds; 3971 if (!findBuildVector(FirstInsertElem, BuildVector, BuildVectorOpds)) 3972 continue; 3973 3974 // Vectorize starting with the build vector operands ignoring the 3975 // BuildVector instructions for the purpose of scheduling and user 3976 // extraction. 3977 if (tryToVectorizeList(BuildVectorOpds, R, BuildVector)) { 3978 Changed = true; 3979 it = BB->begin(); 3980 e = BB->end(); 3981 } 3982 3983 continue; 3984 } 3985 } 3986 3987 return Changed; 3988 } 3989 3990 bool SLPVectorizer::vectorizeStoreChains(BoUpSLP &R) { 3991 bool Changed = false; 3992 // Attempt to sort and vectorize each of the store-groups. 3993 for (StoreListMap::iterator it = StoreRefs.begin(), e = StoreRefs.end(); 3994 it != e; ++it) { 3995 if (it->second.size() < 2) 3996 continue; 3997 3998 DEBUG(dbgs() << "SLP: Analyzing a store chain of length " 3999 << it->second.size() << ".\n"); 4000 4001 // Process the stores in chunks of 16. 4002 for (unsigned CI = 0, CE = it->second.size(); CI < CE; CI+=16) { 4003 unsigned Len = std::min<unsigned>(CE - CI, 16); 4004 Changed |= vectorizeStores(makeArrayRef(&it->second[CI], Len), 4005 -SLPCostThreshold, R); 4006 } 4007 } 4008 return Changed; 4009 } 4010 4011 } // end anonymous namespace 4012 4013 char SLPVectorizer::ID = 0; 4014 static const char lv_name[] = "SLP Vectorizer"; 4015 INITIALIZE_PASS_BEGIN(SLPVectorizer, SV_NAME, lv_name, false, false) 4016 INITIALIZE_AG_DEPENDENCY(AliasAnalysis) 4017 INITIALIZE_PASS_DEPENDENCY(TargetTransformInfoWrapperPass) 4018 INITIALIZE_PASS_DEPENDENCY(AssumptionCacheTracker) 4019 INITIALIZE_PASS_DEPENDENCY(ScalarEvolution) 4020 INITIALIZE_PASS_DEPENDENCY(LoopSimplify) 4021 INITIALIZE_PASS_END(SLPVectorizer, SV_NAME, lv_name, false, false) 4022 4023 namespace llvm { 4024 Pass *createSLPVectorizerPass() { return new SLPVectorizer(); } 4025 } 4026