//===- SLPVectorizer.cpp - A bottom up SLP Vectorizer ---------------------===//
//
//                     The LLVM Compiler Infrastructure
//
// This file is distributed under the University of Illinois Open Source
// License. See LICENSE.TXT for details.
//
//===----------------------------------------------------------------------===//
// This pass implements the Bottom Up SLP vectorizer. It detects consecutive
// stores that can be put together into vector-stores. Next, it attempts to
// construct a vectorizable tree using the use-def chains. If a profitable tree
// was found, the SLP vectorizer performs vectorization on the tree.
//
// The pass is inspired by the work described in the paper:
//  "Loop-Aware SLP in GCC" by Ira Rosen, Dorit Nuzman, Ayal Zaks.
//
//===----------------------------------------------------------------------===//
#include "llvm/Transforms/Vectorize.h"
#include "llvm/ADT/MapVector.h"
#include "llvm/ADT/Optional.h"
#include "llvm/ADT/PostOrderIterator.h"
#include "llvm/ADT/SetVector.h"
#include "llvm/ADT/Statistic.h"
#include "llvm/Analysis/AliasAnalysis.h"
#include "llvm/Analysis/GlobalsModRef.h"
#include "llvm/Analysis/AssumptionCache.h"
#include "llvm/Analysis/CodeMetrics.h"
#include "llvm/Analysis/LoopInfo.h"
#include "llvm/Analysis/LoopAccessAnalysis.h"
#include "llvm/Analysis/ScalarEvolution.h"
#include "llvm/Analysis/ScalarEvolutionExpressions.h"
#include "llvm/Analysis/TargetTransformInfo.h"
#include "llvm/Analysis/ValueTracking.h"
#include "llvm/IR/DataLayout.h"
#include "llvm/IR/Dominators.h"
#include "llvm/IR/IRBuilder.h"
#include "llvm/IR/Instructions.h"
#include "llvm/IR/IntrinsicInst.h"
#include "llvm/IR/Module.h"
#include "llvm/IR/NoFolder.h"
#include "llvm/IR/Type.h"
#include "llvm/IR/Value.h"
#include "llvm/IR/Verifier.h"
#include "llvm/Pass.h"
#include "llvm/Support/CommandLine.h"
#include "llvm/Support/Debug.h"
#include "llvm/Support/raw_ostream.h"
#include "llvm/Analysis/VectorUtils.h"
#include <algorithm>
#include <map>
#include <memory>

using namespace llvm;

#define SV_NAME "slp-vectorizer"
#define DEBUG_TYPE "SLP"

STATISTIC(NumVectorInstructions, "Number of vector instructions generated");

static cl::opt<int>
    SLPCostThreshold("slp-threshold", cl::init(0), cl::Hidden,
                     cl::desc("Only vectorize if you gain more than this "
                              "number "));

static cl::opt<bool>
ShouldVectorizeHor("slp-vectorize-hor", cl::init(true), cl::Hidden,
                   cl::desc("Attempt to vectorize horizontal reductions"));

static cl::opt<bool> ShouldStartVectorizeHorAtStore(
    "slp-vectorize-hor-store", cl::init(false), cl::Hidden,
    cl::desc(
        "Attempt to vectorize horizontal reductions feeding into a store"));

static cl::opt<int>
MaxVectorRegSizeOption("slp-max-reg-size", cl::init(128), cl::Hidden,
    cl::desc("Attempt to vectorize for this register size in bits"));

/// Limits the size of scheduling regions in a block.
/// It avoids long compile times for _very_ large blocks where vector
/// instructions are spread over a wide range.
/// This limit is way higher than needed by real-world functions.
static cl::opt<int>
ScheduleRegionSizeBudget("slp-schedule-budget", cl::init(100000), cl::Hidden,
    cl::desc("Limit the size of the SLP scheduling region per block"));

namespace {

// FIXME: Set this via cl::opt to allow overriding.
static const unsigned MinVecRegSize = 128;

static const unsigned RecursionMaxDepth = 12;

// Limit the number of alias checks. The limit is chosen so that
// it has no negative effect on the llvm benchmarks.
static const unsigned AliasedCheckLimit = 10;

// Another limit for the alias checks: The maximum distance between load/store
// instructions where alias checks are done.
// This limit is useful for very large basic blocks.
static const unsigned MaxMemDepDistance = 160;

/// If the ScheduleRegionSizeBudget is exhausted, we allow small scheduling
/// regions to be handled.
static const int MinScheduleRegionSize = 16;

/// \brief Predicate for the element types that the SLP vectorizer supports.
///
/// The most important thing to filter here are types which are invalid in LLVM
/// vectors. We also filter target specific types which have absolutely no
/// meaningful vectorization path such as x86_fp80 and ppc_fp128. This just
/// avoids spending time checking the cost model and realizing that they will
/// be inevitably scalarized.
static bool isValidElementType(Type *Ty) {
  return VectorType::isValidElementType(Ty) && !Ty->isX86_FP80Ty() &&
         !Ty->isPPC_FP128Ty();
}

/// \returns the parent basic block if all of the instructions in \p VL
/// are in the same block or null otherwise.
static BasicBlock *getSameBlock(ArrayRef<Value *> VL) {
  Instruction *I0 = dyn_cast<Instruction>(VL[0]);
  if (!I0)
    return nullptr;
  BasicBlock *BB = I0->getParent();
  for (int i = 1, e = VL.size(); i < e; i++) {
    Instruction *I = dyn_cast<Instruction>(VL[i]);
    if (!I)
      return nullptr;

    if (BB != I->getParent())
      return nullptr;
  }
  return BB;
}

/// \returns True if all of the values in \p VL are constants.
static bool allConstant(ArrayRef<Value *> VL) {
  for (unsigned i = 0, e = VL.size(); i < e; ++i)
    if (!isa<Constant>(VL[i]))
      return false;
  return true;
}

/// \returns True if all of the values in \p VL are identical.
static bool isSplat(ArrayRef<Value *> VL) {
  for (unsigned i = 1, e = VL.size(); i < e; ++i)
    if (VL[i] != VL[0])
      return false;
  return true;
}

///\returns Opcode that can be clubbed with \p Op to create an alternate
/// sequence which can later be merged as a ShuffleVector instruction.
static unsigned getAltOpcode(unsigned Op) {
  switch (Op) {
  case Instruction::FAdd:
    return Instruction::FSub;
  case Instruction::FSub:
    return Instruction::FAdd;
  case Instruction::Add:
    return Instruction::Sub;
  case Instruction::Sub:
    return Instruction::Add;
  default:
    return 0;
  }
}

///\returns bool representing if Opcode \p Op can be part
/// of an alternate sequence which can later be merged as
/// a ShuffleVector instruction.
static bool canCombineAsAltInst(unsigned Op) {
  return Op == Instruction::FAdd || Op == Instruction::FSub ||
         Op == Instruction::Sub || Op == Instruction::Add;
}

/// \returns ShuffleVector opcode if the instructions in \p VL form an
/// alternating fadd,fsub / fsub,fadd / add,sub / sub,add sequence
/// (i.e. opcodes of the form fadd,fsub,fadd,fsub,...), or zero otherwise.
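/// For example (hypothetical IR, for illustration only), a bundle such as
///   %v0 = fadd float %a0, %b0
///   %v1 = fsub float %a1, %b1
///   %v2 = fadd float %a2, %b2
///   %v3 = fsub float %a3, %b3
/// matches the fadd,fsub,fadd,fsub pattern: the even lanes can become one
/// vector fadd, the odd lanes one vector fsub, blended by a shufflevector.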
static unsigned isAltInst(ArrayRef<Value *> VL) {
  Instruction *I0 = dyn_cast<Instruction>(VL[0]);
  unsigned Opcode = I0->getOpcode();
  unsigned AltOpcode = getAltOpcode(Opcode);
  for (int i = 1, e = VL.size(); i < e; i++) {
    Instruction *I = dyn_cast<Instruction>(VL[i]);
    if (!I || I->getOpcode() != ((i & 1) ? AltOpcode : Opcode))
      return 0;
  }
  return Instruction::ShuffleVector;
}

/// \returns The opcode if all of the Instructions in \p VL have the same
/// opcode, or zero.
static unsigned getSameOpcode(ArrayRef<Value *> VL) {
  Instruction *I0 = dyn_cast<Instruction>(VL[0]);
  if (!I0)
    return 0;
  unsigned Opcode = I0->getOpcode();
  for (int i = 1, e = VL.size(); i < e; i++) {
    Instruction *I = dyn_cast<Instruction>(VL[i]);
    if (!I || Opcode != I->getOpcode()) {
      if (canCombineAsAltInst(Opcode) && i == 1)
        return isAltInst(VL);
      return 0;
    }
  }
  return Opcode;
}

/// Get the intersection (logical and) of all of the potential IR flags
/// of each scalar operation (VL) that will be converted into a vector (I).
/// Flag set: NSW, NUW, exact, and all of fast-math.
static void propagateIRFlags(Value *I, ArrayRef<Value *> VL) {
  if (auto *VecOp = dyn_cast<BinaryOperator>(I)) {
    if (auto *Intersection = dyn_cast<BinaryOperator>(VL[0])) {
      // Intersection is initialized to the 0th scalar,
      // so start counting from index '1'.
      for (int i = 1, e = VL.size(); i < e; ++i) {
        if (auto *Scalar = dyn_cast<BinaryOperator>(VL[i]))
          Intersection->andIRFlags(Scalar);
      }
      VecOp->copyIRFlags(Intersection);
    }
  }
}

/// \returns \p I after propagating metadata from \p VL.
static Instruction *propagateMetadata(Instruction *I, ArrayRef<Value *> VL) {
  Instruction *I0 = cast<Instruction>(VL[0]);
  SmallVector<std::pair<unsigned, MDNode *>, 4> Metadata;
  I0->getAllMetadataOtherThanDebugLoc(Metadata);

  for (unsigned i = 0, n = Metadata.size(); i != n; ++i) {
    unsigned Kind = Metadata[i].first;
    MDNode *MD = Metadata[i].second;

    // Merge this kind of metadata across all scalars in the bundle.
    for (int j = 1, e = VL.size(); MD && j != e; j++) {
      Instruction *IJ = cast<Instruction>(VL[j]);
      MDNode *IMD = IJ->getMetadata(Kind);

      switch (Kind) {
      default:
        MD = nullptr; // Remove unknown metadata
        break;
      case LLVMContext::MD_tbaa:
        MD = MDNode::getMostGenericTBAA(MD, IMD);
        break;
      case LLVMContext::MD_alias_scope:
        MD = MDNode::getMostGenericAliasScope(MD, IMD);
        break;
      case LLVMContext::MD_noalias:
        MD = MDNode::intersect(MD, IMD);
        break;
      case LLVMContext::MD_fpmath:
        MD = MDNode::getMostGenericFPMath(MD, IMD);
        break;
      case LLVMContext::MD_nontemporal:
        MD = MDNode::intersect(MD, IMD);
        break;
      }
    }
    I->setMetadata(Kind, MD);
  }
  return I;
}

/// \returns The type that all of the values in \p VL have or null if there
/// are different types.
static Type *getSameType(ArrayRef<Value *> VL) {
  Type *Ty = VL[0]->getType();
  for (int i = 1, e = VL.size(); i < e; i++)
    if (VL[i]->getType() != Ty)
      return nullptr;

  return Ty;
}

/// \returns True if the ExtractElement instructions in VL can be vectorized
/// to use the original vector.
static bool CanReuseExtract(ArrayRef<Value *> VL) {
  assert(Instruction::ExtractElement == getSameOpcode(VL) && "Invalid opcode");
  // Check if all of the extracts come from the same vector and from the
  // correct offset.
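  // For instance (hypothetical IR), the bundle
  //   %e0 = extractelement <4 x i32> %v, i32 0
  //   %e1 = extractelement <4 x i32> %v, i32 1
  //   %e2 = extractelement <4 x i32> %v, i32 2
  //   %e3 = extractelement <4 x i32> %v, i32 3
  // extracts all lanes of %v in order, so %v itself can be reused.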
  Value *VL0 = VL[0];
  ExtractElementInst *E0 = cast<ExtractElementInst>(VL0);
  Value *Vec = E0->getOperand(0);

  // We have to extract from the same vector type.
  unsigned NElts = Vec->getType()->getVectorNumElements();

  if (NElts != VL.size())
    return false;

  // Check that all of the indices extract from the correct offset.
  ConstantInt *CI = dyn_cast<ConstantInt>(E0->getOperand(1));
  if (!CI || CI->getZExtValue())
    return false;

  for (unsigned i = 1, e = VL.size(); i < e; ++i) {
    ExtractElementInst *E = cast<ExtractElementInst>(VL[i]);
    ConstantInt *CI = dyn_cast<ConstantInt>(E->getOperand(1));

    if (!CI || CI->getZExtValue() != i || E->getOperand(0) != Vec)
      return false;
  }

  return true;
}

/// \returns True if the in-tree use of \p Scalar also requires an extract,
/// i.e. \p Scalar remains a scalar operand of the vectorized \p UserInst.
static bool InTreeUserNeedToExtract(Value *Scalar, Instruction *UserInst,
                                    TargetLibraryInfo *TLI) {
  unsigned Opcode = UserInst->getOpcode();
  switch (Opcode) {
  case Instruction::Load: {
    LoadInst *LI = cast<LoadInst>(UserInst);
    return (LI->getPointerOperand() == Scalar);
  }
  case Instruction::Store: {
    StoreInst *SI = cast<StoreInst>(UserInst);
    return (SI->getPointerOperand() == Scalar);
  }
  case Instruction::Call: {
    CallInst *CI = cast<CallInst>(UserInst);
    Intrinsic::ID ID = getIntrinsicIDForCall(CI, TLI);
    if (hasVectorInstrinsicScalarOpd(ID, 1)) {
      return (CI->getArgOperand(1) == Scalar);
    }
    return false;
  }
  default:
    return false;
  }
}

/// \returns the AA location that is being accessed by the instruction.
static MemoryLocation getLocation(Instruction *I, AliasAnalysis *AA) {
  if (StoreInst *SI = dyn_cast<StoreInst>(I))
    return MemoryLocation::get(SI);
  if (LoadInst *LI = dyn_cast<LoadInst>(I))
    return MemoryLocation::get(LI);
  return MemoryLocation();
}

/// \returns True if the instruction is not a volatile or atomic load/store.
static bool isSimple(Instruction *I) {
  if (LoadInst *LI = dyn_cast<LoadInst>(I))
    return LI->isSimple();
  if (StoreInst *SI = dyn_cast<StoreInst>(I))
    return SI->isSimple();
  if (MemIntrinsic *MI = dyn_cast<MemIntrinsic>(I))
    return !MI->isVolatile();
  return true;
}

/// Bottom Up SLP Vectorizer.
class BoUpSLP {
public:
  typedef SmallVector<Value *, 8> ValueList;
  typedef SmallVector<Instruction *, 16> InstrList;
  typedef SmallPtrSet<Value *, 16> ValueSet;
  typedef SmallVector<StoreInst *, 8> StoreList;

  BoUpSLP(Function *Func, ScalarEvolution *Se, TargetTransformInfo *Tti,
          TargetLibraryInfo *TLi, AliasAnalysis *Aa, LoopInfo *Li,
          DominatorTree *Dt, AssumptionCache *AC)
      : NumLoadsWantToKeepOrder(0), NumLoadsWantToChangeOrder(0), F(Func),
        SE(Se), TTI(Tti), TLI(TLi), AA(Aa), LI(Li), DT(Dt),
        Builder(Se->getContext()) {
    CodeMetrics::collectEphemeralValues(F, AC, EphValues);
  }

  /// \brief Vectorize the tree that starts with the elements in \p VL.
  /// Returns the vectorized root.
  Value *vectorizeTree();

  /// \returns the cost incurred by unwanted spills and fills, caused by
  /// holding live values over call sites.
  int getSpillCost();

  /// \returns the vectorization cost of the tree built by buildTree.
  /// A negative number means that vectorization is profitable.
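  /// The total is the sum of the per-entry costs (getEntryCost), the cost of
  /// extracting externally used scalars, and the spill cost of keeping values
  /// live across calls (getSpillCost).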
  int getTreeCost();

  /// Construct a vectorizable tree that starts at \p Roots, ignoring users for
  /// the purpose of scheduling and extraction in the \p UserIgnoreLst.
  void buildTree(ArrayRef<Value *> Roots,
                 ArrayRef<Value *> UserIgnoreLst = None);

  /// Clear the internal data structures that are created by 'buildTree'.
  void deleteTree() {
    VectorizableTree.clear();
    ScalarToTreeEntry.clear();
    MustGather.clear();
    ExternalUses.clear();
    NumLoadsWantToKeepOrder = 0;
    NumLoadsWantToChangeOrder = 0;
    for (auto &Iter : BlocksSchedules) {
      BlockScheduling *BS = Iter.second.get();
      BS->clear();
    }
  }

  /// \brief Perform LICM and CSE on the newly generated gather sequences.
  void optimizeGatherSequence();

  /// \returns true if it is beneficial to reverse the vector order.
  bool shouldReorder() const {
    return NumLoadsWantToChangeOrder > NumLoadsWantToKeepOrder;
  }

  /// \return The vector element size in bits to use when vectorizing the
  /// expression tree ending at \p V. If V is a store, the size is the width of
  /// the stored value. Otherwise, the size is the width of the largest loaded
  /// value reaching V. This method is used by the vectorizer to calculate
  /// vectorization factors.
  unsigned getVectorElementSize(Value *V);

private:
  struct TreeEntry;

  /// \returns the cost of the vectorizable entry.
  int getEntryCost(TreeEntry *E);

  /// This is the recursive part of buildTree.
  void buildTree_rec(ArrayRef<Value *> Roots, unsigned Depth);

  /// Vectorize a single entry in the tree.
  Value *vectorizeTree(TreeEntry *E);

  /// Vectorize a single entry in the tree, starting in \p VL.
  Value *vectorizeTree(ArrayRef<Value *> VL);

  /// \returns the pointer to the vectorized value if \p VL is already
  /// vectorized, or NULL. This may happen in cycles.
  Value *alreadyVectorized(ArrayRef<Value *> VL) const;

  /// \returns the scalarization cost for this type. Scalarization in this
  /// context means the creation of vectors from a group of scalars.
  int getGatherCost(Type *Ty);

  /// \returns the scalarization cost for this list of values. Assuming that
  /// this subtree gets vectorized, we may need to extract the values from the
  /// roots. This method calculates the cost of extracting the values.
  int getGatherCost(ArrayRef<Value *> VL);

  /// \brief Set the Builder insert point to one after the last instruction in
  /// the bundle.
  void setInsertPointAfterBundle(ArrayRef<Value *> VL);

  /// \returns a vector from a collection of scalars in \p VL.
  Value *Gather(ArrayRef<Value *> VL, VectorType *Ty);

  /// \returns whether the VectorizableTree is fully vectorizable and will
  /// be beneficial even if the tree height is tiny.
  bool isFullyVectorizableTinyTree();

  /// \brief Reorder commutative operands in an alt-shuffle bundle if doing so
  /// results in vectorizable code.
  void reorderAltShuffleOperands(ArrayRef<Value *> VL,
                                 SmallVectorImpl<Value *> &Left,
                                 SmallVectorImpl<Value *> &Right);

  /// \brief Reorder commutative operands to get a better probability of
  /// generating vectorized code.
  void reorderInputsAccordingToOpcode(ArrayRef<Value *> VL,
                                      SmallVectorImpl<Value *> &Left,
                                      SmallVectorImpl<Value *> &Right);

  struct TreeEntry {
    TreeEntry() : Scalars(), VectorizedValue(nullptr), NeedToGather(false) {}

    /// \returns true if the scalars in VL are equal to this entry.
    bool isSame(ArrayRef<Value *> VL) const {
      assert(VL.size() == Scalars.size() && "Invalid size");
      return std::equal(VL.begin(), VL.end(), Scalars.begin());
    }

    /// A vector of scalars.
    ValueList Scalars;

    /// The Scalars are vectorized into this value. It is initialized to Null.
    Value *VectorizedValue;

    /// Do we need to gather this sequence?
    bool NeedToGather;
  };

  /// Create a new VectorizableTree entry.
  TreeEntry *newTreeEntry(ArrayRef<Value *> VL, bool Vectorized) {
    VectorizableTree.emplace_back();
    int idx = VectorizableTree.size() - 1;
    TreeEntry *Last = &VectorizableTree[idx];
    Last->Scalars.insert(Last->Scalars.begin(), VL.begin(), VL.end());
    Last->NeedToGather = !Vectorized;
    if (Vectorized) {
      for (int i = 0, e = VL.size(); i != e; ++i) {
        assert(!ScalarToTreeEntry.count(VL[i]) && "Scalar already in tree!");
        ScalarToTreeEntry[VL[i]] = idx;
      }
    } else {
      MustGather.insert(VL.begin(), VL.end());
    }
    return Last;
  }

  /// -- Vectorization State --
  /// Holds all of the tree entries.
  std::vector<TreeEntry> VectorizableTree;

  /// Maps a specific scalar to its tree entry.
  SmallDenseMap<Value *, int> ScalarToTreeEntry;

  /// A list of scalars that we found that we need to keep as scalars.
  ValueSet MustGather;

  /// This POD struct describes one external user in the vectorized tree.
  struct ExternalUser {
    ExternalUser(Value *S, llvm::User *U, int L)
        : Scalar(S), User(U), Lane(L) {}
    // Which scalar in our function.
    Value *Scalar;
    // Which user uses the scalar.
    llvm::User *User;
    // Which lane does the scalar belong to.
    int Lane;
  };
  typedef SmallVector<ExternalUser, 16> UserList;

  /// Checks if two instructions may access the same memory.
  ///
  /// \p Loc1 is the location of \p Inst1. It is passed explicitly because it
  /// is invariant in the calling loop.
  bool isAliased(const MemoryLocation &Loc1, Instruction *Inst1,
                 Instruction *Inst2) {
    // First check if the result is already in the cache.
    AliasCacheKey key = std::make_pair(Inst1, Inst2);
    Optional<bool> &result = AliasCache[key];
    if (result.hasValue()) {
      return result.getValue();
    }
    MemoryLocation Loc2 = getLocation(Inst2, AA);
    bool aliased = true;
    if (Loc1.Ptr && Loc2.Ptr && isSimple(Inst1) && isSimple(Inst2)) {
      // Do the alias check.
      aliased = AA->alias(Loc1, Loc2);
    }
    // Store the result in the cache.
    result = aliased;
    return aliased;
  }

  typedef std::pair<Instruction *, Instruction *> AliasCacheKey;

  /// Cache for alias results.
  /// TODO: consider moving this to the AliasAnalysis itself.
  DenseMap<AliasCacheKey, Optional<bool>> AliasCache;

  /// Removes an instruction from its block and eventually deletes it.
  /// It's like Instruction::eraseFromParent() except that the actual deletion
  /// is delayed until BoUpSLP is destructed.
  /// This is required to ensure that there are no incorrect collisions in the
  /// AliasCache, which can happen if a new instruction is allocated at the
  /// same address as a previously deleted instruction.
  void eraseInstruction(Instruction *I) {
    I->removeFromParent();
    I->dropAllReferences();
    DeletedInstructions.push_back(std::unique_ptr<Instruction>(I));
  }

  /// Temporary store for deleted instructions. Instructions will be deleted
  /// eventually when the BoUpSLP is destructed.
  SmallVector<std::unique_ptr<Instruction>, 8> DeletedInstructions;

  /// A list of values that need to be extracted out of the tree.
  /// This list holds pairs of (Internal Scalar : External User).
  UserList ExternalUses;

  /// Values used only by @llvm.assume calls.
  SmallPtrSet<const Value *, 32> EphValues;

  /// Holds all of the instructions that we gathered.
  SetVector<Instruction *> GatherSeq;
  /// A list of blocks that we are going to CSE.
  SetVector<BasicBlock *> CSEBlocks;

  /// Contains all scheduling relevant data for an instruction.
  /// A ScheduleData either represents a single instruction or a member of an
  /// instruction bundle (= a group of instructions which is combined into a
  /// vector instruction).
  struct ScheduleData {
    // The initial value for the dependency counters. It means that the
    // dependencies are not calculated yet.
    enum { InvalidDeps = -1 };

    ScheduleData()
        : Inst(nullptr), FirstInBundle(nullptr), NextInBundle(nullptr),
          NextLoadStore(nullptr), SchedulingRegionID(0), SchedulingPriority(0),
          Dependencies(InvalidDeps), UnscheduledDeps(InvalidDeps),
          UnscheduledDepsInBundle(InvalidDeps), IsScheduled(false) {}

    void init(int BlockSchedulingRegionID) {
      FirstInBundle = this;
      NextInBundle = nullptr;
      NextLoadStore = nullptr;
      IsScheduled = false;
      SchedulingRegionID = BlockSchedulingRegionID;
      UnscheduledDepsInBundle = UnscheduledDeps;
      clearDependencies();
    }

    /// Returns true if the dependency information has been calculated.
    bool hasValidDependencies() const { return Dependencies != InvalidDeps; }

    /// Returns true for single instructions and for bundle representatives
    /// (= the head of a bundle).
    bool isSchedulingEntity() const { return FirstInBundle == this; }

    /// Returns true if it represents an instruction bundle and not only a
    /// single instruction.
    bool isPartOfBundle() const {
      return NextInBundle != nullptr || FirstInBundle != this;
    }

    /// Returns true if it is ready for scheduling, i.e. it has no more
    /// unscheduled depending instructions/bundles.
    bool isReady() const {
      assert(isSchedulingEntity() &&
             "can't consider non-scheduling entity for ready list");
      return UnscheduledDepsInBundle == 0 && !IsScheduled;
    }

    /// Modifies the number of unscheduled dependencies, also updating it for
    /// the whole bundle.
    int incrementUnscheduledDeps(int Incr) {
      UnscheduledDeps += Incr;
      return FirstInBundle->UnscheduledDepsInBundle += Incr;
    }

    /// Sets the number of unscheduled dependencies to the number of
    /// dependencies.
    void resetUnscheduledDeps() {
      incrementUnscheduledDeps(Dependencies - UnscheduledDeps);
    }

    /// Clears all dependency information.
    void clearDependencies() {
      Dependencies = InvalidDeps;
      resetUnscheduledDeps();
      MemoryDependencies.clear();
    }

    void dump(raw_ostream &os) const {
      if (!isSchedulingEntity()) {
        os << "/ " << *Inst;
      } else if (NextInBundle) {
        os << '[' << *Inst;
        ScheduleData *SD = NextInBundle;
        while (SD) {
          os << ';' << *SD->Inst;
          SD = SD->NextInBundle;
        }
        os << ']';
      } else {
        os << *Inst;
      }
    }

    Instruction *Inst;

    /// Points to the head in an instruction bundle (and always to this for
    /// single instructions).
    ScheduleData *FirstInBundle;

    /// Singly linked list of all instructions in a bundle. Null if it is a
    /// single instruction.
    ScheduleData *NextInBundle;

    /// Singly linked list of all memory instructions (e.g. load, store, call)
    /// in the block - until the end of the scheduling region.
    ScheduleData *NextLoadStore;

    /// The dependent memory instructions.
    /// This list is derived on demand in calculateDependencies().
    SmallVector<ScheduleData *, 4> MemoryDependencies;

    /// This ScheduleData is in the current scheduling region if this matches
    /// the current SchedulingRegionID of BlockScheduling.
    int SchedulingRegionID;

    /// Used for getting a "good" final ordering of instructions.
    int SchedulingPriority;

    /// The number of dependencies. Consists of the number of users of the
    /// instruction plus the number of dependent memory instructions (if any).
    /// This value is calculated on demand.
    /// If InvalidDeps, the number of dependencies is not calculated yet.
    int Dependencies;

    /// The number of dependencies minus the number of dependencies of scheduled
    /// instructions. As soon as this is zero, the instruction/bundle gets ready
    /// for scheduling.
    /// Note that this is negative as long as Dependencies is not calculated.
    int UnscheduledDeps;

    /// The sum of UnscheduledDeps in a bundle. Equals to UnscheduledDeps for
    /// single instructions.
    int UnscheduledDepsInBundle;

    /// True if this instruction is scheduled (or considered as scheduled in the
    /// dry-run).
    bool IsScheduled;
  };

#ifndef NDEBUG
  friend raw_ostream &operator<<(raw_ostream &os,
                                 const BoUpSLP::ScheduleData &SD);
#endif

  /// Contains all scheduling data for a basic block.
  struct BlockScheduling {
    BlockScheduling(BasicBlock *BB)
        : BB(BB), ChunkSize(BB->size()), ChunkPos(ChunkSize),
          ScheduleStart(nullptr), ScheduleEnd(nullptr),
          FirstLoadStoreInRegion(nullptr), LastLoadStoreInRegion(nullptr),
          ScheduleRegionSize(0),
          ScheduleRegionSizeLimit(ScheduleRegionSizeBudget),
          // Make sure that the initial SchedulingRegionID is greater than the
          // initial SchedulingRegionID in ScheduleData (which is 0).
          SchedulingRegionID(1) {}

    void clear() {
      ReadyInsts.clear();
      ScheduleStart = nullptr;
      ScheduleEnd = nullptr;
      FirstLoadStoreInRegion = nullptr;
      LastLoadStoreInRegion = nullptr;

      // Reduce the maximum schedule region size by the size of the
      // previous scheduling run.
      ScheduleRegionSizeLimit -= ScheduleRegionSize;
      if (ScheduleRegionSizeLimit < MinScheduleRegionSize)
        ScheduleRegionSizeLimit = MinScheduleRegionSize;
      ScheduleRegionSize = 0;

      // Make a new scheduling region, i.e. all existing ScheduleData is not
      // in the new region yet.
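      // Bumping the ID is a lazy reset: getScheduleData() ignores any
      // ScheduleData whose SchedulingRegionID does not match the current one,
      // so no per-instruction clearing is needed here.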
      ++SchedulingRegionID;
    }

    ScheduleData *getScheduleData(Value *V) {
      ScheduleData *SD = ScheduleDataMap[V];
      if (SD && SD->SchedulingRegionID == SchedulingRegionID)
        return SD;
      return nullptr;
    }

    bool isInSchedulingRegion(ScheduleData *SD) {
      return SD->SchedulingRegionID == SchedulingRegionID;
    }

    /// Marks an instruction as scheduled and puts all dependent ready
    /// instructions into the ready-list.
    template <typename ReadyListType>
    void schedule(ScheduleData *SD, ReadyListType &ReadyList) {
      SD->IsScheduled = true;
      DEBUG(dbgs() << "SLP: schedule " << *SD << "\n");

      ScheduleData *BundleMember = SD;
      while (BundleMember) {
        // Handle the def-use chain dependencies.
        for (Use &U : BundleMember->Inst->operands()) {
          ScheduleData *OpDef = getScheduleData(U.get());
          if (OpDef && OpDef->hasValidDependencies() &&
              OpDef->incrementUnscheduledDeps(-1) == 0) {
            // There are no more unscheduled dependencies after decrementing,
            // so we can put the dependent instruction into the ready list.
            ScheduleData *DepBundle = OpDef->FirstInBundle;
            assert(!DepBundle->IsScheduled &&
                   "already scheduled bundle gets ready");
            ReadyList.insert(DepBundle);
            DEBUG(dbgs() << "SLP: gets ready (def): " << *DepBundle << "\n");
          }
        }
        // Handle the memory dependencies.
        for (ScheduleData *MemoryDepSD : BundleMember->MemoryDependencies) {
          if (MemoryDepSD->incrementUnscheduledDeps(-1) == 0) {
            // There are no more unscheduled dependencies after decrementing,
            // so we can put the dependent instruction into the ready list.
            ScheduleData *DepBundle = MemoryDepSD->FirstInBundle;
            assert(!DepBundle->IsScheduled &&
                   "already scheduled bundle gets ready");
            ReadyList.insert(DepBundle);
            DEBUG(dbgs() << "SLP: gets ready (mem): " << *DepBundle << "\n");
          }
        }
        BundleMember = BundleMember->NextInBundle;
      }
    }

    /// Put all instructions into the ReadyList which are ready for scheduling.
    template <typename ReadyListType>
    void initialFillReadyList(ReadyListType &ReadyList) {
      for (auto *I = ScheduleStart; I != ScheduleEnd; I = I->getNextNode()) {
        ScheduleData *SD = getScheduleData(I);
        if (SD->isSchedulingEntity() && SD->isReady()) {
          ReadyList.insert(SD);
          DEBUG(dbgs() << "SLP: initially in ready list: " << *I << "\n");
        }
      }
    }

    /// Checks if a bundle of instructions can be scheduled, i.e. has no
    /// cyclic dependencies. This is only a dry-run, no instructions are
    /// actually moved at this stage.
    bool tryScheduleBundle(ArrayRef<Value *> VL, BoUpSLP *SLP);

    /// Un-bundles a group of instructions.
    void cancelScheduling(ArrayRef<Value *> VL);

    /// Extends the scheduling region so that V is inside the region.
    /// \returns true if the region size is within the limit.
    bool extendSchedulingRegion(Value *V);

    /// Initialize the ScheduleData structures for new instructions in the
    /// scheduling region.
    void initScheduleData(Instruction *FromI, Instruction *ToI,
                          ScheduleData *PrevLoadStore,
                          ScheduleData *NextLoadStore);

    /// Updates the dependency information of a bundle and of all instructions/
    /// bundles which depend on the original bundle.
    void calculateDependencies(ScheduleData *SD, bool InsertInReadyList,
                               BoUpSLP *SLP);

    /// Sets all instructions in the scheduling region to un-scheduled.
    void resetSchedule();

    BasicBlock *BB;

    /// Simple memory allocation for ScheduleData.
    std::vector<std::unique_ptr<ScheduleData[]>> ScheduleDataChunks;

    /// The size of a ScheduleData array in ScheduleDataChunks.
    int ChunkSize;

    /// The allocator position in the current chunk, which is the last entry
    /// of ScheduleDataChunks.
    int ChunkPos;

    /// Attaches ScheduleData to Instruction.
    /// Note that the mapping survives during all vectorization iterations, i.e.
    /// ScheduleData structures are recycled.
    DenseMap<Value *, ScheduleData *> ScheduleDataMap;

    struct ReadyList : SmallVector<ScheduleData *, 8> {
      void insert(ScheduleData *SD) { push_back(SD); }
    };

    /// The ready-list for scheduling (only used for the dry-run).
    ReadyList ReadyInsts;

    /// The first instruction of the scheduling region.
    Instruction *ScheduleStart;

    /// The first instruction _after_ the scheduling region.
    Instruction *ScheduleEnd;

    /// The first memory accessing instruction in the scheduling region
    /// (can be null).
    ScheduleData *FirstLoadStoreInRegion;

    /// The last memory accessing instruction in the scheduling region
    /// (can be null).
    ScheduleData *LastLoadStoreInRegion;

    /// The current size of the scheduling region.
    int ScheduleRegionSize;

    /// The maximum size allowed for the scheduling region.
    int ScheduleRegionSizeLimit;

    /// The ID of the scheduling region. For a new vectorization iteration this
    /// is incremented which "removes" all ScheduleData from the region.
    int SchedulingRegionID;
  };

  /// Attaches the BlockScheduling structures to basic blocks.
  MapVector<BasicBlock *, std::unique_ptr<BlockScheduling>> BlocksSchedules;

  /// Performs the "real" scheduling. Done before vectorization is actually
  /// performed in a basic block.
  void scheduleBlock(BlockScheduling *BS);

  /// List of users to ignore during scheduling and that don't need extracting.
  ArrayRef<Value *> UserIgnoreList;

  // Number of load bundles that contain consecutive loads.
  int NumLoadsWantToKeepOrder;

  // Number of load bundles of size 2, which are consecutive loads if reversed.
  int NumLoadsWantToChangeOrder;

  // Analysis and block reference.
  Function *F;
  ScalarEvolution *SE;
  TargetTransformInfo *TTI;
  TargetLibraryInfo *TLI;
  AliasAnalysis *AA;
  LoopInfo *LI;
  DominatorTree *DT;
  /// Instruction builder to construct the vectorized tree.
  IRBuilder<> Builder;
};

#ifndef NDEBUG
raw_ostream &operator<<(raw_ostream &os, const BoUpSLP::ScheduleData &SD) {
  SD.dump(os);
  return os;
}
#endif

void BoUpSLP::buildTree(ArrayRef<Value *> Roots,
                        ArrayRef<Value *> UserIgnoreLst) {
  deleteTree();
  UserIgnoreList = UserIgnoreLst;
  if (!getSameType(Roots))
    return;
  buildTree_rec(Roots, 0);

  // Collect the values that we need to extract from the tree.
  for (int EIdx = 0, EE = VectorizableTree.size(); EIdx < EE; ++EIdx) {
    TreeEntry *Entry = &VectorizableTree[EIdx];

    // For each lane:
    for (int Lane = 0, LE = Entry->Scalars.size(); Lane != LE; ++Lane) {
      Value *Scalar = Entry->Scalars[Lane];

      // No need to handle users of gathered values.
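      // Gathered scalars stay in the IR as scalars (the gather sequence just
      // inserts them into a vector), so their users keep using them directly
      // and no extract is needed.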
      if (Entry->NeedToGather)
        continue;

      for (User *U : Scalar->users()) {
        DEBUG(dbgs() << "SLP: Checking user:" << *U << ".\n");

        Instruction *UserInst = dyn_cast<Instruction>(U);
        if (!UserInst)
          continue;

        // Skip in-tree scalars that become vectors.
        if (ScalarToTreeEntry.count(U)) {
          int Idx = ScalarToTreeEntry[U];
          TreeEntry *UseEntry = &VectorizableTree[Idx];
          Value *UseScalar = UseEntry->Scalars[0];
          // Some in-tree scalars will remain as scalar in vectorized
          // instructions. If that is the case, the one in Lane 0 will
          // be used.
          if (UseScalar != U ||
              !InTreeUserNeedToExtract(Scalar, UserInst, TLI)) {
            DEBUG(dbgs() << "SLP: \tInternal user will be removed:" << *U
                         << ".\n");
            assert(!VectorizableTree[Idx].NeedToGather && "Bad state");
            continue;
          }
        }

        // Ignore users in the user ignore list.
        if (std::find(UserIgnoreList.begin(), UserIgnoreList.end(), UserInst) !=
            UserIgnoreList.end())
          continue;

        DEBUG(dbgs() << "SLP: Need to extract:" << *U << " from lane "
                     << Lane << " from " << *Scalar << ".\n");
        ExternalUses.push_back(ExternalUser(Scalar, U, Lane));
      }
    }
  }
}

void BoUpSLP::buildTree_rec(ArrayRef<Value *> VL, unsigned Depth) {
  bool SameTy = getSameType(VL); (void)SameTy;
  bool isAltShuffle = false;
  assert(SameTy && "Invalid types!");

  if (Depth == RecursionMaxDepth) {
    DEBUG(dbgs() << "SLP: Gathering due to max recursion depth.\n");
    newTreeEntry(VL, false);
    return;
  }

  // Don't handle vectors.
  if (VL[0]->getType()->isVectorTy()) {
    DEBUG(dbgs() << "SLP: Gathering due to vector type.\n");
    newTreeEntry(VL, false);
    return;
  }

  if (StoreInst *SI = dyn_cast<StoreInst>(VL[0]))
    if (SI->getValueOperand()->getType()->isVectorTy()) {
      DEBUG(dbgs() << "SLP: Gathering due to store vector type.\n");
      newTreeEntry(VL, false);
      return;
    }
  unsigned Opcode = getSameOpcode(VL);

  // Check that this shuffle vector refers to the alternate
  // sequence of opcodes.
  if (Opcode == Instruction::ShuffleVector) {
    Instruction *I0 = dyn_cast<Instruction>(VL[0]);
    unsigned Op = I0->getOpcode();
    if (Op != Instruction::ShuffleVector)
      isAltShuffle = true;
  }

  // If all of the operands are identical or constant we have a simple solution.
  if (allConstant(VL) || isSplat(VL) || !getSameBlock(VL) || !Opcode) {
    DEBUG(dbgs() << "SLP: Gathering due to C,S,B,O. \n");
    newTreeEntry(VL, false);
    return;
  }

  // We now know that this is a vector of instructions of the same type from
  // the same block.

  // Don't vectorize ephemeral values.
  for (unsigned i = 0, e = VL.size(); i != e; ++i) {
    if (EphValues.count(VL[i])) {
      DEBUG(dbgs() << "SLP: The instruction (" << *VL[i]
                   << ") is ephemeral.\n");
      newTreeEntry(VL, false);
      return;
    }
  }

  // Check if this is a duplicate of another entry.
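  // (a "diamond" in the use-def graph: the same bundle can be reached via two
  // different users and must only be built once).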
  if (ScalarToTreeEntry.count(VL[0])) {
    int Idx = ScalarToTreeEntry[VL[0]];
    TreeEntry *E = &VectorizableTree[Idx];
    for (unsigned i = 0, e = VL.size(); i != e; ++i) {
      DEBUG(dbgs() << "SLP: \tChecking bundle: " << *VL[i] << ".\n");
      if (E->Scalars[i] != VL[i]) {
        DEBUG(dbgs() << "SLP: Gathering due to partial overlap.\n");
        newTreeEntry(VL, false);
        return;
      }
    }
    DEBUG(dbgs() << "SLP: Perfect diamond merge at " << *VL[0] << ".\n");
    return;
  }

  // Check that none of the instructions in the bundle are already in the tree.
  for (unsigned i = 0, e = VL.size(); i != e; ++i) {
    if (ScalarToTreeEntry.count(VL[i])) {
      DEBUG(dbgs() << "SLP: The instruction (" << *VL[i]
                   << ") is already in tree.\n");
      newTreeEntry(VL, false);
      return;
    }
  }

  // If any of the scalars is marked as a value that needs to stay scalar then
  // we need to gather the scalars.
  for (unsigned i = 0, e = VL.size(); i != e; ++i) {
    if (MustGather.count(VL[i])) {
      DEBUG(dbgs() << "SLP: Gathering due to gathered scalar.\n");
      newTreeEntry(VL, false);
      return;
    }
  }

  // Check that all of the users of the scalars that we want to vectorize are
  // schedulable.
  Instruction *VL0 = cast<Instruction>(VL[0]);
  BasicBlock *BB = cast<Instruction>(VL0)->getParent();

  if (!DT->isReachableFromEntry(BB)) {
    // Don't go into unreachable blocks. They may contain instructions with
    // dependency cycles which confuse the final scheduling.
    DEBUG(dbgs() << "SLP: bundle in unreachable block.\n");
    newTreeEntry(VL, false);
    return;
  }

  // Check that every instruction appears once in this bundle.
  for (unsigned i = 0, e = VL.size(); i < e; ++i)
    for (unsigned j = i + 1; j < e; ++j)
      if (VL[i] == VL[j]) {
        DEBUG(dbgs() << "SLP: Scalar used twice in bundle.\n");
        newTreeEntry(VL, false);
        return;
      }

  auto &BSRef = BlocksSchedules[BB];
  if (!BSRef) {
    BSRef = llvm::make_unique<BlockScheduling>(BB);
  }
  BlockScheduling &BS = *BSRef.get();

  if (!BS.tryScheduleBundle(VL, this)) {
    DEBUG(dbgs() << "SLP: We are not able to schedule this bundle!\n");
    assert((!BS.getScheduleData(VL[0]) ||
            !BS.getScheduleData(VL[0])->isPartOfBundle()) &&
           "tryScheduleBundle should cancelScheduling on failure");
    newTreeEntry(VL, false);
    return;
  }
  DEBUG(dbgs() << "SLP: We are able to schedule this bundle.\n");

  switch (Opcode) {
  case Instruction::PHI: {
    PHINode *PH = dyn_cast<PHINode>(VL0);

    // Check for terminator values (e.g. invoke).
    for (unsigned j = 0; j < VL.size(); ++j)
      for (unsigned i = 0, e = PH->getNumIncomingValues(); i < e; ++i) {
        TerminatorInst *Term = dyn_cast<TerminatorInst>(
            cast<PHINode>(VL[j])->getIncomingValueForBlock(
                PH->getIncomingBlock(i)));
        if (Term) {
          DEBUG(dbgs()
                << "SLP: Need to swizzle PHINodes (TerminatorInst use).\n");
          BS.cancelScheduling(VL);
          newTreeEntry(VL, false);
          return;
        }
      }

    newTreeEntry(VL, true);
    DEBUG(dbgs() << "SLP: added a vector of PHINodes.\n");

    for (unsigned i = 0, e = PH->getNumIncomingValues(); i < e; ++i) {
      ValueList Operands;
      // Prepare the operand vector.
      for (unsigned j = 0; j < VL.size(); ++j)
        Operands.push_back(cast<PHINode>(VL[j])->getIncomingValueForBlock(
            PH->getIncomingBlock(i)));

      buildTree_rec(Operands, Depth + 1);
    }
    return;
  }
  case Instruction::ExtractElement: {
    bool Reuse = CanReuseExtract(VL);
    if (Reuse) {
      DEBUG(dbgs() << "SLP: Reusing extract sequence.\n");
    } else {
      BS.cancelScheduling(VL);
    }
    newTreeEntry(VL, Reuse);
    return;
  }
  case Instruction::Load: {
    // Check that a vectorized load would load the same memory as a scalar
    // load.
    // For example, we don't want to vectorize loads that are smaller than
    // 8 bits. Even though we have a packed struct {<i2, i2, i2, i2>} LLVM
    // treats loading/storing it as an i8 struct. If we vectorize loads/stores
    // from such a struct, we read/write packed bits disagreeing with the
    // unvectorized version.
    const DataLayout &DL = F->getParent()->getDataLayout();
    Type *ScalarTy = VL[0]->getType();

    if (DL.getTypeSizeInBits(ScalarTy) !=
        DL.getTypeAllocSizeInBits(ScalarTy)) {
      BS.cancelScheduling(VL);
      newTreeEntry(VL, false);
      DEBUG(dbgs() << "SLP: Gathering loads of non-packed type.\n");
      return;
    }
    // Check if the loads are consecutive, or if we need to swizzle them.
    for (unsigned i = 0, e = VL.size() - 1; i < e; ++i) {
      LoadInst *L = cast<LoadInst>(VL[i]);
      if (!L->isSimple()) {
        BS.cancelScheduling(VL);
        newTreeEntry(VL, false);
        DEBUG(dbgs() << "SLP: Gathering non-simple loads.\n");
        return;
      }

      if (!isConsecutiveAccess(VL[i], VL[i + 1], DL, *SE)) {
        if (VL.size() == 2 && isConsecutiveAccess(VL[1], VL[0], DL, *SE)) {
          ++NumLoadsWantToChangeOrder;
        }
        BS.cancelScheduling(VL);
        newTreeEntry(VL, false);
        DEBUG(dbgs() << "SLP: Gathering non-consecutive loads.\n");
        return;
      }
    }
    ++NumLoadsWantToKeepOrder;
    newTreeEntry(VL, true);
    DEBUG(dbgs() << "SLP: added a vector of loads.\n");
    return;
  }
  case Instruction::ZExt:
  case Instruction::SExt:
  case Instruction::FPToUI:
  case Instruction::FPToSI:
  case Instruction::FPExt:
  case Instruction::PtrToInt:
  case Instruction::IntToPtr:
  case Instruction::SIToFP:
  case Instruction::UIToFP:
  case Instruction::Trunc:
  case Instruction::FPTrunc:
  case Instruction::BitCast: {
    Type *SrcTy = VL0->getOperand(0)->getType();
    for (unsigned i = 0; i < VL.size(); ++i) {
      Type *Ty = cast<Instruction>(VL[i])->getOperand(0)->getType();
      if (Ty != SrcTy || !isValidElementType(Ty)) {
        BS.cancelScheduling(VL);
        newTreeEntry(VL, false);
        DEBUG(dbgs() << "SLP: Gathering casts with different src types.\n");
        return;
      }
    }
    newTreeEntry(VL, true);
    DEBUG(dbgs() << "SLP: added a vector of casts.\n");

    for (unsigned i = 0, e = VL0->getNumOperands(); i < e; ++i) {
      ValueList Operands;
      // Prepare the operand vector.
      for (unsigned j = 0; j < VL.size(); ++j)
        Operands.push_back(cast<Instruction>(VL[j])->getOperand(i));

      buildTree_rec(Operands, Depth + 1);
    }
    return;
  }
  case Instruction::ICmp:
  case Instruction::FCmp: {
    // Check that all of the compares have the same predicate.
    CmpInst::Predicate P0 = cast<CmpInst>(VL0)->getPredicate();
    Type *ComparedTy = cast<Instruction>(VL[0])->getOperand(0)->getType();
    for (unsigned i = 1, e = VL.size(); i < e; ++i) {
      CmpInst *Cmp = cast<CmpInst>(VL[i]);
      if (Cmp->getPredicate() != P0 ||
          Cmp->getOperand(0)->getType() != ComparedTy) {
        BS.cancelScheduling(VL);
        newTreeEntry(VL, false);
        DEBUG(dbgs() << "SLP: Gathering cmp with different predicate.\n");
        return;
      }
    }

    newTreeEntry(VL, true);
    DEBUG(dbgs() << "SLP: added a vector of compares.\n");

    for (unsigned i = 0, e = VL0->getNumOperands(); i < e; ++i) {
      ValueList Operands;
      // Prepare the operand vector.
      for (unsigned j = 0; j < VL.size(); ++j)
        Operands.push_back(cast<Instruction>(VL[j])->getOperand(i));

      buildTree_rec(Operands, Depth + 1);
    }
    return;
  }
  case Instruction::Select:
  case Instruction::Add:
  case Instruction::FAdd:
  case Instruction::Sub:
  case Instruction::FSub:
  case Instruction::Mul:
  case Instruction::FMul:
  case Instruction::UDiv:
  case Instruction::SDiv:
  case Instruction::FDiv:
  case Instruction::URem:
  case Instruction::SRem:
  case Instruction::FRem:
  case Instruction::Shl:
  case Instruction::LShr:
  case Instruction::AShr:
  case Instruction::And:
  case Instruction::Or:
  case Instruction::Xor: {
    newTreeEntry(VL, true);
    DEBUG(dbgs() << "SLP: added a vector of bin op.\n");

    // Sort operands of the instructions so that each side is more likely to
    // have the same opcode.
    if (isa<BinaryOperator>(VL0) && VL0->isCommutative()) {
      ValueList Left, Right;
      reorderInputsAccordingToOpcode(VL, Left, Right);
      buildTree_rec(Left, Depth + 1);
      buildTree_rec(Right, Depth + 1);
      return;
    }

    for (unsigned i = 0, e = VL0->getNumOperands(); i < e; ++i) {
      ValueList Operands;
      // Prepare the operand vector.
      for (unsigned j = 0; j < VL.size(); ++j)
        Operands.push_back(cast<Instruction>(VL[j])->getOperand(i));

      buildTree_rec(Operands, Depth + 1);
    }
    return;
  }
  case Instruction::GetElementPtr: {
    // We don't combine GEPs with complicated (nested) indexing.
    for (unsigned j = 0; j < VL.size(); ++j) {
      if (cast<Instruction>(VL[j])->getNumOperands() != 2) {
        DEBUG(dbgs() << "SLP: not-vectorizable GEP (nested indexes).\n");
        BS.cancelScheduling(VL);
        newTreeEntry(VL, false);
        return;
      }
    }

    // We can't combine several GEPs into one vector if they operate on
    // different types.
    Type *Ty0 = cast<Instruction>(VL0)->getOperand(0)->getType();
    for (unsigned j = 0; j < VL.size(); ++j) {
      Type *CurTy = cast<Instruction>(VL[j])->getOperand(0)->getType();
      if (Ty0 != CurTy) {
        DEBUG(dbgs() << "SLP: not-vectorizable GEP (different types).\n");
        BS.cancelScheduling(VL);
        newTreeEntry(VL, false);
        return;
      }
    }

    // We don't combine GEPs with non-constant indexes.
    for (unsigned j = 0; j < VL.size(); ++j) {
      auto Op = cast<Instruction>(VL[j])->getOperand(1);
      if (!isa<ConstantInt>(Op)) {
        DEBUG(
            dbgs() << "SLP: not-vectorizable GEP (non-constant indexes).\n");
        BS.cancelScheduling(VL);
        newTreeEntry(VL, false);
        return;
      }
    }

    newTreeEntry(VL, true);
    DEBUG(dbgs() << "SLP: added a vector of GEPs.\n");
    for (unsigned i = 0, e = 2; i < e; ++i) {
      ValueList Operands;
      // Prepare the operand vector.
      for (unsigned j = 0; j < VL.size(); ++j)
        Operands.push_back(cast<Instruction>(VL[j])->getOperand(i));

      buildTree_rec(Operands, Depth + 1);
    }
    return;
  }
  case Instruction::Store: {
    const DataLayout &DL = F->getParent()->getDataLayout();
    // Check if the stores are consecutive, or if we need to swizzle them.
    for (unsigned i = 0, e = VL.size() - 1; i < e; ++i)
      if (!isConsecutiveAccess(VL[i], VL[i + 1], DL, *SE)) {
        BS.cancelScheduling(VL);
        newTreeEntry(VL, false);
        DEBUG(dbgs() << "SLP: Non-consecutive store.\n");
        return;
      }

    newTreeEntry(VL, true);
    DEBUG(dbgs() << "SLP: added a vector of stores.\n");

    ValueList Operands;
    for (unsigned j = 0; j < VL.size(); ++j)
      Operands.push_back(cast<Instruction>(VL[j])->getOperand(0));

    buildTree_rec(Operands, Depth + 1);
    return;
  }
  case Instruction::Call: {
    // Check if the calls are all to the same vectorizable intrinsic.
    CallInst *CI = cast<CallInst>(VL[0]);
    // Check if this is an Intrinsic call or something that can be
    // represented by an intrinsic call.
    Intrinsic::ID ID = getIntrinsicIDForCall(CI, TLI);
    if (!isTriviallyVectorizable(ID)) {
      BS.cancelScheduling(VL);
      newTreeEntry(VL, false);
      DEBUG(dbgs() << "SLP: Non-vectorizable call.\n");
      return;
    }
    Function *Int = CI->getCalledFunction();
    Value *A1I = nullptr;
    if (hasVectorInstrinsicScalarOpd(ID, 1))
      A1I = CI->getArgOperand(1);
    for (unsigned i = 1, e = VL.size(); i != e; ++i) {
      CallInst *CI2 = dyn_cast<CallInst>(VL[i]);
      if (!CI2 || CI2->getCalledFunction() != Int ||
          getIntrinsicIDForCall(CI2, TLI) != ID) {
        BS.cancelScheduling(VL);
        newTreeEntry(VL, false);
        DEBUG(dbgs() << "SLP: mismatched calls:" << *CI << "!=" << *VL[i]
                     << "\n");
        return;
      }
      // ctlz, cttz and powi are special intrinsics whose second argument
      // should be the same in order for them to be vectorized.
      if (hasVectorInstrinsicScalarOpd(ID, 1)) {
        Value *A1J = CI2->getArgOperand(1);
        if (A1I != A1J) {
          BS.cancelScheduling(VL);
          newTreeEntry(VL, false);
          DEBUG(dbgs() << "SLP: mismatched arguments in call:" << *CI
                       << " argument " << A1I << "!=" << A1J << "\n");
          return;
        }
      }
    }

    newTreeEntry(VL, true);
    for (unsigned i = 0, e = CI->getNumArgOperands(); i != e; ++i) {
      ValueList Operands;
      // Prepare the operand vector.
      for (unsigned j = 0; j < VL.size(); ++j) {
        CallInst *CI2 = dyn_cast<CallInst>(VL[j]);
        Operands.push_back(CI2->getArgOperand(i));
      }
      buildTree_rec(Operands, Depth + 1);
    }
    return;
  }
  case Instruction::ShuffleVector: {
    // If this is not an alternate sequence of opcodes like add-sub
    // then do not vectorize this instruction.
    if (!isAltShuffle) {
      BS.cancelScheduling(VL);
      newTreeEntry(VL, false);
      DEBUG(dbgs() << "SLP: ShuffleVector are not vectorized.\n");
      return;
    }
    newTreeEntry(VL, true);
    DEBUG(dbgs() << "SLP: added a ShuffleVector op.\n");

    // Reorder operands if reordering would enable vectorization.
    if (isa<BinaryOperator>(VL0)) {
      ValueList Left, Right;
      reorderAltShuffleOperands(VL, Left, Right);
      buildTree_rec(Left, Depth + 1);
      buildTree_rec(Right, Depth + 1);
      return;
    }

    for (unsigned i = 0, e = VL0->getNumOperands(); i < e; ++i) {
      ValueList Operands;
      // Prepare the operand vector.
      for (unsigned j = 0; j < VL.size(); ++j)
        Operands.push_back(cast<Instruction>(VL[j])->getOperand(i));

      buildTree_rec(Operands, Depth + 1);
    }
    return;
  }
  default:
    BS.cancelScheduling(VL);
    newTreeEntry(VL, false);
    DEBUG(dbgs() << "SLP: Gathering unknown instruction.\n");
    return;
  }
}

int BoUpSLP::getEntryCost(TreeEntry *E) {
  ArrayRef<Value *> VL = E->Scalars;

  Type *ScalarTy = VL[0]->getType();
  if (StoreInst *SI = dyn_cast<StoreInst>(VL[0]))
    ScalarTy = SI->getValueOperand()->getType();
  VectorType *VecTy = VectorType::get(ScalarTy, VL.size());

  if (E->NeedToGather) {
    if (allConstant(VL))
      return 0;
    if (isSplat(VL)) {
      return TTI->getShuffleCost(TargetTransformInfo::SK_Broadcast, VecTy, 0);
    }
    return getGatherCost(E->Scalars);
  }
  unsigned Opcode = getSameOpcode(VL);
  assert(Opcode && getSameType(VL) && getSameBlock(VL) && "Invalid VL");
  Instruction *VL0 = cast<Instruction>(VL[0]);
  switch (Opcode) {
  case Instruction::PHI: {
    return 0;
  }
  case Instruction::ExtractElement: {
    if (CanReuseExtract(VL)) {
      int DeadCost = 0;
      for (unsigned i = 0, e = VL.size(); i < e; ++i) {
        ExtractElementInst *E = cast<ExtractElementInst>(VL[i]);
        if (E->hasOneUse())
          // Take credit for instruction that will become dead.
          DeadCost +=
              TTI->getVectorInstrCost(Instruction::ExtractElement, VecTy, i);
      }
      return -DeadCost;
    }
    return getGatherCost(VecTy);
  }
  case Instruction::ZExt:
  case Instruction::SExt:
  case Instruction::FPToUI:
  case Instruction::FPToSI:
  case Instruction::FPExt:
  case Instruction::PtrToInt:
  case Instruction::IntToPtr:
  case Instruction::SIToFP:
  case Instruction::UIToFP:
  case Instruction::Trunc:
  case Instruction::FPTrunc:
  case Instruction::BitCast: {
    Type *SrcTy = VL0->getOperand(0)->getType();

    // Calculate the cost of this instruction.
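    // As in the other cases below, the entry cost is reported as
    // (vector cost) - (sum of scalar costs), so a negative result means the
    // vectorized form of this bundle is expected to be cheaper.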
    int ScalarCost = VL.size() * TTI->getCastInstrCost(VL0->getOpcode(),
                                                       VL0->getType(), SrcTy);

    VectorType *SrcVecTy = VectorType::get(SrcTy, VL.size());
    int VecCost = TTI->getCastInstrCost(VL0->getOpcode(), VecTy, SrcVecTy);
    return VecCost - ScalarCost;
  }
  case Instruction::FCmp:
  case Instruction::ICmp:
  case Instruction::Select:
  case Instruction::Add:
  case Instruction::FAdd:
  case Instruction::Sub:
  case Instruction::FSub:
  case Instruction::Mul:
  case Instruction::FMul:
  case Instruction::UDiv:
  case Instruction::SDiv:
  case Instruction::FDiv:
  case Instruction::URem:
  case Instruction::SRem:
  case Instruction::FRem:
  case Instruction::Shl:
  case Instruction::LShr:
  case Instruction::AShr:
  case Instruction::And:
  case Instruction::Or:
  case Instruction::Xor: {
    // Calculate the cost of this instruction.
    int ScalarCost = 0;
    int VecCost = 0;
    if (Opcode == Instruction::FCmp || Opcode == Instruction::ICmp ||
        Opcode == Instruction::Select) {
      VectorType *MaskTy = VectorType::get(Builder.getInt1Ty(), VL.size());
      ScalarCost = VecTy->getNumElements() *
          TTI->getCmpSelInstrCost(Opcode, ScalarTy, Builder.getInt1Ty());
      VecCost = TTI->getCmpSelInstrCost(Opcode, VecTy, MaskTy);
    } else {
      // Certain instructions can be cheaper to vectorize if they have a
      // constant second vector operand.
      TargetTransformInfo::OperandValueKind Op1VK =
          TargetTransformInfo::OK_AnyValue;
      TargetTransformInfo::OperandValueKind Op2VK =
          TargetTransformInfo::OK_UniformConstantValue;
      TargetTransformInfo::OperandValueProperties Op1VP =
          TargetTransformInfo::OP_None;
      TargetTransformInfo::OperandValueProperties Op2VP =
          TargetTransformInfo::OP_None;

      // If all operands are exactly the same ConstantInt then set the
      // operand kind to OK_UniformConstantValue.
      // If instead not all operands are constants, then set the operand kind
      // to OK_AnyValue. If all operands are constants but not the same,
      // then set the operand kind to OK_NonUniformConstantValue.
      ConstantInt *CInt = nullptr;
      for (unsigned i = 0; i < VL.size(); ++i) {
        const Instruction *I = cast<Instruction>(VL[i]);
        if (!isa<ConstantInt>(I->getOperand(1))) {
          Op2VK = TargetTransformInfo::OK_AnyValue;
          break;
        }
        if (i == 0) {
          CInt = cast<ConstantInt>(I->getOperand(1));
          continue;
        }
        if (Op2VK == TargetTransformInfo::OK_UniformConstantValue &&
            CInt != cast<ConstantInt>(I->getOperand(1)))
          Op2VK = TargetTransformInfo::OK_NonUniformConstantValue;
      }
      // FIXME: Currently cost model modification for division by a power of 2
      // is handled only for X86. Add support for other targets.
      if (Op2VK == TargetTransformInfo::OK_UniformConstantValue && CInt &&
          CInt->getValue().isPowerOf2())
        Op2VP = TargetTransformInfo::OP_PowerOf2;

      ScalarCost = VecTy->getNumElements() *
          TTI->getArithmeticInstrCost(Opcode, ScalarTy, Op1VK, Op2VK,
                                      Op1VP, Op2VP);
      VecCost = TTI->getArithmeticInstrCost(Opcode, VecTy, Op1VK, Op2VK,
                                            Op1VP, Op2VP);
    }
    return VecCost - ScalarCost;
  }
  case Instruction::GetElementPtr: {
    TargetTransformInfo::OperandValueKind Op1VK =
        TargetTransformInfo::OK_AnyValue;
    TargetTransformInfo::OperandValueKind Op2VK =
        TargetTransformInfo::OK_UniformConstantValue;

    int ScalarCost =
        VecTy->getNumElements() *
        TTI->getArithmeticInstrCost(Instruction::Add, ScalarTy, Op1VK, Op2VK);
    int VecCost =
        TTI->getArithmeticInstrCost(Instruction::Add, VecTy, Op1VK, Op2VK);

    return VecCost - ScalarCost;
  }
  case Instruction::Load: {
    // Cost of wide load - cost of scalar loads.
    int ScalarLdCost = VecTy->getNumElements() *
        TTI->getMemoryOpCost(Instruction::Load, ScalarTy, 1, 0);
    int VecLdCost = TTI->getMemoryOpCost(Instruction::Load, VecTy, 1, 0);
    return VecLdCost - ScalarLdCost;
  }
  case Instruction::Store: {
    // We know that we can merge the stores. Calculate the cost.
    int ScalarStCost = VecTy->getNumElements() *
        TTI->getMemoryOpCost(Instruction::Store, ScalarTy, 1, 0);
    int VecStCost = TTI->getMemoryOpCost(Instruction::Store, VecTy, 1, 0);
    return VecStCost - ScalarStCost;
  }
  case Instruction::Call: {
    CallInst *CI = cast<CallInst>(VL0);
    Intrinsic::ID ID = getIntrinsicIDForCall(CI, TLI);

    // Calculate the cost of the scalar and vector calls.
    SmallVector<Type *, 4> ScalarTys, VecTys;
    for (unsigned op = 0, opc = CI->getNumArgOperands(); op != opc; ++op) {
      ScalarTys.push_back(CI->getArgOperand(op)->getType());
      VecTys.push_back(VectorType::get(CI->getArgOperand(op)->getType(),
                                       VecTy->getNumElements()));
    }

    int ScalarCallCost = VecTy->getNumElements() *
        TTI->getIntrinsicInstrCost(ID, ScalarTy, ScalarTys);

    int VecCallCost = TTI->getIntrinsicInstrCost(ID, VecTy, VecTys);

    DEBUG(dbgs() << "SLP: Call cost " << VecCallCost - ScalarCallCost
                 << " (" << VecCallCost << "-" << ScalarCallCost << ")"
                 << " for " << *CI << "\n");

    return VecCallCost - ScalarCallCost;
  }
  case Instruction::ShuffleVector: {
    TargetTransformInfo::OperandValueKind Op1VK =
        TargetTransformInfo::OK_AnyValue;
    TargetTransformInfo::OperandValueKind Op2VK =
        TargetTransformInfo::OK_AnyValue;
    int ScalarCost = 0;
    int VecCost = 0;
    for (unsigned i = 0; i < VL.size(); ++i) {
      Instruction *I = cast<Instruction>(VL[i]);
      if (!I)
        break;
      ScalarCost +=
          TTI->getArithmeticInstrCost(I->getOpcode(), ScalarTy, Op1VK, Op2VK);
    }
    // VecCost is the sum of the cost of creating the two source vectors plus
    // the cost of the shuffle that blends them.
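    // For example, for an <add,sub,add,sub> bundle this is one vector add,
    // one vector sub, and one SK_Alternate shuffle blending the two results.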
1668 Instruction *I0 = cast<Instruction>(VL[0]); 1669 VecCost = 1670 TTI->getArithmeticInstrCost(I0->getOpcode(), VecTy, Op1VK, Op2VK); 1671 Instruction *I1 = cast<Instruction>(VL[1]); 1672 VecCost += 1673 TTI->getArithmeticInstrCost(I1->getOpcode(), VecTy, Op1VK, Op2VK); 1674 VecCost += 1675 TTI->getShuffleCost(TargetTransformInfo::SK_Alternate, VecTy, 0); 1676 return VecCost - ScalarCost; 1677 } 1678 default: 1679 llvm_unreachable("Unknown instruction"); 1680 } 1681 } 1682 1683 bool BoUpSLP::isFullyVectorizableTinyTree() { 1684 DEBUG(dbgs() << "SLP: Check whether the tree with height " << 1685 VectorizableTree.size() << " is fully vectorizable .\n"); 1686 1687 // We only handle trees of height 2. 1688 if (VectorizableTree.size() != 2) 1689 return false; 1690 1691 // Handle splat and all-constants stores. 1692 if (!VectorizableTree[0].NeedToGather && 1693 (allConstant(VectorizableTree[1].Scalars) || 1694 isSplat(VectorizableTree[1].Scalars))) 1695 return true; 1696 1697 // Gathering cost would be too much for tiny trees. 1698 if (VectorizableTree[0].NeedToGather || VectorizableTree[1].NeedToGather) 1699 return false; 1700 1701 return true; 1702 } 1703 1704 int BoUpSLP::getSpillCost() { 1705 // Walk from the bottom of the tree to the top, tracking which values are 1706 // live. When we see a call instruction that is not part of our tree, 1707 // query TTI to see if there is a cost to keeping values live over it 1708 // (for example, if spills and fills are required). 1709 unsigned BundleWidth = VectorizableTree.front().Scalars.size(); 1710 int Cost = 0; 1711 1712 SmallPtrSet<Instruction*, 4> LiveValues; 1713 Instruction *PrevInst = nullptr; 1714 1715 for (unsigned N = 0; N < VectorizableTree.size(); ++N) { 1716 Instruction *Inst = dyn_cast<Instruction>(VectorizableTree[N].Scalars[0]); 1717 if (!Inst) 1718 continue; 1719 1720 if (!PrevInst) { 1721 PrevInst = Inst; 1722 continue; 1723 } 1724 1725 DEBUG( 1726 dbgs() << "SLP: #LV: " << LiveValues.size(); 1727 for (auto *X : LiveValues) 1728 dbgs() << " " << X->getName(); 1729 dbgs() << ", Looking at "; 1730 Inst->dump(); 1731 ); 1732 1733 // Update LiveValues. 1734 LiveValues.erase(PrevInst); 1735 for (auto &J : PrevInst->operands()) { 1736 if (isa<Instruction>(&*J) && ScalarToTreeEntry.count(&*J)) 1737 LiveValues.insert(cast<Instruction>(&*J)); 1738 } 1739 1740 // Now find the sequence of instructions between PrevInst and Inst. 1741 BasicBlock::reverse_iterator InstIt(Inst->getIterator()), 1742 PrevInstIt(PrevInst->getIterator()); 1743 --PrevInstIt; 1744 while (InstIt != PrevInstIt) { 1745 if (PrevInstIt == PrevInst->getParent()->rend()) { 1746 PrevInstIt = Inst->getParent()->rbegin(); 1747 continue; 1748 } 1749 1750 if (isa<CallInst>(&*PrevInstIt) && &*PrevInstIt != PrevInst) { 1751 SmallVector<Type*, 4> V; 1752 for (auto *II : LiveValues) 1753 V.push_back(VectorType::get(II->getType(), BundleWidth)); 1754 Cost += TTI->getCostOfKeepingLiveOverCall(V); 1755 } 1756 1757 ++PrevInstIt; 1758 } 1759 1760 PrevInst = Inst; 1761 } 1762 1763 DEBUG(dbgs() << "SLP: SpillCost=" << Cost << "\n"); 1764 return Cost; 1765 } 1766 1767 int BoUpSLP::getTreeCost() { 1768 int Cost = 0; 1769 DEBUG(dbgs() << "SLP: Calculating cost for tree of size " << 1770 VectorizableTree.size() << ".\n"); 1771 1772 // We only vectorize tiny trees if it is fully vectorizable. 
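// For instance (illustrative), a two-entry tree whose second entry is a
// splat or an all-constant bundle feeding a non-gathered root passes
// isFullyVectorizableTinyTree(), while a tiny tree that would have to gather
// a non-splat, non-constant entry is given the INT_MAX cost below.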
1773 if (VectorizableTree.size() < 3 && !isFullyVectorizableTinyTree()) { 1774 if (VectorizableTree.empty()) { 1775 assert(!ExternalUses.size() && "We should not have any external users"); 1776 } 1777 return INT_MAX; 1778 } 1779 1780 unsigned BundleWidth = VectorizableTree[0].Scalars.size(); 1781 1782 for (TreeEntry &TE : VectorizableTree) { 1783 int C = getEntryCost(&TE); 1784 DEBUG(dbgs() << "SLP: Adding cost " << C << " for bundle that starts with " 1785 << TE.Scalars[0] << " .\n"); 1786 Cost += C; 1787 } 1788 1789 SmallSet<Value *, 16> ExtractCostCalculated; 1790 int ExtractCost = 0; 1791 for (ExternalUser &EU : ExternalUses) { 1792 // We only add extract cost once for the same scalar. 1793 if (!ExtractCostCalculated.insert(EU.Scalar).second) 1794 continue; 1795 1796 // Uses by ephemeral values are free (because the ephemeral value will be 1797 // removed prior to code generation, and so the extraction will be 1798 // removed as well). 1799 if (EphValues.count(EU.User)) 1800 continue; 1801 1802 VectorType *VecTy = VectorType::get(EU.Scalar->getType(), BundleWidth); 1803 ExtractCost += TTI->getVectorInstrCost(Instruction::ExtractElement, VecTy, 1804 EU.Lane); 1805 } 1806 1807 Cost += getSpillCost(); 1808 1809 DEBUG(dbgs() << "SLP: Total Cost " << Cost + ExtractCost<< ".\n"); 1810 return Cost + ExtractCost; 1811 } 1812 1813 int BoUpSLP::getGatherCost(Type *Ty) { 1814 int Cost = 0; 1815 for (unsigned i = 0, e = cast<VectorType>(Ty)->getNumElements(); i < e; ++i) 1816 Cost += TTI->getVectorInstrCost(Instruction::InsertElement, Ty, i); 1817 return Cost; 1818 } 1819 1820 int BoUpSLP::getGatherCost(ArrayRef<Value *> VL) { 1821 // Find the type of the operands in VL. 1822 Type *ScalarTy = VL[0]->getType(); 1823 if (StoreInst *SI = dyn_cast<StoreInst>(VL[0])) 1824 ScalarTy = SI->getValueOperand()->getType(); 1825 VectorType *VecTy = VectorType::get(ScalarTy, VL.size()); 1826 // Find the cost of inserting/extracting values from the vector. 1827 return getGatherCost(VecTy); 1828 } 1829 1830 // Reorder commutative operations in alternate shuffle if the resulting vectors 1831 // are consecutive loads. This would allow us to vectorize the tree. 1832 // If we have something like- 1833 // load a[0] - load b[0] 1834 // load b[1] + load a[1] 1835 // load a[2] - load b[2] 1836 // load a[3] + load b[3] 1837 // Reordering the second load b[1] load a[1] would allow us to vectorize this 1838 // code. 1839 void BoUpSLP::reorderAltShuffleOperands(ArrayRef<Value *> VL, 1840 SmallVectorImpl<Value *> &Left, 1841 SmallVectorImpl<Value *> &Right) { 1842 const DataLayout &DL = F->getParent()->getDataLayout(); 1843 1844 // Push left and right operands of binary operation into Left and Right 1845 for (unsigned i = 0, e = VL.size(); i < e; ++i) { 1846 Left.push_back(cast<Instruction>(VL[i])->getOperand(0)); 1847 Right.push_back(cast<Instruction>(VL[i])->getOperand(1)); 1848 } 1849 1850 // Reorder if we have a commutative operation and consecutive access 1851 // are on either side of the alternate instructions. 
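// E.g. (illustrative), for the bundle in the comment above, Left starts as
// (a[0], b[1], a[2], a[3]) and Right as (b[0], a[1], b[2], b[3]); swapping
// the operands of the commutative second lane yields the consecutive
// sequences (a[0], a[1], a[2], a[3]) and (b[0], b[1], b[2], b[3]).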
1852 for (unsigned j = 0; j < VL.size() - 1; ++j) {
1853 if (LoadInst *L = dyn_cast<LoadInst>(Left[j])) {
1854 if (LoadInst *L1 = dyn_cast<LoadInst>(Right[j + 1])) {
1855 Instruction *VL1 = cast<Instruction>(VL[j]);
1856 Instruction *VL2 = cast<Instruction>(VL[j + 1]);
1857 if (isConsecutiveAccess(L, L1, DL, *SE) && VL1->isCommutative()) {
1858 std::swap(Left[j], Right[j]);
1859 continue;
1860 } else if (isConsecutiveAccess(L, L1, DL, *SE) && VL2->isCommutative()) {
1861 std::swap(Left[j + 1], Right[j + 1]);
1862 continue;
1863 }
1864 // else unchanged
1865 }
1866 }
1867 if (LoadInst *L = dyn_cast<LoadInst>(Right[j])) {
1868 if (LoadInst *L1 = dyn_cast<LoadInst>(Left[j + 1])) {
1869 Instruction *VL1 = cast<Instruction>(VL[j]);
1870 Instruction *VL2 = cast<Instruction>(VL[j + 1]);
1871 if (isConsecutiveAccess(L, L1, DL, *SE) && VL1->isCommutative()) {
1872 std::swap(Left[j], Right[j]);
1873 continue;
1874 } else if (isConsecutiveAccess(L, L1, DL, *SE) && VL2->isCommutative()) {
1875 std::swap(Left[j + 1], Right[j + 1]);
1876 continue;
1877 }
1878 // else unchanged
1879 }
1880 }
1881 }
1882 }
1883
1884 // Return true if I should be commuted before adding its left and right
1885 // operands to the arrays Left and Right.
1886 //
1887 // The vectorizer is trying either to have all elements on one side be
1888 // instructions with the same opcode to enable further vectorization, or to
1889 // have a splat to lower the vectorization cost.
1890 static bool shouldReorderOperands(int i, Instruction &I,
1891 SmallVectorImpl<Value *> &Left,
1892 SmallVectorImpl<Value *> &Right,
1893 bool AllSameOpcodeLeft,
1894 bool AllSameOpcodeRight, bool SplatLeft,
1895 bool SplatRight) {
1896 Value *VLeft = I.getOperand(0);
1897 Value *VRight = I.getOperand(1);
1898 // If we have "SplatRight", try to see if commuting is needed to preserve it.
1899 if (SplatRight) {
1900 if (VRight == Right[i - 1])
1901 // Preserve SplatRight
1902 return false;
1903 if (VLeft == Right[i - 1]) {
1904 // Commuting would preserve SplatRight, but we don't want to break
1905 // SplatLeft either, i.e. preserve the original order if possible.
1906 // (FIXME: why do we care?)
1907 if (SplatLeft && VLeft == Left[i - 1])
1908 return false;
1909 return true;
1910 }
1911 }
1912 // Symmetrically handle SplatLeft.
1913 if (SplatLeft) {
1914 if (VLeft == Left[i - 1])
1915 // Preserve SplatLeft
1916 return false;
1917 if (VRight == Left[i - 1])
1918 return true;
1919 }
1920
1921 Instruction *ILeft = dyn_cast<Instruction>(VLeft);
1922 Instruction *IRight = dyn_cast<Instruction>(VRight);
1923
1924 // If we have "AllSameOpcodeRight", try to see if the left operand preserves
1925 // it and not the right one; in this case we want to commute.
1926 if (AllSameOpcodeRight) {
1927 unsigned RightPrevOpcode = cast<Instruction>(Right[i - 1])->getOpcode();
1928 if (IRight && RightPrevOpcode == IRight->getOpcode())
1929 // Do not commute, a match on the right preserves AllSameOpcodeRight
1930 return false;
1931 if (ILeft && RightPrevOpcode == ILeft->getOpcode()) {
1932 // We have a match and may want to commute, but first check if there is
1933 // not also a match on the existing operands on the Left to preserve
1934 // AllSameOpcodeLeft, i.e. preserve the original order if possible.
1935 // (FIXME: why do we care?)
1936 if (AllSameOpcodeLeft && ILeft &&
1937 cast<Instruction>(Left[i - 1])->getOpcode() == ILeft->getOpcode())
1938 return false;
1939 return true;
1940 }
1941 }
1942 // Symmetrically handle AllSameOpcodeLeft.
1943 if (AllSameOpcodeLeft) {
1944 unsigned LeftPrevOpcode = cast<Instruction>(Left[i - 1])->getOpcode();
1945 if (ILeft && LeftPrevOpcode == ILeft->getOpcode())
1946 return false;
1947 if (IRight && LeftPrevOpcode == IRight->getOpcode())
1948 return true;
1949 }
1950 return false;
1951 }
1952
1953 void BoUpSLP::reorderInputsAccordingToOpcode(ArrayRef<Value *> VL,
1954 SmallVectorImpl<Value *> &Left,
1955 SmallVectorImpl<Value *> &Right) {
1956
1957 if (VL.size()) {
1958 // Peel the first iteration out of the loop since there's nothing
1959 // interesting to do anyway and it simplifies the checks in the loop.
1960 auto VLeft = cast<Instruction>(VL[0])->getOperand(0);
1961 auto VRight = cast<Instruction>(VL[0])->getOperand(1);
1962 if (!isa<Instruction>(VRight) && isa<Instruction>(VLeft))
1963 // Favor having an instruction to the right. FIXME: why?
1964 std::swap(VLeft, VRight);
1965 Left.push_back(VLeft);
1966 Right.push_back(VRight);
1967 }
1968
1969 // Keep track of whether we have instructions with all the same opcode on one side.
1970 bool AllSameOpcodeLeft = isa<Instruction>(Left[0]);
1971 bool AllSameOpcodeRight = isa<Instruction>(Right[0]);
1972 // Keep track of whether we have one side with all the same value (broadcast).
1973 bool SplatLeft = true;
1974 bool SplatRight = true;
1975
1976 for (unsigned i = 1, e = VL.size(); i != e; ++i) {
1977 Instruction *I = cast<Instruction>(VL[i]);
1978 assert(I->isCommutative() && "Can only process commutative instruction");
1979 // Commute to favor either a splat or maximizing having the same opcodes on
1980 // one side.
1981 if (shouldReorderOperands(i, *I, Left, Right, AllSameOpcodeLeft,
1982 AllSameOpcodeRight, SplatLeft, SplatRight)) {
1983 Left.push_back(I->getOperand(1));
1984 Right.push_back(I->getOperand(0));
1985 } else {
1986 Left.push_back(I->getOperand(0));
1987 Right.push_back(I->getOperand(1));
1988 }
1989 // Update Splat* and AllSameOpcode* after the insertion.
1990 SplatRight = SplatRight && (Right[i - 1] == Right[i]);
1991 SplatLeft = SplatLeft && (Left[i - 1] == Left[i]);
1992 AllSameOpcodeLeft = AllSameOpcodeLeft && isa<Instruction>(Left[i]) &&
1993 (cast<Instruction>(Left[i - 1])->getOpcode() ==
1994 cast<Instruction>(Left[i])->getOpcode());
1995 AllSameOpcodeRight = AllSameOpcodeRight && isa<Instruction>(Right[i]) &&
1996 (cast<Instruction>(Right[i - 1])->getOpcode() ==
1997 cast<Instruction>(Right[i])->getOpcode());
1998 }
1999
2000 // If one side ends up being a broadcast, return this operand order.
2001 if (SplatRight || SplatLeft)
2002 return;
2003
2004 const DataLayout &DL = F->getParent()->getDataLayout();
2005
2006 // Finally check if we can get a longer vectorizable chain by reordering
2007 // without breaking the good operand order detected above.
2008 // E.g., if we have something like:
2009 // load a[0] load b[0]
2010 // load b[1] load a[1]
2011 // load a[2] load b[2]
2012 // load a[3] load b[3]
2013 // Reordering the second lane (load b[1], load a[1]) would allow us to
2014 // vectorize this code and we still retain the AllSameOpcode property.
2015 // FIXME: This load reordering might break AllSameOpcode in some rare cases 2016 // such as- 2017 // add a[0],c[0] load b[0] 2018 // add a[1],c[2] load b[1] 2019 // b[2] load b[2] 2020 // add a[3],c[3] load b[3] 2021 for (unsigned j = 0; j < VL.size() - 1; ++j) { 2022 if (LoadInst *L = dyn_cast<LoadInst>(Left[j])) { 2023 if (LoadInst *L1 = dyn_cast<LoadInst>(Right[j + 1])) { 2024 if (isConsecutiveAccess(L, L1, DL, *SE)) { 2025 std::swap(Left[j + 1], Right[j + 1]); 2026 continue; 2027 } 2028 } 2029 } 2030 if (LoadInst *L = dyn_cast<LoadInst>(Right[j])) { 2031 if (LoadInst *L1 = dyn_cast<LoadInst>(Left[j + 1])) { 2032 if (isConsecutiveAccess(L, L1, DL, *SE)) { 2033 std::swap(Left[j + 1], Right[j + 1]); 2034 continue; 2035 } 2036 } 2037 } 2038 // else unchanged 2039 } 2040 } 2041 2042 void BoUpSLP::setInsertPointAfterBundle(ArrayRef<Value *> VL) { 2043 Instruction *VL0 = cast<Instruction>(VL[0]); 2044 BasicBlock::iterator NextInst(VL0); 2045 ++NextInst; 2046 Builder.SetInsertPoint(VL0->getParent(), NextInst); 2047 Builder.SetCurrentDebugLocation(VL0->getDebugLoc()); 2048 } 2049 2050 Value *BoUpSLP::Gather(ArrayRef<Value *> VL, VectorType *Ty) { 2051 Value *Vec = UndefValue::get(Ty); 2052 // Generate the 'InsertElement' instruction. 2053 for (unsigned i = 0; i < Ty->getNumElements(); ++i) { 2054 Vec = Builder.CreateInsertElement(Vec, VL[i], Builder.getInt32(i)); 2055 if (Instruction *Insrt = dyn_cast<Instruction>(Vec)) { 2056 GatherSeq.insert(Insrt); 2057 CSEBlocks.insert(Insrt->getParent()); 2058 2059 // Add to our 'need-to-extract' list. 2060 if (ScalarToTreeEntry.count(VL[i])) { 2061 int Idx = ScalarToTreeEntry[VL[i]]; 2062 TreeEntry *E = &VectorizableTree[Idx]; 2063 // Find which lane we need to extract. 2064 int FoundLane = -1; 2065 for (unsigned Lane = 0, LE = VL.size(); Lane != LE; ++Lane) { 2066 // Is this the lane of the scalar that we are looking for ? 
2067 if (E->Scalars[Lane] == VL[i]) { 2068 FoundLane = Lane; 2069 break; 2070 } 2071 } 2072 assert(FoundLane >= 0 && "Could not find the correct lane"); 2073 ExternalUses.push_back(ExternalUser(VL[i], Insrt, FoundLane)); 2074 } 2075 } 2076 } 2077 2078 return Vec; 2079 } 2080 2081 Value *BoUpSLP::alreadyVectorized(ArrayRef<Value *> VL) const { 2082 SmallDenseMap<Value*, int>::const_iterator Entry 2083 = ScalarToTreeEntry.find(VL[0]); 2084 if (Entry != ScalarToTreeEntry.end()) { 2085 int Idx = Entry->second; 2086 const TreeEntry *En = &VectorizableTree[Idx]; 2087 if (En->isSame(VL) && En->VectorizedValue) 2088 return En->VectorizedValue; 2089 } 2090 return nullptr; 2091 } 2092 2093 Value *BoUpSLP::vectorizeTree(ArrayRef<Value *> VL) { 2094 if (ScalarToTreeEntry.count(VL[0])) { 2095 int Idx = ScalarToTreeEntry[VL[0]]; 2096 TreeEntry *E = &VectorizableTree[Idx]; 2097 if (E->isSame(VL)) 2098 return vectorizeTree(E); 2099 } 2100 2101 Type *ScalarTy = VL[0]->getType(); 2102 if (StoreInst *SI = dyn_cast<StoreInst>(VL[0])) 2103 ScalarTy = SI->getValueOperand()->getType(); 2104 VectorType *VecTy = VectorType::get(ScalarTy, VL.size()); 2105 2106 return Gather(VL, VecTy); 2107 } 2108 2109 Value *BoUpSLP::vectorizeTree(TreeEntry *E) { 2110 IRBuilder<>::InsertPointGuard Guard(Builder); 2111 2112 if (E->VectorizedValue) { 2113 DEBUG(dbgs() << "SLP: Diamond merged for " << *E->Scalars[0] << ".\n"); 2114 return E->VectorizedValue; 2115 } 2116 2117 Instruction *VL0 = cast<Instruction>(E->Scalars[0]); 2118 Type *ScalarTy = VL0->getType(); 2119 if (StoreInst *SI = dyn_cast<StoreInst>(VL0)) 2120 ScalarTy = SI->getValueOperand()->getType(); 2121 VectorType *VecTy = VectorType::get(ScalarTy, E->Scalars.size()); 2122 2123 if (E->NeedToGather) { 2124 setInsertPointAfterBundle(E->Scalars); 2125 return Gather(E->Scalars, VecTy); 2126 } 2127 2128 const DataLayout &DL = F->getParent()->getDataLayout(); 2129 unsigned Opcode = getSameOpcode(E->Scalars); 2130 2131 switch (Opcode) { 2132 case Instruction::PHI: { 2133 PHINode *PH = dyn_cast<PHINode>(VL0); 2134 Builder.SetInsertPoint(PH->getParent()->getFirstNonPHI()); 2135 Builder.SetCurrentDebugLocation(PH->getDebugLoc()); 2136 PHINode *NewPhi = Builder.CreatePHI(VecTy, PH->getNumIncomingValues()); 2137 E->VectorizedValue = NewPhi; 2138 2139 // PHINodes may have multiple entries from the same block. We want to 2140 // visit every block once. 2141 SmallSet<BasicBlock*, 4> VisitedBBs; 2142 2143 for (unsigned i = 0, e = PH->getNumIncomingValues(); i < e; ++i) { 2144 ValueList Operands; 2145 BasicBlock *IBB = PH->getIncomingBlock(i); 2146 2147 if (!VisitedBBs.insert(IBB).second) { 2148 NewPhi->addIncoming(NewPhi->getIncomingValueForBlock(IBB), IBB); 2149 continue; 2150 } 2151 2152 // Prepare the operand vector. 
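// E.g. (illustrative), for the bundle <phi [a0, BB0], [b0, BB1]>,
// <phi [a1, BB0], [b1, BB1]>, the operand vector built here for BB0 is
// (a0, a1) and the one built for BB1 on a later iteration is (b0, b1).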
2153 for (Value *V : E->Scalars) 2154 Operands.push_back(cast<PHINode>(V)->getIncomingValueForBlock(IBB)); 2155 2156 Builder.SetInsertPoint(IBB->getTerminator()); 2157 Builder.SetCurrentDebugLocation(PH->getDebugLoc()); 2158 Value *Vec = vectorizeTree(Operands); 2159 NewPhi->addIncoming(Vec, IBB); 2160 } 2161 2162 assert(NewPhi->getNumIncomingValues() == PH->getNumIncomingValues() && 2163 "Invalid number of incoming values"); 2164 return NewPhi; 2165 } 2166 2167 case Instruction::ExtractElement: { 2168 if (CanReuseExtract(E->Scalars)) { 2169 Value *V = VL0->getOperand(0); 2170 E->VectorizedValue = V; 2171 return V; 2172 } 2173 return Gather(E->Scalars, VecTy); 2174 } 2175 case Instruction::ZExt: 2176 case Instruction::SExt: 2177 case Instruction::FPToUI: 2178 case Instruction::FPToSI: 2179 case Instruction::FPExt: 2180 case Instruction::PtrToInt: 2181 case Instruction::IntToPtr: 2182 case Instruction::SIToFP: 2183 case Instruction::UIToFP: 2184 case Instruction::Trunc: 2185 case Instruction::FPTrunc: 2186 case Instruction::BitCast: { 2187 ValueList INVL; 2188 for (Value *V : E->Scalars) 2189 INVL.push_back(cast<Instruction>(V)->getOperand(0)); 2190 2191 setInsertPointAfterBundle(E->Scalars); 2192 2193 Value *InVec = vectorizeTree(INVL); 2194 2195 if (Value *V = alreadyVectorized(E->Scalars)) 2196 return V; 2197 2198 CastInst *CI = dyn_cast<CastInst>(VL0); 2199 Value *V = Builder.CreateCast(CI->getOpcode(), InVec, VecTy); 2200 E->VectorizedValue = V; 2201 ++NumVectorInstructions; 2202 return V; 2203 } 2204 case Instruction::FCmp: 2205 case Instruction::ICmp: { 2206 ValueList LHSV, RHSV; 2207 for (Value *V : E->Scalars) { 2208 LHSV.push_back(cast<Instruction>(V)->getOperand(0)); 2209 RHSV.push_back(cast<Instruction>(V)->getOperand(1)); 2210 } 2211 2212 setInsertPointAfterBundle(E->Scalars); 2213 2214 Value *L = vectorizeTree(LHSV); 2215 Value *R = vectorizeTree(RHSV); 2216 2217 if (Value *V = alreadyVectorized(E->Scalars)) 2218 return V; 2219 2220 CmpInst::Predicate P0 = cast<CmpInst>(VL0)->getPredicate(); 2221 Value *V; 2222 if (Opcode == Instruction::FCmp) 2223 V = Builder.CreateFCmp(P0, L, R); 2224 else 2225 V = Builder.CreateICmp(P0, L, R); 2226 2227 E->VectorizedValue = V; 2228 ++NumVectorInstructions; 2229 return V; 2230 } 2231 case Instruction::Select: { 2232 ValueList TrueVec, FalseVec, CondVec; 2233 for (Value *V : E->Scalars) { 2234 CondVec.push_back(cast<Instruction>(V)->getOperand(0)); 2235 TrueVec.push_back(cast<Instruction>(V)->getOperand(1)); 2236 FalseVec.push_back(cast<Instruction>(V)->getOperand(2)); 2237 } 2238 2239 setInsertPointAfterBundle(E->Scalars); 2240 2241 Value *Cond = vectorizeTree(CondVec); 2242 Value *True = vectorizeTree(TrueVec); 2243 Value *False = vectorizeTree(FalseVec); 2244 2245 if (Value *V = alreadyVectorized(E->Scalars)) 2246 return V; 2247 2248 Value *V = Builder.CreateSelect(Cond, True, False); 2249 E->VectorizedValue = V; 2250 ++NumVectorInstructions; 2251 return V; 2252 } 2253 case Instruction::Add: 2254 case Instruction::FAdd: 2255 case Instruction::Sub: 2256 case Instruction::FSub: 2257 case Instruction::Mul: 2258 case Instruction::FMul: 2259 case Instruction::UDiv: 2260 case Instruction::SDiv: 2261 case Instruction::FDiv: 2262 case Instruction::URem: 2263 case Instruction::SRem: 2264 case Instruction::FRem: 2265 case Instruction::Shl: 2266 case Instruction::LShr: 2267 case Instruction::AShr: 2268 case Instruction::And: 2269 case Instruction::Or: 2270 case Instruction::Xor: { 2271 ValueList LHSVL, RHSVL; 2272 if (isa<BinaryOperator>(VL0) && 
VL0->isCommutative()) 2273 reorderInputsAccordingToOpcode(E->Scalars, LHSVL, RHSVL); 2274 else 2275 for (Value *V : E->Scalars) { 2276 LHSVL.push_back(cast<Instruction>(V)->getOperand(0)); 2277 RHSVL.push_back(cast<Instruction>(V)->getOperand(1)); 2278 } 2279 2280 setInsertPointAfterBundle(E->Scalars); 2281 2282 Value *LHS = vectorizeTree(LHSVL); 2283 Value *RHS = vectorizeTree(RHSVL); 2284 2285 if (LHS == RHS && isa<Instruction>(LHS)) { 2286 assert((VL0->getOperand(0) == VL0->getOperand(1)) && "Invalid order"); 2287 } 2288 2289 if (Value *V = alreadyVectorized(E->Scalars)) 2290 return V; 2291 2292 BinaryOperator *BinOp = cast<BinaryOperator>(VL0); 2293 Value *V = Builder.CreateBinOp(BinOp->getOpcode(), LHS, RHS); 2294 E->VectorizedValue = V; 2295 propagateIRFlags(E->VectorizedValue, E->Scalars); 2296 ++NumVectorInstructions; 2297 2298 if (Instruction *I = dyn_cast<Instruction>(V)) 2299 return propagateMetadata(I, E->Scalars); 2300 2301 return V; 2302 } 2303 case Instruction::Load: { 2304 // Loads are inserted at the head of the tree because we don't want to 2305 // sink them all the way down past store instructions. 2306 setInsertPointAfterBundle(E->Scalars); 2307 2308 LoadInst *LI = cast<LoadInst>(VL0); 2309 Type *ScalarLoadTy = LI->getType(); 2310 unsigned AS = LI->getPointerAddressSpace(); 2311 2312 Value *VecPtr = Builder.CreateBitCast(LI->getPointerOperand(), 2313 VecTy->getPointerTo(AS)); 2314 2315 // The pointer operand uses an in-tree scalar so we add the new BitCast to 2316 // ExternalUses list to make sure that an extract will be generated in the 2317 // future. 2318 if (ScalarToTreeEntry.count(LI->getPointerOperand())) 2319 ExternalUses.push_back( 2320 ExternalUser(LI->getPointerOperand(), cast<User>(VecPtr), 0)); 2321 2322 unsigned Alignment = LI->getAlignment(); 2323 LI = Builder.CreateLoad(VecPtr); 2324 if (!Alignment) { 2325 Alignment = DL.getABITypeAlignment(ScalarLoadTy); 2326 } 2327 LI->setAlignment(Alignment); 2328 E->VectorizedValue = LI; 2329 ++NumVectorInstructions; 2330 return propagateMetadata(LI, E->Scalars); 2331 } 2332 case Instruction::Store: { 2333 StoreInst *SI = cast<StoreInst>(VL0); 2334 unsigned Alignment = SI->getAlignment(); 2335 unsigned AS = SI->getPointerAddressSpace(); 2336 2337 ValueList ValueOp; 2338 for (Value *V : E->Scalars) 2339 ValueOp.push_back(cast<StoreInst>(V)->getValueOperand()); 2340 2341 setInsertPointAfterBundle(E->Scalars); 2342 2343 Value *VecValue = vectorizeTree(ValueOp); 2344 Value *VecPtr = Builder.CreateBitCast(SI->getPointerOperand(), 2345 VecTy->getPointerTo(AS)); 2346 StoreInst *S = Builder.CreateStore(VecValue, VecPtr); 2347 2348 // The pointer operand uses an in-tree scalar so we add the new BitCast to 2349 // ExternalUses list to make sure that an extract will be generated in the 2350 // future. 
2351 if (ScalarToTreeEntry.count(SI->getPointerOperand())) 2352 ExternalUses.push_back( 2353 ExternalUser(SI->getPointerOperand(), cast<User>(VecPtr), 0)); 2354 2355 if (!Alignment) { 2356 Alignment = DL.getABITypeAlignment(SI->getValueOperand()->getType()); 2357 } 2358 S->setAlignment(Alignment); 2359 E->VectorizedValue = S; 2360 ++NumVectorInstructions; 2361 return propagateMetadata(S, E->Scalars); 2362 } 2363 case Instruction::GetElementPtr: { 2364 setInsertPointAfterBundle(E->Scalars); 2365 2366 ValueList Op0VL; 2367 for (Value *V : E->Scalars) 2368 Op0VL.push_back(cast<GetElementPtrInst>(V)->getOperand(0)); 2369 2370 Value *Op0 = vectorizeTree(Op0VL); 2371 2372 std::vector<Value *> OpVecs; 2373 for (int j = 1, e = cast<GetElementPtrInst>(VL0)->getNumOperands(); j < e; 2374 ++j) { 2375 ValueList OpVL; 2376 for (Value *V : E->Scalars) 2377 OpVL.push_back(cast<GetElementPtrInst>(V)->getOperand(j)); 2378 2379 Value *OpVec = vectorizeTree(OpVL); 2380 OpVecs.push_back(OpVec); 2381 } 2382 2383 Value *V = Builder.CreateGEP( 2384 cast<GetElementPtrInst>(VL0)->getSourceElementType(), Op0, OpVecs); 2385 E->VectorizedValue = V; 2386 ++NumVectorInstructions; 2387 2388 if (Instruction *I = dyn_cast<Instruction>(V)) 2389 return propagateMetadata(I, E->Scalars); 2390 2391 return V; 2392 } 2393 case Instruction::Call: { 2394 CallInst *CI = cast<CallInst>(VL0); 2395 setInsertPointAfterBundle(E->Scalars); 2396 Function *FI; 2397 Intrinsic::ID IID = Intrinsic::not_intrinsic; 2398 Value *ScalarArg = nullptr; 2399 if (CI && (FI = CI->getCalledFunction())) { 2400 IID = FI->getIntrinsicID(); 2401 } 2402 std::vector<Value *> OpVecs; 2403 for (int j = 0, e = CI->getNumArgOperands(); j < e; ++j) { 2404 ValueList OpVL; 2405 // ctlz,cttz and powi are special intrinsics whose second argument is 2406 // a scalar. This argument should not be vectorized. 2407 if (hasVectorInstrinsicScalarOpd(IID, 1) && j == 1) { 2408 CallInst *CEI = cast<CallInst>(E->Scalars[0]); 2409 ScalarArg = CEI->getArgOperand(j); 2410 OpVecs.push_back(CEI->getArgOperand(j)); 2411 continue; 2412 } 2413 for (Value *V : E->Scalars) { 2414 CallInst *CEI = cast<CallInst>(V); 2415 OpVL.push_back(CEI->getArgOperand(j)); 2416 } 2417 2418 Value *OpVec = vectorizeTree(OpVL); 2419 DEBUG(dbgs() << "SLP: OpVec[" << j << "]: " << *OpVec << "\n"); 2420 OpVecs.push_back(OpVec); 2421 } 2422 2423 Module *M = F->getParent(); 2424 Intrinsic::ID ID = getIntrinsicIDForCall(CI, TLI); 2425 Type *Tys[] = { VectorType::get(CI->getType(), E->Scalars.size()) }; 2426 Function *CF = Intrinsic::getDeclaration(M, ID, Tys); 2427 Value *V = Builder.CreateCall(CF, OpVecs); 2428 2429 // The scalar argument uses an in-tree scalar so we add the new vectorized 2430 // call to ExternalUses list to make sure that an extract will be 2431 // generated in the future. 
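// E.g. (illustrative), for a bundle of llvm.powi calls the exponent stays
// scalar in the vectorized call; if that exponent is itself an in-tree
// scalar, recording this use forces an extractelement to be emitted for it.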
2432 if (ScalarArg && ScalarToTreeEntry.count(ScalarArg)) 2433 ExternalUses.push_back(ExternalUser(ScalarArg, cast<User>(V), 0)); 2434 2435 E->VectorizedValue = V; 2436 ++NumVectorInstructions; 2437 return V; 2438 } 2439 case Instruction::ShuffleVector: { 2440 ValueList LHSVL, RHSVL; 2441 assert(isa<BinaryOperator>(VL0) && "Invalid Shuffle Vector Operand"); 2442 reorderAltShuffleOperands(E->Scalars, LHSVL, RHSVL); 2443 setInsertPointAfterBundle(E->Scalars); 2444 2445 Value *LHS = vectorizeTree(LHSVL); 2446 Value *RHS = vectorizeTree(RHSVL); 2447 2448 if (Value *V = alreadyVectorized(E->Scalars)) 2449 return V; 2450 2451 // Create a vector of LHS op1 RHS 2452 BinaryOperator *BinOp0 = cast<BinaryOperator>(VL0); 2453 Value *V0 = Builder.CreateBinOp(BinOp0->getOpcode(), LHS, RHS); 2454 2455 // Create a vector of LHS op2 RHS 2456 Instruction *VL1 = cast<Instruction>(E->Scalars[1]); 2457 BinaryOperator *BinOp1 = cast<BinaryOperator>(VL1); 2458 Value *V1 = Builder.CreateBinOp(BinOp1->getOpcode(), LHS, RHS); 2459 2460 // Create shuffle to take alternate operations from the vector. 2461 // Also, gather up odd and even scalar ops to propagate IR flags to 2462 // each vector operation. 2463 ValueList OddScalars, EvenScalars; 2464 unsigned e = E->Scalars.size(); 2465 SmallVector<Constant *, 8> Mask(e); 2466 for (unsigned i = 0; i < e; ++i) { 2467 if (i & 1) { 2468 Mask[i] = Builder.getInt32(e + i); 2469 OddScalars.push_back(E->Scalars[i]); 2470 } else { 2471 Mask[i] = Builder.getInt32(i); 2472 EvenScalars.push_back(E->Scalars[i]); 2473 } 2474 } 2475 2476 Value *ShuffleMask = ConstantVector::get(Mask); 2477 propagateIRFlags(V0, EvenScalars); 2478 propagateIRFlags(V1, OddScalars); 2479 2480 Value *V = Builder.CreateShuffleVector(V0, V1, ShuffleMask); 2481 E->VectorizedValue = V; 2482 ++NumVectorInstructions; 2483 if (Instruction *I = dyn_cast<Instruction>(V)) 2484 return propagateMetadata(I, E->Scalars); 2485 2486 return V; 2487 } 2488 default: 2489 llvm_unreachable("unknown inst"); 2490 } 2491 return nullptr; 2492 } 2493 2494 Value *BoUpSLP::vectorizeTree() { 2495 2496 // All blocks must be scheduled before any instructions are inserted. 2497 for (auto &BSIter : BlocksSchedules) { 2498 scheduleBlock(BSIter.second.get()); 2499 } 2500 2501 Builder.SetInsertPoint(&F->getEntryBlock().front()); 2502 vectorizeTree(&VectorizableTree[0]); 2503 2504 DEBUG(dbgs() << "SLP: Extracting " << ExternalUses.size() << " values .\n"); 2505 2506 // Extract all of the elements with the external uses. 2507 for (UserList::iterator it = ExternalUses.begin(), e = ExternalUses.end(); 2508 it != e; ++it) { 2509 Value *Scalar = it->Scalar; 2510 llvm::User *User = it->User; 2511 2512 // Skip users that we already RAUW. This happens when one instruction 2513 // has multiple uses of the same value. 2514 if (std::find(Scalar->user_begin(), Scalar->user_end(), User) == 2515 Scalar->user_end()) 2516 continue; 2517 assert(ScalarToTreeEntry.count(Scalar) && "Invalid scalar"); 2518 2519 int Idx = ScalarToTreeEntry[Scalar]; 2520 TreeEntry *E = &VectorizableTree[Idx]; 2521 assert(!E->NeedToGather && "Extracting from a gather list"); 2522 2523 Value *Vec = E->VectorizedValue; 2524 assert(Vec && "Can't find vectorizable value"); 2525 2526 Value *Lane = Builder.getInt32(it->Lane); 2527 // Generate extracts for out-of-tree users. 2528 // Find the insertion point for the extractelement lane. 
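// In short: if the user is a PHI, the extract is placed before the
// terminator of the matching incoming block; for any other instruction user
// it goes right before the user; and if the vectorized value is not an
// instruction at all, the extract is placed in the entry block.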
2529 if (isa<Instruction>(Vec)){ 2530 if (PHINode *PH = dyn_cast<PHINode>(User)) { 2531 for (int i = 0, e = PH->getNumIncomingValues(); i != e; ++i) { 2532 if (PH->getIncomingValue(i) == Scalar) { 2533 Builder.SetInsertPoint(PH->getIncomingBlock(i)->getTerminator()); 2534 Value *Ex = Builder.CreateExtractElement(Vec, Lane); 2535 CSEBlocks.insert(PH->getIncomingBlock(i)); 2536 PH->setOperand(i, Ex); 2537 } 2538 } 2539 } else { 2540 Builder.SetInsertPoint(cast<Instruction>(User)); 2541 Value *Ex = Builder.CreateExtractElement(Vec, Lane); 2542 CSEBlocks.insert(cast<Instruction>(User)->getParent()); 2543 User->replaceUsesOfWith(Scalar, Ex); 2544 } 2545 } else { 2546 Builder.SetInsertPoint(&F->getEntryBlock().front()); 2547 Value *Ex = Builder.CreateExtractElement(Vec, Lane); 2548 CSEBlocks.insert(&F->getEntryBlock()); 2549 User->replaceUsesOfWith(Scalar, Ex); 2550 } 2551 2552 DEBUG(dbgs() << "SLP: Replaced:" << *User << ".\n"); 2553 } 2554 2555 // For each vectorized value: 2556 for (int EIdx = 0, EE = VectorizableTree.size(); EIdx < EE; ++EIdx) { 2557 TreeEntry *Entry = &VectorizableTree[EIdx]; 2558 2559 // For each lane: 2560 for (int Lane = 0, LE = Entry->Scalars.size(); Lane != LE; ++Lane) { 2561 Value *Scalar = Entry->Scalars[Lane]; 2562 // No need to handle users of gathered values. 2563 if (Entry->NeedToGather) 2564 continue; 2565 2566 assert(Entry->VectorizedValue && "Can't find vectorizable value"); 2567 2568 Type *Ty = Scalar->getType(); 2569 if (!Ty->isVoidTy()) { 2570 #ifndef NDEBUG 2571 for (User *U : Scalar->users()) { 2572 DEBUG(dbgs() << "SLP: \tvalidating user:" << *U << ".\n"); 2573 2574 assert((ScalarToTreeEntry.count(U) || 2575 // It is legal to replace users in the ignorelist by undef. 2576 (std::find(UserIgnoreList.begin(), UserIgnoreList.end(), U) != 2577 UserIgnoreList.end())) && 2578 "Replacing out-of-tree value with undef"); 2579 } 2580 #endif 2581 Value *Undef = UndefValue::get(Ty); 2582 Scalar->replaceAllUsesWith(Undef); 2583 } 2584 DEBUG(dbgs() << "SLP: \tErasing scalar:" << *Scalar << ".\n"); 2585 eraseInstruction(cast<Instruction>(Scalar)); 2586 } 2587 } 2588 2589 Builder.ClearInsertionPoint(); 2590 2591 return VectorizableTree[0].VectorizedValue; 2592 } 2593 2594 void BoUpSLP::optimizeGatherSequence() { 2595 DEBUG(dbgs() << "SLP: Optimizing " << GatherSeq.size() 2596 << " gather sequences instructions.\n"); 2597 // LICM InsertElementInst sequences. 2598 for (SetVector<Instruction *>::iterator it = GatherSeq.begin(), 2599 e = GatherSeq.end(); it != e; ++it) { 2600 InsertElementInst *Insert = dyn_cast<InsertElementInst>(*it); 2601 2602 if (!Insert) 2603 continue; 2604 2605 // Check if this block is inside a loop. 2606 Loop *L = LI->getLoopFor(Insert->getParent()); 2607 if (!L) 2608 continue; 2609 2610 // Check if it has a preheader. 2611 BasicBlock *PreHeader = L->getLoopPreheader(); 2612 if (!PreHeader) 2613 continue; 2614 2615 // If the vector or the element that we insert into it are 2616 // instructions that are defined in this basic block then we can't 2617 // hoist this instruction. 2618 Instruction *CurrVec = dyn_cast<Instruction>(Insert->getOperand(0)); 2619 Instruction *NewElem = dyn_cast<Instruction>(Insert->getOperand(1)); 2620 if (CurrVec && L->contains(CurrVec)) 2621 continue; 2622 if (NewElem && L->contains(NewElem)) 2623 continue; 2624 2625 // We can hoist this instruction. Move it to the pre-header. 2626 Insert->moveBefore(PreHeader->getTerminator()); 2627 } 2628 2629 // Make a list of all reachable blocks in our CSE queue. 
2630 SmallVector<const DomTreeNode *, 8> CSEWorkList; 2631 CSEWorkList.reserve(CSEBlocks.size()); 2632 for (BasicBlock *BB : CSEBlocks) 2633 if (DomTreeNode *N = DT->getNode(BB)) { 2634 assert(DT->isReachableFromEntry(N)); 2635 CSEWorkList.push_back(N); 2636 } 2637 2638 // Sort blocks by domination. This ensures we visit a block after all blocks 2639 // dominating it are visited. 2640 std::stable_sort(CSEWorkList.begin(), CSEWorkList.end(), 2641 [this](const DomTreeNode *A, const DomTreeNode *B) { 2642 return DT->properlyDominates(A, B); 2643 }); 2644 2645 // Perform O(N^2) search over the gather sequences and merge identical 2646 // instructions. TODO: We can further optimize this scan if we split the 2647 // instructions into different buckets based on the insert lane. 2648 SmallVector<Instruction *, 16> Visited; 2649 for (auto I = CSEWorkList.begin(), E = CSEWorkList.end(); I != E; ++I) { 2650 assert((I == CSEWorkList.begin() || !DT->dominates(*I, *std::prev(I))) && 2651 "Worklist not sorted properly!"); 2652 BasicBlock *BB = (*I)->getBlock(); 2653 // For all instructions in blocks containing gather sequences: 2654 for (BasicBlock::iterator it = BB->begin(), e = BB->end(); it != e;) { 2655 Instruction *In = &*it++; 2656 if (!isa<InsertElementInst>(In) && !isa<ExtractElementInst>(In)) 2657 continue; 2658 2659 // Check if we can replace this instruction with any of the 2660 // visited instructions. 2661 for (SmallVectorImpl<Instruction *>::iterator v = Visited.begin(), 2662 ve = Visited.end(); 2663 v != ve; ++v) { 2664 if (In->isIdenticalTo(*v) && 2665 DT->dominates((*v)->getParent(), In->getParent())) { 2666 In->replaceAllUsesWith(*v); 2667 eraseInstruction(In); 2668 In = nullptr; 2669 break; 2670 } 2671 } 2672 if (In) { 2673 assert(std::find(Visited.begin(), Visited.end(), In) == Visited.end()); 2674 Visited.push_back(In); 2675 } 2676 } 2677 } 2678 CSEBlocks.clear(); 2679 GatherSeq.clear(); 2680 } 2681 2682 // Groups the instructions to a bundle (which is then a single scheduling entity) 2683 // and schedules instructions until the bundle gets ready. 2684 bool BoUpSLP::BlockScheduling::tryScheduleBundle(ArrayRef<Value *> VL, 2685 BoUpSLP *SLP) { 2686 if (isa<PHINode>(VL[0])) 2687 return true; 2688 2689 // Initialize the instruction bundle. 2690 Instruction *OldScheduleEnd = ScheduleEnd; 2691 ScheduleData *PrevInBundle = nullptr; 2692 ScheduleData *Bundle = nullptr; 2693 bool ReSchedule = false; 2694 DEBUG(dbgs() << "SLP: bundle: " << *VL[0] << "\n"); 2695 2696 // Make sure that the scheduling region contains all 2697 // instructions of the bundle. 2698 for (Value *V : VL) { 2699 if (!extendSchedulingRegion(V)) 2700 return false; 2701 } 2702 2703 for (Value *V : VL) { 2704 ScheduleData *BundleMember = getScheduleData(V); 2705 assert(BundleMember && 2706 "no ScheduleData for bundle member (maybe not in same basic block)"); 2707 if (BundleMember->IsScheduled) { 2708 // A bundle member was scheduled as single instruction before and now 2709 // needs to be scheduled as part of the bundle. We just get rid of the 2710 // existing schedule. 
2711 DEBUG(dbgs() << "SLP: reset schedule because " << *BundleMember 2712 << " was already scheduled\n"); 2713 ReSchedule = true; 2714 } 2715 assert(BundleMember->isSchedulingEntity() && 2716 "bundle member already part of other bundle"); 2717 if (PrevInBundle) { 2718 PrevInBundle->NextInBundle = BundleMember; 2719 } else { 2720 Bundle = BundleMember; 2721 } 2722 BundleMember->UnscheduledDepsInBundle = 0; 2723 Bundle->UnscheduledDepsInBundle += BundleMember->UnscheduledDeps; 2724 2725 // Group the instructions to a bundle. 2726 BundleMember->FirstInBundle = Bundle; 2727 PrevInBundle = BundleMember; 2728 } 2729 if (ScheduleEnd != OldScheduleEnd) { 2730 // The scheduling region got new instructions at the lower end (or it is a 2731 // new region for the first bundle). This makes it necessary to 2732 // recalculate all dependencies. 2733 // It is seldom that this needs to be done a second time after adding the 2734 // initial bundle to the region. 2735 for (auto *I = ScheduleStart; I != ScheduleEnd; I = I->getNextNode()) { 2736 ScheduleData *SD = getScheduleData(I); 2737 SD->clearDependencies(); 2738 } 2739 ReSchedule = true; 2740 } 2741 if (ReSchedule) { 2742 resetSchedule(); 2743 initialFillReadyList(ReadyInsts); 2744 } 2745 2746 DEBUG(dbgs() << "SLP: try schedule bundle " << *Bundle << " in block " 2747 << BB->getName() << "\n"); 2748 2749 calculateDependencies(Bundle, true, SLP); 2750 2751 // Now try to schedule the new bundle. As soon as the bundle is "ready" it 2752 // means that there are no cyclic dependencies and we can schedule it. 2753 // Note that's important that we don't "schedule" the bundle yet (see 2754 // cancelScheduling). 2755 while (!Bundle->isReady() && !ReadyInsts.empty()) { 2756 2757 ScheduleData *pickedSD = ReadyInsts.back(); 2758 ReadyInsts.pop_back(); 2759 2760 if (pickedSD->isSchedulingEntity() && pickedSD->isReady()) { 2761 schedule(pickedSD, ReadyInsts); 2762 } 2763 } 2764 if (!Bundle->isReady()) { 2765 cancelScheduling(VL); 2766 return false; 2767 } 2768 return true; 2769 } 2770 2771 void BoUpSLP::BlockScheduling::cancelScheduling(ArrayRef<Value *> VL) { 2772 if (isa<PHINode>(VL[0])) 2773 return; 2774 2775 ScheduleData *Bundle = getScheduleData(VL[0]); 2776 DEBUG(dbgs() << "SLP: cancel scheduling of " << *Bundle << "\n"); 2777 assert(!Bundle->IsScheduled && 2778 "Can't cancel bundle which is already scheduled"); 2779 assert(Bundle->isSchedulingEntity() && Bundle->isPartOfBundle() && 2780 "tried to unbundle something which is not a bundle"); 2781 2782 // Un-bundle: make single instructions out of the bundle. 2783 ScheduleData *BundleMember = Bundle; 2784 while (BundleMember) { 2785 assert(BundleMember->FirstInBundle == Bundle && "corrupt bundle links"); 2786 BundleMember->FirstInBundle = BundleMember; 2787 ScheduleData *Next = BundleMember->NextInBundle; 2788 BundleMember->NextInBundle = nullptr; 2789 BundleMember->UnscheduledDepsInBundle = BundleMember->UnscheduledDeps; 2790 if (BundleMember->UnscheduledDepsInBundle == 0) { 2791 ReadyInsts.insert(BundleMember); 2792 } 2793 BundleMember = Next; 2794 } 2795 } 2796 2797 bool BoUpSLP::BlockScheduling::extendSchedulingRegion(Value *V) { 2798 if (getScheduleData(V)) 2799 return true; 2800 Instruction *I = dyn_cast<Instruction>(V); 2801 assert(I && "bundle member must be an instruction"); 2802 assert(!isa<PHINode>(I) && "phi nodes don't need to be scheduled"); 2803 if (!ScheduleStart) { 2804 // It's the first instruction in the new region. 
2805 initScheduleData(I, I->getNextNode(), nullptr, nullptr); 2806 ScheduleStart = I; 2807 ScheduleEnd = I->getNextNode(); 2808 assert(ScheduleEnd && "tried to vectorize a TerminatorInst?"); 2809 DEBUG(dbgs() << "SLP: initialize schedule region to " << *I << "\n"); 2810 return true; 2811 } 2812 // Search up and down at the same time, because we don't know if the new 2813 // instruction is above or below the existing scheduling region. 2814 BasicBlock::reverse_iterator UpIter(ScheduleStart->getIterator()); 2815 BasicBlock::reverse_iterator UpperEnd = BB->rend(); 2816 BasicBlock::iterator DownIter(ScheduleEnd); 2817 BasicBlock::iterator LowerEnd = BB->end(); 2818 for (;;) { 2819 if (++ScheduleRegionSize > ScheduleRegionSizeLimit) { 2820 DEBUG(dbgs() << "SLP: exceeded schedule region size limit\n"); 2821 return false; 2822 } 2823 2824 if (UpIter != UpperEnd) { 2825 if (&*UpIter == I) { 2826 initScheduleData(I, ScheduleStart, nullptr, FirstLoadStoreInRegion); 2827 ScheduleStart = I; 2828 DEBUG(dbgs() << "SLP: extend schedule region start to " << *I << "\n"); 2829 return true; 2830 } 2831 UpIter++; 2832 } 2833 if (DownIter != LowerEnd) { 2834 if (&*DownIter == I) { 2835 initScheduleData(ScheduleEnd, I->getNextNode(), LastLoadStoreInRegion, 2836 nullptr); 2837 ScheduleEnd = I->getNextNode(); 2838 assert(ScheduleEnd && "tried to vectorize a TerminatorInst?"); 2839 DEBUG(dbgs() << "SLP: extend schedule region end to " << *I << "\n"); 2840 return true; 2841 } 2842 DownIter++; 2843 } 2844 assert((UpIter != UpperEnd || DownIter != LowerEnd) && 2845 "instruction not found in block"); 2846 } 2847 return true; 2848 } 2849 2850 void BoUpSLP::BlockScheduling::initScheduleData(Instruction *FromI, 2851 Instruction *ToI, 2852 ScheduleData *PrevLoadStore, 2853 ScheduleData *NextLoadStore) { 2854 ScheduleData *CurrentLoadStore = PrevLoadStore; 2855 for (Instruction *I = FromI; I != ToI; I = I->getNextNode()) { 2856 ScheduleData *SD = ScheduleDataMap[I]; 2857 if (!SD) { 2858 // Allocate a new ScheduleData for the instruction. 2859 if (ChunkPos >= ChunkSize) { 2860 ScheduleDataChunks.push_back( 2861 llvm::make_unique<ScheduleData[]>(ChunkSize)); 2862 ChunkPos = 0; 2863 } 2864 SD = &(ScheduleDataChunks.back()[ChunkPos++]); 2865 ScheduleDataMap[I] = SD; 2866 SD->Inst = I; 2867 } 2868 assert(!isInSchedulingRegion(SD) && 2869 "new ScheduleData already in scheduling region"); 2870 SD->init(SchedulingRegionID); 2871 2872 if (I->mayReadOrWriteMemory()) { 2873 // Update the linked list of memory accessing instructions. 
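// This NextLoadStore chain is what calculateDependencies() later walks to
// find the loads and stores that may alias a given memory instruction in the
// region.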
2874 if (CurrentLoadStore) { 2875 CurrentLoadStore->NextLoadStore = SD; 2876 } else { 2877 FirstLoadStoreInRegion = SD; 2878 } 2879 CurrentLoadStore = SD; 2880 } 2881 } 2882 if (NextLoadStore) { 2883 if (CurrentLoadStore) 2884 CurrentLoadStore->NextLoadStore = NextLoadStore; 2885 } else { 2886 LastLoadStoreInRegion = CurrentLoadStore; 2887 } 2888 } 2889 2890 void BoUpSLP::BlockScheduling::calculateDependencies(ScheduleData *SD, 2891 bool InsertInReadyList, 2892 BoUpSLP *SLP) { 2893 assert(SD->isSchedulingEntity()); 2894 2895 SmallVector<ScheduleData *, 10> WorkList; 2896 WorkList.push_back(SD); 2897 2898 while (!WorkList.empty()) { 2899 ScheduleData *SD = WorkList.back(); 2900 WorkList.pop_back(); 2901 2902 ScheduleData *BundleMember = SD; 2903 while (BundleMember) { 2904 assert(isInSchedulingRegion(BundleMember)); 2905 if (!BundleMember->hasValidDependencies()) { 2906 2907 DEBUG(dbgs() << "SLP: update deps of " << *BundleMember << "\n"); 2908 BundleMember->Dependencies = 0; 2909 BundleMember->resetUnscheduledDeps(); 2910 2911 // Handle def-use chain dependencies. 2912 for (User *U : BundleMember->Inst->users()) { 2913 if (isa<Instruction>(U)) { 2914 ScheduleData *UseSD = getScheduleData(U); 2915 if (UseSD && isInSchedulingRegion(UseSD->FirstInBundle)) { 2916 BundleMember->Dependencies++; 2917 ScheduleData *DestBundle = UseSD->FirstInBundle; 2918 if (!DestBundle->IsScheduled) { 2919 BundleMember->incrementUnscheduledDeps(1); 2920 } 2921 if (!DestBundle->hasValidDependencies()) { 2922 WorkList.push_back(DestBundle); 2923 } 2924 } 2925 } else { 2926 // I'm not sure if this can ever happen. But we need to be safe. 2927 // This lets the instruction/bundle never be scheduled and 2928 // eventually disable vectorization. 2929 BundleMember->Dependencies++; 2930 BundleMember->incrementUnscheduledDeps(1); 2931 } 2932 } 2933 2934 // Handle the memory dependencies. 2935 ScheduleData *DepDest = BundleMember->NextLoadStore; 2936 if (DepDest) { 2937 Instruction *SrcInst = BundleMember->Inst; 2938 MemoryLocation SrcLoc = getLocation(SrcInst, SLP->AA); 2939 bool SrcMayWrite = BundleMember->Inst->mayWriteToMemory(); 2940 unsigned numAliased = 0; 2941 unsigned DistToSrc = 1; 2942 2943 while (DepDest) { 2944 assert(isInSchedulingRegion(DepDest)); 2945 2946 // We have two limits to reduce the complexity: 2947 // 1) AliasedCheckLimit: It's a small limit to reduce calls to 2948 // SLP->isAliased (which is the expensive part in this loop). 2949 // 2) MaxMemDepDistance: It's for very large blocks and it aborts 2950 // the whole loop (even if the loop is fast, it's quadratic). 2951 // It's important for the loop break condition (see below) to 2952 // check this limit even between two read-only instructions. 2953 if (DistToSrc >= MaxMemDepDistance || 2954 ((SrcMayWrite || DepDest->Inst->mayWriteToMemory()) && 2955 (numAliased >= AliasedCheckLimit || 2956 SLP->isAliased(SrcLoc, SrcInst, DepDest->Inst)))) { 2957 2958 // We increment the counter only if the locations are aliased 2959 // (instead of counting all alias checks). This gives a better 2960 // balance between reduced runtime and accurate dependencies. 
2961 numAliased++; 2962 2963 DepDest->MemoryDependencies.push_back(BundleMember); 2964 BundleMember->Dependencies++; 2965 ScheduleData *DestBundle = DepDest->FirstInBundle; 2966 if (!DestBundle->IsScheduled) { 2967 BundleMember->incrementUnscheduledDeps(1); 2968 } 2969 if (!DestBundle->hasValidDependencies()) { 2970 WorkList.push_back(DestBundle); 2971 } 2972 } 2973 DepDest = DepDest->NextLoadStore; 2974 2975 // Example, explaining the loop break condition: Let's assume our 2976 // starting instruction is i0 and MaxMemDepDistance = 3. 2977 // 2978 // +--------v--v--v 2979 // i0,i1,i2,i3,i4,i5,i6,i7,i8 2980 // +--------^--^--^ 2981 // 2982 // MaxMemDepDistance let us stop alias-checking at i3 and we add 2983 // dependencies from i0 to i3,i4,.. (even if they are not aliased). 2984 // Previously we already added dependencies from i3 to i6,i7,i8 2985 // (because of MaxMemDepDistance). As we added a dependency from 2986 // i0 to i3, we have transitive dependencies from i0 to i6,i7,i8 2987 // and we can abort this loop at i6. 2988 if (DistToSrc >= 2 * MaxMemDepDistance) 2989 break; 2990 DistToSrc++; 2991 } 2992 } 2993 } 2994 BundleMember = BundleMember->NextInBundle; 2995 } 2996 if (InsertInReadyList && SD->isReady()) { 2997 ReadyInsts.push_back(SD); 2998 DEBUG(dbgs() << "SLP: gets ready on update: " << *SD->Inst << "\n"); 2999 } 3000 } 3001 } 3002 3003 void BoUpSLP::BlockScheduling::resetSchedule() { 3004 assert(ScheduleStart && 3005 "tried to reset schedule on block which has not been scheduled"); 3006 for (Instruction *I = ScheduleStart; I != ScheduleEnd; I = I->getNextNode()) { 3007 ScheduleData *SD = getScheduleData(I); 3008 assert(isInSchedulingRegion(SD)); 3009 SD->IsScheduled = false; 3010 SD->resetUnscheduledDeps(); 3011 } 3012 ReadyInsts.clear(); 3013 } 3014 3015 void BoUpSLP::scheduleBlock(BlockScheduling *BS) { 3016 3017 if (!BS->ScheduleStart) 3018 return; 3019 3020 DEBUG(dbgs() << "SLP: schedule block " << BS->BB->getName() << "\n"); 3021 3022 BS->resetSchedule(); 3023 3024 // For the real scheduling we use a more sophisticated ready-list: it is 3025 // sorted by the original instruction location. This lets the final schedule 3026 // be as close as possible to the original instruction order. 3027 struct ScheduleDataCompare { 3028 bool operator()(ScheduleData *SD1, ScheduleData *SD2) { 3029 return SD2->SchedulingPriority < SD1->SchedulingPriority; 3030 } 3031 }; 3032 std::set<ScheduleData *, ScheduleDataCompare> ReadyInsts; 3033 3034 // Ensure that all dependency data is updated and fill the ready-list with 3035 // initial instructions. 3036 int Idx = 0; 3037 int NumToSchedule = 0; 3038 for (auto *I = BS->ScheduleStart; I != BS->ScheduleEnd; 3039 I = I->getNextNode()) { 3040 ScheduleData *SD = BS->getScheduleData(I); 3041 assert( 3042 SD->isPartOfBundle() == (ScalarToTreeEntry.count(SD->Inst) != 0) && 3043 "scheduler and vectorizer have different opinion on what is a bundle"); 3044 SD->FirstInBundle->SchedulingPriority = Idx++; 3045 if (SD->isSchedulingEntity()) { 3046 BS->calculateDependencies(SD, false, this); 3047 NumToSchedule++; 3048 } 3049 } 3050 BS->initialFillReadyList(ReadyInsts); 3051 3052 Instruction *LastScheduledInst = BS->ScheduleEnd; 3053 3054 // Do the "real" scheduling. 3055 while (!ReadyInsts.empty()) { 3056 ScheduleData *picked = *ReadyInsts.begin(); 3057 ReadyInsts.erase(ReadyInsts.begin()); 3058 3059 // Move the scheduled instruction(s) to their dedicated places, if not 3060 // there yet. 
3061 ScheduleData *BundleMember = picked;
3062 while (BundleMember) {
3063 Instruction *pickedInst = BundleMember->Inst;
3064 if (LastScheduledInst->getNextNode() != pickedInst) {
3065 BS->BB->getInstList().remove(pickedInst);
3066 BS->BB->getInstList().insert(LastScheduledInst->getIterator(),
3067 pickedInst);
3068 }
3069 LastScheduledInst = pickedInst;
3070 BundleMember = BundleMember->NextInBundle;
3071 }
3072
3073 BS->schedule(picked, ReadyInsts);
3074 NumToSchedule--;
3075 }
3076 assert(NumToSchedule == 0 && "could not schedule all instructions");
3077
3078 // Avoid duplicate scheduling of the block.
3079 BS->ScheduleStart = nullptr;
3080 }
3081
3082 unsigned BoUpSLP::getVectorElementSize(Value *V) {
3083 auto &DL = F->getParent()->getDataLayout();
3084
3085 // If V is a store, just return the width of the stored value without
3086 // traversing the expression tree. This is the common case.
3087 if (auto *Store = dyn_cast<StoreInst>(V))
3088 return DL.getTypeSizeInBits(Store->getValueOperand()->getType());
3089
3090 // If V is not a store, we can traverse the expression tree to find loads
3091 // that feed it. The type of the loaded value may indicate a more suitable
3092 // width than V's type. We want to base the vector element size on the width
3093 // of memory operations where possible.
3094 SmallVector<Instruction *, 16> Worklist;
3095 SmallPtrSet<Instruction *, 16> Visited;
3096 if (auto *I = dyn_cast<Instruction>(V))
3097 Worklist.push_back(I);
3098
3099 // Traverse the expression tree in bottom-up order looking for loads. If we
3100 // encounter an instruction we don't yet handle, we give up.
3101 auto MaxWidth = 0u;
3102 auto FoundUnknownInst = false;
3103 while (!Worklist.empty() && !FoundUnknownInst) {
3104 auto *I = Worklist.pop_back_val();
3105 Visited.insert(I);
3106
3107 // We should only be looking at scalar instructions here. If the current
3108 // instruction has a vector type, give up.
3109 auto *Ty = I->getType();
3110 if (isa<VectorType>(Ty))
3111 FoundUnknownInst = true;
3112
3113 // If the current instruction is a load, update MaxWidth to reflect the
3114 // width of the loaded value.
3115 else if (isa<LoadInst>(I))
3116 MaxWidth = std::max(MaxWidth, (unsigned)DL.getTypeSizeInBits(Ty));
3117
3118 // Otherwise, we need to visit the operands of the instruction. We only
3119 // handle the interesting cases from buildTree here. If an operand is an
3120 // instruction we haven't yet visited, we add it to the worklist.
3121 else if (isa<PHINode>(I) || isa<CastInst>(I) || isa<GetElementPtrInst>(I) ||
3122 isa<CmpInst>(I) || isa<SelectInst>(I) || isa<BinaryOperator>(I)) {
3123 for (Use &U : I->operands())
3124 if (auto *J = dyn_cast<Instruction>(U.get()))
3125 if (!Visited.count(J))
3126 Worklist.push_back(J);
3127 }
3128
3129 // If we don't yet handle the instruction, give up.
3130 else
3131 FoundUnknownInst = true;
3132 }
3133
3134 // If we didn't encounter a memory access in the expression tree, or if we
3135 // gave up for some reason, just return the width of V.
3136 if (!MaxWidth || FoundUnknownInst)
3137 return DL.getTypeSizeInBits(V->getType());
3138
3139 // Otherwise, return the maximum width we found.
3140 return MaxWidth;
3141 }
3142
3143 /// The SLPVectorizer Pass.
3144 struct SLPVectorizer : public FunctionPass { 3145 typedef SmallVector<StoreInst *, 8> StoreList; 3146 typedef MapVector<Value *, StoreList> StoreListMap; 3147 typedef SmallVector<WeakVH, 8> WeakVHList; 3148 typedef MapVector<Value *, WeakVHList> WeakVHListMap; 3149 3150 /// Pass identification, replacement for typeid 3151 static char ID; 3152 3153 explicit SLPVectorizer() : FunctionPass(ID) { 3154 initializeSLPVectorizerPass(*PassRegistry::getPassRegistry()); 3155 } 3156 3157 ScalarEvolution *SE; 3158 TargetTransformInfo *TTI; 3159 TargetLibraryInfo *TLI; 3160 AliasAnalysis *AA; 3161 LoopInfo *LI; 3162 DominatorTree *DT; 3163 AssumptionCache *AC; 3164 3165 bool runOnFunction(Function &F) override { 3166 if (skipOptnoneFunction(F)) 3167 return false; 3168 3169 SE = &getAnalysis<ScalarEvolutionWrapperPass>().getSE(); 3170 TTI = &getAnalysis<TargetTransformInfoWrapperPass>().getTTI(F); 3171 auto *TLIP = getAnalysisIfAvailable<TargetLibraryInfoWrapperPass>(); 3172 TLI = TLIP ? &TLIP->getTLI() : nullptr; 3173 AA = &getAnalysis<AAResultsWrapperPass>().getAAResults(); 3174 LI = &getAnalysis<LoopInfoWrapperPass>().getLoopInfo(); 3175 DT = &getAnalysis<DominatorTreeWrapperPass>().getDomTree(); 3176 AC = &getAnalysis<AssumptionCacheTracker>().getAssumptionCache(F); 3177 3178 Stores.clear(); 3179 GEPs.clear(); 3180 bool Changed = false; 3181 3182 // If the target claims to have no vector registers don't attempt 3183 // vectorization. 3184 if (!TTI->getNumberOfRegisters(true)) 3185 return false; 3186 3187 // Use the vector register size specified by the target unless overridden 3188 // by a command-line option. 3189 // TODO: It would be better to limit the vectorization factor based on 3190 // data type rather than just register size. For example, x86 AVX has 3191 // 256-bit registers, but it does not support integer operations 3192 // at that width (that requires AVX2). 3193 if (MaxVectorRegSizeOption.getNumOccurrences()) 3194 MaxVecRegSize = MaxVectorRegSizeOption; 3195 else 3196 MaxVecRegSize = TTI->getRegisterBitWidth(true); 3197 3198 // Don't vectorize when the attribute NoImplicitFloat is used. 3199 if (F.hasFnAttribute(Attribute::NoImplicitFloat)) 3200 return false; 3201 3202 DEBUG(dbgs() << "SLP: Analyzing blocks in " << F.getName() << ".\n"); 3203 3204 // Use the bottom up slp vectorizer to construct chains that start with 3205 // store instructions. 3206 BoUpSLP R(&F, SE, TTI, TLI, AA, LI, DT, AC); 3207 3208 // A general note: the vectorizer must use BoUpSLP::eraseInstruction() to 3209 // delete instructions. 3210 3211 // Scan the blocks in the function in post order. 3212 for (auto BB : post_order(&F.getEntryBlock())) { 3213 collectSeedInstructions(BB); 3214 3215 // Vectorize trees that end at stores. 3216 if (NumStores > 0) { 3217 DEBUG(dbgs() << "SLP: Found " << NumStores << " stores.\n"); 3218 Changed |= vectorizeStoreChains(R); 3219 } 3220 3221 // Vectorize trees that end at reductions. 3222 Changed |= vectorizeChainsInBlock(BB, R); 3223 3224 // Vectorize the index computations of getelementptr instructions. This 3225 // is primarily intended to catch gather-like idioms ending at 3226 // non-consecutive loads. 
3227 if (NumGEPs > 0) {
3228 DEBUG(dbgs() << "SLP: Found " << NumGEPs << " GEPs.\n");
3229 Changed |= vectorizeGEPIndices(BB, R);
3230 }
3231 }
3232
3233 if (Changed) {
3234 R.optimizeGatherSequence();
3235 DEBUG(dbgs() << "SLP: vectorized \"" << F.getName() << "\"\n");
3236 DEBUG(verifyFunction(F));
3237 }
3238 return Changed;
3239 }
3240
3241 void getAnalysisUsage(AnalysisUsage &AU) const override {
3242 FunctionPass::getAnalysisUsage(AU);
3243 AU.addRequired<AssumptionCacheTracker>();
3244 AU.addRequired<ScalarEvolutionWrapperPass>();
3245 AU.addRequired<AAResultsWrapperPass>();
3246 AU.addRequired<TargetTransformInfoWrapperPass>();
3247 AU.addRequired<LoopInfoWrapperPass>();
3248 AU.addRequired<DominatorTreeWrapperPass>();
3249 AU.addPreserved<LoopInfoWrapperPass>();
3250 AU.addPreserved<DominatorTreeWrapperPass>();
3251 AU.addPreserved<AAResultsWrapperPass>();
3252 AU.addPreserved<GlobalsAAWrapperPass>();
3253 AU.setPreservesCFG();
3254 }
3255
3256 private:
3257 /// \brief Collect store and getelementptr instructions and organize them
3258 /// according to the underlying object of their pointer operands. We sort the
3259 /// instructions by their underlying objects to reduce the cost of
3260 /// consecutive access queries.
3261 ///
3262 /// TODO: We can further reduce this cost if we flush the chain creation
3263 /// every time we run into a memory barrier.
3264 void collectSeedInstructions(BasicBlock *BB);
3265
3266 /// \brief Try to vectorize a chain that starts at two arithmetic instrs.
3267 bool tryToVectorizePair(Value *A, Value *B, BoUpSLP &R);
3268
3269 /// \brief Try to vectorize a list of operands.
3270 /// \param BuildVector A list of users to ignore for the purpose of
3271 /// scheduling and that don't need extracting.
3272 /// \returns true if a value was vectorized.
3273 bool tryToVectorizeList(ArrayRef<Value *> VL, BoUpSLP &R,
3274 ArrayRef<Value *> BuildVector = None,
3275 bool allowReorder = false);
3276
3277 /// \brief Try to vectorize a chain that may start at the operands of \p V.
3278 bool tryToVectorize(BinaryOperator *V, BoUpSLP &R);
3279
3280 /// \brief Vectorize the store instructions collected in Stores.
3281 bool vectorizeStoreChains(BoUpSLP &R);
3282
3283 /// \brief Vectorize the index computations of the getelementptr instructions
3284 /// collected in GEPs.
3285 bool vectorizeGEPIndices(BasicBlock *BB, BoUpSLP &R);
3286
3287 /// \brief Scan the basic block and look for patterns that are likely to start
3288 /// a vectorization chain.
3289 bool vectorizeChainsInBlock(BasicBlock *BB, BoUpSLP &R);
3290
3291 bool vectorizeStoreChain(ArrayRef<Value *> Chain, int CostThreshold,
3292 BoUpSLP &R, unsigned VecRegSize);
3293
3294 bool vectorizeStores(ArrayRef<StoreInst *> Stores, int costThreshold,
3295 BoUpSLP &R);
3296
3297 /// The store instructions in a basic block organized by base pointer.
3298 StoreListMap Stores;
3299
3300 /// The getelementptr instructions in a basic block organized by base pointer.
3301 WeakVHListMap GEPs;
3302
3303 /// The number of store instructions in a basic block.
3304 unsigned NumStores;
3305
3306 /// The number of getelementptr instructions in a basic block.
3307 unsigned NumGEPs;
3308
3309 unsigned MaxVecRegSize; // This is set by TTI or overridden by cl::opt.
3310 };
3311
3312 /// \brief Check that the Values in the slice of the VL array still exist in
3313 /// the WeakVH array.
3314 /// Vectorization of part of the VL array may cause later values in the VL array
3315 /// to become invalid.
/// We track when this has happened in the WeakVH array.
static bool hasValueBeenRAUWed(ArrayRef<Value *> VL, ArrayRef<WeakVH> VH,
                               unsigned SliceBegin, unsigned SliceSize) {
  VL = VL.slice(SliceBegin, SliceSize);
  VH = VH.slice(SliceBegin, SliceSize);
  return !std::equal(VL.begin(), VL.end(), VH.begin());
}

bool SLPVectorizer::vectorizeStoreChain(ArrayRef<Value *> Chain,
                                        int CostThreshold, BoUpSLP &R,
                                        unsigned VecRegSize) {
  unsigned ChainLen = Chain.size();
  DEBUG(dbgs() << "SLP: Analyzing a store chain of length " << ChainLen
               << "\n");
  unsigned Sz = R.getVectorElementSize(Chain[0]);
  unsigned VF = VecRegSize / Sz;

  if (!isPowerOf2_32(Sz) || VF < 2)
    return false;

  // Keep track of values that were deleted by vectorizing in the loop below.
  SmallVector<WeakVH, 8> TrackValues(Chain.begin(), Chain.end());

  bool Changed = false;
  // Look for profitable vectorizable trees at all offsets, starting at zero.
  for (unsigned i = 0, e = ChainLen; i < e; ++i) {
    if (i + VF > e)
      break;

    // Check that a previous iteration of this loop did not delete the Value.
    if (hasValueBeenRAUWed(Chain, TrackValues, i, VF))
      continue;

    DEBUG(dbgs() << "SLP: Analyzing " << VF << " stores at offset " << i
                 << "\n");
    ArrayRef<Value *> Operands = Chain.slice(i, VF);

    R.buildTree(Operands);

    int Cost = R.getTreeCost();

    DEBUG(dbgs() << "SLP: Found cost=" << Cost << " for VF=" << VF << "\n");
    if (Cost < CostThreshold) {
      DEBUG(dbgs() << "SLP: Decided to vectorize cost=" << Cost << "\n");
      R.vectorizeTree();

      // Move to the next bundle.
      i += VF - 1;
      Changed = true;
    }
  }

  return Changed;
}

bool SLPVectorizer::vectorizeStores(ArrayRef<StoreInst *> Stores,
                                    int costThreshold, BoUpSLP &R) {
  SetVector<StoreInst *> Heads, Tails;
  SmallDenseMap<StoreInst *, StoreInst *> ConsecutiveChain;

  // We may run into multiple chains that merge into a single chain. We mark
  // the stores that we vectorized so that we don't visit the same store twice.
  BoUpSLP::ValueSet VectorizedStores;
  bool Changed = false;

  // Do a quadratic search on all of the given stores and find
  // all of the pairs of stores that follow each other.
  SmallVector<unsigned, 16> IndexQueue;
  for (unsigned i = 0, e = Stores.size(); i < e; ++i) {
    const DataLayout &DL = Stores[i]->getModule()->getDataLayout();
    IndexQueue.clear();
    // If a store has multiple consecutive store candidates, search the Stores
    // array according to the sequence: from i+1 to e, then from i-1 to 0.
    // This is because pairing with an immediately succeeding or preceding
    // candidate usually creates the best chance to find an SLP vectorization
    // opportunity.
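    // As a worked example, for i = 2 and e = 5 the loops below enqueue the
    // indices 3, 4, 1, 0 in that order.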
    unsigned j = 0;
    for (j = i + 1; j < e; ++j)
      IndexQueue.push_back(j);
    for (j = i; j > 0; --j)
      IndexQueue.push_back(j - 1);

    for (auto &k : IndexQueue) {
      if (isConsecutiveAccess(Stores[i], Stores[k], DL, *SE)) {
        Tails.insert(Stores[k]);
        Heads.insert(Stores[i]);
        ConsecutiveChain[Stores[i]] = Stores[k];
        break;
      }
    }
  }

  // For stores that start but don't end a link in the chain:
  for (SetVector<StoreInst *>::iterator it = Heads.begin(), e = Heads.end();
       it != e; ++it) {
    if (Tails.count(*it))
      continue;

    // We found a store instruction that starts a chain. Now follow the chain
    // and try to vectorize it.
    BoUpSLP::ValueList Operands;
    StoreInst *I = *it;
    // Collect the chain into a list.
    while (Tails.count(I) || Heads.count(I)) {
      if (VectorizedStores.count(I))
        break;
      Operands.push_back(I);
      // Move to the next value in the chain.
      I = ConsecutiveChain[I];
    }

    // FIXME: Is division-by-2 the correct step? Should we assert that the
    // register size is a power-of-2?
    for (unsigned Size = MaxVecRegSize; Size >= MinVecRegSize; Size /= 2) {
      if (vectorizeStoreChain(Operands, costThreshold, R, Size)) {
        // Mark the vectorized stores so that we don't vectorize them again.
        VectorizedStores.insert(Operands.begin(), Operands.end());
        Changed = true;
        break;
      }
    }
  }

  return Changed;
}

void SLPVectorizer::collectSeedInstructions(BasicBlock *BB) {

  // Initialize the collections. We will make a single pass over the block.
  Stores.clear();
  GEPs.clear();
  NumStores = NumGEPs = 0;
  const DataLayout &DL = BB->getModule()->getDataLayout();

  // Visit the store and getelementptr instructions in BB and organize them in
  // Stores and GEPs according to the underlying objects of their pointer
  // operands.
  for (Instruction &I : *BB) {

    // Ignore store instructions that are volatile or have a pointer operand
    // that doesn't point to a scalar type.
    if (auto *SI = dyn_cast<StoreInst>(&I)) {
      if (!SI->isSimple())
        continue;
      if (!isValidElementType(SI->getValueOperand()->getType()))
        continue;
      Stores[GetUnderlyingObject(SI->getPointerOperand(), DL)].push_back(SI);
      ++NumStores;
    }

    // Ignore getelementptr instructions that have more than one index, a
    // constant index, or a pointer operand that doesn't point to a scalar
    // type.
    else if (auto *GEP = dyn_cast<GetElementPtrInst>(&I)) {
      auto Idx = GEP->idx_begin()->get();
      if (GEP->getNumIndices() > 1 || isa<Constant>(Idx))
        continue;
      if (!isValidElementType(Idx->getType()))
        continue;
      GEPs[GetUnderlyingObject(GEP->getPointerOperand(), DL)].push_back(GEP);
      ++NumGEPs;
    }
  }
}

bool SLPVectorizer::tryToVectorizePair(Value *A, Value *B, BoUpSLP &R) {
  if (!A || !B)
    return false;
  Value *VL[] = { A, B };
  return tryToVectorizeList(VL, R, None, true);
}

bool SLPVectorizer::tryToVectorizeList(ArrayRef<Value *> VL, BoUpSLP &R,
                                       ArrayRef<Value *> BuildVector,
                                       bool allowReorder) {
  if (VL.size() < 2)
    return false;

  DEBUG(dbgs() << "SLP: Vectorizing a list of length = " << VL.size()
               << ".\n");

  // Check that all of the parts are scalar instructions of the same type.
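  // For example, a list of four scalar 'add i32' instructions can form a
  // single <4 x i32> add; mixing opcodes (say, add and mul) is rejected below.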
  Instruction *I0 = dyn_cast<Instruction>(VL[0]);
  if (!I0)
    return false;

  unsigned Opcode0 = I0->getOpcode();

  // FIXME: Register size should be a parameter to this function, so we can
  // try different vectorization factors.
  unsigned Sz = R.getVectorElementSize(I0);
  unsigned VF = MinVecRegSize / Sz;

  for (Value *V : VL) {
    Type *Ty = V->getType();
    if (!isValidElementType(Ty))
      return false;
    Instruction *Inst = dyn_cast<Instruction>(V);
    if (!Inst || Inst->getOpcode() != Opcode0)
      return false;
  }

  bool Changed = false;

  // Keep track of values that were deleted by vectorizing in the loop below.
  SmallVector<WeakVH, 8> TrackValues(VL.begin(), VL.end());

  for (unsigned i = 0, e = VL.size(); i < e; ++i) {
    unsigned OpsWidth = 0;

    if (i + VF > e)
      OpsWidth = e - i;
    else
      OpsWidth = VF;

    if (!isPowerOf2_32(OpsWidth) || OpsWidth < 2)
      break;

    // Check that a previous iteration of this loop did not delete the Value.
    if (hasValueBeenRAUWed(VL, TrackValues, i, OpsWidth))
      continue;

    DEBUG(dbgs() << "SLP: Analyzing " << OpsWidth << " operations "
                 << "\n");
    ArrayRef<Value *> Ops = VL.slice(i, OpsWidth);

    ArrayRef<Value *> BuildVectorSlice;
    if (!BuildVector.empty())
      BuildVectorSlice = BuildVector.slice(i, OpsWidth);

    R.buildTree(Ops, BuildVectorSlice);
    // TODO: check if we can allow reordering for cases other than
    // tryToVectorizePair().
    if (allowReorder && R.shouldReorder()) {
      assert(Ops.size() == 2);
      assert(BuildVectorSlice.empty());
      Value *ReorderedOps[] = { Ops[1], Ops[0] };
      R.buildTree(ReorderedOps, None);
    }
    int Cost = R.getTreeCost();

    if (Cost < -SLPCostThreshold) {
      DEBUG(dbgs() << "SLP: Vectorizing list at cost:" << Cost << ".\n");
      Value *VectorizedRoot = R.vectorizeTree();

      // Reconstruct the build vector by extracting the vectorized root. This
      // way we handle the case where some elements of the vector are
      // undefined, e.g.:
      //   (return (insertelement <4 x i32>
      //             (insertelement undef, %opd0, 0), %opd1, 2))
      if (!BuildVectorSlice.empty()) {
        // The insert point is the last build vector instruction. The
        // vectorized root will precede it. This guarantees that we get an
        // instruction. The vectorized tree could have been constant folded.
        Instruction *InsertAfter = cast<Instruction>(BuildVectorSlice.back());
        unsigned VecIdx = 0;
        for (auto &V : BuildVectorSlice) {
          IRBuilder<true, NoFolder> Builder(
              InsertAfter->getParent(), ++BasicBlock::iterator(InsertAfter));
          InsertElementInst *IE = cast<InsertElementInst>(V);
          Instruction *Extract = cast<Instruction>(Builder.CreateExtractElement(
              VectorizedRoot, Builder.getInt32(VecIdx++)));
          IE->setOperand(1, Extract);
          IE->removeFromParent();
          IE->insertAfter(Extract);
          InsertAfter = IE;
        }
      }
      // Move to the next bundle.
      i += VF - 1;
      Changed = true;
    }
  }

  return Changed;
}

bool SLPVectorizer::tryToVectorize(BinaryOperator *V, BoUpSLP &R) {
  if (!V)
    return false;

  // Try to vectorize V.
  if (tryToVectorizePair(V->getOperand(0), V->getOperand(1), R))
    return true;

  BinaryOperator *A = dyn_cast<BinaryOperator>(V->getOperand(0));
  BinaryOperator *B = dyn_cast<BinaryOperator>(V->getOperand(1));
  // Try to skip B.
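  // That is, for V = A op (B0 op B1) with B used only here, also try the
  // pairs (A, B0) and (A, B1).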
  if (B && B->hasOneUse()) {
    BinaryOperator *B0 = dyn_cast<BinaryOperator>(B->getOperand(0));
    BinaryOperator *B1 = dyn_cast<BinaryOperator>(B->getOperand(1));
    if (tryToVectorizePair(A, B0, R)) {
      return true;
    }
    if (tryToVectorizePair(A, B1, R)) {
      return true;
    }
  }

  // Try to skip A.
  if (A && A->hasOneUse()) {
    BinaryOperator *A0 = dyn_cast<BinaryOperator>(A->getOperand(0));
    BinaryOperator *A1 = dyn_cast<BinaryOperator>(A->getOperand(1));
    if (tryToVectorizePair(A0, B, R)) {
      return true;
    }
    if (tryToVectorizePair(A1, B, R)) {
      return true;
    }
  }
  return false;
}

/// \brief Generate a shuffle mask to be used in a reduction tree.
///
/// \param VecLen The length of the vector to be reduced.
/// \param NumEltsToRdx The number of elements that should be reduced in the
///        vector.
/// \param IsPairwise Whether the reduction is a pairwise or splitting
///        reduction. A pairwise reduction will generate a mask of
///        <0,2,...> or <1,3,..> while a splitting reduction will generate
///        <2,3, undef,undef> for a vector of 4 and NumElts = 2.
/// \param IsLeft True will generate a mask of even elements, odd otherwise.
static Value *createRdxShuffleMask(unsigned VecLen, unsigned NumEltsToRdx,
                                   bool IsPairwise, bool IsLeft,
                                   IRBuilder<> &Builder) {
  assert((IsPairwise || !IsLeft) && "Don't support a <0,1,undef,...> mask");

  SmallVector<Constant *, 32> ShuffleMask(
      VecLen, UndefValue::get(Builder.getInt32Ty()));

  if (IsPairwise)
    // Build a mask of 0, 2, ... (left) or 1, 3, ... (right).
    for (unsigned i = 0; i != NumEltsToRdx; ++i)
      ShuffleMask[i] = Builder.getInt32(2 * i + !IsLeft);
  else
    // Move the upper half of the vector to the lower half.
    for (unsigned i = 0; i != NumEltsToRdx; ++i)
      ShuffleMask[i] = Builder.getInt32(NumEltsToRdx + i);

  return ConstantVector::get(ShuffleMask);
}

/// Model horizontal reductions.
///
/// A horizontal reduction is a tree of reduction operations (currently add and
/// fadd) that has operations that can be put into a vector as its leaves.
/// For example, this tree:
///
/// mul mul mul mul
///  \  /    \  /
///   +       +
///    \     /
///       +
/// This tree has "mul" as its reduced values and "+" as its reduction
/// operations. A reduction might be feeding into a store or a binary operation
/// feeding a phi.
///    ...
///    \  /
///     +
///     |
///  phi +=
///
/// Or:
///    ...
///    \  /
///     +
///     |
///  *p =
///
class HorizontalReduction {
  SmallVector<Value *, 16> ReductionOps;
  SmallVector<Value *, 32> ReducedVals;

  BinaryOperator *ReductionRoot;
  PHINode *ReductionPHI;

  /// The opcode of the reduction.
  unsigned ReductionOpcode;
  /// The opcode of the values we perform a reduction on.
  unsigned ReducedValueOpcode;
  /// Should we model this reduction as a pairwise reduction tree or a tree
  /// that splits the vector in halves and adds those halves.
  bool IsPairwiseReduction;

public:
  /// The width of one full horizontal reduction operation.
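  /// For example, with the fixed 128-bit MinVecRegSize,
  /// matchAssociativeReduction() starts with a width of 4 for i32 or float
  /// elements (128 / 32); the caller may later widen it (see
  /// canMatchHorizontalReduction()).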
  unsigned ReduxWidth;

  HorizontalReduction()
      : ReductionRoot(nullptr), ReductionPHI(nullptr), ReductionOpcode(0),
        ReducedValueOpcode(0), IsPairwiseReduction(false), ReduxWidth(0) {}

  /// \brief Try to find a reduction tree.
  bool matchAssociativeReduction(PHINode *Phi, BinaryOperator *B) {
    assert((!Phi ||
            std::find(Phi->op_begin(), Phi->op_end(), B) != Phi->op_end()) &&
           "The phi needs to use the binary operator");

    // We could have an initial reduction that is not an add.
    //   r *= v1 + v2 + v3 + v4
    // In such a case start looking for a tree rooted in the first '+'.
    if (Phi) {
      if (B->getOperand(0) == Phi) {
        Phi = nullptr;
        B = dyn_cast<BinaryOperator>(B->getOperand(1));
      } else if (B->getOperand(1) == Phi) {
        Phi = nullptr;
        B = dyn_cast<BinaryOperator>(B->getOperand(0));
      }
    }

    if (!B)
      return false;

    Type *Ty = B->getType();
    if (!isValidElementType(Ty))
      return false;

    const DataLayout &DL = B->getModule()->getDataLayout();
    ReductionOpcode = B->getOpcode();
    ReducedValueOpcode = 0;
    // FIXME: Register size should be a parameter to this function, so we can
    // try different vectorization factors.
    ReduxWidth = MinVecRegSize / DL.getTypeSizeInBits(Ty);
    ReductionRoot = B;
    ReductionPHI = Phi;

    if (ReduxWidth < 4)
      return false;

    // We currently only support adds.
    if (ReductionOpcode != Instruction::Add &&
        ReductionOpcode != Instruction::FAdd)
      return false;

    // Post-order traverse the reduction tree starting at B. We only handle
    // true trees containing only binary operators or selects.
    SmallVector<std::pair<Instruction *, unsigned>, 32> Stack;
    Stack.push_back(std::make_pair(B, 0));
    while (!Stack.empty()) {
      Instruction *TreeN = Stack.back().first;
      unsigned EdgeToVisit = Stack.back().second++;
      bool IsReducedValue = TreeN->getOpcode() != ReductionOpcode;

      // Only handle trees in the current basic block.
      if (TreeN->getParent() != B->getParent())
        return false;

      // Each tree node needs to have one user except for the ultimate
      // reduction.
      if (!TreeN->hasOneUse() && TreeN != B)
        return false;

      // Postorder visit.
      if (EdgeToVisit == 2 || IsReducedValue) {
        if (IsReducedValue) {
          // Make sure that the opcodes of the operations that we are going to
          // reduce match.
          if (!ReducedValueOpcode)
            ReducedValueOpcode = TreeN->getOpcode();
          else if (ReducedValueOpcode != TreeN->getOpcode())
            return false;
          ReducedVals.push_back(TreeN);
        } else {
          // We need to be able to reassociate the adds.
          if (!TreeN->isAssociative())
            return false;
          ReductionOps.push_back(TreeN);
        }
        // Retract.
        Stack.pop_back();
        continue;
      }

      // Visit the left or the right operand.
      Value *NextV = TreeN->getOperand(EdgeToVisit);
      // We currently only allow BinaryOperator's and SelectInst's as reduction
      // values in our tree.
      if (isa<BinaryOperator>(NextV) || isa<SelectInst>(NextV))
        Stack.push_back(std::make_pair(cast<Instruction>(NextV), 0));
      else if (NextV != Phi)
        return false;
    }
    return true;
  }

  /// \brief Attempt to vectorize the tree found by
  /// matchAssociativeReduction.
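  ///
  /// As an illustrative sketch only (hypothetical IR, width 4), a matched
  /// scalar chain such as
  ///
  ///   %r1 = fadd fast float %x0, %x1
  ///   %r2 = fadd fast float %r1, %x2
  ///   %r3 = fadd fast float %r2, %x3
  ///
  /// has %x0..%x3 as its reduced values; they are handed to buildTree() as a
  /// bundle and the result is reduced by emitReduction().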
  bool tryToReduce(BoUpSLP &V, TargetTransformInfo *TTI) {
    if (ReducedVals.empty())
      return false;

    unsigned NumReducedVals = ReducedVals.size();
    if (NumReducedVals < ReduxWidth)
      return false;

    Value *VectorizedTree = nullptr;
    IRBuilder<> Builder(ReductionRoot);
    FastMathFlags Unsafe;
    Unsafe.setUnsafeAlgebra();
    Builder.setFastMathFlags(Unsafe);
    unsigned i = 0;

    for (; i < NumReducedVals - ReduxWidth + 1; i += ReduxWidth) {
      V.buildTree(makeArrayRef(&ReducedVals[i], ReduxWidth), ReductionOps);

      // Estimate the cost.
      int Cost = V.getTreeCost() + getReductionCost(TTI, ReducedVals[i]);
      if (Cost >= -SLPCostThreshold)
        break;

      DEBUG(dbgs() << "SLP: Vectorizing horizontal reduction at cost:" << Cost
                   << ". (HorRdx)\n");

      // Vectorize a tree.
      DebugLoc Loc = cast<Instruction>(ReducedVals[i])->getDebugLoc();
      Value *VectorizedRoot = V.vectorizeTree();

      // Emit a reduction.
      Value *ReducedSubTree = emitReduction(VectorizedRoot, Builder);
      if (VectorizedTree) {
        Builder.SetCurrentDebugLocation(Loc);
        VectorizedTree = createBinOp(Builder, ReductionOpcode, VectorizedTree,
                                     ReducedSubTree, "bin.rdx");
      } else
        VectorizedTree = ReducedSubTree;
    }

    if (VectorizedTree) {
      // Finish the reduction by adding the remaining scalar values.
      for (; i < NumReducedVals; ++i) {
        Builder.SetCurrentDebugLocation(
            cast<Instruction>(ReducedVals[i])->getDebugLoc());
        VectorizedTree = createBinOp(Builder, ReductionOpcode, VectorizedTree,
                                     ReducedVals[i]);
      }
      // Update users.
      if (ReductionPHI) {
        assert(ReductionRoot && "Need a reduction operation");
        ReductionRoot->setOperand(0, VectorizedTree);
        ReductionRoot->setOperand(1, ReductionPHI);
      } else
        ReductionRoot->replaceAllUsesWith(VectorizedTree);
    }
    return VectorizedTree != nullptr;
  }

  unsigned numReductionValues() const {
    return ReducedVals.size();
  }

private:
  /// \brief Calculate the cost of a reduction.
  int getReductionCost(TargetTransformInfo *TTI, Value *FirstReducedVal) {
    Type *ScalarTy = FirstReducedVal->getType();
    Type *VecTy = VectorType::get(ScalarTy, ReduxWidth);

    int PairwiseRdxCost = TTI->getReductionCost(ReductionOpcode, VecTy, true);
    int SplittingRdxCost = TTI->getReductionCost(ReductionOpcode, VecTy, false);

    IsPairwiseReduction = PairwiseRdxCost < SplittingRdxCost;
    int VecReduxCost = IsPairwiseReduction ? PairwiseRdxCost : SplittingRdxCost;

    int ScalarReduxCost =
        ReduxWidth * TTI->getArithmeticInstrCost(ReductionOpcode, VecTy);

    DEBUG(dbgs() << "SLP: Adding cost " << VecReduxCost - ScalarReduxCost
                 << " for reduction that starts with " << *FirstReducedVal
                 << " (It is a "
                 << (IsPairwiseReduction ? "pairwise" : "splitting")
                 << " reduction)\n");

    return VecReduxCost - ScalarReduxCost;
  }

  static Value *createBinOp(IRBuilder<> &Builder, unsigned Opcode, Value *L,
                            Value *R, const Twine &Name = "") {
    if (Opcode == Instruction::FAdd)
      return Builder.CreateFAdd(L, R, Name);
    return Builder.CreateBinOp((Instruction::BinaryOps)Opcode, L, R, Name);
  }

  /// \brief Emit a horizontal reduction of the vectorized value.
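  ///
  /// As an illustrative sketch (hypothetical IR), a splitting reduction of a
  /// <4 x i32> value %vec produces roughly:
  ///
  ///   %rdx.shuf   = shufflevector <4 x i32> %vec, <4 x i32> undef,
  ///                   <4 x i32> <i32 2, i32 3, i32 undef, i32 undef>
  ///   %bin.rdx    = add <4 x i32> %vec, %rdx.shuf
  ///   %rdx.shuf.1 = shufflevector <4 x i32> %bin.rdx, <4 x i32> undef,
  ///                   <4 x i32> <i32 1, i32 undef, i32 undef, i32 undef>
  ///   %bin.rdx.1  = add <4 x i32> %bin.rdx, %rdx.shuf.1
  ///   %res        = extractelement <4 x i32> %bin.rdx.1, i32 0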
  Value *emitReduction(Value *VectorizedValue, IRBuilder<> &Builder) {
    assert(VectorizedValue && "Need to have a vectorized tree node");
    assert(isPowerOf2_32(ReduxWidth) &&
           "We only handle power-of-two reductions for now");

    Value *TmpVec = VectorizedValue;
    for (unsigned i = ReduxWidth / 2; i != 0; i >>= 1) {
      if (IsPairwiseReduction) {
        Value *LeftMask =
            createRdxShuffleMask(ReduxWidth, i, true, true, Builder);
        Value *RightMask =
            createRdxShuffleMask(ReduxWidth, i, true, false, Builder);

        Value *LeftShuf = Builder.CreateShuffleVector(
            TmpVec, UndefValue::get(TmpVec->getType()), LeftMask, "rdx.shuf.l");
        Value *RightShuf = Builder.CreateShuffleVector(
            TmpVec, UndefValue::get(TmpVec->getType()), RightMask,
            "rdx.shuf.r");
        TmpVec = createBinOp(Builder, ReductionOpcode, LeftShuf, RightShuf,
                             "bin.rdx");
      } else {
        Value *UpperHalf =
            createRdxShuffleMask(ReduxWidth, i, false, false, Builder);
        Value *Shuf = Builder.CreateShuffleVector(
            TmpVec, UndefValue::get(TmpVec->getType()), UpperHalf, "rdx.shuf");
        TmpVec = createBinOp(Builder, ReductionOpcode, TmpVec, Shuf, "bin.rdx");
      }
    }

    // The result is in the first element of the vector.
    return Builder.CreateExtractElement(TmpVec, Builder.getInt32(0));
  }
};

/// \brief Recognize the construction of vectors like
///   %ra = insertelement <4 x float> undef, float %s0, i32 0
///   %rb = insertelement <4 x float> %ra, float %s1, i32 1
///   %rc = insertelement <4 x float> %rb, float %s2, i32 2
///   %rd = insertelement <4 x float> %rc, float %s3, i32 3
///
/// \returns true if it matches.
static bool findBuildVector(InsertElementInst *FirstInsertElem,
                            SmallVectorImpl<Value *> &BuildVector,
                            SmallVectorImpl<Value *> &BuildVectorOpds) {
  if (!isa<UndefValue>(FirstInsertElem->getOperand(0)))
    return false;

  InsertElementInst *IE = FirstInsertElem;
  while (true) {
    BuildVector.push_back(IE);
    BuildVectorOpds.push_back(IE->getOperand(1));

    if (IE->use_empty())
      return false;

    InsertElementInst *NextUse = dyn_cast<InsertElementInst>(IE->user_back());
    if (!NextUse)
      return true;

    // If this isn't the final use, make sure the next insertelement is the
    // only use. It's OK if the final constructed vector is used multiple
    // times.
    if (!IE->hasOneUse())
      return false;

    IE = NextUse;
  }

  return false;
}

static bool PhiTypeSorterFunc(Value *V, Value *V2) {
  return V->getType() < V2->getType();
}

/// \brief Try and get a reduction value from a phi node.
///
/// Given a phi node \p P in a block \p ParentBB, consider possible reductions
/// if they come from either \p ParentBB or a containing loop latch.
///
/// \returns A candidate reduction value if possible, or \code nullptr \endcode
/// if not possible.
static Value *getReductionValue(const DominatorTree *DT, PHINode *P,
                                BasicBlock *ParentBB, LoopInfo *LI) {
  // There are situations where the reduction value is not dominated by the
  // reduction phi. Vectorizing such cases has been reported to cause
  // miscompiles. See PR25787.
  auto DominatedReduxValue = [&](Value *R) {
    return (
        dyn_cast<Instruction>(R) &&
        DT->dominates(P->getParent(), dyn_cast<Instruction>(R)->getParent()));
  };

  Value *Rdx = nullptr;

  // Return the incoming value if it comes from the same BB as the phi node.
  if (P->getIncomingBlock(0) == ParentBB) {
    Rdx = P->getIncomingValue(0);
  } else if (P->getIncomingBlock(1) == ParentBB) {
    Rdx = P->getIncomingValue(1);
  }

  if (Rdx && DominatedReduxValue(Rdx))
    return Rdx;

  // Otherwise, check whether we have a loop latch to look at.
  Loop *BBL = LI->getLoopFor(ParentBB);
  if (!BBL)
    return nullptr;
  BasicBlock *BBLatch = BBL->getLoopLatch();
  if (!BBLatch)
    return nullptr;

  // There is a loop latch, return the incoming value if it comes from
  // that. This reduction pattern occasionally turns up.
  if (P->getIncomingBlock(0) == BBLatch) {
    Rdx = P->getIncomingValue(0);
  } else if (P->getIncomingBlock(1) == BBLatch) {
    Rdx = P->getIncomingValue(1);
  }

  if (Rdx && DominatedReduxValue(Rdx))
    return Rdx;

  return nullptr;
}

/// \brief Attempt to reduce a horizontal reduction.
/// If it is legal to match a horizontal reduction feeding
/// the phi node \p P with reduction operators \p BI, then check if it
/// can be done.
/// \returns true if a horizontal reduction was matched and reduced.
/// \returns false if a horizontal reduction was not matched.
static bool canMatchHorizontalReduction(PHINode *P, BinaryOperator *BI,
                                        BoUpSLP &R, TargetTransformInfo *TTI) {
  if (!ShouldVectorizeHor)
    return false;

  HorizontalReduction HorRdx;
  if (!HorRdx.matchAssociativeReduction(P, BI))
    return false;

  // If there is a sufficient number of reduction values, reduce
  // to a nearby power-of-2. We can safely generate oversized
  // vectors and rely on the backend to split them to legal sizes.
  HorRdx.ReduxWidth =
      std::max((uint64_t)4, PowerOf2Floor(HorRdx.numReductionValues()));

  return HorRdx.tryToReduce(R, TTI);
}

bool SLPVectorizer::vectorizeChainsInBlock(BasicBlock *BB, BoUpSLP &R) {
  bool Changed = false;
  SmallVector<Value *, 4> Incoming;
  SmallSet<Value *, 16> VisitedInstrs;

  bool HaveVectorizedPhiNodes = true;
  while (HaveVectorizedPhiNodes) {
    HaveVectorizedPhiNodes = false;

    // Collect the incoming values from the PHIs.
    Incoming.clear();
    for (Instruction &I : *BB) {
      PHINode *P = dyn_cast<PHINode>(&I);
      if (!P)
        break;

      if (!VisitedInstrs.count(P))
        Incoming.push_back(P);
    }

    // Sort by type.
    std::stable_sort(Incoming.begin(), Incoming.end(), PhiTypeSorterFunc);

    // Try to vectorize elements based on their type.
    for (SmallVector<Value *, 4>::iterator IncIt = Incoming.begin(),
                                           E = Incoming.end();
         IncIt != E;) {

      // Look for the next elements with the same type.
      SmallVector<Value *, 4>::iterator SameTypeIt = IncIt;
      while (SameTypeIt != E &&
             (*SameTypeIt)->getType() == (*IncIt)->getType()) {
        VisitedInstrs.insert(*SameTypeIt);
        ++SameTypeIt;
      }

      // Try to vectorize them.
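      // For example, if the sorted PHI list has types {i32, i32, float}, the
      // first group below holds the two i32 PHIs (NumElts == 2).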
      unsigned NumElts = (SameTypeIt - IncIt);
      DEBUG(errs() << "SLP: Trying to vectorize starting at PHIs (" << NumElts
                   << ")\n");
      if (NumElts > 1 && tryToVectorizeList(makeArrayRef(IncIt, NumElts), R)) {
        // Success: start over because instructions might have been changed.
        HaveVectorizedPhiNodes = true;
        Changed = true;
        break;
      }

      // Start over at the next instruction of a different type (or the end).
      IncIt = SameTypeIt;
    }
  }

  VisitedInstrs.clear();

  for (BasicBlock::iterator it = BB->begin(), e = BB->end(); it != e; it++) {
    // We may go through BB multiple times, so skip instructions we have
    // already visited.
    if (!VisitedInstrs.insert(&*it).second)
      continue;

    if (isa<DbgInfoIntrinsic>(it))
      continue;

    // Try to vectorize reductions that use PHINodes.
    if (PHINode *P = dyn_cast<PHINode>(it)) {
      // Check that the PHI is a reduction PHI.
      if (P->getNumIncomingValues() != 2)
        return Changed;

      Value *Rdx = getReductionValue(DT, P, BB, LI);

      // Check if this is a Binary Operator.
      BinaryOperator *BI = dyn_cast_or_null<BinaryOperator>(Rdx);
      if (!BI)
        continue;

      // Try to match and vectorize a horizontal reduction.
      if (canMatchHorizontalReduction(P, BI, R, TTI)) {
        Changed = true;
        it = BB->begin();
        e = BB->end();
        continue;
      }

      Value *Inst = BI->getOperand(0);
      if (Inst == P)
        Inst = BI->getOperand(1);

      if (tryToVectorize(dyn_cast<BinaryOperator>(Inst), R)) {
        // We would like to start over since some instructions are deleted
        // and the iterator may become invalid.
        Changed = true;
        it = BB->begin();
        e = BB->end();
        continue;
      }

      continue;
    }

    if (ShouldStartVectorizeHorAtStore)
      if (StoreInst *SI = dyn_cast<StoreInst>(it))
        if (BinaryOperator *BinOp =
                dyn_cast<BinaryOperator>(SI->getValueOperand())) {
          if (canMatchHorizontalReduction(nullptr, BinOp, R, TTI) ||
              tryToVectorize(BinOp, R)) {
            Changed = true;
            it = BB->begin();
            e = BB->end();
            continue;
          }
        }

    // Try to vectorize horizontal reductions feeding into a return.
    if (ReturnInst *RI = dyn_cast<ReturnInst>(it))
      if (RI->getNumOperands() != 0)
        if (BinaryOperator *BinOp =
                dyn_cast<BinaryOperator>(RI->getOperand(0))) {
          DEBUG(dbgs() << "SLP: Found a return to vectorize.\n");
          if (tryToVectorizePair(BinOp->getOperand(0),
                                 BinOp->getOperand(1), R)) {
            Changed = true;
            it = BB->begin();
            e = BB->end();
            continue;
          }
        }

    // Try to vectorize trees that start at compare instructions.
    if (CmpInst *CI = dyn_cast<CmpInst>(it)) {
      if (tryToVectorizePair(CI->getOperand(0), CI->getOperand(1), R)) {
        Changed = true;
        // We would like to start over since some instructions are deleted
        // and the iterator may become invalid.
        it = BB->begin();
        e = BB->end();
        continue;
      }

      for (int i = 0; i < 2; ++i) {
        if (BinaryOperator *BI = dyn_cast<BinaryOperator>(CI->getOperand(i))) {
          if (tryToVectorizePair(BI->getOperand(0), BI->getOperand(1), R)) {
            Changed = true;
            // We would like to start over since some instructions are deleted
            // and the iterator may become invalid.
            it = BB->begin();
            e = BB->end();
            break;
          }
        }
      }
      continue;
    }

    // Try to vectorize trees that start at insertelement instructions.
    if (InsertElementInst *FirstInsertElem = dyn_cast<InsertElementInst>(it)) {
      SmallVector<Value *, 16> BuildVector;
      SmallVector<Value *, 16> BuildVectorOpds;
      if (!findBuildVector(FirstInsertElem, BuildVector, BuildVectorOpds))
        continue;

      // Vectorize starting with the build vector operands, ignoring the
      // BuildVector instructions for the purpose of scheduling and user
      // extraction.
      if (tryToVectorizeList(BuildVectorOpds, R, BuildVector)) {
        Changed = true;
        it = BB->begin();
        e = BB->end();
      }

      continue;
    }
  }

  return Changed;
}

bool SLPVectorizer::vectorizeGEPIndices(BasicBlock *BB, BoUpSLP &R) {
  auto Changed = false;
  for (auto &Entry : GEPs) {

    // If the getelementptr list has fewer than two elements, there's nothing
    // to do.
    if (Entry.second.size() < 2)
      continue;

    DEBUG(dbgs() << "SLP: Analyzing a getelementptr list of length "
                 << Entry.second.size() << ".\n");

    // We process the getelementptr list in chunks of 16 (like we do for
    // stores) to minimize compile-time.
    for (unsigned BI = 0, BE = Entry.second.size(); BI < BE; BI += 16) {
      auto Len = std::min<unsigned>(BE - BI, 16);
      auto GEPList = makeArrayRef(&Entry.second[BI], Len);

      // Initialize a set of candidate getelementptrs. Note that we use a
      // SetVector here to preserve program order. If the index computations
      // are vectorizable and begin with loads, we want to minimize the chance
      // of having to reorder them later.
      SetVector<Value *> Candidates(GEPList.begin(), GEPList.end());

      // Some of the candidates may have already been vectorized after we
      // initially collected them. If so, the WeakVHs will have nullified the
      // values, so remove them from the set of candidates.
      Candidates.remove(nullptr);

      // Remove from the set of candidates all pairs of getelementptrs with
      // constant differences. Such getelementptrs are likely not good
      // candidates for vectorization in a bottom-up phase since one can be
      // computed from the other. We also ensure all candidate getelementptr
      // indices are unique.
      for (int I = 0, E = GEPList.size(); I < E && Candidates.size() > 1; ++I) {
        auto *GEPI = cast<GetElementPtrInst>(GEPList[I]);
        if (!Candidates.count(GEPI))
          continue;
        auto *SCEVI = SE->getSCEV(GEPList[I]);
        for (int J = I + 1; J < E && Candidates.size() > 1; ++J) {
          auto *GEPJ = cast<GetElementPtrInst>(GEPList[J]);
          auto *SCEVJ = SE->getSCEV(GEPList[J]);
          if (isa<SCEVConstant>(SE->getMinusSCEV(SCEVI, SCEVJ))) {
            Candidates.remove(GEPList[I]);
            Candidates.remove(GEPList[J]);
          } else if (GEPI->idx_begin()->get() == GEPJ->idx_begin()->get()) {
            Candidates.remove(GEPList[J]);
          }
        }
      }

      // We break out of the above computation as soon as we know there are
      // fewer than two candidates remaining.
      if (Candidates.size() < 2)
        continue;

      // Add the single, non-constant index of each candidate to the bundle.
      // We ensured the indices met these constraints when we originally
      // collected the getelementptrs.
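      // For example (hypothetical), for a candidate
      //   %gep = getelementptr inbounds float, float* %g, i64 %idx
      // the value added to the bundle is %idx.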
      SmallVector<Value *, 16> Bundle(Candidates.size());
      auto BundleIndex = 0u;
      for (auto *V : Candidates) {
        auto *GEP = cast<GetElementPtrInst>(V);
        auto *GEPIdx = GEP->idx_begin()->get();
        assert(GEP->getNumIndices() == 1 || !isa<Constant>(GEPIdx));
        Bundle[BundleIndex++] = GEPIdx;
      }

      // Try and vectorize the indices. We are currently only interested in
      // gather-like cases of the form:
      //
      //   ... = g[a[0] - b[0]] + g[a[1] - b[1]] + ...
      //
      // where the loads of "a", the loads of "b", and the subtractions can be
      // performed in parallel. It's likely that detecting this pattern in a
      // bottom-up phase will be simpler and less costly than building a
      // full-blown top-down phase beginning at the consecutive loads.
      Changed |= tryToVectorizeList(Bundle, R);
    }
  }
  return Changed;
}

bool SLPVectorizer::vectorizeStoreChains(BoUpSLP &R) {
  bool Changed = false;
  // Attempt to sort and vectorize each of the store-groups.
  for (StoreListMap::iterator it = Stores.begin(), e = Stores.end(); it != e;
       ++it) {
    if (it->second.size() < 2)
      continue;

    DEBUG(dbgs() << "SLP: Analyzing a store chain of length "
                 << it->second.size() << ".\n");

    // Process the stores in chunks of 16.
    // TODO: The limit of 16 inhibits greater vectorization factors.
    // For example, AVX2 supports v32i8. Increasing this limit, however,
    // may cause a significant compile-time increase.
    for (unsigned CI = 0, CE = it->second.size(); CI < CE; CI += 16) {
      unsigned Len = std::min<unsigned>(CE - CI, 16);
      Changed |= vectorizeStores(makeArrayRef(&it->second[CI], Len),
                                 -SLPCostThreshold, R);
    }
  }
  return Changed;
}

} // end anonymous namespace

char SLPVectorizer::ID = 0;
static const char lv_name[] = "SLP Vectorizer";
INITIALIZE_PASS_BEGIN(SLPVectorizer, SV_NAME, lv_name, false, false)
INITIALIZE_PASS_DEPENDENCY(AAResultsWrapperPass)
INITIALIZE_PASS_DEPENDENCY(TargetTransformInfoWrapperPass)
INITIALIZE_PASS_DEPENDENCY(AssumptionCacheTracker)
INITIALIZE_PASS_DEPENDENCY(ScalarEvolutionWrapperPass)
INITIALIZE_PASS_DEPENDENCY(LoopSimplify)
INITIALIZE_PASS_END(SLPVectorizer, SV_NAME, lv_name, false, false)

namespace llvm {
Pass *createSLPVectorizerPass() { return new SLPVectorizer(); }
}