1 //===- SLPVectorizer.cpp - A bottom up SLP Vectorizer ---------------------===// 2 // 3 // The LLVM Compiler Infrastructure 4 // 5 // This file is distributed under the University of Illinois Open Source 6 // License. See LICENSE.TXT for details. 7 // 8 //===----------------------------------------------------------------------===// 9 // This pass implements the Bottom Up SLP vectorizer. It detects consecutive 10 // stores that can be put together into vector-stores. Next, it attempts to 11 // construct vectorizable tree using the use-def chains. If a profitable tree 12 // was found, the SLP vectorizer performs vectorization on the tree. 13 // 14 // The pass is inspired by the work described in the paper: 15 // "Loop-Aware SLP in GCC" by Ira Rosen, Dorit Nuzman, Ayal Zaks. 16 // 17 //===----------------------------------------------------------------------===// 18 #include "llvm/Transforms/Vectorize.h" 19 #include "llvm/ADT/MapVector.h" 20 #include "llvm/ADT/Optional.h" 21 #include "llvm/ADT/PostOrderIterator.h" 22 #include "llvm/ADT/SetVector.h" 23 #include "llvm/ADT/Statistic.h" 24 #include "llvm/Analysis/AliasAnalysis.h" 25 #include "llvm/Analysis/GlobalsModRef.h" 26 #include "llvm/Analysis/AssumptionCache.h" 27 #include "llvm/Analysis/CodeMetrics.h" 28 #include "llvm/Analysis/LoopInfo.h" 29 #include "llvm/Analysis/ScalarEvolution.h" 30 #include "llvm/Analysis/ScalarEvolutionExpressions.h" 31 #include "llvm/Analysis/TargetTransformInfo.h" 32 #include "llvm/Analysis/ValueTracking.h" 33 #include "llvm/IR/DataLayout.h" 34 #include "llvm/IR/Dominators.h" 35 #include "llvm/IR/IRBuilder.h" 36 #include "llvm/IR/Instructions.h" 37 #include "llvm/IR/IntrinsicInst.h" 38 #include "llvm/IR/Module.h" 39 #include "llvm/IR/NoFolder.h" 40 #include "llvm/IR/Type.h" 41 #include "llvm/IR/Value.h" 42 #include "llvm/IR/Verifier.h" 43 #include "llvm/Pass.h" 44 #include "llvm/Support/CommandLine.h" 45 #include "llvm/Support/Debug.h" 46 #include "llvm/Support/raw_ostream.h" 47 
#include "llvm/Analysis/VectorUtils.h"
#include <algorithm>
#include <map>
#include <memory>

using namespace llvm;

#define SV_NAME "slp-vectorizer"
#define DEBUG_TYPE "SLP"

STATISTIC(NumVectorInstructions, "Number of vector instructions generated");

/// Cost threshold: a tree is vectorized only if its estimated cost is below
/// minus this value (getTreeCost returning a negative number is a gain).
static cl::opt<int>
    SLPCostThreshold("slp-threshold", cl::init(0), cl::Hidden,
                     cl::desc("Only vectorize if you gain more than this "
                              "number "));

static cl::opt<bool>
ShouldVectorizeHor("slp-vectorize-hor", cl::init(true), cl::Hidden,
                   cl::desc("Attempt to vectorize horizontal reductions"));

static cl::opt<bool> ShouldStartVectorizeHorAtStore(
    "slp-vectorize-hor-store", cl::init(false), cl::Hidden,
    cl::desc(
        "Attempt to vectorize horizontal reductions feeding into a store"));

/// Target vector register width in bits used when forming bundles.
static cl::opt<int>
MaxVectorRegSizeOption("slp-max-reg-size", cl::init(128), cl::Hidden,
    cl::desc("Attempt to vectorize for this register size in bits"));

/// Limits the size of scheduling regions in a block.
/// It avoid long compile times for _very_ large blocks where vector
/// instructions are spread over a wide range.
/// This limit is way higher than needed by real-world functions.
static cl::opt<int>
ScheduleRegionSizeBudget("slp-schedule-budget", cl::init(100000), cl::Hidden,
    cl::desc("Limit the size of the SLP scheduling region per block"));

namespace {

// FIXME: Set this via cl::opt to allow overriding.
static const unsigned MinVecRegSize = 128;

/// Maximum recursion depth for buildTree_rec before giving up and gathering.
static const unsigned RecursionMaxDepth = 12;

// Limit the number of alias checks. The limit is chosen so that
// it has no negative effect on the llvm benchmarks.
static const unsigned AliasedCheckLimit = 10;

// Another limit for the alias checks: The maximum distance between load/store
// instructions where alias checks are done.
// This limit is useful for very large basic blocks.
99 static const unsigned MaxMemDepDistance = 160; 100 101 /// If the ScheduleRegionSizeBudget is exhausted, we allow small scheduling 102 /// regions to be handled. 103 static const int MinScheduleRegionSize = 16; 104 105 /// \brief Predicate for the element types that the SLP vectorizer supports. 106 /// 107 /// The most important thing to filter here are types which are invalid in LLVM 108 /// vectors. We also filter target specific types which have absolutely no 109 /// meaningful vectorization path such as x86_fp80 and ppc_f128. This just 110 /// avoids spending time checking the cost model and realizing that they will 111 /// be inevitably scalarized. 112 static bool isValidElementType(Type *Ty) { 113 return VectorType::isValidElementType(Ty) && !Ty->isX86_FP80Ty() && 114 !Ty->isPPC_FP128Ty(); 115 } 116 117 /// \returns the parent basic block if all of the instructions in \p VL 118 /// are in the same block or null otherwise. 119 static BasicBlock *getSameBlock(ArrayRef<Value *> VL) { 120 Instruction *I0 = dyn_cast<Instruction>(VL[0]); 121 if (!I0) 122 return nullptr; 123 BasicBlock *BB = I0->getParent(); 124 for (int i = 1, e = VL.size(); i < e; i++) { 125 Instruction *I = dyn_cast<Instruction>(VL[i]); 126 if (!I) 127 return nullptr; 128 129 if (BB != I->getParent()) 130 return nullptr; 131 } 132 return BB; 133 } 134 135 /// \returns True if all of the values in \p VL are constants. 136 static bool allConstant(ArrayRef<Value *> VL) { 137 for (unsigned i = 0, e = VL.size(); i < e; ++i) 138 if (!isa<Constant>(VL[i])) 139 return false; 140 return true; 141 } 142 143 /// \returns True if all of the values in \p VL are identical. 144 static bool isSplat(ArrayRef<Value *> VL) { 145 for (unsigned i = 1, e = VL.size(); i < e; ++i) 146 if (VL[i] != VL[0]) 147 return false; 148 return true; 149 } 150 151 ///\returns Opcode that can be clubbed with \p Op to create an alternate 152 /// sequence which can later be merged as a ShuffleVector instruction. 
153 static unsigned getAltOpcode(unsigned Op) { 154 switch (Op) { 155 case Instruction::FAdd: 156 return Instruction::FSub; 157 case Instruction::FSub: 158 return Instruction::FAdd; 159 case Instruction::Add: 160 return Instruction::Sub; 161 case Instruction::Sub: 162 return Instruction::Add; 163 default: 164 return 0; 165 } 166 } 167 168 ///\returns bool representing if Opcode \p Op can be part 169 /// of an alternate sequence which can later be merged as 170 /// a ShuffleVector instruction. 171 static bool canCombineAsAltInst(unsigned Op) { 172 return Op == Instruction::FAdd || Op == Instruction::FSub || 173 Op == Instruction::Sub || Op == Instruction::Add; 174 } 175 176 /// \returns ShuffleVector instruction if instructions in \p VL have 177 /// alternate fadd,fsub / fsub,fadd/add,sub/sub,add sequence. 178 /// (i.e. e.g. opcodes of fadd,fsub,fadd,fsub...) 179 static unsigned isAltInst(ArrayRef<Value *> VL) { 180 Instruction *I0 = dyn_cast<Instruction>(VL[0]); 181 unsigned Opcode = I0->getOpcode(); 182 unsigned AltOpcode = getAltOpcode(Opcode); 183 for (int i = 1, e = VL.size(); i < e; i++) { 184 Instruction *I = dyn_cast<Instruction>(VL[i]); 185 if (!I || I->getOpcode() != ((i & 1) ? AltOpcode : Opcode)) 186 return 0; 187 } 188 return Instruction::ShuffleVector; 189 } 190 191 /// \returns The opcode if all of the Instructions in \p VL have the same 192 /// opcode, or zero. 
static unsigned getSameOpcode(ArrayRef<Value *> VL) {
  Instruction *I0 = dyn_cast<Instruction>(VL[0]);
  if (!I0)
    return 0;
  unsigned Opcode = I0->getOpcode();
  for (int i = 1, e = VL.size(); i < e; i++) {
    Instruction *I = dyn_cast<Instruction>(VL[i]);
    if (!I || Opcode != I->getOpcode()) {
      // On the first mismatch, check whether VL is instead an alternating
      // opcode sequence (e.g. fadd,fsub,...); isAltInst reports that as
      // Instruction::ShuffleVector, otherwise we report failure (0).
      if (canCombineAsAltInst(Opcode) && i == 1)
        return isAltInst(VL);
      return 0;
    }
  }
  return Opcode;
}

/// Get the intersection (logical and) of all of the potential IR flags
/// of each scalar operation (VL) that will be converted into a vector (I).
/// Flag set: NSW, NUW, exact, and all of fast-math.
static void propagateIRFlags(Value *I, ArrayRef<Value *> VL) {
  if (auto *VecOp = dyn_cast<BinaryOperator>(I)) {
    if (auto *Intersection = dyn_cast<BinaryOperator>(VL[0])) {
      // Intersection is initialized to the 0th scalar,
      // so start counting from index '1'.
      // NOTE(review): andIRFlags mutates the first scalar in place;
      // presumably acceptable because the scalars are replaced by the
      // vector instruction — confirm before reusing this helper elsewhere.
      for (int i = 1, e = VL.size(); i < e; ++i) {
        if (auto *Scalar = dyn_cast<BinaryOperator>(VL[i]))
          Intersection->andIRFlags(Scalar);
      }
      VecOp->copyIRFlags(Intersection);
    }
  }
}

/// \returns \p I after propagating metadata from \p VL.
static Instruction *propagateMetadata(Instruction *I, ArrayRef<Value *> VL) {
  Instruction *I0 = cast<Instruction>(VL[0]);
  SmallVector<std::pair<unsigned, MDNode *>, 4> Metadata;
  I0->getAllMetadataOtherThanDebugLoc(Metadata);

  for (unsigned i = 0, n = Metadata.size(); i != n; ++i) {
    unsigned Kind = Metadata[i].first;
    MDNode *MD = Metadata[i].second;

    // Merge this metadata kind across all scalars; a kind we do not know
    // how to merge safely is dropped entirely (MD = nullptr).
    // NOTE(review): the inner loop intentionally shadows both 'i' and 'I';
    // the setMetadata call below refers to the outer 'I' (the vector inst).
    for (int i = 1, e = VL.size(); MD && i != e; i++) {
      Instruction *I = cast<Instruction>(VL[i]);
      MDNode *IMD = I->getMetadata(Kind);

      switch (Kind) {
      default:
        MD = nullptr; // Remove unknown metadata
        break;
      case LLVMContext::MD_tbaa:
        MD = MDNode::getMostGenericTBAA(MD, IMD);
        break;
      case LLVMContext::MD_alias_scope:
        MD = MDNode::getMostGenericAliasScope(MD, IMD);
        break;
      case LLVMContext::MD_noalias:
        MD = MDNode::intersect(MD, IMD);
        break;
      case LLVMContext::MD_fpmath:
        MD = MDNode::getMostGenericFPMath(MD, IMD);
        break;
      case LLVMContext::MD_nontemporal:
        MD = MDNode::intersect(MD, IMD);
        break;
      }
    }
    I->setMetadata(Kind, MD);
  }
  return I;
}

/// \returns The type that all of the values in \p VL have or null if there
/// are different types.
static Type* getSameType(ArrayRef<Value *> VL) {
  Type *Ty = VL[0]->getType();
  for (int i = 1, e = VL.size(); i < e; i++)
    if (VL[i]->getType() != Ty)
      return nullptr;

  return Ty;
}

/// \returns True if the ExtractElement instructions in VL can be vectorized
/// to use the original vector.
static bool CanReuseExtract(ArrayRef<Value *> VL) {
  assert(Instruction::ExtractElement == getSameOpcode(VL) && "Invalid opcode");
  // Check if all of the extracts come from the same vector and from the
  // correct offset.
  Value *VL0 = VL[0];
  ExtractElementInst *E0 = cast<ExtractElementInst>(VL0);
  Value *Vec = E0->getOperand(0);

  // We have to extract from the same vector type.
  unsigned NElts = Vec->getType()->getVectorNumElements();

  if (NElts != VL.size())
    return false;

  // Check that all of the indices extract from the correct offset.
  // Lane 0 must extract element 0 (non-constant or non-zero index fails).
  ConstantInt *CI = dyn_cast<ConstantInt>(E0->getOperand(1));
  if (!CI || CI->getZExtValue())
    return false;

  // Lane i must extract element i from the very same source vector.
  for (unsigned i = 1, e = VL.size(); i < e; ++i) {
    ExtractElementInst *E = cast<ExtractElementInst>(VL[i]);
    ConstantInt *CI = dyn_cast<ConstantInt>(E->getOperand(1));

    if (!CI || CI->getZExtValue() != i || E->getOperand(0) != Vec)
      return false;
  }

  return true;
}

/// \returns True if in-tree use also needs extract. This refers to
/// possible scalar operand in vectorized instruction.
static bool InTreeUserNeedToExtract(Value *Scalar, Instruction *UserInst,
                                    TargetLibraryInfo *TLI) {

  unsigned Opcode = UserInst->getOpcode();
  switch (Opcode) {
  case Instruction::Load: {
    LoadInst *LI = cast<LoadInst>(UserInst);
    return (LI->getPointerOperand() == Scalar);
  }
  case Instruction::Store: {
    StoreInst *SI = cast<StoreInst>(UserInst);
    return (SI->getPointerOperand() == Scalar);
  }
  case Instruction::Call: {
    CallInst *CI = cast<CallInst>(UserInst);
    Intrinsic::ID ID = getIntrinsicIDForCall(CI, TLI);
    if (hasVectorInstrinsicScalarOpd(ID, 1)) {
      return (CI->getArgOperand(1) == Scalar);
    }
    // Intentionally falls through to the default case (returns false) when
    // the intrinsic has no mandatory scalar operand at position 1.
  }
  default:
    return false;
  }
}

/// \returns the AA location that is being access by the instruction.
static MemoryLocation getLocation(Instruction *I, AliasAnalysis *AA) {
  if (StoreInst *SI = dyn_cast<StoreInst>(I))
    return MemoryLocation::get(SI);
  if (LoadInst *LI = dyn_cast<LoadInst>(I))
    return MemoryLocation::get(LI);
  // Anything else: return an empty (unknown) location.
  return MemoryLocation();
}

/// \returns True if the instruction is not a volatile or atomic load/store.
static bool isSimple(Instruction *I) {
  if (LoadInst *LI = dyn_cast<LoadInst>(I))
    return LI->isSimple();
  if (StoreInst *SI = dyn_cast<StoreInst>(I))
    return SI->isSimple();
  if (MemIntrinsic *MI = dyn_cast<MemIntrinsic>(I))
    return !MI->isVolatile();
  // Instructions that are not memory accesses are trivially "simple".
  return true;
}

/// Bottom Up SLP Vectorizer.
class BoUpSLP {
public:
  typedef SmallVector<Value *, 8> ValueList;
  typedef SmallVector<Instruction *, 16> InstrList;
  typedef SmallPtrSet<Value *, 16> ValueSet;
  typedef SmallVector<StoreInst *, 8> StoreList;

  BoUpSLP(Function *Func, ScalarEvolution *Se, TargetTransformInfo *Tti,
          TargetLibraryInfo *TLi, AliasAnalysis *Aa, LoopInfo *Li,
          DominatorTree *Dt, AssumptionCache *AC)
      : NumLoadsWantToKeepOrder(0), NumLoadsWantToChangeOrder(0), F(Func),
        SE(Se), TTI(Tti), TLI(TLi), AA(Aa), LI(Li), DT(Dt),
        Builder(Se->getContext()) {
    // Collect values used only by @llvm.assume once up front (EphValues).
    CodeMetrics::collectEphemeralValues(F, AC, EphValues);
  }

  /// \brief Vectorize the tree that starts with the elements in \p VL.
  /// Returns the vectorized root.
  Value *vectorizeTree();

  /// \returns the cost incurred by unwanted spills and fills, caused by
  /// holding live values over call sites.
  int getSpillCost();

  /// \returns the vectorization cost of the subtree that starts at \p VL.
  /// A negative number means that this is profitable.
  int getTreeCost();

  /// Construct a vectorizable tree that starts at \p Roots, ignoring users for
  /// the purpose of scheduling and extraction in the \p UserIgnoreLst.
  void buildTree(ArrayRef<Value *> Roots,
                 ArrayRef<Value *> UserIgnoreLst = None);

  /// Clear the internal data structures that are created by 'buildTree'.
  void deleteTree() {
    VectorizableTree.clear();
    ScalarToTreeEntry.clear();
    MustGather.clear();
    ExternalUses.clear();
    NumLoadsWantToKeepOrder = 0;
    NumLoadsWantToChangeOrder = 0;
    for (auto &Iter : BlocksSchedules) {
      BlockScheduling *BS = Iter.second.get();
      BS->clear();
    }
  }

  /// \returns true if the memory operations A and B are consecutive.
  bool isConsecutiveAccess(Value *A, Value *B, const DataLayout &DL);

  /// \brief Perform LICM and CSE on the newly generated gather sequences.
  void optimizeGatherSequence();

  /// \returns true if it is beneficial to reverse the vector order.
  bool shouldReorder() const {
    return NumLoadsWantToChangeOrder > NumLoadsWantToKeepOrder;
  }

  /// \return The vector element size in bits to use when vectorizing the
  /// expression tree ending at \p V. If V is a store, the size is the width of
  /// the stored value. Otherwise, the size is the width of the largest loaded
  /// value reaching V. This method is used by the vectorizer to calculate
  /// vectorization factors.
  unsigned getVectorElementSize(Value *V);

private:
  struct TreeEntry;

  /// \returns the cost of the vectorizable entry.
  int getEntryCost(TreeEntry *E);

  /// This is the recursive part of buildTree.
  void buildTree_rec(ArrayRef<Value *> Roots, unsigned Depth);

  /// Vectorize a single entry in the tree.
  Value *vectorizeTree(TreeEntry *E);

  /// Vectorize a single entry in the tree, starting in \p VL.
  Value *vectorizeTree(ArrayRef<Value *> VL);

  /// \returns the pointer to the vectorized value if \p VL is already
  /// vectorized, or NULL. They may happen in cycles.
  Value *alreadyVectorized(ArrayRef<Value *> VL) const;

  /// \brief Take the pointer operand from the Load/Store instruction.
  /// \returns NULL if this is not a valid Load/Store instruction.
  static Value *getPointerOperand(Value *I);

  /// \brief Take the address space operand from the Load/Store instruction.
  /// \returns -1 if this is not a valid Load/Store instruction.
  static unsigned getAddressSpaceOperand(Value *I);

  /// \returns the scalarization cost for this type. Scalarization in this
  /// context means the creation of vectors from a group of scalars.
  int getGatherCost(Type *Ty);

  /// \returns the scalarization cost for this list of values. Assuming that
  /// this subtree gets vectorized, we may need to extract the values from the
  /// roots. This method calculates the cost of extracting the values.
  int getGatherCost(ArrayRef<Value *> VL);

  /// \brief Set the Builder insert point to one after the last instruction in
  /// the bundle
  void setInsertPointAfterBundle(ArrayRef<Value *> VL);

  /// \returns a vector from a collection of scalars in \p VL.
  Value *Gather(ArrayRef<Value *> VL, VectorType *Ty);

  /// \returns whether the VectorizableTree is fully vectorizable and will
  /// be beneficial even the tree height is tiny.
  bool isFullyVectorizableTinyTree();

  /// \reorder commutative operands in alt shuffle if they result in
  /// vectorized code.
  void reorderAltShuffleOperands(ArrayRef<Value *> VL,
                                 SmallVectorImpl<Value *> &Left,
                                 SmallVectorImpl<Value *> &Right);
  /// \reorder commutative operands to get better probability of
  /// generating vectorized code.
  void reorderInputsAccordingToOpcode(ArrayRef<Value *> VL,
                                      SmallVectorImpl<Value *> &Left,
                                      SmallVectorImpl<Value *> &Right);

  /// One node of the vectorizable tree: a bundle of scalars that is either
  /// widened into one vector instruction or gathered.
  struct TreeEntry {
    TreeEntry() : Scalars(), VectorizedValue(nullptr),
    NeedToGather(0) {}

    /// \returns true if the scalars in VL are equal to this entry.
    bool isSame(ArrayRef<Value *> VL) const {
      assert(VL.size() == Scalars.size() && "Invalid size");
      return std::equal(VL.begin(), VL.end(), Scalars.begin());
    }

    /// A vector of scalars.
    ValueList Scalars;

    /// The Scalars are vectorized into this value. It is initialized to Null.
    Value *VectorizedValue;

    /// Do we need to gather this sequence ?
    bool NeedToGather;
  };

  /// Create a new VectorizableTree entry.
  TreeEntry *newTreeEntry(ArrayRef<Value *> VL, bool Vectorized) {
    VectorizableTree.emplace_back();
    int idx = VectorizableTree.size() - 1;
    TreeEntry *Last = &VectorizableTree[idx];
    Last->Scalars.insert(Last->Scalars.begin(), VL.begin(), VL.end());
    Last->NeedToGather = !Vectorized;
    if (Vectorized) {
      for (int i = 0, e = VL.size(); i != e; ++i) {
        assert(!ScalarToTreeEntry.count(VL[i]) && "Scalar already in tree!");
        ScalarToTreeEntry[VL[i]] = idx;
      }
    } else {
      // Gathered scalars stay scalar; remember them so they are not claimed
      // by another (vectorized) tree entry.
      MustGather.insert(VL.begin(), VL.end());
    }
    return Last;
  }

  /// -- Vectorization State --
  /// Holds all of the tree entries.
  std::vector<TreeEntry> VectorizableTree;

  /// Maps a specific scalar to its tree entry.
  SmallDenseMap<Value*, int> ScalarToTreeEntry;

  /// A list of scalars that we found that we need to keep as scalars.
  ValueSet MustGather;

  /// This POD struct describes one external user in the vectorized tree.
  struct ExternalUser {
    ExternalUser (Value *S, llvm::User *U, int L) :
      Scalar(S), User(U), Lane(L){}
    // Which scalar in our function.
    Value *Scalar;
    // Which user that uses the scalar.
    llvm::User *User;
    // Which lane does the scalar belong to.
    int Lane;
  };
  typedef SmallVector<ExternalUser, 16> UserList;

  /// Checks if two instructions may access the same memory.
  ///
  /// \p Loc1 is the location of \p Inst1. It is passed explicitly because it
  /// is invariant in the calling loop.
  bool isAliased(const MemoryLocation &Loc1, Instruction *Inst1,
                 Instruction *Inst2) {

    // First check if the result is already in the cache.
    AliasCacheKey key = std::make_pair(Inst1, Inst2);
    Optional<bool> &result = AliasCache[key];
    if (result.hasValue()) {
      return result.getValue();
    }
    MemoryLocation Loc2 = getLocation(Inst2, AA);
    // Conservatively assume aliasing unless both locations are known and
    // both accesses are simple (non-volatile, non-atomic).
    bool aliased = true;
    if (Loc1.Ptr && Loc2.Ptr && isSimple(Inst1) && isSimple(Inst2)) {
      // Do the alias check.
      aliased = AA->alias(Loc1, Loc2);
    }
    // Store the result in the cache.
    result = aliased;
    return aliased;
  }

  typedef std::pair<Instruction *, Instruction *> AliasCacheKey;

  /// Cache for alias results.
  /// TODO: consider moving this to the AliasAnalysis itself.
  DenseMap<AliasCacheKey, Optional<bool>> AliasCache;

  /// Removes an instruction from its block and eventually deletes it.
  /// It's like Instruction::eraseFromParent() except that the actual deletion
  /// is delayed until BoUpSLP is destructed.
  /// This is required to ensure that there are no incorrect collisions in the
  /// AliasCache, which can happen if a new instruction is allocated at the
  /// same address as a previously deleted instruction.
  void eraseInstruction(Instruction *I) {
    I->removeFromParent();
    I->dropAllReferences();
    DeletedInstructions.push_back(std::unique_ptr<Instruction>(I));
  }

  /// Temporary store for deleted instructions. Instructions will be deleted
  /// eventually when the BoUpSLP is destructed.
  SmallVector<std::unique_ptr<Instruction>, 8> DeletedInstructions;

  /// A list of values that need to extracted out of the tree.
  /// This list holds pairs of (Internal Scalar : External User).
  UserList ExternalUses;

  /// Values used only by @llvm.assume calls.
  SmallPtrSet<const Value *, 32> EphValues;

  /// Holds all of the instructions that we gathered.
  SetVector<Instruction *> GatherSeq;
  /// A list of blocks that we are going to CSE.
  SetVector<BasicBlock *> CSEBlocks;

  /// Contains all scheduling relevant data for an instruction.
  /// A ScheduleData either represents a single instruction or a member of an
  /// instruction bundle (= a group of instructions which is combined into a
  /// vector instruction).
  struct ScheduleData {

    // The initial value for the dependency counters. It means that the
    // dependencies are not calculated yet.
    enum { InvalidDeps = -1 };

    ScheduleData()
        : Inst(nullptr), FirstInBundle(nullptr), NextInBundle(nullptr),
          NextLoadStore(nullptr), SchedulingRegionID(0), SchedulingPriority(0),
          Dependencies(InvalidDeps), UnscheduledDeps(InvalidDeps),
          UnscheduledDepsInBundle(InvalidDeps), IsScheduled(false) {}

    /// (Re)initialize this node for a fresh scheduling region.
    void init(int BlockSchedulingRegionID) {
      FirstInBundle = this;
      NextInBundle = nullptr;
      NextLoadStore = nullptr;
      IsScheduled = false;
      SchedulingRegionID = BlockSchedulingRegionID;
      UnscheduledDepsInBundle = UnscheduledDeps;
      clearDependencies();
    }

    /// Returns true if the dependency information has been calculated.
    bool hasValidDependencies() const { return Dependencies != InvalidDeps; }

    /// Returns true for single instructions and for bundle representatives
    /// (= the head of a bundle).
    bool isSchedulingEntity() const { return FirstInBundle == this; }

    /// Returns true if it represents an instruction bundle and not only a
    /// single instruction.
    bool isPartOfBundle() const {
      return NextInBundle != nullptr || FirstInBundle != this;
    }

    /// Returns true if it is ready for scheduling, i.e. it has no more
    /// unscheduled depending instructions/bundles.
    bool isReady() const {
      assert(isSchedulingEntity() &&
             "can't consider non-scheduling entity for ready list");
      return UnscheduledDepsInBundle == 0 && !IsScheduled;
    }

    /// Modifies the number of unscheduled dependencies, also updating it for
    /// the whole bundle.
    int incrementUnscheduledDeps(int Incr) {
      UnscheduledDeps += Incr;
      // Returns the bundle-wide counter so callers can test for readiness.
      return FirstInBundle->UnscheduledDepsInBundle += Incr;
    }

    /// Sets the number of unscheduled dependencies to the number of
    /// dependencies.
    void resetUnscheduledDeps() {
      incrementUnscheduledDeps(Dependencies - UnscheduledDeps);
    }

    /// Clears all dependency information.
    void clearDependencies() {
      Dependencies = InvalidDeps;
      resetUnscheduledDeps();
      MemoryDependencies.clear();
    }

    /// Print this node: "/ I" marks a bundle member, "[I;...]" a bundle head
    /// with its members, plain "I" a single instruction.
    void dump(raw_ostream &os) const {
      if (!isSchedulingEntity()) {
        os << "/ " << *Inst;
      } else if (NextInBundle) {
        os << '[' << *Inst;
        ScheduleData *SD = NextInBundle;
        while (SD) {
          os << ';' << *SD->Inst;
          SD = SD->NextInBundle;
        }
        os << ']';
      } else {
        os << *Inst;
      }
    }

    Instruction *Inst;

    /// Points to the head in an instruction bundle (and always to this for
    /// single instructions).
    ScheduleData *FirstInBundle;

    /// Single linked list of all instructions in a bundle. Null if it is a
    /// single instruction.
    ScheduleData *NextInBundle;

    /// Single linked list of all memory instructions (e.g. load, store, call)
    /// in the block - until the end of the scheduling region.
    ScheduleData *NextLoadStore;

    /// The dependent memory instructions.
    /// This list is derived on demand in calculateDependencies().
    SmallVector<ScheduleData *, 4> MemoryDependencies;

    /// This ScheduleData is in the current scheduling region if this matches
    /// the current SchedulingRegionID of BlockScheduling.
    int SchedulingRegionID;

    /// Used for getting a "good" final ordering of instructions.
    int SchedulingPriority;

    /// The number of dependencies. Constitutes of the number of users of the
    /// instruction plus the number of dependent memory instructions (if any).
    /// This value is calculated on demand.
    /// If InvalidDeps, the number of dependencies is not calculated yet.
    ///
    int Dependencies;

    /// The number of dependencies minus the number of dependencies of scheduled
    /// instructions. As soon as this is zero, the instruction/bundle gets ready
    /// for scheduling.
    /// Note that this is negative as long as Dependencies is not calculated.
    int UnscheduledDeps;

    /// The sum of UnscheduledDeps in a bundle. Equals to UnscheduledDeps for
    /// single instructions.
    int UnscheduledDepsInBundle;

    /// True if this instruction is scheduled (or considered as scheduled in the
    /// dry-run).
    bool IsScheduled;
  };

#ifndef NDEBUG
  friend raw_ostream &operator<<(raw_ostream &os,
                                 const BoUpSLP::ScheduleData &SD);
#endif

  /// Contains all scheduling data for a basic block.
  ///
  struct BlockScheduling {

    BlockScheduling(BasicBlock *BB)
        : BB(BB), ChunkSize(BB->size()), ChunkPos(ChunkSize),
          ScheduleStart(nullptr), ScheduleEnd(nullptr),
          FirstLoadStoreInRegion(nullptr), LastLoadStoreInRegion(nullptr),
          ScheduleRegionSize(0),
          ScheduleRegionSizeLimit(ScheduleRegionSizeBudget),
          // Make sure that the initial SchedulingRegionID is greater than the
          // initial SchedulingRegionID in ScheduleData (which is 0).
          SchedulingRegionID(1) {}

    void clear() {
      ReadyInsts.clear();
      ScheduleStart = nullptr;
      ScheduleEnd = nullptr;
      FirstLoadStoreInRegion = nullptr;
      LastLoadStoreInRegion = nullptr;

      // Reduce the maximum schedule region size by the size of the
      // previous scheduling run.
      ScheduleRegionSizeLimit -= ScheduleRegionSize;
      if (ScheduleRegionSizeLimit < MinScheduleRegionSize)
        ScheduleRegionSizeLimit = MinScheduleRegionSize;
      ScheduleRegionSize = 0;

      // Make a new scheduling region, i.e. all existing ScheduleData is not
      // in the new region yet.
      ++SchedulingRegionID;
    }

    /// \returns the ScheduleData for \p V if it belongs to the current
    /// scheduling region, otherwise null.
    /// NOTE(review): operator[] default-inserts a null mapping for values
    /// never seen before; presumably intentional since the map is recycled
    /// across vectorization iterations — confirm before changing to find().
    ScheduleData *getScheduleData(Value *V) {
      ScheduleData *SD = ScheduleDataMap[V];
      if (SD && SD->SchedulingRegionID == SchedulingRegionID)
        return SD;
      return nullptr;
    }

    bool isInSchedulingRegion(ScheduleData *SD) {
      return SD->SchedulingRegionID == SchedulingRegionID;
    }

    /// Marks an instruction as scheduled and puts all dependent ready
    /// instructions into the ready-list.
    template <typename ReadyListType>
    void schedule(ScheduleData *SD, ReadyListType &ReadyList) {
      SD->IsScheduled = true;
      DEBUG(dbgs() << "SLP: schedule " << *SD << "\n");

      ScheduleData *BundleMember = SD;
      while (BundleMember) {
        // Handle the def-use chain dependencies.
        for (Use &U : BundleMember->Inst->operands()) {
          ScheduleData *OpDef = getScheduleData(U.get());
          if (OpDef && OpDef->hasValidDependencies() &&
              OpDef->incrementUnscheduledDeps(-1) == 0) {
            // There are no more unscheduled dependencies after decrementing,
            // so we can put the dependent instruction into the ready list.
            ScheduleData *DepBundle = OpDef->FirstInBundle;
            assert(!DepBundle->IsScheduled &&
                   "already scheduled bundle gets ready");
            ReadyList.insert(DepBundle);
            DEBUG(dbgs() << "SLP: gets ready (def): " << *DepBundle << "\n");
          }
        }
        // Handle the memory dependencies.
        for (ScheduleData *MemoryDepSD : BundleMember->MemoryDependencies) {
          if (MemoryDepSD->incrementUnscheduledDeps(-1) == 0) {
            // There are no more unscheduled dependencies after decrementing,
            // so we can put the dependent instruction into the ready list.
            ScheduleData *DepBundle = MemoryDepSD->FirstInBundle;
            assert(!DepBundle->IsScheduled &&
                   "already scheduled bundle gets ready");
            ReadyList.insert(DepBundle);
            DEBUG(dbgs() << "SLP: gets ready (mem): " << *DepBundle << "\n");
          }
        }
        BundleMember = BundleMember->NextInBundle;
      }
    }

    /// Put all instructions into the ReadyList which are ready for scheduling.
    template <typename ReadyListType>
    void initialFillReadyList(ReadyListType &ReadyList) {
      for (auto *I = ScheduleStart; I != ScheduleEnd; I = I->getNextNode()) {
        ScheduleData *SD = getScheduleData(I);
        if (SD->isSchedulingEntity() && SD->isReady()) {
          ReadyList.insert(SD);
          DEBUG(dbgs() << "SLP: initially in ready list: " << *I << "\n");
        }
      }
    }

    /// Checks if a bundle of instructions can be scheduled, i.e. has no
    /// cyclic dependencies. This is only a dry-run, no instructions are
    /// actually moved at this stage.
    bool tryScheduleBundle(ArrayRef<Value *> VL, BoUpSLP *SLP);

    /// Un-bundles a group of instructions.
    void cancelScheduling(ArrayRef<Value *> VL);

    /// Extends the scheduling region so that V is inside the region.
    /// \returns true if the region size is within the limit.
    bool extendSchedulingRegion(Value *V);

    /// Initialize the ScheduleData structures for new instructions in the
    /// scheduling region.
    void initScheduleData(Instruction *FromI, Instruction *ToI,
                          ScheduleData *PrevLoadStore,
                          ScheduleData *NextLoadStore);

    /// Updates the dependency information of a bundle and of all instructions/
    /// bundles which depend on the original bundle.
    void calculateDependencies(ScheduleData *SD, bool InsertInReadyList,
                               BoUpSLP *SLP);

    /// Sets all instruction in the scheduling region to un-scheduled.
    void resetSchedule();

    BasicBlock *BB;

    /// Simple memory allocation for ScheduleData.
    std::vector<std::unique_ptr<ScheduleData[]>> ScheduleDataChunks;

    /// The size of a ScheduleData array in ScheduleDataChunks.
    int ChunkSize;

    /// The allocator position in the current chunk, which is the last entry
    /// of ScheduleDataChunks.
    int ChunkPos;

    /// Attaches ScheduleData to Instruction.
    /// Note that the mapping survives during all vectorization iterations, i.e.
    /// ScheduleData structures are recycled.
    DenseMap<Value *, ScheduleData *> ScheduleDataMap;

    /// Thin wrapper giving SmallVector the insert() interface expected by
    /// schedule() / initialFillReadyList().
    struct ReadyList : SmallVector<ScheduleData *, 8> {
      void insert(ScheduleData *SD) { push_back(SD); }
    };

    /// The ready-list for scheduling (only used for the dry-run).
    ReadyList ReadyInsts;

    /// The first instruction of the scheduling region.
    Instruction *ScheduleStart;

    /// The first instruction _after_ the scheduling region.
    Instruction *ScheduleEnd;

    /// The first memory accessing instruction in the scheduling region
    /// (can be null).
    ScheduleData *FirstLoadStoreInRegion;

    /// The last memory accessing instruction in the scheduling region
    /// (can be null).
    ScheduleData *LastLoadStoreInRegion;

    /// The current size of the scheduling region.
    int ScheduleRegionSize;

    /// The maximum size allowed for the scheduling region.
    int ScheduleRegionSizeLimit;

    /// The ID of the scheduling region. For a new vectorization iteration this
    /// is incremented which "removes" all ScheduleData from the region.
    int SchedulingRegionID;
  };

  /// Attaches the BlockScheduling structures to basic blocks.
  MapVector<BasicBlock *, std::unique_ptr<BlockScheduling>> BlocksSchedules;

  /// Performs the "real" scheduling. Done before vectorization is actually
  /// performed in a basic block.
  void scheduleBlock(BlockScheduling *BS);

  /// List of users to ignore during scheduling and that don't need extracting.
911 ArrayRef<Value *> UserIgnoreList; 912 913 // Number of load-bundles, which contain consecutive loads. 914 int NumLoadsWantToKeepOrder; 915 916 // Number of load-bundles of size 2, which are consecutive loads if reversed. 917 int NumLoadsWantToChangeOrder; 918 919 // Analysis and block reference. 920 Function *F; 921 ScalarEvolution *SE; 922 TargetTransformInfo *TTI; 923 TargetLibraryInfo *TLI; 924 AliasAnalysis *AA; 925 LoopInfo *LI; 926 DominatorTree *DT; 927 /// Instruction builder to construct the vectorized tree. 928 IRBuilder<> Builder; 929 }; 930 931 #ifndef NDEBUG 932 raw_ostream &operator<<(raw_ostream &os, const BoUpSLP::ScheduleData &SD) { 933 SD.dump(os); 934 return os; 935 } 936 #endif 937 938 void BoUpSLP::buildTree(ArrayRef<Value *> Roots, 939 ArrayRef<Value *> UserIgnoreLst) { 940 deleteTree(); 941 UserIgnoreList = UserIgnoreLst; 942 if (!getSameType(Roots)) 943 return; 944 buildTree_rec(Roots, 0); 945 946 // Collect the values that we need to extract from the tree. 947 for (int EIdx = 0, EE = VectorizableTree.size(); EIdx < EE; ++EIdx) { 948 TreeEntry *Entry = &VectorizableTree[EIdx]; 949 950 // For each lane: 951 for (int Lane = 0, LE = Entry->Scalars.size(); Lane != LE; ++Lane) { 952 Value *Scalar = Entry->Scalars[Lane]; 953 954 // No need to handle users of gathered values. 955 if (Entry->NeedToGather) 956 continue; 957 958 for (User *U : Scalar->users()) { 959 DEBUG(dbgs() << "SLP: Checking user:" << *U << ".\n"); 960 961 Instruction *UserInst = dyn_cast<Instruction>(U); 962 if (!UserInst) 963 continue; 964 965 // Skip in-tree scalars that become vectors 966 if (ScalarToTreeEntry.count(U)) { 967 int Idx = ScalarToTreeEntry[U]; 968 TreeEntry *UseEntry = &VectorizableTree[Idx]; 969 Value *UseScalar = UseEntry->Scalars[0]; 970 // Some in-tree scalars will remain as scalar in vectorized 971 // instructions. If that is the case, the one in Lane 0 will 972 // be used. 
973 if (UseScalar != U || 974 !InTreeUserNeedToExtract(Scalar, UserInst, TLI)) { 975 DEBUG(dbgs() << "SLP: \tInternal user will be removed:" << *U 976 << ".\n"); 977 assert(!VectorizableTree[Idx].NeedToGather && "Bad state"); 978 continue; 979 } 980 } 981 982 // Ignore users in the user ignore list. 983 if (std::find(UserIgnoreList.begin(), UserIgnoreList.end(), UserInst) != 984 UserIgnoreList.end()) 985 continue; 986 987 DEBUG(dbgs() << "SLP: Need to extract:" << *U << " from lane " << 988 Lane << " from " << *Scalar << ".\n"); 989 ExternalUses.push_back(ExternalUser(Scalar, U, Lane)); 990 } 991 } 992 } 993 } 994 995 996 void BoUpSLP::buildTree_rec(ArrayRef<Value *> VL, unsigned Depth) { 997 bool SameTy = getSameType(VL); (void)SameTy; 998 bool isAltShuffle = false; 999 assert(SameTy && "Invalid types!"); 1000 1001 if (Depth == RecursionMaxDepth) { 1002 DEBUG(dbgs() << "SLP: Gathering due to max recursion depth.\n"); 1003 newTreeEntry(VL, false); 1004 return; 1005 } 1006 1007 // Don't handle vectors. 1008 if (VL[0]->getType()->isVectorTy()) { 1009 DEBUG(dbgs() << "SLP: Gathering due to vector type.\n"); 1010 newTreeEntry(VL, false); 1011 return; 1012 } 1013 1014 if (StoreInst *SI = dyn_cast<StoreInst>(VL[0])) 1015 if (SI->getValueOperand()->getType()->isVectorTy()) { 1016 DEBUG(dbgs() << "SLP: Gathering due to store vector type.\n"); 1017 newTreeEntry(VL, false); 1018 return; 1019 } 1020 unsigned Opcode = getSameOpcode(VL); 1021 1022 // Check that this shuffle vector refers to the alternate 1023 // sequence of opcodes. 1024 if (Opcode == Instruction::ShuffleVector) { 1025 Instruction *I0 = dyn_cast<Instruction>(VL[0]); 1026 unsigned Op = I0->getOpcode(); 1027 if (Op != Instruction::ShuffleVector) 1028 isAltShuffle = true; 1029 } 1030 1031 // If all of the operands are identical or constant we have a simple solution. 1032 if (allConstant(VL) || isSplat(VL) || !getSameBlock(VL) || !Opcode) { 1033 DEBUG(dbgs() << "SLP: Gathering due to C,S,B,O. 
\n"); 1034 newTreeEntry(VL, false); 1035 return; 1036 } 1037 1038 // We now know that this is a vector of instructions of the same type from 1039 // the same block. 1040 1041 // Don't vectorize ephemeral values. 1042 for (unsigned i = 0, e = VL.size(); i != e; ++i) { 1043 if (EphValues.count(VL[i])) { 1044 DEBUG(dbgs() << "SLP: The instruction (" << *VL[i] << 1045 ") is ephemeral.\n"); 1046 newTreeEntry(VL, false); 1047 return; 1048 } 1049 } 1050 1051 // Check if this is a duplicate of another entry. 1052 if (ScalarToTreeEntry.count(VL[0])) { 1053 int Idx = ScalarToTreeEntry[VL[0]]; 1054 TreeEntry *E = &VectorizableTree[Idx]; 1055 for (unsigned i = 0, e = VL.size(); i != e; ++i) { 1056 DEBUG(dbgs() << "SLP: \tChecking bundle: " << *VL[i] << ".\n"); 1057 if (E->Scalars[i] != VL[i]) { 1058 DEBUG(dbgs() << "SLP: Gathering due to partial overlap.\n"); 1059 newTreeEntry(VL, false); 1060 return; 1061 } 1062 } 1063 DEBUG(dbgs() << "SLP: Perfect diamond merge at " << *VL[0] << ".\n"); 1064 return; 1065 } 1066 1067 // Check that none of the instructions in the bundle are already in the tree. 1068 for (unsigned i = 0, e = VL.size(); i != e; ++i) { 1069 if (ScalarToTreeEntry.count(VL[i])) { 1070 DEBUG(dbgs() << "SLP: The instruction (" << *VL[i] << 1071 ") is already in tree.\n"); 1072 newTreeEntry(VL, false); 1073 return; 1074 } 1075 } 1076 1077 // If any of the scalars is marked as a value that needs to stay scalar then 1078 // we need to gather the scalars. 1079 for (unsigned i = 0, e = VL.size(); i != e; ++i) { 1080 if (MustGather.count(VL[i])) { 1081 DEBUG(dbgs() << "SLP: Gathering due to gathered scalar.\n"); 1082 newTreeEntry(VL, false); 1083 return; 1084 } 1085 } 1086 1087 // Check that all of the users of the scalars that we want to vectorize are 1088 // schedulable. 1089 Instruction *VL0 = cast<Instruction>(VL[0]); 1090 BasicBlock *BB = cast<Instruction>(VL0)->getParent(); 1091 1092 if (!DT->isReachableFromEntry(BB)) { 1093 // Don't go into unreachable blocks. 
They may contain instructions with 1094 // dependency cycles which confuse the final scheduling. 1095 DEBUG(dbgs() << "SLP: bundle in unreachable block.\n"); 1096 newTreeEntry(VL, false); 1097 return; 1098 } 1099 1100 // Check that every instructions appears once in this bundle. 1101 for (unsigned i = 0, e = VL.size(); i < e; ++i) 1102 for (unsigned j = i+1; j < e; ++j) 1103 if (VL[i] == VL[j]) { 1104 DEBUG(dbgs() << "SLP: Scalar used twice in bundle.\n"); 1105 newTreeEntry(VL, false); 1106 return; 1107 } 1108 1109 auto &BSRef = BlocksSchedules[BB]; 1110 if (!BSRef) { 1111 BSRef = llvm::make_unique<BlockScheduling>(BB); 1112 } 1113 BlockScheduling &BS = *BSRef.get(); 1114 1115 if (!BS.tryScheduleBundle(VL, this)) { 1116 DEBUG(dbgs() << "SLP: We are not able to schedule this bundle!\n"); 1117 assert((!BS.getScheduleData(VL[0]) || 1118 !BS.getScheduleData(VL[0])->isPartOfBundle()) && 1119 "tryScheduleBundle should cancelScheduling on failure"); 1120 newTreeEntry(VL, false); 1121 return; 1122 } 1123 DEBUG(dbgs() << "SLP: We are able to schedule this bundle.\n"); 1124 1125 switch (Opcode) { 1126 case Instruction::PHI: { 1127 PHINode *PH = dyn_cast<PHINode>(VL0); 1128 1129 // Check for terminator values (e.g. invoke). 1130 for (unsigned j = 0; j < VL.size(); ++j) 1131 for (unsigned i = 0, e = PH->getNumIncomingValues(); i < e; ++i) { 1132 TerminatorInst *Term = dyn_cast<TerminatorInst>( 1133 cast<PHINode>(VL[j])->getIncomingValueForBlock(PH->getIncomingBlock(i))); 1134 if (Term) { 1135 DEBUG(dbgs() << "SLP: Need to swizzle PHINodes (TerminatorInst use).\n"); 1136 BS.cancelScheduling(VL); 1137 newTreeEntry(VL, false); 1138 return; 1139 } 1140 } 1141 1142 newTreeEntry(VL, true); 1143 DEBUG(dbgs() << "SLP: added a vector of PHINodes.\n"); 1144 1145 for (unsigned i = 0, e = PH->getNumIncomingValues(); i < e; ++i) { 1146 ValueList Operands; 1147 // Prepare the operand vector. 
        for (unsigned j = 0; j < VL.size(); ++j)
          Operands.push_back(cast<PHINode>(VL[j])->getIncomingValueForBlock(
              PH->getIncomingBlock(i)));

        buildTree_rec(Operands, Depth + 1);
      }
      return;
    }
    case Instruction::ExtractElement: {
      bool Reuse = CanReuseExtract(VL);
      if (Reuse) {
        DEBUG(dbgs() << "SLP: Reusing extract sequence.\n");
      } else {
        // Not reusable: the bundle will be gathered, so undo the dry-run
        // scheduling before recording it.
        BS.cancelScheduling(VL);
      }
      newTreeEntry(VL, Reuse);
      return;
    }
    case Instruction::Load: {
      // Check that a vectorized load would load the same memory as a scalar
      // load.
      // For example we don't want vectorize loads that are smaller than 8 bit.
      // Even though we have a packed struct {<i2, i2, i2, i2>} LLVM treats
      // loading/storing it as an i8 struct. If we vectorize loads/stores from
      // such a struct we read/write packed bits disagreeing with the
      // unvectorized version.
      const DataLayout &DL = F->getParent()->getDataLayout();
      Type *ScalarTy = VL[0]->getType();

      if (DL.getTypeSizeInBits(ScalarTy) !=
          DL.getTypeAllocSizeInBits(ScalarTy)) {
        BS.cancelScheduling(VL);
        newTreeEntry(VL, false);
        DEBUG(dbgs() << "SLP: Gathering loads of non-packed type.\n");
        return;
      }
      // Check if the loads are consecutive or if we need to swizzle them.
      for (unsigned i = 0, e = VL.size() - 1; i < e; ++i) {
        LoadInst *L = cast<LoadInst>(VL[i]);
        if (!L->isSimple()) {
          BS.cancelScheduling(VL);
          newTreeEntry(VL, false);
          DEBUG(dbgs() << "SLP: Gathering non-simple loads.\n");
          return;
        }

        if (!isConsecutiveAccess(VL[i], VL[i + 1], DL)) {
          // A size-2 bundle that is consecutive in reverse order could still
          // be vectorized if reordered; count it as a hint for the caller.
          if (VL.size() == 2 && isConsecutiveAccess(VL[1], VL[0], DL)) {
            ++NumLoadsWantToChangeOrder;
          }
          BS.cancelScheduling(VL);
          newTreeEntry(VL, false);
          DEBUG(dbgs() << "SLP: Gathering non-consecutive loads.\n");
          return;
        }
      }
      ++NumLoadsWantToKeepOrder;
      newTreeEntry(VL, true);
      DEBUG(dbgs() << "SLP: added a vector of loads.\n");
      return;
    }
    case Instruction::ZExt:
    case Instruction::SExt:
    case Instruction::FPToUI:
    case Instruction::FPToSI:
    case Instruction::FPExt:
    case Instruction::PtrToInt:
    case Instruction::IntToPtr:
    case Instruction::SIToFP:
    case Instruction::UIToFP:
    case Instruction::Trunc:
    case Instruction::FPTrunc:
    case Instruction::BitCast: {
      // All casts must share the same (valid) source element type.
      Type *SrcTy = VL0->getOperand(0)->getType();
      for (unsigned i = 0; i < VL.size(); ++i) {
        Type *Ty = cast<Instruction>(VL[i])->getOperand(0)->getType();
        if (Ty != SrcTy || !isValidElementType(Ty)) {
          BS.cancelScheduling(VL);
          newTreeEntry(VL, false);
          DEBUG(dbgs() << "SLP: Gathering casts with different src types.\n");
          return;
        }
      }
      newTreeEntry(VL, true);
      DEBUG(dbgs() << "SLP: added a vector of casts.\n");

      for (unsigned i = 0, e = VL0->getNumOperands(); i < e; ++i) {
        ValueList Operands;
        // Prepare the operand vector.
        for (unsigned j = 0; j < VL.size(); ++j)
          Operands.push_back(cast<Instruction>(VL[j])->getOperand(i));

        buildTree_rec(Operands, Depth+1);
      }
      return;
    }
    case Instruction::ICmp:
    case Instruction::FCmp: {
      // Check that all of the compares have the same predicate.
      CmpInst::Predicate P0 = cast<CmpInst>(VL0)->getPredicate();
      Type *ComparedTy = cast<Instruction>(VL[0])->getOperand(0)->getType();
      for (unsigned i = 1, e = VL.size(); i < e; ++i) {
        CmpInst *Cmp = cast<CmpInst>(VL[i]);
        // Operand types must match too, or no single vector compare exists.
        if (Cmp->getPredicate() != P0 ||
            Cmp->getOperand(0)->getType() != ComparedTy) {
          BS.cancelScheduling(VL);
          newTreeEntry(VL, false);
          DEBUG(dbgs() << "SLP: Gathering cmp with different predicate.\n");
          return;
        }
      }

      newTreeEntry(VL, true);
      DEBUG(dbgs() << "SLP: added a vector of compares.\n");

      for (unsigned i = 0, e = VL0->getNumOperands(); i < e; ++i) {
        ValueList Operands;
        // Prepare the operand vector.
        for (unsigned j = 0; j < VL.size(); ++j)
          Operands.push_back(cast<Instruction>(VL[j])->getOperand(i));

        buildTree_rec(Operands, Depth+1);
      }
      return;
    }
    // Selects and binary operators are handled uniformly: record the bundle
    // and recurse into each operand position.
    case Instruction::Select:
    case Instruction::Add:
    case Instruction::FAdd:
    case Instruction::Sub:
    case Instruction::FSub:
    case Instruction::Mul:
    case Instruction::FMul:
    case Instruction::UDiv:
    case Instruction::SDiv:
    case Instruction::FDiv:
    case Instruction::URem:
    case Instruction::SRem:
    case Instruction::FRem:
    case Instruction::Shl:
    case Instruction::LShr:
    case Instruction::AShr:
    case Instruction::And:
    case Instruction::Or:
    case Instruction::Xor: {
      newTreeEntry(VL, true);
      DEBUG(dbgs() << "SLP: added a vector of bin op.\n");

      // Sort operands of the instructions so that each side is more likely to
      // have the same opcode.
      if (isa<BinaryOperator>(VL0) && VL0->isCommutative()) {
        ValueList Left, Right;
        reorderInputsAccordingToOpcode(VL, Left, Right);
        buildTree_rec(Left, Depth + 1);
        buildTree_rec(Right, Depth + 1);
        return;
      }

      for (unsigned i = 0, e = VL0->getNumOperands(); i < e; ++i) {
        ValueList Operands;
        // Prepare the operand vector.
        for (unsigned j = 0; j < VL.size(); ++j)
          Operands.push_back(cast<Instruction>(VL[j])->getOperand(i));

        buildTree_rec(Operands, Depth+1);
      }
      return;
    }
    case Instruction::GetElementPtr: {
      // We don't combine GEPs with complicated (nested) indexing.
      for (unsigned j = 0; j < VL.size(); ++j) {
        if (cast<Instruction>(VL[j])->getNumOperands() != 2) {
          DEBUG(dbgs() << "SLP: not-vectorizable GEP (nested indexes).\n");
          BS.cancelScheduling(VL);
          newTreeEntry(VL, false);
          return;
        }
      }

      // We can't combine several GEPs into one vector if they operate on
      // different types.
      Type *Ty0 = cast<Instruction>(VL0)->getOperand(0)->getType();
      for (unsigned j = 0; j < VL.size(); ++j) {
        Type *CurTy = cast<Instruction>(VL[j])->getOperand(0)->getType();
        if (Ty0 != CurTy) {
          DEBUG(dbgs() << "SLP: not-vectorizable GEP (different types).\n");
          BS.cancelScheduling(VL);
          newTreeEntry(VL, false);
          return;
        }
      }

      // We don't combine GEPs with non-constant indexes.
      for (unsigned j = 0; j < VL.size(); ++j) {
        auto Op = cast<Instruction>(VL[j])->getOperand(1);
        if (!isa<ConstantInt>(Op)) {
          DEBUG(
              dbgs() << "SLP: not-vectorizable GEP (non-constant indexes).\n");
          BS.cancelScheduling(VL);
          newTreeEntry(VL, false);
          return;
        }
      }

      newTreeEntry(VL, true);
      DEBUG(dbgs() << "SLP: added a vector of GEPs.\n");
      // Exactly two operands per GEP (checked above): pointer and index.
      for (unsigned i = 0, e = 2; i < e; ++i) {
        ValueList Operands;
        // Prepare the operand vector.
1356 for (unsigned j = 0; j < VL.size(); ++j) 1357 Operands.push_back(cast<Instruction>(VL[j])->getOperand(i)); 1358 1359 buildTree_rec(Operands, Depth + 1); 1360 } 1361 return; 1362 } 1363 case Instruction::Store: { 1364 const DataLayout &DL = F->getParent()->getDataLayout(); 1365 // Check if the stores are consecutive or of we need to swizzle them. 1366 for (unsigned i = 0, e = VL.size() - 1; i < e; ++i) 1367 if (!isConsecutiveAccess(VL[i], VL[i + 1], DL)) { 1368 BS.cancelScheduling(VL); 1369 newTreeEntry(VL, false); 1370 DEBUG(dbgs() << "SLP: Non-consecutive store.\n"); 1371 return; 1372 } 1373 1374 newTreeEntry(VL, true); 1375 DEBUG(dbgs() << "SLP: added a vector of stores.\n"); 1376 1377 ValueList Operands; 1378 for (unsigned j = 0; j < VL.size(); ++j) 1379 Operands.push_back(cast<Instruction>(VL[j])->getOperand(0)); 1380 1381 buildTree_rec(Operands, Depth + 1); 1382 return; 1383 } 1384 case Instruction::Call: { 1385 // Check if the calls are all to the same vectorizable intrinsic. 
1386 CallInst *CI = cast<CallInst>(VL[0]); 1387 // Check if this is an Intrinsic call or something that can be 1388 // represented by an intrinsic call 1389 Intrinsic::ID ID = getIntrinsicIDForCall(CI, TLI); 1390 if (!isTriviallyVectorizable(ID)) { 1391 BS.cancelScheduling(VL); 1392 newTreeEntry(VL, false); 1393 DEBUG(dbgs() << "SLP: Non-vectorizable call.\n"); 1394 return; 1395 } 1396 Function *Int = CI->getCalledFunction(); 1397 Value *A1I = nullptr; 1398 if (hasVectorInstrinsicScalarOpd(ID, 1)) 1399 A1I = CI->getArgOperand(1); 1400 for (unsigned i = 1, e = VL.size(); i != e; ++i) { 1401 CallInst *CI2 = dyn_cast<CallInst>(VL[i]); 1402 if (!CI2 || CI2->getCalledFunction() != Int || 1403 getIntrinsicIDForCall(CI2, TLI) != ID) { 1404 BS.cancelScheduling(VL); 1405 newTreeEntry(VL, false); 1406 DEBUG(dbgs() << "SLP: mismatched calls:" << *CI << "!=" << *VL[i] 1407 << "\n"); 1408 return; 1409 } 1410 // ctlz,cttz and powi are special intrinsics whose second argument 1411 // should be same in order for them to be vectorized. 1412 if (hasVectorInstrinsicScalarOpd(ID, 1)) { 1413 Value *A1J = CI2->getArgOperand(1); 1414 if (A1I != A1J) { 1415 BS.cancelScheduling(VL); 1416 newTreeEntry(VL, false); 1417 DEBUG(dbgs() << "SLP: mismatched arguments in call:" << *CI 1418 << " argument "<< A1I<<"!=" << A1J 1419 << "\n"); 1420 return; 1421 } 1422 } 1423 } 1424 1425 newTreeEntry(VL, true); 1426 for (unsigned i = 0, e = CI->getNumArgOperands(); i != e; ++i) { 1427 ValueList Operands; 1428 // Prepare the operand vector. 1429 for (unsigned j = 0; j < VL.size(); ++j) { 1430 CallInst *CI2 = dyn_cast<CallInst>(VL[j]); 1431 Operands.push_back(CI2->getArgOperand(i)); 1432 } 1433 buildTree_rec(Operands, Depth + 1); 1434 } 1435 return; 1436 } 1437 case Instruction::ShuffleVector: { 1438 // If this is not an alternate sequence of opcode like add-sub 1439 // then do not vectorize this instruction. 
      if (!isAltShuffle) {
        BS.cancelScheduling(VL);
        newTreeEntry(VL, false);
        DEBUG(dbgs() << "SLP: ShuffleVector are not vectorized.\n");
        return;
      }
      newTreeEntry(VL, true);
      DEBUG(dbgs() << "SLP: added a ShuffleVector op.\n");

      // Reorder operands if reordering would enable vectorization.
      if (isa<BinaryOperator>(VL0)) {
        ValueList Left, Right;
        reorderAltShuffleOperands(VL, Left, Right);
        buildTree_rec(Left, Depth + 1);
        buildTree_rec(Right, Depth + 1);
        return;
      }

      for (unsigned i = 0, e = VL0->getNumOperands(); i < e; ++i) {
        ValueList Operands;
        // Prepare the operand vector.
        for (unsigned j = 0; j < VL.size(); ++j)
          Operands.push_back(cast<Instruction>(VL[j])->getOperand(i));

        buildTree_rec(Operands, Depth + 1);
      }
      return;
    }
    default:
      // Unknown/unhandled opcode: gather.
      BS.cancelScheduling(VL);
      newTreeEntry(VL, false);
      DEBUG(dbgs() << "SLP: Gathering unknown instruction.\n");
      return;
  }
}

// Returns the cost delta (vector cost - scalar cost) of vectorizing the
// bundle in \p E; negative values mean vectorization is profitable.
int BoUpSLP::getEntryCost(TreeEntry *E) {
  ArrayRef<Value*> VL = E->Scalars;

  Type *ScalarTy = VL[0]->getType();
  if (StoreInst *SI = dyn_cast<StoreInst>(VL[0]))
    ScalarTy = SI->getValueOperand()->getType();
  VectorType *VecTy = VectorType::get(ScalarTy, VL.size());

  if (E->NeedToGather) {
    if (allConstant(VL))
      return 0;
    if (isSplat(VL)) {
      return TTI->getShuffleCost(TargetTransformInfo::SK_Broadcast, VecTy, 0);
    }
    return getGatherCost(E->Scalars);
  }
  unsigned Opcode = getSameOpcode(VL);
  assert(Opcode && getSameType(VL) && getSameBlock(VL) && "Invalid VL");
  Instruction *VL0 = cast<Instruction>(VL[0]);
  switch (Opcode) {
    case Instruction::PHI: {
      return 0;
    }
    case Instruction::ExtractElement: {
      if (CanReuseExtract(VL)) {
        int DeadCost = 0;
        // NOTE(review): this local 'E' shadows the TreeEntry* parameter 'E';
        // consider renaming for clarity.
        for (unsigned i = 0, e = VL.size(); i < e; ++i) {
          ExtractElementInst *E = cast<ExtractElementInst>(VL[i]);
          if (E->hasOneUse())
            // Take credit for instruction that will become dead.
            DeadCost +=
                TTI->getVectorInstrCost(Instruction::ExtractElement, VecTy, i);
        }
        return -DeadCost;
      }
      return getGatherCost(VecTy);
    }
    case Instruction::ZExt:
    case Instruction::SExt:
    case Instruction::FPToUI:
    case Instruction::FPToSI:
    case Instruction::FPExt:
    case Instruction::PtrToInt:
    case Instruction::IntToPtr:
    case Instruction::SIToFP:
    case Instruction::UIToFP:
    case Instruction::Trunc:
    case Instruction::FPTrunc:
    case Instruction::BitCast: {
      Type *SrcTy = VL0->getOperand(0)->getType();

      // Calculate the cost of this instruction.
      int ScalarCost = VL.size() * TTI->getCastInstrCost(VL0->getOpcode(),
                                                         VL0->getType(), SrcTy);

      VectorType *SrcVecTy = VectorType::get(SrcTy, VL.size());
      int VecCost = TTI->getCastInstrCost(VL0->getOpcode(), VecTy, SrcVecTy);
      return VecCost - ScalarCost;
    }
    case Instruction::FCmp:
    case Instruction::ICmp:
    case Instruction::Select:
    case Instruction::Add:
    case Instruction::FAdd:
    case Instruction::Sub:
    case Instruction::FSub:
    case Instruction::Mul:
    case Instruction::FMul:
    case Instruction::UDiv:
    case Instruction::SDiv:
    case Instruction::FDiv:
    case Instruction::URem:
    case Instruction::SRem:
    case Instruction::FRem:
    case Instruction::Shl:
    case Instruction::LShr:
    case Instruction::AShr:
    case Instruction::And:
    case Instruction::Or:
    case Instruction::Xor: {
      // Calculate the cost of this instruction.
      int ScalarCost = 0;
      int VecCost = 0;
      if (Opcode == Instruction::FCmp || Opcode == Instruction::ICmp ||
          Opcode == Instruction::Select) {
        // For cmp/select the condition type is i1 scalar / <N x i1> vector.
        VectorType *MaskTy = VectorType::get(Builder.getInt1Ty(), VL.size());
        ScalarCost = VecTy->getNumElements() *
            TTI->getCmpSelInstrCost(Opcode, ScalarTy, Builder.getInt1Ty());
        VecCost = TTI->getCmpSelInstrCost(Opcode, VecTy, MaskTy);
      } else {
        // Certain instructions can be cheaper to vectorize if they have a
        // constant second vector operand.
        TargetTransformInfo::OperandValueKind Op1VK =
            TargetTransformInfo::OK_AnyValue;
        TargetTransformInfo::OperandValueKind Op2VK =
            TargetTransformInfo::OK_UniformConstantValue;
        TargetTransformInfo::OperandValueProperties Op1VP =
            TargetTransformInfo::OP_None;
        TargetTransformInfo::OperandValueProperties Op2VP =
            TargetTransformInfo::OP_None;

        // If all operands are exactly the same ConstantInt then set the
        // operand kind to OK_UniformConstantValue.
        // If instead not all operands are constants, then set the operand kind
        // to OK_AnyValue. If all operands are constants but not the same,
        // then set the operand kind to OK_NonUniformConstantValue.
        ConstantInt *CInt = nullptr;
        for (unsigned i = 0; i < VL.size(); ++i) {
          const Instruction *I = cast<Instruction>(VL[i]);
          if (!isa<ConstantInt>(I->getOperand(1))) {
            Op2VK = TargetTransformInfo::OK_AnyValue;
            break;
          }
          if (i == 0) {
            CInt = cast<ConstantInt>(I->getOperand(1));
            continue;
          }
          if (Op2VK == TargetTransformInfo::OK_UniformConstantValue &&
              CInt != cast<ConstantInt>(I->getOperand(1)))
            Op2VK = TargetTransformInfo::OK_NonUniformConstantValue;
        }
        // FIXME: Currently cost of model modification for division by
        // power of 2 is handled only for X86. Add support for other targets.
        if (Op2VK == TargetTransformInfo::OK_UniformConstantValue && CInt &&
            CInt->getValue().isPowerOf2())
          Op2VP = TargetTransformInfo::OP_PowerOf2;

        ScalarCost = VecTy->getNumElements() *
            TTI->getArithmeticInstrCost(Opcode, ScalarTy, Op1VK, Op2VK,
                                        Op1VP, Op2VP);
        VecCost = TTI->getArithmeticInstrCost(Opcode, VecTy, Op1VK, Op2VK,
                                              Op1VP, Op2VP);
      }
      return VecCost - ScalarCost;
    }
    case Instruction::GetElementPtr: {
      // Model a vectorized GEP as a vector add of the (constant) indices.
      TargetTransformInfo::OperandValueKind Op1VK =
          TargetTransformInfo::OK_AnyValue;
      TargetTransformInfo::OperandValueKind Op2VK =
          TargetTransformInfo::OK_UniformConstantValue;

      int ScalarCost =
          VecTy->getNumElements() *
          TTI->getArithmeticInstrCost(Instruction::Add, ScalarTy, Op1VK, Op2VK);
      int VecCost =
          TTI->getArithmeticInstrCost(Instruction::Add, VecTy, Op1VK, Op2VK);

      return VecCost - ScalarCost;
    }
    case Instruction::Load: {
      // Cost of wide load - cost of scalar loads.
      int ScalarLdCost = VecTy->getNumElements() *
          TTI->getMemoryOpCost(Instruction::Load, ScalarTy, 1, 0);
      int VecLdCost = TTI->getMemoryOpCost(Instruction::Load, VecTy, 1, 0);
      return VecLdCost - ScalarLdCost;
    }
    case Instruction::Store: {
      // We know that we can merge the stores. Calculate the cost.
      int ScalarStCost = VecTy->getNumElements() *
          TTI->getMemoryOpCost(Instruction::Store, ScalarTy, 1, 0);
      int VecStCost = TTI->getMemoryOpCost(Instruction::Store, VecTy, 1, 0);
      return VecStCost - ScalarStCost;
    }
    case Instruction::Call: {
      CallInst *CI = cast<CallInst>(VL0);
      Intrinsic::ID ID = getIntrinsicIDForCall(CI, TLI);

      // Calculate the cost of the scalar and vector calls.
1644 SmallVector<Type*, 4> ScalarTys, VecTys; 1645 for (unsigned op = 0, opc = CI->getNumArgOperands(); op!= opc; ++op) { 1646 ScalarTys.push_back(CI->getArgOperand(op)->getType()); 1647 VecTys.push_back(VectorType::get(CI->getArgOperand(op)->getType(), 1648 VecTy->getNumElements())); 1649 } 1650 1651 int ScalarCallCost = VecTy->getNumElements() * 1652 TTI->getIntrinsicInstrCost(ID, ScalarTy, ScalarTys); 1653 1654 int VecCallCost = TTI->getIntrinsicInstrCost(ID, VecTy, VecTys); 1655 1656 DEBUG(dbgs() << "SLP: Call cost "<< VecCallCost - ScalarCallCost 1657 << " (" << VecCallCost << "-" << ScalarCallCost << ")" 1658 << " for " << *CI << "\n"); 1659 1660 return VecCallCost - ScalarCallCost; 1661 } 1662 case Instruction::ShuffleVector: { 1663 TargetTransformInfo::OperandValueKind Op1VK = 1664 TargetTransformInfo::OK_AnyValue; 1665 TargetTransformInfo::OperandValueKind Op2VK = 1666 TargetTransformInfo::OK_AnyValue; 1667 int ScalarCost = 0; 1668 int VecCost = 0; 1669 for (unsigned i = 0; i < VL.size(); ++i) { 1670 Instruction *I = cast<Instruction>(VL[i]); 1671 if (!I) 1672 break; 1673 ScalarCost += 1674 TTI->getArithmeticInstrCost(I->getOpcode(), ScalarTy, Op1VK, Op2VK); 1675 } 1676 // VecCost is equal to sum of the cost of creating 2 vectors 1677 // and the cost of creating shuffle. 
1678 Instruction *I0 = cast<Instruction>(VL[0]); 1679 VecCost = 1680 TTI->getArithmeticInstrCost(I0->getOpcode(), VecTy, Op1VK, Op2VK); 1681 Instruction *I1 = cast<Instruction>(VL[1]); 1682 VecCost += 1683 TTI->getArithmeticInstrCost(I1->getOpcode(), VecTy, Op1VK, Op2VK); 1684 VecCost += 1685 TTI->getShuffleCost(TargetTransformInfo::SK_Alternate, VecTy, 0); 1686 return VecCost - ScalarCost; 1687 } 1688 default: 1689 llvm_unreachable("Unknown instruction"); 1690 } 1691 } 1692 1693 bool BoUpSLP::isFullyVectorizableTinyTree() { 1694 DEBUG(dbgs() << "SLP: Check whether the tree with height " << 1695 VectorizableTree.size() << " is fully vectorizable .\n"); 1696 1697 // We only handle trees of height 2. 1698 if (VectorizableTree.size() != 2) 1699 return false; 1700 1701 // Handle splat and all-constants stores. 1702 if (!VectorizableTree[0].NeedToGather && 1703 (allConstant(VectorizableTree[1].Scalars) || 1704 isSplat(VectorizableTree[1].Scalars))) 1705 return true; 1706 1707 // Gathering cost would be too much for tiny trees. 1708 if (VectorizableTree[0].NeedToGather || VectorizableTree[1].NeedToGather) 1709 return false; 1710 1711 return true; 1712 } 1713 1714 int BoUpSLP::getSpillCost() { 1715 // Walk from the bottom of the tree to the top, tracking which values are 1716 // live. When we see a call instruction that is not part of our tree, 1717 // query TTI to see if there is a cost to keeping values live over it 1718 // (for example, if spills and fills are required). 
  unsigned BundleWidth = VectorizableTree.front().Scalars.size();
  int Cost = 0;

  SmallPtrSet<Instruction*, 4> LiveValues;
  Instruction *PrevInst = nullptr;

  for (unsigned N = 0; N < VectorizableTree.size(); ++N) {
    Instruction *Inst = dyn_cast<Instruction>(VectorizableTree[N].Scalars[0]);
    if (!Inst)
      continue;

    if (!PrevInst) {
      PrevInst = Inst;
      continue;
    }

    DEBUG(
      dbgs() << "SLP: #LV: " << LiveValues.size();
      for (auto *X : LiveValues)
        dbgs() << " " << X->getName();
      dbgs() << ", Looking at ";
      Inst->dump();
      );

    // Update LiveValues.
    LiveValues.erase(PrevInst);
    for (auto &J : PrevInst->operands()) {
      if (isa<Instruction>(&*J) && ScalarToTreeEntry.count(&*J))
        LiveValues.insert(cast<Instruction>(&*J));
    }

    // Now find the sequence of instructions between PrevInst and Inst.
    // Walk backwards from Inst towards PrevInst, wrapping to Inst's block
    // begin if the blocks differ; charge a spill cost for each intervening
    // call that is not part of the tree.
    BasicBlock::reverse_iterator InstIt(Inst->getIterator()),
        PrevInstIt(PrevInst->getIterator());
    --PrevInstIt;
    while (InstIt != PrevInstIt) {
      if (PrevInstIt == PrevInst->getParent()->rend()) {
        PrevInstIt = Inst->getParent()->rbegin();
        continue;
      }

      if (isa<CallInst>(&*PrevInstIt) && &*PrevInstIt != PrevInst) {
        SmallVector<Type*, 4> V;
        for (auto *II : LiveValues)
          V.push_back(VectorType::get(II->getType(), BundleWidth));
        Cost += TTI->getCostOfKeepingLiveOverCall(V);
      }

      ++PrevInstIt;
    }

    PrevInst = Inst;
  }

  DEBUG(dbgs() << "SLP: SpillCost=" << Cost << "\n");
  return Cost;
}

// Total cost of vectorizing the whole tree: per-entry costs, plus the cost
// of extracting externally used scalars, plus the spill cost. INT_MAX is
// returned for tiny trees that should not be vectorized at all.
int BoUpSLP::getTreeCost() {
  int Cost = 0;
  DEBUG(dbgs() << "SLP: Calculating cost for tree of size " <<
        VectorizableTree.size() << ".\n");

  // We only vectorize tiny trees if it is fully vectorizable.
  if (VectorizableTree.size() < 3 && !isFullyVectorizableTinyTree()) {
    if (VectorizableTree.empty()) {
      assert(!ExternalUses.size() && "We should not have any external users");
    }
    return INT_MAX;
  }

  unsigned BundleWidth = VectorizableTree[0].Scalars.size();

  for (TreeEntry &TE : VectorizableTree) {
    int C = getEntryCost(&TE);
    // NOTE(review): streaming TE.Scalars[0] (a Value *) prints the pointer
    // value, not the IR; this likely wants "*TE.Scalars[0]" — confirm.
    DEBUG(dbgs() << "SLP: Adding cost " << C << " for bundle that starts with "
                 << TE.Scalars[0] << " .\n");
    Cost += C;
  }

  SmallSet<Value *, 16> ExtractCostCalculated;
  int ExtractCost = 0;
  for (ExternalUser &EU : ExternalUses) {
    // We only add extract cost once for the same scalar.
    if (!ExtractCostCalculated.insert(EU.Scalar).second)
      continue;

    // Uses by ephemeral values are free (because the ephemeral value will be
    // removed prior to code generation, and so the extraction will be
    // removed as well).
    if (EphValues.count(EU.User))
      continue;

    VectorType *VecTy = VectorType::get(EU.Scalar->getType(), BundleWidth);
    ExtractCost += TTI->getVectorInstrCost(Instruction::ExtractElement, VecTy,
                                           EU.Lane);
  }

  Cost += getSpillCost();

  DEBUG(dbgs() << "SLP: Total Cost " << Cost + ExtractCost<< ".\n");
  return Cost + ExtractCost;
}

// Cost of building a vector of type \p Ty one insertelement at a time.
int BoUpSLP::getGatherCost(Type *Ty) {
  int Cost = 0;
  for (unsigned i = 0, e = cast<VectorType>(Ty)->getNumElements(); i < e; ++i)
    Cost += TTI->getVectorInstrCost(Instruction::InsertElement, Ty, i);
  return Cost;
}

int BoUpSLP::getGatherCost(ArrayRef<Value *> VL) {
  // Find the type of the operands in VL.
  Type *ScalarTy = VL[0]->getType();
  if (StoreInst *SI = dyn_cast<StoreInst>(VL[0]))
    ScalarTy = SI->getValueOperand()->getType();
  VectorType *VecTy = VectorType::get(ScalarTy, VL.size());
  // Find the cost of inserting/extracting values from the vector.
  return getGatherCost(VecTy);
}

// Returns the pointer operand of a load or store; null for anything else.
Value *BoUpSLP::getPointerOperand(Value *I) {
  if (LoadInst *LI = dyn_cast<LoadInst>(I))
    return LI->getPointerOperand();
  if (StoreInst *SI = dyn_cast<StoreInst>(I))
    return SI->getPointerOperand();
  return nullptr;
}

// Returns the address space of a load/store pointer. For non-memory
// instructions -1 wraps to the maximum unsigned value, which serves as an
// "invalid" sentinel (callers only compare the two results for equality).
unsigned BoUpSLP::getAddressSpaceOperand(Value *I) {
  if (LoadInst *L = dyn_cast<LoadInst>(I))
    return L->getPointerAddressSpace();
  if (StoreInst *S = dyn_cast<StoreInst>(I))
    return S->getPointerAddressSpace();
  return -1;
}

// Returns true if memory access \p B immediately follows \p A, i.e. B's
// address equals A's address plus one element size.
bool BoUpSLP::isConsecutiveAccess(Value *A, Value *B, const DataLayout &DL) {
  Value *PtrA = getPointerOperand(A);
  Value *PtrB = getPointerOperand(B);
  unsigned ASA = getAddressSpaceOperand(A);
  unsigned ASB = getAddressSpaceOperand(B);

  // Check that the address spaces match and that the pointers are valid.
  if (!PtrA || !PtrB || (ASA != ASB))
    return false;

  // Make sure that A and B are different pointers of the same type.
  if (PtrA == PtrB || PtrA->getType() != PtrB->getType())
    return false;

  unsigned PtrBitWidth = DL.getPointerSizeInBits(ASA);
  Type *Ty = cast<PointerType>(PtrA->getType())->getElementType();
  APInt Size(PtrBitWidth, DL.getTypeStoreSize(Ty));

  APInt OffsetA(PtrBitWidth, 0), OffsetB(PtrBitWidth, 0);
  PtrA = PtrA->stripAndAccumulateInBoundsConstantOffsets(DL, OffsetA);
  PtrB = PtrB->stripAndAccumulateInBoundsConstantOffsets(DL, OffsetB);

  APInt OffsetDelta = OffsetB - OffsetA;

  // Check if they are based on the same pointer. That makes the offsets
  // sufficient. (PtrA/PtrB may now be equal even though the original
  // pointers above were distinct, because constant offsets were stripped.)
  if (PtrA == PtrB)
    return OffsetDelta == Size;

  // Compute the necessary base pointer delta to have the necessary final delta
  // equal to the size.
  APInt BaseDelta = Size - OffsetDelta;

  // Otherwise compute the distance with SCEV between the base pointers.
  const SCEV *PtrSCEVA = SE->getSCEV(PtrA);
  const SCEV *PtrSCEVB = SE->getSCEV(PtrB);
  const SCEV *C = SE->getConstant(BaseDelta);
  const SCEV *X = SE->getAddExpr(PtrSCEVA, C);
  return X == PtrSCEVB;
}

// Reorder commutative operations in alternate shuffle if the resulting vectors
// are consecutive loads. This would allow us to vectorize the tree.
// If we have something like-
// load a[0] - load b[0]
// load b[1] + load a[1]
// load a[2] - load b[2]
// load a[3] + load b[3]
// Reordering the second load b[1] load a[1] would allow us to vectorize this
// code.
void BoUpSLP::reorderAltShuffleOperands(ArrayRef<Value *> VL,
                                        SmallVectorImpl<Value *> &Left,
                                        SmallVectorImpl<Value *> &Right) {
  const DataLayout &DL = F->getParent()->getDataLayout();

  // Push left and right operands of binary operation into Left and Right
  for (unsigned i = 0, e = VL.size(); i < e; ++i) {
    Left.push_back(cast<Instruction>(VL[i])->getOperand(0));
    Right.push_back(cast<Instruction>(VL[i])->getOperand(1));
  }

  // Reorder if we have a commutative operation and consecutive access
  // are on either side of the alternate instructions.
1919 for (unsigned j = 0; j < VL.size() - 1; ++j) { 1920 if (LoadInst *L = dyn_cast<LoadInst>(Left[j])) { 1921 if (LoadInst *L1 = dyn_cast<LoadInst>(Right[j + 1])) { 1922 Instruction *VL1 = cast<Instruction>(VL[j]); 1923 Instruction *VL2 = cast<Instruction>(VL[j + 1]); 1924 if (isConsecutiveAccess(L, L1, DL) && VL1->isCommutative()) { 1925 std::swap(Left[j], Right[j]); 1926 continue; 1927 } else if (isConsecutiveAccess(L, L1, DL) && VL2->isCommutative()) { 1928 std::swap(Left[j + 1], Right[j + 1]); 1929 continue; 1930 } 1931 // else unchanged 1932 } 1933 } 1934 if (LoadInst *L = dyn_cast<LoadInst>(Right[j])) { 1935 if (LoadInst *L1 = dyn_cast<LoadInst>(Left[j + 1])) { 1936 Instruction *VL1 = cast<Instruction>(VL[j]); 1937 Instruction *VL2 = cast<Instruction>(VL[j + 1]); 1938 if (isConsecutiveAccess(L, L1, DL) && VL1->isCommutative()) { 1939 std::swap(Left[j], Right[j]); 1940 continue; 1941 } else if (isConsecutiveAccess(L, L1, DL) && VL2->isCommutative()) { 1942 std::swap(Left[j + 1], Right[j + 1]); 1943 continue; 1944 } 1945 // else unchanged 1946 } 1947 } 1948 } 1949 } 1950 1951 // Return true if I should be commuted before adding it's left and right 1952 // operands to the arrays Left and Right. 1953 // 1954 // The vectorizer is trying to either have all elements one side being 1955 // instruction with the same opcode to enable further vectorization, or having 1956 // a splat to lower the vectorizing cost. 1957 static bool shouldReorderOperands(int i, Instruction &I, 1958 SmallVectorImpl<Value *> &Left, 1959 SmallVectorImpl<Value *> &Right, 1960 bool AllSameOpcodeLeft, 1961 bool AllSameOpcodeRight, bool SplatLeft, 1962 bool SplatRight) { 1963 Value *VLeft = I.getOperand(0); 1964 Value *VRight = I.getOperand(1); 1965 // If we have "SplatRight", try to see if commuting is needed to preserve it. 
1966 if (SplatRight) { 1967 if (VRight == Right[i - 1]) 1968 // Preserve SplatRight 1969 return false; 1970 if (VLeft == Right[i - 1]) { 1971 // Commuting would preserve SplatRight, but we don't want to break 1972 // SplatLeft either, i.e. preserve the original order if possible. 1973 // (FIXME: why do we care?) 1974 if (SplatLeft && VLeft == Left[i - 1]) 1975 return false; 1976 return true; 1977 } 1978 } 1979 // Symmetrically handle Right side. 1980 if (SplatLeft) { 1981 if (VLeft == Left[i - 1]) 1982 // Preserve SplatLeft 1983 return false; 1984 if (VRight == Left[i - 1]) 1985 return true; 1986 } 1987 1988 Instruction *ILeft = dyn_cast<Instruction>(VLeft); 1989 Instruction *IRight = dyn_cast<Instruction>(VRight); 1990 1991 // If we have "AllSameOpcodeRight", try to see if the left operands preserves 1992 // it and not the right, in this case we want to commute. 1993 if (AllSameOpcodeRight) { 1994 unsigned RightPrevOpcode = cast<Instruction>(Right[i - 1])->getOpcode(); 1995 if (IRight && RightPrevOpcode == IRight->getOpcode()) 1996 // Do not commute, a match on the right preserves AllSameOpcodeRight 1997 return false; 1998 if (ILeft && RightPrevOpcode == ILeft->getOpcode()) { 1999 // We have a match and may want to commute, but first check if there is 2000 // not also a match on the existing operands on the Left to preserve 2001 // AllSameOpcodeLeft, i.e. preserve the original order if possible. 2002 // (FIXME: why do we care?) 2003 if (AllSameOpcodeLeft && ILeft && 2004 cast<Instruction>(Left[i - 1])->getOpcode() == ILeft->getOpcode()) 2005 return false; 2006 return true; 2007 } 2008 } 2009 // Symmetrically handle Left side. 
2010 if (AllSameOpcodeLeft) { 2011 unsigned LeftPrevOpcode = cast<Instruction>(Left[i - 1])->getOpcode(); 2012 if (ILeft && LeftPrevOpcode == ILeft->getOpcode()) 2013 return false; 2014 if (IRight && LeftPrevOpcode == IRight->getOpcode()) 2015 return true; 2016 } 2017 return false; 2018 } 2019 2020 void BoUpSLP::reorderInputsAccordingToOpcode(ArrayRef<Value *> VL, 2021 SmallVectorImpl<Value *> &Left, 2022 SmallVectorImpl<Value *> &Right) { 2023 2024 if (VL.size()) { 2025 // Peel the first iteration out of the loop since there's nothing 2026 // interesting to do anyway and it simplifies the checks in the loop. 2027 auto VLeft = cast<Instruction>(VL[0])->getOperand(0); 2028 auto VRight = cast<Instruction>(VL[0])->getOperand(1); 2029 if (!isa<Instruction>(VRight) && isa<Instruction>(VLeft)) 2030 // Favor having instruction to the right. FIXME: why? 2031 std::swap(VLeft, VRight); 2032 Left.push_back(VLeft); 2033 Right.push_back(VRight); 2034 } 2035 2036 // Keep track if we have instructions with all the same opcode on one side. 2037 bool AllSameOpcodeLeft = isa<Instruction>(Left[0]); 2038 bool AllSameOpcodeRight = isa<Instruction>(Right[0]); 2039 // Keep track if we have one side with all the same value (broadcast). 2040 bool SplatLeft = true; 2041 bool SplatRight = true; 2042 2043 for (unsigned i = 1, e = VL.size(); i != e; ++i) { 2044 Instruction *I = cast<Instruction>(VL[i]); 2045 assert(I->isCommutative() && "Can only process commutative instruction"); 2046 // Commute to favor either a splat or maximizing having the same opcodes on 2047 // one side. 2048 if (shouldReorderOperands(i, *I, Left, Right, AllSameOpcodeLeft, 2049 AllSameOpcodeRight, SplatLeft, SplatRight)) { 2050 Left.push_back(I->getOperand(1)); 2051 Right.push_back(I->getOperand(0)); 2052 } else { 2053 Left.push_back(I->getOperand(0)); 2054 Right.push_back(I->getOperand(1)); 2055 } 2056 // Update Splat* and AllSameOpcode* after the insertion. 
2057 SplatRight = SplatRight && (Right[i - 1] == Right[i]); 2058 SplatLeft = SplatLeft && (Left[i - 1] == Left[i]); 2059 AllSameOpcodeLeft = AllSameOpcodeLeft && isa<Instruction>(Left[i]) && 2060 (cast<Instruction>(Left[i - 1])->getOpcode() == 2061 cast<Instruction>(Left[i])->getOpcode()); 2062 AllSameOpcodeRight = AllSameOpcodeRight && isa<Instruction>(Right[i]) && 2063 (cast<Instruction>(Right[i - 1])->getOpcode() == 2064 cast<Instruction>(Right[i])->getOpcode()); 2065 } 2066 2067 // If one operand end up being broadcast, return this operand order. 2068 if (SplatRight || SplatLeft) 2069 return; 2070 2071 const DataLayout &DL = F->getParent()->getDataLayout(); 2072 2073 // Finally check if we can get longer vectorizable chain by reordering 2074 // without breaking the good operand order detected above. 2075 // E.g. If we have something like- 2076 // load a[0] load b[0] 2077 // load b[1] load a[1] 2078 // load a[2] load b[2] 2079 // load a[3] load b[3] 2080 // Reordering the second load b[1] load a[1] would allow us to vectorize 2081 // this code and we still retain AllSameOpcode property. 
2082 // FIXME: This load reordering might break AllSameOpcode in some rare cases 2083 // such as- 2084 // add a[0],c[0] load b[0] 2085 // add a[1],c[2] load b[1] 2086 // b[2] load b[2] 2087 // add a[3],c[3] load b[3] 2088 for (unsigned j = 0; j < VL.size() - 1; ++j) { 2089 if (LoadInst *L = dyn_cast<LoadInst>(Left[j])) { 2090 if (LoadInst *L1 = dyn_cast<LoadInst>(Right[j + 1])) { 2091 if (isConsecutiveAccess(L, L1, DL)) { 2092 std::swap(Left[j + 1], Right[j + 1]); 2093 continue; 2094 } 2095 } 2096 } 2097 if (LoadInst *L = dyn_cast<LoadInst>(Right[j])) { 2098 if (LoadInst *L1 = dyn_cast<LoadInst>(Left[j + 1])) { 2099 if (isConsecutiveAccess(L, L1, DL)) { 2100 std::swap(Left[j + 1], Right[j + 1]); 2101 continue; 2102 } 2103 } 2104 } 2105 // else unchanged 2106 } 2107 } 2108 2109 void BoUpSLP::setInsertPointAfterBundle(ArrayRef<Value *> VL) { 2110 Instruction *VL0 = cast<Instruction>(VL[0]); 2111 BasicBlock::iterator NextInst(VL0); 2112 ++NextInst; 2113 Builder.SetInsertPoint(VL0->getParent(), NextInst); 2114 Builder.SetCurrentDebugLocation(VL0->getDebugLoc()); 2115 } 2116 2117 Value *BoUpSLP::Gather(ArrayRef<Value *> VL, VectorType *Ty) { 2118 Value *Vec = UndefValue::get(Ty); 2119 // Generate the 'InsertElement' instruction. 2120 for (unsigned i = 0; i < Ty->getNumElements(); ++i) { 2121 Vec = Builder.CreateInsertElement(Vec, VL[i], Builder.getInt32(i)); 2122 if (Instruction *Insrt = dyn_cast<Instruction>(Vec)) { 2123 GatherSeq.insert(Insrt); 2124 CSEBlocks.insert(Insrt->getParent()); 2125 2126 // Add to our 'need-to-extract' list. 2127 if (ScalarToTreeEntry.count(VL[i])) { 2128 int Idx = ScalarToTreeEntry[VL[i]]; 2129 TreeEntry *E = &VectorizableTree[Idx]; 2130 // Find which lane we need to extract. 2131 int FoundLane = -1; 2132 for (unsigned Lane = 0, LE = VL.size(); Lane != LE; ++Lane) { 2133 // Is this the lane of the scalar that we are looking for ? 
2134 if (E->Scalars[Lane] == VL[i]) { 2135 FoundLane = Lane; 2136 break; 2137 } 2138 } 2139 assert(FoundLane >= 0 && "Could not find the correct lane"); 2140 ExternalUses.push_back(ExternalUser(VL[i], Insrt, FoundLane)); 2141 } 2142 } 2143 } 2144 2145 return Vec; 2146 } 2147 2148 Value *BoUpSLP::alreadyVectorized(ArrayRef<Value *> VL) const { 2149 SmallDenseMap<Value*, int>::const_iterator Entry 2150 = ScalarToTreeEntry.find(VL[0]); 2151 if (Entry != ScalarToTreeEntry.end()) { 2152 int Idx = Entry->second; 2153 const TreeEntry *En = &VectorizableTree[Idx]; 2154 if (En->isSame(VL) && En->VectorizedValue) 2155 return En->VectorizedValue; 2156 } 2157 return nullptr; 2158 } 2159 2160 Value *BoUpSLP::vectorizeTree(ArrayRef<Value *> VL) { 2161 if (ScalarToTreeEntry.count(VL[0])) { 2162 int Idx = ScalarToTreeEntry[VL[0]]; 2163 TreeEntry *E = &VectorizableTree[Idx]; 2164 if (E->isSame(VL)) 2165 return vectorizeTree(E); 2166 } 2167 2168 Type *ScalarTy = VL[0]->getType(); 2169 if (StoreInst *SI = dyn_cast<StoreInst>(VL[0])) 2170 ScalarTy = SI->getValueOperand()->getType(); 2171 VectorType *VecTy = VectorType::get(ScalarTy, VL.size()); 2172 2173 return Gather(VL, VecTy); 2174 } 2175 2176 Value *BoUpSLP::vectorizeTree(TreeEntry *E) { 2177 IRBuilder<>::InsertPointGuard Guard(Builder); 2178 2179 if (E->VectorizedValue) { 2180 DEBUG(dbgs() << "SLP: Diamond merged for " << *E->Scalars[0] << ".\n"); 2181 return E->VectorizedValue; 2182 } 2183 2184 Instruction *VL0 = cast<Instruction>(E->Scalars[0]); 2185 Type *ScalarTy = VL0->getType(); 2186 if (StoreInst *SI = dyn_cast<StoreInst>(VL0)) 2187 ScalarTy = SI->getValueOperand()->getType(); 2188 VectorType *VecTy = VectorType::get(ScalarTy, E->Scalars.size()); 2189 2190 if (E->NeedToGather) { 2191 setInsertPointAfterBundle(E->Scalars); 2192 return Gather(E->Scalars, VecTy); 2193 } 2194 2195 const DataLayout &DL = F->getParent()->getDataLayout(); 2196 unsigned Opcode = getSameOpcode(E->Scalars); 2197 2198 switch (Opcode) { 2199 case 
Instruction::PHI: { 2200 PHINode *PH = dyn_cast<PHINode>(VL0); 2201 Builder.SetInsertPoint(PH->getParent()->getFirstNonPHI()); 2202 Builder.SetCurrentDebugLocation(PH->getDebugLoc()); 2203 PHINode *NewPhi = Builder.CreatePHI(VecTy, PH->getNumIncomingValues()); 2204 E->VectorizedValue = NewPhi; 2205 2206 // PHINodes may have multiple entries from the same block. We want to 2207 // visit every block once. 2208 SmallSet<BasicBlock*, 4> VisitedBBs; 2209 2210 for (unsigned i = 0, e = PH->getNumIncomingValues(); i < e; ++i) { 2211 ValueList Operands; 2212 BasicBlock *IBB = PH->getIncomingBlock(i); 2213 2214 if (!VisitedBBs.insert(IBB).second) { 2215 NewPhi->addIncoming(NewPhi->getIncomingValueForBlock(IBB), IBB); 2216 continue; 2217 } 2218 2219 // Prepare the operand vector. 2220 for (Value *V : E->Scalars) 2221 Operands.push_back(cast<PHINode>(V)->getIncomingValueForBlock(IBB)); 2222 2223 Builder.SetInsertPoint(IBB->getTerminator()); 2224 Builder.SetCurrentDebugLocation(PH->getDebugLoc()); 2225 Value *Vec = vectorizeTree(Operands); 2226 NewPhi->addIncoming(Vec, IBB); 2227 } 2228 2229 assert(NewPhi->getNumIncomingValues() == PH->getNumIncomingValues() && 2230 "Invalid number of incoming values"); 2231 return NewPhi; 2232 } 2233 2234 case Instruction::ExtractElement: { 2235 if (CanReuseExtract(E->Scalars)) { 2236 Value *V = VL0->getOperand(0); 2237 E->VectorizedValue = V; 2238 return V; 2239 } 2240 return Gather(E->Scalars, VecTy); 2241 } 2242 case Instruction::ZExt: 2243 case Instruction::SExt: 2244 case Instruction::FPToUI: 2245 case Instruction::FPToSI: 2246 case Instruction::FPExt: 2247 case Instruction::PtrToInt: 2248 case Instruction::IntToPtr: 2249 case Instruction::SIToFP: 2250 case Instruction::UIToFP: 2251 case Instruction::Trunc: 2252 case Instruction::FPTrunc: 2253 case Instruction::BitCast: { 2254 ValueList INVL; 2255 for (Value *V : E->Scalars) 2256 INVL.push_back(cast<Instruction>(V)->getOperand(0)); 2257 2258 setInsertPointAfterBundle(E->Scalars); 2259 
2260 Value *InVec = vectorizeTree(INVL); 2261 2262 if (Value *V = alreadyVectorized(E->Scalars)) 2263 return V; 2264 2265 CastInst *CI = dyn_cast<CastInst>(VL0); 2266 Value *V = Builder.CreateCast(CI->getOpcode(), InVec, VecTy); 2267 E->VectorizedValue = V; 2268 ++NumVectorInstructions; 2269 return V; 2270 } 2271 case Instruction::FCmp: 2272 case Instruction::ICmp: { 2273 ValueList LHSV, RHSV; 2274 for (Value *V : E->Scalars) { 2275 LHSV.push_back(cast<Instruction>(V)->getOperand(0)); 2276 RHSV.push_back(cast<Instruction>(V)->getOperand(1)); 2277 } 2278 2279 setInsertPointAfterBundle(E->Scalars); 2280 2281 Value *L = vectorizeTree(LHSV); 2282 Value *R = vectorizeTree(RHSV); 2283 2284 if (Value *V = alreadyVectorized(E->Scalars)) 2285 return V; 2286 2287 CmpInst::Predicate P0 = cast<CmpInst>(VL0)->getPredicate(); 2288 Value *V; 2289 if (Opcode == Instruction::FCmp) 2290 V = Builder.CreateFCmp(P0, L, R); 2291 else 2292 V = Builder.CreateICmp(P0, L, R); 2293 2294 E->VectorizedValue = V; 2295 ++NumVectorInstructions; 2296 return V; 2297 } 2298 case Instruction::Select: { 2299 ValueList TrueVec, FalseVec, CondVec; 2300 for (Value *V : E->Scalars) { 2301 CondVec.push_back(cast<Instruction>(V)->getOperand(0)); 2302 TrueVec.push_back(cast<Instruction>(V)->getOperand(1)); 2303 FalseVec.push_back(cast<Instruction>(V)->getOperand(2)); 2304 } 2305 2306 setInsertPointAfterBundle(E->Scalars); 2307 2308 Value *Cond = vectorizeTree(CondVec); 2309 Value *True = vectorizeTree(TrueVec); 2310 Value *False = vectorizeTree(FalseVec); 2311 2312 if (Value *V = alreadyVectorized(E->Scalars)) 2313 return V; 2314 2315 Value *V = Builder.CreateSelect(Cond, True, False); 2316 E->VectorizedValue = V; 2317 ++NumVectorInstructions; 2318 return V; 2319 } 2320 case Instruction::Add: 2321 case Instruction::FAdd: 2322 case Instruction::Sub: 2323 case Instruction::FSub: 2324 case Instruction::Mul: 2325 case Instruction::FMul: 2326 case Instruction::UDiv: 2327 case Instruction::SDiv: 2328 case 
Instruction::FDiv: 2329 case Instruction::URem: 2330 case Instruction::SRem: 2331 case Instruction::FRem: 2332 case Instruction::Shl: 2333 case Instruction::LShr: 2334 case Instruction::AShr: 2335 case Instruction::And: 2336 case Instruction::Or: 2337 case Instruction::Xor: { 2338 ValueList LHSVL, RHSVL; 2339 if (isa<BinaryOperator>(VL0) && VL0->isCommutative()) 2340 reorderInputsAccordingToOpcode(E->Scalars, LHSVL, RHSVL); 2341 else 2342 for (Value *V : E->Scalars) { 2343 LHSVL.push_back(cast<Instruction>(V)->getOperand(0)); 2344 RHSVL.push_back(cast<Instruction>(V)->getOperand(1)); 2345 } 2346 2347 setInsertPointAfterBundle(E->Scalars); 2348 2349 Value *LHS = vectorizeTree(LHSVL); 2350 Value *RHS = vectorizeTree(RHSVL); 2351 2352 if (LHS == RHS && isa<Instruction>(LHS)) { 2353 assert((VL0->getOperand(0) == VL0->getOperand(1)) && "Invalid order"); 2354 } 2355 2356 if (Value *V = alreadyVectorized(E->Scalars)) 2357 return V; 2358 2359 BinaryOperator *BinOp = cast<BinaryOperator>(VL0); 2360 Value *V = Builder.CreateBinOp(BinOp->getOpcode(), LHS, RHS); 2361 E->VectorizedValue = V; 2362 propagateIRFlags(E->VectorizedValue, E->Scalars); 2363 ++NumVectorInstructions; 2364 2365 if (Instruction *I = dyn_cast<Instruction>(V)) 2366 return propagateMetadata(I, E->Scalars); 2367 2368 return V; 2369 } 2370 case Instruction::Load: { 2371 // Loads are inserted at the head of the tree because we don't want to 2372 // sink them all the way down past store instructions. 2373 setInsertPointAfterBundle(E->Scalars); 2374 2375 LoadInst *LI = cast<LoadInst>(VL0); 2376 Type *ScalarLoadTy = LI->getType(); 2377 unsigned AS = LI->getPointerAddressSpace(); 2378 2379 Value *VecPtr = Builder.CreateBitCast(LI->getPointerOperand(), 2380 VecTy->getPointerTo(AS)); 2381 2382 // The pointer operand uses an in-tree scalar so we add the new BitCast to 2383 // ExternalUses list to make sure that an extract will be generated in the 2384 // future. 
2385 if (ScalarToTreeEntry.count(LI->getPointerOperand())) 2386 ExternalUses.push_back( 2387 ExternalUser(LI->getPointerOperand(), cast<User>(VecPtr), 0)); 2388 2389 unsigned Alignment = LI->getAlignment(); 2390 LI = Builder.CreateLoad(VecPtr); 2391 if (!Alignment) { 2392 Alignment = DL.getABITypeAlignment(ScalarLoadTy); 2393 } 2394 LI->setAlignment(Alignment); 2395 E->VectorizedValue = LI; 2396 ++NumVectorInstructions; 2397 return propagateMetadata(LI, E->Scalars); 2398 } 2399 case Instruction::Store: { 2400 StoreInst *SI = cast<StoreInst>(VL0); 2401 unsigned Alignment = SI->getAlignment(); 2402 unsigned AS = SI->getPointerAddressSpace(); 2403 2404 ValueList ValueOp; 2405 for (Value *V : E->Scalars) 2406 ValueOp.push_back(cast<StoreInst>(V)->getValueOperand()); 2407 2408 setInsertPointAfterBundle(E->Scalars); 2409 2410 Value *VecValue = vectorizeTree(ValueOp); 2411 Value *VecPtr = Builder.CreateBitCast(SI->getPointerOperand(), 2412 VecTy->getPointerTo(AS)); 2413 StoreInst *S = Builder.CreateStore(VecValue, VecPtr); 2414 2415 // The pointer operand uses an in-tree scalar so we add the new BitCast to 2416 // ExternalUses list to make sure that an extract will be generated in the 2417 // future. 
2418 if (ScalarToTreeEntry.count(SI->getPointerOperand())) 2419 ExternalUses.push_back( 2420 ExternalUser(SI->getPointerOperand(), cast<User>(VecPtr), 0)); 2421 2422 if (!Alignment) { 2423 Alignment = DL.getABITypeAlignment(SI->getValueOperand()->getType()); 2424 } 2425 S->setAlignment(Alignment); 2426 E->VectorizedValue = S; 2427 ++NumVectorInstructions; 2428 return propagateMetadata(S, E->Scalars); 2429 } 2430 case Instruction::GetElementPtr: { 2431 setInsertPointAfterBundle(E->Scalars); 2432 2433 ValueList Op0VL; 2434 for (Value *V : E->Scalars) 2435 Op0VL.push_back(cast<GetElementPtrInst>(V)->getOperand(0)); 2436 2437 Value *Op0 = vectorizeTree(Op0VL); 2438 2439 std::vector<Value *> OpVecs; 2440 for (int j = 1, e = cast<GetElementPtrInst>(VL0)->getNumOperands(); j < e; 2441 ++j) { 2442 ValueList OpVL; 2443 for (Value *V : E->Scalars) 2444 OpVL.push_back(cast<GetElementPtrInst>(V)->getOperand(j)); 2445 2446 Value *OpVec = vectorizeTree(OpVL); 2447 OpVecs.push_back(OpVec); 2448 } 2449 2450 Value *V = Builder.CreateGEP( 2451 cast<GetElementPtrInst>(VL0)->getSourceElementType(), Op0, OpVecs); 2452 E->VectorizedValue = V; 2453 ++NumVectorInstructions; 2454 2455 if (Instruction *I = dyn_cast<Instruction>(V)) 2456 return propagateMetadata(I, E->Scalars); 2457 2458 return V; 2459 } 2460 case Instruction::Call: { 2461 CallInst *CI = cast<CallInst>(VL0); 2462 setInsertPointAfterBundle(E->Scalars); 2463 Function *FI; 2464 Intrinsic::ID IID = Intrinsic::not_intrinsic; 2465 Value *ScalarArg = nullptr; 2466 if (CI && (FI = CI->getCalledFunction())) { 2467 IID = FI->getIntrinsicID(); 2468 } 2469 std::vector<Value *> OpVecs; 2470 for (int j = 0, e = CI->getNumArgOperands(); j < e; ++j) { 2471 ValueList OpVL; 2472 // ctlz,cttz and powi are special intrinsics whose second argument is 2473 // a scalar. This argument should not be vectorized. 
2474 if (hasVectorInstrinsicScalarOpd(IID, 1) && j == 1) { 2475 CallInst *CEI = cast<CallInst>(E->Scalars[0]); 2476 ScalarArg = CEI->getArgOperand(j); 2477 OpVecs.push_back(CEI->getArgOperand(j)); 2478 continue; 2479 } 2480 for (Value *V : E->Scalars) { 2481 CallInst *CEI = cast<CallInst>(V); 2482 OpVL.push_back(CEI->getArgOperand(j)); 2483 } 2484 2485 Value *OpVec = vectorizeTree(OpVL); 2486 DEBUG(dbgs() << "SLP: OpVec[" << j << "]: " << *OpVec << "\n"); 2487 OpVecs.push_back(OpVec); 2488 } 2489 2490 Module *M = F->getParent(); 2491 Intrinsic::ID ID = getIntrinsicIDForCall(CI, TLI); 2492 Type *Tys[] = { VectorType::get(CI->getType(), E->Scalars.size()) }; 2493 Function *CF = Intrinsic::getDeclaration(M, ID, Tys); 2494 Value *V = Builder.CreateCall(CF, OpVecs); 2495 2496 // The scalar argument uses an in-tree scalar so we add the new vectorized 2497 // call to ExternalUses list to make sure that an extract will be 2498 // generated in the future. 2499 if (ScalarArg && ScalarToTreeEntry.count(ScalarArg)) 2500 ExternalUses.push_back(ExternalUser(ScalarArg, cast<User>(V), 0)); 2501 2502 E->VectorizedValue = V; 2503 ++NumVectorInstructions; 2504 return V; 2505 } 2506 case Instruction::ShuffleVector: { 2507 ValueList LHSVL, RHSVL; 2508 assert(isa<BinaryOperator>(VL0) && "Invalid Shuffle Vector Operand"); 2509 reorderAltShuffleOperands(E->Scalars, LHSVL, RHSVL); 2510 setInsertPointAfterBundle(E->Scalars); 2511 2512 Value *LHS = vectorizeTree(LHSVL); 2513 Value *RHS = vectorizeTree(RHSVL); 2514 2515 if (Value *V = alreadyVectorized(E->Scalars)) 2516 return V; 2517 2518 // Create a vector of LHS op1 RHS 2519 BinaryOperator *BinOp0 = cast<BinaryOperator>(VL0); 2520 Value *V0 = Builder.CreateBinOp(BinOp0->getOpcode(), LHS, RHS); 2521 2522 // Create a vector of LHS op2 RHS 2523 Instruction *VL1 = cast<Instruction>(E->Scalars[1]); 2524 BinaryOperator *BinOp1 = cast<BinaryOperator>(VL1); 2525 Value *V1 = Builder.CreateBinOp(BinOp1->getOpcode(), LHS, RHS); 2526 2527 // Create 
shuffle to take alternate operations from the vector. 2528 // Also, gather up odd and even scalar ops to propagate IR flags to 2529 // each vector operation. 2530 ValueList OddScalars, EvenScalars; 2531 unsigned e = E->Scalars.size(); 2532 SmallVector<Constant *, 8> Mask(e); 2533 for (unsigned i = 0; i < e; ++i) { 2534 if (i & 1) { 2535 Mask[i] = Builder.getInt32(e + i); 2536 OddScalars.push_back(E->Scalars[i]); 2537 } else { 2538 Mask[i] = Builder.getInt32(i); 2539 EvenScalars.push_back(E->Scalars[i]); 2540 } 2541 } 2542 2543 Value *ShuffleMask = ConstantVector::get(Mask); 2544 propagateIRFlags(V0, EvenScalars); 2545 propagateIRFlags(V1, OddScalars); 2546 2547 Value *V = Builder.CreateShuffleVector(V0, V1, ShuffleMask); 2548 E->VectorizedValue = V; 2549 ++NumVectorInstructions; 2550 if (Instruction *I = dyn_cast<Instruction>(V)) 2551 return propagateMetadata(I, E->Scalars); 2552 2553 return V; 2554 } 2555 default: 2556 llvm_unreachable("unknown inst"); 2557 } 2558 return nullptr; 2559 } 2560 2561 Value *BoUpSLP::vectorizeTree() { 2562 2563 // All blocks must be scheduled before any instructions are inserted. 2564 for (auto &BSIter : BlocksSchedules) { 2565 scheduleBlock(BSIter.second.get()); 2566 } 2567 2568 Builder.SetInsertPoint(&F->getEntryBlock().front()); 2569 vectorizeTree(&VectorizableTree[0]); 2570 2571 DEBUG(dbgs() << "SLP: Extracting " << ExternalUses.size() << " values .\n"); 2572 2573 // Extract all of the elements with the external uses. 2574 for (UserList::iterator it = ExternalUses.begin(), e = ExternalUses.end(); 2575 it != e; ++it) { 2576 Value *Scalar = it->Scalar; 2577 llvm::User *User = it->User; 2578 2579 // Skip users that we already RAUW. This happens when one instruction 2580 // has multiple uses of the same value. 
2581 if (std::find(Scalar->user_begin(), Scalar->user_end(), User) == 2582 Scalar->user_end()) 2583 continue; 2584 assert(ScalarToTreeEntry.count(Scalar) && "Invalid scalar"); 2585 2586 int Idx = ScalarToTreeEntry[Scalar]; 2587 TreeEntry *E = &VectorizableTree[Idx]; 2588 assert(!E->NeedToGather && "Extracting from a gather list"); 2589 2590 Value *Vec = E->VectorizedValue; 2591 assert(Vec && "Can't find vectorizable value"); 2592 2593 Value *Lane = Builder.getInt32(it->Lane); 2594 // Generate extracts for out-of-tree users. 2595 // Find the insertion point for the extractelement lane. 2596 if (isa<Instruction>(Vec)){ 2597 if (PHINode *PH = dyn_cast<PHINode>(User)) { 2598 for (int i = 0, e = PH->getNumIncomingValues(); i != e; ++i) { 2599 if (PH->getIncomingValue(i) == Scalar) { 2600 Builder.SetInsertPoint(PH->getIncomingBlock(i)->getTerminator()); 2601 Value *Ex = Builder.CreateExtractElement(Vec, Lane); 2602 CSEBlocks.insert(PH->getIncomingBlock(i)); 2603 PH->setOperand(i, Ex); 2604 } 2605 } 2606 } else { 2607 Builder.SetInsertPoint(cast<Instruction>(User)); 2608 Value *Ex = Builder.CreateExtractElement(Vec, Lane); 2609 CSEBlocks.insert(cast<Instruction>(User)->getParent()); 2610 User->replaceUsesOfWith(Scalar, Ex); 2611 } 2612 } else { 2613 Builder.SetInsertPoint(&F->getEntryBlock().front()); 2614 Value *Ex = Builder.CreateExtractElement(Vec, Lane); 2615 CSEBlocks.insert(&F->getEntryBlock()); 2616 User->replaceUsesOfWith(Scalar, Ex); 2617 } 2618 2619 DEBUG(dbgs() << "SLP: Replaced:" << *User << ".\n"); 2620 } 2621 2622 // For each vectorized value: 2623 for (int EIdx = 0, EE = VectorizableTree.size(); EIdx < EE; ++EIdx) { 2624 TreeEntry *Entry = &VectorizableTree[EIdx]; 2625 2626 // For each lane: 2627 for (int Lane = 0, LE = Entry->Scalars.size(); Lane != LE; ++Lane) { 2628 Value *Scalar = Entry->Scalars[Lane]; 2629 // No need to handle users of gathered values. 
2630 if (Entry->NeedToGather) 2631 continue; 2632 2633 assert(Entry->VectorizedValue && "Can't find vectorizable value"); 2634 2635 Type *Ty = Scalar->getType(); 2636 if (!Ty->isVoidTy()) { 2637 #ifndef NDEBUG 2638 for (User *U : Scalar->users()) { 2639 DEBUG(dbgs() << "SLP: \tvalidating user:" << *U << ".\n"); 2640 2641 assert((ScalarToTreeEntry.count(U) || 2642 // It is legal to replace users in the ignorelist by undef. 2643 (std::find(UserIgnoreList.begin(), UserIgnoreList.end(), U) != 2644 UserIgnoreList.end())) && 2645 "Replacing out-of-tree value with undef"); 2646 } 2647 #endif 2648 Value *Undef = UndefValue::get(Ty); 2649 Scalar->replaceAllUsesWith(Undef); 2650 } 2651 DEBUG(dbgs() << "SLP: \tErasing scalar:" << *Scalar << ".\n"); 2652 eraseInstruction(cast<Instruction>(Scalar)); 2653 } 2654 } 2655 2656 Builder.ClearInsertionPoint(); 2657 2658 return VectorizableTree[0].VectorizedValue; 2659 } 2660 2661 void BoUpSLP::optimizeGatherSequence() { 2662 DEBUG(dbgs() << "SLP: Optimizing " << GatherSeq.size() 2663 << " gather sequences instructions.\n"); 2664 // LICM InsertElementInst sequences. 2665 for (SetVector<Instruction *>::iterator it = GatherSeq.begin(), 2666 e = GatherSeq.end(); it != e; ++it) { 2667 InsertElementInst *Insert = dyn_cast<InsertElementInst>(*it); 2668 2669 if (!Insert) 2670 continue; 2671 2672 // Check if this block is inside a loop. 2673 Loop *L = LI->getLoopFor(Insert->getParent()); 2674 if (!L) 2675 continue; 2676 2677 // Check if it has a preheader. 2678 BasicBlock *PreHeader = L->getLoopPreheader(); 2679 if (!PreHeader) 2680 continue; 2681 2682 // If the vector or the element that we insert into it are 2683 // instructions that are defined in this basic block then we can't 2684 // hoist this instruction. 
2685 Instruction *CurrVec = dyn_cast<Instruction>(Insert->getOperand(0)); 2686 Instruction *NewElem = dyn_cast<Instruction>(Insert->getOperand(1)); 2687 if (CurrVec && L->contains(CurrVec)) 2688 continue; 2689 if (NewElem && L->contains(NewElem)) 2690 continue; 2691 2692 // We can hoist this instruction. Move it to the pre-header. 2693 Insert->moveBefore(PreHeader->getTerminator()); 2694 } 2695 2696 // Make a list of all reachable blocks in our CSE queue. 2697 SmallVector<const DomTreeNode *, 8> CSEWorkList; 2698 CSEWorkList.reserve(CSEBlocks.size()); 2699 for (BasicBlock *BB : CSEBlocks) 2700 if (DomTreeNode *N = DT->getNode(BB)) { 2701 assert(DT->isReachableFromEntry(N)); 2702 CSEWorkList.push_back(N); 2703 } 2704 2705 // Sort blocks by domination. This ensures we visit a block after all blocks 2706 // dominating it are visited. 2707 std::stable_sort(CSEWorkList.begin(), CSEWorkList.end(), 2708 [this](const DomTreeNode *A, const DomTreeNode *B) { 2709 return DT->properlyDominates(A, B); 2710 }); 2711 2712 // Perform O(N^2) search over the gather sequences and merge identical 2713 // instructions. TODO: We can further optimize this scan if we split the 2714 // instructions into different buckets based on the insert lane. 2715 SmallVector<Instruction *, 16> Visited; 2716 for (auto I = CSEWorkList.begin(), E = CSEWorkList.end(); I != E; ++I) { 2717 assert((I == CSEWorkList.begin() || !DT->dominates(*I, *std::prev(I))) && 2718 "Worklist not sorted properly!"); 2719 BasicBlock *BB = (*I)->getBlock(); 2720 // For all instructions in blocks containing gather sequences: 2721 for (BasicBlock::iterator it = BB->begin(), e = BB->end(); it != e;) { 2722 Instruction *In = &*it++; 2723 if (!isa<InsertElementInst>(In) && !isa<ExtractElementInst>(In)) 2724 continue; 2725 2726 // Check if we can replace this instruction with any of the 2727 // visited instructions. 
      for (SmallVectorImpl<Instruction *>::iterator v = Visited.begin(),
           ve = Visited.end();
           v != ve; ++v) {
        // CSE: if an identical instruction was already emitted in a block
        // that dominates this one, reuse it and delete the duplicate.
        if (In->isIdenticalTo(*v) &&
            DT->dominates((*v)->getParent(), In->getParent())) {
          In->replaceAllUsesWith(*v);
          eraseInstruction(In);
          In = nullptr;
          break;
        }
      }
      if (In) {
        // Not CSE'd: remember it as a replacement candidate for instructions
        // visited later.
        assert(std::find(Visited.begin(), Visited.end(), In) == Visited.end());
        Visited.push_back(In);
      }
    }
  }
  CSEBlocks.clear();
  GatherSeq.clear();
}

// Groups the instructions to a bundle (which is then a single scheduling entity)
// and schedules instructions until the bundle gets ready.
// \returns true if the bundle could be scheduled as a unit; false if a cyclic
// dependency was detected (the bundle is un-bundled via cancelScheduling) or
// the scheduling region could not be extended to cover all bundle members.
bool BoUpSLP::BlockScheduling::tryScheduleBundle(ArrayRef<Value *> VL,
                                                BoUpSLP *SLP) {
  // PHI nodes are never scheduled; a bundle of PHIs is trivially "ready".
  if (isa<PHINode>(VL[0]))
    return true;

  // Initialize the instruction bundle.
  Instruction *OldScheduleEnd = ScheduleEnd;
  ScheduleData *PrevInBundle = nullptr;
  ScheduleData *Bundle = nullptr;
  bool ReSchedule = false;
  DEBUG(dbgs() << "SLP: bundle: " << *VL[0] << "\n");

  // Make sure that the scheduling region contains all
  // instructions of the bundle.
  for (Value *V : VL) {
    if (!extendSchedulingRegion(V))
      return false;
  }

  // Link all members into one bundle; the first member becomes the bundle
  // head (scheduling entity) and accumulates the unscheduled-dep counts.
  for (Value *V : VL) {
    ScheduleData *BundleMember = getScheduleData(V);
    assert(BundleMember &&
           "no ScheduleData for bundle member (maybe not in same basic block)");
    if (BundleMember->IsScheduled) {
      // A bundle member was scheduled as single instruction before and now
      // needs to be scheduled as part of the bundle. We just get rid of the
      // existing schedule.
      DEBUG(dbgs() << "SLP: reset schedule because " << *BundleMember
                   << " was already scheduled\n");
      ReSchedule = true;
    }
    assert(BundleMember->isSchedulingEntity() &&
           "bundle member already part of other bundle");
    if (PrevInBundle) {
      PrevInBundle->NextInBundle = BundleMember;
    } else {
      Bundle = BundleMember;
    }
    BundleMember->UnscheduledDepsInBundle = 0;
    Bundle->UnscheduledDepsInBundle += BundleMember->UnscheduledDeps;

    // Group the instructions to a bundle.
    BundleMember->FirstInBundle = Bundle;
    PrevInBundle = BundleMember;
  }
  if (ScheduleEnd != OldScheduleEnd) {
    // The scheduling region got new instructions at the lower end (or it is a
    // new region for the first bundle). This makes it necessary to
    // recalculate all dependencies.
    // It is seldom that this needs to be done a second time after adding the
    // initial bundle to the region.
    for (auto *I = ScheduleStart; I != ScheduleEnd; I = I->getNextNode()) {
      ScheduleData *SD = getScheduleData(I);
      SD->clearDependencies();
    }
    ReSchedule = true;
  }
  if (ReSchedule) {
    resetSchedule();
    initialFillReadyList(ReadyInsts);
  }

  DEBUG(dbgs() << "SLP: try schedule bundle " << *Bundle << " in block "
               << BB->getName() << "\n");

  calculateDependencies(Bundle, true, SLP);

  // Now try to schedule the new bundle. As soon as the bundle is "ready" it
  // means that there are no cyclic dependencies and we can schedule it.
  // Note that's important that we don't "schedule" the bundle yet (see
  // cancelScheduling).
  while (!Bundle->isReady() && !ReadyInsts.empty()) {

    ScheduleData *pickedSD = ReadyInsts.back();
    ReadyInsts.pop_back();

    if (pickedSD->isSchedulingEntity() && pickedSD->isReady()) {
      schedule(pickedSD, ReadyInsts);
    }
  }
  if (!Bundle->isReady()) {
    // Still blocked: there must be a cyclic dependency involving the bundle.
    cancelScheduling(VL);
    return false;
  }
  return true;
}

// Undoes tryScheduleBundle: dissolves the bundle rooted at VL[0] back into
// independent single-instruction scheduling entities.
void BoUpSLP::BlockScheduling::cancelScheduling(ArrayRef<Value *> VL) {
  // PHI nodes were never bundled in the first place.
  if (isa<PHINode>(VL[0]))
    return;

  ScheduleData *Bundle = getScheduleData(VL[0]);
  DEBUG(dbgs() << "SLP: cancel scheduling of " << *Bundle << "\n");
  assert(!Bundle->IsScheduled &&
         "Can't cancel bundle which is already scheduled");
  assert(Bundle->isSchedulingEntity() && Bundle->isPartOfBundle() &&
         "tried to unbundle something which is not a bundle");

  // Un-bundle: make single instructions out of the bundle.
  ScheduleData *BundleMember = Bundle;
  while (BundleMember) {
    assert(BundleMember->FirstInBundle == Bundle && "corrupt bundle links");
    BundleMember->FirstInBundle = BundleMember;
    ScheduleData *Next = BundleMember->NextInBundle;
    BundleMember->NextInBundle = nullptr;
    BundleMember->UnscheduledDepsInBundle = BundleMember->UnscheduledDeps;
    // A member with no unscheduled dependencies becomes ready on its own.
    if (BundleMember->UnscheduledDepsInBundle == 0) {
      ReadyInsts.insert(BundleMember);
    }
    BundleMember = Next;
  }
}

// Extends the scheduling region of the block so that it contains V.
// \returns false if the region would exceed ScheduleRegionSizeLimit.
bool BoUpSLP::BlockScheduling::extendSchedulingRegion(Value *V) {
  if (getScheduleData(V))
    return true;
  Instruction *I = dyn_cast<Instruction>(V);
  assert(I && "bundle member must be an instruction");
  assert(!isa<PHINode>(I) && "phi nodes don't need to be scheduled");
  if (!ScheduleStart) {
    // It's the first instruction in the new region.
    initScheduleData(I, I->getNextNode(), nullptr, nullptr);
    ScheduleStart = I;
    ScheduleEnd = I->getNextNode();
    assert(ScheduleEnd && "tried to vectorize a TerminatorInst?");
    DEBUG(dbgs() << "SLP: initialize schedule region to " << *I << "\n");
    return true;
  }
  // Search up and down at the same time, because we don't know if the new
  // instruction is above or below the existing scheduling region.
  BasicBlock::reverse_iterator UpIter(ScheduleStart->getIterator());
  BasicBlock::reverse_iterator UpperEnd = BB->rend();
  BasicBlock::iterator DownIter(ScheduleEnd);
  BasicBlock::iterator LowerEnd = BB->end();
  for (;;) {
    if (++ScheduleRegionSize > ScheduleRegionSizeLimit) {
      DEBUG(dbgs() << "SLP: exceeded schedule region size limit\n");
      return false;
    }

    if (UpIter != UpperEnd) {
      if (&*UpIter == I) {
        // Found above the region: grow the region upwards to include I.
        initScheduleData(I, ScheduleStart, nullptr, FirstLoadStoreInRegion);
        ScheduleStart = I;
        DEBUG(dbgs() << "SLP: extend schedule region start to " << *I << "\n");
        return true;
      }
      UpIter++;
    }
    if (DownIter != LowerEnd) {
      if (&*DownIter == I) {
        // Found below the region: grow the region downwards to include I.
        initScheduleData(ScheduleEnd, I->getNextNode(), LastLoadStoreInRegion,
                         nullptr);
        ScheduleEnd = I->getNextNode();
        assert(ScheduleEnd && "tried to vectorize a TerminatorInst?");
        DEBUG(dbgs() << "SLP: extend schedule region end to " << *I << "\n");
        return true;
      }
      DownIter++;
    }
    assert((UpIter != UpperEnd || DownIter != LowerEnd) &&
           "instruction not found in block");
  }
  // NOTE(review): unreachable — the for(;;) above only exits via return.
  return true;
}

// Creates (or re-initializes) ScheduleData for every instruction in
// [FromI, ToI) and splices the new memory-accessing instructions into the
// region's NextLoadStore linked list between PrevLoadStore and NextLoadStore.
void BoUpSLP::BlockScheduling::initScheduleData(Instruction *FromI,
                                                Instruction *ToI,
                                                ScheduleData *PrevLoadStore,
                                                ScheduleData *NextLoadStore) {
  ScheduleData *CurrentLoadStore = PrevLoadStore;
  for (Instruction *I = FromI; I != ToI; I = I->getNextNode()) {
    ScheduleData *SD = ScheduleDataMap[I];
    if (!SD) {
      // Allocate a new ScheduleData for the instruction.
      // ScheduleData objects are pooled in fixed-size chunks so their
      // addresses stay stable while the map grows.
      if (ChunkPos >= ChunkSize) {
        ScheduleDataChunks.push_back(
            llvm::make_unique<ScheduleData[]>(ChunkSize));
        ChunkPos = 0;
      }
      SD = &(ScheduleDataChunks.back()[ChunkPos++]);
      ScheduleDataMap[I] = SD;
      SD->Inst = I;
    }
    assert(!isInSchedulingRegion(SD) &&
           "new ScheduleData already in scheduling region");
    SD->init(SchedulingRegionID);

    if (I->mayReadOrWriteMemory()) {
      // Update the linked list of memory accessing instructions.
      if (CurrentLoadStore) {
        CurrentLoadStore->NextLoadStore = SD;
      } else {
        FirstLoadStoreInRegion = SD;
      }
      CurrentLoadStore = SD;
    }
  }
  if (NextLoadStore) {
    if (CurrentLoadStore)
      CurrentLoadStore->NextLoadStore = NextLoadStore;
  } else {
    LastLoadStoreInRegion = CurrentLoadStore;
  }
}

// Computes def-use and memory dependencies for the bundle SD (and,
// transitively via the worklist, for every bundle that depends on it).
// If InsertInReadyList is set, bundles that become ready are pushed onto
// ReadyInsts.
void BoUpSLP::BlockScheduling::calculateDependencies(ScheduleData *SD,
                                                     bool InsertInReadyList,
                                                     BoUpSLP *SLP) {
  assert(SD->isSchedulingEntity());

  SmallVector<ScheduleData *, 10> WorkList;
  WorkList.push_back(SD);

  while (!WorkList.empty()) {
    ScheduleData *SD = WorkList.back();
    WorkList.pop_back();

    ScheduleData *BundleMember = SD;
    while (BundleMember) {
      assert(isInSchedulingRegion(BundleMember));
      if (!BundleMember->hasValidDependencies()) {

        DEBUG(dbgs() << "SLP: update deps of " << *BundleMember << "\n");
        BundleMember->Dependencies = 0;
        BundleMember->resetUnscheduledDeps();

        // Handle def-use chain dependencies.
        for (User *U : BundleMember->Inst->users()) {
          if (isa<Instruction>(U)) {
            ScheduleData *UseSD = getScheduleData(U);
            if (UseSD && isInSchedulingRegion(UseSD->FirstInBundle)) {
              BundleMember->Dependencies++;
              ScheduleData *DestBundle = UseSD->FirstInBundle;
              if (!DestBundle->IsScheduled) {
                BundleMember->incrementUnscheduledDeps(1);
              }
              if (!DestBundle->hasValidDependencies()) {
                WorkList.push_back(DestBundle);
              }
            }
          } else {
            // I'm not sure if this can ever happen. But we need to be safe.
            // This lets the instruction/bundle never be scheduled and
            // eventually disable vectorization.
            BundleMember->Dependencies++;
            BundleMember->incrementUnscheduledDeps(1);
          }
        }

        // Handle the memory dependencies.
        // Walk the region's load/store list downwards from this instruction
        // and record a dependency on every access that may alias it.
        ScheduleData *DepDest = BundleMember->NextLoadStore;
        if (DepDest) {
          Instruction *SrcInst = BundleMember->Inst;
          MemoryLocation SrcLoc = getLocation(SrcInst, SLP->AA);
          bool SrcMayWrite = BundleMember->Inst->mayWriteToMemory();
          unsigned numAliased = 0;
          unsigned DistToSrc = 1;

          while (DepDest) {
            assert(isInSchedulingRegion(DepDest));

            // We have two limits to reduce the complexity:
            // 1) AliasedCheckLimit: It's a small limit to reduce calls to
            //    SLP->isAliased (which is the expensive part in this loop).
            // 2) MaxMemDepDistance: It's for very large blocks and it aborts
            //    the whole loop (even if the loop is fast, it's quadratic).
            //    It's important for the loop break condition (see below) to
            //    check this limit even between two read-only instructions.
            if (DistToSrc >= MaxMemDepDistance ||
                ((SrcMayWrite || DepDest->Inst->mayWriteToMemory()) &&
                 (numAliased >= AliasedCheckLimit ||
                  SLP->isAliased(SrcLoc, SrcInst, DepDest->Inst)))) {

              // We increment the counter only if the locations are aliased
              // (instead of counting all alias checks). This gives a better
              // balance between reduced runtime and accurate dependencies.
              numAliased++;

              DepDest->MemoryDependencies.push_back(BundleMember);
              BundleMember->Dependencies++;
              ScheduleData *DestBundle = DepDest->FirstInBundle;
              if (!DestBundle->IsScheduled) {
                BundleMember->incrementUnscheduledDeps(1);
              }
              if (!DestBundle->hasValidDependencies()) {
                WorkList.push_back(DestBundle);
              }
            }
            DepDest = DepDest->NextLoadStore;

            // Example, explaining the loop break condition: Let's assume our
            // starting instruction is i0 and MaxMemDepDistance = 3.
            //
            //                      +--------v--v--v
            //             i0,i1,i2,i3,i4,i5,i6,i7,i8
            //             +--------^--^--^
            //
            // MaxMemDepDistance let us stop alias-checking at i3 and we add
            // dependencies from i0 to i3,i4,.. (even if they are not aliased).
            // Previously we already added dependencies from i3 to i6,i7,i8
            // (because of MaxMemDepDistance). As we added a dependency from
            // i0 to i3, we have transitive dependencies from i0 to i6,i7,i8
            // and we can abort this loop at i6.
            if (DistToSrc >= 2 * MaxMemDepDistance)
              break;
            DistToSrc++;
          }
        }
      }
      BundleMember = BundleMember->NextInBundle;
    }
    if (InsertInReadyList && SD->isReady()) {
      ReadyInsts.push_back(SD);
      DEBUG(dbgs() << "SLP: gets ready on update: " << *SD->Inst << "\n");
    }
  }
}

// Clears all scheduling state of the region so the block can be scheduled
// again from scratch (used when a bundle forces a re-schedule).
void BoUpSLP::BlockScheduling::resetSchedule() {
  assert(ScheduleStart &&
         "tried to reset schedule on block which has not been scheduled");
  for (Instruction *I = ScheduleStart; I != ScheduleEnd; I = I->getNextNode()) {
    ScheduleData *SD = getScheduleData(I);
    assert(isInSchedulingRegion(SD));
    SD->IsScheduled = false;
    SD->resetUnscheduledDeps();
  }
  ReadyInsts.clear();
}

// Performs the final scheduling of the block: moves the instructions of each
// picked bundle so bundle members are adjacent, in dependency order.
void BoUpSLP::scheduleBlock(BlockScheduling *BS) {

  if (!BS->ScheduleStart)
    return;

  DEBUG(dbgs() << "SLP: schedule block " << BS->BB->getName() << "\n");

  BS->resetSchedule();

  // For the real scheduling we use a more sophisticated ready-list: it is
  // sorted by the original instruction location. This lets the final schedule
  // be as close as possible to the original instruction order.
  struct ScheduleDataCompare {
    bool operator()(ScheduleData *SD1, ScheduleData *SD2) {
      return SD2->SchedulingPriority < SD1->SchedulingPriority;
    }
  };
  std::set<ScheduleData *, ScheduleDataCompare> ReadyInsts;

  // Ensure that all dependency data is updated and fill the ready-list with
  // initial instructions.
  int Idx = 0;
  int NumToSchedule = 0;
  for (auto *I = BS->ScheduleStart; I != BS->ScheduleEnd;
       I = I->getNextNode()) {
    ScheduleData *SD = BS->getScheduleData(I);
    assert(
        SD->isPartOfBundle() == (ScalarToTreeEntry.count(SD->Inst) != 0) &&
        "scheduler and vectorizer have different opinion on what is a bundle");
    SD->FirstInBundle->SchedulingPriority = Idx++;
    if (SD->isSchedulingEntity()) {
      BS->calculateDependencies(SD, false, this);
      NumToSchedule++;
    }
  }
  BS->initialFillReadyList(ReadyInsts);

  Instruction *LastScheduledInst = BS->ScheduleEnd;

  // Do the "real" scheduling.
  while (!ReadyInsts.empty()) {
    ScheduleData *picked = *ReadyInsts.begin();
    ReadyInsts.erase(ReadyInsts.begin());

    // Move the scheduled instruction(s) to their dedicated places, if not
    // there yet.
    ScheduleData *BundleMember = picked;
    while (BundleMember) {
      Instruction *pickedInst = BundleMember->Inst;
      if (LastScheduledInst->getNextNode() != pickedInst) {
        BS->BB->getInstList().remove(pickedInst);
        BS->BB->getInstList().insert(LastScheduledInst->getIterator(),
                                     pickedInst);
      }
      LastScheduledInst = pickedInst;
      BundleMember = BundleMember->NextInBundle;
    }

    BS->schedule(picked, ReadyInsts);
    NumToSchedule--;
  }
  assert(NumToSchedule == 0 && "could not schedule all instructions");

  // Avoid duplicate scheduling of the block.
  BS->ScheduleStart = nullptr;
}

// Determines the element size (in bits) to base the vectorization factor on
// for the tree rooted at V. Prefers the width of the memory operations that
// feed V over V's own type.
unsigned BoUpSLP::getVectorElementSize(Value *V) {
  auto &DL = F->getParent()->getDataLayout();

  // If V is a store, just return the width of the stored value without
  // traversing the expression tree. This is the common case.
  if (auto *Store = dyn_cast<StoreInst>(V))
    return DL.getTypeSizeInBits(Store->getValueOperand()->getType());

  // If V is not a store, we can traverse the expression tree to find loads
  // that feed it. The type of the loaded value may indicate a more suitable
  // width than V's type. We want to base the vector element size on the width
  // of memory operations where possible.
  SmallVector<Instruction *, 16> Worklist;
  SmallPtrSet<Instruction *, 16> Visited;
  if (auto *I = dyn_cast<Instruction>(V))
    Worklist.push_back(I);

  // Traverse the expression tree in bottom-up order looking for loads. If we
  // encounter an instruction we don't yet handle, we give up.
  auto MaxWidth = 0u;
  auto FoundUnknownInst = false;
  while (!Worklist.empty() && !FoundUnknownInst) {
    auto *I = Worklist.pop_back_val();
    Visited.insert(I);

    // We should only be looking at scalar instructions here. If the current
    // instruction has a vector type, give up.
    auto *Ty = I->getType();
    if (isa<VectorType>(Ty))
      FoundUnknownInst = true;

    // If the current instruction is a load, update MaxWidth to reflect the
    // width of the loaded value.
    else if (isa<LoadInst>(I))
      MaxWidth = std::max(MaxWidth, (unsigned)DL.getTypeSizeInBits(Ty));

    // Otherwise, we need to visit the operands of the instruction. We only
    // handle the interesting cases from buildTree here. If an operand is an
    // instruction we haven't yet visited, we add it to the worklist.
    else if (isa<PHINode>(I) || isa<CastInst>(I) || isa<GetElementPtrInst>(I) ||
             isa<CmpInst>(I) || isa<SelectInst>(I) || isa<BinaryOperator>(I)) {
      for (Use &U : I->operands())
        if (auto *J = dyn_cast<Instruction>(U.get()))
          if (!Visited.count(J))
            Worklist.push_back(J);
    }

    // If we don't yet handle the instruction, give up.
    else
      FoundUnknownInst = true;
  }

  // If we didn't encounter a memory access in the expression tree, or if we
  // gave up for some reason, just return the width of V.
  if (!MaxWidth || FoundUnknownInst)
    return DL.getTypeSizeInBits(V->getType());

  // Otherwise, return the maximum width we found.
  return MaxWidth;
}

/// The SLPVectorizer Pass.
struct SLPVectorizer : public FunctionPass {
  typedef SmallVector<StoreInst *, 8> StoreList;
  typedef MapVector<Value *, StoreList> StoreListMap;
  typedef SmallVector<WeakVH, 8> WeakVHList;
  typedef MapVector<Value *, WeakVHList> WeakVHListMap;

  /// Pass identification, replacement for typeid
  static char ID;

  explicit SLPVectorizer() : FunctionPass(ID) {
    initializeSLPVectorizerPass(*PassRegistry::getPassRegistry());
  }

  // Analyses used by the pass; set up in runOnFunction.
  ScalarEvolution *SE;
  TargetTransformInfo *TTI;
  TargetLibraryInfo *TLI;
  AliasAnalysis *AA;
  LoopInfo *LI;
  DominatorTree *DT;
  AssumptionCache *AC;

  bool runOnFunction(Function &F) override {
    if (skipOptnoneFunction(F))
      return false;

    SE = &getAnalysis<ScalarEvolutionWrapperPass>().getSE();
    TTI = &getAnalysis<TargetTransformInfoWrapperPass>().getTTI(F);
    auto *TLIP = getAnalysisIfAvailable<TargetLibraryInfoWrapperPass>();
    TLI = TLIP ? &TLIP->getTLI() : nullptr;
    AA = &getAnalysis<AAResultsWrapperPass>().getAAResults();
    LI = &getAnalysis<LoopInfoWrapperPass>().getLoopInfo();
    DT = &getAnalysis<DominatorTreeWrapperPass>().getDomTree();
    AC = &getAnalysis<AssumptionCacheTracker>().getAssumptionCache(F);

    Stores.clear();
    GEPs.clear();
    bool Changed = false;

    // If the target claims to have no vector registers don't attempt
    // vectorization.
    if (!TTI->getNumberOfRegisters(true))
      return false;

    // Use the vector register size specified by the target unless overridden
    // by a command-line option.
    // TODO: It would be better to limit the vectorization factor based on
    // data type rather than just register size. For example, x86 AVX has
    // 256-bit registers, but it does not support integer operations
    // at that width (that requires AVX2).
    if (MaxVectorRegSizeOption.getNumOccurrences())
      MaxVecRegSize = MaxVectorRegSizeOption;
    else
      MaxVecRegSize = TTI->getRegisterBitWidth(true);

    // Don't vectorize when the attribute NoImplicitFloat is used.
    if (F.hasFnAttribute(Attribute::NoImplicitFloat))
      return false;

    DEBUG(dbgs() << "SLP: Analyzing blocks in " << F.getName() << ".\n");

    // Use the bottom up slp vectorizer to construct chains that start with
    // store instructions.
    BoUpSLP R(&F, SE, TTI, TLI, AA, LI, DT, AC);

    // A general note: the vectorizer must use BoUpSLP::eraseInstruction() to
    // delete instructions.

    // Scan the blocks in the function in post order.
    for (auto BB : post_order(&F.getEntryBlock())) {
      collectSeedInstructions(BB);

      // Vectorize trees that end at stores.
      if (NumStores > 0) {
        DEBUG(dbgs() << "SLP: Found " << NumStores << " stores.\n");
        Changed |= vectorizeStoreChains(R);
      }

      // Vectorize trees that end at reductions.
      Changed |= vectorizeChainsInBlock(BB, R);

      // Vectorize the index computations of getelementptr instructions. This
      // is primarily intended to catch gather-like idioms ending at
      // non-consecutive loads.
      if (NumGEPs > 0) {
        DEBUG(dbgs() << "SLP: Found " << NumGEPs << " GEPs.\n");
        Changed |= vectorizeGEPIndices(BB, R);
      }
    }

    if (Changed) {
      // Clean up and CSE the gather sequences emitted during vectorization.
      R.optimizeGatherSequence();
      DEBUG(dbgs() << "SLP: vectorized \"" << F.getName() << "\"\n");
      DEBUG(verifyFunction(F));
    }
    return Changed;
  }

  void getAnalysisUsage(AnalysisUsage &AU) const override {
    FunctionPass::getAnalysisUsage(AU);
    AU.addRequired<AssumptionCacheTracker>();
    AU.addRequired<ScalarEvolutionWrapperPass>();
    AU.addRequired<AAResultsWrapperPass>();
    AU.addRequired<TargetTransformInfoWrapperPass>();
    AU.addRequired<LoopInfoWrapperPass>();
    AU.addRequired<DominatorTreeWrapperPass>();
    AU.addPreserved<LoopInfoWrapperPass>();
    AU.addPreserved<DominatorTreeWrapperPass>();
    AU.addPreserved<AAResultsWrapperPass>();
    AU.addPreserved<GlobalsAAWrapperPass>();
    AU.setPreservesCFG();
  }

private:
  /// \brief Collect store and getelementptr instructions and organize them
  /// according to the underlying object of their pointer operands. We sort the
  /// instructions by their underlying objects to reduce the cost of
  /// consecutive access queries.
  ///
  /// TODO: We can further reduce this cost if we flush the chain creation
  ///       every time we run into a memory barrier.
  void collectSeedInstructions(BasicBlock *BB);

  /// \brief Try to vectorize a chain that starts at two arithmetic instrs.
  bool tryToVectorizePair(Value *A, Value *B, BoUpSLP &R);

  /// \brief Try to vectorize a list of operands.
  /// \param BuildVector A list of users to ignore for the purpose of
  ///                    scheduling and that don't need extracting.
  /// \returns true if a value was vectorized.
  bool tryToVectorizeList(ArrayRef<Value *> VL, BoUpSLP &R,
                          ArrayRef<Value *> BuildVector = None,
                          bool allowReorder = false);

  /// \brief Try to vectorize a chain that may start at the operands of \V;
  bool tryToVectorize(BinaryOperator *V, BoUpSLP &R);

  /// \brief Vectorize the store instructions collected in Stores.
  bool vectorizeStoreChains(BoUpSLP &R);

  /// \brief Vectorize the index computations of the getelementptr instructions
  /// collected in GEPs.
  bool vectorizeGEPIndices(BasicBlock *BB, BoUpSLP &R);

  /// \brief Scan the basic block and look for patterns that are likely to start
  /// a vectorization chain.
  bool vectorizeChainsInBlock(BasicBlock *BB, BoUpSLP &R);

  bool vectorizeStoreChain(ArrayRef<Value *> Chain, int CostThreshold,
                           BoUpSLP &R, unsigned VecRegSize);

  bool vectorizeStores(ArrayRef<StoreInst *> Stores, int costThreshold,
                       BoUpSLP &R);

  /// The store instructions in a basic block organized by base pointer.
  StoreListMap Stores;

  /// The getelementptr instructions in a basic block organized by base pointer.
  WeakVHListMap GEPs;

  /// The number of store instructions in a basic block.
  unsigned NumStores;

  /// The number of getelementptr instructions in a basic block.
  unsigned NumGEPs;

  unsigned MaxVecRegSize; // This is set by TTI or overridden by cl::opt.
};

/// \brief Check that the Values in the slice in VL array are still existent in
/// the WeakVH array.
/// Vectorization of part of the VL array may cause later values in the VL array
/// to become invalid. We track when this has happened in the WeakVH array.
static bool hasValueBeenRAUWed(ArrayRef<Value *> VL, ArrayRef<WeakVH> VH,
                               unsigned SliceBegin, unsigned SliceSize) {
  // Compare the requested slice of values against the weak handles that were
  // snapshotted before vectorization; any mismatch means a value was RAUWed.
  VL = VL.slice(SliceBegin, SliceSize);
  VH = VH.slice(SliceBegin, SliceSize);
  return !std::equal(VL.begin(), VL.end(), VH.begin());
}

// Tries to vectorize consecutive VF-wide groups of the store chain for the
// given register size. \returns true if any group was vectorized.
bool SLPVectorizer::vectorizeStoreChain(ArrayRef<Value *> Chain,
                                        int CostThreshold, BoUpSLP &R,
                                        unsigned VecRegSize) {
  unsigned ChainLen = Chain.size();
  DEBUG(dbgs() << "SLP: Analyzing a store chain of length " << ChainLen
               << "\n");
  unsigned Sz = R.getVectorElementSize(Chain[0]);
  unsigned VF = VecRegSize / Sz;

  if (!isPowerOf2_32(Sz) || VF < 2)
    return false;

  // Keep track of values that were deleted by vectorizing in the loop below.
  SmallVector<WeakVH, 8> TrackValues(Chain.begin(), Chain.end());

  bool Changed = false;
  // Look for profitable vectorizable trees at all offsets, starting at zero.
  for (unsigned i = 0, e = ChainLen; i < e; ++i) {
    if (i + VF > e)
      break;

    // Check that a previous iteration of this loop did not delete the Value.
    if (hasValueBeenRAUWed(Chain, TrackValues, i, VF))
      continue;

    DEBUG(dbgs() << "SLP: Analyzing " << VF << " stores at offset " << i
                 << "\n");
    ArrayRef<Value *> Operands = Chain.slice(i, VF);

    R.buildTree(Operands);

    int Cost = R.getTreeCost();

    DEBUG(dbgs() << "SLP: Found cost=" << Cost << " for VF=" << VF << "\n");
    if (Cost < CostThreshold) {
      DEBUG(dbgs() << "SLP: Decided to vectorize cost=" << Cost << "\n");
      R.vectorizeTree();

      // Move to the next bundle.
      i += VF - 1;
      Changed = true;
    }
  }

  return Changed;
}

// Pairs up consecutive stores into chains and tries to vectorize each chain
// at decreasing register sizes. \returns true if anything was vectorized.
bool SLPVectorizer::vectorizeStores(ArrayRef<StoreInst *> Stores,
                                    int costThreshold, BoUpSLP &R) {
  SetVector<StoreInst *> Heads, Tails;
  SmallDenseMap<StoreInst *, StoreInst *> ConsecutiveChain;

  // We may run into multiple chains that merge into a single chain. We mark the
  // stores that we vectorized so that we don't visit the same store twice.
  BoUpSLP::ValueSet VectorizedStores;
  bool Changed = false;

  // Do a quadratic search on all of the given stores and find
  // all of the pairs of stores that follow each other.
  SmallVector<unsigned, 16> IndexQueue;
  for (unsigned i = 0, e = Stores.size(); i < e; ++i) {
    const DataLayout &DL = Stores[i]->getModule()->getDataLayout();
    IndexQueue.clear();
    // If a store has multiple consecutive store candidates, search Stores
    // array according to the sequence: from i+1 to e, then from i-1 to 0.
    // This is because usually pairing with an immediately succeeding or
    // preceding candidate creates the best chance to find an slp
    // vectorization opportunity.
    unsigned j = 0;
    for (j = i + 1; j < e; ++j)
      IndexQueue.push_back(j);
    for (j = i; j > 0; --j)
      IndexQueue.push_back(j - 1);

    for (auto &k : IndexQueue) {
      if (R.isConsecutiveAccess(Stores[i], Stores[k], DL)) {
        Tails.insert(Stores[k]);
        Heads.insert(Stores[i]);
        ConsecutiveChain[Stores[i]] = Stores[k];
        break;
      }
    }
  }

  // For stores that start but don't end a link in the chain:
  for (SetVector<StoreInst *>::iterator it = Heads.begin(), e = Heads.end();
       it != e; ++it) {
    if (Tails.count(*it))
      continue;

    // We found a store instr that starts a chain. Now follow the chain and try
    // to vectorize it.
    BoUpSLP::ValueList Operands;
    StoreInst *I = *it;
    // Collect the chain into a list.
    while (Tails.count(I) || Heads.count(I)) {
      if (VectorizedStores.count(I))
        break;
      Operands.push_back(I);
      // Move to the next value in the chain.
      I = ConsecutiveChain[I];
    }

    // FIXME: Is division-by-2 the correct step? Should we assert that the
    // register size is a power-of-2?
    // Try the widest register size first; stop at the first size that works.
    for (unsigned Size = MaxVecRegSize; Size >= MinVecRegSize; Size /= 2) {
      if (vectorizeStoreChain(Operands, costThreshold, R, Size)) {
        // Mark the vectorized stores so that we don't vectorize them again.
        VectorizedStores.insert(Operands.begin(), Operands.end());
        Changed = true;
        break;
      }
    }
  }

  return Changed;
}

void SLPVectorizer::collectSeedInstructions(BasicBlock *BB) {

  // Initialize the collections. We will make a single pass over the block.
  Stores.clear();
  GEPs.clear();
  NumStores = NumGEPs = 0;
  const DataLayout &DL = BB->getModule()->getDataLayout();

  // Visit the store and getelementptr instructions in BB and organize them in
  // Stores and GEPs according to the underlying objects of their pointer
  // operands.
  for (Instruction &I : *BB) {

    // Ignore store instructions that are volatile or have a pointer operand
    // that doesn't point to a scalar type.
    if (auto *SI = dyn_cast<StoreInst>(&I)) {
      if (!SI->isSimple())
        continue;
      if (!isValidElementType(SI->getValueOperand()->getType()))
        continue;
      Stores[GetUnderlyingObject(SI->getPointerOperand(), DL)].push_back(SI);
      ++NumStores;
    }

    // Ignore getelementptr instructions that have more than one index, a
    // constant index, or a pointer operand that doesn't point to a scalar
    // type.
    else if (auto *GEP = dyn_cast<GetElementPtrInst>(&I)) {
      auto Idx = GEP->idx_begin()->get();
      if (GEP->getNumIndices() > 1 || isa<Constant>(Idx))
        continue;
      if (!isValidElementType(Idx->getType()))
        continue;
      GEPs[GetUnderlyingObject(GEP->getPointerOperand(), DL)].push_back(GEP);
      ++NumGEPs;
    }
  }
}

bool SLPVectorizer::tryToVectorizePair(Value *A, Value *B, BoUpSLP &R) {
  if (!A || !B)
    return false;
  Value *VL[] = { A, B };
  // Pairs may be reordered (allowReorder = true).
  return tryToVectorizeList(VL, R, None, true);
}

bool SLPVectorizer::tryToVectorizeList(ArrayRef<Value *> VL, BoUpSLP &R,
                                       ArrayRef<Value *> BuildVector,
                                       bool allowReorder) {
  if (VL.size() < 2)
    return false;

  DEBUG(dbgs() << "SLP: Vectorizing a list of length = " << VL.size() << ".\n");

  // Check that all of the parts are scalar instructions of the same type.
  Instruction *I0 = dyn_cast<Instruction>(VL[0]);
  if (!I0)
    return false;

  unsigned Opcode0 = I0->getOpcode();

  // FIXME: Register size should be a parameter to this function, so we can
  // try different vectorization factors.
  unsigned Sz = R.getVectorElementSize(I0);
  unsigned VF = MinVecRegSize / Sz;

  for (Value *V : VL) {
    Type *Ty = V->getType();
    if (!isValidElementType(Ty))
      return false;
    Instruction *Inst = dyn_cast<Instruction>(V);
    if (!Inst || Inst->getOpcode() != Opcode0)
      return false;
  }

  bool Changed = false;

  // Keep track of values that were deleted by vectorizing in the loop below.
3585 SmallVector<WeakVH, 8> TrackValues(VL.begin(), VL.end()); 3586 3587 for (unsigned i = 0, e = VL.size(); i < e; ++i) { 3588 unsigned OpsWidth = 0; 3589 3590 if (i + VF > e) 3591 OpsWidth = e - i; 3592 else 3593 OpsWidth = VF; 3594 3595 if (!isPowerOf2_32(OpsWidth) || OpsWidth < 2) 3596 break; 3597 3598 // Check that a previous iteration of this loop did not delete the Value. 3599 if (hasValueBeenRAUWed(VL, TrackValues, i, OpsWidth)) 3600 continue; 3601 3602 DEBUG(dbgs() << "SLP: Analyzing " << OpsWidth << " operations " 3603 << "\n"); 3604 ArrayRef<Value *> Ops = VL.slice(i, OpsWidth); 3605 3606 ArrayRef<Value *> BuildVectorSlice; 3607 if (!BuildVector.empty()) 3608 BuildVectorSlice = BuildVector.slice(i, OpsWidth); 3609 3610 R.buildTree(Ops, BuildVectorSlice); 3611 // TODO: check if we can allow reordering also for other cases than 3612 // tryToVectorizePair() 3613 if (allowReorder && R.shouldReorder()) { 3614 assert(Ops.size() == 2); 3615 assert(BuildVectorSlice.empty()); 3616 Value *ReorderedOps[] = { Ops[1], Ops[0] }; 3617 R.buildTree(ReorderedOps, None); 3618 } 3619 int Cost = R.getTreeCost(); 3620 3621 if (Cost < -SLPCostThreshold) { 3622 DEBUG(dbgs() << "SLP: Vectorizing list at cost:" << Cost << ".\n"); 3623 Value *VectorizedRoot = R.vectorizeTree(); 3624 3625 // Reconstruct the build vector by extracting the vectorized root. This 3626 // way we handle the case where some elements of the vector are undefined. 3627 // (return (inserelt <4 xi32> (insertelt undef (opd0) 0) (opd1) 2)) 3628 if (!BuildVectorSlice.empty()) { 3629 // The insert point is the last build vector instruction. The vectorized 3630 // root will precede it. This guarantees that we get an instruction. The 3631 // vectorized tree could have been constant folded. 
3632 Instruction *InsertAfter = cast<Instruction>(BuildVectorSlice.back()); 3633 unsigned VecIdx = 0; 3634 for (auto &V : BuildVectorSlice) { 3635 IRBuilder<true, NoFolder> Builder( 3636 InsertAfter->getParent(), ++BasicBlock::iterator(InsertAfter)); 3637 InsertElementInst *IE = cast<InsertElementInst>(V); 3638 Instruction *Extract = cast<Instruction>(Builder.CreateExtractElement( 3639 VectorizedRoot, Builder.getInt32(VecIdx++))); 3640 IE->setOperand(1, Extract); 3641 IE->removeFromParent(); 3642 IE->insertAfter(Extract); 3643 InsertAfter = IE; 3644 } 3645 } 3646 // Move to the next bundle. 3647 i += VF - 1; 3648 Changed = true; 3649 } 3650 } 3651 3652 return Changed; 3653 } 3654 3655 bool SLPVectorizer::tryToVectorize(BinaryOperator *V, BoUpSLP &R) { 3656 if (!V) 3657 return false; 3658 3659 // Try to vectorize V. 3660 if (tryToVectorizePair(V->getOperand(0), V->getOperand(1), R)) 3661 return true; 3662 3663 BinaryOperator *A = dyn_cast<BinaryOperator>(V->getOperand(0)); 3664 BinaryOperator *B = dyn_cast<BinaryOperator>(V->getOperand(1)); 3665 // Try to skip B. 3666 if (B && B->hasOneUse()) { 3667 BinaryOperator *B0 = dyn_cast<BinaryOperator>(B->getOperand(0)); 3668 BinaryOperator *B1 = dyn_cast<BinaryOperator>(B->getOperand(1)); 3669 if (tryToVectorizePair(A, B0, R)) { 3670 return true; 3671 } 3672 if (tryToVectorizePair(A, B1, R)) { 3673 return true; 3674 } 3675 } 3676 3677 // Try to skip A. 3678 if (A && A->hasOneUse()) { 3679 BinaryOperator *A0 = dyn_cast<BinaryOperator>(A->getOperand(0)); 3680 BinaryOperator *A1 = dyn_cast<BinaryOperator>(A->getOperand(1)); 3681 if (tryToVectorizePair(A0, B, R)) { 3682 return true; 3683 } 3684 if (tryToVectorizePair(A1, B, R)) { 3685 return true; 3686 } 3687 } 3688 return 0; 3689 } 3690 3691 /// \brief Generate a shuffle mask to be used in a reduction tree. 3692 /// 3693 /// \param VecLen The length of the vector to be reduced. 3694 /// \param NumEltsToRdx The number of elements that should be reduced in the 3695 /// vector. 
3696 /// \param IsPairwise Whether the reduction is a pairwise or splitting 3697 /// reduction. A pairwise reduction will generate a mask of 3698 /// <0,2,...> or <1,3,..> while a splitting reduction will generate 3699 /// <2,3, undef,undef> for a vector of 4 and NumElts = 2. 3700 /// \param IsLeft True will generate a mask of even elements, odd otherwise. 3701 static Value *createRdxShuffleMask(unsigned VecLen, unsigned NumEltsToRdx, 3702 bool IsPairwise, bool IsLeft, 3703 IRBuilder<> &Builder) { 3704 assert((IsPairwise || !IsLeft) && "Don't support a <0,1,undef,...> mask"); 3705 3706 SmallVector<Constant *, 32> ShuffleMask( 3707 VecLen, UndefValue::get(Builder.getInt32Ty())); 3708 3709 if (IsPairwise) 3710 // Build a mask of 0, 2, ... (left) or 1, 3, ... (right). 3711 for (unsigned i = 0; i != NumEltsToRdx; ++i) 3712 ShuffleMask[i] = Builder.getInt32(2 * i + !IsLeft); 3713 else 3714 // Move the upper half of the vector to the lower half. 3715 for (unsigned i = 0; i != NumEltsToRdx; ++i) 3716 ShuffleMask[i] = Builder.getInt32(NumEltsToRdx + i); 3717 3718 return ConstantVector::get(ShuffleMask); 3719 } 3720 3721 3722 /// Model horizontal reductions. 3723 /// 3724 /// A horizontal reduction is a tree of reduction operations (currently add and 3725 /// fadd) that has operations that can be put into a vector as its leaf. 3726 /// For example, this tree: 3727 /// 3728 /// mul mul mul mul 3729 /// \ / \ / 3730 /// + + 3731 /// \ / 3732 /// + 3733 /// This tree has "mul" as its reduced values and "+" as its reduction 3734 /// operations. A reduction might be feeding into a store or a binary operation 3735 /// feeding a phi. 3736 /// ... 3737 /// \ / 3738 /// + 3739 /// | 3740 /// phi += 3741 /// 3742 /// Or: 3743 /// ... 
3744 /// \ / 3745 /// + 3746 /// | 3747 /// *p = 3748 /// 3749 class HorizontalReduction { 3750 SmallVector<Value *, 16> ReductionOps; 3751 SmallVector<Value *, 32> ReducedVals; 3752 3753 BinaryOperator *ReductionRoot; 3754 PHINode *ReductionPHI; 3755 3756 /// The opcode of the reduction. 3757 unsigned ReductionOpcode; 3758 /// The opcode of the values we perform a reduction on. 3759 unsigned ReducedValueOpcode; 3760 /// Should we model this reduction as a pairwise reduction tree or a tree that 3761 /// splits the vector in halves and adds those halves. 3762 bool IsPairwiseReduction; 3763 3764 public: 3765 /// The width of one full horizontal reduction operation. 3766 unsigned ReduxWidth; 3767 3768 HorizontalReduction() 3769 : ReductionRoot(nullptr), ReductionPHI(nullptr), ReductionOpcode(0), 3770 ReducedValueOpcode(0), IsPairwiseReduction(false), ReduxWidth(0) {} 3771 3772 /// \brief Try to find a reduction tree. 3773 bool matchAssociativeReduction(PHINode *Phi, BinaryOperator *B) { 3774 assert((!Phi || 3775 std::find(Phi->op_begin(), Phi->op_end(), B) != Phi->op_end()) && 3776 "Thi phi needs to use the binary operator"); 3777 3778 // We could have a initial reductions that is not an add. 3779 // r *= v1 + v2 + v3 + v4 3780 // In such a case start looking for a tree rooted in the first '+'. 3781 if (Phi) { 3782 if (B->getOperand(0) == Phi) { 3783 Phi = nullptr; 3784 B = dyn_cast<BinaryOperator>(B->getOperand(1)); 3785 } else if (B->getOperand(1) == Phi) { 3786 Phi = nullptr; 3787 B = dyn_cast<BinaryOperator>(B->getOperand(0)); 3788 } 3789 } 3790 3791 if (!B) 3792 return false; 3793 3794 Type *Ty = B->getType(); 3795 if (!isValidElementType(Ty)) 3796 return false; 3797 3798 const DataLayout &DL = B->getModule()->getDataLayout(); 3799 ReductionOpcode = B->getOpcode(); 3800 ReducedValueOpcode = 0; 3801 // FIXME: Register size should be a parameter to this function, so we can 3802 // try different vectorization factors. 
3803 ReduxWidth = MinVecRegSize / DL.getTypeSizeInBits(Ty); 3804 ReductionRoot = B; 3805 ReductionPHI = Phi; 3806 3807 if (ReduxWidth < 4) 3808 return false; 3809 3810 // We currently only support adds. 3811 if (ReductionOpcode != Instruction::Add && 3812 ReductionOpcode != Instruction::FAdd) 3813 return false; 3814 3815 // Post order traverse the reduction tree starting at B. We only handle true 3816 // trees containing only binary operators or selects. 3817 SmallVector<std::pair<Instruction *, unsigned>, 32> Stack; 3818 Stack.push_back(std::make_pair(B, 0)); 3819 while (!Stack.empty()) { 3820 Instruction *TreeN = Stack.back().first; 3821 unsigned EdgeToVist = Stack.back().second++; 3822 bool IsReducedValue = TreeN->getOpcode() != ReductionOpcode; 3823 3824 // Only handle trees in the current basic block. 3825 if (TreeN->getParent() != B->getParent()) 3826 return false; 3827 3828 // Each tree node needs to have one user except for the ultimate 3829 // reduction. 3830 if (!TreeN->hasOneUse() && TreeN != B) 3831 return false; 3832 3833 // Postorder vist. 3834 if (EdgeToVist == 2 || IsReducedValue) { 3835 if (IsReducedValue) { 3836 // Make sure that the opcodes of the operations that we are going to 3837 // reduce match. 3838 if (!ReducedValueOpcode) 3839 ReducedValueOpcode = TreeN->getOpcode(); 3840 else if (ReducedValueOpcode != TreeN->getOpcode()) 3841 return false; 3842 ReducedVals.push_back(TreeN); 3843 } else { 3844 // We need to be able to reassociate the adds. 3845 if (!TreeN->isAssociative()) 3846 return false; 3847 ReductionOps.push_back(TreeN); 3848 } 3849 // Retract. 3850 Stack.pop_back(); 3851 continue; 3852 } 3853 3854 // Visit left or right. 3855 Value *NextV = TreeN->getOperand(EdgeToVist); 3856 // We currently only allow BinaryOperator's and SelectInst's as reduction 3857 // values in our tree. 
3858 if (isa<BinaryOperator>(NextV) || isa<SelectInst>(NextV)) 3859 Stack.push_back(std::make_pair(cast<Instruction>(NextV), 0)); 3860 else if (NextV != Phi) 3861 return false; 3862 } 3863 return true; 3864 } 3865 3866 /// \brief Attempt to vectorize the tree found by 3867 /// matchAssociativeReduction. 3868 bool tryToReduce(BoUpSLP &V, TargetTransformInfo *TTI) { 3869 if (ReducedVals.empty()) 3870 return false; 3871 3872 unsigned NumReducedVals = ReducedVals.size(); 3873 if (NumReducedVals < ReduxWidth) 3874 return false; 3875 3876 Value *VectorizedTree = nullptr; 3877 IRBuilder<> Builder(ReductionRoot); 3878 FastMathFlags Unsafe; 3879 Unsafe.setUnsafeAlgebra(); 3880 Builder.setFastMathFlags(Unsafe); 3881 unsigned i = 0; 3882 3883 for (; i < NumReducedVals - ReduxWidth + 1; i += ReduxWidth) { 3884 V.buildTree(makeArrayRef(&ReducedVals[i], ReduxWidth), ReductionOps); 3885 3886 // Estimate cost. 3887 int Cost = V.getTreeCost() + getReductionCost(TTI, ReducedVals[i]); 3888 if (Cost >= -SLPCostThreshold) 3889 break; 3890 3891 DEBUG(dbgs() << "SLP: Vectorizing horizontal reduction at cost:" << Cost 3892 << ". (HorRdx)\n"); 3893 3894 // Vectorize a tree. 3895 DebugLoc Loc = cast<Instruction>(ReducedVals[i])->getDebugLoc(); 3896 Value *VectorizedRoot = V.vectorizeTree(); 3897 3898 // Emit a reduction. 3899 Value *ReducedSubTree = emitReduction(VectorizedRoot, Builder); 3900 if (VectorizedTree) { 3901 Builder.SetCurrentDebugLocation(Loc); 3902 VectorizedTree = createBinOp(Builder, ReductionOpcode, VectorizedTree, 3903 ReducedSubTree, "bin.rdx"); 3904 } else 3905 VectorizedTree = ReducedSubTree; 3906 } 3907 3908 if (VectorizedTree) { 3909 // Finish the reduction. 3910 for (; i < NumReducedVals; ++i) { 3911 Builder.SetCurrentDebugLocation( 3912 cast<Instruction>(ReducedVals[i])->getDebugLoc()); 3913 VectorizedTree = createBinOp(Builder, ReductionOpcode, VectorizedTree, 3914 ReducedVals[i]); 3915 } 3916 // Update users. 
  /// \brief Attempt to vectorize the tree found by
  /// matchAssociativeReduction.
  ///
  /// Vectorizes the matched leaves in chunks of ReduxWidth, folds any
  /// leftover scalar leaves into the result, and rewires the reduction root
  /// (or replaces its uses) with the reduced value.
  /// \returns true if any vector code was emitted.
  bool tryToReduce(BoUpSLP &V, TargetTransformInfo *TTI) {
    if (ReducedVals.empty())
      return false;

    // Not enough leaves to fill even a single vector; give up.
    unsigned NumReducedVals = ReducedVals.size();
    if (NumReducedVals < ReduxWidth)
      return false;

    Value *VectorizedTree = nullptr;
    IRBuilder<> Builder(ReductionRoot);
    // Reassociating an FP reduction requires unsafe-algebra fast-math flags
    // on the emitted instructions.
    FastMathFlags Unsafe;
    Unsafe.setUnsafeAlgebra();
    Builder.setFastMathFlags(Unsafe);
    unsigned i = 0;

    // Vectorize the leaves one full ReduxWidth-wide chunk at a time; `i` is
    // left pointing at the first leaf not consumed by a chunk.
    for (; i < NumReducedVals - ReduxWidth + 1; i += ReduxWidth) {
      V.buildTree(makeArrayRef(&ReducedVals[i], ReduxWidth), ReductionOps);

      // Estimate cost.
      int Cost = V.getTreeCost() + getReductionCost(TTI, ReducedVals[i]);
      if (Cost >= -SLPCostThreshold)
        break;

      DEBUG(dbgs() << "SLP: Vectorizing horizontal reduction at cost:" << Cost
                   << ". (HorRdx)\n");

      // Vectorize a tree.
      DebugLoc Loc = cast<Instruction>(ReducedVals[i])->getDebugLoc();
      Value *VectorizedRoot = V.vectorizeTree();

      // Emit a reduction.
      Value *ReducedSubTree = emitReduction(VectorizedRoot, Builder);
      // Chain this chunk's result onto the running total.
      if (VectorizedTree) {
        Builder.SetCurrentDebugLocation(Loc);
        VectorizedTree = createBinOp(Builder, ReductionOpcode, VectorizedTree,
                                     ReducedSubTree, "bin.rdx");
      } else
        VectorizedTree = ReducedSubTree;
    }

    if (VectorizedTree) {
      // Finish the reduction: fold in the scalar leaves that did not fill a
      // complete chunk.
      for (; i < NumReducedVals; ++i) {
        Builder.SetCurrentDebugLocation(
            cast<Instruction>(ReducedVals[i])->getDebugLoc());
        VectorizedTree = createBinOp(Builder, ReductionOpcode, VectorizedTree,
                                     ReducedVals[i]);
      }
      // Update users.
      if (ReductionPHI) {
        // Keep the loop-carried cycle intact: the root now combines the
        // reduced value with the phi.
        assert(ReductionRoot && "Need a reduction operation");
        ReductionRoot->setOperand(0, VectorizedTree);
        ReductionRoot->setOperand(1, ReductionPHI);
      } else
        ReductionRoot->replaceAllUsesWith(VectorizedTree);
    }
    return VectorizedTree != nullptr;
  }
3963 Value *emitReduction(Value *VectorizedValue, IRBuilder<> &Builder) { 3964 assert(VectorizedValue && "Need to have a vectorized tree node"); 3965 assert(isPowerOf2_32(ReduxWidth) && 3966 "We only handle power-of-two reductions for now"); 3967 3968 Value *TmpVec = VectorizedValue; 3969 for (unsigned i = ReduxWidth / 2; i != 0; i >>= 1) { 3970 if (IsPairwiseReduction) { 3971 Value *LeftMask = 3972 createRdxShuffleMask(ReduxWidth, i, true, true, Builder); 3973 Value *RightMask = 3974 createRdxShuffleMask(ReduxWidth, i, true, false, Builder); 3975 3976 Value *LeftShuf = Builder.CreateShuffleVector( 3977 TmpVec, UndefValue::get(TmpVec->getType()), LeftMask, "rdx.shuf.l"); 3978 Value *RightShuf = Builder.CreateShuffleVector( 3979 TmpVec, UndefValue::get(TmpVec->getType()), (RightMask), 3980 "rdx.shuf.r"); 3981 TmpVec = createBinOp(Builder, ReductionOpcode, LeftShuf, RightShuf, 3982 "bin.rdx"); 3983 } else { 3984 Value *UpperHalf = 3985 createRdxShuffleMask(ReduxWidth, i, false, false, Builder); 3986 Value *Shuf = Builder.CreateShuffleVector( 3987 TmpVec, UndefValue::get(TmpVec->getType()), UpperHalf, "rdx.shuf"); 3988 TmpVec = createBinOp(Builder, ReductionOpcode, TmpVec, Shuf, "bin.rdx"); 3989 } 3990 } 3991 3992 // The result is in the first element of the vector. 
3993 return Builder.CreateExtractElement(TmpVec, Builder.getInt32(0)); 3994 } 3995 }; 3996 3997 /// \brief Recognize construction of vectors like 3998 /// %ra = insertelement <4 x float> undef, float %s0, i32 0 3999 /// %rb = insertelement <4 x float> %ra, float %s1, i32 1 4000 /// %rc = insertelement <4 x float> %rb, float %s2, i32 2 4001 /// %rd = insertelement <4 x float> %rc, float %s3, i32 3 4002 /// 4003 /// Returns true if it matches 4004 /// 4005 static bool findBuildVector(InsertElementInst *FirstInsertElem, 4006 SmallVectorImpl<Value *> &BuildVector, 4007 SmallVectorImpl<Value *> &BuildVectorOpds) { 4008 if (!isa<UndefValue>(FirstInsertElem->getOperand(0))) 4009 return false; 4010 4011 InsertElementInst *IE = FirstInsertElem; 4012 while (true) { 4013 BuildVector.push_back(IE); 4014 BuildVectorOpds.push_back(IE->getOperand(1)); 4015 4016 if (IE->use_empty()) 4017 return false; 4018 4019 InsertElementInst *NextUse = dyn_cast<InsertElementInst>(IE->user_back()); 4020 if (!NextUse) 4021 return true; 4022 4023 // If this isn't the final use, make sure the next insertelement is the only 4024 // use. It's OK if the final constructed vector is used multiple times 4025 if (!IE->hasOneUse()) 4026 return false; 4027 4028 IE = NextUse; 4029 } 4030 4031 return false; 4032 } 4033 4034 static bool PhiTypeSorterFunc(Value *V, Value *V2) { 4035 return V->getType() < V2->getType(); 4036 } 4037 4038 /// \brief Try and get a reduction value from a phi node. 4039 /// 4040 /// Given a phi node \p P in a block \p ParentBB, consider possible reductions 4041 /// if they come from either \p ParentBB or a containing loop latch. 4042 /// 4043 /// \returns A candidate reduction value if possible, or \code nullptr \endcode 4044 /// if not possible. 4045 static Value *getReductionValue(const DominatorTree *DT, PHINode *P, 4046 BasicBlock *ParentBB, LoopInfo *LI) { 4047 // There are situations where the reduction value is not dominated by the 4048 // reduction phi. 
Vectorizing such cases has been reported to cause 4049 // miscompiles. See PR25787. 4050 auto DominatedReduxValue = [&](Value *R) { 4051 return ( 4052 dyn_cast<Instruction>(R) && 4053 DT->dominates(P->getParent(), dyn_cast<Instruction>(R)->getParent())); 4054 }; 4055 4056 Value *Rdx = nullptr; 4057 4058 // Return the incoming value if it comes from the same BB as the phi node. 4059 if (P->getIncomingBlock(0) == ParentBB) { 4060 Rdx = P->getIncomingValue(0); 4061 } else if (P->getIncomingBlock(1) == ParentBB) { 4062 Rdx = P->getIncomingValue(1); 4063 } 4064 4065 if (Rdx && DominatedReduxValue(Rdx)) 4066 return Rdx; 4067 4068 // Otherwise, check whether we have a loop latch to look at. 4069 Loop *BBL = LI->getLoopFor(ParentBB); 4070 if (!BBL) 4071 return nullptr; 4072 BasicBlock *BBLatch = BBL->getLoopLatch(); 4073 if (!BBLatch) 4074 return nullptr; 4075 4076 // There is a loop latch, return the incoming value if it comes from 4077 // that. This reduction pattern occassionaly turns up. 4078 if (P->getIncomingBlock(0) == BBLatch) { 4079 Rdx = P->getIncomingValue(0); 4080 } else if (P->getIncomingBlock(1) == BBLatch) { 4081 Rdx = P->getIncomingValue(1); 4082 } 4083 4084 if (Rdx && DominatedReduxValue(Rdx)) 4085 return Rdx; 4086 4087 return nullptr; 4088 } 4089 4090 /// \brief Attempt to reduce a horizontal reduction. 4091 /// If it is legal to match a horizontal reduction feeding 4092 /// the phi node P with reduction operators BI, then check if it 4093 /// can be done. 4094 /// \returns true if a horizontal reduction was matched and reduced. 4095 /// \returns false if a horizontal reduction was not matched. 
/// \brief Attempt to reduce a horizontal reduction.
/// If it is legal to match a horizontal reduction feeding
/// the phi node P with reduction operators BI, then check if it
/// can be done.
/// \returns true if a horizontal reduction was matched and reduced.
/// \returns false if a horizontal reduction was not matched.
static bool canMatchHorizontalReduction(PHINode *P, BinaryOperator *BI,
                                        BoUpSLP &R, TargetTransformInfo *TTI) {
  // Horizontal-reduction vectorization can be disabled on the command line.
  if (!ShouldVectorizeHor)
    return false;

  HorizontalReduction HorRdx;
  if (!HorRdx.matchAssociativeReduction(P, BI))
    return false;

  // If there is a sufficient number of reduction values, reduce
  // to a nearby power-of-2. Can safely generate oversized
  // vectors and rely on the backend to split them to legal sizes.
  HorRdx.ReduxWidth =
      std::max((uint64_t)4, PowerOf2Floor(HorRdx.numReductionValues()));

  return HorRdx.tryToReduce(R, TTI);
}

/// \brief Scan block \p BB for vectorization opportunities: phi bundles,
/// horizontal reductions feeding phis/stores/returns, compare operands, and
/// build-vector (insertelement) sequences.
///
/// NOTE: whenever vectorization succeeds inside the main scan loop, the
/// iterators are reset to BB->begin()/BB->end() because instructions may have
/// been erased; VisitedInstrs prevents re-processing on the rescan.
/// \returns true if any code in \p BB was vectorized.
bool SLPVectorizer::vectorizeChainsInBlock(BasicBlock *BB, BoUpSLP &R) {
  bool Changed = false;
  SmallVector<Value *, 4> Incoming;
  SmallSet<Value *, 16> VisitedInstrs;

  // Repeat until no more phi bundles can be vectorized.
  bool HaveVectorizedPhiNodes = true;
  while (HaveVectorizedPhiNodes) {
    HaveVectorizedPhiNodes = false;

    // Collect the incoming values from the PHIs.
    Incoming.clear();
    for (Instruction &I : *BB) {
      // PHIs are grouped at the top of the block, so stop at the first
      // non-phi.
      PHINode *P = dyn_cast<PHINode>(&I);
      if (!P)
        break;

      if (!VisitedInstrs.count(P))
        Incoming.push_back(P);
    }

    // Sort by type.
    std::stable_sort(Incoming.begin(), Incoming.end(), PhiTypeSorterFunc);

    // Try to vectorize elements based on their type.
    for (SmallVector<Value *, 4>::iterator IncIt = Incoming.begin(),
                                           E = Incoming.end();
         IncIt != E;) {

      // Look for the next elements with the same type.
      SmallVector<Value *, 4>::iterator SameTypeIt = IncIt;
      while (SameTypeIt != E &&
             (*SameTypeIt)->getType() == (*IncIt)->getType()) {
        VisitedInstrs.insert(*SameTypeIt);
        ++SameTypeIt;
      }

      // Try to vectorize them.
      unsigned NumElts = (SameTypeIt - IncIt);
      DEBUG(errs() << "SLP: Trying to vectorize starting at PHIs (" << NumElts << ")\n");
      if (NumElts > 1 && tryToVectorizeList(makeArrayRef(IncIt, NumElts), R)) {
        // Success start over because instructions might have been changed.
        HaveVectorizedPhiNodes = true;
        Changed = true;
        break;
      }

      // Start over at the next instruction of a different type (or the end).
      IncIt = SameTypeIt;
    }
  }

  VisitedInstrs.clear();

  for (BasicBlock::iterator it = BB->begin(), e = BB->end(); it != e; it++) {
    // We may go through BB multiple times so skip the one we have checked.
    if (!VisitedInstrs.insert(&*it).second)
      continue;

    if (isa<DbgInfoIntrinsic>(it))
      continue;

    // Try to vectorize reductions that use PHINodes.
    if (PHINode *P = dyn_cast<PHINode>(it)) {
      // Check that the PHI is a reduction PHI.
      if (P->getNumIncomingValues() != 2)
        return Changed;

      Value *Rdx = getReductionValue(DT, P, BB, LI);

      // Check if this is a Binary Operator.
      BinaryOperator *BI = dyn_cast_or_null<BinaryOperator>(Rdx);
      if (!BI)
        continue;

      // Try to match and vectorize a horizontal reduction.
      if (canMatchHorizontalReduction(P, BI, R, TTI)) {
        Changed = true;
        it = BB->begin();
        e = BB->end();
        continue;
      }

      // Otherwise try the non-phi operand of the binary operator.
      Value *Inst = BI->getOperand(0);
      if (Inst == P)
        Inst = BI->getOperand(1);

      if (tryToVectorize(dyn_cast<BinaryOperator>(Inst), R)) {
        // We would like to start over since some instructions are deleted
        // and the iterator may become invalid value.
        Changed = true;
        it = BB->begin();
        e = BB->end();
        continue;
      }

      continue;
    }

    // Try to vectorize horizontal reductions feeding into a store.
    if (ShouldStartVectorizeHorAtStore)
      if (StoreInst *SI = dyn_cast<StoreInst>(it))
        if (BinaryOperator *BinOp =
                dyn_cast<BinaryOperator>(SI->getValueOperand())) {
          if (canMatchHorizontalReduction(nullptr, BinOp, R, TTI) ||
              tryToVectorize(BinOp, R)) {
            Changed = true;
            it = BB->begin();
            e = BB->end();
            continue;
          }
        }

    // Try to vectorize horizontal reductions feeding into a return.
    if (ReturnInst *RI = dyn_cast<ReturnInst>(it))
      if (RI->getNumOperands() != 0)
        if (BinaryOperator *BinOp =
                dyn_cast<BinaryOperator>(RI->getOperand(0))) {
          DEBUG(dbgs() << "SLP: Found a return to vectorize.\n");
          if (tryToVectorizePair(BinOp->getOperand(0),
                                 BinOp->getOperand(1), R)) {
            Changed = true;
            it = BB->begin();
            e = BB->end();
            continue;
          }
        }

    // Try to vectorize trees that start at compare instructions.
    if (CmpInst *CI = dyn_cast<CmpInst>(it)) {
      if (tryToVectorizePair(CI->getOperand(0), CI->getOperand(1), R)) {
        Changed = true;
        // We would like to start over since some instructions are deleted
        // and the iterator may become invalid value.
        it = BB->begin();
        e = BB->end();
        continue;
      }

      // Also try each binary-operator operand of the compare.
      for (int i = 0; i < 2; ++i) {
        if (BinaryOperator *BI = dyn_cast<BinaryOperator>(CI->getOperand(i))) {
          if (tryToVectorizePair(BI->getOperand(0), BI->getOperand(1), R)) {
            Changed = true;
            // We would like to start over since some instructions are deleted
            // and the iterator may become invalid value.
            it = BB->begin();
            e = BB->end();
            break;
          }
        }
      }
      continue;
    }

    // Try to vectorize trees that start at insertelement instructions.
    if (InsertElementInst *FirstInsertElem = dyn_cast<InsertElementInst>(it)) {
      SmallVector<Value *, 16> BuildVector;
      SmallVector<Value *, 16> BuildVectorOpds;
      if (!findBuildVector(FirstInsertElem, BuildVector, BuildVectorOpds))
        continue;

      // Vectorize starting with the build vector operands ignoring the
      // BuildVector instructions for the purpose of scheduling and user
      // extraction.
      if (tryToVectorizeList(BuildVectorOpds, R, BuildVector)) {
        Changed = true;
        it = BB->begin();
        e = BB->end();
      }

      continue;
    }
  }

  return Changed;
}
/// \brief Try to vectorize the index computations of getelementptr
/// instructions previously collected into the GEPs map, bucketed by base
/// pointer.
/// \returns true if any index bundle was vectorized.
bool SLPVectorizer::vectorizeGEPIndices(BasicBlock *BB, BoUpSLP &R) {
  auto Changed = false;
  for (auto &Entry : GEPs) {

    // If the getelementptr list has fewer than two elements, there's nothing
    // to do.
    if (Entry.second.size() < 2)
      continue;

    DEBUG(dbgs() << "SLP: Analyzing a getelementptr list of length "
                 << Entry.second.size() << ".\n");

    // We process the getelementptr list in chunks of 16 (like we do for
    // stores) to minimize compile-time.
    for (unsigned BI = 0, BE = Entry.second.size(); BI < BE; BI += 16) {
      auto Len = std::min<unsigned>(BE - BI, 16);
      auto GEPList = makeArrayRef(&Entry.second[BI], Len);

      // Initialize a set of candidate getelementptrs. Note that we use a
      // SetVector here to preserve program order. If the index computations
      // are vectorizable and begin with loads, we want to minimize the chance
      // of having to reorder them later.
      SetVector<Value *> Candidates(GEPList.begin(), GEPList.end());

      // Some of the candidates may have already been vectorized after we
      // initially collected them. If so, the WeakVHs will have nullified the
      // values, so remove them from the set of candidates.
      Candidates.remove(nullptr);

      // Remove from the set of candidates all pairs of getelementptrs with
      // constant differences. Such getelementptrs are likely not good
      // candidates for vectorization in a bottom-up phase since one can be
      // computed from the other. We also ensure all candidate getelementptr
      // indices are unique.
      for (int I = 0, E = GEPList.size(); I < E && Candidates.size() > 1; ++I) {
        auto *GEPI = cast<GetElementPtrInst>(GEPList[I]);
        if (!Candidates.count(GEPI))
          continue;
        auto *SCEVI = SE->getSCEV(GEPList[I]);
        for (int J = I + 1; J < E && Candidates.size() > 1; ++J) {
          auto *GEPJ = cast<GetElementPtrInst>(GEPList[J]);
          auto *SCEVJ = SE->getSCEV(GEPList[J]);
          if (isa<SCEVConstant>(SE->getMinusSCEV(SCEVI, SCEVJ))) {
            // The two GEPs differ by a constant offset; drop both.
            Candidates.remove(GEPList[I]);
            Candidates.remove(GEPList[J]);
          } else if (GEPI->idx_begin()->get() == GEPJ->idx_begin()->get()) {
            // Duplicate index value; keep only the first occurrence.
            Candidates.remove(GEPList[J]);
          }
        }
      }

      // We break out of the above computation as soon as we know there are
      // fewer than two candidates remaining.
      if (Candidates.size() < 2)
        continue;

      // Add the single, non-constant index of each candidate to the bundle. We
      // ensured the indices met these constraints when we originally collected
      // the getelementptrs.
      SmallVector<Value *, 16> Bundle(Candidates.size());
      auto BundleIndex = 0u;
      for (auto *V : Candidates) {
        auto *GEP = cast<GetElementPtrInst>(V);
        auto *GEPIdx = GEP->idx_begin()->get();
        assert(GEP->getNumIndices() == 1 || !isa<Constant>(GEPIdx));
        Bundle[BundleIndex++] = GEPIdx;
      }

      // Try and vectorize the indices. We are currently only interested in
      // gather-like cases of the form:
      //
      // ... = g[a[0] - b[0]] + g[a[1] - b[1]] + ...
      //
      // where the loads of "a", the loads of "b", and the subtractions can be
      // performed in parallel. It's likely that detecting this pattern in a
      // bottom-up phase will be simpler and less costly than building a
      // full-blown top-down phase beginning at the consecutive loads.
      Changed |= tryToVectorizeList(Bundle, R);
    }
  }
  return Changed;
}
It's likely that detecting this pattern in a 4364 // bottom-up phase will be simpler and less costly than building a 4365 // full-blown top-down phase beginning at the consecutive loads. 4366 Changed |= tryToVectorizeList(Bundle, R); 4367 } 4368 } 4369 return Changed; 4370 } 4371 4372 bool SLPVectorizer::vectorizeStoreChains(BoUpSLP &R) { 4373 bool Changed = false; 4374 // Attempt to sort and vectorize each of the store-groups. 4375 for (StoreListMap::iterator it = Stores.begin(), e = Stores.end(); it != e; 4376 ++it) { 4377 if (it->second.size() < 2) 4378 continue; 4379 4380 DEBUG(dbgs() << "SLP: Analyzing a store chain of length " 4381 << it->second.size() << ".\n"); 4382 4383 // Process the stores in chunks of 16. 4384 // TODO: The limit of 16 inhibits greater vectorization factors. 4385 // For example, AVX2 supports v32i8. Increasing this limit, however, 4386 // may cause a significant compile-time increase. 4387 for (unsigned CI = 0, CE = it->second.size(); CI < CE; CI+=16) { 4388 unsigned Len = std::min<unsigned>(CE - CI, 16); 4389 Changed |= vectorizeStores(makeArrayRef(&it->second[CI], Len), 4390 -SLPCostThreshold, R); 4391 } 4392 } 4393 return Changed; 4394 } 4395 4396 } // end anonymous namespace 4397 4398 char SLPVectorizer::ID = 0; 4399 static const char lv_name[] = "SLP Vectorizer"; 4400 INITIALIZE_PASS_BEGIN(SLPVectorizer, SV_NAME, lv_name, false, false) 4401 INITIALIZE_PASS_DEPENDENCY(AAResultsWrapperPass) 4402 INITIALIZE_PASS_DEPENDENCY(TargetTransformInfoWrapperPass) 4403 INITIALIZE_PASS_DEPENDENCY(AssumptionCacheTracker) 4404 INITIALIZE_PASS_DEPENDENCY(ScalarEvolutionWrapperPass) 4405 INITIALIZE_PASS_DEPENDENCY(LoopSimplify) 4406 INITIALIZE_PASS_END(SLPVectorizer, SV_NAME, lv_name, false, false) 4407 4408 namespace llvm { 4409 Pass *createSLPVectorizerPass() { return new SLPVectorizer(); } 4410 } 4411