//===- SLPVectorizer.cpp - A bottom up SLP Vectorizer ---------------------===//
//
//                     The LLVM Compiler Infrastructure
//
// This file is distributed under the University of Illinois Open Source
// License. See LICENSE.TXT for details.
//
//===----------------------------------------------------------------------===//
// This pass implements the Bottom Up SLP vectorizer. It detects consecutive
// stores that can be put together into vector-stores. Next, it attempts to
// construct a vectorizable tree using the use-def chains. If a profitable tree
// is found, the SLP vectorizer performs vectorization on the tree.
//
// The pass is inspired by the work described in the paper:
//  "Loop-Aware SLP in GCC" by Ira Rosen, Dorit Nuzman, Ayal Zaks.
//
//===----------------------------------------------------------------------===//
#include "llvm/Transforms/Vectorize.h"
#include "llvm/ADT/MapVector.h"
#include "llvm/ADT/Optional.h"
#include "llvm/ADT/PostOrderIterator.h"
#include "llvm/ADT/SetVector.h"
#include "llvm/ADT/Statistic.h"
#include "llvm/Analysis/AliasAnalysis.h"
#include "llvm/Analysis/AssumptionCache.h"
#include "llvm/Analysis/CodeMetrics.h"
#include "llvm/Analysis/LoopInfo.h"
#include "llvm/Analysis/ScalarEvolution.h"
#include "llvm/Analysis/ScalarEvolutionExpressions.h"
#include "llvm/Analysis/TargetTransformInfo.h"
#include "llvm/Analysis/ValueTracking.h"
#include "llvm/IR/DataLayout.h"
#include "llvm/IR/Dominators.h"
#include "llvm/IR/IRBuilder.h"
#include "llvm/IR/Instructions.h"
#include "llvm/IR/IntrinsicInst.h"
#include "llvm/IR/Module.h"
#include "llvm/IR/NoFolder.h"
#include "llvm/IR/Type.h"
#include "llvm/IR/Value.h"
#include "llvm/IR/Verifier.h"
#include "llvm/Pass.h"
#include "llvm/Support/CommandLine.h"
#include "llvm/Support/Debug.h"
#include "llvm/Support/raw_ostream.h"
#include "llvm/Transforms/Utils/VectorUtils.h"
#include <algorithm>
#include <map>
#include <memory>

using namespace llvm;

#define SV_NAME "slp-vectorizer"
#define DEBUG_TYPE "SLP"

STATISTIC(NumVectorInstructions, "Number of vector instructions generated");

static cl::opt<int>
    SLPCostThreshold("slp-threshold", cl::init(0), cl::Hidden,
                     cl::desc("Only vectorize if you gain more than this "
                              "number"));

static cl::opt<bool>
ShouldVectorizeHor("slp-vectorize-hor", cl::init(false), cl::Hidden,
                   cl::desc("Attempt to vectorize horizontal reductions"));

static cl::opt<bool> ShouldStartVectorizeHorAtStore(
    "slp-vectorize-hor-store", cl::init(false), cl::Hidden,
    cl::desc(
        "Attempt to vectorize horizontal reductions feeding into a store"));

namespace {

static const unsigned MinVecRegSize = 128;

static const unsigned RecursionMaxDepth = 12;

// Limit the number of alias checks. The limit is chosen so that
// it has no negative effect on the LLVM benchmarks.
static const unsigned AliasedCheckLimit = 10;

// Another limit for the alias checks: The maximum distance between load/store
// instructions where alias checks are done.
// This limit is useful for very large basic blocks.
static const unsigned MaxMemDepDistance = 160;

/// \returns the parent basic block if all of the instructions in \p VL
/// are in the same block or null otherwise.
static BasicBlock *getSameBlock(ArrayRef<Value *> VL) {
  Instruction *I0 = dyn_cast<Instruction>(VL[0]);
  if (!I0)
    return nullptr;
  BasicBlock *BB = I0->getParent();
  for (int i = 1, e = VL.size(); i < e; i++) {
    Instruction *I = dyn_cast<Instruction>(VL[i]);
    if (!I)
      return nullptr;

    if (BB != I->getParent())
      return nullptr;
  }
  return BB;
}

/// \returns True if all of the values in \p VL are constants.
static bool allConstant(ArrayRef<Value *> VL) {
  for (unsigned i = 0, e = VL.size(); i < e; ++i)
    if (!isa<Constant>(VL[i]))
      return false;
  return true;
}

/// \returns True if all of the values in \p VL are identical.
static bool isSplat(ArrayRef<Value *> VL) {
  for (unsigned i = 1, e = VL.size(); i < e; ++i)
    if (VL[i] != VL[0])
      return false;
  return true;
}

/// \returns the opcode that can be paired with \p Op to create an alternate
/// sequence which can later be merged as a ShuffleVector instruction.
static unsigned getAltOpcode(unsigned Op) {
  switch (Op) {
  case Instruction::FAdd:
    return Instruction::FSub;
  case Instruction::FSub:
    return Instruction::FAdd;
  case Instruction::Add:
    return Instruction::Sub;
  case Instruction::Sub:
    return Instruction::Add;
  default:
    return 0;
  }
}

/// \returns true if Opcode \p Op can be part of an alternate sequence
/// which can later be merged as a ShuffleVector instruction.
static bool canCombineAsAltInst(unsigned Op) {
  if (Op == Instruction::FAdd || Op == Instruction::FSub ||
      Op == Instruction::Sub || Op == Instruction::Add)
    return true;
  return false;
}

/// \returns Instruction::ShuffleVector if the instructions in \p VL form an
/// alternating fadd/fsub, fsub/fadd, add/sub or sub/add sequence
/// (e.g. opcodes of fadd, fsub, fadd, fsub, ...), or zero otherwise.
static unsigned isAltInst(ArrayRef<Value *> VL) {
  Instruction *I0 = dyn_cast<Instruction>(VL[0]);
  unsigned Opcode = I0->getOpcode();
  unsigned AltOpcode = getAltOpcode(Opcode);
  for (int i = 1, e = VL.size(); i < e; i++) {
    Instruction *I = dyn_cast<Instruction>(VL[i]);
    if (!I || I->getOpcode() != ((i & 1) ? AltOpcode : Opcode))
      return 0;
  }
  return Instruction::ShuffleVector;
}

/// \returns The opcode if all of the Instructions in \p VL have the same
/// opcode, or zero.
static unsigned getSameOpcode(ArrayRef<Value *> VL) {
  Instruction *I0 = dyn_cast<Instruction>(VL[0]);
  if (!I0)
    return 0;
  unsigned Opcode = I0->getOpcode();
  for (int i = 1, e = VL.size(); i < e; i++) {
    Instruction *I = dyn_cast<Instruction>(VL[i]);
    if (!I || Opcode != I->getOpcode()) {
      if (canCombineAsAltInst(Opcode) && i == 1)
        return isAltInst(VL);
      return 0;
    }
  }
  return Opcode;
}

/// Get the intersection (logical and) of all of the potential IR flags
/// of each scalar operation (VL) that will be converted into a vector (I).
/// Flag set: NSW, NUW, exact, and all of fast-math.
static void propagateIRFlags(Value *I, ArrayRef<Value *> VL) {
  if (auto *VecOp = dyn_cast<BinaryOperator>(I)) {
    if (auto *Intersection = dyn_cast<BinaryOperator>(VL[0])) {
      // Intersection is initialized to the 0th scalar,
      // so start counting from index '1'.
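      // Illustrative sketch (not from the original source): if the bundle is
      //   %a = add nuw nsw i32 %x, %y
      //   %b = add nsw i32 %z, %w
      // then only 'nsw' is common to all scalars, so the vector add is tagged
      // 'nsw' and the 'nuw' flag is dropped.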
189 for (int i = 1, e = VL.size(); i < e; ++i) { 190 if (auto *Scalar = dyn_cast<BinaryOperator>(VL[i])) 191 Intersection->andIRFlags(Scalar); 192 } 193 VecOp->copyIRFlags(Intersection); 194 } 195 } 196 } 197 198 /// \returns \p I after propagating metadata from \p VL. 199 static Instruction *propagateMetadata(Instruction *I, ArrayRef<Value *> VL) { 200 Instruction *I0 = cast<Instruction>(VL[0]); 201 SmallVector<std::pair<unsigned, MDNode *>, 4> Metadata; 202 I0->getAllMetadataOtherThanDebugLoc(Metadata); 203 204 for (unsigned i = 0, n = Metadata.size(); i != n; ++i) { 205 unsigned Kind = Metadata[i].first; 206 MDNode *MD = Metadata[i].second; 207 208 for (int i = 1, e = VL.size(); MD && i != e; i++) { 209 Instruction *I = cast<Instruction>(VL[i]); 210 MDNode *IMD = I->getMetadata(Kind); 211 212 switch (Kind) { 213 default: 214 MD = nullptr; // Remove unknown metadata 215 break; 216 case LLVMContext::MD_tbaa: 217 MD = MDNode::getMostGenericTBAA(MD, IMD); 218 break; 219 case LLVMContext::MD_alias_scope: 220 case LLVMContext::MD_noalias: 221 MD = MDNode::intersect(MD, IMD); 222 break; 223 case LLVMContext::MD_fpmath: 224 MD = MDNode::getMostGenericFPMath(MD, IMD); 225 break; 226 } 227 } 228 I->setMetadata(Kind, MD); 229 } 230 return I; 231 } 232 233 /// \returns The type that all of the values in \p VL have or null if there 234 /// are different types. 235 static Type* getSameType(ArrayRef<Value *> VL) { 236 Type *Ty = VL[0]->getType(); 237 for (int i = 1, e = VL.size(); i < e; i++) 238 if (VL[i]->getType() != Ty) 239 return nullptr; 240 241 return Ty; 242 } 243 244 /// \returns True if the ExtractElement instructions in VL can be vectorized 245 /// to use the original vector. 246 static bool CanReuseExtract(ArrayRef<Value *> VL) { 247 assert(Instruction::ExtractElement == getSameOpcode(VL) && "Invalid opcode"); 248 // Check if all of the extracts come from the same vector and from the 249 // correct offset. 250 Value *VL0 = VL[0]; 251 ExtractElementInst *E0 = cast<ExtractElementInst>(VL0); 252 Value *Vec = E0->getOperand(0); 253 254 // We have to extract from the same vector type. 255 unsigned NElts = Vec->getType()->getVectorNumElements(); 256 257 if (NElts != VL.size()) 258 return false; 259 260 // Check that all of the indices extract from the correct offset. 261 ConstantInt *CI = dyn_cast<ConstantInt>(E0->getOperand(1)); 262 if (!CI || CI->getZExtValue()) 263 return false; 264 265 for (unsigned i = 1, e = VL.size(); i < e; ++i) { 266 ExtractElementInst *E = cast<ExtractElementInst>(VL[i]); 267 ConstantInt *CI = dyn_cast<ConstantInt>(E->getOperand(1)); 268 269 if (!CI || CI->getZExtValue() != i || E->getOperand(0) != Vec) 270 return false; 271 } 272 273 return true; 274 } 275 276 /// \returns True if in-tree use also needs extract. This refers to 277 /// possible scalar operand in vectorized instruction. 
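/// Illustrative example (not part of the original comment): a vectorized
/// load or store still addresses memory through its lane-0 scalar pointer,
/// so if that pointer is itself an in-tree vectorized scalar it has to be
/// extracted rather than simply dropped.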
278 static bool InTreeUserNeedToExtract(Value *Scalar, Instruction *UserInst, 279 TargetLibraryInfo *TLI) { 280 281 unsigned Opcode = UserInst->getOpcode(); 282 switch (Opcode) { 283 case Instruction::Load: { 284 LoadInst *LI = cast<LoadInst>(UserInst); 285 return (LI->getPointerOperand() == Scalar); 286 } 287 case Instruction::Store: { 288 StoreInst *SI = cast<StoreInst>(UserInst); 289 return (SI->getPointerOperand() == Scalar); 290 } 291 case Instruction::Call: { 292 CallInst *CI = cast<CallInst>(UserInst); 293 Intrinsic::ID ID = getIntrinsicIDForCall(CI, TLI); 294 if (hasVectorInstrinsicScalarOpd(ID, 1)) { 295 return (CI->getArgOperand(1) == Scalar); 296 } 297 } 298 default: 299 return false; 300 } 301 } 302 303 /// \returns the AA location that is being access by the instruction. 304 static AliasAnalysis::Location getLocation(Instruction *I, AliasAnalysis *AA) { 305 if (StoreInst *SI = dyn_cast<StoreInst>(I)) 306 return AA->getLocation(SI); 307 if (LoadInst *LI = dyn_cast<LoadInst>(I)) 308 return AA->getLocation(LI); 309 return AliasAnalysis::Location(); 310 } 311 312 /// \returns True if the instruction is not a volatile or atomic load/store. 313 static bool isSimple(Instruction *I) { 314 if (LoadInst *LI = dyn_cast<LoadInst>(I)) 315 return LI->isSimple(); 316 if (StoreInst *SI = dyn_cast<StoreInst>(I)) 317 return SI->isSimple(); 318 if (MemIntrinsic *MI = dyn_cast<MemIntrinsic>(I)) 319 return !MI->isVolatile(); 320 return true; 321 } 322 323 /// Bottom Up SLP Vectorizer. 324 class BoUpSLP { 325 public: 326 typedef SmallVector<Value *, 8> ValueList; 327 typedef SmallVector<Instruction *, 16> InstrList; 328 typedef SmallPtrSet<Value *, 16> ValueSet; 329 typedef SmallVector<StoreInst *, 8> StoreList; 330 331 BoUpSLP(Function *Func, ScalarEvolution *Se, const DataLayout *Dl, 332 TargetTransformInfo *Tti, TargetLibraryInfo *TLi, AliasAnalysis *Aa, 333 LoopInfo *Li, DominatorTree *Dt, AssumptionCache *AC) 334 : NumLoadsWantToKeepOrder(0), NumLoadsWantToChangeOrder(0), F(Func), 335 SE(Se), DL(Dl), TTI(Tti), TLI(TLi), AA(Aa), LI(Li), DT(Dt), 336 Builder(Se->getContext()) { 337 CodeMetrics::collectEphemeralValues(F, AC, EphValues); 338 } 339 340 /// \brief Vectorize the tree that starts with the elements in \p VL. 341 /// Returns the vectorized root. 342 Value *vectorizeTree(); 343 344 /// \returns the cost incurred by unwanted spills and fills, caused by 345 /// holding live values over call sites. 346 int getSpillCost(); 347 348 /// \returns the vectorization cost of the subtree that starts at \p VL. 349 /// A negative number means that this is profitable. 350 int getTreeCost(); 351 352 /// Construct a vectorizable tree that starts at \p Roots, ignoring users for 353 /// the purpose of scheduling and extraction in the \p UserIgnoreLst. 354 void buildTree(ArrayRef<Value *> Roots, 355 ArrayRef<Value *> UserIgnoreLst = None); 356 357 /// Clear the internal data structures that are created by 'buildTree'. 358 void deleteTree() { 359 VectorizableTree.clear(); 360 ScalarToTreeEntry.clear(); 361 MustGather.clear(); 362 ExternalUses.clear(); 363 NumLoadsWantToKeepOrder = 0; 364 NumLoadsWantToChangeOrder = 0; 365 for (auto &Iter : BlocksSchedules) { 366 BlockScheduling *BS = Iter.second.get(); 367 BS->clear(); 368 } 369 } 370 371 /// \returns true if the memory operations A and B are consecutive. 372 bool isConsecutiveAccess(Value *A, Value *B); 373 374 /// \brief Perform LICM and CSE on the newly generated gather sequences. 
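  /// Illustration (assumption about the intent): an insertelement chain built
  /// inside a loop from loop-invariant scalars can be hoisted to the loop
  /// pre-header, and identical gather sequences in dominated blocks can be
  /// commoned up.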
  void optimizeGatherSequence();

  /// \returns true if it is beneficial to reverse the vector order.
  bool shouldReorder() const {
    return NumLoadsWantToChangeOrder > NumLoadsWantToKeepOrder;
  }

private:
  struct TreeEntry;

  /// \returns the cost of the vectorizable entry.
  int getEntryCost(TreeEntry *E);

  /// This is the recursive part of buildTree.
  void buildTree_rec(ArrayRef<Value *> Roots, unsigned Depth);

  /// Vectorize a single entry in the tree.
  Value *vectorizeTree(TreeEntry *E);

  /// Vectorize a single entry in the tree, starting in \p VL.
  Value *vectorizeTree(ArrayRef<Value *> VL);

  /// \returns the pointer to the vectorized value if \p VL is already
  /// vectorized, or NULL. This may happen in cycles.
  Value *alreadyVectorized(ArrayRef<Value *> VL) const;

  /// \brief Take the pointer operand from the Load/Store instruction.
  /// \returns NULL if this is not a valid Load/Store instruction.
  static Value *getPointerOperand(Value *I);

  /// \brief Take the address space operand from the Load/Store instruction.
  /// \returns -1 if this is not a valid Load/Store instruction.
  static unsigned getAddressSpaceOperand(Value *I);

  /// \returns the scalarization cost for this type. Scalarization in this
  /// context means the creation of vectors from a group of scalars.
  int getGatherCost(Type *Ty);

  /// \returns the scalarization cost for this list of values. Assuming that
  /// this subtree gets vectorized, we may need to extract the values from the
  /// roots. This method calculates the cost of extracting the values.
  int getGatherCost(ArrayRef<Value *> VL);

  /// \brief Set the Builder insert point to one after the last instruction in
  /// the bundle.
  void setInsertPointAfterBundle(ArrayRef<Value *> VL);

  /// \returns a vector from a collection of scalars in \p VL.
  Value *Gather(ArrayRef<Value *> VL, VectorType *Ty);

  /// \returns whether the VectorizableTree is fully vectorizable and will
  /// be beneficial even when the tree height is tiny.
  bool isFullyVectorizableTinyTree();

  /// \brief Reorder commutative operands in an alt shuffle if doing so can
  /// result in vectorized code.
  void reorderAltShuffleOperands(ArrayRef<Value *> VL,
                                 SmallVectorImpl<Value *> &Left,
                                 SmallVectorImpl<Value *> &Right);
  /// \brief Reorder commutative operands to get a better probability of
  /// generating vectorized code.
  void reorderInputsAccordingToOpcode(ArrayRef<Value *> VL,
                                      SmallVectorImpl<Value *> &Left,
                                      SmallVectorImpl<Value *> &Right);
  struct TreeEntry {
    TreeEntry() : Scalars(), VectorizedValue(nullptr),
                  NeedToGather(0) {}

    /// \returns true if the scalars in VL are equal to this entry.
    bool isSame(ArrayRef<Value *> VL) const {
      assert(VL.size() == Scalars.size() && "Invalid size");
      return std::equal(VL.begin(), VL.end(), Scalars.begin());
    }

    /// A vector of scalars.
    ValueList Scalars;

    /// The Scalars are vectorized into this value. It is initialized to Null.
    Value *VectorizedValue;

    /// Do we need to gather this sequence?
    bool NeedToGather;
  };

  /// Create a new VectorizableTree entry.
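  /// Usage sketch (assumption, mirroring how buildTree_rec uses it):
  /// vectorizable bundles are recorded with newTreeEntry(VL, true), which also
  /// maps every scalar in ScalarToTreeEntry, while bundles that must be
  /// gathered use newTreeEntry(VL, false) and are added to MustGather instead.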
460 TreeEntry *newTreeEntry(ArrayRef<Value *> VL, bool Vectorized) { 461 VectorizableTree.push_back(TreeEntry()); 462 int idx = VectorizableTree.size() - 1; 463 TreeEntry *Last = &VectorizableTree[idx]; 464 Last->Scalars.insert(Last->Scalars.begin(), VL.begin(), VL.end()); 465 Last->NeedToGather = !Vectorized; 466 if (Vectorized) { 467 for (int i = 0, e = VL.size(); i != e; ++i) { 468 assert(!ScalarToTreeEntry.count(VL[i]) && "Scalar already in tree!"); 469 ScalarToTreeEntry[VL[i]] = idx; 470 } 471 } else { 472 MustGather.insert(VL.begin(), VL.end()); 473 } 474 return Last; 475 } 476 477 /// -- Vectorization State -- 478 /// Holds all of the tree entries. 479 std::vector<TreeEntry> VectorizableTree; 480 481 /// Maps a specific scalar to its tree entry. 482 SmallDenseMap<Value*, int> ScalarToTreeEntry; 483 484 /// A list of scalars that we found that we need to keep as scalars. 485 ValueSet MustGather; 486 487 /// This POD struct describes one external user in the vectorized tree. 488 struct ExternalUser { 489 ExternalUser (Value *S, llvm::User *U, int L) : 490 Scalar(S), User(U), Lane(L){}; 491 // Which scalar in our function. 492 Value *Scalar; 493 // Which user that uses the scalar. 494 llvm::User *User; 495 // Which lane does the scalar belong to. 496 int Lane; 497 }; 498 typedef SmallVector<ExternalUser, 16> UserList; 499 500 /// Checks if two instructions may access the same memory. 501 /// 502 /// \p Loc1 is the location of \p Inst1. It is passed explicitly because it 503 /// is invariant in the calling loop. 504 bool isAliased(const AliasAnalysis::Location &Loc1, Instruction *Inst1, 505 Instruction *Inst2) { 506 507 // First check if the result is already in the cache. 508 AliasCacheKey key = std::make_pair(Inst1, Inst2); 509 Optional<bool> &result = AliasCache[key]; 510 if (result.hasValue()) { 511 return result.getValue(); 512 } 513 AliasAnalysis::Location Loc2 = getLocation(Inst2, AA); 514 bool aliased = true; 515 if (Loc1.Ptr && Loc2.Ptr && isSimple(Inst1) && isSimple(Inst2)) { 516 // Do the alias check. 517 aliased = AA->alias(Loc1, Loc2); 518 } 519 // Store the result in the cache. 520 result = aliased; 521 return aliased; 522 } 523 524 typedef std::pair<Instruction *, Instruction *> AliasCacheKey; 525 526 /// Cache for alias results. 527 /// TODO: consider moving this to the AliasAnalysis itself. 528 DenseMap<AliasCacheKey, Optional<bool>> AliasCache; 529 530 /// Removes an instruction from its block and eventually deletes it. 531 /// It's like Instruction::eraseFromParent() except that the actual deletion 532 /// is delayed until BoUpSLP is destructed. 533 /// This is required to ensure that there are no incorrect collisions in the 534 /// AliasCache, which can happen if a new instruction is allocated at the 535 /// same address as a previously deleted instruction. 536 void eraseInstruction(Instruction *I) { 537 I->removeFromParent(); 538 I->dropAllReferences(); 539 DeletedInstructions.push_back(std::unique_ptr<Instruction>(I)); 540 } 541 542 /// Temporary store for deleted instructions. Instructions will be deleted 543 /// eventually when the BoUpSLP is destructed. 544 SmallVector<std::unique_ptr<Instruction>, 8> DeletedInstructions; 545 546 /// A list of values that need to extracted out of the tree. 547 /// This list holds pairs of (Internal Scalar : External User). 548 UserList ExternalUses; 549 550 /// Values used only by @llvm.assume calls. 551 SmallPtrSet<const Value *, 32> EphValues; 552 553 /// Holds all of the instructions that we gathered. 
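  /// (Assumed context: these are the insertelement sequences emitted when a
  /// bundle has to be gathered; optimizeGatherSequence() later hoists and
  /// CSEs them.)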
554 SetVector<Instruction *> GatherSeq; 555 /// A list of blocks that we are going to CSE. 556 SetVector<BasicBlock *> CSEBlocks; 557 558 /// Contains all scheduling relevant data for an instruction. 559 /// A ScheduleData either represents a single instruction or a member of an 560 /// instruction bundle (= a group of instructions which is combined into a 561 /// vector instruction). 562 struct ScheduleData { 563 564 // The initial value for the dependency counters. It means that the 565 // dependencies are not calculated yet. 566 enum { InvalidDeps = -1 }; 567 568 ScheduleData() 569 : Inst(nullptr), FirstInBundle(nullptr), NextInBundle(nullptr), 570 NextLoadStore(nullptr), SchedulingRegionID(0), SchedulingPriority(0), 571 Dependencies(InvalidDeps), UnscheduledDeps(InvalidDeps), 572 UnscheduledDepsInBundle(InvalidDeps), IsScheduled(false) {} 573 574 void init(int BlockSchedulingRegionID) { 575 FirstInBundle = this; 576 NextInBundle = nullptr; 577 NextLoadStore = nullptr; 578 IsScheduled = false; 579 SchedulingRegionID = BlockSchedulingRegionID; 580 UnscheduledDepsInBundle = UnscheduledDeps; 581 clearDependencies(); 582 } 583 584 /// Returns true if the dependency information has been calculated. 585 bool hasValidDependencies() const { return Dependencies != InvalidDeps; } 586 587 /// Returns true for single instructions and for bundle representatives 588 /// (= the head of a bundle). 589 bool isSchedulingEntity() const { return FirstInBundle == this; } 590 591 /// Returns true if it represents an instruction bundle and not only a 592 /// single instruction. 593 bool isPartOfBundle() const { 594 return NextInBundle != nullptr || FirstInBundle != this; 595 } 596 597 /// Returns true if it is ready for scheduling, i.e. it has no more 598 /// unscheduled depending instructions/bundles. 599 bool isReady() const { 600 assert(isSchedulingEntity() && 601 "can't consider non-scheduling entity for ready list"); 602 return UnscheduledDepsInBundle == 0 && !IsScheduled; 603 } 604 605 /// Modifies the number of unscheduled dependencies, also updating it for 606 /// the whole bundle. 607 int incrementUnscheduledDeps(int Incr) { 608 UnscheduledDeps += Incr; 609 return FirstInBundle->UnscheduledDepsInBundle += Incr; 610 } 611 612 /// Sets the number of unscheduled dependencies to the number of 613 /// dependencies. 614 void resetUnscheduledDeps() { 615 incrementUnscheduledDeps(Dependencies - UnscheduledDeps); 616 } 617 618 /// Clears all dependency information. 619 void clearDependencies() { 620 Dependencies = InvalidDeps; 621 resetUnscheduledDeps(); 622 MemoryDependencies.clear(); 623 } 624 625 void dump(raw_ostream &os) const { 626 if (!isSchedulingEntity()) { 627 os << "/ " << *Inst; 628 } else if (NextInBundle) { 629 os << '[' << *Inst; 630 ScheduleData *SD = NextInBundle; 631 while (SD) { 632 os << ';' << *SD->Inst; 633 SD = SD->NextInBundle; 634 } 635 os << ']'; 636 } else { 637 os << *Inst; 638 } 639 } 640 641 Instruction *Inst; 642 643 /// Points to the head in an instruction bundle (and always to this for 644 /// single instructions). 645 ScheduleData *FirstInBundle; 646 647 /// Single linked list of all instructions in a bundle. Null if it is a 648 /// single instruction. 649 ScheduleData *NextInBundle; 650 651 /// Single linked list of all memory instructions (e.g. load, store, call) 652 /// in the block - until the end of the scheduling region. 653 ScheduleData *NextLoadStore; 654 655 /// The dependent memory instructions. 656 /// This list is derived on demand in calculateDependencies(). 
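    /// Illustration (inferred from schedule() below): if this ScheduleData is
    /// a store and a later load in the region may alias it, that load is
    /// recorded here; scheduling the store then decrements the load's
    /// unscheduled-dependency count so it can become ready.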
657 SmallVector<ScheduleData *, 4> MemoryDependencies; 658 659 /// This ScheduleData is in the current scheduling region if this matches 660 /// the current SchedulingRegionID of BlockScheduling. 661 int SchedulingRegionID; 662 663 /// Used for getting a "good" final ordering of instructions. 664 int SchedulingPriority; 665 666 /// The number of dependencies. Constitutes of the number of users of the 667 /// instruction plus the number of dependent memory instructions (if any). 668 /// This value is calculated on demand. 669 /// If InvalidDeps, the number of dependencies is not calculated yet. 670 /// 671 int Dependencies; 672 673 /// The number of dependencies minus the number of dependencies of scheduled 674 /// instructions. As soon as this is zero, the instruction/bundle gets ready 675 /// for scheduling. 676 /// Note that this is negative as long as Dependencies is not calculated. 677 int UnscheduledDeps; 678 679 /// The sum of UnscheduledDeps in a bundle. Equals to UnscheduledDeps for 680 /// single instructions. 681 int UnscheduledDepsInBundle; 682 683 /// True if this instruction is scheduled (or considered as scheduled in the 684 /// dry-run). 685 bool IsScheduled; 686 }; 687 688 #ifndef NDEBUG 689 friend raw_ostream &operator<<(raw_ostream &os, 690 const BoUpSLP::ScheduleData &SD); 691 #endif 692 693 /// Contains all scheduling data for a basic block. 694 /// 695 struct BlockScheduling { 696 697 BlockScheduling(BasicBlock *BB) 698 : BB(BB), ChunkSize(BB->size()), ChunkPos(ChunkSize), 699 ScheduleStart(nullptr), ScheduleEnd(nullptr), 700 FirstLoadStoreInRegion(nullptr), LastLoadStoreInRegion(nullptr), 701 // Make sure that the initial SchedulingRegionID is greater than the 702 // initial SchedulingRegionID in ScheduleData (which is 0). 703 SchedulingRegionID(1) {} 704 705 void clear() { 706 ReadyInsts.clear(); 707 ScheduleStart = nullptr; 708 ScheduleEnd = nullptr; 709 FirstLoadStoreInRegion = nullptr; 710 LastLoadStoreInRegion = nullptr; 711 712 // Make a new scheduling region, i.e. all existing ScheduleData is not 713 // in the new region yet. 714 ++SchedulingRegionID; 715 } 716 717 ScheduleData *getScheduleData(Value *V) { 718 ScheduleData *SD = ScheduleDataMap[V]; 719 if (SD && SD->SchedulingRegionID == SchedulingRegionID) 720 return SD; 721 return nullptr; 722 } 723 724 bool isInSchedulingRegion(ScheduleData *SD) { 725 return SD->SchedulingRegionID == SchedulingRegionID; 726 } 727 728 /// Marks an instruction as scheduled and puts all dependent ready 729 /// instructions into the ready-list. 730 template <typename ReadyListType> 731 void schedule(ScheduleData *SD, ReadyListType &ReadyList) { 732 SD->IsScheduled = true; 733 DEBUG(dbgs() << "SLP: schedule " << *SD << "\n"); 734 735 ScheduleData *BundleMember = SD; 736 while (BundleMember) { 737 // Handle the def-use chain dependencies. 738 for (Use &U : BundleMember->Inst->operands()) { 739 ScheduleData *OpDef = getScheduleData(U.get()); 740 if (OpDef && OpDef->hasValidDependencies() && 741 OpDef->incrementUnscheduledDeps(-1) == 0) { 742 // There are no more unscheduled dependencies after decrementing, 743 // so we can put the dependent instruction into the ready list. 744 ScheduleData *DepBundle = OpDef->FirstInBundle; 745 assert(!DepBundle->IsScheduled && 746 "already scheduled bundle gets ready"); 747 ReadyList.insert(DepBundle); 748 DEBUG(dbgs() << "SLP: gets ready (def): " << *DepBundle << "\n"); 749 } 750 } 751 // Handle the memory dependencies. 
752 for (ScheduleData *MemoryDepSD : BundleMember->MemoryDependencies) { 753 if (MemoryDepSD->incrementUnscheduledDeps(-1) == 0) { 754 // There are no more unscheduled dependencies after decrementing, 755 // so we can put the dependent instruction into the ready list. 756 ScheduleData *DepBundle = MemoryDepSD->FirstInBundle; 757 assert(!DepBundle->IsScheduled && 758 "already scheduled bundle gets ready"); 759 ReadyList.insert(DepBundle); 760 DEBUG(dbgs() << "SLP: gets ready (mem): " << *DepBundle << "\n"); 761 } 762 } 763 BundleMember = BundleMember->NextInBundle; 764 } 765 } 766 767 /// Put all instructions into the ReadyList which are ready for scheduling. 768 template <typename ReadyListType> 769 void initialFillReadyList(ReadyListType &ReadyList) { 770 for (auto *I = ScheduleStart; I != ScheduleEnd; I = I->getNextNode()) { 771 ScheduleData *SD = getScheduleData(I); 772 if (SD->isSchedulingEntity() && SD->isReady()) { 773 ReadyList.insert(SD); 774 DEBUG(dbgs() << "SLP: initially in ready list: " << *I << "\n"); 775 } 776 } 777 } 778 779 /// Checks if a bundle of instructions can be scheduled, i.e. has no 780 /// cyclic dependencies. This is only a dry-run, no instructions are 781 /// actually moved at this stage. 782 bool tryScheduleBundle(ArrayRef<Value *> VL, BoUpSLP *SLP); 783 784 /// Un-bundles a group of instructions. 785 void cancelScheduling(ArrayRef<Value *> VL); 786 787 /// Extends the scheduling region so that V is inside the region. 788 void extendSchedulingRegion(Value *V); 789 790 /// Initialize the ScheduleData structures for new instructions in the 791 /// scheduling region. 792 void initScheduleData(Instruction *FromI, Instruction *ToI, 793 ScheduleData *PrevLoadStore, 794 ScheduleData *NextLoadStore); 795 796 /// Updates the dependency information of a bundle and of all instructions/ 797 /// bundles which depend on the original bundle. 798 void calculateDependencies(ScheduleData *SD, bool InsertInReadyList, 799 BoUpSLP *SLP); 800 801 /// Sets all instruction in the scheduling region to un-scheduled. 802 void resetSchedule(); 803 804 BasicBlock *BB; 805 806 /// Simple memory allocation for ScheduleData. 807 std::vector<std::unique_ptr<ScheduleData[]>> ScheduleDataChunks; 808 809 /// The size of a ScheduleData array in ScheduleDataChunks. 810 int ChunkSize; 811 812 /// The allocator position in the current chunk, which is the last entry 813 /// of ScheduleDataChunks. 814 int ChunkPos; 815 816 /// Attaches ScheduleData to Instruction. 817 /// Note that the mapping survives during all vectorization iterations, i.e. 818 /// ScheduleData structures are recycled. 819 DenseMap<Value *, ScheduleData *> ScheduleDataMap; 820 821 struct ReadyList : SmallVector<ScheduleData *, 8> { 822 void insert(ScheduleData *SD) { push_back(SD); } 823 }; 824 825 /// The ready-list for scheduling (only used for the dry-run). 826 ReadyList ReadyInsts; 827 828 /// The first instruction of the scheduling region. 829 Instruction *ScheduleStart; 830 831 /// The first instruction _after_ the scheduling region. 832 Instruction *ScheduleEnd; 833 834 /// The first memory accessing instruction in the scheduling region 835 /// (can be null). 836 ScheduleData *FirstLoadStoreInRegion; 837 838 /// The last memory accessing instruction in the scheduling region 839 /// (can be null). 840 ScheduleData *LastLoadStoreInRegion; 841 842 /// The ID of the scheduling region. For a new vectorization iteration this 843 /// is incremented which "removes" all ScheduleData from the region. 
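  /// Sketch of the invariant (assumption): ScheduleData whose
  /// SchedulingRegionID differs from this value is treated as not existing
  /// (see getScheduleData() above), so stale entries never need to be erased
  /// eagerly.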
844 int SchedulingRegionID; 845 }; 846 847 /// Attaches the BlockScheduling structures to basic blocks. 848 MapVector<BasicBlock *, std::unique_ptr<BlockScheduling>> BlocksSchedules; 849 850 /// Performs the "real" scheduling. Done before vectorization is actually 851 /// performed in a basic block. 852 void scheduleBlock(BlockScheduling *BS); 853 854 /// List of users to ignore during scheduling and that don't need extracting. 855 ArrayRef<Value *> UserIgnoreList; 856 857 // Number of load-bundles, which contain consecutive loads. 858 int NumLoadsWantToKeepOrder; 859 860 // Number of load-bundles of size 2, which are consecutive loads if reversed. 861 int NumLoadsWantToChangeOrder; 862 863 // Analysis and block reference. 864 Function *F; 865 ScalarEvolution *SE; 866 const DataLayout *DL; 867 TargetTransformInfo *TTI; 868 TargetLibraryInfo *TLI; 869 AliasAnalysis *AA; 870 LoopInfo *LI; 871 DominatorTree *DT; 872 /// Instruction builder to construct the vectorized tree. 873 IRBuilder<> Builder; 874 }; 875 876 #ifndef NDEBUG 877 raw_ostream &operator<<(raw_ostream &os, const BoUpSLP::ScheduleData &SD) { 878 SD.dump(os); 879 return os; 880 } 881 #endif 882 883 void BoUpSLP::buildTree(ArrayRef<Value *> Roots, 884 ArrayRef<Value *> UserIgnoreLst) { 885 deleteTree(); 886 UserIgnoreList = UserIgnoreLst; 887 if (!getSameType(Roots)) 888 return; 889 buildTree_rec(Roots, 0); 890 891 // Collect the values that we need to extract from the tree. 892 for (int EIdx = 0, EE = VectorizableTree.size(); EIdx < EE; ++EIdx) { 893 TreeEntry *Entry = &VectorizableTree[EIdx]; 894 895 // For each lane: 896 for (int Lane = 0, LE = Entry->Scalars.size(); Lane != LE; ++Lane) { 897 Value *Scalar = Entry->Scalars[Lane]; 898 899 // No need to handle users of gathered values. 900 if (Entry->NeedToGather) 901 continue; 902 903 for (User *U : Scalar->users()) { 904 DEBUG(dbgs() << "SLP: Checking user:" << *U << ".\n"); 905 906 Instruction *UserInst = dyn_cast<Instruction>(U); 907 if (!UserInst) 908 continue; 909 910 // Skip in-tree scalars that become vectors 911 if (ScalarToTreeEntry.count(U)) { 912 int Idx = ScalarToTreeEntry[U]; 913 TreeEntry *UseEntry = &VectorizableTree[Idx]; 914 Value *UseScalar = UseEntry->Scalars[0]; 915 // Some in-tree scalars will remain as scalar in vectorized 916 // instructions. If that is the case, the one in Lane 0 will 917 // be used. 918 if (UseScalar != U || 919 !InTreeUserNeedToExtract(Scalar, UserInst, TLI)) { 920 DEBUG(dbgs() << "SLP: \tInternal user will be removed:" << *U 921 << ".\n"); 922 assert(!VectorizableTree[Idx].NeedToGather && "Bad state"); 923 continue; 924 } 925 } 926 927 // Ignore users in the user ignore list. 928 if (std::find(UserIgnoreList.begin(), UserIgnoreList.end(), UserInst) != 929 UserIgnoreList.end()) 930 continue; 931 932 DEBUG(dbgs() << "SLP: Need to extract:" << *U << " from lane " << 933 Lane << " from " << *Scalar << ".\n"); 934 ExternalUses.push_back(ExternalUser(Scalar, U, Lane)); 935 } 936 } 937 } 938 } 939 940 941 void BoUpSLP::buildTree_rec(ArrayRef<Value *> VL, unsigned Depth) { 942 bool SameTy = getSameType(VL); (void)SameTy; 943 bool isAltShuffle = false; 944 assert(SameTy && "Invalid types!"); 945 946 if (Depth == RecursionMaxDepth) { 947 DEBUG(dbgs() << "SLP: Gathering due to max recursion depth.\n"); 948 newTreeEntry(VL, false); 949 return; 950 } 951 952 // Don't handle vectors. 
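  // Illustration (not from the original source): scalars that are already
  // vector typed, e.g. a bundle of <4 x float> values, are not re-vectorized
  // here; they are simply gathered.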
953 if (VL[0]->getType()->isVectorTy()) { 954 DEBUG(dbgs() << "SLP: Gathering due to vector type.\n"); 955 newTreeEntry(VL, false); 956 return; 957 } 958 959 if (StoreInst *SI = dyn_cast<StoreInst>(VL[0])) 960 if (SI->getValueOperand()->getType()->isVectorTy()) { 961 DEBUG(dbgs() << "SLP: Gathering due to store vector type.\n"); 962 newTreeEntry(VL, false); 963 return; 964 } 965 unsigned Opcode = getSameOpcode(VL); 966 967 // Check that this shuffle vector refers to the alternate 968 // sequence of opcodes. 969 if (Opcode == Instruction::ShuffleVector) { 970 Instruction *I0 = dyn_cast<Instruction>(VL[0]); 971 unsigned Op = I0->getOpcode(); 972 if (Op != Instruction::ShuffleVector) 973 isAltShuffle = true; 974 } 975 976 // If all of the operands are identical or constant we have a simple solution. 977 if (allConstant(VL) || isSplat(VL) || !getSameBlock(VL) || !Opcode) { 978 DEBUG(dbgs() << "SLP: Gathering due to C,S,B,O. \n"); 979 newTreeEntry(VL, false); 980 return; 981 } 982 983 // We now know that this is a vector of instructions of the same type from 984 // the same block. 985 986 // Don't vectorize ephemeral values. 987 for (unsigned i = 0, e = VL.size(); i != e; ++i) { 988 if (EphValues.count(VL[i])) { 989 DEBUG(dbgs() << "SLP: The instruction (" << *VL[i] << 990 ") is ephemeral.\n"); 991 newTreeEntry(VL, false); 992 return; 993 } 994 } 995 996 // Check if this is a duplicate of another entry. 997 if (ScalarToTreeEntry.count(VL[0])) { 998 int Idx = ScalarToTreeEntry[VL[0]]; 999 TreeEntry *E = &VectorizableTree[Idx]; 1000 for (unsigned i = 0, e = VL.size(); i != e; ++i) { 1001 DEBUG(dbgs() << "SLP: \tChecking bundle: " << *VL[i] << ".\n"); 1002 if (E->Scalars[i] != VL[i]) { 1003 DEBUG(dbgs() << "SLP: Gathering due to partial overlap.\n"); 1004 newTreeEntry(VL, false); 1005 return; 1006 } 1007 } 1008 DEBUG(dbgs() << "SLP: Perfect diamond merge at " << *VL[0] << ".\n"); 1009 return; 1010 } 1011 1012 // Check that none of the instructions in the bundle are already in the tree. 1013 for (unsigned i = 0, e = VL.size(); i != e; ++i) { 1014 if (ScalarToTreeEntry.count(VL[i])) { 1015 DEBUG(dbgs() << "SLP: The instruction (" << *VL[i] << 1016 ") is already in tree.\n"); 1017 newTreeEntry(VL, false); 1018 return; 1019 } 1020 } 1021 1022 // If any of the scalars is marked as a value that needs to stay scalar then 1023 // we need to gather the scalars. 1024 for (unsigned i = 0, e = VL.size(); i != e; ++i) { 1025 if (MustGather.count(VL[i])) { 1026 DEBUG(dbgs() << "SLP: Gathering due to gathered scalar.\n"); 1027 newTreeEntry(VL, false); 1028 return; 1029 } 1030 } 1031 1032 // Check that all of the users of the scalars that we want to vectorize are 1033 // schedulable. 1034 Instruction *VL0 = cast<Instruction>(VL[0]); 1035 BasicBlock *BB = cast<Instruction>(VL0)->getParent(); 1036 1037 if (!DT->isReachableFromEntry(BB)) { 1038 // Don't go into unreachable blocks. They may contain instructions with 1039 // dependency cycles which confuse the final scheduling. 1040 DEBUG(dbgs() << "SLP: bundle in unreachable block.\n"); 1041 newTreeEntry(VL, false); 1042 return; 1043 } 1044 1045 // Check that every instructions appears once in this bundle. 
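  // Illustration (assumption): a bundle such as {%x, %y, %x, %z} would map the
  // same scalar to two lanes, which the ScalarToTreeEntry bookkeeping cannot
  // represent, so such bundles are gathered instead.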
  for (unsigned i = 0, e = VL.size(); i < e; ++i)
    for (unsigned j = i+1; j < e; ++j)
      if (VL[i] == VL[j]) {
        DEBUG(dbgs() << "SLP: Scalar used twice in bundle.\n");
        newTreeEntry(VL, false);
        return;
      }

  auto &BSRef = BlocksSchedules[BB];
  if (!BSRef) {
    BSRef = llvm::make_unique<BlockScheduling>(BB);
  }
  BlockScheduling &BS = *BSRef.get();

  if (!BS.tryScheduleBundle(VL, this)) {
    DEBUG(dbgs() << "SLP: We are not able to schedule this bundle!\n");
    BS.cancelScheduling(VL);
    newTreeEntry(VL, false);
    return;
  }
  DEBUG(dbgs() << "SLP: We are able to schedule this bundle.\n");

  switch (Opcode) {
    case Instruction::PHI: {
      PHINode *PH = dyn_cast<PHINode>(VL0);

      // Check for terminator values (e.g. invoke).
      for (unsigned j = 0; j < VL.size(); ++j)
        for (unsigned i = 0, e = PH->getNumIncomingValues(); i < e; ++i) {
          TerminatorInst *Term = dyn_cast<TerminatorInst>(
              cast<PHINode>(VL[j])->getIncomingValueForBlock(PH->getIncomingBlock(i)));
          if (Term) {
            DEBUG(dbgs() << "SLP: Need to swizzle PHINodes (TerminatorInst use).\n");
            BS.cancelScheduling(VL);
            newTreeEntry(VL, false);
            return;
          }
        }

      newTreeEntry(VL, true);
      DEBUG(dbgs() << "SLP: added a vector of PHINodes.\n");

      for (unsigned i = 0, e = PH->getNumIncomingValues(); i < e; ++i) {
        ValueList Operands;
        // Prepare the operand vector.
        for (unsigned j = 0; j < VL.size(); ++j)
          Operands.push_back(cast<PHINode>(VL[j])->getIncomingValueForBlock(
              PH->getIncomingBlock(i)));

        buildTree_rec(Operands, Depth + 1);
      }
      return;
    }
    case Instruction::ExtractElement: {
      bool Reuse = CanReuseExtract(VL);
      if (Reuse) {
        DEBUG(dbgs() << "SLP: Reusing extract sequence.\n");
      } else {
        BS.cancelScheduling(VL);
      }
      newTreeEntry(VL, Reuse);
      return;
    }
    case Instruction::Load: {
      // Check if the loads are consecutive or if we need to swizzle them.
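      // Illustrative example (not in the original): loads of a[0],a[1],a[2],a[3]
      // form a consecutive bundle; a 2-wide bundle of a[1],a[0] is not
      // consecutive as-is, but bumps NumLoadsWantToChangeOrder because the
      // reversed order would be.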
1111 for (unsigned i = 0, e = VL.size() - 1; i < e; ++i) { 1112 LoadInst *L = cast<LoadInst>(VL[i]); 1113 if (!L->isSimple()) { 1114 BS.cancelScheduling(VL); 1115 newTreeEntry(VL, false); 1116 DEBUG(dbgs() << "SLP: Gathering non-simple loads.\n"); 1117 return; 1118 } 1119 if (!isConsecutiveAccess(VL[i], VL[i + 1])) { 1120 if (VL.size() == 2 && isConsecutiveAccess(VL[1], VL[0])) { 1121 ++NumLoadsWantToChangeOrder; 1122 } 1123 BS.cancelScheduling(VL); 1124 newTreeEntry(VL, false); 1125 DEBUG(dbgs() << "SLP: Gathering non-consecutive loads.\n"); 1126 return; 1127 } 1128 } 1129 ++NumLoadsWantToKeepOrder; 1130 newTreeEntry(VL, true); 1131 DEBUG(dbgs() << "SLP: added a vector of loads.\n"); 1132 return; 1133 } 1134 case Instruction::ZExt: 1135 case Instruction::SExt: 1136 case Instruction::FPToUI: 1137 case Instruction::FPToSI: 1138 case Instruction::FPExt: 1139 case Instruction::PtrToInt: 1140 case Instruction::IntToPtr: 1141 case Instruction::SIToFP: 1142 case Instruction::UIToFP: 1143 case Instruction::Trunc: 1144 case Instruction::FPTrunc: 1145 case Instruction::BitCast: { 1146 Type *SrcTy = VL0->getOperand(0)->getType(); 1147 for (unsigned i = 0; i < VL.size(); ++i) { 1148 Type *Ty = cast<Instruction>(VL[i])->getOperand(0)->getType(); 1149 if (Ty != SrcTy || Ty->isAggregateType() || Ty->isVectorTy()) { 1150 BS.cancelScheduling(VL); 1151 newTreeEntry(VL, false); 1152 DEBUG(dbgs() << "SLP: Gathering casts with different src types.\n"); 1153 return; 1154 } 1155 } 1156 newTreeEntry(VL, true); 1157 DEBUG(dbgs() << "SLP: added a vector of casts.\n"); 1158 1159 for (unsigned i = 0, e = VL0->getNumOperands(); i < e; ++i) { 1160 ValueList Operands; 1161 // Prepare the operand vector. 1162 for (unsigned j = 0; j < VL.size(); ++j) 1163 Operands.push_back(cast<Instruction>(VL[j])->getOperand(i)); 1164 1165 buildTree_rec(Operands, Depth+1); 1166 } 1167 return; 1168 } 1169 case Instruction::ICmp: 1170 case Instruction::FCmp: { 1171 // Check that all of the compares have the same predicate. 1172 CmpInst::Predicate P0 = dyn_cast<CmpInst>(VL0)->getPredicate(); 1173 Type *ComparedTy = cast<Instruction>(VL[0])->getOperand(0)->getType(); 1174 for (unsigned i = 1, e = VL.size(); i < e; ++i) { 1175 CmpInst *Cmp = cast<CmpInst>(VL[i]); 1176 if (Cmp->getPredicate() != P0 || 1177 Cmp->getOperand(0)->getType() != ComparedTy) { 1178 BS.cancelScheduling(VL); 1179 newTreeEntry(VL, false); 1180 DEBUG(dbgs() << "SLP: Gathering cmp with different predicate.\n"); 1181 return; 1182 } 1183 } 1184 1185 newTreeEntry(VL, true); 1186 DEBUG(dbgs() << "SLP: added a vector of compares.\n"); 1187 1188 for (unsigned i = 0, e = VL0->getNumOperands(); i < e; ++i) { 1189 ValueList Operands; 1190 // Prepare the operand vector. 
1191 for (unsigned j = 0; j < VL.size(); ++j) 1192 Operands.push_back(cast<Instruction>(VL[j])->getOperand(i)); 1193 1194 buildTree_rec(Operands, Depth+1); 1195 } 1196 return; 1197 } 1198 case Instruction::Select: 1199 case Instruction::Add: 1200 case Instruction::FAdd: 1201 case Instruction::Sub: 1202 case Instruction::FSub: 1203 case Instruction::Mul: 1204 case Instruction::FMul: 1205 case Instruction::UDiv: 1206 case Instruction::SDiv: 1207 case Instruction::FDiv: 1208 case Instruction::URem: 1209 case Instruction::SRem: 1210 case Instruction::FRem: 1211 case Instruction::Shl: 1212 case Instruction::LShr: 1213 case Instruction::AShr: 1214 case Instruction::And: 1215 case Instruction::Or: 1216 case Instruction::Xor: { 1217 newTreeEntry(VL, true); 1218 DEBUG(dbgs() << "SLP: added a vector of bin op.\n"); 1219 1220 // Sort operands of the instructions so that each side is more likely to 1221 // have the same opcode. 1222 if (isa<BinaryOperator>(VL0) && VL0->isCommutative()) { 1223 ValueList Left, Right; 1224 reorderInputsAccordingToOpcode(VL, Left, Right); 1225 buildTree_rec(Left, Depth + 1); 1226 buildTree_rec(Right, Depth + 1); 1227 return; 1228 } 1229 1230 for (unsigned i = 0, e = VL0->getNumOperands(); i < e; ++i) { 1231 ValueList Operands; 1232 // Prepare the operand vector. 1233 for (unsigned j = 0; j < VL.size(); ++j) 1234 Operands.push_back(cast<Instruction>(VL[j])->getOperand(i)); 1235 1236 buildTree_rec(Operands, Depth+1); 1237 } 1238 return; 1239 } 1240 case Instruction::GetElementPtr: { 1241 // We don't combine GEPs with complicated (nested) indexing. 1242 for (unsigned j = 0; j < VL.size(); ++j) { 1243 if (cast<Instruction>(VL[j])->getNumOperands() != 2) { 1244 DEBUG(dbgs() << "SLP: not-vectorizable GEP (nested indexes).\n"); 1245 BS.cancelScheduling(VL); 1246 newTreeEntry(VL, false); 1247 return; 1248 } 1249 } 1250 1251 // We can't combine several GEPs into one vector if they operate on 1252 // different types. 1253 Type *Ty0 = cast<Instruction>(VL0)->getOperand(0)->getType(); 1254 for (unsigned j = 0; j < VL.size(); ++j) { 1255 Type *CurTy = cast<Instruction>(VL[j])->getOperand(0)->getType(); 1256 if (Ty0 != CurTy) { 1257 DEBUG(dbgs() << "SLP: not-vectorizable GEP (different types).\n"); 1258 BS.cancelScheduling(VL); 1259 newTreeEntry(VL, false); 1260 return; 1261 } 1262 } 1263 1264 // We don't combine GEPs with non-constant indexes. 1265 for (unsigned j = 0; j < VL.size(); ++j) { 1266 auto Op = cast<Instruction>(VL[j])->getOperand(1); 1267 if (!isa<ConstantInt>(Op)) { 1268 DEBUG( 1269 dbgs() << "SLP: not-vectorizable GEP (non-constant indexes).\n"); 1270 BS.cancelScheduling(VL); 1271 newTreeEntry(VL, false); 1272 return; 1273 } 1274 } 1275 1276 newTreeEntry(VL, true); 1277 DEBUG(dbgs() << "SLP: added a vector of GEPs.\n"); 1278 for (unsigned i = 0, e = 2; i < e; ++i) { 1279 ValueList Operands; 1280 // Prepare the operand vector. 1281 for (unsigned j = 0; j < VL.size(); ++j) 1282 Operands.push_back(cast<Instruction>(VL[j])->getOperand(i)); 1283 1284 buildTree_rec(Operands, Depth + 1); 1285 } 1286 return; 1287 } 1288 case Instruction::Store: { 1289 // Check if the stores are consecutive or of we need to swizzle them. 
1290 for (unsigned i = 0, e = VL.size() - 1; i < e; ++i) 1291 if (!isConsecutiveAccess(VL[i], VL[i + 1])) { 1292 BS.cancelScheduling(VL); 1293 newTreeEntry(VL, false); 1294 DEBUG(dbgs() << "SLP: Non-consecutive store.\n"); 1295 return; 1296 } 1297 1298 newTreeEntry(VL, true); 1299 DEBUG(dbgs() << "SLP: added a vector of stores.\n"); 1300 1301 ValueList Operands; 1302 for (unsigned j = 0; j < VL.size(); ++j) 1303 Operands.push_back(cast<Instruction>(VL[j])->getOperand(0)); 1304 1305 buildTree_rec(Operands, Depth + 1); 1306 return; 1307 } 1308 case Instruction::Call: { 1309 // Check if the calls are all to the same vectorizable intrinsic. 1310 CallInst *CI = cast<CallInst>(VL[0]); 1311 // Check if this is an Intrinsic call or something that can be 1312 // represented by an intrinsic call 1313 Intrinsic::ID ID = getIntrinsicIDForCall(CI, TLI); 1314 if (!isTriviallyVectorizable(ID)) { 1315 BS.cancelScheduling(VL); 1316 newTreeEntry(VL, false); 1317 DEBUG(dbgs() << "SLP: Non-vectorizable call.\n"); 1318 return; 1319 } 1320 Function *Int = CI->getCalledFunction(); 1321 Value *A1I = nullptr; 1322 if (hasVectorInstrinsicScalarOpd(ID, 1)) 1323 A1I = CI->getArgOperand(1); 1324 for (unsigned i = 1, e = VL.size(); i != e; ++i) { 1325 CallInst *CI2 = dyn_cast<CallInst>(VL[i]); 1326 if (!CI2 || CI2->getCalledFunction() != Int || 1327 getIntrinsicIDForCall(CI2, TLI) != ID) { 1328 BS.cancelScheduling(VL); 1329 newTreeEntry(VL, false); 1330 DEBUG(dbgs() << "SLP: mismatched calls:" << *CI << "!=" << *VL[i] 1331 << "\n"); 1332 return; 1333 } 1334 // ctlz,cttz and powi are special intrinsics whose second argument 1335 // should be same in order for them to be vectorized. 1336 if (hasVectorInstrinsicScalarOpd(ID, 1)) { 1337 Value *A1J = CI2->getArgOperand(1); 1338 if (A1I != A1J) { 1339 BS.cancelScheduling(VL); 1340 newTreeEntry(VL, false); 1341 DEBUG(dbgs() << "SLP: mismatched arguments in call:" << *CI 1342 << " argument "<< A1I<<"!=" << A1J 1343 << "\n"); 1344 return; 1345 } 1346 } 1347 } 1348 1349 newTreeEntry(VL, true); 1350 for (unsigned i = 0, e = CI->getNumArgOperands(); i != e; ++i) { 1351 ValueList Operands; 1352 // Prepare the operand vector. 1353 for (unsigned j = 0; j < VL.size(); ++j) { 1354 CallInst *CI2 = dyn_cast<CallInst>(VL[j]); 1355 Operands.push_back(CI2->getArgOperand(i)); 1356 } 1357 buildTree_rec(Operands, Depth + 1); 1358 } 1359 return; 1360 } 1361 case Instruction::ShuffleVector: { 1362 // If this is not an alternate sequence of opcode like add-sub 1363 // then do not vectorize this instruction. 1364 if (!isAltShuffle) { 1365 BS.cancelScheduling(VL); 1366 newTreeEntry(VL, false); 1367 DEBUG(dbgs() << "SLP: ShuffleVector are not vectorized.\n"); 1368 return; 1369 } 1370 newTreeEntry(VL, true); 1371 DEBUG(dbgs() << "SLP: added a ShuffleVector op.\n"); 1372 1373 // Reorder operands if reordering would enable vectorization. 1374 if (isa<BinaryOperator>(VL0)) { 1375 ValueList Left, Right; 1376 reorderAltShuffleOperands(VL, Left, Right); 1377 buildTree_rec(Left, Depth + 1); 1378 buildTree_rec(Right, Depth + 1); 1379 return; 1380 } 1381 1382 for (unsigned i = 0, e = VL0->getNumOperands(); i < e; ++i) { 1383 ValueList Operands; 1384 // Prepare the operand vector. 
1385 for (unsigned j = 0; j < VL.size(); ++j) 1386 Operands.push_back(cast<Instruction>(VL[j])->getOperand(i)); 1387 1388 buildTree_rec(Operands, Depth + 1); 1389 } 1390 return; 1391 } 1392 default: 1393 BS.cancelScheduling(VL); 1394 newTreeEntry(VL, false); 1395 DEBUG(dbgs() << "SLP: Gathering unknown instruction.\n"); 1396 return; 1397 } 1398 } 1399 1400 int BoUpSLP::getEntryCost(TreeEntry *E) { 1401 ArrayRef<Value*> VL = E->Scalars; 1402 1403 Type *ScalarTy = VL[0]->getType(); 1404 if (StoreInst *SI = dyn_cast<StoreInst>(VL[0])) 1405 ScalarTy = SI->getValueOperand()->getType(); 1406 VectorType *VecTy = VectorType::get(ScalarTy, VL.size()); 1407 1408 if (E->NeedToGather) { 1409 if (allConstant(VL)) 1410 return 0; 1411 if (isSplat(VL)) { 1412 return TTI->getShuffleCost(TargetTransformInfo::SK_Broadcast, VecTy, 0); 1413 } 1414 return getGatherCost(E->Scalars); 1415 } 1416 unsigned Opcode = getSameOpcode(VL); 1417 assert(Opcode && getSameType(VL) && getSameBlock(VL) && "Invalid VL"); 1418 Instruction *VL0 = cast<Instruction>(VL[0]); 1419 switch (Opcode) { 1420 case Instruction::PHI: { 1421 return 0; 1422 } 1423 case Instruction::ExtractElement: { 1424 if (CanReuseExtract(VL)) { 1425 int DeadCost = 0; 1426 for (unsigned i = 0, e = VL.size(); i < e; ++i) { 1427 ExtractElementInst *E = cast<ExtractElementInst>(VL[i]); 1428 if (E->hasOneUse()) 1429 // Take credit for instruction that will become dead. 1430 DeadCost += 1431 TTI->getVectorInstrCost(Instruction::ExtractElement, VecTy, i); 1432 } 1433 return -DeadCost; 1434 } 1435 return getGatherCost(VecTy); 1436 } 1437 case Instruction::ZExt: 1438 case Instruction::SExt: 1439 case Instruction::FPToUI: 1440 case Instruction::FPToSI: 1441 case Instruction::FPExt: 1442 case Instruction::PtrToInt: 1443 case Instruction::IntToPtr: 1444 case Instruction::SIToFP: 1445 case Instruction::UIToFP: 1446 case Instruction::Trunc: 1447 case Instruction::FPTrunc: 1448 case Instruction::BitCast: { 1449 Type *SrcTy = VL0->getOperand(0)->getType(); 1450 1451 // Calculate the cost of this instruction. 1452 int ScalarCost = VL.size() * TTI->getCastInstrCost(VL0->getOpcode(), 1453 VL0->getType(), SrcTy); 1454 1455 VectorType *SrcVecTy = VectorType::get(SrcTy, VL.size()); 1456 int VecCost = TTI->getCastInstrCost(VL0->getOpcode(), VecTy, SrcVecTy); 1457 return VecCost - ScalarCost; 1458 } 1459 case Instruction::FCmp: 1460 case Instruction::ICmp: 1461 case Instruction::Select: 1462 case Instruction::Add: 1463 case Instruction::FAdd: 1464 case Instruction::Sub: 1465 case Instruction::FSub: 1466 case Instruction::Mul: 1467 case Instruction::FMul: 1468 case Instruction::UDiv: 1469 case Instruction::SDiv: 1470 case Instruction::FDiv: 1471 case Instruction::URem: 1472 case Instruction::SRem: 1473 case Instruction::FRem: 1474 case Instruction::Shl: 1475 case Instruction::LShr: 1476 case Instruction::AShr: 1477 case Instruction::And: 1478 case Instruction::Or: 1479 case Instruction::Xor: { 1480 // Calculate the cost of this instruction. 1481 int ScalarCost = 0; 1482 int VecCost = 0; 1483 if (Opcode == Instruction::FCmp || Opcode == Instruction::ICmp || 1484 Opcode == Instruction::Select) { 1485 VectorType *MaskTy = VectorType::get(Builder.getInt1Ty(), VL.size()); 1486 ScalarCost = VecTy->getNumElements() * 1487 TTI->getCmpSelInstrCost(Opcode, ScalarTy, Builder.getInt1Ty()); 1488 VecCost = TTI->getCmpSelInstrCost(Opcode, VecTy, MaskTy); 1489 } else { 1490 // Certain instructions can be cheaper to vectorize if they have a 1491 // constant second vector operand. 
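      // Hypothetical example: four scalar 'shl i32 %x, 3' instructions can
      // become a single vector shift by a uniform constant, which many cost
      // models rate cheaper than a shift by a variable amount.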
1492 TargetTransformInfo::OperandValueKind Op1VK = 1493 TargetTransformInfo::OK_AnyValue; 1494 TargetTransformInfo::OperandValueKind Op2VK = 1495 TargetTransformInfo::OK_UniformConstantValue; 1496 TargetTransformInfo::OperandValueProperties Op1VP = 1497 TargetTransformInfo::OP_None; 1498 TargetTransformInfo::OperandValueProperties Op2VP = 1499 TargetTransformInfo::OP_None; 1500 1501 // If all operands are exactly the same ConstantInt then set the 1502 // operand kind to OK_UniformConstantValue. 1503 // If instead not all operands are constants, then set the operand kind 1504 // to OK_AnyValue. If all operands are constants but not the same, 1505 // then set the operand kind to OK_NonUniformConstantValue. 1506 ConstantInt *CInt = nullptr; 1507 for (unsigned i = 0; i < VL.size(); ++i) { 1508 const Instruction *I = cast<Instruction>(VL[i]); 1509 if (!isa<ConstantInt>(I->getOperand(1))) { 1510 Op2VK = TargetTransformInfo::OK_AnyValue; 1511 break; 1512 } 1513 if (i == 0) { 1514 CInt = cast<ConstantInt>(I->getOperand(1)); 1515 continue; 1516 } 1517 if (Op2VK == TargetTransformInfo::OK_UniformConstantValue && 1518 CInt != cast<ConstantInt>(I->getOperand(1))) 1519 Op2VK = TargetTransformInfo::OK_NonUniformConstantValue; 1520 } 1521 // FIXME: Currently cost of model modification for division by 1522 // power of 2 is handled only for X86. Add support for other targets. 1523 if (Op2VK == TargetTransformInfo::OK_UniformConstantValue && CInt && 1524 CInt->getValue().isPowerOf2()) 1525 Op2VP = TargetTransformInfo::OP_PowerOf2; 1526 1527 ScalarCost = VecTy->getNumElements() * 1528 TTI->getArithmeticInstrCost(Opcode, ScalarTy, Op1VK, Op2VK, 1529 Op1VP, Op2VP); 1530 VecCost = TTI->getArithmeticInstrCost(Opcode, VecTy, Op1VK, Op2VK, 1531 Op1VP, Op2VP); 1532 } 1533 return VecCost - ScalarCost; 1534 } 1535 case Instruction::GetElementPtr: { 1536 TargetTransformInfo::OperandValueKind Op1VK = 1537 TargetTransformInfo::OK_AnyValue; 1538 TargetTransformInfo::OperandValueKind Op2VK = 1539 TargetTransformInfo::OK_UniformConstantValue; 1540 1541 int ScalarCost = 1542 VecTy->getNumElements() * 1543 TTI->getArithmeticInstrCost(Instruction::Add, ScalarTy, Op1VK, Op2VK); 1544 int VecCost = 1545 TTI->getArithmeticInstrCost(Instruction::Add, VecTy, Op1VK, Op2VK); 1546 1547 return VecCost - ScalarCost; 1548 } 1549 case Instruction::Load: { 1550 // Cost of wide load - cost of scalar loads. 1551 int ScalarLdCost = VecTy->getNumElements() * 1552 TTI->getMemoryOpCost(Instruction::Load, ScalarTy, 1, 0); 1553 int VecLdCost = TTI->getMemoryOpCost(Instruction::Load, VecTy, 1, 0); 1554 return VecLdCost - ScalarLdCost; 1555 } 1556 case Instruction::Store: { 1557 // We know that we can merge the stores. Calculate the cost. 1558 int ScalarStCost = VecTy->getNumElements() * 1559 TTI->getMemoryOpCost(Instruction::Store, ScalarTy, 1, 0); 1560 int VecStCost = TTI->getMemoryOpCost(Instruction::Store, VecTy, 1, 0); 1561 return VecStCost - ScalarStCost; 1562 } 1563 case Instruction::Call: { 1564 CallInst *CI = cast<CallInst>(VL0); 1565 Intrinsic::ID ID = getIntrinsicIDForCall(CI, TLI); 1566 1567 // Calculate the cost of the scalar and vector calls. 
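    // Illustration (assumption): for four calls to @llvm.fabs.f32 the scalar
    // cost is four times the f32 fabs cost, while the vector cost is that of
    // a single @llvm.fabs.v4f32 call.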
1568 SmallVector<Type*, 4> ScalarTys, VecTys; 1569 for (unsigned op = 0, opc = CI->getNumArgOperands(); op!= opc; ++op) { 1570 ScalarTys.push_back(CI->getArgOperand(op)->getType()); 1571 VecTys.push_back(VectorType::get(CI->getArgOperand(op)->getType(), 1572 VecTy->getNumElements())); 1573 } 1574 1575 int ScalarCallCost = VecTy->getNumElements() * 1576 TTI->getIntrinsicInstrCost(ID, ScalarTy, ScalarTys); 1577 1578 int VecCallCost = TTI->getIntrinsicInstrCost(ID, VecTy, VecTys); 1579 1580 DEBUG(dbgs() << "SLP: Call cost "<< VecCallCost - ScalarCallCost 1581 << " (" << VecCallCost << "-" << ScalarCallCost << ")" 1582 << " for " << *CI << "\n"); 1583 1584 return VecCallCost - ScalarCallCost; 1585 } 1586 case Instruction::ShuffleVector: { 1587 TargetTransformInfo::OperandValueKind Op1VK = 1588 TargetTransformInfo::OK_AnyValue; 1589 TargetTransformInfo::OperandValueKind Op2VK = 1590 TargetTransformInfo::OK_AnyValue; 1591 int ScalarCost = 0; 1592 int VecCost = 0; 1593 for (unsigned i = 0; i < VL.size(); ++i) { 1594 Instruction *I = cast<Instruction>(VL[i]); 1595 if (!I) 1596 break; 1597 ScalarCost += 1598 TTI->getArithmeticInstrCost(I->getOpcode(), ScalarTy, Op1VK, Op2VK); 1599 } 1600 // VecCost is equal to sum of the cost of creating 2 vectors 1601 // and the cost of creating shuffle. 1602 Instruction *I0 = cast<Instruction>(VL[0]); 1603 VecCost = 1604 TTI->getArithmeticInstrCost(I0->getOpcode(), VecTy, Op1VK, Op2VK); 1605 Instruction *I1 = cast<Instruction>(VL[1]); 1606 VecCost += 1607 TTI->getArithmeticInstrCost(I1->getOpcode(), VecTy, Op1VK, Op2VK); 1608 VecCost += 1609 TTI->getShuffleCost(TargetTransformInfo::SK_Alternate, VecTy, 0); 1610 return VecCost - ScalarCost; 1611 } 1612 default: 1613 llvm_unreachable("Unknown instruction"); 1614 } 1615 } 1616 1617 bool BoUpSLP::isFullyVectorizableTinyTree() { 1618 DEBUG(dbgs() << "SLP: Check whether the tree with height " << 1619 VectorizableTree.size() << " is fully vectorizable .\n"); 1620 1621 // We only handle trees of height 2. 1622 if (VectorizableTree.size() != 2) 1623 return false; 1624 1625 // Handle splat stores. 1626 if (!VectorizableTree[0].NeedToGather && isSplat(VectorizableTree[1].Scalars)) 1627 return true; 1628 1629 // Gathering cost would be too much for tiny trees. 1630 if (VectorizableTree[0].NeedToGather || VectorizableTree[1].NeedToGather) 1631 return false; 1632 1633 return true; 1634 } 1635 1636 int BoUpSLP::getSpillCost() { 1637 // Walk from the bottom of the tree to the top, tracking which values are 1638 // live. When we see a call instruction that is not part of our tree, 1639 // query TTI to see if there is a cost to keeping values live over it 1640 // (for example, if spills and fills are required). 1641 unsigned BundleWidth = VectorizableTree.front().Scalars.size(); 1642 int Cost = 0; 1643 1644 SmallPtrSet<Instruction*, 4> LiveValues; 1645 Instruction *PrevInst = nullptr; 1646 1647 for (unsigned N = 0; N < VectorizableTree.size(); ++N) { 1648 Instruction *Inst = dyn_cast<Instruction>(VectorizableTree[N].Scalars[0]); 1649 if (!Inst) 1650 continue; 1651 1652 if (!PrevInst) { 1653 PrevInst = Inst; 1654 continue; 1655 } 1656 1657 DEBUG( 1658 dbgs() << "SLP: #LV: " << LiveValues.size(); 1659 for (auto *X : LiveValues) 1660 dbgs() << " " << X->getName(); 1661 dbgs() << ", Looking at "; 1662 Inst->dump(); 1663 ); 1664 1665 // Update LiveValues. 
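    // Illustrative note (assumption about the walk): PrevInst itself no longer
    // needs to be live once the walk moves past it, but its in-tree
    // instruction operands do, until their own tree entries are visited.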
1666 LiveValues.erase(PrevInst); 1667 for (auto &J : PrevInst->operands()) { 1668 if (isa<Instruction>(&*J) && ScalarToTreeEntry.count(&*J)) 1669 LiveValues.insert(cast<Instruction>(&*J)); 1670 } 1671 1672 // Now find the sequence of instructions between PrevInst and Inst. 1673 BasicBlock::reverse_iterator InstIt(Inst), PrevInstIt(PrevInst); 1674 --PrevInstIt; 1675 while (InstIt != PrevInstIt) { 1676 if (PrevInstIt == PrevInst->getParent()->rend()) { 1677 PrevInstIt = Inst->getParent()->rbegin(); 1678 continue; 1679 } 1680 1681 if (isa<CallInst>(&*PrevInstIt) && &*PrevInstIt != PrevInst) { 1682 SmallVector<Type*, 4> V; 1683 for (auto *II : LiveValues) 1684 V.push_back(VectorType::get(II->getType(), BundleWidth)); 1685 Cost += TTI->getCostOfKeepingLiveOverCall(V); 1686 } 1687 1688 ++PrevInstIt; 1689 } 1690 1691 PrevInst = Inst; 1692 } 1693 1694 DEBUG(dbgs() << "SLP: SpillCost=" << Cost << "\n"); 1695 return Cost; 1696 } 1697 1698 int BoUpSLP::getTreeCost() { 1699 int Cost = 0; 1700 DEBUG(dbgs() << "SLP: Calculating cost for tree of size " << 1701 VectorizableTree.size() << ".\n"); 1702 1703 // We only vectorize tiny trees if it is fully vectorizable. 1704 if (VectorizableTree.size() < 3 && !isFullyVectorizableTinyTree()) { 1705 if (VectorizableTree.empty()) { 1706 assert(!ExternalUses.size() && "We should not have any external users"); 1707 } 1708 return INT_MAX; 1709 } 1710 1711 unsigned BundleWidth = VectorizableTree[0].Scalars.size(); 1712 1713 for (unsigned i = 0, e = VectorizableTree.size(); i != e; ++i) { 1714 int C = getEntryCost(&VectorizableTree[i]); 1715 DEBUG(dbgs() << "SLP: Adding cost " << C << " for bundle that starts with " 1716 << *VectorizableTree[i].Scalars[0] << " .\n"); 1717 Cost += C; 1718 } 1719 1720 SmallSet<Value *, 16> ExtractCostCalculated; 1721 int ExtractCost = 0; 1722 for (UserList::iterator I = ExternalUses.begin(), E = ExternalUses.end(); 1723 I != E; ++I) { 1724 // We only add extract cost once for the same scalar. 1725 if (!ExtractCostCalculated.insert(I->Scalar).second) 1726 continue; 1727 1728 // Uses by ephemeral values are free (because the ephemeral value will be 1729 // removed prior to code generation, and so the extraction will be 1730 // removed as well). 1731 if (EphValues.count(I->User)) 1732 continue; 1733 1734 VectorType *VecTy = VectorType::get(I->Scalar->getType(), BundleWidth); 1735 ExtractCost += TTI->getVectorInstrCost(Instruction::ExtractElement, VecTy, 1736 I->Lane); 1737 } 1738 1739 Cost += getSpillCost(); 1740 1741 DEBUG(dbgs() << "SLP: Total Cost " << Cost + ExtractCost<< ".\n"); 1742 return Cost + ExtractCost; 1743 } 1744 1745 int BoUpSLP::getGatherCost(Type *Ty) { 1746 int Cost = 0; 1747 for (unsigned i = 0, e = cast<VectorType>(Ty)->getNumElements(); i < e; ++i) 1748 Cost += TTI->getVectorInstrCost(Instruction::InsertElement, Ty, i); 1749 return Cost; 1750 } 1751 1752 int BoUpSLP::getGatherCost(ArrayRef<Value *> VL) { 1753 // Find the type of the operands in VL. 1754 Type *ScalarTy = VL[0]->getType(); 1755 if (StoreInst *SI = dyn_cast<StoreInst>(VL[0])) 1756 ScalarTy = SI->getValueOperand()->getType(); 1757 VectorType *VecTy = VectorType::get(ScalarTy, VL.size()); 1758 // Find the cost of inserting/extracting values from the vector. 
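// E.g. gathering a bundle of four i32 scalars is modeled as four
// insertelement operations into a <4 x i32> vector (see the overload above).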
1759 return getGatherCost(VecTy); 1760 } 1761 1762 Value *BoUpSLP::getPointerOperand(Value *I) { 1763 if (LoadInst *LI = dyn_cast<LoadInst>(I)) 1764 return LI->getPointerOperand(); 1765 if (StoreInst *SI = dyn_cast<StoreInst>(I)) 1766 return SI->getPointerOperand(); 1767 return nullptr; 1768 } 1769 1770 unsigned BoUpSLP::getAddressSpaceOperand(Value *I) { 1771 if (LoadInst *L = dyn_cast<LoadInst>(I)) 1772 return L->getPointerAddressSpace(); 1773 if (StoreInst *S = dyn_cast<StoreInst>(I)) 1774 return S->getPointerAddressSpace(); 1775 return -1; 1776 } 1777 1778 bool BoUpSLP::isConsecutiveAccess(Value *A, Value *B) { 1779 Value *PtrA = getPointerOperand(A); 1780 Value *PtrB = getPointerOperand(B); 1781 unsigned ASA = getAddressSpaceOperand(A); 1782 unsigned ASB = getAddressSpaceOperand(B); 1783 1784 // Check that the address spaces match and that the pointers are valid. 1785 if (!PtrA || !PtrB || (ASA != ASB)) 1786 return false; 1787 1788 // Make sure that A and B are different pointers of the same type. 1789 if (PtrA == PtrB || PtrA->getType() != PtrB->getType()) 1790 return false; 1791 1792 unsigned PtrBitWidth = DL->getPointerSizeInBits(ASA); 1793 Type *Ty = cast<PointerType>(PtrA->getType())->getElementType(); 1794 APInt Size(PtrBitWidth, DL->getTypeStoreSize(Ty)); 1795 1796 APInt OffsetA(PtrBitWidth, 0), OffsetB(PtrBitWidth, 0); 1797 PtrA = PtrA->stripAndAccumulateInBoundsConstantOffsets(*DL, OffsetA); 1798 PtrB = PtrB->stripAndAccumulateInBoundsConstantOffsets(*DL, OffsetB); 1799 1800 APInt OffsetDelta = OffsetB - OffsetA; 1801 1802 // Check if they are based on the same pointer. That makes the offsets 1803 // sufficient. 1804 if (PtrA == PtrB) 1805 return OffsetDelta == Size; 1806 1807 // Compute the necessary base pointer delta to have the necessary final delta 1808 // equal to the size. 1809 APInt BaseDelta = Size - OffsetDelta; 1810 1811 // Otherwise compute the distance with SCEV between the base pointers. 1812 const SCEV *PtrSCEVA = SE->getSCEV(PtrA); 1813 const SCEV *PtrSCEVB = SE->getSCEV(PtrB); 1814 const SCEV *C = SE->getConstant(BaseDelta); 1815 const SCEV *X = SE->getAddExpr(PtrSCEVA, C); 1816 return X == PtrSCEVB; 1817 } 1818 1819 // Reorder commutative operations in alternate shuffle if the resulting vectors 1820 // are consecutive loads. This would allow us to vectorize the tree. 1821 // If we have something like- 1822 // load a[0] - load b[0] 1823 // load b[1] + load a[1] 1824 // load a[2] - load b[2] 1825 // load a[3] + load b[3] 1826 // Reordering the second load b[1] load a[1] would allow us to vectorize this 1827 // code. 1828 void BoUpSLP::reorderAltShuffleOperands(ArrayRef<Value *> VL, 1829 SmallVectorImpl<Value *> &Left, 1830 SmallVectorImpl<Value *> &Right) { 1831 1832 // Push left and right operands of binary operation into Left and Right 1833 for (unsigned i = 0, e = VL.size(); i < e; ++i) { 1834 Left.push_back(cast<Instruction>(VL[i])->getOperand(0)); 1835 Right.push_back(cast<Instruction>(VL[i])->getOperand(1)); 1836 } 1837 1838 // Reorder if we have a commutative operation and consecutive access 1839 // are on either side of the alternate instructions. 
1840 for (unsigned j = 0; j < VL.size() - 1; ++j) {
1841 if (LoadInst *L = dyn_cast<LoadInst>(Left[j])) {
1842 if (LoadInst *L1 = dyn_cast<LoadInst>(Right[j + 1])) {
1843 Instruction *VL1 = cast<Instruction>(VL[j]);
1844 Instruction *VL2 = cast<Instruction>(VL[j + 1]);
1845 if (isConsecutiveAccess(L, L1) && VL1->isCommutative()) {
1846 std::swap(Left[j], Right[j]);
1847 continue;
1848 } else if (isConsecutiveAccess(L, L1) && VL2->isCommutative()) {
1849 std::swap(Left[j + 1], Right[j + 1]);
1850 continue;
1851 }
1852 // else unchanged
1853 }
1854 }
1855 if (LoadInst *L = dyn_cast<LoadInst>(Right[j])) {
1856 if (LoadInst *L1 = dyn_cast<LoadInst>(Left[j + 1])) {
1857 Instruction *VL1 = cast<Instruction>(VL[j]);
1858 Instruction *VL2 = cast<Instruction>(VL[j + 1]);
1859 if (isConsecutiveAccess(L, L1) && VL1->isCommutative()) {
1860 std::swap(Left[j], Right[j]);
1861 continue;
1862 } else if (isConsecutiveAccess(L, L1) && VL2->isCommutative()) {
1863 std::swap(Left[j + 1], Right[j + 1]);
1864 continue;
1865 }
1866 // else unchanged
1867 }
1868 }
1869 }
1870 }
1871 
1872 void BoUpSLP::reorderInputsAccordingToOpcode(ArrayRef<Value *> VL,
1873 SmallVectorImpl<Value *> &Left,
1874 SmallVectorImpl<Value *> &Right) {
1875 
1876 SmallVector<Value *, 16> OrigLeft, OrigRight;
1877 
1878 bool AllSameOpcodeLeft = true;
1879 bool AllSameOpcodeRight = true;
1880 for (unsigned i = 0, e = VL.size(); i != e; ++i) {
1881 Instruction *I = cast<Instruction>(VL[i]);
1882 Value *VLeft = I->getOperand(0);
1883 Value *VRight = I->getOperand(1);
1884 
1885 OrigLeft.push_back(VLeft);
1886 OrigRight.push_back(VRight);
1887 
1888 Instruction *ILeft = dyn_cast<Instruction>(VLeft);
1889 Instruction *IRight = dyn_cast<Instruction>(VRight);
1890 
1891 // Check whether all operands on one side have the same opcode. In this case
1892 // we want to preserve the original order and not make things worse by
1893 // reordering.
1894 if (i && AllSameOpcodeLeft && ILeft) {
1895 if (Instruction *PLeft = dyn_cast<Instruction>(OrigLeft[i - 1])) {
1896 if (PLeft->getOpcode() != ILeft->getOpcode())
1897 AllSameOpcodeLeft = false;
1898 } else
1899 AllSameOpcodeLeft = false;
1900 }
1901 if (i && AllSameOpcodeRight && IRight) {
1902 if (Instruction *PRight = dyn_cast<Instruction>(OrigRight[i - 1])) {
1903 if (PRight->getOpcode() != IRight->getOpcode())
1904 AllSameOpcodeRight = false;
1905 } else
1906 AllSameOpcodeRight = false;
1907 }
1908 
1909 // Sort two opcodes. In the code below we try to preserve the ability to use
1910 // broadcast of values instead of individual inserts.
1911 // vl1 = load
1912 // vl2 = phi
1913 // vr1 = load
1914 // vr2 = vr1
1915 // = vl1 x vr1
1916 // = vl2 x vr2
1917 // If we just sorted according to opcode we would leave the first line
1918 // intact but we would swap vl2 with vr2 because opcode(phi) > opcode(load).
1919 // = vl1 x vr1
1920 // = vr2 x vl2
1921 // Because vr2 and vr1 are from the same load we lose the opportunity of a
1922 // broadcast for the packed right side in the backend: we have [vr1, vl2]
1923 // instead of [vr1, vr2=vr1].
1924 if (ILeft && IRight) {
1925 if (!i && ILeft->getOpcode() > IRight->getOpcode()) {
1926 Left.push_back(IRight);
1927 Right.push_back(ILeft);
1928 } else if (i && ILeft->getOpcode() > IRight->getOpcode() &&
1929 Right[i - 1] != IRight) {
1930 // Try not to destroy a broadcast for no apparent benefit.
1931 Left.push_back(IRight); 1932 Right.push_back(ILeft); 1933 } else if (i && ILeft->getOpcode() == IRight->getOpcode() && 1934 Right[i - 1] == ILeft) { 1935 // Try preserve broadcasts. 1936 Left.push_back(IRight); 1937 Right.push_back(ILeft); 1938 } else if (i && ILeft->getOpcode() == IRight->getOpcode() && 1939 Left[i - 1] == IRight) { 1940 // Try preserve broadcasts. 1941 Left.push_back(IRight); 1942 Right.push_back(ILeft); 1943 } else { 1944 Left.push_back(ILeft); 1945 Right.push_back(IRight); 1946 } 1947 continue; 1948 } 1949 // One opcode, put the instruction on the right. 1950 if (ILeft) { 1951 Left.push_back(VRight); 1952 Right.push_back(ILeft); 1953 continue; 1954 } 1955 Left.push_back(VLeft); 1956 Right.push_back(VRight); 1957 } 1958 1959 bool LeftBroadcast = isSplat(Left); 1960 bool RightBroadcast = isSplat(Right); 1961 1962 // If operands end up being broadcast return this operand order. 1963 if (LeftBroadcast || RightBroadcast) 1964 return; 1965 1966 // Don't reorder if the operands where good to begin. 1967 if (AllSameOpcodeRight || AllSameOpcodeLeft) { 1968 Left = OrigLeft; 1969 Right = OrigRight; 1970 } 1971 1972 // Finally check if we can get longer vectorizable chain by reordering 1973 // without breaking the good operand order detected above. 1974 // E.g. If we have something like- 1975 // load a[0] load b[0] 1976 // load b[1] load a[1] 1977 // load a[2] load b[2] 1978 // load a[3] load b[3] 1979 // Reordering the second load b[1] load a[1] would allow us to vectorize 1980 // this code and we still retain AllSameOpcode property. 1981 // FIXME: This load reordering might break AllSameOpcode in some rare cases 1982 // such as- 1983 // add a[0],c[0] load b[0] 1984 // add a[1],c[2] load b[1] 1985 // b[2] load b[2] 1986 // add a[3],c[3] load b[3] 1987 for (unsigned j = 0; j < VL.size() - 1; ++j) { 1988 if (LoadInst *L = dyn_cast<LoadInst>(Left[j])) { 1989 if (LoadInst *L1 = dyn_cast<LoadInst>(Right[j + 1])) { 1990 if (isConsecutiveAccess(L, L1)) { 1991 std::swap(Left[j + 1], Right[j + 1]); 1992 continue; 1993 } 1994 } 1995 } 1996 if (LoadInst *L = dyn_cast<LoadInst>(Right[j])) { 1997 if (LoadInst *L1 = dyn_cast<LoadInst>(Left[j + 1])) { 1998 if (isConsecutiveAccess(L, L1)) { 1999 std::swap(Left[j + 1], Right[j + 1]); 2000 continue; 2001 } 2002 } 2003 } 2004 // else unchanged 2005 } 2006 } 2007 2008 void BoUpSLP::setInsertPointAfterBundle(ArrayRef<Value *> VL) { 2009 Instruction *VL0 = cast<Instruction>(VL[0]); 2010 BasicBlock::iterator NextInst = VL0; 2011 ++NextInst; 2012 Builder.SetInsertPoint(VL0->getParent(), NextInst); 2013 Builder.SetCurrentDebugLocation(VL0->getDebugLoc()); 2014 } 2015 2016 Value *BoUpSLP::Gather(ArrayRef<Value *> VL, VectorType *Ty) { 2017 Value *Vec = UndefValue::get(Ty); 2018 // Generate the 'InsertElement' instruction. 2019 for (unsigned i = 0; i < Ty->getNumElements(); ++i) { 2020 Vec = Builder.CreateInsertElement(Vec, VL[i], Builder.getInt32(i)); 2021 if (Instruction *Insrt = dyn_cast<Instruction>(Vec)) { 2022 GatherSeq.insert(Insrt); 2023 CSEBlocks.insert(Insrt->getParent()); 2024 2025 // Add to our 'need-to-extract' list. 2026 if (ScalarToTreeEntry.count(VL[i])) { 2027 int Idx = ScalarToTreeEntry[VL[i]]; 2028 TreeEntry *E = &VectorizableTree[Idx]; 2029 // Find which lane we need to extract. 2030 int FoundLane = -1; 2031 for (unsigned Lane = 0, LE = VL.size(); Lane != LE; ++Lane) { 2032 // Is this the lane of the scalar that we are looking for ? 
2033 if (E->Scalars[Lane] == VL[i]) { 2034 FoundLane = Lane; 2035 break; 2036 } 2037 } 2038 assert(FoundLane >= 0 && "Could not find the correct lane"); 2039 ExternalUses.push_back(ExternalUser(VL[i], Insrt, FoundLane)); 2040 } 2041 } 2042 } 2043 2044 return Vec; 2045 } 2046 2047 Value *BoUpSLP::alreadyVectorized(ArrayRef<Value *> VL) const { 2048 SmallDenseMap<Value*, int>::const_iterator Entry 2049 = ScalarToTreeEntry.find(VL[0]); 2050 if (Entry != ScalarToTreeEntry.end()) { 2051 int Idx = Entry->second; 2052 const TreeEntry *En = &VectorizableTree[Idx]; 2053 if (En->isSame(VL) && En->VectorizedValue) 2054 return En->VectorizedValue; 2055 } 2056 return nullptr; 2057 } 2058 2059 Value *BoUpSLP::vectorizeTree(ArrayRef<Value *> VL) { 2060 if (ScalarToTreeEntry.count(VL[0])) { 2061 int Idx = ScalarToTreeEntry[VL[0]]; 2062 TreeEntry *E = &VectorizableTree[Idx]; 2063 if (E->isSame(VL)) 2064 return vectorizeTree(E); 2065 } 2066 2067 Type *ScalarTy = VL[0]->getType(); 2068 if (StoreInst *SI = dyn_cast<StoreInst>(VL[0])) 2069 ScalarTy = SI->getValueOperand()->getType(); 2070 VectorType *VecTy = VectorType::get(ScalarTy, VL.size()); 2071 2072 return Gather(VL, VecTy); 2073 } 2074 2075 Value *BoUpSLP::vectorizeTree(TreeEntry *E) { 2076 IRBuilder<>::InsertPointGuard Guard(Builder); 2077 2078 if (E->VectorizedValue) { 2079 DEBUG(dbgs() << "SLP: Diamond merged for " << *E->Scalars[0] << ".\n"); 2080 return E->VectorizedValue; 2081 } 2082 2083 Instruction *VL0 = cast<Instruction>(E->Scalars[0]); 2084 Type *ScalarTy = VL0->getType(); 2085 if (StoreInst *SI = dyn_cast<StoreInst>(VL0)) 2086 ScalarTy = SI->getValueOperand()->getType(); 2087 VectorType *VecTy = VectorType::get(ScalarTy, E->Scalars.size()); 2088 2089 if (E->NeedToGather) { 2090 setInsertPointAfterBundle(E->Scalars); 2091 return Gather(E->Scalars, VecTy); 2092 } 2093 2094 unsigned Opcode = getSameOpcode(E->Scalars); 2095 2096 switch (Opcode) { 2097 case Instruction::PHI: { 2098 PHINode *PH = dyn_cast<PHINode>(VL0); 2099 Builder.SetInsertPoint(PH->getParent()->getFirstNonPHI()); 2100 Builder.SetCurrentDebugLocation(PH->getDebugLoc()); 2101 PHINode *NewPhi = Builder.CreatePHI(VecTy, PH->getNumIncomingValues()); 2102 E->VectorizedValue = NewPhi; 2103 2104 // PHINodes may have multiple entries from the same block. We want to 2105 // visit every block once. 2106 SmallSet<BasicBlock*, 4> VisitedBBs; 2107 2108 for (unsigned i = 0, e = PH->getNumIncomingValues(); i < e; ++i) { 2109 ValueList Operands; 2110 BasicBlock *IBB = PH->getIncomingBlock(i); 2111 2112 if (!VisitedBBs.insert(IBB).second) { 2113 NewPhi->addIncoming(NewPhi->getIncomingValueForBlock(IBB), IBB); 2114 continue; 2115 } 2116 2117 // Prepare the operand vector. 
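// For this incoming block, lane j of the operand bundle is the value that the
// j-th scalar PHI receives from IBB; that bundle is then vectorized (or
// gathered) like any other operand list.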
2118 for (unsigned j = 0; j < E->Scalars.size(); ++j) 2119 Operands.push_back(cast<PHINode>(E->Scalars[j])-> 2120 getIncomingValueForBlock(IBB)); 2121 2122 Builder.SetInsertPoint(IBB->getTerminator()); 2123 Builder.SetCurrentDebugLocation(PH->getDebugLoc()); 2124 Value *Vec = vectorizeTree(Operands); 2125 NewPhi->addIncoming(Vec, IBB); 2126 } 2127 2128 assert(NewPhi->getNumIncomingValues() == PH->getNumIncomingValues() && 2129 "Invalid number of incoming values"); 2130 return NewPhi; 2131 } 2132 2133 case Instruction::ExtractElement: { 2134 if (CanReuseExtract(E->Scalars)) { 2135 Value *V = VL0->getOperand(0); 2136 E->VectorizedValue = V; 2137 return V; 2138 } 2139 return Gather(E->Scalars, VecTy); 2140 } 2141 case Instruction::ZExt: 2142 case Instruction::SExt: 2143 case Instruction::FPToUI: 2144 case Instruction::FPToSI: 2145 case Instruction::FPExt: 2146 case Instruction::PtrToInt: 2147 case Instruction::IntToPtr: 2148 case Instruction::SIToFP: 2149 case Instruction::UIToFP: 2150 case Instruction::Trunc: 2151 case Instruction::FPTrunc: 2152 case Instruction::BitCast: { 2153 ValueList INVL; 2154 for (int i = 0, e = E->Scalars.size(); i < e; ++i) 2155 INVL.push_back(cast<Instruction>(E->Scalars[i])->getOperand(0)); 2156 2157 setInsertPointAfterBundle(E->Scalars); 2158 2159 Value *InVec = vectorizeTree(INVL); 2160 2161 if (Value *V = alreadyVectorized(E->Scalars)) 2162 return V; 2163 2164 CastInst *CI = dyn_cast<CastInst>(VL0); 2165 Value *V = Builder.CreateCast(CI->getOpcode(), InVec, VecTy); 2166 E->VectorizedValue = V; 2167 ++NumVectorInstructions; 2168 return V; 2169 } 2170 case Instruction::FCmp: 2171 case Instruction::ICmp: { 2172 ValueList LHSV, RHSV; 2173 for (int i = 0, e = E->Scalars.size(); i < e; ++i) { 2174 LHSV.push_back(cast<Instruction>(E->Scalars[i])->getOperand(0)); 2175 RHSV.push_back(cast<Instruction>(E->Scalars[i])->getOperand(1)); 2176 } 2177 2178 setInsertPointAfterBundle(E->Scalars); 2179 2180 Value *L = vectorizeTree(LHSV); 2181 Value *R = vectorizeTree(RHSV); 2182 2183 if (Value *V = alreadyVectorized(E->Scalars)) 2184 return V; 2185 2186 CmpInst::Predicate P0 = dyn_cast<CmpInst>(VL0)->getPredicate(); 2187 Value *V; 2188 if (Opcode == Instruction::FCmp) 2189 V = Builder.CreateFCmp(P0, L, R); 2190 else 2191 V = Builder.CreateICmp(P0, L, R); 2192 2193 E->VectorizedValue = V; 2194 ++NumVectorInstructions; 2195 return V; 2196 } 2197 case Instruction::Select: { 2198 ValueList TrueVec, FalseVec, CondVec; 2199 for (int i = 0, e = E->Scalars.size(); i < e; ++i) { 2200 CondVec.push_back(cast<Instruction>(E->Scalars[i])->getOperand(0)); 2201 TrueVec.push_back(cast<Instruction>(E->Scalars[i])->getOperand(1)); 2202 FalseVec.push_back(cast<Instruction>(E->Scalars[i])->getOperand(2)); 2203 } 2204 2205 setInsertPointAfterBundle(E->Scalars); 2206 2207 Value *Cond = vectorizeTree(CondVec); 2208 Value *True = vectorizeTree(TrueVec); 2209 Value *False = vectorizeTree(FalseVec); 2210 2211 if (Value *V = alreadyVectorized(E->Scalars)) 2212 return V; 2213 2214 Value *V = Builder.CreateSelect(Cond, True, False); 2215 E->VectorizedValue = V; 2216 ++NumVectorInstructions; 2217 return V; 2218 } 2219 case Instruction::Add: 2220 case Instruction::FAdd: 2221 case Instruction::Sub: 2222 case Instruction::FSub: 2223 case Instruction::Mul: 2224 case Instruction::FMul: 2225 case Instruction::UDiv: 2226 case Instruction::SDiv: 2227 case Instruction::FDiv: 2228 case Instruction::URem: 2229 case Instruction::SRem: 2230 case Instruction::FRem: 2231 case Instruction::Shl: 2232 case Instruction::LShr: 
2233 case Instruction::AShr: 2234 case Instruction::And: 2235 case Instruction::Or: 2236 case Instruction::Xor: { 2237 ValueList LHSVL, RHSVL; 2238 if (isa<BinaryOperator>(VL0) && VL0->isCommutative()) 2239 reorderInputsAccordingToOpcode(E->Scalars, LHSVL, RHSVL); 2240 else 2241 for (int i = 0, e = E->Scalars.size(); i < e; ++i) { 2242 LHSVL.push_back(cast<Instruction>(E->Scalars[i])->getOperand(0)); 2243 RHSVL.push_back(cast<Instruction>(E->Scalars[i])->getOperand(1)); 2244 } 2245 2246 setInsertPointAfterBundle(E->Scalars); 2247 2248 Value *LHS = vectorizeTree(LHSVL); 2249 Value *RHS = vectorizeTree(RHSVL); 2250 2251 if (LHS == RHS && isa<Instruction>(LHS)) { 2252 assert((VL0->getOperand(0) == VL0->getOperand(1)) && "Invalid order"); 2253 } 2254 2255 if (Value *V = alreadyVectorized(E->Scalars)) 2256 return V; 2257 2258 BinaryOperator *BinOp = cast<BinaryOperator>(VL0); 2259 Value *V = Builder.CreateBinOp(BinOp->getOpcode(), LHS, RHS); 2260 E->VectorizedValue = V; 2261 propagateIRFlags(E->VectorizedValue, E->Scalars); 2262 ++NumVectorInstructions; 2263 2264 if (Instruction *I = dyn_cast<Instruction>(V)) 2265 return propagateMetadata(I, E->Scalars); 2266 2267 return V; 2268 } 2269 case Instruction::Load: { 2270 // Loads are inserted at the head of the tree because we don't want to 2271 // sink them all the way down past store instructions. 2272 setInsertPointAfterBundle(E->Scalars); 2273 2274 LoadInst *LI = cast<LoadInst>(VL0); 2275 Type *ScalarLoadTy = LI->getType(); 2276 unsigned AS = LI->getPointerAddressSpace(); 2277 2278 Value *VecPtr = Builder.CreateBitCast(LI->getPointerOperand(), 2279 VecTy->getPointerTo(AS)); 2280 2281 // The pointer operand uses an in-tree scalar so we add the new BitCast to 2282 // ExternalUses list to make sure that an extract will be generated in the 2283 // future. 2284 if (ScalarToTreeEntry.count(LI->getPointerOperand())) 2285 ExternalUses.push_back( 2286 ExternalUser(LI->getPointerOperand(), cast<User>(VecPtr), 0)); 2287 2288 unsigned Alignment = LI->getAlignment(); 2289 LI = Builder.CreateLoad(VecPtr); 2290 if (!Alignment) 2291 Alignment = DL->getABITypeAlignment(ScalarLoadTy); 2292 LI->setAlignment(Alignment); 2293 E->VectorizedValue = LI; 2294 ++NumVectorInstructions; 2295 return propagateMetadata(LI, E->Scalars); 2296 } 2297 case Instruction::Store: { 2298 StoreInst *SI = cast<StoreInst>(VL0); 2299 unsigned Alignment = SI->getAlignment(); 2300 unsigned AS = SI->getPointerAddressSpace(); 2301 2302 ValueList ValueOp; 2303 for (int i = 0, e = E->Scalars.size(); i < e; ++i) 2304 ValueOp.push_back(cast<StoreInst>(E->Scalars[i])->getValueOperand()); 2305 2306 setInsertPointAfterBundle(E->Scalars); 2307 2308 Value *VecValue = vectorizeTree(ValueOp); 2309 Value *VecPtr = Builder.CreateBitCast(SI->getPointerOperand(), 2310 VecTy->getPointerTo(AS)); 2311 StoreInst *S = Builder.CreateStore(VecValue, VecPtr); 2312 2313 // The pointer operand uses an in-tree scalar so we add the new BitCast to 2314 // ExternalUses list to make sure that an extract will be generated in the 2315 // future. 
2316 if (ScalarToTreeEntry.count(SI->getPointerOperand())) 2317 ExternalUses.push_back( 2318 ExternalUser(SI->getPointerOperand(), cast<User>(VecPtr), 0)); 2319 2320 if (!Alignment) 2321 Alignment = DL->getABITypeAlignment(SI->getValueOperand()->getType()); 2322 S->setAlignment(Alignment); 2323 E->VectorizedValue = S; 2324 ++NumVectorInstructions; 2325 return propagateMetadata(S, E->Scalars); 2326 } 2327 case Instruction::GetElementPtr: { 2328 setInsertPointAfterBundle(E->Scalars); 2329 2330 ValueList Op0VL; 2331 for (int i = 0, e = E->Scalars.size(); i < e; ++i) 2332 Op0VL.push_back(cast<GetElementPtrInst>(E->Scalars[i])->getOperand(0)); 2333 2334 Value *Op0 = vectorizeTree(Op0VL); 2335 2336 std::vector<Value *> OpVecs; 2337 for (int j = 1, e = cast<GetElementPtrInst>(VL0)->getNumOperands(); j < e; 2338 ++j) { 2339 ValueList OpVL; 2340 for (int i = 0, e = E->Scalars.size(); i < e; ++i) 2341 OpVL.push_back(cast<GetElementPtrInst>(E->Scalars[i])->getOperand(j)); 2342 2343 Value *OpVec = vectorizeTree(OpVL); 2344 OpVecs.push_back(OpVec); 2345 } 2346 2347 Value *V = Builder.CreateGEP(Op0, OpVecs); 2348 E->VectorizedValue = V; 2349 ++NumVectorInstructions; 2350 2351 if (Instruction *I = dyn_cast<Instruction>(V)) 2352 return propagateMetadata(I, E->Scalars); 2353 2354 return V; 2355 } 2356 case Instruction::Call: { 2357 CallInst *CI = cast<CallInst>(VL0); 2358 setInsertPointAfterBundle(E->Scalars); 2359 Function *FI; 2360 Intrinsic::ID IID = Intrinsic::not_intrinsic; 2361 Value *ScalarArg = nullptr; 2362 if (CI && (FI = CI->getCalledFunction())) { 2363 IID = (Intrinsic::ID) FI->getIntrinsicID(); 2364 } 2365 std::vector<Value *> OpVecs; 2366 for (int j = 0, e = CI->getNumArgOperands(); j < e; ++j) { 2367 ValueList OpVL; 2368 // ctlz,cttz and powi are special intrinsics whose second argument is 2369 // a scalar. This argument should not be vectorized. 2370 if (hasVectorInstrinsicScalarOpd(IID, 1) && j == 1) { 2371 CallInst *CEI = cast<CallInst>(E->Scalars[0]); 2372 ScalarArg = CEI->getArgOperand(j); 2373 OpVecs.push_back(CEI->getArgOperand(j)); 2374 continue; 2375 } 2376 for (int i = 0, e = E->Scalars.size(); i < e; ++i) { 2377 CallInst *CEI = cast<CallInst>(E->Scalars[i]); 2378 OpVL.push_back(CEI->getArgOperand(j)); 2379 } 2380 2381 Value *OpVec = vectorizeTree(OpVL); 2382 DEBUG(dbgs() << "SLP: OpVec[" << j << "]: " << *OpVec << "\n"); 2383 OpVecs.push_back(OpVec); 2384 } 2385 2386 Module *M = F->getParent(); 2387 Intrinsic::ID ID = getIntrinsicIDForCall(CI, TLI); 2388 Type *Tys[] = { VectorType::get(CI->getType(), E->Scalars.size()) }; 2389 Function *CF = Intrinsic::getDeclaration(M, ID, Tys); 2390 Value *V = Builder.CreateCall(CF, OpVecs); 2391 2392 // The scalar argument uses an in-tree scalar so we add the new vectorized 2393 // call to ExternalUses list to make sure that an extract will be 2394 // generated in the future. 
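// E.g. for powi the exponent operand stays scalar; if that scalar is itself
// part of the tree it will be erased after vectorization, so an extract has
// to be emitted for this use.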
2395 if (ScalarArg && ScalarToTreeEntry.count(ScalarArg)) 2396 ExternalUses.push_back(ExternalUser(ScalarArg, cast<User>(V), 0)); 2397 2398 E->VectorizedValue = V; 2399 ++NumVectorInstructions; 2400 return V; 2401 } 2402 case Instruction::ShuffleVector: { 2403 ValueList LHSVL, RHSVL; 2404 assert(isa<BinaryOperator>(VL0) && "Invalid Shuffle Vector Operand"); 2405 reorderAltShuffleOperands(E->Scalars, LHSVL, RHSVL); 2406 setInsertPointAfterBundle(E->Scalars); 2407 2408 Value *LHS = vectorizeTree(LHSVL); 2409 Value *RHS = vectorizeTree(RHSVL); 2410 2411 if (Value *V = alreadyVectorized(E->Scalars)) 2412 return V; 2413 2414 // Create a vector of LHS op1 RHS 2415 BinaryOperator *BinOp0 = cast<BinaryOperator>(VL0); 2416 Value *V0 = Builder.CreateBinOp(BinOp0->getOpcode(), LHS, RHS); 2417 2418 // Create a vector of LHS op2 RHS 2419 Instruction *VL1 = cast<Instruction>(E->Scalars[1]); 2420 BinaryOperator *BinOp1 = cast<BinaryOperator>(VL1); 2421 Value *V1 = Builder.CreateBinOp(BinOp1->getOpcode(), LHS, RHS); 2422 2423 // Create shuffle to take alternate operations from the vector. 2424 // Also, gather up odd and even scalar ops to propagate IR flags to 2425 // each vector operation. 2426 ValueList OddScalars, EvenScalars; 2427 unsigned e = E->Scalars.size(); 2428 SmallVector<Constant *, 8> Mask(e); 2429 for (unsigned i = 0; i < e; ++i) { 2430 if (i & 1) { 2431 Mask[i] = Builder.getInt32(e + i); 2432 OddScalars.push_back(E->Scalars[i]); 2433 } else { 2434 Mask[i] = Builder.getInt32(i); 2435 EvenScalars.push_back(E->Scalars[i]); 2436 } 2437 } 2438 2439 Value *ShuffleMask = ConstantVector::get(Mask); 2440 propagateIRFlags(V0, EvenScalars); 2441 propagateIRFlags(V1, OddScalars); 2442 2443 Value *V = Builder.CreateShuffleVector(V0, V1, ShuffleMask); 2444 E->VectorizedValue = V; 2445 ++NumVectorInstructions; 2446 if (Instruction *I = dyn_cast<Instruction>(V)) 2447 return propagateMetadata(I, E->Scalars); 2448 2449 return V; 2450 } 2451 default: 2452 llvm_unreachable("unknown inst"); 2453 } 2454 return nullptr; 2455 } 2456 2457 Value *BoUpSLP::vectorizeTree() { 2458 2459 // All blocks must be scheduled before any instructions are inserted. 2460 for (auto &BSIter : BlocksSchedules) { 2461 scheduleBlock(BSIter.second.get()); 2462 } 2463 2464 Builder.SetInsertPoint(F->getEntryBlock().begin()); 2465 vectorizeTree(&VectorizableTree[0]); 2466 2467 DEBUG(dbgs() << "SLP: Extracting " << ExternalUses.size() << " values .\n"); 2468 2469 // Extract all of the elements with the external uses. 2470 for (UserList::iterator it = ExternalUses.begin(), e = ExternalUses.end(); 2471 it != e; ++it) { 2472 Value *Scalar = it->Scalar; 2473 llvm::User *User = it->User; 2474 2475 // Skip users that we already RAUW. This happens when one instruction 2476 // has multiple uses of the same value. 2477 if (std::find(Scalar->user_begin(), Scalar->user_end(), User) == 2478 Scalar->user_end()) 2479 continue; 2480 assert(ScalarToTreeEntry.count(Scalar) && "Invalid scalar"); 2481 2482 int Idx = ScalarToTreeEntry[Scalar]; 2483 TreeEntry *E = &VectorizableTree[Idx]; 2484 assert(!E->NeedToGather && "Extracting from a gather list"); 2485 2486 Value *Vec = E->VectorizedValue; 2487 assert(Vec && "Can't find vectorizable value"); 2488 2489 Value *Lane = Builder.getInt32(it->Lane); 2490 // Generate extracts for out-of-tree users. 2491 // Find the insertion point for the extractelement lane. 
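// Roughly: if the user is a PHI we extract in the matching incoming block
// (before its terminator); for any other instruction user we extract right
// before that user; and if the vectorized value is not an instruction at all
// (e.g. it was folded to a constant) we extract at the function entry.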
2492 if (isa<Instruction>(Vec)){ 2493 if (PHINode *PH = dyn_cast<PHINode>(User)) { 2494 for (int i = 0, e = PH->getNumIncomingValues(); i != e; ++i) { 2495 if (PH->getIncomingValue(i) == Scalar) { 2496 Builder.SetInsertPoint(PH->getIncomingBlock(i)->getTerminator()); 2497 Value *Ex = Builder.CreateExtractElement(Vec, Lane); 2498 CSEBlocks.insert(PH->getIncomingBlock(i)); 2499 PH->setOperand(i, Ex); 2500 } 2501 } 2502 } else { 2503 Builder.SetInsertPoint(cast<Instruction>(User)); 2504 Value *Ex = Builder.CreateExtractElement(Vec, Lane); 2505 CSEBlocks.insert(cast<Instruction>(User)->getParent()); 2506 User->replaceUsesOfWith(Scalar, Ex); 2507 } 2508 } else { 2509 Builder.SetInsertPoint(F->getEntryBlock().begin()); 2510 Value *Ex = Builder.CreateExtractElement(Vec, Lane); 2511 CSEBlocks.insert(&F->getEntryBlock()); 2512 User->replaceUsesOfWith(Scalar, Ex); 2513 } 2514 2515 DEBUG(dbgs() << "SLP: Replaced:" << *User << ".\n"); 2516 } 2517 2518 // For each vectorized value: 2519 for (int EIdx = 0, EE = VectorizableTree.size(); EIdx < EE; ++EIdx) { 2520 TreeEntry *Entry = &VectorizableTree[EIdx]; 2521 2522 // For each lane: 2523 for (int Lane = 0, LE = Entry->Scalars.size(); Lane != LE; ++Lane) { 2524 Value *Scalar = Entry->Scalars[Lane]; 2525 // No need to handle users of gathered values. 2526 if (Entry->NeedToGather) 2527 continue; 2528 2529 assert(Entry->VectorizedValue && "Can't find vectorizable value"); 2530 2531 Type *Ty = Scalar->getType(); 2532 if (!Ty->isVoidTy()) { 2533 #ifndef NDEBUG 2534 for (User *U : Scalar->users()) { 2535 DEBUG(dbgs() << "SLP: \tvalidating user:" << *U << ".\n"); 2536 2537 assert((ScalarToTreeEntry.count(U) || 2538 // It is legal to replace users in the ignorelist by undef. 2539 (std::find(UserIgnoreList.begin(), UserIgnoreList.end(), U) != 2540 UserIgnoreList.end())) && 2541 "Replacing out-of-tree value with undef"); 2542 } 2543 #endif 2544 Value *Undef = UndefValue::get(Ty); 2545 Scalar->replaceAllUsesWith(Undef); 2546 } 2547 DEBUG(dbgs() << "SLP: \tErasing scalar:" << *Scalar << ".\n"); 2548 eraseInstruction(cast<Instruction>(Scalar)); 2549 } 2550 } 2551 2552 Builder.ClearInsertionPoint(); 2553 2554 return VectorizableTree[0].VectorizedValue; 2555 } 2556 2557 void BoUpSLP::optimizeGatherSequence() { 2558 DEBUG(dbgs() << "SLP: Optimizing " << GatherSeq.size() 2559 << " gather sequences instructions.\n"); 2560 // LICM InsertElementInst sequences. 2561 for (SetVector<Instruction *>::iterator it = GatherSeq.begin(), 2562 e = GatherSeq.end(); it != e; ++it) { 2563 InsertElementInst *Insert = dyn_cast<InsertElementInst>(*it); 2564 2565 if (!Insert) 2566 continue; 2567 2568 // Check if this block is inside a loop. 2569 Loop *L = LI->getLoopFor(Insert->getParent()); 2570 if (!L) 2571 continue; 2572 2573 // Check if it has a preheader. 2574 BasicBlock *PreHeader = L->getLoopPreheader(); 2575 if (!PreHeader) 2576 continue; 2577 2578 // If the vector or the element that we insert into it are 2579 // instructions that are defined in this basic block then we can't 2580 // hoist this instruction. 2581 Instruction *CurrVec = dyn_cast<Instruction>(Insert->getOperand(0)); 2582 Instruction *NewElem = dyn_cast<Instruction>(Insert->getOperand(1)); 2583 if (CurrVec && L->contains(CurrVec)) 2584 continue; 2585 if (NewElem && L->contains(NewElem)) 2586 continue; 2587 2588 // We can hoist this instruction. Move it to the pre-header. 2589 Insert->moveBefore(PreHeader->getTerminator()); 2590 } 2591 2592 // Make a list of all reachable blocks in our CSE queue. 
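// Only blocks that actually received gather sequences are candidates here;
// blocks without a dominator-tree node (unreachable ones) are skipped below.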
2593 SmallVector<const DomTreeNode *, 8> CSEWorkList; 2594 CSEWorkList.reserve(CSEBlocks.size()); 2595 for (BasicBlock *BB : CSEBlocks) 2596 if (DomTreeNode *N = DT->getNode(BB)) { 2597 assert(DT->isReachableFromEntry(N)); 2598 CSEWorkList.push_back(N); 2599 } 2600 2601 // Sort blocks by domination. This ensures we visit a block after all blocks 2602 // dominating it are visited. 2603 std::stable_sort(CSEWorkList.begin(), CSEWorkList.end(), 2604 [this](const DomTreeNode *A, const DomTreeNode *B) { 2605 return DT->properlyDominates(A, B); 2606 }); 2607 2608 // Perform O(N^2) search over the gather sequences and merge identical 2609 // instructions. TODO: We can further optimize this scan if we split the 2610 // instructions into different buckets based on the insert lane. 2611 SmallVector<Instruction *, 16> Visited; 2612 for (auto I = CSEWorkList.begin(), E = CSEWorkList.end(); I != E; ++I) { 2613 assert((I == CSEWorkList.begin() || !DT->dominates(*I, *std::prev(I))) && 2614 "Worklist not sorted properly!"); 2615 BasicBlock *BB = (*I)->getBlock(); 2616 // For all instructions in blocks containing gather sequences: 2617 for (BasicBlock::iterator it = BB->begin(), e = BB->end(); it != e;) { 2618 Instruction *In = it++; 2619 if (!isa<InsertElementInst>(In) && !isa<ExtractElementInst>(In)) 2620 continue; 2621 2622 // Check if we can replace this instruction with any of the 2623 // visited instructions. 2624 for (SmallVectorImpl<Instruction *>::iterator v = Visited.begin(), 2625 ve = Visited.end(); 2626 v != ve; ++v) { 2627 if (In->isIdenticalTo(*v) && 2628 DT->dominates((*v)->getParent(), In->getParent())) { 2629 In->replaceAllUsesWith(*v); 2630 eraseInstruction(In); 2631 In = nullptr; 2632 break; 2633 } 2634 } 2635 if (In) { 2636 assert(std::find(Visited.begin(), Visited.end(), In) == Visited.end()); 2637 Visited.push_back(In); 2638 } 2639 } 2640 } 2641 CSEBlocks.clear(); 2642 GatherSeq.clear(); 2643 } 2644 2645 // Groups the instructions to a bundle (which is then a single scheduling entity) 2646 // and schedules instructions until the bundle gets ready. 2647 bool BoUpSLP::BlockScheduling::tryScheduleBundle(ArrayRef<Value *> VL, 2648 BoUpSLP *SLP) { 2649 if (isa<PHINode>(VL[0])) 2650 return true; 2651 2652 // Initialize the instruction bundle. 2653 Instruction *OldScheduleEnd = ScheduleEnd; 2654 ScheduleData *PrevInBundle = nullptr; 2655 ScheduleData *Bundle = nullptr; 2656 bool ReSchedule = false; 2657 DEBUG(dbgs() << "SLP: bundle: " << *VL[0] << "\n"); 2658 for (Value *V : VL) { 2659 extendSchedulingRegion(V); 2660 ScheduleData *BundleMember = getScheduleData(V); 2661 assert(BundleMember && 2662 "no ScheduleData for bundle member (maybe not in same basic block)"); 2663 if (BundleMember->IsScheduled) { 2664 // A bundle member was scheduled as single instruction before and now 2665 // needs to be scheduled as part of the bundle. We just get rid of the 2666 // existing schedule. 2667 DEBUG(dbgs() << "SLP: reset schedule because " << *BundleMember 2668 << " was already scheduled\n"); 2669 ReSchedule = true; 2670 } 2671 assert(BundleMember->isSchedulingEntity() && 2672 "bundle member already part of other bundle"); 2673 if (PrevInBundle) { 2674 PrevInBundle->NextInBundle = BundleMember; 2675 } else { 2676 Bundle = BundleMember; 2677 } 2678 BundleMember->UnscheduledDepsInBundle = 0; 2679 Bundle->UnscheduledDepsInBundle += BundleMember->UnscheduledDeps; 2680 2681 // Group the instructions to a bundle. 
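// A bundle is kept as a singly linked list of ScheduleData: each member's
// FirstInBundle points at the head (the scheduling entity) and NextInBundle
// chains the members, while the head accumulates UnscheduledDepsInBundle.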
2682 BundleMember->FirstInBundle = Bundle;
2683 PrevInBundle = BundleMember;
2684 }
2685 if (ScheduleEnd != OldScheduleEnd) {
2686 // The scheduling region got new instructions at the lower end (or it is a
2687 // new region for the first bundle). This makes it necessary to
2688 // recalculate all dependencies.
2689 // It is seldom that this needs to be done a second time after adding the
2690 // initial bundle to the region.
2691 for (auto *I = ScheduleStart; I != ScheduleEnd; I = I->getNextNode()) {
2692 ScheduleData *SD = getScheduleData(I);
2693 SD->clearDependencies();
2694 }
2695 ReSchedule = true;
2696 }
2697 if (ReSchedule) {
2698 resetSchedule();
2699 initialFillReadyList(ReadyInsts);
2700 }
2701 
2702 DEBUG(dbgs() << "SLP: try schedule bundle " << *Bundle << " in block "
2703 << BB->getName() << "\n");
2704 
2705 calculateDependencies(Bundle, true, SLP);
2706 
2707 // Now try to schedule the new bundle. As soon as the bundle is "ready" it
2708 // means that there are no cyclic dependencies and we can schedule it.
2709 // Note that it's important that we don't "schedule" the bundle yet (see
2710 // cancelScheduling).
2711 while (!Bundle->isReady() && !ReadyInsts.empty()) {
2712 
2713 ScheduleData *pickedSD = ReadyInsts.back();
2714 ReadyInsts.pop_back();
2715 
2716 if (pickedSD->isSchedulingEntity() && pickedSD->isReady()) {
2717 schedule(pickedSD, ReadyInsts);
2718 }
2719 }
2720 return Bundle->isReady();
2721 }
2722 
2723 void BoUpSLP::BlockScheduling::cancelScheduling(ArrayRef<Value *> VL) {
2724 if (isa<PHINode>(VL[0]))
2725 return;
2726 
2727 ScheduleData *Bundle = getScheduleData(VL[0]);
2728 DEBUG(dbgs() << "SLP: cancel scheduling of " << *Bundle << "\n");
2729 assert(!Bundle->IsScheduled &&
2730 "Can't cancel bundle which is already scheduled");
2731 assert(Bundle->isSchedulingEntity() && Bundle->isPartOfBundle() &&
2732 "tried to unbundle something which is not a bundle");
2733 
2734 // Un-bundle: make single instructions out of the bundle.
2735 ScheduleData *BundleMember = Bundle;
2736 while (BundleMember) {
2737 assert(BundleMember->FirstInBundle == Bundle && "corrupt bundle links");
2738 BundleMember->FirstInBundle = BundleMember;
2739 ScheduleData *Next = BundleMember->NextInBundle;
2740 BundleMember->NextInBundle = nullptr;
2741 BundleMember->UnscheduledDepsInBundle = BundleMember->UnscheduledDeps;
2742 if (BundleMember->UnscheduledDepsInBundle == 0) {
2743 ReadyInsts.insert(BundleMember);
2744 }
2745 BundleMember = Next;
2746 }
2747 }
2748 
2749 void BoUpSLP::BlockScheduling::extendSchedulingRegion(Value *V) {
2750 if (getScheduleData(V))
2751 return;
2752 Instruction *I = dyn_cast<Instruction>(V);
2753 assert(I && "bundle member must be an instruction");
2754 assert(!isa<PHINode>(I) && "phi nodes don't need to be scheduled");
2755 if (!ScheduleStart) {
2756 // It's the first instruction in the new region.
2757 initScheduleData(I, I->getNextNode(), nullptr, nullptr);
2758 ScheduleStart = I;
2759 ScheduleEnd = I->getNextNode();
2760 assert(ScheduleEnd && "tried to vectorize a TerminatorInst?");
2761 DEBUG(dbgs() << "SLP: initialize schedule region to " << *I << "\n");
2762 return;
2763 }
2764 // Search up and down at the same time, because we don't know if the new
2765 // instruction is above or below the existing scheduling region.
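// E.g. with a region [ScheduleStart, ScheduleEnd) and the new instruction
// above it, the upward iterator reaches it first and the region start is
// moved up; if it lies below, the downward iterator wins and the region end
// is moved past it.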
2766 BasicBlock::reverse_iterator UpIter(ScheduleStart); 2767 BasicBlock::reverse_iterator UpperEnd = BB->rend(); 2768 BasicBlock::iterator DownIter(ScheduleEnd); 2769 BasicBlock::iterator LowerEnd = BB->end(); 2770 for (;;) { 2771 if (UpIter != UpperEnd) { 2772 if (&*UpIter == I) { 2773 initScheduleData(I, ScheduleStart, nullptr, FirstLoadStoreInRegion); 2774 ScheduleStart = I; 2775 DEBUG(dbgs() << "SLP: extend schedule region start to " << *I << "\n"); 2776 return; 2777 } 2778 UpIter++; 2779 } 2780 if (DownIter != LowerEnd) { 2781 if (&*DownIter == I) { 2782 initScheduleData(ScheduleEnd, I->getNextNode(), LastLoadStoreInRegion, 2783 nullptr); 2784 ScheduleEnd = I->getNextNode(); 2785 assert(ScheduleEnd && "tried to vectorize a TerminatorInst?"); 2786 DEBUG(dbgs() << "SLP: extend schedule region end to " << *I << "\n"); 2787 return; 2788 } 2789 DownIter++; 2790 } 2791 assert((UpIter != UpperEnd || DownIter != LowerEnd) && 2792 "instruction not found in block"); 2793 } 2794 } 2795 2796 void BoUpSLP::BlockScheduling::initScheduleData(Instruction *FromI, 2797 Instruction *ToI, 2798 ScheduleData *PrevLoadStore, 2799 ScheduleData *NextLoadStore) { 2800 ScheduleData *CurrentLoadStore = PrevLoadStore; 2801 for (Instruction *I = FromI; I != ToI; I = I->getNextNode()) { 2802 ScheduleData *SD = ScheduleDataMap[I]; 2803 if (!SD) { 2804 // Allocate a new ScheduleData for the instruction. 2805 if (ChunkPos >= ChunkSize) { 2806 ScheduleDataChunks.push_back( 2807 llvm::make_unique<ScheduleData[]>(ChunkSize)); 2808 ChunkPos = 0; 2809 } 2810 SD = &(ScheduleDataChunks.back()[ChunkPos++]); 2811 ScheduleDataMap[I] = SD; 2812 SD->Inst = I; 2813 } 2814 assert(!isInSchedulingRegion(SD) && 2815 "new ScheduleData already in scheduling region"); 2816 SD->init(SchedulingRegionID); 2817 2818 if (I->mayReadOrWriteMemory()) { 2819 // Update the linked list of memory accessing instructions. 2820 if (CurrentLoadStore) { 2821 CurrentLoadStore->NextLoadStore = SD; 2822 } else { 2823 FirstLoadStoreInRegion = SD; 2824 } 2825 CurrentLoadStore = SD; 2826 } 2827 } 2828 if (NextLoadStore) { 2829 if (CurrentLoadStore) 2830 CurrentLoadStore->NextLoadStore = NextLoadStore; 2831 } else { 2832 LastLoadStoreInRegion = CurrentLoadStore; 2833 } 2834 } 2835 2836 void BoUpSLP::BlockScheduling::calculateDependencies(ScheduleData *SD, 2837 bool InsertInReadyList, 2838 BoUpSLP *SLP) { 2839 assert(SD->isSchedulingEntity()); 2840 2841 SmallVector<ScheduleData *, 10> WorkList; 2842 WorkList.push_back(SD); 2843 2844 while (!WorkList.empty()) { 2845 ScheduleData *SD = WorkList.back(); 2846 WorkList.pop_back(); 2847 2848 ScheduleData *BundleMember = SD; 2849 while (BundleMember) { 2850 assert(isInSchedulingRegion(BundleMember)); 2851 if (!BundleMember->hasValidDependencies()) { 2852 2853 DEBUG(dbgs() << "SLP: update deps of " << *BundleMember << "\n"); 2854 BundleMember->Dependencies = 0; 2855 BundleMember->resetUnscheduledDeps(); 2856 2857 // Handle def-use chain dependencies. 2858 for (User *U : BundleMember->Inst->users()) { 2859 if (isa<Instruction>(U)) { 2860 ScheduleData *UseSD = getScheduleData(U); 2861 if (UseSD && isInSchedulingRegion(UseSD->FirstInBundle)) { 2862 BundleMember->Dependencies++; 2863 ScheduleData *DestBundle = UseSD->FirstInBundle; 2864 if (!DestBundle->IsScheduled) { 2865 BundleMember->incrementUnscheduledDeps(1); 2866 } 2867 if (!DestBundle->hasValidDependencies()) { 2868 WorkList.push_back(DestBundle); 2869 } 2870 } 2871 } else { 2872 // I'm not sure if this can ever happen. But we need to be safe. 
2873 // This lets the instruction/bundle never be scheduled and eventually
2874 // disables vectorization.
2875 BundleMember->Dependencies++;
2876 BundleMember->incrementUnscheduledDeps(1);
2877 }
2878 }
2879 
2880 // Handle the memory dependencies.
2881 ScheduleData *DepDest = BundleMember->NextLoadStore;
2882 if (DepDest) {
2883 Instruction *SrcInst = BundleMember->Inst;
2884 AliasAnalysis::Location SrcLoc = getLocation(SrcInst, SLP->AA);
2885 bool SrcMayWrite = BundleMember->Inst->mayWriteToMemory();
2886 unsigned numAliased = 0;
2887 unsigned DistToSrc = 1;
2888 
2889 while (DepDest) {
2890 assert(isInSchedulingRegion(DepDest));
2891 
2892 // We have two limits to reduce the complexity:
2893 // 1) AliasedCheckLimit: It's a small limit to reduce calls to
2894 // SLP->isAliased (which is the expensive part in this loop).
2895 // 2) MaxMemDepDistance: It's for very large blocks and it aborts
2896 // the whole loop (even if the loop is fast, it's quadratic).
2897 // It's important for the loop break condition (see below) to
2898 // check this limit even between two read-only instructions.
2899 if (DistToSrc >= MaxMemDepDistance ||
2900 ((SrcMayWrite || DepDest->Inst->mayWriteToMemory()) &&
2901 (numAliased >= AliasedCheckLimit ||
2902 SLP->isAliased(SrcLoc, SrcInst, DepDest->Inst)))) {
2903 
2904 // We increment the counter only if the locations are aliased
2905 // (instead of counting all alias checks). This gives a better
2906 // balance between reduced runtime and accurate dependencies.
2907 numAliased++;
2908 
2909 DepDest->MemoryDependencies.push_back(BundleMember);
2910 BundleMember->Dependencies++;
2911 ScheduleData *DestBundle = DepDest->FirstInBundle;
2912 if (!DestBundle->IsScheduled) {
2913 BundleMember->incrementUnscheduledDeps(1);
2914 }
2915 if (!DestBundle->hasValidDependencies()) {
2916 WorkList.push_back(DestBundle);
2917 }
2918 }
2919 DepDest = DepDest->NextLoadStore;
2920 
2921 // Example, explaining the loop break condition: Let's assume our
2922 // starting instruction is i0 and MaxMemDepDistance = 3.
2923 //
2924 // +--------v--v--v
2925 // i0,i1,i2,i3,i4,i5,i6,i7,i8
2926 // +--------^--^--^
2927 //
2928 // MaxMemDepDistance lets us stop alias-checking at i3 and we add
2929 // dependencies from i0 to i3,i4,.. (even if they are not aliased).
2930 // Previously we already added dependencies from i3 to i6,i7,i8
2931 // (because of MaxMemDepDistance). As we added a dependency from
2932 // i0 to i3, we have transitive dependencies from i0 to i6,i7,i8
2933 // and we can abort this loop at i6.
2934 if (DistToSrc >= 2 * MaxMemDepDistance)
2935 break;
2936 DistToSrc++;
2937 }
2938 }
2939 }
2940 BundleMember = BundleMember->NextInBundle;
2941 }
2942 if (InsertInReadyList && SD->isReady()) {
2943 ReadyInsts.push_back(SD);
2944 DEBUG(dbgs() << "SLP: gets ready on update: " << *SD->Inst << "\n");
2945 }
2946 }
2947 }
2948 
2949 void BoUpSLP::BlockScheduling::resetSchedule() {
2950 assert(ScheduleStart &&
2951 "tried to reset schedule on block which has not been scheduled");
2952 for (Instruction *I = ScheduleStart; I != ScheduleEnd; I = I->getNextNode()) {
2953 ScheduleData *SD = getScheduleData(I);
2954 assert(isInSchedulingRegion(SD));
2955 SD->IsScheduled = false;
2956 SD->resetUnscheduledDeps();
2957 }
2958 ReadyInsts.clear();
2959 }
2960 
2961 void BoUpSLP::scheduleBlock(BlockScheduling *BS) {
2962 
2963 if (!BS->ScheduleStart)
2964 return;
2965 
2966 DEBUG(dbgs() << "SLP: schedule block " << BS->BB->getName() << "\n");
2967 
2968 BS->resetSchedule();
2969 
2970 // For the real scheduling we use a more sophisticated ready-list: it is
2971 // sorted by the original instruction location. This lets the final schedule
2972 // be as close as possible to the original instruction order.
2973 struct ScheduleDataCompare {
2974 bool operator()(ScheduleData *SD1, ScheduleData *SD2) {
2975 return SD2->SchedulingPriority < SD1->SchedulingPriority;
2976 }
2977 };
2978 std::set<ScheduleData *, ScheduleDataCompare> ReadyInsts;
2979 
2980 // Ensure that all dependency data is updated and fill the ready-list with
2981 // initial instructions.
2982 int Idx = 0;
2983 int NumToSchedule = 0;
2984 for (auto *I = BS->ScheduleStart; I != BS->ScheduleEnd;
2985 I = I->getNextNode()) {
2986 ScheduleData *SD = BS->getScheduleData(I);
2987 assert(
2988 SD->isPartOfBundle() == (ScalarToTreeEntry.count(SD->Inst) != 0) &&
2989 "scheduler and vectorizer have different opinion on what is a bundle");
2990 SD->FirstInBundle->SchedulingPriority = Idx++;
2991 if (SD->isSchedulingEntity()) {
2992 BS->calculateDependencies(SD, false, this);
2993 NumToSchedule++;
2994 }
2995 }
2996 BS->initialFillReadyList(ReadyInsts);
2997 
2998 Instruction *LastScheduledInst = BS->ScheduleEnd;
2999 
3000 // Do the "real" scheduling.
3001 while (!ReadyInsts.empty()) {
3002 ScheduleData *picked = *ReadyInsts.begin();
3003 ReadyInsts.erase(ReadyInsts.begin());
3004 
3005 // Move the scheduled instruction(s) to their dedicated places, if not
3006 // there yet.
3007 ScheduleData *BundleMember = picked;
3008 while (BundleMember) {
3009 Instruction *pickedInst = BundleMember->Inst;
3010 if (LastScheduledInst->getNextNode() != pickedInst) {
3011 BS->BB->getInstList().remove(pickedInst);
3012 BS->BB->getInstList().insert(LastScheduledInst, pickedInst);
3013 }
3014 LastScheduledInst = pickedInst;
3015 BundleMember = BundleMember->NextInBundle;
3016 }
3017 
3018 BS->schedule(picked, ReadyInsts);
3019 NumToSchedule--;
3020 }
3021 assert(NumToSchedule == 0 && "could not schedule all instructions");
3022 
3023 // Avoid duplicate scheduling of the block.
3024 BS->ScheduleStart = nullptr;
3025 }
3026 
3027 /// The SLPVectorizer Pass.
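///
/// A minimal usage sketch (assuming the legacy pass manager; not taken from
/// this file): clients normally create the pass through
/// createSLPVectorizerPass(), declared in llvm/Transforms/Vectorize.h, e.g.
///   legacy::PassManagerBase &PM = ...;
///   PM.add(createSLPVectorizerPass());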
3028 struct SLPVectorizer : public FunctionPass { 3029 typedef SmallVector<StoreInst *, 8> StoreList; 3030 typedef MapVector<Value *, StoreList> StoreListMap; 3031 3032 /// Pass identification, replacement for typeid 3033 static char ID; 3034 3035 explicit SLPVectorizer() : FunctionPass(ID) { 3036 initializeSLPVectorizerPass(*PassRegistry::getPassRegistry()); 3037 } 3038 3039 ScalarEvolution *SE; 3040 const DataLayout *DL; 3041 TargetTransformInfo *TTI; 3042 TargetLibraryInfo *TLI; 3043 AliasAnalysis *AA; 3044 LoopInfo *LI; 3045 DominatorTree *DT; 3046 AssumptionCache *AC; 3047 3048 bool runOnFunction(Function &F) override { 3049 if (skipOptnoneFunction(F)) 3050 return false; 3051 3052 SE = &getAnalysis<ScalarEvolution>(); 3053 DataLayoutPass *DLP = getAnalysisIfAvailable<DataLayoutPass>(); 3054 DL = DLP ? &DLP->getDataLayout() : nullptr; 3055 TTI = &getAnalysis<TargetTransformInfoWrapperPass>().getTTI(F); 3056 auto *TLIP = getAnalysisIfAvailable<TargetLibraryInfoWrapperPass>(); 3057 TLI = TLIP ? &TLIP->getTLI() : nullptr; 3058 AA = &getAnalysis<AliasAnalysis>(); 3059 LI = &getAnalysis<LoopInfoWrapperPass>().getLoopInfo(); 3060 DT = &getAnalysis<DominatorTreeWrapperPass>().getDomTree(); 3061 AC = &getAnalysis<AssumptionCacheTracker>().getAssumptionCache(F); 3062 3063 StoreRefs.clear(); 3064 bool Changed = false; 3065 3066 // If the target claims to have no vector registers don't attempt 3067 // vectorization. 3068 if (!TTI->getNumberOfRegisters(true)) 3069 return false; 3070 3071 // Must have DataLayout. We can't require it because some tests run w/o 3072 // triple. 3073 if (!DL) 3074 return false; 3075 3076 // Don't vectorize when the attribute NoImplicitFloat is used. 3077 if (F.hasFnAttribute(Attribute::NoImplicitFloat)) 3078 return false; 3079 3080 DEBUG(dbgs() << "SLP: Analyzing blocks in " << F.getName() << ".\n"); 3081 3082 // Use the bottom up slp vectorizer to construct chains that start with 3083 // store instructions. 3084 BoUpSLP R(&F, SE, DL, TTI, TLI, AA, LI, DT, AC); 3085 3086 // A general note: the vectorizer must use BoUpSLP::eraseInstruction() to 3087 // delete instructions. 3088 3089 // Scan the blocks in the function in post order. 3090 for (po_iterator<BasicBlock*> it = po_begin(&F.getEntryBlock()), 3091 e = po_end(&F.getEntryBlock()); it != e; ++it) { 3092 BasicBlock *BB = *it; 3093 // Vectorize trees that end at stores. 3094 if (unsigned count = collectStores(BB, R)) { 3095 (void)count; 3096 DEBUG(dbgs() << "SLP: Found " << count << " stores to vectorize.\n"); 3097 Changed |= vectorizeStoreChains(R); 3098 } 3099 3100 // Vectorize trees that end at reductions. 3101 Changed |= vectorizeChainsInBlock(BB, R); 3102 } 3103 3104 if (Changed) { 3105 R.optimizeGatherSequence(); 3106 DEBUG(dbgs() << "SLP: vectorized \"" << F.getName() << "\"\n"); 3107 DEBUG(verifyFunction(F)); 3108 } 3109 return Changed; 3110 } 3111 3112 void getAnalysisUsage(AnalysisUsage &AU) const override { 3113 FunctionPass::getAnalysisUsage(AU); 3114 AU.addRequired<AssumptionCacheTracker>(); 3115 AU.addRequired<ScalarEvolution>(); 3116 AU.addRequired<AliasAnalysis>(); 3117 AU.addRequired<TargetTransformInfoWrapperPass>(); 3118 AU.addRequired<LoopInfoWrapperPass>(); 3119 AU.addRequired<DominatorTreeWrapperPass>(); 3120 AU.addPreserved<LoopInfoWrapperPass>(); 3121 AU.addPreserved<DominatorTreeWrapperPass>(); 3122 AU.setPreservesCFG(); 3123 } 3124 3125 private: 3126 3127 /// \brief Collect memory references and sort them according to their base 3128 /// object. 
We sort the stores to their base objects to reduce the cost of the 3129 /// quadratic search on the stores. TODO: We can further reduce this cost 3130 /// if we flush the chain creation every time we run into a memory barrier. 3131 unsigned collectStores(BasicBlock *BB, BoUpSLP &R); 3132 3133 /// \brief Try to vectorize a chain that starts at two arithmetic instrs. 3134 bool tryToVectorizePair(Value *A, Value *B, BoUpSLP &R); 3135 3136 /// \brief Try to vectorize a list of operands. 3137 /// \@param BuildVector A list of users to ignore for the purpose of 3138 /// scheduling and that don't need extracting. 3139 /// \returns true if a value was vectorized. 3140 bool tryToVectorizeList(ArrayRef<Value *> VL, BoUpSLP &R, 3141 ArrayRef<Value *> BuildVector = None, 3142 bool allowReorder = false); 3143 3144 /// \brief Try to vectorize a chain that may start at the operands of \V; 3145 bool tryToVectorize(BinaryOperator *V, BoUpSLP &R); 3146 3147 /// \brief Vectorize the stores that were collected in StoreRefs. 3148 bool vectorizeStoreChains(BoUpSLP &R); 3149 3150 /// \brief Scan the basic block and look for patterns that are likely to start 3151 /// a vectorization chain. 3152 bool vectorizeChainsInBlock(BasicBlock *BB, BoUpSLP &R); 3153 3154 bool vectorizeStoreChain(ArrayRef<Value *> Chain, int CostThreshold, 3155 BoUpSLP &R); 3156 3157 bool vectorizeStores(ArrayRef<StoreInst *> Stores, int costThreshold, 3158 BoUpSLP &R); 3159 private: 3160 StoreListMap StoreRefs; 3161 }; 3162 3163 /// \brief Check that the Values in the slice in VL array are still existent in 3164 /// the WeakVH array. 3165 /// Vectorization of part of the VL array may cause later values in the VL array 3166 /// to become invalid. We track when this has happened in the WeakVH array. 3167 static bool hasValueBeenRAUWed(ArrayRef<Value *> &VL, 3168 SmallVectorImpl<WeakVH> &VH, 3169 unsigned SliceBegin, 3170 unsigned SliceSize) { 3171 for (unsigned i = SliceBegin; i < SliceBegin + SliceSize; ++i) 3172 if (VH[i] != VL[i]) 3173 return true; 3174 3175 return false; 3176 } 3177 3178 bool SLPVectorizer::vectorizeStoreChain(ArrayRef<Value *> Chain, 3179 int CostThreshold, BoUpSLP &R) { 3180 unsigned ChainLen = Chain.size(); 3181 DEBUG(dbgs() << "SLP: Analyzing a store chain of length " << ChainLen 3182 << "\n"); 3183 Type *StoreTy = cast<StoreInst>(Chain[0])->getValueOperand()->getType(); 3184 unsigned Sz = DL->getTypeSizeInBits(StoreTy); 3185 unsigned VF = MinVecRegSize / Sz; 3186 3187 if (!isPowerOf2_32(Sz) || VF < 2) 3188 return false; 3189 3190 // Keep track of values that were deleted by vectorizing in the loop below. 3191 SmallVector<WeakVH, 8> TrackValues(Chain.begin(), Chain.end()); 3192 3193 bool Changed = false; 3194 // Look for profitable vectorizable trees at all offsets, starting at zero. 3195 for (unsigned i = 0, e = ChainLen; i < e; ++i) { 3196 if (i + VF > e) 3197 break; 3198 3199 // Check that a previous iteration of this loop did not delete the Value. 3200 if (hasValueBeenRAUWed(Chain, TrackValues, i, VF)) 3201 continue; 3202 3203 DEBUG(dbgs() << "SLP: Analyzing " << VF << " stores at offset " << i 3204 << "\n"); 3205 ArrayRef<Value *> Operands = Chain.slice(i, VF); 3206 3207 R.buildTree(Operands); 3208 3209 int Cost = R.getTreeCost(); 3210 3211 DEBUG(dbgs() << "SLP: Found cost=" << Cost << " for VF=" << VF << "\n"); 3212 if (Cost < CostThreshold) { 3213 DEBUG(dbgs() << "SLP: Decided to vectorize cost=" << Cost << "\n"); 3214 R.vectorizeTree(); 3215 3216 // Move to the next bundle. 
3217 i += VF - 1; 3218 Changed = true; 3219 } 3220 } 3221 3222 return Changed; 3223 } 3224 3225 bool SLPVectorizer::vectorizeStores(ArrayRef<StoreInst *> Stores, 3226 int costThreshold, BoUpSLP &R) { 3227 SetVector<Value *> Heads, Tails; 3228 SmallDenseMap<Value *, Value *> ConsecutiveChain; 3229 3230 // We may run into multiple chains that merge into a single chain. We mark the 3231 // stores that we vectorized so that we don't visit the same store twice. 3232 BoUpSLP::ValueSet VectorizedStores; 3233 bool Changed = false; 3234 3235 // Do a quadratic search on all of the given stores and find 3236 // all of the pairs of stores that follow each other. 3237 for (unsigned i = 0, e = Stores.size(); i < e; ++i) { 3238 for (unsigned j = 0; j < e; ++j) { 3239 if (i == j) 3240 continue; 3241 3242 if (R.isConsecutiveAccess(Stores[i], Stores[j])) { 3243 Tails.insert(Stores[j]); 3244 Heads.insert(Stores[i]); 3245 ConsecutiveChain[Stores[i]] = Stores[j]; 3246 } 3247 } 3248 } 3249 3250 // For stores that start but don't end a link in the chain: 3251 for (SetVector<Value *>::iterator it = Heads.begin(), e = Heads.end(); 3252 it != e; ++it) { 3253 if (Tails.count(*it)) 3254 continue; 3255 3256 // We found a store instr that starts a chain. Now follow the chain and try 3257 // to vectorize it. 3258 BoUpSLP::ValueList Operands; 3259 Value *I = *it; 3260 // Collect the chain into a list. 3261 while (Tails.count(I) || Heads.count(I)) { 3262 if (VectorizedStores.count(I)) 3263 break; 3264 Operands.push_back(I); 3265 // Move to the next value in the chain. 3266 I = ConsecutiveChain[I]; 3267 } 3268 3269 bool Vectorized = vectorizeStoreChain(Operands, costThreshold, R); 3270 3271 // Mark the vectorized stores so that we don't vectorize them again. 3272 if (Vectorized) 3273 VectorizedStores.insert(Operands.begin(), Operands.end()); 3274 Changed |= Vectorized; 3275 } 3276 3277 return Changed; 3278 } 3279 3280 3281 unsigned SLPVectorizer::collectStores(BasicBlock *BB, BoUpSLP &R) { 3282 unsigned count = 0; 3283 StoreRefs.clear(); 3284 for (BasicBlock::iterator it = BB->begin(), e = BB->end(); it != e; ++it) { 3285 StoreInst *SI = dyn_cast<StoreInst>(it); 3286 if (!SI) 3287 continue; 3288 3289 // Don't touch volatile stores. 3290 if (!SI->isSimple()) 3291 continue; 3292 3293 // Check that the pointer points to scalars. 3294 Type *Ty = SI->getValueOperand()->getType(); 3295 if (Ty->isAggregateType() || Ty->isVectorTy()) 3296 continue; 3297 3298 // Find the base pointer. 3299 Value *Ptr = GetUnderlyingObject(SI->getPointerOperand(), DL); 3300 3301 // Save the store locations. 3302 StoreRefs[Ptr].push_back(SI); 3303 count++; 3304 } 3305 return count; 3306 } 3307 3308 bool SLPVectorizer::tryToVectorizePair(Value *A, Value *B, BoUpSLP &R) { 3309 if (!A || !B) 3310 return false; 3311 Value *VL[] = { A, B }; 3312 return tryToVectorizeList(VL, R, None, true); 3313 } 3314 3315 bool SLPVectorizer::tryToVectorizeList(ArrayRef<Value *> VL, BoUpSLP &R, 3316 ArrayRef<Value *> BuildVector, 3317 bool allowReorder) { 3318 if (VL.size() < 2) 3319 return false; 3320 3321 DEBUG(dbgs() << "SLP: Vectorizing a list of length = " << VL.size() << ".\n"); 3322 3323 // Check that all of the parts are scalar instructions of the same type. 
  Instruction *I0 = dyn_cast<Instruction>(VL[0]);
  if (!I0)
    return false;

  unsigned Opcode0 = I0->getOpcode();

  Type *Ty0 = I0->getType();
  unsigned Sz = DL->getTypeSizeInBits(Ty0);
  unsigned VF = MinVecRegSize / Sz;

  for (int i = 0, e = VL.size(); i < e; ++i) {
    Type *Ty = VL[i]->getType();
    if (Ty->isAggregateType() || Ty->isVectorTy())
      return false;
    Instruction *Inst = dyn_cast<Instruction>(VL[i]);
    if (!Inst || Inst->getOpcode() != Opcode0)
      return false;
  }

  bool Changed = false;

  // Keep track of values that were deleted by vectorizing in the loop below.
  SmallVector<WeakVH, 8> TrackValues(VL.begin(), VL.end());

  for (unsigned i = 0, e = VL.size(); i < e; ++i) {
    unsigned OpsWidth = 0;

    if (i + VF > e)
      OpsWidth = e - i;
    else
      OpsWidth = VF;

    if (!isPowerOf2_32(OpsWidth) || OpsWidth < 2)
      break;

    // Check that a previous iteration of this loop did not delete the Value.
    if (hasValueBeenRAUWed(VL, TrackValues, i, OpsWidth))
      continue;

    DEBUG(dbgs() << "SLP: Analyzing " << OpsWidth << " operations "
                 << "\n");
    ArrayRef<Value *> Ops = VL.slice(i, OpsWidth);

    ArrayRef<Value *> BuildVectorSlice;
    if (!BuildVector.empty())
      BuildVectorSlice = BuildVector.slice(i, OpsWidth);

    R.buildTree(Ops, BuildVectorSlice);
    // TODO: check if we can allow reordering also for other cases than
    // tryToVectorizePair().
    if (allowReorder && R.shouldReorder()) {
      assert(Ops.size() == 2);
      assert(BuildVectorSlice.empty());
      Value *ReorderedOps[] = { Ops[1], Ops[0] };
      R.buildTree(ReorderedOps, None);
    }
    int Cost = R.getTreeCost();

    if (Cost < -SLPCostThreshold) {
      DEBUG(dbgs() << "SLP: Vectorizing list at cost:" << Cost << ".\n");
      Value *VectorizedRoot = R.vectorizeTree();

      // Reconstruct the build vector by extracting the vectorized root. This
      // way we handle the case where some elements of the vector are
      // undefined.
      //  (return (insertelt <4 x i32> (insertelt undef (opd0) 0) (opd1) 2))
      if (!BuildVectorSlice.empty()) {
        // The insert point is the last build vector instruction. The
        // vectorized root will precede it. This guarantees that we get an
        // instruction. The vectorized tree could have been constant folded.
        Instruction *InsertAfter = cast<Instruction>(BuildVectorSlice.back());
        unsigned VecIdx = 0;
        for (auto &V : BuildVectorSlice) {
          IRBuilder<true, NoFolder> Builder(
              ++BasicBlock::iterator(InsertAfter));
          InsertElementInst *IE = cast<InsertElementInst>(V);
          Instruction *Extract = cast<Instruction>(Builder.CreateExtractElement(
              VectorizedRoot, Builder.getInt32(VecIdx++)));
          IE->setOperand(1, Extract);
          IE->removeFromParent();
          IE->insertAfter(Extract);
          InsertAfter = IE;
        }
      }
      // Move to the next bundle.
      i += VF - 1;
      Changed = true;
    }
  }

  return Changed;
}

bool SLPVectorizer::tryToVectorize(BinaryOperator *V, BoUpSLP &R) {
  if (!V)
    return false;

  // Try to vectorize V.
  if (tryToVectorizePair(V->getOperand(0), V->getOperand(1), R))
    return true;

  BinaryOperator *A = dyn_cast<BinaryOperator>(V->getOperand(0));
  BinaryOperator *B = dyn_cast<BinaryOperator>(V->getOperand(1));
  // Try to skip B.
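  // If B is a single-use binary operator, look through it and try to pair A
  // with one of B's operands instead; e.g. for A + (B0 + B1) we also try the
  // pairs (A, B0) and (A, B1).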
  if (B && B->hasOneUse()) {
    BinaryOperator *B0 = dyn_cast<BinaryOperator>(B->getOperand(0));
    BinaryOperator *B1 = dyn_cast<BinaryOperator>(B->getOperand(1));
    if (tryToVectorizePair(A, B0, R)) {
      return true;
    }
    if (tryToVectorizePair(A, B1, R)) {
      return true;
    }
  }

  // Try to skip A.
  if (A && A->hasOneUse()) {
    BinaryOperator *A0 = dyn_cast<BinaryOperator>(A->getOperand(0));
    BinaryOperator *A1 = dyn_cast<BinaryOperator>(A->getOperand(1));
    if (tryToVectorizePair(A0, B, R)) {
      return true;
    }
    if (tryToVectorizePair(A1, B, R)) {
      return true;
    }
  }
  return false;
}

/// \brief Generate a shuffle mask to be used in a reduction tree.
///
/// \param VecLen The length of the vector to be reduced.
/// \param NumEltsToRdx The number of elements that should be reduced in the
///        vector.
/// \param IsPairwise Whether the reduction is a pairwise or splitting
///        reduction. A pairwise reduction will generate a mask of
///        <0,2,...> or <1,3,..> while a splitting reduction will generate
///        <2,3, undef,undef> for a vector of 4 and NumElts = 2.
/// \param IsLeft True will generate a mask of even elements, odd otherwise.
static Value *createRdxShuffleMask(unsigned VecLen, unsigned NumEltsToRdx,
                                   bool IsPairwise, bool IsLeft,
                                   IRBuilder<> &Builder) {
  assert((IsPairwise || !IsLeft) && "Don't support a <0,1,undef,...> mask");

  SmallVector<Constant *, 32> ShuffleMask(
      VecLen, UndefValue::get(Builder.getInt32Ty()));

  if (IsPairwise)
    // Build a mask of 0, 2, ... (left) or 1, 3, ... (right).
    for (unsigned i = 0; i != NumEltsToRdx; ++i)
      ShuffleMask[i] = Builder.getInt32(2 * i + !IsLeft);
  else
    // Move the upper half of the vector to the lower half.
    for (unsigned i = 0; i != NumEltsToRdx; ++i)
      ShuffleMask[i] = Builder.getInt32(NumEltsToRdx + i);

  return ConstantVector::get(ShuffleMask);
}

/// Model horizontal reductions.
///
/// A horizontal reduction is a tree of reduction operations (currently add and
/// fadd) that has operations that can be put into a vector as its leaves.
/// For example, this tree:
///
/// mul mul mul mul
///  \  /    \  /
///   +       +
///    \     /
///       +
/// This tree has "mul" as its reduced values and "+" as its reduction
/// operations. A reduction might be feeding into a store or a binary operation
/// feeding a phi.
///    ...
///    \  /
///     +
///     |
///  phi +=
///
///  Or:
///    ...
///    \  /
///     +
///     |
///   *p =
///
class HorizontalReduction {
  SmallVector<Value *, 16> ReductionOps;
  SmallVector<Value *, 32> ReducedVals;

  BinaryOperator *ReductionRoot;
  PHINode *ReductionPHI;

  /// The opcode of the reduction.
  unsigned ReductionOpcode;
  /// The opcode of the values we perform a reduction on.
  unsigned ReducedValueOpcode;
  /// The width of one full horizontal reduction operation.
  unsigned ReduxWidth;
  /// Should we model this reduction as a pairwise reduction tree or a tree
  /// that splits the vector in halves and adds those halves.
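  /// For example, for a <4 x i32> value a pairwise step combines the lanes
  /// selected by mask <0,2,...> with those selected by <1,3,...>, while a
  /// splitting step adds the upper half (lanes 2,3) onto the lower half
  /// (lanes 0,1).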
  bool IsPairwiseReduction;

public:
  HorizontalReduction()
      : ReductionRoot(nullptr), ReductionPHI(nullptr), ReductionOpcode(0),
        ReducedValueOpcode(0), ReduxWidth(0), IsPairwiseReduction(false) {}

  /// \brief Try to find a reduction tree.
  bool matchAssociativeReduction(PHINode *Phi, BinaryOperator *B,
                                 const DataLayout *DL) {
    assert((!Phi ||
            std::find(Phi->op_begin(), Phi->op_end(), B) != Phi->op_end()) &&
           "This phi needs to use the binary operator");

    // We could have an initial reduction that is not an add.
    //   r *= v1 + v2 + v3 + v4
    // In such a case start looking for a tree rooted in the first '+'.
    if (Phi) {
      if (B->getOperand(0) == Phi) {
        Phi = nullptr;
        B = dyn_cast<BinaryOperator>(B->getOperand(1));
      } else if (B->getOperand(1) == Phi) {
        Phi = nullptr;
        B = dyn_cast<BinaryOperator>(B->getOperand(0));
      }
    }

    if (!B)
      return false;

    Type *Ty = B->getType();
    if (Ty->isVectorTy())
      return false;

    ReductionOpcode = B->getOpcode();
    ReducedValueOpcode = 0;
    ReduxWidth = MinVecRegSize / DL->getTypeSizeInBits(Ty);
    ReductionRoot = B;
    ReductionPHI = Phi;

    if (ReduxWidth < 4)
      return false;

    // We currently only support adds.
    if (ReductionOpcode != Instruction::Add &&
        ReductionOpcode != Instruction::FAdd)
      return false;

    // Post-order traverse the reduction tree starting at B. We only handle
    // true trees containing only binary operators.
    SmallVector<std::pair<BinaryOperator *, unsigned>, 32> Stack;
    Stack.push_back(std::make_pair(B, 0));
    while (!Stack.empty()) {
      BinaryOperator *TreeN = Stack.back().first;
      unsigned EdgeToVisit = Stack.back().second++;
      bool IsReducedValue = TreeN->getOpcode() != ReductionOpcode;

      // Only handle trees in the current basic block.
      if (TreeN->getParent() != B->getParent())
        return false;

      // Each tree node needs to have one user except for the ultimate
      // reduction.
      if (!TreeN->hasOneUse() && TreeN != B)
        return false;

      // Postorder visit.
      if (EdgeToVisit == 2 || IsReducedValue) {
        if (IsReducedValue) {
          // Make sure that the opcodes of the operations that we are going to
          // reduce match.
          if (!ReducedValueOpcode)
            ReducedValueOpcode = TreeN->getOpcode();
          else if (ReducedValueOpcode != TreeN->getOpcode())
            return false;
          ReducedVals.push_back(TreeN);
        } else {
          // We need to be able to reassociate the adds.
          if (!TreeN->isAssociative())
            return false;
          ReductionOps.push_back(TreeN);
        }
        // Retract.
        Stack.pop_back();
        continue;
      }

      // Visit left or right.
      Value *NextV = TreeN->getOperand(EdgeToVisit);
      BinaryOperator *Next = dyn_cast<BinaryOperator>(NextV);
      if (Next)
        Stack.push_back(std::make_pair(Next, 0));
      else if (NextV != Phi)
        return false;
    }
    return true;
  }

  /// \brief Attempt to vectorize the tree found by
  /// matchAssociativeReduction.
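  /// The reduced values are vectorized in bundles of ReduxWidth; each bundle
  /// is collapsed to a scalar via emitReduction(), and any leftover scalar
  /// values are folded in with ordinary binary operations afterwards.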
  bool tryToReduce(BoUpSLP &V, TargetTransformInfo *TTI) {
    if (ReducedVals.empty())
      return false;

    unsigned NumReducedVals = ReducedVals.size();
    if (NumReducedVals < ReduxWidth)
      return false;

    Value *VectorizedTree = nullptr;
    IRBuilder<> Builder(ReductionRoot);
    FastMathFlags Unsafe;
    Unsafe.setUnsafeAlgebra();
    Builder.SetFastMathFlags(Unsafe);
    unsigned i = 0;

    for (; i < NumReducedVals - ReduxWidth + 1; i += ReduxWidth) {
      V.buildTree(makeArrayRef(&ReducedVals[i], ReduxWidth), ReductionOps);

      // Estimate cost.
      int Cost = V.getTreeCost() + getReductionCost(TTI, ReducedVals[i]);
      if (Cost >= -SLPCostThreshold)
        break;

      DEBUG(dbgs() << "SLP: Vectorizing horizontal reduction at cost:" << Cost
                   << ". (HorRdx)\n");

      // Vectorize a tree.
      DebugLoc Loc = cast<Instruction>(ReducedVals[i])->getDebugLoc();
      Value *VectorizedRoot = V.vectorizeTree();

      // Emit a reduction.
      Value *ReducedSubTree = emitReduction(VectorizedRoot, Builder);
      if (VectorizedTree) {
        Builder.SetCurrentDebugLocation(Loc);
        VectorizedTree = createBinOp(Builder, ReductionOpcode, VectorizedTree,
                                     ReducedSubTree, "bin.rdx");
      } else
        VectorizedTree = ReducedSubTree;
    }

    if (VectorizedTree) {
      // Finish the reduction.
      for (; i < NumReducedVals; ++i) {
        Builder.SetCurrentDebugLocation(
            cast<Instruction>(ReducedVals[i])->getDebugLoc());
        VectorizedTree = createBinOp(Builder, ReductionOpcode, VectorizedTree,
                                     ReducedVals[i]);
      }
      // Update users.
      if (ReductionPHI) {
        assert(ReductionRoot && "Need a reduction operation");
        ReductionRoot->setOperand(0, VectorizedTree);
        ReductionRoot->setOperand(1, ReductionPHI);
      } else
        ReductionRoot->replaceAllUsesWith(VectorizedTree);
    }
    return VectorizedTree != nullptr;
  }

private:

  /// \brief Calculate the cost of a reduction.
  int getReductionCost(TargetTransformInfo *TTI, Value *FirstReducedVal) {
    Type *ScalarTy = FirstReducedVal->getType();
    Type *VecTy = VectorType::get(ScalarTy, ReduxWidth);

    int PairwiseRdxCost = TTI->getReductionCost(ReductionOpcode, VecTy, true);
    int SplittingRdxCost = TTI->getReductionCost(ReductionOpcode, VecTy, false);

    IsPairwiseReduction = PairwiseRdxCost < SplittingRdxCost;
    int VecReduxCost = IsPairwiseReduction ? PairwiseRdxCost : SplittingRdxCost;

    int ScalarReduxCost =
        ReduxWidth * TTI->getArithmeticInstrCost(ReductionOpcode, VecTy);

    DEBUG(dbgs() << "SLP: Adding cost " << VecReduxCost - ScalarReduxCost
                 << " for reduction that starts with " << *FirstReducedVal
                 << " (It is a "
                 << (IsPairwiseReduction ? "pairwise" : "splitting")
                 << " reduction)\n");

    return VecReduxCost - ScalarReduxCost;
  }

  static Value *createBinOp(IRBuilder<> &Builder, unsigned Opcode, Value *L,
                            Value *R, const Twine &Name = "") {
    if (Opcode == Instruction::FAdd)
      return Builder.CreateFAdd(L, R, Name);
    return Builder.CreateBinOp((Instruction::BinaryOps)Opcode, L, R, Name);
  }

  /// \brief Emit a horizontal reduction of the vectorized value.
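  /// For example, a splitting reduction of a <4 x i32> value %v is emitted
  /// roughly as:
  ///   %rdx.shuf  = shufflevector <4 x i32> %v, <4 x i32> undef,
  ///                              <4 x i32> <i32 2, i32 3, i32 undef, i32 undef>
  ///   %bin.rdx   = add <4 x i32> %v, %rdx.shuf
  ///   %rdx.shuf1 = shufflevector <4 x i32> %bin.rdx, <4 x i32> undef,
  ///                              <4 x i32> <i32 1, i32 undef, i32 undef, i32 undef>
  ///   %bin.rdx1  = add <4 x i32> %bin.rdx, %rdx.shuf1
  ///   %res       = extractelement <4 x i32> %bin.rdx1, i32 0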
  Value *emitReduction(Value *VectorizedValue, IRBuilder<> &Builder) {
    assert(VectorizedValue && "Need to have a vectorized tree node");
    assert(isPowerOf2_32(ReduxWidth) &&
           "We only handle power-of-two reductions for now");

    Value *TmpVec = VectorizedValue;
    for (unsigned i = ReduxWidth / 2; i != 0; i >>= 1) {
      if (IsPairwiseReduction) {
        Value *LeftMask =
            createRdxShuffleMask(ReduxWidth, i, true, true, Builder);
        Value *RightMask =
            createRdxShuffleMask(ReduxWidth, i, true, false, Builder);

        Value *LeftShuf = Builder.CreateShuffleVector(
            TmpVec, UndefValue::get(TmpVec->getType()), LeftMask, "rdx.shuf.l");
        Value *RightShuf = Builder.CreateShuffleVector(
            TmpVec, UndefValue::get(TmpVec->getType()), RightMask,
            "rdx.shuf.r");
        TmpVec = createBinOp(Builder, ReductionOpcode, LeftShuf, RightShuf,
                             "bin.rdx");
      } else {
        Value *UpperHalf =
            createRdxShuffleMask(ReduxWidth, i, false, false, Builder);
        Value *Shuf = Builder.CreateShuffleVector(
            TmpVec, UndefValue::get(TmpVec->getType()), UpperHalf, "rdx.shuf");
        TmpVec = createBinOp(Builder, ReductionOpcode, TmpVec, Shuf, "bin.rdx");
      }
    }

    // The result is in the first element of the vector.
    return Builder.CreateExtractElement(TmpVec, Builder.getInt32(0));
  }
};

/// \brief Recognize construction of vectors like
///  %ra = insertelement <4 x float> undef, float %s0, i32 0
///  %rb = insertelement <4 x float> %ra, float %s1, i32 1
///  %rc = insertelement <4 x float> %rb, float %s2, i32 2
///  %rd = insertelement <4 x float> %rc, float %s3, i32 3
///
/// Returns true if it matches.
///
static bool findBuildVector(InsertElementInst *FirstInsertElem,
                            SmallVectorImpl<Value *> &BuildVector,
                            SmallVectorImpl<Value *> &BuildVectorOpds) {
  if (!isa<UndefValue>(FirstInsertElem->getOperand(0)))
    return false;

  InsertElementInst *IE = FirstInsertElem;
  while (true) {
    BuildVector.push_back(IE);
    BuildVectorOpds.push_back(IE->getOperand(1));

    if (IE->use_empty())
      return false;

    InsertElementInst *NextUse = dyn_cast<InsertElementInst>(IE->user_back());
    if (!NextUse)
      return true;

    // If this isn't the final use, make sure the next insertelement is the
    // only use. It's OK if the final constructed vector is used multiple
    // times.
    if (!IE->hasOneUse())
      return false;

    IE = NextUse;
  }

  return false;
}

static bool PhiTypeSorterFunc(Value *V, Value *V2) {
  return V->getType() < V2->getType();
}

bool SLPVectorizer::vectorizeChainsInBlock(BasicBlock *BB, BoUpSLP &R) {
  bool Changed = false;
  SmallVector<Value *, 4> Incoming;
  SmallSet<Value *, 16> VisitedInstrs;

  bool HaveVectorizedPhiNodes = true;
  while (HaveVectorizedPhiNodes) {
    HaveVectorizedPhiNodes = false;

    // Collect the incoming values from the PHIs.
    Incoming.clear();
    for (BasicBlock::iterator instr = BB->begin(), ie = BB->end(); instr != ie;
         ++instr) {
      PHINode *P = dyn_cast<PHINode>(instr);
      if (!P)
        break;

      if (!VisitedInstrs.count(P))
        Incoming.push_back(P);
    }

    // Sort by type.
    std::stable_sort(Incoming.begin(), Incoming.end(), PhiTypeSorterFunc);

    // Try to vectorize elements based on their type.
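    // Each maximal run of incoming values with the same type forms one
    // candidate list; e.g. four i32 incoming values are offered to
    // tryToVectorizeList() as one bundle, and a following run of float values
    // is tried separately.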
    for (SmallVector<Value *, 4>::iterator IncIt = Incoming.begin(),
                                           E = Incoming.end();
         IncIt != E;) {

      // Look for the next elements with the same type.
      SmallVector<Value *, 4>::iterator SameTypeIt = IncIt;
      while (SameTypeIt != E &&
             (*SameTypeIt)->getType() == (*IncIt)->getType()) {
        VisitedInstrs.insert(*SameTypeIt);
        ++SameTypeIt;
      }

      // Try to vectorize them.
      unsigned NumElts = (SameTypeIt - IncIt);
      DEBUG(errs() << "SLP: Trying to vectorize starting at PHIs (" << NumElts
                   << ")\n");
      if (NumElts > 1 && tryToVectorizeList(makeArrayRef(IncIt, NumElts), R)) {
        // Success: start over because instructions might have been changed.
        HaveVectorizedPhiNodes = true;
        Changed = true;
        break;
      }

      // Start over at the next instruction of a different type (or the end).
      IncIt = SameTypeIt;
    }
  }

  VisitedInstrs.clear();

  for (BasicBlock::iterator it = BB->begin(), e = BB->end(); it != e; it++) {
    // We may go through BB multiple times so skip the ones we have checked.
    if (!VisitedInstrs.insert(it).second)
      continue;

    if (isa<DbgInfoIntrinsic>(it))
      continue;

    // Try to vectorize reductions that use PHINodes.
    if (PHINode *P = dyn_cast<PHINode>(it)) {
      // Check that the PHI is a reduction PHI.
      if (P->getNumIncomingValues() != 2)
        return Changed;
      Value *Rdx =
          (P->getIncomingBlock(0) == BB
               ? (P->getIncomingValue(0))
               : (P->getIncomingBlock(1) == BB ? P->getIncomingValue(1)
                                               : nullptr));
      // Check if this is a Binary Operator.
      BinaryOperator *BI = dyn_cast_or_null<BinaryOperator>(Rdx);
      if (!BI)
        continue;

      // Try to match and vectorize a horizontal reduction.
      HorizontalReduction HorRdx;
      if (ShouldVectorizeHor &&
          HorRdx.matchAssociativeReduction(P, BI, DL) &&
          HorRdx.tryToReduce(R, TTI)) {
        Changed = true;
        it = BB->begin();
        e = BB->end();
        continue;
      }

      Value *Inst = BI->getOperand(0);
      if (Inst == P)
        Inst = BI->getOperand(1);

      if (tryToVectorize(dyn_cast<BinaryOperator>(Inst), R)) {
        // We would like to start over since some instructions are deleted
        // and the iterator may become invalid.
        Changed = true;
        it = BB->begin();
        e = BB->end();
        continue;
      }

      continue;
    }

    // Try to vectorize horizontal reductions feeding into a store.
    if (ShouldStartVectorizeHorAtStore)
      if (StoreInst *SI = dyn_cast<StoreInst>(it))
        if (BinaryOperator *BinOp =
                dyn_cast<BinaryOperator>(SI->getValueOperand())) {
          HorizontalReduction HorRdx;
          if (((HorRdx.matchAssociativeReduction(nullptr, BinOp, DL) &&
                HorRdx.tryToReduce(R, TTI)) ||
               tryToVectorize(BinOp, R))) {
            Changed = true;
            it = BB->begin();
            e = BB->end();
            continue;
          }
        }

    // Try to vectorize horizontal reductions feeding into a return.
    if (ReturnInst *RI = dyn_cast<ReturnInst>(it))
      if (RI->getNumOperands() != 0)
        if (BinaryOperator *BinOp =
                dyn_cast<BinaryOperator>(RI->getOperand(0))) {
          DEBUG(dbgs() << "SLP: Found a return to vectorize.\n");
          if (tryToVectorizePair(BinOp->getOperand(0),
                                 BinOp->getOperand(1), R)) {
            Changed = true;
            it = BB->begin();
            e = BB->end();
            continue;
          }
        }

    // Try to vectorize trees that start at compare instructions.
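    // The compare itself is only used as a seed here: we try to vectorize its
    // two operands as a pair and, failing that, the operands of any binary
    // operator feeding the compare.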
    if (CmpInst *CI = dyn_cast<CmpInst>(it)) {
      if (tryToVectorizePair(CI->getOperand(0), CI->getOperand(1), R)) {
        Changed = true;
        // We would like to start over since some instructions are deleted
        // and the iterator may become invalid.
        it = BB->begin();
        e = BB->end();
        continue;
      }

      for (int i = 0; i < 2; ++i) {
        if (BinaryOperator *BI = dyn_cast<BinaryOperator>(CI->getOperand(i))) {
          if (tryToVectorizePair(BI->getOperand(0), BI->getOperand(1), R)) {
            Changed = true;
            // We would like to start over since some instructions are deleted
            // and the iterator may become invalid.
            it = BB->begin();
            e = BB->end();
            break;
          }
        }
      }
      continue;
    }

    // Try to vectorize trees that start at insertelement instructions.
    if (InsertElementInst *FirstInsertElem = dyn_cast<InsertElementInst>(it)) {
      SmallVector<Value *, 16> BuildVector;
      SmallVector<Value *, 16> BuildVectorOpds;
      if (!findBuildVector(FirstInsertElem, BuildVector, BuildVectorOpds))
        continue;

      // Vectorize starting with the build vector operands, ignoring the
      // BuildVector instructions for the purpose of scheduling and user
      // extraction.
      if (tryToVectorizeList(BuildVectorOpds, R, BuildVector)) {
        Changed = true;
        it = BB->begin();
        e = BB->end();
      }

      continue;
    }
  }

  return Changed;
}

bool SLPVectorizer::vectorizeStoreChains(BoUpSLP &R) {
  bool Changed = false;
  // Attempt to sort and vectorize each of the store-groups.
  for (StoreListMap::iterator it = StoreRefs.begin(), e = StoreRefs.end();
       it != e; ++it) {
    if (it->second.size() < 2)
      continue;

    DEBUG(dbgs() << "SLP: Analyzing a store chain of length "
                 << it->second.size() << ".\n");

    // Process the stores in chunks of 16.
    for (unsigned CI = 0, CE = it->second.size(); CI < CE; CI += 16) {
      unsigned Len = std::min<unsigned>(CE - CI, 16);
      Changed |= vectorizeStores(makeArrayRef(&it->second[CI], Len),
                                 -SLPCostThreshold, R);
    }
  }
  return Changed;
}

} // end anonymous namespace

char SLPVectorizer::ID = 0;
static const char lv_name[] = "SLP Vectorizer";
INITIALIZE_PASS_BEGIN(SLPVectorizer, SV_NAME, lv_name, false, false)
INITIALIZE_AG_DEPENDENCY(AliasAnalysis)
INITIALIZE_PASS_DEPENDENCY(TargetTransformInfoWrapperPass)
INITIALIZE_PASS_DEPENDENCY(AssumptionCacheTracker)
INITIALIZE_PASS_DEPENDENCY(ScalarEvolution)
INITIALIZE_PASS_DEPENDENCY(LoopSimplify)
INITIALIZE_PASS_END(SLPVectorizer, SV_NAME, lv_name, false, false)

namespace llvm {
Pass *createSLPVectorizerPass() { return new SLPVectorizer(); }
}
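
// Illustrative usage only: clients normally schedule this pass as part of the
// standard pass pipeline, but it can also be added to a legacy pass manager
// directly, e.g.
//   legacy::PassManager PM;
//   PM.add(createSLPVectorizerPass());
//   PM.run(M);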