1 //===- SLPVectorizer.cpp - A bottom up SLP Vectorizer ---------------------===// 2 // 3 // The LLVM Compiler Infrastructure 4 // 5 // This file is distributed under the University of Illinois Open Source 6 // License. See LICENSE.TXT for details. 7 // 8 //===----------------------------------------------------------------------===// 9 // This pass implements the Bottom Up SLP vectorizer. It detects consecutive 10 // stores that can be put together into vector-stores. Next, it attempts to 11 // construct vectorizable tree using the use-def chains. If a profitable tree 12 // was found, the SLP vectorizer performs vectorization on the tree. 13 // 14 // The pass is inspired by the work described in the paper: 15 // "Loop-Aware SLP in GCC" by Ira Rosen, Dorit Nuzman, Ayal Zaks. 16 // 17 //===----------------------------------------------------------------------===// 18 #include "llvm/Transforms/Vectorize.h" 19 #include "llvm/ADT/MapVector.h" 20 #include "llvm/ADT/PostOrderIterator.h" 21 #include "llvm/ADT/SetVector.h" 22 #include "llvm/ADT/Statistic.h" 23 #include "llvm/Analysis/AliasAnalysis.h" 24 #include "llvm/Analysis/AssumptionTracker.h" 25 #include "llvm/Analysis/CodeMetrics.h" 26 #include "llvm/Analysis/LoopInfo.h" 27 #include "llvm/Analysis/ScalarEvolution.h" 28 #include "llvm/Analysis/ScalarEvolutionExpressions.h" 29 #include "llvm/Analysis/TargetTransformInfo.h" 30 #include "llvm/Analysis/ValueTracking.h" 31 #include "llvm/IR/DataLayout.h" 32 #include "llvm/IR/Dominators.h" 33 #include "llvm/IR/IRBuilder.h" 34 #include "llvm/IR/Instructions.h" 35 #include "llvm/IR/IntrinsicInst.h" 36 #include "llvm/IR/Module.h" 37 #include "llvm/IR/NoFolder.h" 38 #include "llvm/IR/Type.h" 39 #include "llvm/IR/Value.h" 40 #include "llvm/IR/Verifier.h" 41 #include "llvm/Pass.h" 42 #include "llvm/Support/CommandLine.h" 43 #include "llvm/Support/Debug.h" 44 #include "llvm/Support/raw_ostream.h" 45 #include "llvm/Transforms/Utils/VectorUtils.h" 46 #include <algorithm> 47 #include <map> 48 #include <memory> 49 50 using namespace llvm; 51 52 #define SV_NAME "slp-vectorizer" 53 #define DEBUG_TYPE "SLP" 54 55 STATISTIC(NumVectorInstructions, "Number of vector instructions generated"); 56 57 static cl::opt<int> 58 SLPCostThreshold("slp-threshold", cl::init(0), cl::Hidden, 59 cl::desc("Only vectorize if you gain more than this " 60 "number ")); 61 62 static cl::opt<bool> 63 ShouldVectorizeHor("slp-vectorize-hor", cl::init(false), cl::Hidden, 64 cl::desc("Attempt to vectorize horizontal reductions")); 65 66 static cl::opt<bool> ShouldStartVectorizeHorAtStore( 67 "slp-vectorize-hor-store", cl::init(false), cl::Hidden, 68 cl::desc( 69 "Attempt to vectorize horizontal reductions feeding into a store")); 70 71 namespace { 72 73 static const unsigned MinVecRegSize = 128; 74 75 static const unsigned RecursionMaxDepth = 12; 76 77 /// \returns the parent basic block if all of the instructions in \p VL 78 /// are in the same block or null otherwise. 79 static BasicBlock *getSameBlock(ArrayRef<Value *> VL) { 80 Instruction *I0 = dyn_cast<Instruction>(VL[0]); 81 if (!I0) 82 return nullptr; 83 BasicBlock *BB = I0->getParent(); 84 for (int i = 1, e = VL.size(); i < e; i++) { 85 Instruction *I = dyn_cast<Instruction>(VL[i]); 86 if (!I) 87 return nullptr; 88 89 if (BB != I->getParent()) 90 return nullptr; 91 } 92 return BB; 93 } 94 95 /// \returns True if all of the values in \p VL are constants. 
96 static bool allConstant(ArrayRef<Value *> VL) { 97 for (unsigned i = 0, e = VL.size(); i < e; ++i) 98 if (!isa<Constant>(VL[i])) 99 return false; 100 return true; 101 } 102 103 /// \returns True if all of the values in \p VL are identical. 104 static bool isSplat(ArrayRef<Value *> VL) { 105 for (unsigned i = 1, e = VL.size(); i < e; ++i) 106 if (VL[i] != VL[0]) 107 return false; 108 return true; 109 } 110 111 ///\returns Opcode that can be clubbed with \p Op to create an alternate 112 /// sequence which can later be merged as a ShuffleVector instruction. 113 static unsigned getAltOpcode(unsigned Op) { 114 switch (Op) { 115 case Instruction::FAdd: 116 return Instruction::FSub; 117 case Instruction::FSub: 118 return Instruction::FAdd; 119 case Instruction::Add: 120 return Instruction::Sub; 121 case Instruction::Sub: 122 return Instruction::Add; 123 default: 124 return 0; 125 } 126 } 127 128 ///\returns bool representing if Opcode \p Op can be part 129 /// of an alternate sequence which can later be merged as 130 /// a ShuffleVector instruction. 131 static bool canCombineAsAltInst(unsigned Op) { 132 if (Op == Instruction::FAdd || Op == Instruction::FSub || 133 Op == Instruction::Sub || Op == Instruction::Add) 134 return true; 135 return false; 136 } 137 138 /// \returns ShuffleVector instruction if intructions in \p VL have 139 /// alternate fadd,fsub / fsub,fadd/add,sub/sub,add sequence. 140 /// (i.e. e.g. opcodes of fadd,fsub,fadd,fsub...) 141 static unsigned isAltInst(ArrayRef<Value *> VL) { 142 Instruction *I0 = dyn_cast<Instruction>(VL[0]); 143 unsigned Opcode = I0->getOpcode(); 144 unsigned AltOpcode = getAltOpcode(Opcode); 145 for (int i = 1, e = VL.size(); i < e; i++) { 146 Instruction *I = dyn_cast<Instruction>(VL[i]); 147 if (!I || I->getOpcode() != ((i & 1) ? AltOpcode : Opcode)) 148 return 0; 149 } 150 return Instruction::ShuffleVector; 151 } 152 153 /// \returns The opcode if all of the Instructions in \p VL have the same 154 /// opcode, or zero. 155 static unsigned getSameOpcode(ArrayRef<Value *> VL) { 156 Instruction *I0 = dyn_cast<Instruction>(VL[0]); 157 if (!I0) 158 return 0; 159 unsigned Opcode = I0->getOpcode(); 160 for (int i = 1, e = VL.size(); i < e; i++) { 161 Instruction *I = dyn_cast<Instruction>(VL[i]); 162 if (!I || Opcode != I->getOpcode()) { 163 if (canCombineAsAltInst(Opcode) && i == 1) 164 return isAltInst(VL); 165 return 0; 166 } 167 } 168 return Opcode; 169 } 170 171 /// Get the intersection (logical and) of all of the potential IR flags 172 /// of each scalar operation (VL) that will be converted into a vector (I). 173 /// Flag set: NSW, NUW, exact, and all of fast-math. 174 static void propagateIRFlags(Value *I, ArrayRef<Value *> VL) { 175 if (auto *VecOp = dyn_cast<BinaryOperator>(I)) { 176 if (auto *Intersection = dyn_cast<BinaryOperator>(VL[0])) { 177 // Intersection is initialized to the 0th scalar, 178 // so start counting from index '1'. 179 for (int i = 1, e = VL.size(); i < e; ++i) { 180 if (auto *Scalar = dyn_cast<BinaryOperator>(VL[i])) 181 Intersection->andIRFlags(Scalar); 182 } 183 VecOp->copyIRFlags(Intersection); 184 } 185 } 186 } 187 188 /// \returns \p I after propagating metadata from \p VL. 
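/// Metadata kinds are merged conservatively across the bundle: tbaa and
/// fpmath are widened to their most generic form, alias.scope and noalias
/// scopes are intersected, and any other (unknown) kind is dropped.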
189 static Instruction *propagateMetadata(Instruction *I, ArrayRef<Value *> VL) { 190 Instruction *I0 = cast<Instruction>(VL[0]); 191 SmallVector<std::pair<unsigned, MDNode *>, 4> Metadata; 192 I0->getAllMetadataOtherThanDebugLoc(Metadata); 193 194 for (unsigned i = 0, n = Metadata.size(); i != n; ++i) { 195 unsigned Kind = Metadata[i].first; 196 MDNode *MD = Metadata[i].second; 197 198 for (int i = 1, e = VL.size(); MD && i != e; i++) { 199 Instruction *I = cast<Instruction>(VL[i]); 200 MDNode *IMD = I->getMetadata(Kind); 201 202 switch (Kind) { 203 default: 204 MD = nullptr; // Remove unknown metadata 205 break; 206 case LLVMContext::MD_tbaa: 207 MD = MDNode::getMostGenericTBAA(MD, IMD); 208 break; 209 case LLVMContext::MD_alias_scope: 210 case LLVMContext::MD_noalias: 211 MD = MDNode::intersect(MD, IMD); 212 break; 213 case LLVMContext::MD_fpmath: 214 MD = MDNode::getMostGenericFPMath(MD, IMD); 215 break; 216 } 217 } 218 I->setMetadata(Kind, MD); 219 } 220 return I; 221 } 222 223 /// \returns The type that all of the values in \p VL have or null if there 224 /// are different types. 225 static Type* getSameType(ArrayRef<Value *> VL) { 226 Type *Ty = VL[0]->getType(); 227 for (int i = 1, e = VL.size(); i < e; i++) 228 if (VL[i]->getType() != Ty) 229 return nullptr; 230 231 return Ty; 232 } 233 234 /// \returns True if the ExtractElement instructions in VL can be vectorized 235 /// to use the original vector. 236 static bool CanReuseExtract(ArrayRef<Value *> VL) { 237 assert(Instruction::ExtractElement == getSameOpcode(VL) && "Invalid opcode"); 238 // Check if all of the extracts come from the same vector and from the 239 // correct offset. 240 Value *VL0 = VL[0]; 241 ExtractElementInst *E0 = cast<ExtractElementInst>(VL0); 242 Value *Vec = E0->getOperand(0); 243 244 // We have to extract from the same vector type. 245 unsigned NElts = Vec->getType()->getVectorNumElements(); 246 247 if (NElts != VL.size()) 248 return false; 249 250 // Check that all of the indices extract from the correct offset. 251 ConstantInt *CI = dyn_cast<ConstantInt>(E0->getOperand(1)); 252 if (!CI || CI->getZExtValue()) 253 return false; 254 255 for (unsigned i = 1, e = VL.size(); i < e; ++i) { 256 ExtractElementInst *E = cast<ExtractElementInst>(VL[i]); 257 ConstantInt *CI = dyn_cast<ConstantInt>(E->getOperand(1)); 258 259 if (!CI || CI->getZExtValue() != i || E->getOperand(0) != Vec) 260 return false; 261 } 262 263 return true; 264 } 265 266 static void reorderInputsAccordingToOpcode(ArrayRef<Value *> VL, 267 SmallVectorImpl<Value *> &Left, 268 SmallVectorImpl<Value *> &Right) { 269 270 SmallVector<Value *, 16> OrigLeft, OrigRight; 271 272 bool AllSameOpcodeLeft = true; 273 bool AllSameOpcodeRight = true; 274 for (unsigned i = 0, e = VL.size(); i != e; ++i) { 275 Instruction *I = cast<Instruction>(VL[i]); 276 Value *V0 = I->getOperand(0); 277 Value *V1 = I->getOperand(1); 278 279 OrigLeft.push_back(V0); 280 OrigRight.push_back(V1); 281 282 Instruction *I0 = dyn_cast<Instruction>(V0); 283 Instruction *I1 = dyn_cast<Instruction>(V1); 284 285 // Check whether all operands on one side have the same opcode. In this case 286 // we want to preserve the original order and not make things worse by 287 // reordering. 
    AllSameOpcodeLeft = I0;
    AllSameOpcodeRight = I1;

    if (i && AllSameOpcodeLeft) {
      if (Instruction *P0 = dyn_cast<Instruction>(OrigLeft[i-1])) {
        if (P0->getOpcode() != I0->getOpcode())
          AllSameOpcodeLeft = false;
      } else
        AllSameOpcodeLeft = false;
    }
    if (i && AllSameOpcodeRight) {
      if (Instruction *P1 = dyn_cast<Instruction>(OrigRight[i-1])) {
        if (P1->getOpcode() != I1->getOpcode())
          AllSameOpcodeRight = false;
      } else
        AllSameOpcodeRight = false;
    }

    // Sort two opcodes. In the code below we try to preserve the ability to use
    // broadcast of values instead of individual inserts.
    // vl1 = load
    // vl2 = phi
    // vr1 = load
    // vr2 = vr1
    //    = vl1 x vr1
    //    = vl2 x vr2
    // If we just sorted according to opcode we would leave the first line
    // intact but we would swap vl2 with vr2 because opcode(phi) > opcode(load).
    //    = vl1 x vr1
    //    = vr2 x vl2
    // Because vr2 and vr1 are from the same load we lose the opportunity of a
    // broadcast for the packed right side in the backend: we have [vr1, vl2]
    // instead of [vr1, vr2=vr1].
    if (I0 && I1) {
      if (!i && I0->getOpcode() > I1->getOpcode()) {
        Left.push_back(I1);
        Right.push_back(I0);
      } else if (i && I0->getOpcode() > I1->getOpcode() && Right[i-1] != I1) {
        // Try not to destroy a broadcast for no apparent benefit.
        Left.push_back(I1);
        Right.push_back(I0);
      } else if (i && I0->getOpcode() == I1->getOpcode() && Right[i-1] == I0) {
        // Try to preserve broadcasts.
        Left.push_back(I1);
        Right.push_back(I0);
      } else if (i && I0->getOpcode() == I1->getOpcode() && Left[i-1] == I1) {
        // Try to preserve broadcasts.
        Left.push_back(I1);
        Right.push_back(I0);
      } else {
        Left.push_back(I0);
        Right.push_back(I1);
      }
      continue;
    }
    // Only one of the operands is an instruction; put it on the right.
    if (I0) {
      Left.push_back(V1);
      Right.push_back(I0);
      continue;
    }
    Left.push_back(V0);
    Right.push_back(V1);
  }

  bool LeftBroadcast = isSplat(Left);
  bool RightBroadcast = isSplat(Right);

  // Don't reorder if the operands were good to begin with.
  if (!(LeftBroadcast || RightBroadcast) &&
      (AllSameOpcodeRight || AllSameOpcodeLeft)) {
    Left = OrigLeft;
    Right = OrigRight;
  }
}

/// \returns True if an in-tree use also needs an extract. This happens when
/// the user keeps the operand as a scalar in the vectorized instruction.
static bool InTreeUserNeedToExtract(Value *Scalar, Instruction *UserInst,
                                    TargetLibraryInfo *TLI) {

  unsigned Opcode = UserInst->getOpcode();
  switch (Opcode) {
  case Instruction::Load: {
    LoadInst *LI = cast<LoadInst>(UserInst);
    return (LI->getPointerOperand() == Scalar);
  }
  case Instruction::Store: {
    StoreInst *SI = cast<StoreInst>(UserInst);
    return (SI->getPointerOperand() == Scalar);
  }
  case Instruction::Call: {
    CallInst *CI = cast<CallInst>(UserInst);
    Intrinsic::ID ID = getIntrinsicIDForCall(CI, TLI);
    if (hasVectorInstrinsicScalarOpd(ID, 1)) {
      return (CI->getArgOperand(1) == Scalar);
    }
  }
  default:
    return false;
  }
}

/// Bottom Up SLP Vectorizer.
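/// A rough sketch of how a driver pass is expected to use this class (the
/// seed value list "Seeds" is illustrative only):
///
///   BoUpSLP R(F, SE, DL, TTI, TLI, AA, LI, DT, AT);
///   R.buildTree(Seeds);                 // grow the vectorizable tree
///   if (R.getTreeCost() < -SLPCostThreshold)
///     R.vectorizeTree();                // emit vector code and extracts
///
/// buildTree() starts by calling deleteTree(), so one instance can be reused
/// across many candidate bundles.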
392 class BoUpSLP { 393 public: 394 typedef SmallVector<Value *, 8> ValueList; 395 typedef SmallVector<Instruction *, 16> InstrList; 396 typedef SmallPtrSet<Value *, 16> ValueSet; 397 typedef SmallVector<StoreInst *, 8> StoreList; 398 399 BoUpSLP(Function *Func, ScalarEvolution *Se, const DataLayout *Dl, 400 TargetTransformInfo *Tti, TargetLibraryInfo *TLi, AliasAnalysis *Aa, 401 LoopInfo *Li, DominatorTree *Dt, AssumptionTracker *AT) 402 : NumLoadsWantToKeepOrder(0), NumLoadsWantToChangeOrder(0), 403 F(Func), SE(Se), DL(Dl), TTI(Tti), TLI(TLi), AA(Aa), LI(Li), DT(Dt), 404 Builder(Se->getContext()) { 405 CodeMetrics::collectEphemeralValues(F, AT, EphValues); 406 } 407 408 /// \brief Vectorize the tree that starts with the elements in \p VL. 409 /// Returns the vectorized root. 410 Value *vectorizeTree(); 411 412 /// \returns the cost incurred by unwanted spills and fills, caused by 413 /// holding live values over call sites. 414 int getSpillCost(); 415 416 /// \returns the vectorization cost of the subtree that starts at \p VL. 417 /// A negative number means that this is profitable. 418 int getTreeCost(); 419 420 /// Construct a vectorizable tree that starts at \p Roots, ignoring users for 421 /// the purpose of scheduling and extraction in the \p UserIgnoreLst. 422 void buildTree(ArrayRef<Value *> Roots, 423 ArrayRef<Value *> UserIgnoreLst = None); 424 425 /// Clear the internal data structures that are created by 'buildTree'. 426 void deleteTree() { 427 VectorizableTree.clear(); 428 ScalarToTreeEntry.clear(); 429 MustGather.clear(); 430 ExternalUses.clear(); 431 NumLoadsWantToKeepOrder = 0; 432 NumLoadsWantToChangeOrder = 0; 433 for (auto &Iter : BlocksSchedules) { 434 BlockScheduling *BS = Iter.second.get(); 435 BS->clear(); 436 } 437 } 438 439 /// \returns true if the memory operations A and B are consecutive. 440 bool isConsecutiveAccess(Value *A, Value *B); 441 442 /// \brief Perform LICM and CSE on the newly generated gather sequences. 443 void optimizeGatherSequence(); 444 445 /// \returns true if it is benefitial to reverse the vector order. 446 bool shouldReorder() const { 447 return NumLoadsWantToChangeOrder > NumLoadsWantToKeepOrder; 448 } 449 450 private: 451 struct TreeEntry; 452 453 /// \returns the cost of the vectorizable entry. 454 int getEntryCost(TreeEntry *E); 455 456 /// This is the recursive part of buildTree. 457 void buildTree_rec(ArrayRef<Value *> Roots, unsigned Depth); 458 459 /// Vectorize a single entry in the tree. 460 Value *vectorizeTree(TreeEntry *E); 461 462 /// Vectorize a single entry in the tree, starting in \p VL. 463 Value *vectorizeTree(ArrayRef<Value *> VL); 464 465 /// \returns the pointer to the vectorized value if \p VL is already 466 /// vectorized, or NULL. They may happen in cycles. 467 Value *alreadyVectorized(ArrayRef<Value *> VL) const; 468 469 /// \brief Take the pointer operand from the Load/Store instruction. 470 /// \returns NULL if this is not a valid Load/Store instruction. 471 static Value *getPointerOperand(Value *I); 472 473 /// \brief Take the address space operand from the Load/Store instruction. 474 /// \returns -1 if this is not a valid Load/Store instruction. 475 static unsigned getAddressSpaceOperand(Value *I); 476 477 /// \returns the scalarization cost for this type. Scalarization in this 478 /// context means the creation of vectors from a group of scalars. 479 int getGatherCost(Type *Ty); 480 481 /// \returns the scalarization cost for this list of values. 
Assuming that
  /// this subtree gets vectorized, we may need to extract the values from the
  /// roots. This method calculates the cost of extracting the values.
  int getGatherCost(ArrayRef<Value *> VL);

  /// \brief Set the Builder insert point to one after the last instruction in
  /// the bundle
  void setInsertPointAfterBundle(ArrayRef<Value *> VL);

  /// \returns a vector from a collection of scalars in \p VL.
  Value *Gather(ArrayRef<Value *> VL, VectorType *Ty);

  /// \returns whether the VectorizableTree is fully vectorizable and will
  /// be beneficial even when the tree height is tiny.
  bool isFullyVectorizableTinyTree();

  struct TreeEntry {
    TreeEntry() : Scalars(), VectorizedValue(nullptr),
                  NeedToGather(0) {}

    /// \returns true if the scalars in VL are equal to this entry.
    bool isSame(ArrayRef<Value *> VL) const {
      assert(VL.size() == Scalars.size() && "Invalid size");
      return std::equal(VL.begin(), VL.end(), Scalars.begin());
    }

    /// A vector of scalars.
    ValueList Scalars;

    /// The Scalars are vectorized into this value. It is initialized to Null.
    Value *VectorizedValue;

    /// Do we need to gather this sequence?
    bool NeedToGather;
  };

  /// Create a new VectorizableTree entry.
  TreeEntry *newTreeEntry(ArrayRef<Value *> VL, bool Vectorized) {
    VectorizableTree.push_back(TreeEntry());
    int idx = VectorizableTree.size() - 1;
    TreeEntry *Last = &VectorizableTree[idx];
    Last->Scalars.insert(Last->Scalars.begin(), VL.begin(), VL.end());
    Last->NeedToGather = !Vectorized;
    if (Vectorized) {
      for (int i = 0, e = VL.size(); i != e; ++i) {
        assert(!ScalarToTreeEntry.count(VL[i]) && "Scalar already in tree!");
        ScalarToTreeEntry[VL[i]] = idx;
      }
    } else {
      MustGather.insert(VL.begin(), VL.end());
    }
    return Last;
  }

  /// -- Vectorization State --
  /// Holds all of the tree entries.
  std::vector<TreeEntry> VectorizableTree;

  /// Maps a specific scalar to its tree entry.
  SmallDenseMap<Value*, int> ScalarToTreeEntry;

  /// A list of scalars that we found that we need to keep as scalars.
  ValueSet MustGather;

  /// This POD struct describes one external user in the vectorized tree.
  struct ExternalUser {
    ExternalUser (Value *S, llvm::User *U, int L) :
      Scalar(S), User(U), Lane(L){};
    // Which scalar in our function.
    Value *Scalar;
    // Which user uses the scalar.
    llvm::User *User;
    // Which lane does the scalar belong to.
    int Lane;
  };
  typedef SmallVector<ExternalUser, 16> UserList;

  /// A list of values that need to be extracted out of the tree.
  /// This list holds pairs of (Internal Scalar : External User).
  UserList ExternalUses;

  /// Values used only by @llvm.assume calls.
  SmallPtrSet<const Value *, 32> EphValues;

  /// Holds all of the instructions that we gathered.
  SetVector<Instruction *> GatherSeq;
  /// A list of blocks that we are going to CSE.
  SetVector<BasicBlock *> CSEBlocks;

  /// Contains all scheduling relevant data for an instruction.
  /// A ScheduleData either represents a single instruction or a member of an
  /// instruction bundle (= a group of instructions which is combined into a
  /// vector instruction).
  struct ScheduleData {

    // The initial value for the dependency counters. It means that the
    // dependencies are not calculated yet.
578 enum { InvalidDeps = -1 }; 579 580 ScheduleData() 581 : Inst(nullptr), FirstInBundle(nullptr), NextInBundle(nullptr), 582 NextLoadStore(nullptr), SchedulingRegionID(0), SchedulingPriority(0), 583 Dependencies(InvalidDeps), UnscheduledDeps(InvalidDeps), 584 UnscheduledDepsInBundle(InvalidDeps), IsScheduled(false) {} 585 586 void init(int BlockSchedulingRegionID) { 587 FirstInBundle = this; 588 NextInBundle = nullptr; 589 NextLoadStore = nullptr; 590 IsScheduled = false; 591 SchedulingRegionID = BlockSchedulingRegionID; 592 UnscheduledDepsInBundle = UnscheduledDeps; 593 clearDependencies(); 594 } 595 596 /// Returns true if the dependency information has been calculated. 597 bool hasValidDependencies() const { return Dependencies != InvalidDeps; } 598 599 /// Returns true for single instructions and for bundle representatives 600 /// (= the head of a bundle). 601 bool isSchedulingEntity() const { return FirstInBundle == this; } 602 603 /// Returns true if it represents an instruction bundle and not only a 604 /// single instruction. 605 bool isPartOfBundle() const { 606 return NextInBundle != nullptr || FirstInBundle != this; 607 } 608 609 /// Returns true if it is ready for scheduling, i.e. it has no more 610 /// unscheduled depending instructions/bundles. 611 bool isReady() const { 612 assert(isSchedulingEntity() && 613 "can't consider non-scheduling entity for ready list"); 614 return UnscheduledDepsInBundle == 0 && !IsScheduled; 615 } 616 617 /// Modifies the number of unscheduled dependencies, also updating it for 618 /// the whole bundle. 619 int incrementUnscheduledDeps(int Incr) { 620 UnscheduledDeps += Incr; 621 return FirstInBundle->UnscheduledDepsInBundle += Incr; 622 } 623 624 /// Sets the number of unscheduled dependencies to the number of 625 /// dependencies. 626 void resetUnscheduledDeps() { 627 incrementUnscheduledDeps(Dependencies - UnscheduledDeps); 628 } 629 630 /// Clears all dependency information. 631 void clearDependencies() { 632 Dependencies = InvalidDeps; 633 resetUnscheduledDeps(); 634 MemoryDependencies.clear(); 635 } 636 637 void dump(raw_ostream &os) const { 638 if (!isSchedulingEntity()) { 639 os << "/ " << *Inst; 640 } else if (NextInBundle) { 641 os << '[' << *Inst; 642 ScheduleData *SD = NextInBundle; 643 while (SD) { 644 os << ';' << *SD->Inst; 645 SD = SD->NextInBundle; 646 } 647 os << ']'; 648 } else { 649 os << *Inst; 650 } 651 } 652 653 Instruction *Inst; 654 655 /// Points to the head in an instruction bundle (and always to this for 656 /// single instructions). 657 ScheduleData *FirstInBundle; 658 659 /// Single linked list of all instructions in a bundle. Null if it is a 660 /// single instruction. 661 ScheduleData *NextInBundle; 662 663 /// Single linked list of all memory instructions (e.g. load, store, call) 664 /// in the block - until the end of the scheduling region. 665 ScheduleData *NextLoadStore; 666 667 /// The dependent memory instructions. 668 /// This list is derived on demand in calculateDependencies(). 669 SmallVector<ScheduleData *, 4> MemoryDependencies; 670 671 /// This ScheduleData is in the current scheduling region if this matches 672 /// the current SchedulingRegionID of BlockScheduling. 673 int SchedulingRegionID; 674 675 /// Used for getting a "good" final ordering of instructions. 676 int SchedulingPriority; 677 678 /// The number of dependencies. Constitutes of the number of users of the 679 /// instruction plus the number of dependent memory instructions (if any). 680 /// This value is calculated on demand. 
681 /// If InvalidDeps, the number of dependencies is not calculated yet. 682 /// 683 int Dependencies; 684 685 /// The number of dependencies minus the number of dependencies of scheduled 686 /// instructions. As soon as this is zero, the instruction/bundle gets ready 687 /// for scheduling. 688 /// Note that this is negative as long as Dependencies is not calculated. 689 int UnscheduledDeps; 690 691 /// The sum of UnscheduledDeps in a bundle. Equals to UnscheduledDeps for 692 /// single instructions. 693 int UnscheduledDepsInBundle; 694 695 /// True if this instruction is scheduled (or considered as scheduled in the 696 /// dry-run). 697 bool IsScheduled; 698 }; 699 700 #ifndef NDEBUG 701 friend raw_ostream &operator<<(raw_ostream &os, 702 const BoUpSLP::ScheduleData &SD); 703 #endif 704 705 /// Contains all scheduling data for a basic block. 706 /// 707 struct BlockScheduling { 708 709 BlockScheduling(BasicBlock *BB) 710 : BB(BB), ChunkSize(BB->size()), ChunkPos(ChunkSize), 711 ScheduleStart(nullptr), ScheduleEnd(nullptr), 712 FirstLoadStoreInRegion(nullptr), LastLoadStoreInRegion(nullptr), 713 // Make sure that the initial SchedulingRegionID is greater than the 714 // initial SchedulingRegionID in ScheduleData (which is 0). 715 SchedulingRegionID(1) {} 716 717 void clear() { 718 ReadyInsts.clear(); 719 ScheduleStart = nullptr; 720 ScheduleEnd = nullptr; 721 FirstLoadStoreInRegion = nullptr; 722 LastLoadStoreInRegion = nullptr; 723 724 // Make a new scheduling region, i.e. all existing ScheduleData is not 725 // in the new region yet. 726 ++SchedulingRegionID; 727 } 728 729 ScheduleData *getScheduleData(Value *V) { 730 ScheduleData *SD = ScheduleDataMap[V]; 731 if (SD && SD->SchedulingRegionID == SchedulingRegionID) 732 return SD; 733 return nullptr; 734 } 735 736 bool isInSchedulingRegion(ScheduleData *SD) { 737 return SD->SchedulingRegionID == SchedulingRegionID; 738 } 739 740 /// Marks an instruction as scheduled and puts all dependent ready 741 /// instructions into the ready-list. 742 template <typename ReadyListType> 743 void schedule(ScheduleData *SD, ReadyListType &ReadyList) { 744 SD->IsScheduled = true; 745 DEBUG(dbgs() << "SLP: schedule " << *SD << "\n"); 746 747 ScheduleData *BundleMember = SD; 748 while (BundleMember) { 749 // Handle the def-use chain dependencies. 750 for (Use &U : BundleMember->Inst->operands()) { 751 ScheduleData *OpDef = getScheduleData(U.get()); 752 if (OpDef && OpDef->hasValidDependencies() && 753 OpDef->incrementUnscheduledDeps(-1) == 0) { 754 // There are no more unscheduled dependencies after decrementing, 755 // so we can put the dependent instruction into the ready list. 756 ScheduleData *DepBundle = OpDef->FirstInBundle; 757 assert(!DepBundle->IsScheduled && 758 "already scheduled bundle gets ready"); 759 ReadyList.insert(DepBundle); 760 DEBUG(dbgs() << "SLP: gets ready (def): " << *DepBundle << "\n"); 761 } 762 } 763 // Handle the memory dependencies. 764 for (ScheduleData *MemoryDepSD : BundleMember->MemoryDependencies) { 765 if (MemoryDepSD->incrementUnscheduledDeps(-1) == 0) { 766 // There are no more unscheduled dependencies after decrementing, 767 // so we can put the dependent instruction into the ready list. 
768 ScheduleData *DepBundle = MemoryDepSD->FirstInBundle; 769 assert(!DepBundle->IsScheduled && 770 "already scheduled bundle gets ready"); 771 ReadyList.insert(DepBundle); 772 DEBUG(dbgs() << "SLP: gets ready (mem): " << *DepBundle << "\n"); 773 } 774 } 775 BundleMember = BundleMember->NextInBundle; 776 } 777 } 778 779 /// Put all instructions into the ReadyList which are ready for scheduling. 780 template <typename ReadyListType> 781 void initialFillReadyList(ReadyListType &ReadyList) { 782 for (auto *I = ScheduleStart; I != ScheduleEnd; I = I->getNextNode()) { 783 ScheduleData *SD = getScheduleData(I); 784 if (SD->isSchedulingEntity() && SD->isReady()) { 785 ReadyList.insert(SD); 786 DEBUG(dbgs() << "SLP: initially in ready list: " << *I << "\n"); 787 } 788 } 789 } 790 791 /// Checks if a bundle of instructions can be scheduled, i.e. has no 792 /// cyclic dependencies. This is only a dry-run, no instructions are 793 /// actually moved at this stage. 794 bool tryScheduleBundle(ArrayRef<Value *> VL, AliasAnalysis *AA); 795 796 /// Un-bundles a group of instructions. 797 void cancelScheduling(ArrayRef<Value *> VL); 798 799 /// Extends the scheduling region so that V is inside the region. 800 void extendSchedulingRegion(Value *V); 801 802 /// Initialize the ScheduleData structures for new instructions in the 803 /// scheduling region. 804 void initScheduleData(Instruction *FromI, Instruction *ToI, 805 ScheduleData *PrevLoadStore, 806 ScheduleData *NextLoadStore); 807 808 /// Updates the dependency information of a bundle and of all instructions/ 809 /// bundles which depend on the original bundle. 810 void calculateDependencies(ScheduleData *SD, bool InsertInReadyList, 811 AliasAnalysis *AA); 812 813 /// Sets all instruction in the scheduling region to un-scheduled. 814 void resetSchedule(); 815 816 BasicBlock *BB; 817 818 /// Simple memory allocation for ScheduleData. 819 std::vector<std::unique_ptr<ScheduleData[]>> ScheduleDataChunks; 820 821 /// The size of a ScheduleData array in ScheduleDataChunks. 822 int ChunkSize; 823 824 /// The allocator position in the current chunk, which is the last entry 825 /// of ScheduleDataChunks. 826 int ChunkPos; 827 828 /// Attaches ScheduleData to Instruction. 829 /// Note that the mapping survives during all vectorization iterations, i.e. 830 /// ScheduleData structures are recycled. 831 DenseMap<Value *, ScheduleData *> ScheduleDataMap; 832 833 struct ReadyList : SmallVector<ScheduleData *, 8> { 834 void insert(ScheduleData *SD) { push_back(SD); } 835 }; 836 837 /// The ready-list for scheduling (only used for the dry-run). 838 ReadyList ReadyInsts; 839 840 /// The first instruction of the scheduling region. 841 Instruction *ScheduleStart; 842 843 /// The first instruction _after_ the scheduling region. 844 Instruction *ScheduleEnd; 845 846 /// The first memory accessing instruction in the scheduling region 847 /// (can be null). 848 ScheduleData *FirstLoadStoreInRegion; 849 850 /// The last memory accessing instruction in the scheduling region 851 /// (can be null). 852 ScheduleData *LastLoadStoreInRegion; 853 854 /// The ID of the scheduling region. For a new vectorization iteration this 855 /// is incremented which "removes" all ScheduleData from the region. 856 int SchedulingRegionID; 857 }; 858 859 /// Attaches the BlockScheduling structures to basic blocks. 860 DenseMap<BasicBlock *, std::unique_ptr<BlockScheduling>> BlocksSchedules; 861 862 /// Performs the "real" scheduling. 
Done before vectorization is actually 863 /// performed in a basic block. 864 void scheduleBlock(BlockScheduling *BS); 865 866 /// List of users to ignore during scheduling and that don't need extracting. 867 ArrayRef<Value *> UserIgnoreList; 868 869 // Number of load-bundles, which contain consecutive loads. 870 int NumLoadsWantToKeepOrder; 871 872 // Number of load-bundles of size 2, which are consecutive loads if reversed. 873 int NumLoadsWantToChangeOrder; 874 875 // Analysis and block reference. 876 Function *F; 877 ScalarEvolution *SE; 878 const DataLayout *DL; 879 TargetTransformInfo *TTI; 880 TargetLibraryInfo *TLI; 881 AliasAnalysis *AA; 882 LoopInfo *LI; 883 DominatorTree *DT; 884 /// Instruction builder to construct the vectorized tree. 885 IRBuilder<> Builder; 886 }; 887 888 #ifndef NDEBUG 889 raw_ostream &operator<<(raw_ostream &os, const BoUpSLP::ScheduleData &SD) { 890 SD.dump(os); 891 return os; 892 } 893 #endif 894 895 void BoUpSLP::buildTree(ArrayRef<Value *> Roots, 896 ArrayRef<Value *> UserIgnoreLst) { 897 deleteTree(); 898 UserIgnoreList = UserIgnoreLst; 899 if (!getSameType(Roots)) 900 return; 901 buildTree_rec(Roots, 0); 902 903 // Collect the values that we need to extract from the tree. 904 for (int EIdx = 0, EE = VectorizableTree.size(); EIdx < EE; ++EIdx) { 905 TreeEntry *Entry = &VectorizableTree[EIdx]; 906 907 // For each lane: 908 for (int Lane = 0, LE = Entry->Scalars.size(); Lane != LE; ++Lane) { 909 Value *Scalar = Entry->Scalars[Lane]; 910 911 // No need to handle users of gathered values. 912 if (Entry->NeedToGather) 913 continue; 914 915 for (User *U : Scalar->users()) { 916 DEBUG(dbgs() << "SLP: Checking user:" << *U << ".\n"); 917 918 Instruction *UserInst = dyn_cast<Instruction>(U); 919 if (!UserInst) 920 continue; 921 922 // Skip in-tree scalars that become vectors 923 if (ScalarToTreeEntry.count(U)) { 924 int Idx = ScalarToTreeEntry[U]; 925 TreeEntry *UseEntry = &VectorizableTree[Idx]; 926 Value *UseScalar = UseEntry->Scalars[0]; 927 // Some in-tree scalars will remain as scalar in vectorized 928 // instructions. If that is the case, the one in Lane 0 will 929 // be used. 930 if (UseScalar != U || 931 !InTreeUserNeedToExtract(Scalar, UserInst, TLI)) { 932 DEBUG(dbgs() << "SLP: \tInternal user will be removed:" << *U 933 << ".\n"); 934 assert(!VectorizableTree[Idx].NeedToGather && "Bad state"); 935 continue; 936 } 937 } 938 939 // Ignore users in the user ignore list. 940 if (std::find(UserIgnoreList.begin(), UserIgnoreList.end(), UserInst) != 941 UserIgnoreList.end()) 942 continue; 943 944 DEBUG(dbgs() << "SLP: Need to extract:" << *U << " from lane " << 945 Lane << " from " << *Scalar << ".\n"); 946 ExternalUses.push_back(ExternalUser(Scalar, U, Lane)); 947 } 948 } 949 } 950 } 951 952 953 void BoUpSLP::buildTree_rec(ArrayRef<Value *> VL, unsigned Depth) { 954 bool SameTy = getSameType(VL); (void)SameTy; 955 bool isAltShuffle = false; 956 assert(SameTy && "Invalid types!"); 957 958 if (Depth == RecursionMaxDepth) { 959 DEBUG(dbgs() << "SLP: Gathering due to max recursion depth.\n"); 960 newTreeEntry(VL, false); 961 return; 962 } 963 964 // Don't handle vectors. 
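  // A bundle whose scalars already have vector type (or, just below, stores
  // of vector-typed values) is not an SLP candidate, so it is recorded as a
  // gather entry.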
  if (VL[0]->getType()->isVectorTy()) {
    DEBUG(dbgs() << "SLP: Gathering due to vector type.\n");
    newTreeEntry(VL, false);
    return;
  }

  if (StoreInst *SI = dyn_cast<StoreInst>(VL[0]))
    if (SI->getValueOperand()->getType()->isVectorTy()) {
      DEBUG(dbgs() << "SLP: Gathering due to store vector type.\n");
      newTreeEntry(VL, false);
      return;
    }
  unsigned Opcode = getSameOpcode(VL);

  // Check whether a reported ShuffleVector opcode actually stands for an
  // alternate sequence of opcodes rather than real shufflevector instructions.
  if (Opcode == Instruction::ShuffleVector) {
    Instruction *I0 = dyn_cast<Instruction>(VL[0]);
    unsigned Op = I0->getOpcode();
    if (Op != Instruction::ShuffleVector)
      isAltShuffle = true;
  }

  // If all of the operands are identical or constant we have a simple solution.
  if (allConstant(VL) || isSplat(VL) || !getSameBlock(VL) || !Opcode) {
    DEBUG(dbgs() << "SLP: Gathering due to C,S,B,O. \n");
    newTreeEntry(VL, false);
    return;
  }

  // We now know that this is a vector of instructions of the same type from
  // the same block.

  // Don't vectorize ephemeral values.
  for (unsigned i = 0, e = VL.size(); i != e; ++i) {
    if (EphValues.count(VL[i])) {
      DEBUG(dbgs() << "SLP: The instruction (" << *VL[i] <<
            ") is ephemeral.\n");
      newTreeEntry(VL, false);
      return;
    }
  }

  // Check if this is a duplicate of another entry.
  if (ScalarToTreeEntry.count(VL[0])) {
    int Idx = ScalarToTreeEntry[VL[0]];
    TreeEntry *E = &VectorizableTree[Idx];
    for (unsigned i = 0, e = VL.size(); i != e; ++i) {
      DEBUG(dbgs() << "SLP: \tChecking bundle: " << *VL[i] << ".\n");
      if (E->Scalars[i] != VL[i]) {
        DEBUG(dbgs() << "SLP: Gathering due to partial overlap.\n");
        newTreeEntry(VL, false);
        return;
      }
    }
    DEBUG(dbgs() << "SLP: Perfect diamond merge at " << *VL[0] << ".\n");
    return;
  }

  // Check that none of the instructions in the bundle are already in the tree.
  for (unsigned i = 0, e = VL.size(); i != e; ++i) {
    if (ScalarToTreeEntry.count(VL[i])) {
      DEBUG(dbgs() << "SLP: The instruction (" << *VL[i] <<
            ") is already in tree.\n");
      newTreeEntry(VL, false);
      return;
    }
  }

  // If any of the scalars appears in the table OR it is marked as a value that
  // needs to stay scalar then we need to gather the scalars.
  for (unsigned i = 0, e = VL.size(); i != e; ++i) {
    if (ScalarToTreeEntry.count(VL[i]) || MustGather.count(VL[i])) {
      DEBUG(dbgs() << "SLP: Gathering due to gathered scalar. \n");
      newTreeEntry(VL, false);
      return;
    }
  }

  // Check that all of the users of the scalars that we want to vectorize are
  // schedulable.
  Instruction *VL0 = cast<Instruction>(VL[0]);
  BasicBlock *BB = cast<Instruction>(VL0)->getParent();

  if (!DT->isReachableFromEntry(BB)) {
    // Don't go into unreachable blocks. They may contain instructions with
    // dependency cycles which confuse the final scheduling.
    DEBUG(dbgs() << "SLP: bundle in unreachable block.\n");
    newTreeEntry(VL, false);
    return;
  }

  // Check that every instruction appears only once in this bundle.
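  // E.g. a bundle such as {A, B, A, C} is gathered here, because two lanes
  // would have to be filled from the same scalar value A.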
  for (unsigned i = 0, e = VL.size(); i < e; ++i)
    for (unsigned j = i+1; j < e; ++j)
      if (VL[i] == VL[j]) {
        DEBUG(dbgs() << "SLP: Scalar used twice in bundle.\n");
        newTreeEntry(VL, false);
        return;
      }

  auto &BSRef = BlocksSchedules[BB];
  if (!BSRef) {
    BSRef = llvm::make_unique<BlockScheduling>(BB);
  }
  BlockScheduling &BS = *BSRef.get();

  if (!BS.tryScheduleBundle(VL, AA)) {
    DEBUG(dbgs() << "SLP: We are not able to schedule this bundle!\n");
    BS.cancelScheduling(VL);
    newTreeEntry(VL, false);
    return;
  }
  DEBUG(dbgs() << "SLP: We are able to schedule this bundle.\n");

  switch (Opcode) {
  case Instruction::PHI: {
    PHINode *PH = dyn_cast<PHINode>(VL0);

    // Check for terminator values (e.g. invoke).
    for (unsigned j = 0; j < VL.size(); ++j)
      for (unsigned i = 0, e = PH->getNumIncomingValues(); i < e; ++i) {
        TerminatorInst *Term = dyn_cast<TerminatorInst>(
            cast<PHINode>(VL[j])->getIncomingValueForBlock(PH->getIncomingBlock(i)));
        if (Term) {
          DEBUG(dbgs() << "SLP: Need to swizzle PHINodes (TerminatorInst use).\n");
          BS.cancelScheduling(VL);
          newTreeEntry(VL, false);
          return;
        }
      }

    newTreeEntry(VL, true);
    DEBUG(dbgs() << "SLP: added a vector of PHINodes.\n");

    for (unsigned i = 0, e = PH->getNumIncomingValues(); i < e; ++i) {
      ValueList Operands;
      // Prepare the operand vector.
      for (unsigned j = 0; j < VL.size(); ++j)
        Operands.push_back(cast<PHINode>(VL[j])->getIncomingValueForBlock(
            PH->getIncomingBlock(i)));

      buildTree_rec(Operands, Depth + 1);
    }
    return;
  }
  case Instruction::ExtractElement: {
    bool Reuse = CanReuseExtract(VL);
    if (Reuse) {
      DEBUG(dbgs() << "SLP: Reusing extract sequence.\n");
    } else {
      BS.cancelScheduling(VL);
    }
    newTreeEntry(VL, Reuse);
    return;
  }
  case Instruction::Load: {
    // Check if the loads are consecutive or if we need to swizzle them.
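    // E.g. loads of p[0], p[1], p[2], p[3] form a vectorizable bundle. A
    // two-element bundle that is consecutive only in reverse order (p[1],
    // p[0]) bumps NumLoadsWantToChangeOrder so the caller can retry with the
    // operands swapped (see shouldReorder()).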
1123 for (unsigned i = 0, e = VL.size() - 1; i < e; ++i) { 1124 LoadInst *L = cast<LoadInst>(VL[i]); 1125 if (!L->isSimple()) { 1126 BS.cancelScheduling(VL); 1127 newTreeEntry(VL, false); 1128 DEBUG(dbgs() << "SLP: Gathering non-simple loads.\n"); 1129 return; 1130 } 1131 if (!isConsecutiveAccess(VL[i], VL[i + 1])) { 1132 if (VL.size() == 2 && isConsecutiveAccess(VL[1], VL[0])) { 1133 ++NumLoadsWantToChangeOrder; 1134 } 1135 BS.cancelScheduling(VL); 1136 newTreeEntry(VL, false); 1137 DEBUG(dbgs() << "SLP: Gathering non-consecutive loads.\n"); 1138 return; 1139 } 1140 } 1141 ++NumLoadsWantToKeepOrder; 1142 newTreeEntry(VL, true); 1143 DEBUG(dbgs() << "SLP: added a vector of loads.\n"); 1144 return; 1145 } 1146 case Instruction::ZExt: 1147 case Instruction::SExt: 1148 case Instruction::FPToUI: 1149 case Instruction::FPToSI: 1150 case Instruction::FPExt: 1151 case Instruction::PtrToInt: 1152 case Instruction::IntToPtr: 1153 case Instruction::SIToFP: 1154 case Instruction::UIToFP: 1155 case Instruction::Trunc: 1156 case Instruction::FPTrunc: 1157 case Instruction::BitCast: { 1158 Type *SrcTy = VL0->getOperand(0)->getType(); 1159 for (unsigned i = 0; i < VL.size(); ++i) { 1160 Type *Ty = cast<Instruction>(VL[i])->getOperand(0)->getType(); 1161 if (Ty != SrcTy || Ty->isAggregateType() || Ty->isVectorTy()) { 1162 BS.cancelScheduling(VL); 1163 newTreeEntry(VL, false); 1164 DEBUG(dbgs() << "SLP: Gathering casts with different src types.\n"); 1165 return; 1166 } 1167 } 1168 newTreeEntry(VL, true); 1169 DEBUG(dbgs() << "SLP: added a vector of casts.\n"); 1170 1171 for (unsigned i = 0, e = VL0->getNumOperands(); i < e; ++i) { 1172 ValueList Operands; 1173 // Prepare the operand vector. 1174 for (unsigned j = 0; j < VL.size(); ++j) 1175 Operands.push_back(cast<Instruction>(VL[j])->getOperand(i)); 1176 1177 buildTree_rec(Operands, Depth+1); 1178 } 1179 return; 1180 } 1181 case Instruction::ICmp: 1182 case Instruction::FCmp: { 1183 // Check that all of the compares have the same predicate. 1184 CmpInst::Predicate P0 = dyn_cast<CmpInst>(VL0)->getPredicate(); 1185 Type *ComparedTy = cast<Instruction>(VL[0])->getOperand(0)->getType(); 1186 for (unsigned i = 1, e = VL.size(); i < e; ++i) { 1187 CmpInst *Cmp = cast<CmpInst>(VL[i]); 1188 if (Cmp->getPredicate() != P0 || 1189 Cmp->getOperand(0)->getType() != ComparedTy) { 1190 BS.cancelScheduling(VL); 1191 newTreeEntry(VL, false); 1192 DEBUG(dbgs() << "SLP: Gathering cmp with different predicate.\n"); 1193 return; 1194 } 1195 } 1196 1197 newTreeEntry(VL, true); 1198 DEBUG(dbgs() << "SLP: added a vector of compares.\n"); 1199 1200 for (unsigned i = 0, e = VL0->getNumOperands(); i < e; ++i) { 1201 ValueList Operands; 1202 // Prepare the operand vector. 
1203 for (unsigned j = 0; j < VL.size(); ++j) 1204 Operands.push_back(cast<Instruction>(VL[j])->getOperand(i)); 1205 1206 buildTree_rec(Operands, Depth+1); 1207 } 1208 return; 1209 } 1210 case Instruction::Select: 1211 case Instruction::Add: 1212 case Instruction::FAdd: 1213 case Instruction::Sub: 1214 case Instruction::FSub: 1215 case Instruction::Mul: 1216 case Instruction::FMul: 1217 case Instruction::UDiv: 1218 case Instruction::SDiv: 1219 case Instruction::FDiv: 1220 case Instruction::URem: 1221 case Instruction::SRem: 1222 case Instruction::FRem: 1223 case Instruction::Shl: 1224 case Instruction::LShr: 1225 case Instruction::AShr: 1226 case Instruction::And: 1227 case Instruction::Or: 1228 case Instruction::Xor: { 1229 newTreeEntry(VL, true); 1230 DEBUG(dbgs() << "SLP: added a vector of bin op.\n"); 1231 1232 // Sort operands of the instructions so that each side is more likely to 1233 // have the same opcode. 1234 if (isa<BinaryOperator>(VL0) && VL0->isCommutative()) { 1235 ValueList Left, Right; 1236 reorderInputsAccordingToOpcode(VL, Left, Right); 1237 buildTree_rec(Left, Depth + 1); 1238 buildTree_rec(Right, Depth + 1); 1239 return; 1240 } 1241 1242 for (unsigned i = 0, e = VL0->getNumOperands(); i < e; ++i) { 1243 ValueList Operands; 1244 // Prepare the operand vector. 1245 for (unsigned j = 0; j < VL.size(); ++j) 1246 Operands.push_back(cast<Instruction>(VL[j])->getOperand(i)); 1247 1248 buildTree_rec(Operands, Depth+1); 1249 } 1250 return; 1251 } 1252 case Instruction::GetElementPtr: { 1253 // We don't combine GEPs with complicated (nested) indexing. 1254 for (unsigned j = 0; j < VL.size(); ++j) { 1255 if (cast<Instruction>(VL[j])->getNumOperands() != 2) { 1256 DEBUG(dbgs() << "SLP: not-vectorizable GEP (nested indexes).\n"); 1257 BS.cancelScheduling(VL); 1258 newTreeEntry(VL, false); 1259 return; 1260 } 1261 } 1262 1263 // We can't combine several GEPs into one vector if they operate on 1264 // different types. 1265 Type *Ty0 = cast<Instruction>(VL0)->getOperand(0)->getType(); 1266 for (unsigned j = 0; j < VL.size(); ++j) { 1267 Type *CurTy = cast<Instruction>(VL[j])->getOperand(0)->getType(); 1268 if (Ty0 != CurTy) { 1269 DEBUG(dbgs() << "SLP: not-vectorizable GEP (different types).\n"); 1270 BS.cancelScheduling(VL); 1271 newTreeEntry(VL, false); 1272 return; 1273 } 1274 } 1275 1276 // We don't combine GEPs with non-constant indexes. 1277 for (unsigned j = 0; j < VL.size(); ++j) { 1278 auto Op = cast<Instruction>(VL[j])->getOperand(1); 1279 if (!isa<ConstantInt>(Op)) { 1280 DEBUG( 1281 dbgs() << "SLP: not-vectorizable GEP (non-constant indexes).\n"); 1282 BS.cancelScheduling(VL); 1283 newTreeEntry(VL, false); 1284 return; 1285 } 1286 } 1287 1288 newTreeEntry(VL, true); 1289 DEBUG(dbgs() << "SLP: added a vector of GEPs.\n"); 1290 for (unsigned i = 0, e = 2; i < e; ++i) { 1291 ValueList Operands; 1292 // Prepare the operand vector. 1293 for (unsigned j = 0; j < VL.size(); ++j) 1294 Operands.push_back(cast<Instruction>(VL[j])->getOperand(i)); 1295 1296 buildTree_rec(Operands, Depth + 1); 1297 } 1298 return; 1299 } 1300 case Instruction::Store: { 1301 // Check if the stores are consecutive or of we need to swizzle them. 
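    // Unlike the load case above, a reversed pair of stores is not recorded
    // for reordering; non-consecutive stores simply become a gather entry.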
1302 for (unsigned i = 0, e = VL.size() - 1; i < e; ++i) 1303 if (!isConsecutiveAccess(VL[i], VL[i + 1])) { 1304 BS.cancelScheduling(VL); 1305 newTreeEntry(VL, false); 1306 DEBUG(dbgs() << "SLP: Non-consecutive store.\n"); 1307 return; 1308 } 1309 1310 newTreeEntry(VL, true); 1311 DEBUG(dbgs() << "SLP: added a vector of stores.\n"); 1312 1313 ValueList Operands; 1314 for (unsigned j = 0; j < VL.size(); ++j) 1315 Operands.push_back(cast<Instruction>(VL[j])->getOperand(0)); 1316 1317 buildTree_rec(Operands, Depth + 1); 1318 return; 1319 } 1320 case Instruction::Call: { 1321 // Check if the calls are all to the same vectorizable intrinsic. 1322 CallInst *CI = cast<CallInst>(VL[0]); 1323 // Check if this is an Intrinsic call or something that can be 1324 // represented by an intrinsic call 1325 Intrinsic::ID ID = getIntrinsicIDForCall(CI, TLI); 1326 if (!isTriviallyVectorizable(ID)) { 1327 BS.cancelScheduling(VL); 1328 newTreeEntry(VL, false); 1329 DEBUG(dbgs() << "SLP: Non-vectorizable call.\n"); 1330 return; 1331 } 1332 Function *Int = CI->getCalledFunction(); 1333 Value *A1I = nullptr; 1334 if (hasVectorInstrinsicScalarOpd(ID, 1)) 1335 A1I = CI->getArgOperand(1); 1336 for (unsigned i = 1, e = VL.size(); i != e; ++i) { 1337 CallInst *CI2 = dyn_cast<CallInst>(VL[i]); 1338 if (!CI2 || CI2->getCalledFunction() != Int || 1339 getIntrinsicIDForCall(CI2, TLI) != ID) { 1340 BS.cancelScheduling(VL); 1341 newTreeEntry(VL, false); 1342 DEBUG(dbgs() << "SLP: mismatched calls:" << *CI << "!=" << *VL[i] 1343 << "\n"); 1344 return; 1345 } 1346 // ctlz,cttz and powi are special intrinsics whose second argument 1347 // should be same in order for them to be vectorized. 1348 if (hasVectorInstrinsicScalarOpd(ID, 1)) { 1349 Value *A1J = CI2->getArgOperand(1); 1350 if (A1I != A1J) { 1351 BS.cancelScheduling(VL); 1352 newTreeEntry(VL, false); 1353 DEBUG(dbgs() << "SLP: mismatched arguments in call:" << *CI 1354 << " argument "<< A1I<<"!=" << A1J 1355 << "\n"); 1356 return; 1357 } 1358 } 1359 } 1360 1361 newTreeEntry(VL, true); 1362 for (unsigned i = 0, e = CI->getNumArgOperands(); i != e; ++i) { 1363 ValueList Operands; 1364 // Prepare the operand vector. 1365 for (unsigned j = 0; j < VL.size(); ++j) { 1366 CallInst *CI2 = dyn_cast<CallInst>(VL[j]); 1367 Operands.push_back(CI2->getArgOperand(i)); 1368 } 1369 buildTree_rec(Operands, Depth + 1); 1370 } 1371 return; 1372 } 1373 case Instruction::ShuffleVector: { 1374 // If this is not an alternate sequence of opcode like add-sub 1375 // then do not vectorize this instruction. 1376 if (!isAltShuffle) { 1377 BS.cancelScheduling(VL); 1378 newTreeEntry(VL, false); 1379 DEBUG(dbgs() << "SLP: ShuffleVector are not vectorized.\n"); 1380 return; 1381 } 1382 newTreeEntry(VL, true); 1383 DEBUG(dbgs() << "SLP: added a ShuffleVector op.\n"); 1384 for (unsigned i = 0, e = VL0->getNumOperands(); i < e; ++i) { 1385 ValueList Operands; 1386 // Prepare the operand vector. 
1387 for (unsigned j = 0; j < VL.size(); ++j) 1388 Operands.push_back(cast<Instruction>(VL[j])->getOperand(i)); 1389 1390 buildTree_rec(Operands, Depth + 1); 1391 } 1392 return; 1393 } 1394 default: 1395 BS.cancelScheduling(VL); 1396 newTreeEntry(VL, false); 1397 DEBUG(dbgs() << "SLP: Gathering unknown instruction.\n"); 1398 return; 1399 } 1400 } 1401 1402 int BoUpSLP::getEntryCost(TreeEntry *E) { 1403 ArrayRef<Value*> VL = E->Scalars; 1404 1405 Type *ScalarTy = VL[0]->getType(); 1406 if (StoreInst *SI = dyn_cast<StoreInst>(VL[0])) 1407 ScalarTy = SI->getValueOperand()->getType(); 1408 VectorType *VecTy = VectorType::get(ScalarTy, VL.size()); 1409 1410 if (E->NeedToGather) { 1411 if (allConstant(VL)) 1412 return 0; 1413 if (isSplat(VL)) { 1414 return TTI->getShuffleCost(TargetTransformInfo::SK_Broadcast, VecTy, 0); 1415 } 1416 return getGatherCost(E->Scalars); 1417 } 1418 unsigned Opcode = getSameOpcode(VL); 1419 assert(Opcode && getSameType(VL) && getSameBlock(VL) && "Invalid VL"); 1420 Instruction *VL0 = cast<Instruction>(VL[0]); 1421 switch (Opcode) { 1422 case Instruction::PHI: { 1423 return 0; 1424 } 1425 case Instruction::ExtractElement: { 1426 if (CanReuseExtract(VL)) { 1427 int DeadCost = 0; 1428 for (unsigned i = 0, e = VL.size(); i < e; ++i) { 1429 ExtractElementInst *E = cast<ExtractElementInst>(VL[i]); 1430 if (E->hasOneUse()) 1431 // Take credit for instruction that will become dead. 1432 DeadCost += 1433 TTI->getVectorInstrCost(Instruction::ExtractElement, VecTy, i); 1434 } 1435 return -DeadCost; 1436 } 1437 return getGatherCost(VecTy); 1438 } 1439 case Instruction::ZExt: 1440 case Instruction::SExt: 1441 case Instruction::FPToUI: 1442 case Instruction::FPToSI: 1443 case Instruction::FPExt: 1444 case Instruction::PtrToInt: 1445 case Instruction::IntToPtr: 1446 case Instruction::SIToFP: 1447 case Instruction::UIToFP: 1448 case Instruction::Trunc: 1449 case Instruction::FPTrunc: 1450 case Instruction::BitCast: { 1451 Type *SrcTy = VL0->getOperand(0)->getType(); 1452 1453 // Calculate the cost of this instruction. 1454 int ScalarCost = VL.size() * TTI->getCastInstrCost(VL0->getOpcode(), 1455 VL0->getType(), SrcTy); 1456 1457 VectorType *SrcVecTy = VectorType::get(SrcTy, VL.size()); 1458 int VecCost = TTI->getCastInstrCost(VL0->getOpcode(), VecTy, SrcVecTy); 1459 return VecCost - ScalarCost; 1460 } 1461 case Instruction::FCmp: 1462 case Instruction::ICmp: 1463 case Instruction::Select: 1464 case Instruction::Add: 1465 case Instruction::FAdd: 1466 case Instruction::Sub: 1467 case Instruction::FSub: 1468 case Instruction::Mul: 1469 case Instruction::FMul: 1470 case Instruction::UDiv: 1471 case Instruction::SDiv: 1472 case Instruction::FDiv: 1473 case Instruction::URem: 1474 case Instruction::SRem: 1475 case Instruction::FRem: 1476 case Instruction::Shl: 1477 case Instruction::LShr: 1478 case Instruction::AShr: 1479 case Instruction::And: 1480 case Instruction::Or: 1481 case Instruction::Xor: { 1482 // Calculate the cost of this instruction. 1483 int ScalarCost = 0; 1484 int VecCost = 0; 1485 if (Opcode == Instruction::FCmp || Opcode == Instruction::ICmp || 1486 Opcode == Instruction::Select) { 1487 VectorType *MaskTy = VectorType::get(Builder.getInt1Ty(), VL.size()); 1488 ScalarCost = VecTy->getNumElements() * 1489 TTI->getCmpSelInstrCost(Opcode, ScalarTy, Builder.getInt1Ty()); 1490 VecCost = TTI->getCmpSelInstrCost(Opcode, VecTy, MaskTy); 1491 } else { 1492 // Certain instructions can be cheaper to vectorize if they have a 1493 // constant second vector operand. 
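      // For example, a shift where every lane uses the same constant amount
      // (x >> 3 in each lane) can often be lowered to an immediate-form
      // vector shift, while non-uniform or non-constant amounts are usually
      // costed higher; the operand kinds computed below let TTI make that
      // distinction.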
1494 TargetTransformInfo::OperandValueKind Op1VK = 1495 TargetTransformInfo::OK_AnyValue; 1496 TargetTransformInfo::OperandValueKind Op2VK = 1497 TargetTransformInfo::OK_UniformConstantValue; 1498 TargetTransformInfo::OperandValueProperties Op1VP = 1499 TargetTransformInfo::OP_None; 1500 TargetTransformInfo::OperandValueProperties Op2VP = 1501 TargetTransformInfo::OP_None; 1502 1503 // If all operands are exactly the same ConstantInt then set the 1504 // operand kind to OK_UniformConstantValue. 1505 // If instead not all operands are constants, then set the operand kind 1506 // to OK_AnyValue. If all operands are constants but not the same, 1507 // then set the operand kind to OK_NonUniformConstantValue. 1508 ConstantInt *CInt = nullptr; 1509 for (unsigned i = 0; i < VL.size(); ++i) { 1510 const Instruction *I = cast<Instruction>(VL[i]); 1511 if (!isa<ConstantInt>(I->getOperand(1))) { 1512 Op2VK = TargetTransformInfo::OK_AnyValue; 1513 break; 1514 } 1515 if (i == 0) { 1516 CInt = cast<ConstantInt>(I->getOperand(1)); 1517 continue; 1518 } 1519 if (Op2VK == TargetTransformInfo::OK_UniformConstantValue && 1520 CInt != cast<ConstantInt>(I->getOperand(1))) 1521 Op2VK = TargetTransformInfo::OK_NonUniformConstantValue; 1522 } 1523 // FIXME: Currently cost of model modification for division by 1524 // power of 2 is handled only for X86. Add support for other targets. 1525 if (Op2VK == TargetTransformInfo::OK_UniformConstantValue && CInt && 1526 CInt->getValue().isPowerOf2()) 1527 Op2VP = TargetTransformInfo::OP_PowerOf2; 1528 1529 ScalarCost = VecTy->getNumElements() * 1530 TTI->getArithmeticInstrCost(Opcode, ScalarTy, Op1VK, Op2VK, 1531 Op1VP, Op2VP); 1532 VecCost = TTI->getArithmeticInstrCost(Opcode, VecTy, Op1VK, Op2VK, 1533 Op1VP, Op2VP); 1534 } 1535 return VecCost - ScalarCost; 1536 } 1537 case Instruction::GetElementPtr: { 1538 TargetTransformInfo::OperandValueKind Op1VK = 1539 TargetTransformInfo::OK_AnyValue; 1540 TargetTransformInfo::OperandValueKind Op2VK = 1541 TargetTransformInfo::OK_UniformConstantValue; 1542 1543 int ScalarCost = 1544 VecTy->getNumElements() * 1545 TTI->getArithmeticInstrCost(Instruction::Add, ScalarTy, Op1VK, Op2VK); 1546 int VecCost = 1547 TTI->getArithmeticInstrCost(Instruction::Add, VecTy, Op1VK, Op2VK); 1548 1549 return VecCost - ScalarCost; 1550 } 1551 case Instruction::Load: { 1552 // Cost of wide load - cost of scalar loads. 1553 int ScalarLdCost = VecTy->getNumElements() * 1554 TTI->getMemoryOpCost(Instruction::Load, ScalarTy, 1, 0); 1555 int VecLdCost = TTI->getMemoryOpCost(Instruction::Load, VecTy, 1, 0); 1556 return VecLdCost - ScalarLdCost; 1557 } 1558 case Instruction::Store: { 1559 // We know that we can merge the stores. Calculate the cost. 1560 int ScalarStCost = VecTy->getNumElements() * 1561 TTI->getMemoryOpCost(Instruction::Store, ScalarTy, 1, 0); 1562 int VecStCost = TTI->getMemoryOpCost(Instruction::Store, VecTy, 1, 0); 1563 return VecStCost - ScalarStCost; 1564 } 1565 case Instruction::Call: { 1566 CallInst *CI = cast<CallInst>(VL0); 1567 Intrinsic::ID ID = getIntrinsicIDForCall(CI, TLI); 1568 1569 // Calculate the cost of the scalar and vector calls. 
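    // The scalar side is the per-lane intrinsic cost multiplied by the number
    // of lanes; the vector side queries the same intrinsic with each argument
    // type widened to a vector of the same number of elements.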
1570 SmallVector<Type*, 4> ScalarTys, VecTys; 1571 for (unsigned op = 0, opc = CI->getNumArgOperands(); op!= opc; ++op) { 1572 ScalarTys.push_back(CI->getArgOperand(op)->getType()); 1573 VecTys.push_back(VectorType::get(CI->getArgOperand(op)->getType(), 1574 VecTy->getNumElements())); 1575 } 1576 1577 int ScalarCallCost = VecTy->getNumElements() * 1578 TTI->getIntrinsicInstrCost(ID, ScalarTy, ScalarTys); 1579 1580 int VecCallCost = TTI->getIntrinsicInstrCost(ID, VecTy, VecTys); 1581 1582 DEBUG(dbgs() << "SLP: Call cost "<< VecCallCost - ScalarCallCost 1583 << " (" << VecCallCost << "-" << ScalarCallCost << ")" 1584 << " for " << *CI << "\n"); 1585 1586 return VecCallCost - ScalarCallCost; 1587 } 1588 case Instruction::ShuffleVector: { 1589 TargetTransformInfo::OperandValueKind Op1VK = 1590 TargetTransformInfo::OK_AnyValue; 1591 TargetTransformInfo::OperandValueKind Op2VK = 1592 TargetTransformInfo::OK_AnyValue; 1593 int ScalarCost = 0; 1594 int VecCost = 0; 1595 for (unsigned i = 0; i < VL.size(); ++i) { 1596 Instruction *I = cast<Instruction>(VL[i]); 1597 if (!I) 1598 break; 1599 ScalarCost += 1600 TTI->getArithmeticInstrCost(I->getOpcode(), ScalarTy, Op1VK, Op2VK); 1601 } 1602 // VecCost is equal to sum of the cost of creating 2 vectors 1603 // and the cost of creating shuffle. 1604 Instruction *I0 = cast<Instruction>(VL[0]); 1605 VecCost = 1606 TTI->getArithmeticInstrCost(I0->getOpcode(), VecTy, Op1VK, Op2VK); 1607 Instruction *I1 = cast<Instruction>(VL[1]); 1608 VecCost += 1609 TTI->getArithmeticInstrCost(I1->getOpcode(), VecTy, Op1VK, Op2VK); 1610 VecCost += 1611 TTI->getShuffleCost(TargetTransformInfo::SK_Alternate, VecTy, 0); 1612 return VecCost - ScalarCost; 1613 } 1614 default: 1615 llvm_unreachable("Unknown instruction"); 1616 } 1617 } 1618 1619 bool BoUpSLP::isFullyVectorizableTinyTree() { 1620 DEBUG(dbgs() << "SLP: Check whether the tree with height " << 1621 VectorizableTree.size() << " is fully vectorizable .\n"); 1622 1623 // We only handle trees of height 2. 1624 if (VectorizableTree.size() != 2) 1625 return false; 1626 1627 // Handle splat stores. 1628 if (!VectorizableTree[0].NeedToGather && isSplat(VectorizableTree[1].Scalars)) 1629 return true; 1630 1631 // Gathering cost would be too much for tiny trees. 1632 if (VectorizableTree[0].NeedToGather || VectorizableTree[1].NeedToGather) 1633 return false; 1634 1635 return true; 1636 } 1637 1638 int BoUpSLP::getSpillCost() { 1639 // Walk from the bottom of the tree to the top, tracking which values are 1640 // live. When we see a call instruction that is not part of our tree, 1641 // query TTI to see if there is a cost to keeping values live over it 1642 // (for example, if spills and fills are required). 1643 unsigned BundleWidth = VectorizableTree.front().Scalars.size(); 1644 int Cost = 0; 1645 1646 SmallPtrSet<Instruction*, 4> LiveValues; 1647 Instruction *PrevInst = nullptr; 1648 1649 for (unsigned N = 0; N < VectorizableTree.size(); ++N) { 1650 Instruction *Inst = dyn_cast<Instruction>(VectorizableTree[N].Scalars[0]); 1651 if (!Inst) 1652 continue; 1653 1654 if (!PrevInst) { 1655 PrevInst = Inst; 1656 continue; 1657 } 1658 1659 DEBUG( 1660 dbgs() << "SLP: #LV: " << LiveValues.size(); 1661 for (auto *X : LiveValues) 1662 dbgs() << " " << X->getName(); 1663 dbgs() << ", Looking at "; 1664 Inst->dump(); 1665 ); 1666 1667 // Update LiveValues. 
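    // PrevInst has been visited, so it no longer needs to be kept live; any of
    // its operands that are themselves tree scalars become live instead, until
    // the walk reaches their own tree entries.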
1668 LiveValues.erase(PrevInst); 1669 for (auto &J : PrevInst->operands()) { 1670 if (isa<Instruction>(&*J) && ScalarToTreeEntry.count(&*J)) 1671 LiveValues.insert(cast<Instruction>(&*J)); 1672 } 1673 1674 // Now find the sequence of instructions between PrevInst and Inst. 1675 BasicBlock::reverse_iterator InstIt(Inst), PrevInstIt(PrevInst); 1676 --PrevInstIt; 1677 while (InstIt != PrevInstIt) { 1678 if (PrevInstIt == PrevInst->getParent()->rend()) { 1679 PrevInstIt = Inst->getParent()->rbegin(); 1680 continue; 1681 } 1682 1683 if (isa<CallInst>(&*PrevInstIt) && &*PrevInstIt != PrevInst) { 1684 SmallVector<Type*, 4> V; 1685 for (auto *II : LiveValues) 1686 V.push_back(VectorType::get(II->getType(), BundleWidth)); 1687 Cost += TTI->getCostOfKeepingLiveOverCall(V); 1688 } 1689 1690 ++PrevInstIt; 1691 } 1692 1693 PrevInst = Inst; 1694 } 1695 1696 DEBUG(dbgs() << "SLP: SpillCost=" << Cost << "\n"); 1697 return Cost; 1698 } 1699 1700 int BoUpSLP::getTreeCost() { 1701 int Cost = 0; 1702 DEBUG(dbgs() << "SLP: Calculating cost for tree of size " << 1703 VectorizableTree.size() << ".\n"); 1704 1705 // We only vectorize tiny trees if it is fully vectorizable. 1706 if (VectorizableTree.size() < 3 && !isFullyVectorizableTinyTree()) { 1707 if (!VectorizableTree.size()) { 1708 assert(!ExternalUses.size() && "We should not have any external users"); 1709 } 1710 return INT_MAX; 1711 } 1712 1713 unsigned BundleWidth = VectorizableTree[0].Scalars.size(); 1714 1715 for (unsigned i = 0, e = VectorizableTree.size(); i != e; ++i) { 1716 int C = getEntryCost(&VectorizableTree[i]); 1717 DEBUG(dbgs() << "SLP: Adding cost " << C << " for bundle that starts with " 1718 << *VectorizableTree[i].Scalars[0] << " .\n"); 1719 Cost += C; 1720 } 1721 1722 SmallSet<Value *, 16> ExtractCostCalculated; 1723 int ExtractCost = 0; 1724 for (UserList::iterator I = ExternalUses.begin(), E = ExternalUses.end(); 1725 I != E; ++I) { 1726 // We only add extract cost once for the same scalar. 1727 if (!ExtractCostCalculated.insert(I->Scalar)) 1728 continue; 1729 1730 // Uses by ephemeral values are free (because the ephemeral value will be 1731 // removed prior to code generation, and so the extraction will be 1732 // removed as well). 1733 if (EphValues.count(I->User)) 1734 continue; 1735 1736 VectorType *VecTy = VectorType::get(I->Scalar->getType(), BundleWidth); 1737 ExtractCost += TTI->getVectorInstrCost(Instruction::ExtractElement, VecTy, 1738 I->Lane); 1739 } 1740 1741 Cost += getSpillCost(); 1742 1743 DEBUG(dbgs() << "SLP: Total Cost " << Cost + ExtractCost<< ".\n"); 1744 return Cost + ExtractCost; 1745 } 1746 1747 int BoUpSLP::getGatherCost(Type *Ty) { 1748 int Cost = 0; 1749 for (unsigned i = 0, e = cast<VectorType>(Ty)->getNumElements(); i < e; ++i) 1750 Cost += TTI->getVectorInstrCost(Instruction::InsertElement, Ty, i); 1751 return Cost; 1752 } 1753 1754 int BoUpSLP::getGatherCost(ArrayRef<Value *> VL) { 1755 // Find the type of the operands in VL. 1756 Type *ScalarTy = VL[0]->getType(); 1757 if (StoreInst *SI = dyn_cast<StoreInst>(VL[0])) 1758 ScalarTy = SI->getValueOperand()->getType(); 1759 VectorType *VecTy = VectorType::get(ScalarTy, VL.size()); 1760 // Find the cost of inserting/extracting values from the vector. 
1761 return getGatherCost(VecTy); 1762 } 1763 1764 Value *BoUpSLP::getPointerOperand(Value *I) { 1765 if (LoadInst *LI = dyn_cast<LoadInst>(I)) 1766 return LI->getPointerOperand(); 1767 if (StoreInst *SI = dyn_cast<StoreInst>(I)) 1768 return SI->getPointerOperand(); 1769 return nullptr; 1770 } 1771 1772 unsigned BoUpSLP::getAddressSpaceOperand(Value *I) { 1773 if (LoadInst *L = dyn_cast<LoadInst>(I)) 1774 return L->getPointerAddressSpace(); 1775 if (StoreInst *S = dyn_cast<StoreInst>(I)) 1776 return S->getPointerAddressSpace(); 1777 return -1; 1778 } 1779 1780 bool BoUpSLP::isConsecutiveAccess(Value *A, Value *B) { 1781 Value *PtrA = getPointerOperand(A); 1782 Value *PtrB = getPointerOperand(B); 1783 unsigned ASA = getAddressSpaceOperand(A); 1784 unsigned ASB = getAddressSpaceOperand(B); 1785 1786 // Check that the address spaces match and that the pointers are valid. 1787 if (!PtrA || !PtrB || (ASA != ASB)) 1788 return false; 1789 1790 // Make sure that A and B are different pointers of the same type. 1791 if (PtrA == PtrB || PtrA->getType() != PtrB->getType()) 1792 return false; 1793 1794 unsigned PtrBitWidth = DL->getPointerSizeInBits(ASA); 1795 Type *Ty = cast<PointerType>(PtrA->getType())->getElementType(); 1796 APInt Size(PtrBitWidth, DL->getTypeStoreSize(Ty)); 1797 1798 APInt OffsetA(PtrBitWidth, 0), OffsetB(PtrBitWidth, 0); 1799 PtrA = PtrA->stripAndAccumulateInBoundsConstantOffsets(*DL, OffsetA); 1800 PtrB = PtrB->stripAndAccumulateInBoundsConstantOffsets(*DL, OffsetB); 1801 1802 APInt OffsetDelta = OffsetB - OffsetA; 1803 1804 // Check if they are based on the same pointer. That makes the offsets 1805 // sufficient. 1806 if (PtrA == PtrB) 1807 return OffsetDelta == Size; 1808 1809 // Compute the necessary base pointer delta to have the necessary final delta 1810 // equal to the size. 1811 APInt BaseDelta = Size - OffsetDelta; 1812 1813 // Otherwise compute the distance with SCEV between the base pointers. 1814 const SCEV *PtrSCEVA = SE->getSCEV(PtrA); 1815 const SCEV *PtrSCEVB = SE->getSCEV(PtrB); 1816 const SCEV *C = SE->getConstant(BaseDelta); 1817 const SCEV *X = SE->getAddExpr(PtrSCEVA, C); 1818 return X == PtrSCEVB; 1819 } 1820 1821 void BoUpSLP::setInsertPointAfterBundle(ArrayRef<Value *> VL) { 1822 Instruction *VL0 = cast<Instruction>(VL[0]); 1823 BasicBlock::iterator NextInst = VL0; 1824 ++NextInst; 1825 Builder.SetInsertPoint(VL0->getParent(), NextInst); 1826 Builder.SetCurrentDebugLocation(VL0->getDebugLoc()); 1827 } 1828 1829 Value *BoUpSLP::Gather(ArrayRef<Value *> VL, VectorType *Ty) { 1830 Value *Vec = UndefValue::get(Ty); 1831 // Generate the 'InsertElement' instruction. 1832 for (unsigned i = 0; i < Ty->getNumElements(); ++i) { 1833 Vec = Builder.CreateInsertElement(Vec, VL[i], Builder.getInt32(i)); 1834 if (Instruction *Insrt = dyn_cast<Instruction>(Vec)) { 1835 GatherSeq.insert(Insrt); 1836 CSEBlocks.insert(Insrt->getParent()); 1837 1838 // Add to our 'need-to-extract' list. 1839 if (ScalarToTreeEntry.count(VL[i])) { 1840 int Idx = ScalarToTreeEntry[VL[i]]; 1841 TreeEntry *E = &VectorizableTree[Idx]; 1842 // Find which lane we need to extract. 1843 int FoundLane = -1; 1844 for (unsigned Lane = 0, LE = VL.size(); Lane != LE; ++Lane) { 1845 // Is this the lane of the scalar that we are looking for ? 
1846 if (E->Scalars[Lane] == VL[i]) { 1847 FoundLane = Lane; 1848 break; 1849 } 1850 } 1851 assert(FoundLane >= 0 && "Could not find the correct lane"); 1852 ExternalUses.push_back(ExternalUser(VL[i], Insrt, FoundLane)); 1853 } 1854 } 1855 } 1856 1857 return Vec; 1858 } 1859 1860 Value *BoUpSLP::alreadyVectorized(ArrayRef<Value *> VL) const { 1861 SmallDenseMap<Value*, int>::const_iterator Entry 1862 = ScalarToTreeEntry.find(VL[0]); 1863 if (Entry != ScalarToTreeEntry.end()) { 1864 int Idx = Entry->second; 1865 const TreeEntry *En = &VectorizableTree[Idx]; 1866 if (En->isSame(VL) && En->VectorizedValue) 1867 return En->VectorizedValue; 1868 } 1869 return nullptr; 1870 } 1871 1872 Value *BoUpSLP::vectorizeTree(ArrayRef<Value *> VL) { 1873 if (ScalarToTreeEntry.count(VL[0])) { 1874 int Idx = ScalarToTreeEntry[VL[0]]; 1875 TreeEntry *E = &VectorizableTree[Idx]; 1876 if (E->isSame(VL)) 1877 return vectorizeTree(E); 1878 } 1879 1880 Type *ScalarTy = VL[0]->getType(); 1881 if (StoreInst *SI = dyn_cast<StoreInst>(VL[0])) 1882 ScalarTy = SI->getValueOperand()->getType(); 1883 VectorType *VecTy = VectorType::get(ScalarTy, VL.size()); 1884 1885 return Gather(VL, VecTy); 1886 } 1887 1888 Value *BoUpSLP::vectorizeTree(TreeEntry *E) { 1889 IRBuilder<>::InsertPointGuard Guard(Builder); 1890 1891 if (E->VectorizedValue) { 1892 DEBUG(dbgs() << "SLP: Diamond merged for " << *E->Scalars[0] << ".\n"); 1893 return E->VectorizedValue; 1894 } 1895 1896 Instruction *VL0 = cast<Instruction>(E->Scalars[0]); 1897 Type *ScalarTy = VL0->getType(); 1898 if (StoreInst *SI = dyn_cast<StoreInst>(VL0)) 1899 ScalarTy = SI->getValueOperand()->getType(); 1900 VectorType *VecTy = VectorType::get(ScalarTy, E->Scalars.size()); 1901 1902 if (E->NeedToGather) { 1903 setInsertPointAfterBundle(E->Scalars); 1904 return Gather(E->Scalars, VecTy); 1905 } 1906 1907 unsigned Opcode = getSameOpcode(E->Scalars); 1908 1909 switch (Opcode) { 1910 case Instruction::PHI: { 1911 PHINode *PH = dyn_cast<PHINode>(VL0); 1912 Builder.SetInsertPoint(PH->getParent()->getFirstNonPHI()); 1913 Builder.SetCurrentDebugLocation(PH->getDebugLoc()); 1914 PHINode *NewPhi = Builder.CreatePHI(VecTy, PH->getNumIncomingValues()); 1915 E->VectorizedValue = NewPhi; 1916 1917 // PHINodes may have multiple entries from the same block. We want to 1918 // visit every block once. 1919 SmallSet<BasicBlock*, 4> VisitedBBs; 1920 1921 for (unsigned i = 0, e = PH->getNumIncomingValues(); i < e; ++i) { 1922 ValueList Operands; 1923 BasicBlock *IBB = PH->getIncomingBlock(i); 1924 1925 if (!VisitedBBs.insert(IBB)) { 1926 NewPhi->addIncoming(NewPhi->getIncomingValueForBlock(IBB), IBB); 1927 continue; 1928 } 1929 1930 // Prepare the operand vector. 
1931 for (unsigned j = 0; j < E->Scalars.size(); ++j) 1932 Operands.push_back(cast<PHINode>(E->Scalars[j])-> 1933 getIncomingValueForBlock(IBB)); 1934 1935 Builder.SetInsertPoint(IBB->getTerminator()); 1936 Builder.SetCurrentDebugLocation(PH->getDebugLoc()); 1937 Value *Vec = vectorizeTree(Operands); 1938 NewPhi->addIncoming(Vec, IBB); 1939 } 1940 1941 assert(NewPhi->getNumIncomingValues() == PH->getNumIncomingValues() && 1942 "Invalid number of incoming values"); 1943 return NewPhi; 1944 } 1945 1946 case Instruction::ExtractElement: { 1947 if (CanReuseExtract(E->Scalars)) { 1948 Value *V = VL0->getOperand(0); 1949 E->VectorizedValue = V; 1950 return V; 1951 } 1952 return Gather(E->Scalars, VecTy); 1953 } 1954 case Instruction::ZExt: 1955 case Instruction::SExt: 1956 case Instruction::FPToUI: 1957 case Instruction::FPToSI: 1958 case Instruction::FPExt: 1959 case Instruction::PtrToInt: 1960 case Instruction::IntToPtr: 1961 case Instruction::SIToFP: 1962 case Instruction::UIToFP: 1963 case Instruction::Trunc: 1964 case Instruction::FPTrunc: 1965 case Instruction::BitCast: { 1966 ValueList INVL; 1967 for (int i = 0, e = E->Scalars.size(); i < e; ++i) 1968 INVL.push_back(cast<Instruction>(E->Scalars[i])->getOperand(0)); 1969 1970 setInsertPointAfterBundle(E->Scalars); 1971 1972 Value *InVec = vectorizeTree(INVL); 1973 1974 if (Value *V = alreadyVectorized(E->Scalars)) 1975 return V; 1976 1977 CastInst *CI = dyn_cast<CastInst>(VL0); 1978 Value *V = Builder.CreateCast(CI->getOpcode(), InVec, VecTy); 1979 E->VectorizedValue = V; 1980 ++NumVectorInstructions; 1981 return V; 1982 } 1983 case Instruction::FCmp: 1984 case Instruction::ICmp: { 1985 ValueList LHSV, RHSV; 1986 for (int i = 0, e = E->Scalars.size(); i < e; ++i) { 1987 LHSV.push_back(cast<Instruction>(E->Scalars[i])->getOperand(0)); 1988 RHSV.push_back(cast<Instruction>(E->Scalars[i])->getOperand(1)); 1989 } 1990 1991 setInsertPointAfterBundle(E->Scalars); 1992 1993 Value *L = vectorizeTree(LHSV); 1994 Value *R = vectorizeTree(RHSV); 1995 1996 if (Value *V = alreadyVectorized(E->Scalars)) 1997 return V; 1998 1999 CmpInst::Predicate P0 = dyn_cast<CmpInst>(VL0)->getPredicate(); 2000 Value *V; 2001 if (Opcode == Instruction::FCmp) 2002 V = Builder.CreateFCmp(P0, L, R); 2003 else 2004 V = Builder.CreateICmp(P0, L, R); 2005 2006 E->VectorizedValue = V; 2007 ++NumVectorInstructions; 2008 return V; 2009 } 2010 case Instruction::Select: { 2011 ValueList TrueVec, FalseVec, CondVec; 2012 for (int i = 0, e = E->Scalars.size(); i < e; ++i) { 2013 CondVec.push_back(cast<Instruction>(E->Scalars[i])->getOperand(0)); 2014 TrueVec.push_back(cast<Instruction>(E->Scalars[i])->getOperand(1)); 2015 FalseVec.push_back(cast<Instruction>(E->Scalars[i])->getOperand(2)); 2016 } 2017 2018 setInsertPointAfterBundle(E->Scalars); 2019 2020 Value *Cond = vectorizeTree(CondVec); 2021 Value *True = vectorizeTree(TrueVec); 2022 Value *False = vectorizeTree(FalseVec); 2023 2024 if (Value *V = alreadyVectorized(E->Scalars)) 2025 return V; 2026 2027 Value *V = Builder.CreateSelect(Cond, True, False); 2028 E->VectorizedValue = V; 2029 ++NumVectorInstructions; 2030 return V; 2031 } 2032 case Instruction::Add: 2033 case Instruction::FAdd: 2034 case Instruction::Sub: 2035 case Instruction::FSub: 2036 case Instruction::Mul: 2037 case Instruction::FMul: 2038 case Instruction::UDiv: 2039 case Instruction::SDiv: 2040 case Instruction::FDiv: 2041 case Instruction::URem: 2042 case Instruction::SRem: 2043 case Instruction::FRem: 2044 case Instruction::Shl: 2045 case Instruction::LShr: 
2046 case Instruction::AShr: 2047 case Instruction::And: 2048 case Instruction::Or: 2049 case Instruction::Xor: { 2050 ValueList LHSVL, RHSVL; 2051 if (isa<BinaryOperator>(VL0) && VL0->isCommutative()) 2052 reorderInputsAccordingToOpcode(E->Scalars, LHSVL, RHSVL); 2053 else 2054 for (int i = 0, e = E->Scalars.size(); i < e; ++i) { 2055 LHSVL.push_back(cast<Instruction>(E->Scalars[i])->getOperand(0)); 2056 RHSVL.push_back(cast<Instruction>(E->Scalars[i])->getOperand(1)); 2057 } 2058 2059 setInsertPointAfterBundle(E->Scalars); 2060 2061 Value *LHS = vectorizeTree(LHSVL); 2062 Value *RHS = vectorizeTree(RHSVL); 2063 2064 if (LHS == RHS && isa<Instruction>(LHS)) { 2065 assert((VL0->getOperand(0) == VL0->getOperand(1)) && "Invalid order"); 2066 } 2067 2068 if (Value *V = alreadyVectorized(E->Scalars)) 2069 return V; 2070 2071 BinaryOperator *BinOp = cast<BinaryOperator>(VL0); 2072 Value *V = Builder.CreateBinOp(BinOp->getOpcode(), LHS, RHS); 2073 E->VectorizedValue = V; 2074 propagateIRFlags(E->VectorizedValue, E->Scalars); 2075 ++NumVectorInstructions; 2076 2077 if (Instruction *I = dyn_cast<Instruction>(V)) 2078 return propagateMetadata(I, E->Scalars); 2079 2080 return V; 2081 } 2082 case Instruction::Load: { 2083 // Loads are inserted at the head of the tree because we don't want to 2084 // sink them all the way down past store instructions. 2085 setInsertPointAfterBundle(E->Scalars); 2086 2087 LoadInst *LI = cast<LoadInst>(VL0); 2088 Type *ScalarLoadTy = LI->getType(); 2089 unsigned AS = LI->getPointerAddressSpace(); 2090 2091 Value *VecPtr = Builder.CreateBitCast(LI->getPointerOperand(), 2092 VecTy->getPointerTo(AS)); 2093 2094 // The pointer operand uses an in-tree scalar so we add the new BitCast to 2095 // ExternalUses list to make sure that an extract will be generated in the 2096 // future. 2097 if (ScalarToTreeEntry.count(LI->getPointerOperand())) 2098 ExternalUses.push_back( 2099 ExternalUser(LI->getPointerOperand(), cast<User>(VecPtr), 0)); 2100 2101 unsigned Alignment = LI->getAlignment(); 2102 LI = Builder.CreateLoad(VecPtr); 2103 if (!Alignment) 2104 Alignment = DL->getABITypeAlignment(ScalarLoadTy); 2105 LI->setAlignment(Alignment); 2106 E->VectorizedValue = LI; 2107 ++NumVectorInstructions; 2108 return propagateMetadata(LI, E->Scalars); 2109 } 2110 case Instruction::Store: { 2111 StoreInst *SI = cast<StoreInst>(VL0); 2112 unsigned Alignment = SI->getAlignment(); 2113 unsigned AS = SI->getPointerAddressSpace(); 2114 2115 ValueList ValueOp; 2116 for (int i = 0, e = E->Scalars.size(); i < e; ++i) 2117 ValueOp.push_back(cast<StoreInst>(E->Scalars[i])->getValueOperand()); 2118 2119 setInsertPointAfterBundle(E->Scalars); 2120 2121 Value *VecValue = vectorizeTree(ValueOp); 2122 Value *VecPtr = Builder.CreateBitCast(SI->getPointerOperand(), 2123 VecTy->getPointerTo(AS)); 2124 StoreInst *S = Builder.CreateStore(VecValue, VecPtr); 2125 2126 // The pointer operand uses an in-tree scalar so we add the new BitCast to 2127 // ExternalUses list to make sure that an extract will be generated in the 2128 // future. 
2129 if (ScalarToTreeEntry.count(SI->getPointerOperand())) 2130 ExternalUses.push_back( 2131 ExternalUser(SI->getPointerOperand(), cast<User>(VecPtr), 0)); 2132 2133 if (!Alignment) 2134 Alignment = DL->getABITypeAlignment(SI->getValueOperand()->getType()); 2135 S->setAlignment(Alignment); 2136 E->VectorizedValue = S; 2137 ++NumVectorInstructions; 2138 return propagateMetadata(S, E->Scalars); 2139 } 2140 case Instruction::GetElementPtr: { 2141 setInsertPointAfterBundle(E->Scalars); 2142 2143 ValueList Op0VL; 2144 for (int i = 0, e = E->Scalars.size(); i < e; ++i) 2145 Op0VL.push_back(cast<GetElementPtrInst>(E->Scalars[i])->getOperand(0)); 2146 2147 Value *Op0 = vectorizeTree(Op0VL); 2148 2149 std::vector<Value *> OpVecs; 2150 for (int j = 1, e = cast<GetElementPtrInst>(VL0)->getNumOperands(); j < e; 2151 ++j) { 2152 ValueList OpVL; 2153 for (int i = 0, e = E->Scalars.size(); i < e; ++i) 2154 OpVL.push_back(cast<GetElementPtrInst>(E->Scalars[i])->getOperand(j)); 2155 2156 Value *OpVec = vectorizeTree(OpVL); 2157 OpVecs.push_back(OpVec); 2158 } 2159 2160 Value *V = Builder.CreateGEP(Op0, OpVecs); 2161 E->VectorizedValue = V; 2162 ++NumVectorInstructions; 2163 2164 if (Instruction *I = dyn_cast<Instruction>(V)) 2165 return propagateMetadata(I, E->Scalars); 2166 2167 return V; 2168 } 2169 case Instruction::Call: { 2170 CallInst *CI = cast<CallInst>(VL0); 2171 setInsertPointAfterBundle(E->Scalars); 2172 Function *FI; 2173 Intrinsic::ID IID = Intrinsic::not_intrinsic; 2174 Value *ScalarArg = nullptr; 2175 if (CI && (FI = CI->getCalledFunction())) { 2176 IID = (Intrinsic::ID) FI->getIntrinsicID(); 2177 } 2178 std::vector<Value *> OpVecs; 2179 for (int j = 0, e = CI->getNumArgOperands(); j < e; ++j) { 2180 ValueList OpVL; 2181 // ctlz,cttz and powi are special intrinsics whose second argument is 2182 // a scalar. This argument should not be vectorized. 2183 if (hasVectorInstrinsicScalarOpd(IID, 1) && j == 1) { 2184 CallInst *CEI = cast<CallInst>(E->Scalars[0]); 2185 ScalarArg = CEI->getArgOperand(j); 2186 OpVecs.push_back(CEI->getArgOperand(j)); 2187 continue; 2188 } 2189 for (int i = 0, e = E->Scalars.size(); i < e; ++i) { 2190 CallInst *CEI = cast<CallInst>(E->Scalars[i]); 2191 OpVL.push_back(CEI->getArgOperand(j)); 2192 } 2193 2194 Value *OpVec = vectorizeTree(OpVL); 2195 DEBUG(dbgs() << "SLP: OpVec[" << j << "]: " << *OpVec << "\n"); 2196 OpVecs.push_back(OpVec); 2197 } 2198 2199 Module *M = F->getParent(); 2200 Intrinsic::ID ID = getIntrinsicIDForCall(CI, TLI); 2201 Type *Tys[] = { VectorType::get(CI->getType(), E->Scalars.size()) }; 2202 Function *CF = Intrinsic::getDeclaration(M, ID, Tys); 2203 Value *V = Builder.CreateCall(CF, OpVecs); 2204 2205 // The scalar argument uses an in-tree scalar so we add the new vectorized 2206 // call to ExternalUses list to make sure that an extract will be 2207 // generated in the future. 
2208 if (ScalarArg && ScalarToTreeEntry.count(ScalarArg)) 2209 ExternalUses.push_back(ExternalUser(ScalarArg, cast<User>(V), 0)); 2210 2211 E->VectorizedValue = V; 2212 ++NumVectorInstructions; 2213 return V; 2214 } 2215 case Instruction::ShuffleVector: { 2216 ValueList LHSVL, RHSVL; 2217 for (int i = 0, e = E->Scalars.size(); i < e; ++i) { 2218 LHSVL.push_back(cast<Instruction>(E->Scalars[i])->getOperand(0)); 2219 RHSVL.push_back(cast<Instruction>(E->Scalars[i])->getOperand(1)); 2220 } 2221 setInsertPointAfterBundle(E->Scalars); 2222 2223 Value *LHS = vectorizeTree(LHSVL); 2224 Value *RHS = vectorizeTree(RHSVL); 2225 2226 if (Value *V = alreadyVectorized(E->Scalars)) 2227 return V; 2228 2229 // Create a vector of LHS op1 RHS 2230 BinaryOperator *BinOp0 = cast<BinaryOperator>(VL0); 2231 Value *V0 = Builder.CreateBinOp(BinOp0->getOpcode(), LHS, RHS); 2232 2233 // Create a vector of LHS op2 RHS 2234 Instruction *VL1 = cast<Instruction>(E->Scalars[1]); 2235 BinaryOperator *BinOp1 = cast<BinaryOperator>(VL1); 2236 Value *V1 = Builder.CreateBinOp(BinOp1->getOpcode(), LHS, RHS); 2237 2238 // Create shuffle to take alternate operations from the vector. 2239 // Also, gather up odd and even scalar ops to propagate IR flags to 2240 // each vector operation. 2241 ValueList OddScalars, EvenScalars; 2242 unsigned e = E->Scalars.size(); 2243 SmallVector<Constant *, 8> Mask(e); 2244 for (unsigned i = 0; i < e; ++i) { 2245 if (i & 1) { 2246 Mask[i] = Builder.getInt32(e + i); 2247 OddScalars.push_back(E->Scalars[i]); 2248 } else { 2249 Mask[i] = Builder.getInt32(i); 2250 EvenScalars.push_back(E->Scalars[i]); 2251 } 2252 } 2253 2254 Value *ShuffleMask = ConstantVector::get(Mask); 2255 propagateIRFlags(V0, EvenScalars); 2256 propagateIRFlags(V1, OddScalars); 2257 2258 Value *V = Builder.CreateShuffleVector(V0, V1, ShuffleMask); 2259 E->VectorizedValue = V; 2260 ++NumVectorInstructions; 2261 if (Instruction *I = dyn_cast<Instruction>(V)) 2262 return propagateMetadata(I, E->Scalars); 2263 2264 return V; 2265 } 2266 default: 2267 llvm_unreachable("unknown inst"); 2268 } 2269 return nullptr; 2270 } 2271 2272 Value *BoUpSLP::vectorizeTree() { 2273 2274 // All blocks must be scheduled before any instructions are inserted. 2275 for (auto &BSIter : BlocksSchedules) { 2276 scheduleBlock(BSIter.second.get()); 2277 } 2278 2279 Builder.SetInsertPoint(F->getEntryBlock().begin()); 2280 vectorizeTree(&VectorizableTree[0]); 2281 2282 DEBUG(dbgs() << "SLP: Extracting " << ExternalUses.size() << " values .\n"); 2283 2284 // Extract all of the elements with the external uses. 2285 for (UserList::iterator it = ExternalUses.begin(), e = ExternalUses.end(); 2286 it != e; ++it) { 2287 Value *Scalar = it->Scalar; 2288 llvm::User *User = it->User; 2289 2290 // Skip users that we already RAUW. This happens when one instruction 2291 // has multiple uses of the same value. 2292 if (std::find(Scalar->user_begin(), Scalar->user_end(), User) == 2293 Scalar->user_end()) 2294 continue; 2295 assert(ScalarToTreeEntry.count(Scalar) && "Invalid scalar"); 2296 2297 int Idx = ScalarToTreeEntry[Scalar]; 2298 TreeEntry *E = &VectorizableTree[Idx]; 2299 assert(!E->NeedToGather && "Extracting from a gather list"); 2300 2301 Value *Vec = E->VectorizedValue; 2302 assert(Vec && "Can't find vectorizable value"); 2303 2304 Value *Lane = Builder.getInt32(it->Lane); 2305 // Generate extracts for out-of-tree users. 2306 // Find the insertion point for the extractelement lane. 
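// Place each extract next to its user: before the terminator of the matching
// incoming block for PHI users, and immediately before any other user. If the
// vectorized value is not an instruction, the extract goes into the entry block.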
2307 if (isa<Instruction>(Vec)){ 2308 if (PHINode *PH = dyn_cast<PHINode>(User)) { 2309 for (int i = 0, e = PH->getNumIncomingValues(); i != e; ++i) { 2310 if (PH->getIncomingValue(i) == Scalar) { 2311 Builder.SetInsertPoint(PH->getIncomingBlock(i)->getTerminator()); 2312 Value *Ex = Builder.CreateExtractElement(Vec, Lane); 2313 CSEBlocks.insert(PH->getIncomingBlock(i)); 2314 PH->setOperand(i, Ex); 2315 } 2316 } 2317 } else { 2318 Builder.SetInsertPoint(cast<Instruction>(User)); 2319 Value *Ex = Builder.CreateExtractElement(Vec, Lane); 2320 CSEBlocks.insert(cast<Instruction>(User)->getParent()); 2321 User->replaceUsesOfWith(Scalar, Ex); 2322 } 2323 } else { 2324 Builder.SetInsertPoint(F->getEntryBlock().begin()); 2325 Value *Ex = Builder.CreateExtractElement(Vec, Lane); 2326 CSEBlocks.insert(&F->getEntryBlock()); 2327 User->replaceUsesOfWith(Scalar, Ex); 2328 } 2329 2330 DEBUG(dbgs() << "SLP: Replaced:" << *User << ".\n"); 2331 } 2332 2333 // For each vectorized value: 2334 for (int EIdx = 0, EE = VectorizableTree.size(); EIdx < EE; ++EIdx) { 2335 TreeEntry *Entry = &VectorizableTree[EIdx]; 2336 2337 // For each lane: 2338 for (int Lane = 0, LE = Entry->Scalars.size(); Lane != LE; ++Lane) { 2339 Value *Scalar = Entry->Scalars[Lane]; 2340 // No need to handle users of gathered values. 2341 if (Entry->NeedToGather) 2342 continue; 2343 2344 assert(Entry->VectorizedValue && "Can't find vectorizable value"); 2345 2346 Type *Ty = Scalar->getType(); 2347 if (!Ty->isVoidTy()) { 2348 #ifndef NDEBUG 2349 for (User *U : Scalar->users()) { 2350 DEBUG(dbgs() << "SLP: \tvalidating user:" << *U << ".\n"); 2351 2352 assert((ScalarToTreeEntry.count(U) || 2353 // It is legal to replace users in the ignorelist by undef. 2354 (std::find(UserIgnoreList.begin(), UserIgnoreList.end(), U) != 2355 UserIgnoreList.end())) && 2356 "Replacing out-of-tree value with undef"); 2357 } 2358 #endif 2359 Value *Undef = UndefValue::get(Ty); 2360 Scalar->replaceAllUsesWith(Undef); 2361 } 2362 DEBUG(dbgs() << "SLP: \tErasing scalar:" << *Scalar << ".\n"); 2363 cast<Instruction>(Scalar)->eraseFromParent(); 2364 } 2365 } 2366 2367 Builder.ClearInsertionPoint(); 2368 2369 return VectorizableTree[0].VectorizedValue; 2370 } 2371 2372 void BoUpSLP::optimizeGatherSequence() { 2373 DEBUG(dbgs() << "SLP: Optimizing " << GatherSeq.size() 2374 << " gather sequences instructions.\n"); 2375 // LICM InsertElementInst sequences. 2376 for (SetVector<Instruction *>::iterator it = GatherSeq.begin(), 2377 e = GatherSeq.end(); it != e; ++it) { 2378 InsertElementInst *Insert = dyn_cast<InsertElementInst>(*it); 2379 2380 if (!Insert) 2381 continue; 2382 2383 // Check if this block is inside a loop. 2384 Loop *L = LI->getLoopFor(Insert->getParent()); 2385 if (!L) 2386 continue; 2387 2388 // Check if it has a preheader. 2389 BasicBlock *PreHeader = L->getLoopPreheader(); 2390 if (!PreHeader) 2391 continue; 2392 2393 // If the vector or the element that we insert into it are 2394 // instructions that are defined in this basic block then we can't 2395 // hoist this instruction. 2396 Instruction *CurrVec = dyn_cast<Instruction>(Insert->getOperand(0)); 2397 Instruction *NewElem = dyn_cast<Instruction>(Insert->getOperand(1)); 2398 if (CurrVec && L->contains(CurrVec)) 2399 continue; 2400 if (NewElem && L->contains(NewElem)) 2401 continue; 2402 2403 // We can hoist this instruction. Move it to the pre-header. 2404 Insert->moveBefore(PreHeader->getTerminator()); 2405 } 2406 2407 // Make a list of all reachable blocks in our CSE queue. 
2408 SmallVector<const DomTreeNode *, 8> CSEWorkList; 2409 CSEWorkList.reserve(CSEBlocks.size()); 2410 for (BasicBlock *BB : CSEBlocks) 2411 if (DomTreeNode *N = DT->getNode(BB)) { 2412 assert(DT->isReachableFromEntry(N)); 2413 CSEWorkList.push_back(N); 2414 } 2415 2416 // Sort blocks by domination. This ensures we visit a block after all blocks 2417 // dominating it are visited. 2418 std::stable_sort(CSEWorkList.begin(), CSEWorkList.end(), 2419 [this](const DomTreeNode *A, const DomTreeNode *B) { 2420 return DT->properlyDominates(A, B); 2421 }); 2422 2423 // Perform O(N^2) search over the gather sequences and merge identical 2424 // instructions. TODO: We can further optimize this scan if we split the 2425 // instructions into different buckets based on the insert lane. 2426 SmallVector<Instruction *, 16> Visited; 2427 for (auto I = CSEWorkList.begin(), E = CSEWorkList.end(); I != E; ++I) { 2428 assert((I == CSEWorkList.begin() || !DT->dominates(*I, *std::prev(I))) && 2429 "Worklist not sorted properly!"); 2430 BasicBlock *BB = (*I)->getBlock(); 2431 // For all instructions in blocks containing gather sequences: 2432 for (BasicBlock::iterator it = BB->begin(), e = BB->end(); it != e;) { 2433 Instruction *In = it++; 2434 if (!isa<InsertElementInst>(In) && !isa<ExtractElementInst>(In)) 2435 continue; 2436 2437 // Check if we can replace this instruction with any of the 2438 // visited instructions. 2439 for (SmallVectorImpl<Instruction *>::iterator v = Visited.begin(), 2440 ve = Visited.end(); 2441 v != ve; ++v) { 2442 if (In->isIdenticalTo(*v) && 2443 DT->dominates((*v)->getParent(), In->getParent())) { 2444 In->replaceAllUsesWith(*v); 2445 In->eraseFromParent(); 2446 In = nullptr; 2447 break; 2448 } 2449 } 2450 if (In) { 2451 assert(std::find(Visited.begin(), Visited.end(), In) == Visited.end()); 2452 Visited.push_back(In); 2453 } 2454 } 2455 } 2456 CSEBlocks.clear(); 2457 GatherSeq.clear(); 2458 } 2459 2460 // Groups the instructions to a bundle (which is then a single scheduling entity) 2461 // and schedules instructions until the bundle gets ready. 2462 bool BoUpSLP::BlockScheduling::tryScheduleBundle(ArrayRef<Value *> VL, 2463 AliasAnalysis *AA) { 2464 if (isa<PHINode>(VL[0])) 2465 return true; 2466 2467 // Initialize the instruction bundle. 2468 Instruction *OldScheduleEnd = ScheduleEnd; 2469 ScheduleData *PrevInBundle = nullptr; 2470 ScheduleData *Bundle = nullptr; 2471 bool ReSchedule = false; 2472 DEBUG(dbgs() << "SLP: bundle: " << *VL[0] << "\n"); 2473 for (Value *V : VL) { 2474 extendSchedulingRegion(V); 2475 ScheduleData *BundleMember = getScheduleData(V); 2476 assert(BundleMember && 2477 "no ScheduleData for bundle member (maybe not in same basic block)"); 2478 if (BundleMember->IsScheduled) { 2479 // A bundle member was scheduled as single instruction before and now 2480 // needs to be scheduled as part of the bundle. We just get rid of the 2481 // existing schedule. 2482 DEBUG(dbgs() << "SLP: reset schedule because " << *BundleMember 2483 << " was already scheduled\n"); 2484 ReSchedule = true; 2485 } 2486 assert(BundleMember->isSchedulingEntity() && 2487 "bundle member already part of other bundle"); 2488 if (PrevInBundle) { 2489 PrevInBundle->NextInBundle = BundleMember; 2490 } else { 2491 Bundle = BundleMember; 2492 } 2493 BundleMember->UnscheduledDepsInBundle = 0; 2494 Bundle->UnscheduledDepsInBundle += BundleMember->UnscheduledDeps; 2495 2496 // Group the instructions to a bundle. 
2497 BundleMember->FirstInBundle = Bundle; 2498 PrevInBundle = BundleMember; 2499 } 2500 if (ScheduleEnd != OldScheduleEnd) { 2501 // The scheduling region got new instructions at the lower end (or it is a 2502 // new region for the first bundle). This makes it necessary to 2503 // recalculate all dependencies. 2504 // It is seldom that this needs to be done a second time after adding the 2505 // initial bundle to the region. 2506 for (auto *I = ScheduleStart; I != ScheduleEnd; I = I->getNextNode()) { 2507 ScheduleData *SD = getScheduleData(I); 2508 SD->clearDependencies(); 2509 } 2510 ReSchedule = true; 2511 } 2512 if (ReSchedule) { 2513 resetSchedule(); 2514 initialFillReadyList(ReadyInsts); 2515 } 2516 2517 DEBUG(dbgs() << "SLP: try schedule bundle " << *Bundle << " in block " 2518 << BB->getName() << "\n"); 2519 2520 calculateDependencies(Bundle, true, AA); 2521 2522 // Now try to schedule the new bundle. As soon as the bundle is "ready" it 2523 // means that there are no cyclic dependencies and we can schedule it. 2524 // Note that's important that we don't "schedule" the bundle yet (see 2525 // cancelScheduling). 2526 while (!Bundle->isReady() && !ReadyInsts.empty()) { 2527 2528 ScheduleData *pickedSD = ReadyInsts.back(); 2529 ReadyInsts.pop_back(); 2530 2531 if (pickedSD->isSchedulingEntity() && pickedSD->isReady()) { 2532 schedule(pickedSD, ReadyInsts); 2533 } 2534 } 2535 return Bundle->isReady(); 2536 } 2537 2538 void BoUpSLP::BlockScheduling::cancelScheduling(ArrayRef<Value *> VL) { 2539 if (isa<PHINode>(VL[0])) 2540 return; 2541 2542 ScheduleData *Bundle = getScheduleData(VL[0]); 2543 DEBUG(dbgs() << "SLP: cancel scheduling of " << *Bundle << "\n"); 2544 assert(!Bundle->IsScheduled && 2545 "Can't cancel bundle which is already scheduled"); 2546 assert(Bundle->isSchedulingEntity() && Bundle->isPartOfBundle() && 2547 "tried to unbundle something which is not a bundle"); 2548 2549 // Un-bundle: make single instructions out of the bundle. 2550 ScheduleData *BundleMember = Bundle; 2551 while (BundleMember) { 2552 assert(BundleMember->FirstInBundle == Bundle && "corrupt bundle links"); 2553 BundleMember->FirstInBundle = BundleMember; 2554 ScheduleData *Next = BundleMember->NextInBundle; 2555 BundleMember->NextInBundle = nullptr; 2556 BundleMember->UnscheduledDepsInBundle = BundleMember->UnscheduledDeps; 2557 if (BundleMember->UnscheduledDepsInBundle == 0) { 2558 ReadyInsts.insert(BundleMember); 2559 } 2560 BundleMember = Next; 2561 } 2562 } 2563 2564 void BoUpSLP::BlockScheduling::extendSchedulingRegion(Value *V) { 2565 if (getScheduleData(V)) 2566 return; 2567 Instruction *I = dyn_cast<Instruction>(V); 2568 assert(I && "bundle member must be an instruction"); 2569 assert(!isa<PHINode>(I) && "phi nodes don't need to be scheduled"); 2570 if (!ScheduleStart) { 2571 // It's the first instruction in the new region. 2572 initScheduleData(I, I->getNextNode(), nullptr, nullptr); 2573 ScheduleStart = I; 2574 ScheduleEnd = I->getNextNode(); 2575 assert(ScheduleEnd && "tried to vectorize a TerminatorInst?"); 2576 DEBUG(dbgs() << "SLP: initialize schedule region to " << *I << "\n"); 2577 return; 2578 } 2579 // Search up and down at the same time, because we don't know if the new 2580 // instruction is above or below the existing scheduling region. 
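// Walk one step in each direction per iteration; whichever walk reaches I first
// decides whether the region is extended upwards (ScheduleStart) or downwards
// (ScheduleEnd).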
2581 BasicBlock::reverse_iterator UpIter(ScheduleStart); 2582 BasicBlock::reverse_iterator UpperEnd = BB->rend(); 2583 BasicBlock::iterator DownIter(ScheduleEnd); 2584 BasicBlock::iterator LowerEnd = BB->end(); 2585 for (;;) { 2586 if (UpIter != UpperEnd) { 2587 if (&*UpIter == I) { 2588 initScheduleData(I, ScheduleStart, nullptr, FirstLoadStoreInRegion); 2589 ScheduleStart = I; 2590 DEBUG(dbgs() << "SLP: extend schedule region start to " << *I << "\n"); 2591 return; 2592 } 2593 UpIter++; 2594 } 2595 if (DownIter != LowerEnd) { 2596 if (&*DownIter == I) { 2597 initScheduleData(ScheduleEnd, I->getNextNode(), LastLoadStoreInRegion, 2598 nullptr); 2599 ScheduleEnd = I->getNextNode(); 2600 assert(ScheduleEnd && "tried to vectorize a TerminatorInst?"); 2601 DEBUG(dbgs() << "SLP: extend schedule region end to " << *I << "\n"); 2602 return; 2603 } 2604 DownIter++; 2605 } 2606 assert((UpIter != UpperEnd || DownIter != LowerEnd) && 2607 "instruction not found in block"); 2608 } 2609 } 2610 2611 void BoUpSLP::BlockScheduling::initScheduleData(Instruction *FromI, 2612 Instruction *ToI, 2613 ScheduleData *PrevLoadStore, 2614 ScheduleData *NextLoadStore) { 2615 ScheduleData *CurrentLoadStore = PrevLoadStore; 2616 for (Instruction *I = FromI; I != ToI; I = I->getNextNode()) { 2617 ScheduleData *SD = ScheduleDataMap[I]; 2618 if (!SD) { 2619 // Allocate a new ScheduleData for the instruction. 2620 if (ChunkPos >= ChunkSize) { 2621 ScheduleDataChunks.push_back( 2622 llvm::make_unique<ScheduleData[]>(ChunkSize)); 2623 ChunkPos = 0; 2624 } 2625 SD = &(ScheduleDataChunks.back()[ChunkPos++]); 2626 ScheduleDataMap[I] = SD; 2627 SD->Inst = I; 2628 } 2629 assert(!isInSchedulingRegion(SD) && 2630 "new ScheduleData already in scheduling region"); 2631 SD->init(SchedulingRegionID); 2632 2633 if (I->mayReadOrWriteMemory()) { 2634 // Update the linked list of memory accessing instructions. 2635 if (CurrentLoadStore) { 2636 CurrentLoadStore->NextLoadStore = SD; 2637 } else { 2638 FirstLoadStoreInRegion = SD; 2639 } 2640 CurrentLoadStore = SD; 2641 } 2642 } 2643 if (NextLoadStore) { 2644 if (CurrentLoadStore) 2645 CurrentLoadStore->NextLoadStore = NextLoadStore; 2646 } else { 2647 LastLoadStoreInRegion = CurrentLoadStore; 2648 } 2649 } 2650 2651 /// \returns the AA location that is being access by the instruction. 2652 static AliasAnalysis::Location getLocation(Instruction *I, AliasAnalysis *AA) { 2653 if (StoreInst *SI = dyn_cast<StoreInst>(I)) 2654 return AA->getLocation(SI); 2655 if (LoadInst *LI = dyn_cast<LoadInst>(I)) 2656 return AA->getLocation(LI); 2657 return AliasAnalysis::Location(); 2658 } 2659 2660 void BoUpSLP::BlockScheduling::calculateDependencies(ScheduleData *SD, 2661 bool InsertInReadyList, 2662 AliasAnalysis *AA) { 2663 assert(SD->isSchedulingEntity()); 2664 2665 SmallVector<ScheduleData *, 10> WorkList; 2666 WorkList.push_back(SD); 2667 2668 while (!WorkList.empty()) { 2669 ScheduleData *SD = WorkList.back(); 2670 WorkList.pop_back(); 2671 2672 ScheduleData *BundleMember = SD; 2673 while (BundleMember) { 2674 assert(isInSchedulingRegion(BundleMember)); 2675 if (!BundleMember->hasValidDependencies()) { 2676 2677 DEBUG(dbgs() << "SLP: update deps of " << *BundleMember << "\n"); 2678 BundleMember->Dependencies = 0; 2679 BundleMember->resetUnscheduledDeps(); 2680 2681 // Handle def-use chain dependencies. 
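// Each user whose ScheduleData lies in the current scheduling region adds a
// dependency; users in bundles that are not yet scheduled also count as
// unscheduled dependencies.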
2682 for (User *U : BundleMember->Inst->users()) {
2683 if (isa<Instruction>(U)) {
2684 ScheduleData *UseSD = getScheduleData(U);
2685 if (UseSD && isInSchedulingRegion(UseSD->FirstInBundle)) {
2686 BundleMember->Dependencies++;
2687 ScheduleData *DestBundle = UseSD->FirstInBundle;
2688 if (!DestBundle->IsScheduled) {
2689 BundleMember->incrementUnscheduledDeps(1);
2690 }
2691 if (!DestBundle->hasValidDependencies()) {
2692 WorkList.push_back(DestBundle);
2693 }
2694 }
2695 } else {
2696 // I'm not sure if this can ever happen. But we need to be safe.
2697 // This prevents the instruction/bundle from ever being scheduled and
2698 // eventually disables vectorization.
2699 BundleMember->Dependencies++;
2700 BundleMember->incrementUnscheduledDeps(1);
2701 }
2702 }
2703
2704 // Handle the memory dependencies.
2705 ScheduleData *DepDest = BundleMember->NextLoadStore;
2706 if (DepDest) {
2707 AliasAnalysis::Location SrcLoc = getLocation(BundleMember->Inst, AA);
2708 bool SrcMayWrite = BundleMember->Inst->mayWriteToMemory();
2709
2710 while (DepDest) {
2711 assert(isInSchedulingRegion(DepDest));
2712 if (SrcMayWrite || DepDest->Inst->mayWriteToMemory()) {
2713 AliasAnalysis::Location DstLoc = getLocation(DepDest->Inst, AA);
2714 if (!SrcLoc.Ptr || !DstLoc.Ptr || AA->alias(SrcLoc, DstLoc)) {
2715 DepDest->MemoryDependencies.push_back(BundleMember);
2716 BundleMember->Dependencies++;
2717 ScheduleData *DestBundle = DepDest->FirstInBundle;
2718 if (!DestBundle->IsScheduled) {
2719 BundleMember->incrementUnscheduledDeps(1);
2720 }
2721 if (!DestBundle->hasValidDependencies()) {
2722 WorkList.push_back(DestBundle);
2723 }
2724 }
2725 }
2726 DepDest = DepDest->NextLoadStore;
2727 }
2728 }
2729 }
2730 BundleMember = BundleMember->NextInBundle;
2731 }
2732 if (InsertInReadyList && SD->isReady()) {
2733 ReadyInsts.push_back(SD);
2734 DEBUG(dbgs() << "SLP: gets ready on update: " << *SD->Inst << "\n");
2735 }
2736 }
2737 }
2738
2739 void BoUpSLP::BlockScheduling::resetSchedule() {
2740 assert(ScheduleStart &&
2741 "tried to reset schedule on block which has not been scheduled");
2742 for (Instruction *I = ScheduleStart; I != ScheduleEnd; I = I->getNextNode()) {
2743 ScheduleData *SD = getScheduleData(I);
2744 assert(isInSchedulingRegion(SD));
2745 SD->IsScheduled = false;
2746 SD->resetUnscheduledDeps();
2747 }
2748 ReadyInsts.clear();
2749 }
2750
2751 void BoUpSLP::scheduleBlock(BlockScheduling *BS) {
2752
2753 if (!BS->ScheduleStart)
2754 return;
2755
2756 DEBUG(dbgs() << "SLP: schedule block " << BS->BB->getName() << "\n");
2757
2758 BS->resetSchedule();
2759
2760 // For the real scheduling we use a more sophisticated ready-list: it is
2761 // sorted by the original instruction location. This lets the final schedule
2762 // be as close as possible to the original instruction order.
2763 struct ScheduleDataCompare {
2764 bool operator()(ScheduleData *SD1, ScheduleData *SD2) {
2765 return SD2->SchedulingPriority < SD1->SchedulingPriority;
2766 }
2767 };
2768 std::set<ScheduleData *, ScheduleDataCompare> ReadyInsts;
2769
2770 // Ensure that all dependency data is updated and fill the ready-list with
2771 // initial instructions.
2772 int Idx = 0; 2773 int NumToSchedule = 0; 2774 for (auto *I = BS->ScheduleStart; I != BS->ScheduleEnd; 2775 I = I->getNextNode()) { 2776 ScheduleData *SD = BS->getScheduleData(I); 2777 assert( 2778 SD->isPartOfBundle() == (ScalarToTreeEntry.count(SD->Inst) != 0) && 2779 "scheduler and vectorizer have different opinion on what is a bundle"); 2780 SD->FirstInBundle->SchedulingPriority = Idx++; 2781 if (SD->isSchedulingEntity()) { 2782 BS->calculateDependencies(SD, false, AA); 2783 NumToSchedule++; 2784 } 2785 } 2786 BS->initialFillReadyList(ReadyInsts); 2787 2788 Instruction *LastScheduledInst = BS->ScheduleEnd; 2789 2790 // Do the "real" scheduling. 2791 while (!ReadyInsts.empty()) { 2792 ScheduleData *picked = *ReadyInsts.begin(); 2793 ReadyInsts.erase(ReadyInsts.begin()); 2794 2795 // Move the scheduled instruction(s) to their dedicated places, if not 2796 // there yet. 2797 ScheduleData *BundleMember = picked; 2798 while (BundleMember) { 2799 Instruction *pickedInst = BundleMember->Inst; 2800 if (LastScheduledInst->getNextNode() != pickedInst) { 2801 BS->BB->getInstList().remove(pickedInst); 2802 BS->BB->getInstList().insert(LastScheduledInst, pickedInst); 2803 } 2804 LastScheduledInst = pickedInst; 2805 BundleMember = BundleMember->NextInBundle; 2806 } 2807 2808 BS->schedule(picked, ReadyInsts); 2809 NumToSchedule--; 2810 } 2811 assert(NumToSchedule == 0 && "could not schedule all instructions"); 2812 2813 // Avoid duplicate scheduling of the block. 2814 BS->ScheduleStart = nullptr; 2815 } 2816 2817 /// The SLPVectorizer Pass. 2818 struct SLPVectorizer : public FunctionPass { 2819 typedef SmallVector<StoreInst *, 8> StoreList; 2820 typedef MapVector<Value *, StoreList> StoreListMap; 2821 2822 /// Pass identification, replacement for typeid 2823 static char ID; 2824 2825 explicit SLPVectorizer() : FunctionPass(ID) { 2826 initializeSLPVectorizerPass(*PassRegistry::getPassRegistry()); 2827 } 2828 2829 ScalarEvolution *SE; 2830 const DataLayout *DL; 2831 TargetTransformInfo *TTI; 2832 TargetLibraryInfo *TLI; 2833 AliasAnalysis *AA; 2834 LoopInfo *LI; 2835 DominatorTree *DT; 2836 AssumptionTracker *AT; 2837 2838 bool runOnFunction(Function &F) override { 2839 if (skipOptnoneFunction(F)) 2840 return false; 2841 2842 SE = &getAnalysis<ScalarEvolution>(); 2843 DataLayoutPass *DLP = getAnalysisIfAvailable<DataLayoutPass>(); 2844 DL = DLP ? &DLP->getDataLayout() : nullptr; 2845 TTI = &getAnalysis<TargetTransformInfo>(); 2846 TLI = getAnalysisIfAvailable<TargetLibraryInfo>(); 2847 AA = &getAnalysis<AliasAnalysis>(); 2848 LI = &getAnalysis<LoopInfo>(); 2849 DT = &getAnalysis<DominatorTreeWrapperPass>().getDomTree(); 2850 AT = &getAnalysis<AssumptionTracker>(); 2851 2852 StoreRefs.clear(); 2853 bool Changed = false; 2854 2855 // If the target claims to have no vector registers don't attempt 2856 // vectorization. 2857 if (!TTI->getNumberOfRegisters(true)) 2858 return false; 2859 2860 // Must have DataLayout. We can't require it because some tests run w/o 2861 // triple. 2862 if (!DL) 2863 return false; 2864 2865 // Don't vectorize when the attribute NoImplicitFloat is used. 2866 if (F.hasFnAttribute(Attribute::NoImplicitFloat)) 2867 return false; 2868 2869 DEBUG(dbgs() << "SLP: Analyzing blocks in " << F.getName() << ".\n"); 2870 2871 // Use the bottom up slp vectorizer to construct chains that start with 2872 // store instructions. 2873 BoUpSLP R(&F, SE, DL, TTI, TLI, AA, LI, DT, AT); 2874 2875 // Scan the blocks in the function in post order. 
2876 for (po_iterator<BasicBlock*> it = po_begin(&F.getEntryBlock()),
2877 e = po_end(&F.getEntryBlock()); it != e; ++it) {
2878 BasicBlock *BB = *it;
2879 // Vectorize trees that end at stores.
2880 if (unsigned count = collectStores(BB, R)) {
2881 (void)count;
2882 DEBUG(dbgs() << "SLP: Found " << count << " stores to vectorize.\n");
2883 Changed |= vectorizeStoreChains(R);
2884 }
2885
2886 // Vectorize trees that end at reductions.
2887 Changed |= vectorizeChainsInBlock(BB, R);
2888 }
2889
2890 if (Changed) {
2891 R.optimizeGatherSequence();
2892 DEBUG(dbgs() << "SLP: vectorized \"" << F.getName() << "\"\n");
2893 DEBUG(verifyFunction(F));
2894 }
2895 return Changed;
2896 }
2897
2898 void getAnalysisUsage(AnalysisUsage &AU) const override {
2899 FunctionPass::getAnalysisUsage(AU);
2900 AU.addRequired<AssumptionTracker>();
2901 AU.addRequired<ScalarEvolution>();
2902 AU.addRequired<AliasAnalysis>();
2903 AU.addRequired<TargetTransformInfo>();
2904 AU.addRequired<LoopInfo>();
2905 AU.addRequired<DominatorTreeWrapperPass>();
2906 AU.addPreserved<LoopInfo>();
2907 AU.addPreserved<DominatorTreeWrapperPass>();
2908 AU.setPreservesCFG();
2909 }
2910
2911 private:
2912
2913 /// \brief Collect memory references and sort them according to their base
2914 /// object. We sort the stores to their base objects to reduce the cost of the
2915 /// quadratic search on the stores. TODO: We can further reduce this cost
2916 /// if we flush the chain creation every time we run into a memory barrier.
2917 unsigned collectStores(BasicBlock *BB, BoUpSLP &R);
2918
2919 /// \brief Try to vectorize a chain that starts at two arithmetic instrs.
2920 bool tryToVectorizePair(Value *A, Value *B, BoUpSLP &R);
2921
2922 /// \brief Try to vectorize a list of operands.
2923 /// \param BuildVector A list of users to ignore for the purpose of
2924 /// scheduling and that don't need extracting.
2925 /// \returns true if a value was vectorized.
2926 bool tryToVectorizeList(ArrayRef<Value *> VL, BoUpSLP &R,
2927 ArrayRef<Value *> BuildVector = None,
2928 bool allowReorder = false);
2929
2930 /// \brief Try to vectorize a chain that may start at the operands of \p V.
2931 bool tryToVectorize(BinaryOperator *V, BoUpSLP &R);
2932
2933 /// \brief Vectorize the stores that were collected in StoreRefs.
2934 bool vectorizeStoreChains(BoUpSLP &R);
2935
2936 /// \brief Scan the basic block and look for patterns that are likely to start
2937 /// a vectorization chain.
2938 bool vectorizeChainsInBlock(BasicBlock *BB, BoUpSLP &R);
2939
2940 bool vectorizeStoreChain(ArrayRef<Value *> Chain, int CostThreshold,
2941 BoUpSLP &R);
2942
2943 bool vectorizeStores(ArrayRef<StoreInst *> Stores, int costThreshold,
2944 BoUpSLP &R);
2945 private:
2946 StoreListMap StoreRefs;
2947 };
2948
2949 /// \brief Check that the Values in the slice of the VL array still exist in
2950 /// the WeakVH array.
2951 /// Vectorization of part of the VL array may cause later values in the VL array
2952 /// to become invalid. We track when this has happened in the WeakVH array.
2953 static bool hasValueBeenRAUWed(ArrayRef<Value *> &VL, 2954 SmallVectorImpl<WeakVH> &VH, 2955 unsigned SliceBegin, 2956 unsigned SliceSize) { 2957 for (unsigned i = SliceBegin; i < SliceBegin + SliceSize; ++i) 2958 if (VH[i] != VL[i]) 2959 return true; 2960 2961 return false; 2962 } 2963 2964 bool SLPVectorizer::vectorizeStoreChain(ArrayRef<Value *> Chain, 2965 int CostThreshold, BoUpSLP &R) { 2966 unsigned ChainLen = Chain.size(); 2967 DEBUG(dbgs() << "SLP: Analyzing a store chain of length " << ChainLen 2968 << "\n"); 2969 Type *StoreTy = cast<StoreInst>(Chain[0])->getValueOperand()->getType(); 2970 unsigned Sz = DL->getTypeSizeInBits(StoreTy); 2971 unsigned VF = MinVecRegSize / Sz; 2972 2973 if (!isPowerOf2_32(Sz) || VF < 2) 2974 return false; 2975 2976 // Keep track of values that were deleted by vectorizing in the loop below. 2977 SmallVector<WeakVH, 8> TrackValues(Chain.begin(), Chain.end()); 2978 2979 bool Changed = false; 2980 // Look for profitable vectorizable trees at all offsets, starting at zero. 2981 for (unsigned i = 0, e = ChainLen; i < e; ++i) { 2982 if (i + VF > e) 2983 break; 2984 2985 // Check that a previous iteration of this loop did not delete the Value. 2986 if (hasValueBeenRAUWed(Chain, TrackValues, i, VF)) 2987 continue; 2988 2989 DEBUG(dbgs() << "SLP: Analyzing " << VF << " stores at offset " << i 2990 << "\n"); 2991 ArrayRef<Value *> Operands = Chain.slice(i, VF); 2992 2993 R.buildTree(Operands); 2994 2995 int Cost = R.getTreeCost(); 2996 2997 DEBUG(dbgs() << "SLP: Found cost=" << Cost << " for VF=" << VF << "\n"); 2998 if (Cost < CostThreshold) { 2999 DEBUG(dbgs() << "SLP: Decided to vectorize cost=" << Cost << "\n"); 3000 R.vectorizeTree(); 3001 3002 // Move to the next bundle. 3003 i += VF - 1; 3004 Changed = true; 3005 } 3006 } 3007 3008 return Changed; 3009 } 3010 3011 bool SLPVectorizer::vectorizeStores(ArrayRef<StoreInst *> Stores, 3012 int costThreshold, BoUpSLP &R) { 3013 SetVector<Value *> Heads, Tails; 3014 SmallDenseMap<Value *, Value *> ConsecutiveChain; 3015 3016 // We may run into multiple chains that merge into a single chain. We mark the 3017 // stores that we vectorized so that we don't visit the same store twice. 3018 BoUpSLP::ValueSet VectorizedStores; 3019 bool Changed = false; 3020 3021 // Do a quadratic search on all of the given stores and find 3022 // all of the pairs of stores that follow each other. 3023 for (unsigned i = 0, e = Stores.size(); i < e; ++i) { 3024 for (unsigned j = 0; j < e; ++j) { 3025 if (i == j) 3026 continue; 3027 3028 if (R.isConsecutiveAccess(Stores[i], Stores[j])) { 3029 Tails.insert(Stores[j]); 3030 Heads.insert(Stores[i]); 3031 ConsecutiveChain[Stores[i]] = Stores[j]; 3032 } 3033 } 3034 } 3035 3036 // For stores that start but don't end a link in the chain: 3037 for (SetVector<Value *>::iterator it = Heads.begin(), e = Heads.end(); 3038 it != e; ++it) { 3039 if (Tails.count(*it)) 3040 continue; 3041 3042 // We found a store instr that starts a chain. Now follow the chain and try 3043 // to vectorize it. 3044 BoUpSLP::ValueList Operands; 3045 Value *I = *it; 3046 // Collect the chain into a list. 3047 while (Tails.count(I) || Heads.count(I)) { 3048 if (VectorizedStores.count(I)) 3049 break; 3050 Operands.push_back(I); 3051 // Move to the next value in the chain. 3052 I = ConsecutiveChain[I]; 3053 } 3054 3055 bool Vectorized = vectorizeStoreChain(Operands, costThreshold, R); 3056 3057 // Mark the vectorized stores so that we don't vectorize them again. 
3058 if (Vectorized) 3059 VectorizedStores.insert(Operands.begin(), Operands.end()); 3060 Changed |= Vectorized; 3061 } 3062 3063 return Changed; 3064 } 3065 3066 3067 unsigned SLPVectorizer::collectStores(BasicBlock *BB, BoUpSLP &R) { 3068 unsigned count = 0; 3069 StoreRefs.clear(); 3070 for (BasicBlock::iterator it = BB->begin(), e = BB->end(); it != e; ++it) { 3071 StoreInst *SI = dyn_cast<StoreInst>(it); 3072 if (!SI) 3073 continue; 3074 3075 // Don't touch volatile stores. 3076 if (!SI->isSimple()) 3077 continue; 3078 3079 // Check that the pointer points to scalars. 3080 Type *Ty = SI->getValueOperand()->getType(); 3081 if (Ty->isAggregateType() || Ty->isVectorTy()) 3082 continue; 3083 3084 // Find the base pointer. 3085 Value *Ptr = GetUnderlyingObject(SI->getPointerOperand(), DL); 3086 3087 // Save the store locations. 3088 StoreRefs[Ptr].push_back(SI); 3089 count++; 3090 } 3091 return count; 3092 } 3093 3094 bool SLPVectorizer::tryToVectorizePair(Value *A, Value *B, BoUpSLP &R) { 3095 if (!A || !B) 3096 return false; 3097 Value *VL[] = { A, B }; 3098 return tryToVectorizeList(VL, R, None, true); 3099 } 3100 3101 bool SLPVectorizer::tryToVectorizeList(ArrayRef<Value *> VL, BoUpSLP &R, 3102 ArrayRef<Value *> BuildVector, 3103 bool allowReorder) { 3104 if (VL.size() < 2) 3105 return false; 3106 3107 DEBUG(dbgs() << "SLP: Vectorizing a list of length = " << VL.size() << ".\n"); 3108 3109 // Check that all of the parts are scalar instructions of the same type. 3110 Instruction *I0 = dyn_cast<Instruction>(VL[0]); 3111 if (!I0) 3112 return false; 3113 3114 unsigned Opcode0 = I0->getOpcode(); 3115 3116 Type *Ty0 = I0->getType(); 3117 unsigned Sz = DL->getTypeSizeInBits(Ty0); 3118 unsigned VF = MinVecRegSize / Sz; 3119 3120 for (int i = 0, e = VL.size(); i < e; ++i) { 3121 Type *Ty = VL[i]->getType(); 3122 if (Ty->isAggregateType() || Ty->isVectorTy()) 3123 return false; 3124 Instruction *Inst = dyn_cast<Instruction>(VL[i]); 3125 if (!Inst || Inst->getOpcode() != Opcode0) 3126 return false; 3127 } 3128 3129 bool Changed = false; 3130 3131 // Keep track of values that were deleted by vectorizing in the loop below. 3132 SmallVector<WeakVH, 8> TrackValues(VL.begin(), VL.end()); 3133 3134 for (unsigned i = 0, e = VL.size(); i < e; ++i) { 3135 unsigned OpsWidth = 0; 3136 3137 if (i + VF > e) 3138 OpsWidth = e - i; 3139 else 3140 OpsWidth = VF; 3141 3142 if (!isPowerOf2_32(OpsWidth) || OpsWidth < 2) 3143 break; 3144 3145 // Check that a previous iteration of this loop did not delete the Value. 3146 if (hasValueBeenRAUWed(VL, TrackValues, i, OpsWidth)) 3147 continue; 3148 3149 DEBUG(dbgs() << "SLP: Analyzing " << OpsWidth << " operations " 3150 << "\n"); 3151 ArrayRef<Value *> Ops = VL.slice(i, OpsWidth); 3152 3153 ArrayRef<Value *> BuildVectorSlice; 3154 if (!BuildVector.empty()) 3155 BuildVectorSlice = BuildVector.slice(i, OpsWidth); 3156 3157 R.buildTree(Ops, BuildVectorSlice); 3158 // TODO: check if we can allow reordering also for other cases than 3159 // tryToVectorizePair() 3160 if (allowReorder && R.shouldReorder()) { 3161 assert(Ops.size() == 2); 3162 assert(BuildVectorSlice.empty()); 3163 Value *ReorderedOps[] = { Ops[1], Ops[0] }; 3164 R.buildTree(ReorderedOps, None); 3165 } 3166 int Cost = R.getTreeCost(); 3167 3168 if (Cost < -SLPCostThreshold) { 3169 DEBUG(dbgs() << "SLP: Vectorizing list at cost:" << Cost << ".\n"); 3170 Value *VectorizedRoot = R.vectorizeTree(); 3171 3172 // Reconstruct the build vector by extracting the vectorized root. 
This
3173 // way we handle the case where some elements of the vector are undefined.
3174 // (return (insertelt <4 x i32> (insertelt undef (opd0) 0) (opd1) 2))
3175 if (!BuildVectorSlice.empty()) {
3176 // The insert point is the last build vector instruction. The vectorized
3177 // root will precede it. This guarantees that we get an instruction. The
3178 // vectorized tree could have been constant folded.
3179 Instruction *InsertAfter = cast<Instruction>(BuildVectorSlice.back());
3180 unsigned VecIdx = 0;
3181 for (auto &V : BuildVectorSlice) {
3182 IRBuilder<true, NoFolder> Builder(
3183 ++BasicBlock::iterator(InsertAfter));
3184 InsertElementInst *IE = cast<InsertElementInst>(V);
3185 Instruction *Extract = cast<Instruction>(Builder.CreateExtractElement(
3186 VectorizedRoot, Builder.getInt32(VecIdx++)));
3187 IE->setOperand(1, Extract);
3188 IE->removeFromParent();
3189 IE->insertAfter(Extract);
3190 InsertAfter = IE;
3191 }
3192 }
3193 // Move to the next bundle.
3194 i += VF - 1;
3195 Changed = true;
3196 }
3197 }
3198
3199 return Changed;
3200 }
3201
3202 bool SLPVectorizer::tryToVectorize(BinaryOperator *V, BoUpSLP &R) {
3203 if (!V)
3204 return false;
3205
3206 // Try to vectorize V.
3207 if (tryToVectorizePair(V->getOperand(0), V->getOperand(1), R))
3208 return true;
3209
3210 BinaryOperator *A = dyn_cast<BinaryOperator>(V->getOperand(0));
3211 BinaryOperator *B = dyn_cast<BinaryOperator>(V->getOperand(1));
3212 // Try to skip B.
3213 if (B && B->hasOneUse()) {
3214 BinaryOperator *B0 = dyn_cast<BinaryOperator>(B->getOperand(0));
3215 BinaryOperator *B1 = dyn_cast<BinaryOperator>(B->getOperand(1));
3216 if (tryToVectorizePair(A, B0, R)) {
3217 return true;
3218 }
3219 if (tryToVectorizePair(A, B1, R)) {
3220 return true;
3221 }
3222 }
3223
3224 // Try to skip A.
3225 if (A && A->hasOneUse()) {
3226 BinaryOperator *A0 = dyn_cast<BinaryOperator>(A->getOperand(0));
3227 BinaryOperator *A1 = dyn_cast<BinaryOperator>(A->getOperand(1));
3228 if (tryToVectorizePair(A0, B, R)) {
3229 return true;
3230 }
3231 if (tryToVectorizePair(A1, B, R)) {
3232 return true;
3233 }
3234 }
3235 return false;
3236 }
3237
3238 /// \brief Generate a shuffle mask to be used in a reduction tree.
3239 ///
3240 /// \param VecLen The length of the vector to be reduced.
3241 /// \param NumEltsToRdx The number of elements that should be reduced in the
3242 /// vector.
3243 /// \param IsPairwise Whether the reduction is a pairwise or splitting
3244 /// reduction. A pairwise reduction will generate a mask of
3245 /// <0,2,...> or <1,3,..> while a splitting reduction will generate
3246 /// <2,3, undef,undef> for a vector of 4 and NumElts = 2.
3247 /// \param IsLeft True will generate a mask of even elements, odd otherwise.
3248 static Value *createRdxShuffleMask(unsigned VecLen, unsigned NumEltsToRdx,
3249 bool IsPairwise, bool IsLeft,
3250 IRBuilder<> &Builder) {
3251 assert((IsPairwise || !IsLeft) && "Don't support a <0,1,undef,...> mask");
3252
3253 SmallVector<Constant *, 32> ShuffleMask(
3254 VecLen, UndefValue::get(Builder.getInt32Ty()));
3255
3256 if (IsPairwise)
3257 // Build a mask of 0, 2, ... (left) or 1, 3, ... (right).
3258 for (unsigned i = 0; i != NumEltsToRdx; ++i)
3259 ShuffleMask[i] = Builder.getInt32(2 * i + !IsLeft);
3260 else
3261 // Move the upper half of the vector to the lower half.
3262 for (unsigned i = 0; i != NumEltsToRdx; ++i)
3263 ShuffleMask[i] = Builder.getInt32(NumEltsToRdx + i);
3264
3265 return ConstantVector::get(ShuffleMask);
3266 }
3267
3268
3269 /// Model horizontal reductions.
3270 /// 3271 /// A horizontal reduction is a tree of reduction operations (currently add and 3272 /// fadd) that has operations that can be put into a vector as its leaf. 3273 /// For example, this tree: 3274 /// 3275 /// mul mul mul mul 3276 /// \ / \ / 3277 /// + + 3278 /// \ / 3279 /// + 3280 /// This tree has "mul" as its reduced values and "+" as its reduction 3281 /// operations. A reduction might be feeding into a store or a binary operation 3282 /// feeding a phi. 3283 /// ... 3284 /// \ / 3285 /// + 3286 /// | 3287 /// phi += 3288 /// 3289 /// Or: 3290 /// ... 3291 /// \ / 3292 /// + 3293 /// | 3294 /// *p = 3295 /// 3296 class HorizontalReduction { 3297 SmallVector<Value *, 16> ReductionOps; 3298 SmallVector<Value *, 32> ReducedVals; 3299 3300 BinaryOperator *ReductionRoot; 3301 PHINode *ReductionPHI; 3302 3303 /// The opcode of the reduction. 3304 unsigned ReductionOpcode; 3305 /// The opcode of the values we perform a reduction on. 3306 unsigned ReducedValueOpcode; 3307 /// The width of one full horizontal reduction operation. 3308 unsigned ReduxWidth; 3309 /// Should we model this reduction as a pairwise reduction tree or a tree that 3310 /// splits the vector in halves and adds those halves. 3311 bool IsPairwiseReduction; 3312 3313 public: 3314 HorizontalReduction() 3315 : ReductionRoot(nullptr), ReductionPHI(nullptr), ReductionOpcode(0), 3316 ReducedValueOpcode(0), ReduxWidth(0), IsPairwiseReduction(false) {} 3317 3318 /// \brief Try to find a reduction tree. 3319 bool matchAssociativeReduction(PHINode *Phi, BinaryOperator *B, 3320 const DataLayout *DL) { 3321 assert((!Phi || 3322 std::find(Phi->op_begin(), Phi->op_end(), B) != Phi->op_end()) && 3323 "Thi phi needs to use the binary operator"); 3324 3325 // We could have a initial reductions that is not an add. 3326 // r *= v1 + v2 + v3 + v4 3327 // In such a case start looking for a tree rooted in the first '+'. 3328 if (Phi) { 3329 if (B->getOperand(0) == Phi) { 3330 Phi = nullptr; 3331 B = dyn_cast<BinaryOperator>(B->getOperand(1)); 3332 } else if (B->getOperand(1) == Phi) { 3333 Phi = nullptr; 3334 B = dyn_cast<BinaryOperator>(B->getOperand(0)); 3335 } 3336 } 3337 3338 if (!B) 3339 return false; 3340 3341 Type *Ty = B->getType(); 3342 if (Ty->isVectorTy()) 3343 return false; 3344 3345 ReductionOpcode = B->getOpcode(); 3346 ReducedValueOpcode = 0; 3347 ReduxWidth = MinVecRegSize / DL->getTypeSizeInBits(Ty); 3348 ReductionRoot = B; 3349 ReductionPHI = Phi; 3350 3351 if (ReduxWidth < 4) 3352 return false; 3353 3354 // We currently only support adds. 3355 if (ReductionOpcode != Instruction::Add && 3356 ReductionOpcode != Instruction::FAdd) 3357 return false; 3358 3359 // Post order traverse the reduction tree starting at B. We only handle true 3360 // trees containing only binary operators. 3361 SmallVector<std::pair<BinaryOperator *, unsigned>, 32> Stack; 3362 Stack.push_back(std::make_pair(B, 0)); 3363 while (!Stack.empty()) { 3364 BinaryOperator *TreeN = Stack.back().first; 3365 unsigned EdgeToVist = Stack.back().second++; 3366 bool IsReducedValue = TreeN->getOpcode() != ReductionOpcode; 3367 3368 // Only handle trees in the current basic block. 3369 if (TreeN->getParent() != B->getParent()) 3370 return false; 3371 3372 // Each tree node needs to have one user except for the ultimate 3373 // reduction. 3374 if (!TreeN->hasOneUse() && TreeN != B) 3375 return false; 3376 3377 // Postorder vist. 
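// A node is retired once both operand edges have been visited or it is a leaf:
// leaves (the reduced values) must all share one opcode, and interior nodes
// must be reassociable reduction operations.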
      if (EdgeToVisit == 2 || IsReducedValue) {
        if (IsReducedValue) {
          // Make sure that the opcodes of the operations that we are going to
          // reduce match.
          if (!ReducedValueOpcode)
            ReducedValueOpcode = TreeN->getOpcode();
          else if (ReducedValueOpcode != TreeN->getOpcode())
            return false;
          ReducedVals.push_back(TreeN);
        } else {
          // We need to be able to reassociate the adds.
          if (!TreeN->isAssociative())
            return false;
          ReductionOps.push_back(TreeN);
        }
        // Retract.
        Stack.pop_back();
        continue;
      }

      // Visit left or right.
      Value *NextV = TreeN->getOperand(EdgeToVisit);
      BinaryOperator *Next = dyn_cast<BinaryOperator>(NextV);
      if (Next)
        Stack.push_back(std::make_pair(Next, 0));
      else if (NextV != Phi)
        return false;
    }
    return true;
  }

  /// \brief Attempt to vectorize the tree found by
  /// matchAssociativeReduction.
  bool tryToReduce(BoUpSLP &V, TargetTransformInfo *TTI) {
    if (ReducedVals.empty())
      return false;

    unsigned NumReducedVals = ReducedVals.size();
    if (NumReducedVals < ReduxWidth)
      return false;

    Value *VectorizedTree = nullptr;
    IRBuilder<> Builder(ReductionRoot);
    FastMathFlags Unsafe;
    Unsafe.setUnsafeAlgebra();
    Builder.SetFastMathFlags(Unsafe);
    unsigned i = 0;

    for (; i < NumReducedVals - ReduxWidth + 1; i += ReduxWidth) {
      V.buildTree(makeArrayRef(&ReducedVals[i], ReduxWidth), ReductionOps);

      // Estimate cost.
      int Cost = V.getTreeCost() + getReductionCost(TTI, ReducedVals[i]);
      if (Cost >= -SLPCostThreshold)
        break;

      DEBUG(dbgs() << "SLP: Vectorizing horizontal reduction at cost:" << Cost
                   << ". (HorRdx)\n");

      // Vectorize a tree.
      DebugLoc Loc = cast<Instruction>(ReducedVals[i])->getDebugLoc();
      Value *VectorizedRoot = V.vectorizeTree();

      // Emit a reduction.
      Value *ReducedSubTree = emitReduction(VectorizedRoot, Builder);
      if (VectorizedTree) {
        Builder.SetCurrentDebugLocation(Loc);
        VectorizedTree = createBinOp(Builder, ReductionOpcode, VectorizedTree,
                                     ReducedSubTree, "bin.rdx");
      } else
        VectorizedTree = ReducedSubTree;
    }

    if (VectorizedTree) {
      // Finish the reduction.
      for (; i < NumReducedVals; ++i) {
        Builder.SetCurrentDebugLocation(
            cast<Instruction>(ReducedVals[i])->getDebugLoc());
        VectorizedTree = createBinOp(Builder, ReductionOpcode, VectorizedTree,
                                     ReducedVals[i]);
      }
      // Update users.
      if (ReductionPHI) {
        assert(ReductionRoot && "Need a reduction operation");
        ReductionRoot->setOperand(0, VectorizedTree);
        ReductionRoot->setOperand(1, ReductionPHI);
      } else
        ReductionRoot->replaceAllUsesWith(VectorizedTree);
    }
    return VectorizedTree != nullptr;
  }

private:

  /// \brief Calculate the cost of a reduction.
  int getReductionCost(TargetTransformInfo *TTI, Value *FirstReducedVal) {
    Type *ScalarTy = FirstReducedVal->getType();
    Type *VecTy = VectorType::get(ScalarTy, ReduxWidth);

    int PairwiseRdxCost = TTI->getReductionCost(ReductionOpcode, VecTy, true);
    int SplittingRdxCost = TTI->getReductionCost(ReductionOpcode, VecTy, false);

    IsPairwiseReduction = PairwiseRdxCost < SplittingRdxCost;
    int VecReduxCost = IsPairwiseReduction ? PairwiseRdxCost : SplittingRdxCost;
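    // The scalar baseline is modeled as ReduxWidth copies of the reduction
    // operation; the difference returned below is negative when the vector
    // form is expected to be cheaper.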
    int ScalarReduxCost =
        ReduxWidth * TTI->getArithmeticInstrCost(ReductionOpcode, VecTy);

    DEBUG(dbgs() << "SLP: Adding cost " << VecReduxCost - ScalarReduxCost
                 << " for reduction that starts with " << *FirstReducedVal
                 << " (It is a "
                 << (IsPairwiseReduction ? "pairwise" : "splitting")
                 << " reduction)\n");

    return VecReduxCost - ScalarReduxCost;
  }

  static Value *createBinOp(IRBuilder<> &Builder, unsigned Opcode, Value *L,
                            Value *R, const Twine &Name = "") {
    if (Opcode == Instruction::FAdd)
      return Builder.CreateFAdd(L, R, Name);
    return Builder.CreateBinOp((Instruction::BinaryOps)Opcode, L, R, Name);
  }

  /// \brief Emit a horizontal reduction of the vectorized value.
  Value *emitReduction(Value *VectorizedValue, IRBuilder<> &Builder) {
    assert(VectorizedValue && "Need to have a vectorized tree node");
    Instruction *ValToReduce = dyn_cast<Instruction>(VectorizedValue);
    assert(isPowerOf2_32(ReduxWidth) &&
           "We only handle power-of-two reductions for now");

    Value *TmpVec = ValToReduce;
    for (unsigned i = ReduxWidth / 2; i != 0; i >>= 1) {
      if (IsPairwiseReduction) {
        Value *LeftMask =
            createRdxShuffleMask(ReduxWidth, i, true, true, Builder);
        Value *RightMask =
            createRdxShuffleMask(ReduxWidth, i, true, false, Builder);

        Value *LeftShuf = Builder.CreateShuffleVector(
            TmpVec, UndefValue::get(TmpVec->getType()), LeftMask, "rdx.shuf.l");
        Value *RightShuf = Builder.CreateShuffleVector(
            TmpVec, UndefValue::get(TmpVec->getType()), RightMask,
            "rdx.shuf.r");
        TmpVec = createBinOp(Builder, ReductionOpcode, LeftShuf, RightShuf,
                             "bin.rdx");
      } else {
        Value *UpperHalf =
            createRdxShuffleMask(ReduxWidth, i, false, false, Builder);
        Value *Shuf = Builder.CreateShuffleVector(
            TmpVec, UndefValue::get(TmpVec->getType()), UpperHalf, "rdx.shuf");
        TmpVec = createBinOp(Builder, ReductionOpcode, TmpVec, Shuf, "bin.rdx");
      }
    }

    // The result is in the first element of the vector.
    return Builder.CreateExtractElement(TmpVec, Builder.getInt32(0));
  }
};

/// \brief Recognize construction of vectors like
///  %ra = insertelement <4 x float> undef, float %s0, i32 0
///  %rb = insertelement <4 x float> %ra, float %s1, i32 1
///  %rc = insertelement <4 x float> %rb, float %s2, i32 2
///  %rd = insertelement <4 x float> %rc, float %s3, i32 3
///
/// Returns true if it matches.
///
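/// For the snippet above this would collect (illustrative):
///   BuildVector     = { %ra, %rb, %rc, %rd }
///   BuildVectorOpds = { %s0, %s1, %s2, %s3 }
///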
static bool findBuildVector(InsertElementInst *FirstInsertElem,
                            SmallVectorImpl<Value *> &BuildVector,
                            SmallVectorImpl<Value *> &BuildVectorOpds) {
  if (!isa<UndefValue>(FirstInsertElem->getOperand(0)))
    return false;

  InsertElementInst *IE = FirstInsertElem;
  while (true) {
    BuildVector.push_back(IE);
    BuildVectorOpds.push_back(IE->getOperand(1));

    if (IE->use_empty())
      return false;

    InsertElementInst *NextUse = dyn_cast<InsertElementInst>(IE->user_back());
    if (!NextUse)
      return true;

    // If this isn't the final use, make sure the next insertelement is the
    // only use. It's OK if the final constructed vector is used multiple
    // times.
    if (!IE->hasOneUse())
      return false;

    IE = NextUse;
  }

  return false;
}

static bool PhiTypeSorterFunc(Value *V, Value *V2) {
  return V->getType() < V2->getType();
}

bool SLPVectorizer::vectorizeChainsInBlock(BasicBlock *BB, BoUpSLP &R) {
  bool Changed = false;
  SmallVector<Value *, 4> Incoming;
  SmallSet<Value *, 16> VisitedInstrs;

  bool HaveVectorizedPhiNodes = true;
  while (HaveVectorizedPhiNodes) {
    HaveVectorizedPhiNodes = false;

    // Collect the incoming values from the PHIs.
    Incoming.clear();
    for (BasicBlock::iterator instr = BB->begin(), ie = BB->end(); instr != ie;
         ++instr) {
      PHINode *P = dyn_cast<PHINode>(instr);
      if (!P)
        break;

      if (!VisitedInstrs.count(P))
        Incoming.push_back(P);
    }

    // Sort by type.
    std::stable_sort(Incoming.begin(), Incoming.end(), PhiTypeSorterFunc);

    // Try to vectorize elements based on their type.
    for (SmallVector<Value *, 4>::iterator IncIt = Incoming.begin(),
                                           E = Incoming.end();
         IncIt != E;) {

      // Look for the next elements with the same type.
      SmallVector<Value *, 4>::iterator SameTypeIt = IncIt;
      while (SameTypeIt != E &&
             (*SameTypeIt)->getType() == (*IncIt)->getType()) {
        VisitedInstrs.insert(*SameTypeIt);
        ++SameTypeIt;
      }

      // Try to vectorize them.
      unsigned NumElts = (SameTypeIt - IncIt);
      DEBUG(errs() << "SLP: Trying to vectorize starting at PHIs (" << NumElts
                   << ")\n");
      if (NumElts > 1 && tryToVectorizeList(makeArrayRef(IncIt, NumElts), R)) {
        // Success, start over because instructions might have been changed.
        HaveVectorizedPhiNodes = true;
        Changed = true;
        break;
      }

      // Start over at the next instruction of a different type (or the end).
      IncIt = SameTypeIt;
    }
  }

  VisitedInstrs.clear();

  for (BasicBlock::iterator it = BB->begin(), e = BB->end(); it != e; it++) {
    // We may go through BB multiple times so skip the one we have checked.
    if (!VisitedInstrs.insert(it))
      continue;

    if (isa<DbgInfoIntrinsic>(it))
      continue;

    // Try to vectorize reductions that use PHINodes.
    if (PHINode *P = dyn_cast<PHINode>(it)) {
      // Check that the PHI is a reduction PHI.
      if (P->getNumIncomingValues() != 2)
        return Changed;
      Value *Rdx =
          (P->getIncomingBlock(0) == BB
               ? (P->getIncomingValue(0))
               : (P->getIncomingBlock(1) == BB ? P->getIncomingValue(1)
                                               : nullptr));
      // Check if this is a Binary Operator.
      BinaryOperator *BI = dyn_cast_or_null<BinaryOperator>(Rdx);
      if (!BI)
        continue;

      // Try to match and vectorize a horizontal reduction.
      HorizontalReduction HorRdx;
      if (ShouldVectorizeHor &&
          HorRdx.matchAssociativeReduction(P, BI, DL) &&
          HorRdx.tryToReduce(R, TTI)) {
        Changed = true;
        it = BB->begin();
        e = BB->end();
        continue;
      }

      Value *Inst = BI->getOperand(0);
      if (Inst == P)
        Inst = BI->getOperand(1);

      if (tryToVectorize(dyn_cast<BinaryOperator>(Inst), R)) {
        // We would like to start over since some instructions are deleted
        // and the iterator may become invalid.
        Changed = true;
        it = BB->begin();
        e = BB->end();
        continue;
      }

      continue;
    }

    // Try to vectorize horizontal reductions feeding into a store.
    if (ShouldStartVectorizeHorAtStore)
      if (StoreInst *SI = dyn_cast<StoreInst>(it))
        if (BinaryOperator *BinOp =
                dyn_cast<BinaryOperator>(SI->getValueOperand())) {
          HorizontalReduction HorRdx;
          if (((HorRdx.matchAssociativeReduction(nullptr, BinOp, DL) &&
                HorRdx.tryToReduce(R, TTI)) ||
               tryToVectorize(BinOp, R))) {
            Changed = true;
            it = BB->begin();
            e = BB->end();
            continue;
          }
        }

    // Try to vectorize trees that start at compare instructions.
    if (CmpInst *CI = dyn_cast<CmpInst>(it)) {
      if (tryToVectorizePair(CI->getOperand(0), CI->getOperand(1), R)) {
        Changed = true;
        // We would like to start over since some instructions are deleted
        // and the iterator may become invalid.
        it = BB->begin();
        e = BB->end();
        continue;
      }

      for (int i = 0; i < 2; ++i) {
        if (BinaryOperator *BI = dyn_cast<BinaryOperator>(CI->getOperand(i))) {
          if (tryToVectorizePair(BI->getOperand(0), BI->getOperand(1), R)) {
            Changed = true;
            // We would like to start over since some instructions are deleted
            // and the iterator may become invalid.
            it = BB->begin();
            e = BB->end();
          }
        }
      }
      continue;
    }

    // Try to vectorize trees that start at insertelement instructions.
    if (InsertElementInst *FirstInsertElem = dyn_cast<InsertElementInst>(it)) {
      SmallVector<Value *, 16> BuildVector;
      SmallVector<Value *, 16> BuildVectorOpds;
      if (!findBuildVector(FirstInsertElem, BuildVector, BuildVectorOpds))
        continue;

      // Vectorize starting with the build vector operands ignoring the
      // BuildVector instructions for the purpose of scheduling and user
      // extraction.
      if (tryToVectorizeList(BuildVectorOpds, R, BuildVector)) {
        Changed = true;
        it = BB->begin();
        e = BB->end();
      }

      continue;
    }
  }

  return Changed;
}

bool SLPVectorizer::vectorizeStoreChains(BoUpSLP &R) {
  bool Changed = false;
  // Attempt to sort and vectorize each of the store-groups.
  for (StoreListMap::iterator it = StoreRefs.begin(), e = StoreRefs.end();
       it != e; ++it) {
    if (it->second.size() < 2)
      continue;

    DEBUG(dbgs() << "SLP: Analyzing a store chain of length "
                 << it->second.size() << ".\n");

    // Process the stores in chunks of 16.
    for (unsigned CI = 0, CE = it->second.size(); CI < CE; CI += 16) {
      unsigned Len = std::min<unsigned>(CE - CI, 16);
      Changed |= vectorizeStores(makeArrayRef(&it->second[CI], Len),
                                 -SLPCostThreshold, R);
    }
  }
  return Changed;
}

} // end anonymous namespace

char SLPVectorizer::ID = 0;
static const char lv_name[] = "SLP Vectorizer";
INITIALIZE_PASS_BEGIN(SLPVectorizer, SV_NAME, lv_name, false, false)
INITIALIZE_AG_DEPENDENCY(AliasAnalysis)
INITIALIZE_AG_DEPENDENCY(TargetTransformInfo)
INITIALIZE_PASS_DEPENDENCY(AssumptionTracker)
INITIALIZE_PASS_DEPENDENCY(ScalarEvolution)
INITIALIZE_PASS_DEPENDENCY(LoopSimplify)
INITIALIZE_PASS_END(SLPVectorizer, SV_NAME, lv_name, false, false)

namespace llvm {
Pass *createSLPVectorizerPass() { return new SLPVectorizer(); }
}
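// A minimal usage sketch (illustrative, not part of the pass): with the
// legacy pass manager one could schedule the pass by hand, e.g.
//   legacy::PassManager PM;
//   PM.add(llvm::createSLPVectorizerPass());
//   PM.run(M);
// In-tree, the pass is normally added by PassManagerBuilder instead.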