1 //===- SLPVectorizer.cpp - A bottom up SLP Vectorizer ---------------------===// 2 // 3 // The LLVM Compiler Infrastructure 4 // 5 // This file is distributed under the University of Illinois Open Source 6 // License. See LICENSE.TXT for details. 7 // 8 //===----------------------------------------------------------------------===// 9 // This pass implements the Bottom Up SLP vectorizer. It detects consecutive 10 // stores that can be put together into vector-stores. Next, it attempts to 11 // construct vectorizable tree using the use-def chains. If a profitable tree 12 // was found, the SLP vectorizer performs vectorization on the tree. 13 // 14 // The pass is inspired by the work described in the paper: 15 // "Loop-Aware SLP in GCC" by Ira Rosen, Dorit Nuzman, Ayal Zaks. 16 // 17 //===----------------------------------------------------------------------===// 18 #include "llvm/Transforms/Vectorize.h" 19 #include "llvm/ADT/MapVector.h" 20 #include "llvm/ADT/PostOrderIterator.h" 21 #include "llvm/ADT/SetVector.h" 22 #include "llvm/ADT/Statistic.h" 23 #include "llvm/Analysis/AliasAnalysis.h" 24 #include "llvm/Analysis/LoopInfo.h" 25 #include "llvm/Analysis/ScalarEvolution.h" 26 #include "llvm/Analysis/ScalarEvolutionExpressions.h" 27 #include "llvm/Analysis/TargetTransformInfo.h" 28 #include "llvm/Analysis/ValueTracking.h" 29 #include "llvm/IR/DataLayout.h" 30 #include "llvm/IR/Dominators.h" 31 #include "llvm/IR/IRBuilder.h" 32 #include "llvm/IR/Instructions.h" 33 #include "llvm/IR/IntrinsicInst.h" 34 #include "llvm/IR/Module.h" 35 #include "llvm/IR/NoFolder.h" 36 #include "llvm/IR/Type.h" 37 #include "llvm/IR/Value.h" 38 #include "llvm/IR/Verifier.h" 39 #include "llvm/Pass.h" 40 #include "llvm/Support/CommandLine.h" 41 #include "llvm/Support/Debug.h" 42 #include "llvm/Support/raw_ostream.h" 43 #include "llvm/Transforms/Utils/VectorUtils.h" 44 #include <algorithm> 45 #include <map> 46 #include <memory> 47 48 using namespace llvm; 49 50 #define SV_NAME "slp-vectorizer" 51 #define DEBUG_TYPE "SLP" 52 53 STATISTIC(NumVectorInstructions, "Number of vector instructions generated"); 54 55 static cl::opt<int> 56 SLPCostThreshold("slp-threshold", cl::init(0), cl::Hidden, 57 cl::desc("Only vectorize if you gain more than this " 58 "number ")); 59 60 static cl::opt<bool> 61 ShouldVectorizeHor("slp-vectorize-hor", cl::init(false), cl::Hidden, 62 cl::desc("Attempt to vectorize horizontal reductions")); 63 64 static cl::opt<bool> ShouldStartVectorizeHorAtStore( 65 "slp-vectorize-hor-store", cl::init(false), cl::Hidden, 66 cl::desc( 67 "Attempt to vectorize horizontal reductions feeding into a store")); 68 69 namespace { 70 71 static const unsigned MinVecRegSize = 128; 72 73 static const unsigned RecursionMaxDepth = 12; 74 75 /// \returns the parent basic block if all of the instructions in \p VL 76 /// are in the same block or null otherwise. 77 static BasicBlock *getSameBlock(ArrayRef<Value *> VL) { 78 Instruction *I0 = dyn_cast<Instruction>(VL[0]); 79 if (!I0) 80 return nullptr; 81 BasicBlock *BB = I0->getParent(); 82 for (int i = 1, e = VL.size(); i < e; i++) { 83 Instruction *I = dyn_cast<Instruction>(VL[i]); 84 if (!I) 85 return nullptr; 86 87 if (BB != I->getParent()) 88 return nullptr; 89 } 90 return BB; 91 } 92 93 /// \returns True if all of the values in \p VL are constants. 94 static bool allConstant(ArrayRef<Value *> VL) { 95 for (unsigned i = 0, e = VL.size(); i < e; ++i) 96 if (!isa<Constant>(VL[i])) 97 return false; 98 return true; 99 } 100 101 /// \returns True if all of the values in \p VL are identical. 102 static bool isSplat(ArrayRef<Value *> VL) { 103 for (unsigned i = 1, e = VL.size(); i < e; ++i) 104 if (VL[i] != VL[0]) 105 return false; 106 return true; 107 } 108 109 ///\returns Opcode that can be clubbed with \p Op to create an alternate 110 /// sequence which can later be merged as a ShuffleVector instruction. 111 static unsigned getAltOpcode(unsigned Op) { 112 switch (Op) { 113 case Instruction::FAdd: 114 return Instruction::FSub; 115 case Instruction::FSub: 116 return Instruction::FAdd; 117 case Instruction::Add: 118 return Instruction::Sub; 119 case Instruction::Sub: 120 return Instruction::Add; 121 default: 122 return 0; 123 } 124 } 125 126 ///\returns bool representing if Opcode \p Op can be part 127 /// of an alternate sequence which can later be merged as 128 /// a ShuffleVector instruction. 129 static bool canCombineAsAltInst(unsigned Op) { 130 if (Op == Instruction::FAdd || Op == Instruction::FSub || 131 Op == Instruction::Sub || Op == Instruction::Add) 132 return true; 133 return false; 134 } 135 136 /// \returns ShuffleVector instruction if intructions in \p VL have 137 /// alternate fadd,fsub / fsub,fadd/add,sub/sub,add sequence. 138 /// (i.e. e.g. opcodes of fadd,fsub,fadd,fsub...) 139 static unsigned isAltInst(ArrayRef<Value *> VL) { 140 Instruction *I0 = dyn_cast<Instruction>(VL[0]); 141 unsigned Opcode = I0->getOpcode(); 142 unsigned AltOpcode = getAltOpcode(Opcode); 143 for (int i = 1, e = VL.size(); i < e; i++) { 144 Instruction *I = dyn_cast<Instruction>(VL[i]); 145 if (!I || I->getOpcode() != ((i & 1) ? AltOpcode : Opcode)) 146 return 0; 147 } 148 return Instruction::ShuffleVector; 149 } 150 151 /// \returns The opcode if all of the Instructions in \p VL have the same 152 /// opcode, or zero. 153 static unsigned getSameOpcode(ArrayRef<Value *> VL) { 154 Instruction *I0 = dyn_cast<Instruction>(VL[0]); 155 if (!I0) 156 return 0; 157 unsigned Opcode = I0->getOpcode(); 158 for (int i = 1, e = VL.size(); i < e; i++) { 159 Instruction *I = dyn_cast<Instruction>(VL[i]); 160 if (!I || Opcode != I->getOpcode()) { 161 if (canCombineAsAltInst(Opcode) && i == 1) 162 return isAltInst(VL); 163 return 0; 164 } 165 } 166 return Opcode; 167 } 168 169 /// Get the intersection (logical and) of all of the potential IR flags 170 /// of each scalar operation (VL) that will be converted into a vector (I). 171 /// Flag set: NSW, NUW, exact, and all of fast-math. 172 static void propagateIRFlags(Value *I, ArrayRef<Value *> VL) { 173 if (auto *VecOp = dyn_cast<BinaryOperator>(I)) { 174 if (auto *Intersection = dyn_cast<BinaryOperator>(VL[0])) { 175 // Intersection is initialized to the 0th scalar, 176 // so start counting from index '1'. 177 for (int i = 1, e = VL.size(); i < e; ++i) { 178 if (auto *Scalar = dyn_cast<BinaryOperator>(VL[i])) 179 Intersection->andIRFlags(Scalar); 180 } 181 VecOp->copyIRFlags(Intersection); 182 } 183 } 184 } 185 186 /// \returns \p I after propagating metadata from \p VL. 187 static Instruction *propagateMetadata(Instruction *I, ArrayRef<Value *> VL) { 188 Instruction *I0 = cast<Instruction>(VL[0]); 189 SmallVector<std::pair<unsigned, MDNode *>, 4> Metadata; 190 I0->getAllMetadataOtherThanDebugLoc(Metadata); 191 192 for (unsigned i = 0, n = Metadata.size(); i != n; ++i) { 193 unsigned Kind = Metadata[i].first; 194 MDNode *MD = Metadata[i].second; 195 196 for (int i = 1, e = VL.size(); MD && i != e; i++) { 197 Instruction *I = cast<Instruction>(VL[i]); 198 MDNode *IMD = I->getMetadata(Kind); 199 200 switch (Kind) { 201 default: 202 MD = nullptr; // Remove unknown metadata 203 break; 204 case LLVMContext::MD_tbaa: 205 MD = MDNode::getMostGenericTBAA(MD, IMD); 206 break; 207 case LLVMContext::MD_alias_scope: 208 case LLVMContext::MD_noalias: 209 MD = MDNode::intersect(MD, IMD); 210 break; 211 case LLVMContext::MD_fpmath: 212 MD = MDNode::getMostGenericFPMath(MD, IMD); 213 break; 214 } 215 } 216 I->setMetadata(Kind, MD); 217 } 218 return I; 219 } 220 221 /// \returns The type that all of the values in \p VL have or null if there 222 /// are different types. 223 static Type* getSameType(ArrayRef<Value *> VL) { 224 Type *Ty = VL[0]->getType(); 225 for (int i = 1, e = VL.size(); i < e; i++) 226 if (VL[i]->getType() != Ty) 227 return nullptr; 228 229 return Ty; 230 } 231 232 /// \returns True if the ExtractElement instructions in VL can be vectorized 233 /// to use the original vector. 234 static bool CanReuseExtract(ArrayRef<Value *> VL) { 235 assert(Instruction::ExtractElement == getSameOpcode(VL) && "Invalid opcode"); 236 // Check if all of the extracts come from the same vector and from the 237 // correct offset. 238 Value *VL0 = VL[0]; 239 ExtractElementInst *E0 = cast<ExtractElementInst>(VL0); 240 Value *Vec = E0->getOperand(0); 241 242 // We have to extract from the same vector type. 243 unsigned NElts = Vec->getType()->getVectorNumElements(); 244 245 if (NElts != VL.size()) 246 return false; 247 248 // Check that all of the indices extract from the correct offset. 249 ConstantInt *CI = dyn_cast<ConstantInt>(E0->getOperand(1)); 250 if (!CI || CI->getZExtValue()) 251 return false; 252 253 for (unsigned i = 1, e = VL.size(); i < e; ++i) { 254 ExtractElementInst *E = cast<ExtractElementInst>(VL[i]); 255 ConstantInt *CI = dyn_cast<ConstantInt>(E->getOperand(1)); 256 257 if (!CI || CI->getZExtValue() != i || E->getOperand(0) != Vec) 258 return false; 259 } 260 261 return true; 262 } 263 264 static void reorderInputsAccordingToOpcode(ArrayRef<Value *> VL, 265 SmallVectorImpl<Value *> &Left, 266 SmallVectorImpl<Value *> &Right) { 267 268 SmallVector<Value *, 16> OrigLeft, OrigRight; 269 270 bool AllSameOpcodeLeft = true; 271 bool AllSameOpcodeRight = true; 272 for (unsigned i = 0, e = VL.size(); i != e; ++i) { 273 Instruction *I = cast<Instruction>(VL[i]); 274 Value *V0 = I->getOperand(0); 275 Value *V1 = I->getOperand(1); 276 277 OrigLeft.push_back(V0); 278 OrigRight.push_back(V1); 279 280 Instruction *I0 = dyn_cast<Instruction>(V0); 281 Instruction *I1 = dyn_cast<Instruction>(V1); 282 283 // Check whether all operands on one side have the same opcode. In this case 284 // we want to preserve the original order and not make things worse by 285 // reordering. 286 AllSameOpcodeLeft = I0; 287 AllSameOpcodeRight = I1; 288 289 if (i && AllSameOpcodeLeft) { 290 if(Instruction *P0 = dyn_cast<Instruction>(OrigLeft[i-1])) { 291 if(P0->getOpcode() != I0->getOpcode()) 292 AllSameOpcodeLeft = false; 293 } else 294 AllSameOpcodeLeft = false; 295 } 296 if (i && AllSameOpcodeRight) { 297 if(Instruction *P1 = dyn_cast<Instruction>(OrigRight[i-1])) { 298 if(P1->getOpcode() != I1->getOpcode()) 299 AllSameOpcodeRight = false; 300 } else 301 AllSameOpcodeRight = false; 302 } 303 304 // Sort two opcodes. In the code below we try to preserve the ability to use 305 // broadcast of values instead of individual inserts. 306 // vl1 = load 307 // vl2 = phi 308 // vr1 = load 309 // vr2 = vr2 310 // = vl1 x vr1 311 // = vl2 x vr2 312 // If we just sorted according to opcode we would leave the first line in 313 // tact but we would swap vl2 with vr2 because opcode(phi) > opcode(load). 314 // = vl1 x vr1 315 // = vr2 x vl2 316 // Because vr2 and vr1 are from the same load we loose the opportunity of a 317 // broadcast for the packed right side in the backend: we have [vr1, vl2] 318 // instead of [vr1, vr2=vr1]. 319 if (I0 && I1) { 320 if(!i && I0->getOpcode() > I1->getOpcode()) { 321 Left.push_back(I1); 322 Right.push_back(I0); 323 } else if (i && I0->getOpcode() > I1->getOpcode() && Right[i-1] != I1) { 324 // Try not to destroy a broad cast for no apparent benefit. 325 Left.push_back(I1); 326 Right.push_back(I0); 327 } else if (i && I0->getOpcode() == I1->getOpcode() && Right[i-1] == I0) { 328 // Try preserve broadcasts. 329 Left.push_back(I1); 330 Right.push_back(I0); 331 } else if (i && I0->getOpcode() == I1->getOpcode() && Left[i-1] == I1) { 332 // Try preserve broadcasts. 333 Left.push_back(I1); 334 Right.push_back(I0); 335 } else { 336 Left.push_back(I0); 337 Right.push_back(I1); 338 } 339 continue; 340 } 341 // One opcode, put the instruction on the right. 342 if (I0) { 343 Left.push_back(V1); 344 Right.push_back(I0); 345 continue; 346 } 347 Left.push_back(V0); 348 Right.push_back(V1); 349 } 350 351 bool LeftBroadcast = isSplat(Left); 352 bool RightBroadcast = isSplat(Right); 353 354 // Don't reorder if the operands where good to begin with. 355 if (!(LeftBroadcast || RightBroadcast) && 356 (AllSameOpcodeRight || AllSameOpcodeLeft)) { 357 Left = OrigLeft; 358 Right = OrigRight; 359 } 360 } 361 362 /// \returns True if in-tree use also needs extract. This refers to 363 /// possible scalar operand in vectorized instruction. 364 static bool InTreeUserNeedToExtract(Value *Scalar, Instruction *UserInst, 365 TargetLibraryInfo *TLI) { 366 367 unsigned Opcode = UserInst->getOpcode(); 368 switch (Opcode) { 369 case Instruction::Load: { 370 LoadInst *LI = cast<LoadInst>(UserInst); 371 return (LI->getPointerOperand() == Scalar); 372 } 373 case Instruction::Store: { 374 StoreInst *SI = cast<StoreInst>(UserInst); 375 return (SI->getPointerOperand() == Scalar); 376 } 377 case Instruction::Call: { 378 CallInst *CI = cast<CallInst>(UserInst); 379 Intrinsic::ID ID = getIntrinsicIDForCall(CI, TLI); 380 if (hasVectorInstrinsicScalarOpd(ID, 1)) { 381 return (CI->getArgOperand(1) == Scalar); 382 } 383 } 384 default: 385 return false; 386 } 387 } 388 389 /// Bottom Up SLP Vectorizer. 390 class BoUpSLP { 391 public: 392 typedef SmallVector<Value *, 8> ValueList; 393 typedef SmallVector<Instruction *, 16> InstrList; 394 typedef SmallPtrSet<Value *, 16> ValueSet; 395 typedef SmallVector<StoreInst *, 8> StoreList; 396 397 BoUpSLP(Function *Func, ScalarEvolution *Se, const DataLayout *Dl, 398 TargetTransformInfo *Tti, TargetLibraryInfo *TLi, AliasAnalysis *Aa, 399 LoopInfo *Li, DominatorTree *Dt) 400 : NumLoadsWantToKeepOrder(0), NumLoadsWantToChangeOrder(0), 401 F(Func), SE(Se), DL(Dl), TTI(Tti), TLI(TLi), AA(Aa), LI(Li), DT(Dt), 402 Builder(Se->getContext()) {} 403 404 /// \brief Vectorize the tree that starts with the elements in \p VL. 405 /// Returns the vectorized root. 406 Value *vectorizeTree(); 407 408 /// \returns the cost incurred by unwanted spills and fills, caused by 409 /// holding live values over call sites. 410 int getSpillCost(); 411 412 /// \returns the vectorization cost of the subtree that starts at \p VL. 413 /// A negative number means that this is profitable. 414 int getTreeCost(); 415 416 /// Construct a vectorizable tree that starts at \p Roots, ignoring users for 417 /// the purpose of scheduling and extraction in the \p UserIgnoreLst. 418 void buildTree(ArrayRef<Value *> Roots, 419 ArrayRef<Value *> UserIgnoreLst = None); 420 421 /// Clear the internal data structures that are created by 'buildTree'. 422 void deleteTree() { 423 VectorizableTree.clear(); 424 ScalarToTreeEntry.clear(); 425 MustGather.clear(); 426 ExternalUses.clear(); 427 NumLoadsWantToKeepOrder = 0; 428 NumLoadsWantToChangeOrder = 0; 429 for (auto &Iter : BlocksSchedules) { 430 BlockScheduling *BS = Iter.second.get(); 431 BS->clear(); 432 } 433 } 434 435 /// \returns true if the memory operations A and B are consecutive. 436 bool isConsecutiveAccess(Value *A, Value *B); 437 438 /// \brief Perform LICM and CSE on the newly generated gather sequences. 439 void optimizeGatherSequence(); 440 441 /// \returns true if it is benefitial to reverse the vector order. 442 bool shouldReorder() const { 443 return NumLoadsWantToChangeOrder > NumLoadsWantToKeepOrder; 444 } 445 446 private: 447 struct TreeEntry; 448 449 /// \returns the cost of the vectorizable entry. 450 int getEntryCost(TreeEntry *E); 451 452 /// This is the recursive part of buildTree. 453 void buildTree_rec(ArrayRef<Value *> Roots, unsigned Depth); 454 455 /// Vectorize a single entry in the tree. 456 Value *vectorizeTree(TreeEntry *E); 457 458 /// Vectorize a single entry in the tree, starting in \p VL. 459 Value *vectorizeTree(ArrayRef<Value *> VL); 460 461 /// \returns the pointer to the vectorized value if \p VL is already 462 /// vectorized, or NULL. They may happen in cycles. 463 Value *alreadyVectorized(ArrayRef<Value *> VL) const; 464 465 /// \brief Take the pointer operand from the Load/Store instruction. 466 /// \returns NULL if this is not a valid Load/Store instruction. 467 static Value *getPointerOperand(Value *I); 468 469 /// \brief Take the address space operand from the Load/Store instruction. 470 /// \returns -1 if this is not a valid Load/Store instruction. 471 static unsigned getAddressSpaceOperand(Value *I); 472 473 /// \returns the scalarization cost for this type. Scalarization in this 474 /// context means the creation of vectors from a group of scalars. 475 int getGatherCost(Type *Ty); 476 477 /// \returns the scalarization cost for this list of values. Assuming that 478 /// this subtree gets vectorized, we may need to extract the values from the 479 /// roots. This method calculates the cost of extracting the values. 480 int getGatherCost(ArrayRef<Value *> VL); 481 482 /// \brief Set the Builder insert point to one after the last instruction in 483 /// the bundle 484 void setInsertPointAfterBundle(ArrayRef<Value *> VL); 485 486 /// \returns a vector from a collection of scalars in \p VL. 487 Value *Gather(ArrayRef<Value *> VL, VectorType *Ty); 488 489 /// \returns whether the VectorizableTree is fully vectoriable and will 490 /// be beneficial even the tree height is tiny. 491 bool isFullyVectorizableTinyTree(); 492 493 struct TreeEntry { 494 TreeEntry() : Scalars(), VectorizedValue(nullptr), 495 NeedToGather(0) {} 496 497 /// \returns true if the scalars in VL are equal to this entry. 498 bool isSame(ArrayRef<Value *> VL) const { 499 assert(VL.size() == Scalars.size() && "Invalid size"); 500 return std::equal(VL.begin(), VL.end(), Scalars.begin()); 501 } 502 503 /// A vector of scalars. 504 ValueList Scalars; 505 506 /// The Scalars are vectorized into this value. It is initialized to Null. 507 Value *VectorizedValue; 508 509 /// Do we need to gather this sequence ? 510 bool NeedToGather; 511 }; 512 513 /// Create a new VectorizableTree entry. 514 TreeEntry *newTreeEntry(ArrayRef<Value *> VL, bool Vectorized) { 515 VectorizableTree.push_back(TreeEntry()); 516 int idx = VectorizableTree.size() - 1; 517 TreeEntry *Last = &VectorizableTree[idx]; 518 Last->Scalars.insert(Last->Scalars.begin(), VL.begin(), VL.end()); 519 Last->NeedToGather = !Vectorized; 520 if (Vectorized) { 521 for (int i = 0, e = VL.size(); i != e; ++i) { 522 assert(!ScalarToTreeEntry.count(VL[i]) && "Scalar already in tree!"); 523 ScalarToTreeEntry[VL[i]] = idx; 524 } 525 } else { 526 MustGather.insert(VL.begin(), VL.end()); 527 } 528 return Last; 529 } 530 531 /// -- Vectorization State -- 532 /// Holds all of the tree entries. 533 std::vector<TreeEntry> VectorizableTree; 534 535 /// Maps a specific scalar to its tree entry. 536 SmallDenseMap<Value*, int> ScalarToTreeEntry; 537 538 /// A list of scalars that we found that we need to keep as scalars. 539 ValueSet MustGather; 540 541 /// This POD struct describes one external user in the vectorized tree. 542 struct ExternalUser { 543 ExternalUser (Value *S, llvm::User *U, int L) : 544 Scalar(S), User(U), Lane(L){}; 545 // Which scalar in our function. 546 Value *Scalar; 547 // Which user that uses the scalar. 548 llvm::User *User; 549 // Which lane does the scalar belong to. 550 int Lane; 551 }; 552 typedef SmallVector<ExternalUser, 16> UserList; 553 554 /// A list of values that need to extracted out of the tree. 555 /// This list holds pairs of (Internal Scalar : External User). 556 UserList ExternalUses; 557 558 /// Holds all of the instructions that we gathered. 559 SetVector<Instruction *> GatherSeq; 560 /// A list of blocks that we are going to CSE. 561 SetVector<BasicBlock *> CSEBlocks; 562 563 /// Contains all scheduling relevant data for an instruction. 564 /// A ScheduleData either represents a single instruction or a member of an 565 /// instruction bundle (= a group of instructions which is combined into a 566 /// vector instruction). 567 struct ScheduleData { 568 569 // The initial value for the dependency counters. It means that the 570 // dependencies are not calculated yet. 571 enum { InvalidDeps = -1 }; 572 573 ScheduleData() 574 : Inst(nullptr), FirstInBundle(nullptr), NextInBundle(nullptr), 575 NextLoadStore(nullptr), SchedulingRegionID(0), SchedulingPriority(0), 576 Dependencies(InvalidDeps), UnscheduledDeps(InvalidDeps), 577 UnscheduledDepsInBundle(InvalidDeps), IsScheduled(false) {} 578 579 void init(int BlockSchedulingRegionID) { 580 FirstInBundle = this; 581 NextInBundle = nullptr; 582 NextLoadStore = nullptr; 583 IsScheduled = false; 584 SchedulingRegionID = BlockSchedulingRegionID; 585 UnscheduledDepsInBundle = UnscheduledDeps; 586 clearDependencies(); 587 } 588 589 /// Returns true if the dependency information has been calculated. 590 bool hasValidDependencies() const { return Dependencies != InvalidDeps; } 591 592 /// Returns true for single instructions and for bundle representatives 593 /// (= the head of a bundle). 594 bool isSchedulingEntity() const { return FirstInBundle == this; } 595 596 /// Returns true if it represents an instruction bundle and not only a 597 /// single instruction. 598 bool isPartOfBundle() const { 599 return NextInBundle != nullptr || FirstInBundle != this; 600 } 601 602 /// Returns true if it is ready for scheduling, i.e. it has no more 603 /// unscheduled depending instructions/bundles. 604 bool isReady() const { 605 assert(isSchedulingEntity() && 606 "can't consider non-scheduling entity for ready list"); 607 return UnscheduledDepsInBundle == 0 && !IsScheduled; 608 } 609 610 /// Modifies the number of unscheduled dependencies, also updating it for 611 /// the whole bundle. 612 int incrementUnscheduledDeps(int Incr) { 613 UnscheduledDeps += Incr; 614 return FirstInBundle->UnscheduledDepsInBundle += Incr; 615 } 616 617 /// Sets the number of unscheduled dependencies to the number of 618 /// dependencies. 619 void resetUnscheduledDeps() { 620 incrementUnscheduledDeps(Dependencies - UnscheduledDeps); 621 } 622 623 /// Clears all dependency information. 624 void clearDependencies() { 625 Dependencies = InvalidDeps; 626 resetUnscheduledDeps(); 627 MemoryDependencies.clear(); 628 } 629 630 void dump(raw_ostream &os) const { 631 if (!isSchedulingEntity()) { 632 os << "/ " << *Inst; 633 } else if (NextInBundle) { 634 os << '[' << *Inst; 635 ScheduleData *SD = NextInBundle; 636 while (SD) { 637 os << ';' << *SD->Inst; 638 SD = SD->NextInBundle; 639 } 640 os << ']'; 641 } else { 642 os << *Inst; 643 } 644 } 645 646 Instruction *Inst; 647 648 /// Points to the head in an instruction bundle (and always to this for 649 /// single instructions). 650 ScheduleData *FirstInBundle; 651 652 /// Single linked list of all instructions in a bundle. Null if it is a 653 /// single instruction. 654 ScheduleData *NextInBundle; 655 656 /// Single linked list of all memory instructions (e.g. load, store, call) 657 /// in the block - until the end of the scheduling region. 658 ScheduleData *NextLoadStore; 659 660 /// The dependent memory instructions. 661 /// This list is derived on demand in calculateDependencies(). 662 SmallVector<ScheduleData *, 4> MemoryDependencies; 663 664 /// This ScheduleData is in the current scheduling region if this matches 665 /// the current SchedulingRegionID of BlockScheduling. 666 int SchedulingRegionID; 667 668 /// Used for getting a "good" final ordering of instructions. 669 int SchedulingPriority; 670 671 /// The number of dependencies. Constitutes of the number of users of the 672 /// instruction plus the number of dependent memory instructions (if any). 673 /// This value is calculated on demand. 674 /// If InvalidDeps, the number of dependencies is not calculated yet. 675 /// 676 int Dependencies; 677 678 /// The number of dependencies minus the number of dependencies of scheduled 679 /// instructions. As soon as this is zero, the instruction/bundle gets ready 680 /// for scheduling. 681 /// Note that this is negative as long as Dependencies is not calculated. 682 int UnscheduledDeps; 683 684 /// The sum of UnscheduledDeps in a bundle. Equals to UnscheduledDeps for 685 /// single instructions. 686 int UnscheduledDepsInBundle; 687 688 /// True if this instruction is scheduled (or considered as scheduled in the 689 /// dry-run). 690 bool IsScheduled; 691 }; 692 693 #ifndef NDEBUG 694 friend raw_ostream &operator<<(raw_ostream &os, 695 const BoUpSLP::ScheduleData &SD); 696 #endif 697 698 /// Contains all scheduling data for a basic block. 699 /// 700 struct BlockScheduling { 701 702 BlockScheduling(BasicBlock *BB) 703 : BB(BB), ChunkSize(BB->size()), ChunkPos(ChunkSize), 704 ScheduleStart(nullptr), ScheduleEnd(nullptr), 705 FirstLoadStoreInRegion(nullptr), LastLoadStoreInRegion(nullptr), 706 // Make sure that the initial SchedulingRegionID is greater than the 707 // initial SchedulingRegionID in ScheduleData (which is 0). 708 SchedulingRegionID(1) {} 709 710 void clear() { 711 ReadyInsts.clear(); 712 ScheduleStart = nullptr; 713 ScheduleEnd = nullptr; 714 FirstLoadStoreInRegion = nullptr; 715 LastLoadStoreInRegion = nullptr; 716 717 // Make a new scheduling region, i.e. all existing ScheduleData is not 718 // in the new region yet. 719 ++SchedulingRegionID; 720 } 721 722 ScheduleData *getScheduleData(Value *V) { 723 ScheduleData *SD = ScheduleDataMap[V]; 724 if (SD && SD->SchedulingRegionID == SchedulingRegionID) 725 return SD; 726 return nullptr; 727 } 728 729 bool isInSchedulingRegion(ScheduleData *SD) { 730 return SD->SchedulingRegionID == SchedulingRegionID; 731 } 732 733 /// Marks an instruction as scheduled and puts all dependent ready 734 /// instructions into the ready-list. 735 template <typename ReadyListType> 736 void schedule(ScheduleData *SD, ReadyListType &ReadyList) { 737 SD->IsScheduled = true; 738 DEBUG(dbgs() << "SLP: schedule " << *SD << "\n"); 739 740 ScheduleData *BundleMember = SD; 741 while (BundleMember) { 742 // Handle the def-use chain dependencies. 743 for (Use &U : BundleMember->Inst->operands()) { 744 ScheduleData *OpDef = getScheduleData(U.get()); 745 if (OpDef && OpDef->hasValidDependencies() && 746 OpDef->incrementUnscheduledDeps(-1) == 0) { 747 // There are no more unscheduled dependencies after decrementing, 748 // so we can put the dependent instruction into the ready list. 749 ScheduleData *DepBundle = OpDef->FirstInBundle; 750 assert(!DepBundle->IsScheduled && 751 "already scheduled bundle gets ready"); 752 ReadyList.insert(DepBundle); 753 DEBUG(dbgs() << "SLP: gets ready (def): " << *DepBundle << "\n"); 754 } 755 } 756 // Handle the memory dependencies. 757 for (ScheduleData *MemoryDepSD : BundleMember->MemoryDependencies) { 758 if (MemoryDepSD->incrementUnscheduledDeps(-1) == 0) { 759 // There are no more unscheduled dependencies after decrementing, 760 // so we can put the dependent instruction into the ready list. 761 ScheduleData *DepBundle = MemoryDepSD->FirstInBundle; 762 assert(!DepBundle->IsScheduled && 763 "already scheduled bundle gets ready"); 764 ReadyList.insert(DepBundle); 765 DEBUG(dbgs() << "SLP: gets ready (mem): " << *DepBundle << "\n"); 766 } 767 } 768 BundleMember = BundleMember->NextInBundle; 769 } 770 } 771 772 /// Put all instructions into the ReadyList which are ready for scheduling. 773 template <typename ReadyListType> 774 void initialFillReadyList(ReadyListType &ReadyList) { 775 for (auto *I = ScheduleStart; I != ScheduleEnd; I = I->getNextNode()) { 776 ScheduleData *SD = getScheduleData(I); 777 if (SD->isSchedulingEntity() && SD->isReady()) { 778 ReadyList.insert(SD); 779 DEBUG(dbgs() << "SLP: initially in ready list: " << *I << "\n"); 780 } 781 } 782 } 783 784 /// Checks if a bundle of instructions can be scheduled, i.e. has no 785 /// cyclic dependencies. This is only a dry-run, no instructions are 786 /// actually moved at this stage. 787 bool tryScheduleBundle(ArrayRef<Value *> VL, AliasAnalysis *AA); 788 789 /// Un-bundles a group of instructions. 790 void cancelScheduling(ArrayRef<Value *> VL); 791 792 /// Extends the scheduling region so that V is inside the region. 793 void extendSchedulingRegion(Value *V); 794 795 /// Initialize the ScheduleData structures for new instructions in the 796 /// scheduling region. 797 void initScheduleData(Instruction *FromI, Instruction *ToI, 798 ScheduleData *PrevLoadStore, 799 ScheduleData *NextLoadStore); 800 801 /// Updates the dependency information of a bundle and of all instructions/ 802 /// bundles which depend on the original bundle. 803 void calculateDependencies(ScheduleData *SD, bool InsertInReadyList, 804 AliasAnalysis *AA); 805 806 /// Sets all instruction in the scheduling region to un-scheduled. 807 void resetSchedule(); 808 809 BasicBlock *BB; 810 811 /// Simple memory allocation for ScheduleData. 812 std::vector<std::unique_ptr<ScheduleData[]>> ScheduleDataChunks; 813 814 /// The size of a ScheduleData array in ScheduleDataChunks. 815 int ChunkSize; 816 817 /// The allocator position in the current chunk, which is the last entry 818 /// of ScheduleDataChunks. 819 int ChunkPos; 820 821 /// Attaches ScheduleData to Instruction. 822 /// Note that the mapping survives during all vectorization iterations, i.e. 823 /// ScheduleData structures are recycled. 824 DenseMap<Value *, ScheduleData *> ScheduleDataMap; 825 826 struct ReadyList : SmallVector<ScheduleData *, 8> { 827 void insert(ScheduleData *SD) { push_back(SD); } 828 }; 829 830 /// The ready-list for scheduling (only used for the dry-run). 831 ReadyList ReadyInsts; 832 833 /// The first instruction of the scheduling region. 834 Instruction *ScheduleStart; 835 836 /// The first instruction _after_ the scheduling region. 837 Instruction *ScheduleEnd; 838 839 /// The first memory accessing instruction in the scheduling region 840 /// (can be null). 841 ScheduleData *FirstLoadStoreInRegion; 842 843 /// The last memory accessing instruction in the scheduling region 844 /// (can be null). 845 ScheduleData *LastLoadStoreInRegion; 846 847 /// The ID of the scheduling region. For a new vectorization iteration this 848 /// is incremented which "removes" all ScheduleData from the region. 849 int SchedulingRegionID; 850 }; 851 852 /// Attaches the BlockScheduling structures to basic blocks. 853 DenseMap<BasicBlock *, std::unique_ptr<BlockScheduling>> BlocksSchedules; 854 855 /// Performs the "real" scheduling. Done before vectorization is actually 856 /// performed in a basic block. 857 void scheduleBlock(BlockScheduling *BS); 858 859 /// List of users to ignore during scheduling and that don't need extracting. 860 ArrayRef<Value *> UserIgnoreList; 861 862 // Number of load-bundles, which contain consecutive loads. 863 int NumLoadsWantToKeepOrder; 864 865 // Number of load-bundles of size 2, which are consecutive loads if reversed. 866 int NumLoadsWantToChangeOrder; 867 868 // Analysis and block reference. 869 Function *F; 870 ScalarEvolution *SE; 871 const DataLayout *DL; 872 TargetTransformInfo *TTI; 873 TargetLibraryInfo *TLI; 874 AliasAnalysis *AA; 875 LoopInfo *LI; 876 DominatorTree *DT; 877 /// Instruction builder to construct the vectorized tree. 878 IRBuilder<> Builder; 879 }; 880 881 #ifndef NDEBUG 882 raw_ostream &operator<<(raw_ostream &os, const BoUpSLP::ScheduleData &SD) { 883 SD.dump(os); 884 return os; 885 } 886 #endif 887 888 void BoUpSLP::buildTree(ArrayRef<Value *> Roots, 889 ArrayRef<Value *> UserIgnoreLst) { 890 deleteTree(); 891 UserIgnoreList = UserIgnoreLst; 892 if (!getSameType(Roots)) 893 return; 894 buildTree_rec(Roots, 0); 895 896 // Collect the values that we need to extract from the tree. 897 for (int EIdx = 0, EE = VectorizableTree.size(); EIdx < EE; ++EIdx) { 898 TreeEntry *Entry = &VectorizableTree[EIdx]; 899 900 // For each lane: 901 for (int Lane = 0, LE = Entry->Scalars.size(); Lane != LE; ++Lane) { 902 Value *Scalar = Entry->Scalars[Lane]; 903 904 // No need to handle users of gathered values. 905 if (Entry->NeedToGather) 906 continue; 907 908 for (User *U : Scalar->users()) { 909 DEBUG(dbgs() << "SLP: Checking user:" << *U << ".\n"); 910 911 Instruction *UserInst = dyn_cast<Instruction>(U); 912 if (!UserInst) 913 continue; 914 915 // Skip in-tree scalars that become vectors 916 if (ScalarToTreeEntry.count(U)) { 917 int Idx = ScalarToTreeEntry[U]; 918 TreeEntry *UseEntry = &VectorizableTree[Idx]; 919 Value *UseScalar = UseEntry->Scalars[0]; 920 // Some in-tree scalars will remain as scalar in vectorized 921 // instructions. If that is the case, the one in Lane 0 will 922 // be used. 923 if (UseScalar != U || 924 !InTreeUserNeedToExtract(Scalar, UserInst, TLI)) { 925 DEBUG(dbgs() << "SLP: \tInternal user will be removed:" << *U 926 << ".\n"); 927 assert(!VectorizableTree[Idx].NeedToGather && "Bad state"); 928 continue; 929 } 930 } 931 932 // Ignore users in the user ignore list. 933 if (std::find(UserIgnoreList.begin(), UserIgnoreList.end(), UserInst) != 934 UserIgnoreList.end()) 935 continue; 936 937 DEBUG(dbgs() << "SLP: Need to extract:" << *U << " from lane " << 938 Lane << " from " << *Scalar << ".\n"); 939 ExternalUses.push_back(ExternalUser(Scalar, U, Lane)); 940 } 941 } 942 } 943 } 944 945 946 void BoUpSLP::buildTree_rec(ArrayRef<Value *> VL, unsigned Depth) { 947 bool SameTy = getSameType(VL); (void)SameTy; 948 bool isAltShuffle = false; 949 assert(SameTy && "Invalid types!"); 950 951 if (Depth == RecursionMaxDepth) { 952 DEBUG(dbgs() << "SLP: Gathering due to max recursion depth.\n"); 953 newTreeEntry(VL, false); 954 return; 955 } 956 957 // Don't handle vectors. 958 if (VL[0]->getType()->isVectorTy()) { 959 DEBUG(dbgs() << "SLP: Gathering due to vector type.\n"); 960 newTreeEntry(VL, false); 961 return; 962 } 963 964 if (StoreInst *SI = dyn_cast<StoreInst>(VL[0])) 965 if (SI->getValueOperand()->getType()->isVectorTy()) { 966 DEBUG(dbgs() << "SLP: Gathering due to store vector type.\n"); 967 newTreeEntry(VL, false); 968 return; 969 } 970 unsigned Opcode = getSameOpcode(VL); 971 972 // Check that this shuffle vector refers to the alternate 973 // sequence of opcodes. 974 if (Opcode == Instruction::ShuffleVector) { 975 Instruction *I0 = dyn_cast<Instruction>(VL[0]); 976 unsigned Op = I0->getOpcode(); 977 if (Op != Instruction::ShuffleVector) 978 isAltShuffle = true; 979 } 980 981 // If all of the operands are identical or constant we have a simple solution. 982 if (allConstant(VL) || isSplat(VL) || !getSameBlock(VL) || !Opcode) { 983 DEBUG(dbgs() << "SLP: Gathering due to C,S,B,O. \n"); 984 newTreeEntry(VL, false); 985 return; 986 } 987 988 // We now know that this is a vector of instructions of the same type from 989 // the same block. 990 991 // Check if this is a duplicate of another entry. 992 if (ScalarToTreeEntry.count(VL[0])) { 993 int Idx = ScalarToTreeEntry[VL[0]]; 994 TreeEntry *E = &VectorizableTree[Idx]; 995 for (unsigned i = 0, e = VL.size(); i != e; ++i) { 996 DEBUG(dbgs() << "SLP: \tChecking bundle: " << *VL[i] << ".\n"); 997 if (E->Scalars[i] != VL[i]) { 998 DEBUG(dbgs() << "SLP: Gathering due to partial overlap.\n"); 999 newTreeEntry(VL, false); 1000 return; 1001 } 1002 } 1003 DEBUG(dbgs() << "SLP: Perfect diamond merge at " << *VL[0] << ".\n"); 1004 return; 1005 } 1006 1007 // Check that none of the instructions in the bundle are already in the tree. 1008 for (unsigned i = 0, e = VL.size(); i != e; ++i) { 1009 if (ScalarToTreeEntry.count(VL[i])) { 1010 DEBUG(dbgs() << "SLP: The instruction (" << *VL[i] << 1011 ") is already in tree.\n"); 1012 newTreeEntry(VL, false); 1013 return; 1014 } 1015 } 1016 1017 // If any of the scalars appears in the table OR it is marked as a value that 1018 // needs to stat scalar then we need to gather the scalars. 1019 for (unsigned i = 0, e = VL.size(); i != e; ++i) { 1020 if (ScalarToTreeEntry.count(VL[i]) || MustGather.count(VL[i])) { 1021 DEBUG(dbgs() << "SLP: Gathering due to gathered scalar. \n"); 1022 newTreeEntry(VL, false); 1023 return; 1024 } 1025 } 1026 1027 // Check that all of the users of the scalars that we want to vectorize are 1028 // schedulable. 1029 Instruction *VL0 = cast<Instruction>(VL[0]); 1030 BasicBlock *BB = cast<Instruction>(VL0)->getParent(); 1031 1032 if (!DT->isReachableFromEntry(BB)) { 1033 // Don't go into unreachable blocks. They may contain instructions with 1034 // dependency cycles which confuse the final scheduling. 1035 DEBUG(dbgs() << "SLP: bundle in unreachable block.\n"); 1036 newTreeEntry(VL, false); 1037 return; 1038 } 1039 1040 // Check that every instructions appears once in this bundle. 1041 for (unsigned i = 0, e = VL.size(); i < e; ++i) 1042 for (unsigned j = i+1; j < e; ++j) 1043 if (VL[i] == VL[j]) { 1044 DEBUG(dbgs() << "SLP: Scalar used twice in bundle.\n"); 1045 newTreeEntry(VL, false); 1046 return; 1047 } 1048 1049 auto &BSRef = BlocksSchedules[BB]; 1050 if (!BSRef) { 1051 BSRef = llvm::make_unique<BlockScheduling>(BB); 1052 } 1053 BlockScheduling &BS = *BSRef.get(); 1054 1055 if (!BS.tryScheduleBundle(VL, AA)) { 1056 DEBUG(dbgs() << "SLP: We are not able to schedule this bundle!\n"); 1057 BS.cancelScheduling(VL); 1058 newTreeEntry(VL, false); 1059 return; 1060 } 1061 DEBUG(dbgs() << "SLP: We are able to schedule this bundle.\n"); 1062 1063 switch (Opcode) { 1064 case Instruction::PHI: { 1065 PHINode *PH = dyn_cast<PHINode>(VL0); 1066 1067 // Check for terminator values (e.g. invoke). 1068 for (unsigned j = 0; j < VL.size(); ++j) 1069 for (unsigned i = 0, e = PH->getNumIncomingValues(); i < e; ++i) { 1070 TerminatorInst *Term = dyn_cast<TerminatorInst>( 1071 cast<PHINode>(VL[j])->getIncomingValueForBlock(PH->getIncomingBlock(i))); 1072 if (Term) { 1073 DEBUG(dbgs() << "SLP: Need to swizzle PHINodes (TerminatorInst use).\n"); 1074 BS.cancelScheduling(VL); 1075 newTreeEntry(VL, false); 1076 return; 1077 } 1078 } 1079 1080 newTreeEntry(VL, true); 1081 DEBUG(dbgs() << "SLP: added a vector of PHINodes.\n"); 1082 1083 for (unsigned i = 0, e = PH->getNumIncomingValues(); i < e; ++i) { 1084 ValueList Operands; 1085 // Prepare the operand vector. 1086 for (unsigned j = 0; j < VL.size(); ++j) 1087 Operands.push_back(cast<PHINode>(VL[j])->getIncomingValueForBlock( 1088 PH->getIncomingBlock(i))); 1089 1090 buildTree_rec(Operands, Depth + 1); 1091 } 1092 return; 1093 } 1094 case Instruction::ExtractElement: { 1095 bool Reuse = CanReuseExtract(VL); 1096 if (Reuse) { 1097 DEBUG(dbgs() << "SLP: Reusing extract sequence.\n"); 1098 } else { 1099 BS.cancelScheduling(VL); 1100 } 1101 newTreeEntry(VL, Reuse); 1102 return; 1103 } 1104 case Instruction::Load: { 1105 // Check if the loads are consecutive or of we need to swizzle them. 1106 for (unsigned i = 0, e = VL.size() - 1; i < e; ++i) { 1107 LoadInst *L = cast<LoadInst>(VL[i]); 1108 if (!L->isSimple()) { 1109 BS.cancelScheduling(VL); 1110 newTreeEntry(VL, false); 1111 DEBUG(dbgs() << "SLP: Gathering non-simple loads.\n"); 1112 return; 1113 } 1114 if (!isConsecutiveAccess(VL[i], VL[i + 1])) { 1115 if (VL.size() == 2 && isConsecutiveAccess(VL[1], VL[0])) { 1116 ++NumLoadsWantToChangeOrder; 1117 } 1118 BS.cancelScheduling(VL); 1119 newTreeEntry(VL, false); 1120 DEBUG(dbgs() << "SLP: Gathering non-consecutive loads.\n"); 1121 return; 1122 } 1123 } 1124 ++NumLoadsWantToKeepOrder; 1125 newTreeEntry(VL, true); 1126 DEBUG(dbgs() << "SLP: added a vector of loads.\n"); 1127 return; 1128 } 1129 case Instruction::ZExt: 1130 case Instruction::SExt: 1131 case Instruction::FPToUI: 1132 case Instruction::FPToSI: 1133 case Instruction::FPExt: 1134 case Instruction::PtrToInt: 1135 case Instruction::IntToPtr: 1136 case Instruction::SIToFP: 1137 case Instruction::UIToFP: 1138 case Instruction::Trunc: 1139 case Instruction::FPTrunc: 1140 case Instruction::BitCast: { 1141 Type *SrcTy = VL0->getOperand(0)->getType(); 1142 for (unsigned i = 0; i < VL.size(); ++i) { 1143 Type *Ty = cast<Instruction>(VL[i])->getOperand(0)->getType(); 1144 if (Ty != SrcTy || Ty->isAggregateType() || Ty->isVectorTy()) { 1145 BS.cancelScheduling(VL); 1146 newTreeEntry(VL, false); 1147 DEBUG(dbgs() << "SLP: Gathering casts with different src types.\n"); 1148 return; 1149 } 1150 } 1151 newTreeEntry(VL, true); 1152 DEBUG(dbgs() << "SLP: added a vector of casts.\n"); 1153 1154 for (unsigned i = 0, e = VL0->getNumOperands(); i < e; ++i) { 1155 ValueList Operands; 1156 // Prepare the operand vector. 1157 for (unsigned j = 0; j < VL.size(); ++j) 1158 Operands.push_back(cast<Instruction>(VL[j])->getOperand(i)); 1159 1160 buildTree_rec(Operands, Depth+1); 1161 } 1162 return; 1163 } 1164 case Instruction::ICmp: 1165 case Instruction::FCmp: { 1166 // Check that all of the compares have the same predicate. 1167 CmpInst::Predicate P0 = dyn_cast<CmpInst>(VL0)->getPredicate(); 1168 Type *ComparedTy = cast<Instruction>(VL[0])->getOperand(0)->getType(); 1169 for (unsigned i = 1, e = VL.size(); i < e; ++i) { 1170 CmpInst *Cmp = cast<CmpInst>(VL[i]); 1171 if (Cmp->getPredicate() != P0 || 1172 Cmp->getOperand(0)->getType() != ComparedTy) { 1173 BS.cancelScheduling(VL); 1174 newTreeEntry(VL, false); 1175 DEBUG(dbgs() << "SLP: Gathering cmp with different predicate.\n"); 1176 return; 1177 } 1178 } 1179 1180 newTreeEntry(VL, true); 1181 DEBUG(dbgs() << "SLP: added a vector of compares.\n"); 1182 1183 for (unsigned i = 0, e = VL0->getNumOperands(); i < e; ++i) { 1184 ValueList Operands; 1185 // Prepare the operand vector. 1186 for (unsigned j = 0; j < VL.size(); ++j) 1187 Operands.push_back(cast<Instruction>(VL[j])->getOperand(i)); 1188 1189 buildTree_rec(Operands, Depth+1); 1190 } 1191 return; 1192 } 1193 case Instruction::Select: 1194 case Instruction::Add: 1195 case Instruction::FAdd: 1196 case Instruction::Sub: 1197 case Instruction::FSub: 1198 case Instruction::Mul: 1199 case Instruction::FMul: 1200 case Instruction::UDiv: 1201 case Instruction::SDiv: 1202 case Instruction::FDiv: 1203 case Instruction::URem: 1204 case Instruction::SRem: 1205 case Instruction::FRem: 1206 case Instruction::Shl: 1207 case Instruction::LShr: 1208 case Instruction::AShr: 1209 case Instruction::And: 1210 case Instruction::Or: 1211 case Instruction::Xor: { 1212 newTreeEntry(VL, true); 1213 DEBUG(dbgs() << "SLP: added a vector of bin op.\n"); 1214 1215 // Sort operands of the instructions so that each side is more likely to 1216 // have the same opcode. 1217 if (isa<BinaryOperator>(VL0) && VL0->isCommutative()) { 1218 ValueList Left, Right; 1219 reorderInputsAccordingToOpcode(VL, Left, Right); 1220 buildTree_rec(Left, Depth + 1); 1221 buildTree_rec(Right, Depth + 1); 1222 return; 1223 } 1224 1225 for (unsigned i = 0, e = VL0->getNumOperands(); i < e; ++i) { 1226 ValueList Operands; 1227 // Prepare the operand vector. 1228 for (unsigned j = 0; j < VL.size(); ++j) 1229 Operands.push_back(cast<Instruction>(VL[j])->getOperand(i)); 1230 1231 buildTree_rec(Operands, Depth+1); 1232 } 1233 return; 1234 } 1235 case Instruction::GetElementPtr: { 1236 // We don't combine GEPs with complicated (nested) indexing. 1237 for (unsigned j = 0; j < VL.size(); ++j) { 1238 if (cast<Instruction>(VL[j])->getNumOperands() != 2) { 1239 DEBUG(dbgs() << "SLP: not-vectorizable GEP (nested indexes).\n"); 1240 BS.cancelScheduling(VL); 1241 newTreeEntry(VL, false); 1242 return; 1243 } 1244 } 1245 1246 // We can't combine several GEPs into one vector if they operate on 1247 // different types. 1248 Type *Ty0 = cast<Instruction>(VL0)->getOperand(0)->getType(); 1249 for (unsigned j = 0; j < VL.size(); ++j) { 1250 Type *CurTy = cast<Instruction>(VL[j])->getOperand(0)->getType(); 1251 if (Ty0 != CurTy) { 1252 DEBUG(dbgs() << "SLP: not-vectorizable GEP (different types).\n"); 1253 BS.cancelScheduling(VL); 1254 newTreeEntry(VL, false); 1255 return; 1256 } 1257 } 1258 1259 // We don't combine GEPs with non-constant indexes. 1260 for (unsigned j = 0; j < VL.size(); ++j) { 1261 auto Op = cast<Instruction>(VL[j])->getOperand(1); 1262 if (!isa<ConstantInt>(Op)) { 1263 DEBUG( 1264 dbgs() << "SLP: not-vectorizable GEP (non-constant indexes).\n"); 1265 BS.cancelScheduling(VL); 1266 newTreeEntry(VL, false); 1267 return; 1268 } 1269 } 1270 1271 newTreeEntry(VL, true); 1272 DEBUG(dbgs() << "SLP: added a vector of GEPs.\n"); 1273 for (unsigned i = 0, e = 2; i < e; ++i) { 1274 ValueList Operands; 1275 // Prepare the operand vector. 1276 for (unsigned j = 0; j < VL.size(); ++j) 1277 Operands.push_back(cast<Instruction>(VL[j])->getOperand(i)); 1278 1279 buildTree_rec(Operands, Depth + 1); 1280 } 1281 return; 1282 } 1283 case Instruction::Store: { 1284 // Check if the stores are consecutive or of we need to swizzle them. 1285 for (unsigned i = 0, e = VL.size() - 1; i < e; ++i) 1286 if (!isConsecutiveAccess(VL[i], VL[i + 1])) { 1287 BS.cancelScheduling(VL); 1288 newTreeEntry(VL, false); 1289 DEBUG(dbgs() << "SLP: Non-consecutive store.\n"); 1290 return; 1291 } 1292 1293 newTreeEntry(VL, true); 1294 DEBUG(dbgs() << "SLP: added a vector of stores.\n"); 1295 1296 ValueList Operands; 1297 for (unsigned j = 0; j < VL.size(); ++j) 1298 Operands.push_back(cast<Instruction>(VL[j])->getOperand(0)); 1299 1300 buildTree_rec(Operands, Depth + 1); 1301 return; 1302 } 1303 case Instruction::Call: { 1304 // Check if the calls are all to the same vectorizable intrinsic. 1305 CallInst *CI = cast<CallInst>(VL[0]); 1306 // Check if this is an Intrinsic call or something that can be 1307 // represented by an intrinsic call 1308 Intrinsic::ID ID = getIntrinsicIDForCall(CI, TLI); 1309 if (!isTriviallyVectorizable(ID)) { 1310 BS.cancelScheduling(VL); 1311 newTreeEntry(VL, false); 1312 DEBUG(dbgs() << "SLP: Non-vectorizable call.\n"); 1313 return; 1314 } 1315 Function *Int = CI->getCalledFunction(); 1316 Value *A1I = nullptr; 1317 if (hasVectorInstrinsicScalarOpd(ID, 1)) 1318 A1I = CI->getArgOperand(1); 1319 for (unsigned i = 1, e = VL.size(); i != e; ++i) { 1320 CallInst *CI2 = dyn_cast<CallInst>(VL[i]); 1321 if (!CI2 || CI2->getCalledFunction() != Int || 1322 getIntrinsicIDForCall(CI2, TLI) != ID) { 1323 BS.cancelScheduling(VL); 1324 newTreeEntry(VL, false); 1325 DEBUG(dbgs() << "SLP: mismatched calls:" << *CI << "!=" << *VL[i] 1326 << "\n"); 1327 return; 1328 } 1329 // ctlz,cttz and powi are special intrinsics whose second argument 1330 // should be same in order for them to be vectorized. 1331 if (hasVectorInstrinsicScalarOpd(ID, 1)) { 1332 Value *A1J = CI2->getArgOperand(1); 1333 if (A1I != A1J) { 1334 BS.cancelScheduling(VL); 1335 newTreeEntry(VL, false); 1336 DEBUG(dbgs() << "SLP: mismatched arguments in call:" << *CI 1337 << " argument "<< A1I<<"!=" << A1J 1338 << "\n"); 1339 return; 1340 } 1341 } 1342 } 1343 1344 newTreeEntry(VL, true); 1345 for (unsigned i = 0, e = CI->getNumArgOperands(); i != e; ++i) { 1346 ValueList Operands; 1347 // Prepare the operand vector. 1348 for (unsigned j = 0; j < VL.size(); ++j) { 1349 CallInst *CI2 = dyn_cast<CallInst>(VL[j]); 1350 Operands.push_back(CI2->getArgOperand(i)); 1351 } 1352 buildTree_rec(Operands, Depth + 1); 1353 } 1354 return; 1355 } 1356 case Instruction::ShuffleVector: { 1357 // If this is not an alternate sequence of opcode like add-sub 1358 // then do not vectorize this instruction. 1359 if (!isAltShuffle) { 1360 BS.cancelScheduling(VL); 1361 newTreeEntry(VL, false); 1362 DEBUG(dbgs() << "SLP: ShuffleVector are not vectorized.\n"); 1363 return; 1364 } 1365 newTreeEntry(VL, true); 1366 DEBUG(dbgs() << "SLP: added a ShuffleVector op.\n"); 1367 for (unsigned i = 0, e = VL0->getNumOperands(); i < e; ++i) { 1368 ValueList Operands; 1369 // Prepare the operand vector. 1370 for (unsigned j = 0; j < VL.size(); ++j) 1371 Operands.push_back(cast<Instruction>(VL[j])->getOperand(i)); 1372 1373 buildTree_rec(Operands, Depth + 1); 1374 } 1375 return; 1376 } 1377 default: 1378 BS.cancelScheduling(VL); 1379 newTreeEntry(VL, false); 1380 DEBUG(dbgs() << "SLP: Gathering unknown instruction.\n"); 1381 return; 1382 } 1383 } 1384 1385 int BoUpSLP::getEntryCost(TreeEntry *E) { 1386 ArrayRef<Value*> VL = E->Scalars; 1387 1388 Type *ScalarTy = VL[0]->getType(); 1389 if (StoreInst *SI = dyn_cast<StoreInst>(VL[0])) 1390 ScalarTy = SI->getValueOperand()->getType(); 1391 VectorType *VecTy = VectorType::get(ScalarTy, VL.size()); 1392 1393 if (E->NeedToGather) { 1394 if (allConstant(VL)) 1395 return 0; 1396 if (isSplat(VL)) { 1397 return TTI->getShuffleCost(TargetTransformInfo::SK_Broadcast, VecTy, 0); 1398 } 1399 return getGatherCost(E->Scalars); 1400 } 1401 unsigned Opcode = getSameOpcode(VL); 1402 assert(Opcode && getSameType(VL) && getSameBlock(VL) && "Invalid VL"); 1403 Instruction *VL0 = cast<Instruction>(VL[0]); 1404 switch (Opcode) { 1405 case Instruction::PHI: { 1406 return 0; 1407 } 1408 case Instruction::ExtractElement: { 1409 if (CanReuseExtract(VL)) { 1410 int DeadCost = 0; 1411 for (unsigned i = 0, e = VL.size(); i < e; ++i) { 1412 ExtractElementInst *E = cast<ExtractElementInst>(VL[i]); 1413 if (E->hasOneUse()) 1414 // Take credit for instruction that will become dead. 1415 DeadCost += 1416 TTI->getVectorInstrCost(Instruction::ExtractElement, VecTy, i); 1417 } 1418 return -DeadCost; 1419 } 1420 return getGatherCost(VecTy); 1421 } 1422 case Instruction::ZExt: 1423 case Instruction::SExt: 1424 case Instruction::FPToUI: 1425 case Instruction::FPToSI: 1426 case Instruction::FPExt: 1427 case Instruction::PtrToInt: 1428 case Instruction::IntToPtr: 1429 case Instruction::SIToFP: 1430 case Instruction::UIToFP: 1431 case Instruction::Trunc: 1432 case Instruction::FPTrunc: 1433 case Instruction::BitCast: { 1434 Type *SrcTy = VL0->getOperand(0)->getType(); 1435 1436 // Calculate the cost of this instruction. 1437 int ScalarCost = VL.size() * TTI->getCastInstrCost(VL0->getOpcode(), 1438 VL0->getType(), SrcTy); 1439 1440 VectorType *SrcVecTy = VectorType::get(SrcTy, VL.size()); 1441 int VecCost = TTI->getCastInstrCost(VL0->getOpcode(), VecTy, SrcVecTy); 1442 return VecCost - ScalarCost; 1443 } 1444 case Instruction::FCmp: 1445 case Instruction::ICmp: 1446 case Instruction::Select: 1447 case Instruction::Add: 1448 case Instruction::FAdd: 1449 case Instruction::Sub: 1450 case Instruction::FSub: 1451 case Instruction::Mul: 1452 case Instruction::FMul: 1453 case Instruction::UDiv: 1454 case Instruction::SDiv: 1455 case Instruction::FDiv: 1456 case Instruction::URem: 1457 case Instruction::SRem: 1458 case Instruction::FRem: 1459 case Instruction::Shl: 1460 case Instruction::LShr: 1461 case Instruction::AShr: 1462 case Instruction::And: 1463 case Instruction::Or: 1464 case Instruction::Xor: { 1465 // Calculate the cost of this instruction. 1466 int ScalarCost = 0; 1467 int VecCost = 0; 1468 if (Opcode == Instruction::FCmp || Opcode == Instruction::ICmp || 1469 Opcode == Instruction::Select) { 1470 VectorType *MaskTy = VectorType::get(Builder.getInt1Ty(), VL.size()); 1471 ScalarCost = VecTy->getNumElements() * 1472 TTI->getCmpSelInstrCost(Opcode, ScalarTy, Builder.getInt1Ty()); 1473 VecCost = TTI->getCmpSelInstrCost(Opcode, VecTy, MaskTy); 1474 } else { 1475 // Certain instructions can be cheaper to vectorize if they have a 1476 // constant second vector operand. 1477 TargetTransformInfo::OperandValueKind Op1VK = 1478 TargetTransformInfo::OK_AnyValue; 1479 TargetTransformInfo::OperandValueKind Op2VK = 1480 TargetTransformInfo::OK_UniformConstantValue; 1481 TargetTransformInfo::OperandValueProperties Op1VP = 1482 TargetTransformInfo::OP_None; 1483 TargetTransformInfo::OperandValueProperties Op2VP = 1484 TargetTransformInfo::OP_None; 1485 1486 // If all operands are exactly the same ConstantInt then set the 1487 // operand kind to OK_UniformConstantValue. 1488 // If instead not all operands are constants, then set the operand kind 1489 // to OK_AnyValue. If all operands are constants but not the same, 1490 // then set the operand kind to OK_NonUniformConstantValue. 1491 ConstantInt *CInt = nullptr; 1492 for (unsigned i = 0; i < VL.size(); ++i) { 1493 const Instruction *I = cast<Instruction>(VL[i]); 1494 if (!isa<ConstantInt>(I->getOperand(1))) { 1495 Op2VK = TargetTransformInfo::OK_AnyValue; 1496 break; 1497 } 1498 if (i == 0) { 1499 CInt = cast<ConstantInt>(I->getOperand(1)); 1500 continue; 1501 } 1502 if (Op2VK == TargetTransformInfo::OK_UniformConstantValue && 1503 CInt != cast<ConstantInt>(I->getOperand(1))) 1504 Op2VK = TargetTransformInfo::OK_NonUniformConstantValue; 1505 } 1506 // FIXME: Currently cost of model modification for division by 1507 // power of 2 is handled only for X86. Add support for other targets. 1508 if (Op2VK == TargetTransformInfo::OK_UniformConstantValue && CInt && 1509 CInt->getValue().isPowerOf2()) 1510 Op2VP = TargetTransformInfo::OP_PowerOf2; 1511 1512 ScalarCost = VecTy->getNumElements() * 1513 TTI->getArithmeticInstrCost(Opcode, ScalarTy, Op1VK, Op2VK, 1514 Op1VP, Op2VP); 1515 VecCost = TTI->getArithmeticInstrCost(Opcode, VecTy, Op1VK, Op2VK, 1516 Op1VP, Op2VP); 1517 } 1518 return VecCost - ScalarCost; 1519 } 1520 case Instruction::GetElementPtr: { 1521 TargetTransformInfo::OperandValueKind Op1VK = 1522 TargetTransformInfo::OK_AnyValue; 1523 TargetTransformInfo::OperandValueKind Op2VK = 1524 TargetTransformInfo::OK_UniformConstantValue; 1525 1526 int ScalarCost = 1527 VecTy->getNumElements() * 1528 TTI->getArithmeticInstrCost(Instruction::Add, ScalarTy, Op1VK, Op2VK); 1529 int VecCost = 1530 TTI->getArithmeticInstrCost(Instruction::Add, VecTy, Op1VK, Op2VK); 1531 1532 return VecCost - ScalarCost; 1533 } 1534 case Instruction::Load: { 1535 // Cost of wide load - cost of scalar loads. 1536 int ScalarLdCost = VecTy->getNumElements() * 1537 TTI->getMemoryOpCost(Instruction::Load, ScalarTy, 1, 0); 1538 int VecLdCost = TTI->getMemoryOpCost(Instruction::Load, VecTy, 1, 0); 1539 return VecLdCost - ScalarLdCost; 1540 } 1541 case Instruction::Store: { 1542 // We know that we can merge the stores. Calculate the cost. 1543 int ScalarStCost = VecTy->getNumElements() * 1544 TTI->getMemoryOpCost(Instruction::Store, ScalarTy, 1, 0); 1545 int VecStCost = TTI->getMemoryOpCost(Instruction::Store, VecTy, 1, 0); 1546 return VecStCost - ScalarStCost; 1547 } 1548 case Instruction::Call: { 1549 CallInst *CI = cast<CallInst>(VL0); 1550 Intrinsic::ID ID = getIntrinsicIDForCall(CI, TLI); 1551 1552 // Calculate the cost of the scalar and vector calls. 1553 SmallVector<Type*, 4> ScalarTys, VecTys; 1554 for (unsigned op = 0, opc = CI->getNumArgOperands(); op!= opc; ++op) { 1555 ScalarTys.push_back(CI->getArgOperand(op)->getType()); 1556 VecTys.push_back(VectorType::get(CI->getArgOperand(op)->getType(), 1557 VecTy->getNumElements())); 1558 } 1559 1560 int ScalarCallCost = VecTy->getNumElements() * 1561 TTI->getIntrinsicInstrCost(ID, ScalarTy, ScalarTys); 1562 1563 int VecCallCost = TTI->getIntrinsicInstrCost(ID, VecTy, VecTys); 1564 1565 DEBUG(dbgs() << "SLP: Call cost "<< VecCallCost - ScalarCallCost 1566 << " (" << VecCallCost << "-" << ScalarCallCost << ")" 1567 << " for " << *CI << "\n"); 1568 1569 return VecCallCost - ScalarCallCost; 1570 } 1571 case Instruction::ShuffleVector: { 1572 TargetTransformInfo::OperandValueKind Op1VK = 1573 TargetTransformInfo::OK_AnyValue; 1574 TargetTransformInfo::OperandValueKind Op2VK = 1575 TargetTransformInfo::OK_AnyValue; 1576 int ScalarCost = 0; 1577 int VecCost = 0; 1578 for (unsigned i = 0; i < VL.size(); ++i) { 1579 Instruction *I = cast<Instruction>(VL[i]); 1580 if (!I) 1581 break; 1582 ScalarCost += 1583 TTI->getArithmeticInstrCost(I->getOpcode(), ScalarTy, Op1VK, Op2VK); 1584 } 1585 // VecCost is equal to sum of the cost of creating 2 vectors 1586 // and the cost of creating shuffle. 1587 Instruction *I0 = cast<Instruction>(VL[0]); 1588 VecCost = 1589 TTI->getArithmeticInstrCost(I0->getOpcode(), VecTy, Op1VK, Op2VK); 1590 Instruction *I1 = cast<Instruction>(VL[1]); 1591 VecCost += 1592 TTI->getArithmeticInstrCost(I1->getOpcode(), VecTy, Op1VK, Op2VK); 1593 VecCost += 1594 TTI->getShuffleCost(TargetTransformInfo::SK_Alternate, VecTy, 0); 1595 return VecCost - ScalarCost; 1596 } 1597 default: 1598 llvm_unreachable("Unknown instruction"); 1599 } 1600 } 1601 1602 bool BoUpSLP::isFullyVectorizableTinyTree() { 1603 DEBUG(dbgs() << "SLP: Check whether the tree with height " << 1604 VectorizableTree.size() << " is fully vectorizable .\n"); 1605 1606 // We only handle trees of height 2. 1607 if (VectorizableTree.size() != 2) 1608 return false; 1609 1610 // Handle splat stores. 1611 if (!VectorizableTree[0].NeedToGather && isSplat(VectorizableTree[1].Scalars)) 1612 return true; 1613 1614 // Gathering cost would be too much for tiny trees. 1615 if (VectorizableTree[0].NeedToGather || VectorizableTree[1].NeedToGather) 1616 return false; 1617 1618 return true; 1619 } 1620 1621 int BoUpSLP::getSpillCost() { 1622 // Walk from the bottom of the tree to the top, tracking which values are 1623 // live. When we see a call instruction that is not part of our tree, 1624 // query TTI to see if there is a cost to keeping values live over it 1625 // (for example, if spills and fills are required). 1626 unsigned BundleWidth = VectorizableTree.front().Scalars.size(); 1627 int Cost = 0; 1628 1629 SmallPtrSet<Instruction*, 4> LiveValues; 1630 Instruction *PrevInst = nullptr; 1631 1632 for (unsigned N = 0; N < VectorizableTree.size(); ++N) { 1633 Instruction *Inst = dyn_cast<Instruction>(VectorizableTree[N].Scalars[0]); 1634 if (!Inst) 1635 continue; 1636 1637 if (!PrevInst) { 1638 PrevInst = Inst; 1639 continue; 1640 } 1641 1642 DEBUG( 1643 dbgs() << "SLP: #LV: " << LiveValues.size(); 1644 for (auto *X : LiveValues) 1645 dbgs() << " " << X->getName(); 1646 dbgs() << ", Looking at "; 1647 Inst->dump(); 1648 ); 1649 1650 // Update LiveValues. 1651 LiveValues.erase(PrevInst); 1652 for (auto &J : PrevInst->operands()) { 1653 if (isa<Instruction>(&*J) && ScalarToTreeEntry.count(&*J)) 1654 LiveValues.insert(cast<Instruction>(&*J)); 1655 } 1656 1657 // Now find the sequence of instructions between PrevInst and Inst. 1658 BasicBlock::reverse_iterator InstIt(Inst), PrevInstIt(PrevInst); 1659 --PrevInstIt; 1660 while (InstIt != PrevInstIt) { 1661 if (PrevInstIt == PrevInst->getParent()->rend()) { 1662 PrevInstIt = Inst->getParent()->rbegin(); 1663 continue; 1664 } 1665 1666 if (isa<CallInst>(&*PrevInstIt) && &*PrevInstIt != PrevInst) { 1667 SmallVector<Type*, 4> V; 1668 for (auto *II : LiveValues) 1669 V.push_back(VectorType::get(II->getType(), BundleWidth)); 1670 Cost += TTI->getCostOfKeepingLiveOverCall(V); 1671 } 1672 1673 ++PrevInstIt; 1674 } 1675 1676 PrevInst = Inst; 1677 } 1678 1679 DEBUG(dbgs() << "SLP: SpillCost=" << Cost << "\n"); 1680 return Cost; 1681 } 1682 1683 int BoUpSLP::getTreeCost() { 1684 int Cost = 0; 1685 DEBUG(dbgs() << "SLP: Calculating cost for tree of size " << 1686 VectorizableTree.size() << ".\n"); 1687 1688 // We only vectorize tiny trees if it is fully vectorizable. 1689 if (VectorizableTree.size() < 3 && !isFullyVectorizableTinyTree()) { 1690 if (!VectorizableTree.size()) { 1691 assert(!ExternalUses.size() && "We should not have any external users"); 1692 } 1693 return INT_MAX; 1694 } 1695 1696 unsigned BundleWidth = VectorizableTree[0].Scalars.size(); 1697 1698 for (unsigned i = 0, e = VectorizableTree.size(); i != e; ++i) { 1699 int C = getEntryCost(&VectorizableTree[i]); 1700 DEBUG(dbgs() << "SLP: Adding cost " << C << " for bundle that starts with " 1701 << *VectorizableTree[i].Scalars[0] << " .\n"); 1702 Cost += C; 1703 } 1704 1705 SmallSet<Value *, 16> ExtractCostCalculated; 1706 int ExtractCost = 0; 1707 for (UserList::iterator I = ExternalUses.begin(), E = ExternalUses.end(); 1708 I != E; ++I) { 1709 // We only add extract cost once for the same scalar. 1710 if (!ExtractCostCalculated.insert(I->Scalar)) 1711 continue; 1712 1713 VectorType *VecTy = VectorType::get(I->Scalar->getType(), BundleWidth); 1714 ExtractCost += TTI->getVectorInstrCost(Instruction::ExtractElement, VecTy, 1715 I->Lane); 1716 } 1717 1718 Cost += getSpillCost(); 1719 1720 DEBUG(dbgs() << "SLP: Total Cost " << Cost + ExtractCost<< ".\n"); 1721 return Cost + ExtractCost; 1722 } 1723 1724 int BoUpSLP::getGatherCost(Type *Ty) { 1725 int Cost = 0; 1726 for (unsigned i = 0, e = cast<VectorType>(Ty)->getNumElements(); i < e; ++i) 1727 Cost += TTI->getVectorInstrCost(Instruction::InsertElement, Ty, i); 1728 return Cost; 1729 } 1730 1731 int BoUpSLP::getGatherCost(ArrayRef<Value *> VL) { 1732 // Find the type of the operands in VL. 1733 Type *ScalarTy = VL[0]->getType(); 1734 if (StoreInst *SI = dyn_cast<StoreInst>(VL[0])) 1735 ScalarTy = SI->getValueOperand()->getType(); 1736 VectorType *VecTy = VectorType::get(ScalarTy, VL.size()); 1737 // Find the cost of inserting/extracting values from the vector. 1738 return getGatherCost(VecTy); 1739 } 1740 1741 Value *BoUpSLP::getPointerOperand(Value *I) { 1742 if (LoadInst *LI = dyn_cast<LoadInst>(I)) 1743 return LI->getPointerOperand(); 1744 if (StoreInst *SI = dyn_cast<StoreInst>(I)) 1745 return SI->getPointerOperand(); 1746 return nullptr; 1747 } 1748 1749 unsigned BoUpSLP::getAddressSpaceOperand(Value *I) { 1750 if (LoadInst *L = dyn_cast<LoadInst>(I)) 1751 return L->getPointerAddressSpace(); 1752 if (StoreInst *S = dyn_cast<StoreInst>(I)) 1753 return S->getPointerAddressSpace(); 1754 return -1; 1755 } 1756 1757 bool BoUpSLP::isConsecutiveAccess(Value *A, Value *B) { 1758 Value *PtrA = getPointerOperand(A); 1759 Value *PtrB = getPointerOperand(B); 1760 unsigned ASA = getAddressSpaceOperand(A); 1761 unsigned ASB = getAddressSpaceOperand(B); 1762 1763 // Check that the address spaces match and that the pointers are valid. 1764 if (!PtrA || !PtrB || (ASA != ASB)) 1765 return false; 1766 1767 // Make sure that A and B are different pointers of the same type. 1768 if (PtrA == PtrB || PtrA->getType() != PtrB->getType()) 1769 return false; 1770 1771 unsigned PtrBitWidth = DL->getPointerSizeInBits(ASA); 1772 Type *Ty = cast<PointerType>(PtrA->getType())->getElementType(); 1773 APInt Size(PtrBitWidth, DL->getTypeStoreSize(Ty)); 1774 1775 APInt OffsetA(PtrBitWidth, 0), OffsetB(PtrBitWidth, 0); 1776 PtrA = PtrA->stripAndAccumulateInBoundsConstantOffsets(*DL, OffsetA); 1777 PtrB = PtrB->stripAndAccumulateInBoundsConstantOffsets(*DL, OffsetB); 1778 1779 APInt OffsetDelta = OffsetB - OffsetA; 1780 1781 // Check if they are based on the same pointer. That makes the offsets 1782 // sufficient. 1783 if (PtrA == PtrB) 1784 return OffsetDelta == Size; 1785 1786 // Compute the necessary base pointer delta to have the necessary final delta 1787 // equal to the size. 1788 APInt BaseDelta = Size - OffsetDelta; 1789 1790 // Otherwise compute the distance with SCEV between the base pointers. 1791 const SCEV *PtrSCEVA = SE->getSCEV(PtrA); 1792 const SCEV *PtrSCEVB = SE->getSCEV(PtrB); 1793 const SCEV *C = SE->getConstant(BaseDelta); 1794 const SCEV *X = SE->getAddExpr(PtrSCEVA, C); 1795 return X == PtrSCEVB; 1796 } 1797 1798 void BoUpSLP::setInsertPointAfterBundle(ArrayRef<Value *> VL) { 1799 Instruction *VL0 = cast<Instruction>(VL[0]); 1800 BasicBlock::iterator NextInst = VL0; 1801 ++NextInst; 1802 Builder.SetInsertPoint(VL0->getParent(), NextInst); 1803 Builder.SetCurrentDebugLocation(VL0->getDebugLoc()); 1804 } 1805 1806 Value *BoUpSLP::Gather(ArrayRef<Value *> VL, VectorType *Ty) { 1807 Value *Vec = UndefValue::get(Ty); 1808 // Generate the 'InsertElement' instruction. 1809 for (unsigned i = 0; i < Ty->getNumElements(); ++i) { 1810 Vec = Builder.CreateInsertElement(Vec, VL[i], Builder.getInt32(i)); 1811 if (Instruction *Insrt = dyn_cast<Instruction>(Vec)) { 1812 GatherSeq.insert(Insrt); 1813 CSEBlocks.insert(Insrt->getParent()); 1814 1815 // Add to our 'need-to-extract' list. 1816 if (ScalarToTreeEntry.count(VL[i])) { 1817 int Idx = ScalarToTreeEntry[VL[i]]; 1818 TreeEntry *E = &VectorizableTree[Idx]; 1819 // Find which lane we need to extract. 1820 int FoundLane = -1; 1821 for (unsigned Lane = 0, LE = VL.size(); Lane != LE; ++Lane) { 1822 // Is this the lane of the scalar that we are looking for ? 1823 if (E->Scalars[Lane] == VL[i]) { 1824 FoundLane = Lane; 1825 break; 1826 } 1827 } 1828 assert(FoundLane >= 0 && "Could not find the correct lane"); 1829 ExternalUses.push_back(ExternalUser(VL[i], Insrt, FoundLane)); 1830 } 1831 } 1832 } 1833 1834 return Vec; 1835 } 1836 1837 Value *BoUpSLP::alreadyVectorized(ArrayRef<Value *> VL) const { 1838 SmallDenseMap<Value*, int>::const_iterator Entry 1839 = ScalarToTreeEntry.find(VL[0]); 1840 if (Entry != ScalarToTreeEntry.end()) { 1841 int Idx = Entry->second; 1842 const TreeEntry *En = &VectorizableTree[Idx]; 1843 if (En->isSame(VL) && En->VectorizedValue) 1844 return En->VectorizedValue; 1845 } 1846 return nullptr; 1847 } 1848 1849 Value *BoUpSLP::vectorizeTree(ArrayRef<Value *> VL) { 1850 if (ScalarToTreeEntry.count(VL[0])) { 1851 int Idx = ScalarToTreeEntry[VL[0]]; 1852 TreeEntry *E = &VectorizableTree[Idx]; 1853 if (E->isSame(VL)) 1854 return vectorizeTree(E); 1855 } 1856 1857 Type *ScalarTy = VL[0]->getType(); 1858 if (StoreInst *SI = dyn_cast<StoreInst>(VL[0])) 1859 ScalarTy = SI->getValueOperand()->getType(); 1860 VectorType *VecTy = VectorType::get(ScalarTy, VL.size()); 1861 1862 return Gather(VL, VecTy); 1863 } 1864 1865 Value *BoUpSLP::vectorizeTree(TreeEntry *E) { 1866 IRBuilder<>::InsertPointGuard Guard(Builder); 1867 1868 if (E->VectorizedValue) { 1869 DEBUG(dbgs() << "SLP: Diamond merged for " << *E->Scalars[0] << ".\n"); 1870 return E->VectorizedValue; 1871 } 1872 1873 Instruction *VL0 = cast<Instruction>(E->Scalars[0]); 1874 Type *ScalarTy = VL0->getType(); 1875 if (StoreInst *SI = dyn_cast<StoreInst>(VL0)) 1876 ScalarTy = SI->getValueOperand()->getType(); 1877 VectorType *VecTy = VectorType::get(ScalarTy, E->Scalars.size()); 1878 1879 if (E->NeedToGather) { 1880 setInsertPointAfterBundle(E->Scalars); 1881 return Gather(E->Scalars, VecTy); 1882 } 1883 1884 unsigned Opcode = getSameOpcode(E->Scalars); 1885 1886 switch (Opcode) { 1887 case Instruction::PHI: { 1888 PHINode *PH = dyn_cast<PHINode>(VL0); 1889 Builder.SetInsertPoint(PH->getParent()->getFirstNonPHI()); 1890 Builder.SetCurrentDebugLocation(PH->getDebugLoc()); 1891 PHINode *NewPhi = Builder.CreatePHI(VecTy, PH->getNumIncomingValues()); 1892 E->VectorizedValue = NewPhi; 1893 1894 // PHINodes may have multiple entries from the same block. We want to 1895 // visit every block once. 1896 SmallSet<BasicBlock*, 4> VisitedBBs; 1897 1898 for (unsigned i = 0, e = PH->getNumIncomingValues(); i < e; ++i) { 1899 ValueList Operands; 1900 BasicBlock *IBB = PH->getIncomingBlock(i); 1901 1902 if (!VisitedBBs.insert(IBB)) { 1903 NewPhi->addIncoming(NewPhi->getIncomingValueForBlock(IBB), IBB); 1904 continue; 1905 } 1906 1907 // Prepare the operand vector. 1908 for (unsigned j = 0; j < E->Scalars.size(); ++j) 1909 Operands.push_back(cast<PHINode>(E->Scalars[j])-> 1910 getIncomingValueForBlock(IBB)); 1911 1912 Builder.SetInsertPoint(IBB->getTerminator()); 1913 Builder.SetCurrentDebugLocation(PH->getDebugLoc()); 1914 Value *Vec = vectorizeTree(Operands); 1915 NewPhi->addIncoming(Vec, IBB); 1916 } 1917 1918 assert(NewPhi->getNumIncomingValues() == PH->getNumIncomingValues() && 1919 "Invalid number of incoming values"); 1920 return NewPhi; 1921 } 1922 1923 case Instruction::ExtractElement: { 1924 if (CanReuseExtract(E->Scalars)) { 1925 Value *V = VL0->getOperand(0); 1926 E->VectorizedValue = V; 1927 return V; 1928 } 1929 return Gather(E->Scalars, VecTy); 1930 } 1931 case Instruction::ZExt: 1932 case Instruction::SExt: 1933 case Instruction::FPToUI: 1934 case Instruction::FPToSI: 1935 case Instruction::FPExt: 1936 case Instruction::PtrToInt: 1937 case Instruction::IntToPtr: 1938 case Instruction::SIToFP: 1939 case Instruction::UIToFP: 1940 case Instruction::Trunc: 1941 case Instruction::FPTrunc: 1942 case Instruction::BitCast: { 1943 ValueList INVL; 1944 for (int i = 0, e = E->Scalars.size(); i < e; ++i) 1945 INVL.push_back(cast<Instruction>(E->Scalars[i])->getOperand(0)); 1946 1947 setInsertPointAfterBundle(E->Scalars); 1948 1949 Value *InVec = vectorizeTree(INVL); 1950 1951 if (Value *V = alreadyVectorized(E->Scalars)) 1952 return V; 1953 1954 CastInst *CI = dyn_cast<CastInst>(VL0); 1955 Value *V = Builder.CreateCast(CI->getOpcode(), InVec, VecTy); 1956 E->VectorizedValue = V; 1957 ++NumVectorInstructions; 1958 return V; 1959 } 1960 case Instruction::FCmp: 1961 case Instruction::ICmp: { 1962 ValueList LHSV, RHSV; 1963 for (int i = 0, e = E->Scalars.size(); i < e; ++i) { 1964 LHSV.push_back(cast<Instruction>(E->Scalars[i])->getOperand(0)); 1965 RHSV.push_back(cast<Instruction>(E->Scalars[i])->getOperand(1)); 1966 } 1967 1968 setInsertPointAfterBundle(E->Scalars); 1969 1970 Value *L = vectorizeTree(LHSV); 1971 Value *R = vectorizeTree(RHSV); 1972 1973 if (Value *V = alreadyVectorized(E->Scalars)) 1974 return V; 1975 1976 CmpInst::Predicate P0 = dyn_cast<CmpInst>(VL0)->getPredicate(); 1977 Value *V; 1978 if (Opcode == Instruction::FCmp) 1979 V = Builder.CreateFCmp(P0, L, R); 1980 else 1981 V = Builder.CreateICmp(P0, L, R); 1982 1983 E->VectorizedValue = V; 1984 ++NumVectorInstructions; 1985 return V; 1986 } 1987 case Instruction::Select: { 1988 ValueList TrueVec, FalseVec, CondVec; 1989 for (int i = 0, e = E->Scalars.size(); i < e; ++i) { 1990 CondVec.push_back(cast<Instruction>(E->Scalars[i])->getOperand(0)); 1991 TrueVec.push_back(cast<Instruction>(E->Scalars[i])->getOperand(1)); 1992 FalseVec.push_back(cast<Instruction>(E->Scalars[i])->getOperand(2)); 1993 } 1994 1995 setInsertPointAfterBundle(E->Scalars); 1996 1997 Value *Cond = vectorizeTree(CondVec); 1998 Value *True = vectorizeTree(TrueVec); 1999 Value *False = vectorizeTree(FalseVec); 2000 2001 if (Value *V = alreadyVectorized(E->Scalars)) 2002 return V; 2003 2004 Value *V = Builder.CreateSelect(Cond, True, False); 2005 E->VectorizedValue = V; 2006 ++NumVectorInstructions; 2007 return V; 2008 } 2009 case Instruction::Add: 2010 case Instruction::FAdd: 2011 case Instruction::Sub: 2012 case Instruction::FSub: 2013 case Instruction::Mul: 2014 case Instruction::FMul: 2015 case Instruction::UDiv: 2016 case Instruction::SDiv: 2017 case Instruction::FDiv: 2018 case Instruction::URem: 2019 case Instruction::SRem: 2020 case Instruction::FRem: 2021 case Instruction::Shl: 2022 case Instruction::LShr: 2023 case Instruction::AShr: 2024 case Instruction::And: 2025 case Instruction::Or: 2026 case Instruction::Xor: { 2027 ValueList LHSVL, RHSVL; 2028 if (isa<BinaryOperator>(VL0) && VL0->isCommutative()) 2029 reorderInputsAccordingToOpcode(E->Scalars, LHSVL, RHSVL); 2030 else 2031 for (int i = 0, e = E->Scalars.size(); i < e; ++i) { 2032 LHSVL.push_back(cast<Instruction>(E->Scalars[i])->getOperand(0)); 2033 RHSVL.push_back(cast<Instruction>(E->Scalars[i])->getOperand(1)); 2034 } 2035 2036 setInsertPointAfterBundle(E->Scalars); 2037 2038 Value *LHS = vectorizeTree(LHSVL); 2039 Value *RHS = vectorizeTree(RHSVL); 2040 2041 if (LHS == RHS && isa<Instruction>(LHS)) { 2042 assert((VL0->getOperand(0) == VL0->getOperand(1)) && "Invalid order"); 2043 } 2044 2045 if (Value *V = alreadyVectorized(E->Scalars)) 2046 return V; 2047 2048 BinaryOperator *BinOp = cast<BinaryOperator>(VL0); 2049 Value *V = Builder.CreateBinOp(BinOp->getOpcode(), LHS, RHS); 2050 E->VectorizedValue = V; 2051 propagateIRFlags(E->VectorizedValue, E->Scalars); 2052 ++NumVectorInstructions; 2053 2054 if (Instruction *I = dyn_cast<Instruction>(V)) 2055 return propagateMetadata(I, E->Scalars); 2056 2057 return V; 2058 } 2059 case Instruction::Load: { 2060 // Loads are inserted at the head of the tree because we don't want to 2061 // sink them all the way down past store instructions. 2062 setInsertPointAfterBundle(E->Scalars); 2063 2064 LoadInst *LI = cast<LoadInst>(VL0); 2065 Type *ScalarLoadTy = LI->getType(); 2066 unsigned AS = LI->getPointerAddressSpace(); 2067 2068 Value *VecPtr = Builder.CreateBitCast(LI->getPointerOperand(), 2069 VecTy->getPointerTo(AS)); 2070 2071 // The pointer operand uses an in-tree scalar so we add the new BitCast to 2072 // ExternalUses list to make sure that an extract will be generated in the 2073 // future. 2074 if (ScalarToTreeEntry.count(LI->getPointerOperand())) 2075 ExternalUses.push_back( 2076 ExternalUser(LI->getPointerOperand(), cast<User>(VecPtr), 0)); 2077 2078 unsigned Alignment = LI->getAlignment(); 2079 LI = Builder.CreateLoad(VecPtr); 2080 if (!Alignment) 2081 Alignment = DL->getABITypeAlignment(ScalarLoadTy); 2082 LI->setAlignment(Alignment); 2083 E->VectorizedValue = LI; 2084 ++NumVectorInstructions; 2085 return propagateMetadata(LI, E->Scalars); 2086 } 2087 case Instruction::Store: { 2088 StoreInst *SI = cast<StoreInst>(VL0); 2089 unsigned Alignment = SI->getAlignment(); 2090 unsigned AS = SI->getPointerAddressSpace(); 2091 2092 ValueList ValueOp; 2093 for (int i = 0, e = E->Scalars.size(); i < e; ++i) 2094 ValueOp.push_back(cast<StoreInst>(E->Scalars[i])->getValueOperand()); 2095 2096 setInsertPointAfterBundle(E->Scalars); 2097 2098 Value *VecValue = vectorizeTree(ValueOp); 2099 Value *VecPtr = Builder.CreateBitCast(SI->getPointerOperand(), 2100 VecTy->getPointerTo(AS)); 2101 StoreInst *S = Builder.CreateStore(VecValue, VecPtr); 2102 2103 // The pointer operand uses an in-tree scalar so we add the new BitCast to 2104 // ExternalUses list to make sure that an extract will be generated in the 2105 // future. 2106 if (ScalarToTreeEntry.count(SI->getPointerOperand())) 2107 ExternalUses.push_back( 2108 ExternalUser(SI->getPointerOperand(), cast<User>(VecPtr), 0)); 2109 2110 if (!Alignment) 2111 Alignment = DL->getABITypeAlignment(SI->getValueOperand()->getType()); 2112 S->setAlignment(Alignment); 2113 E->VectorizedValue = S; 2114 ++NumVectorInstructions; 2115 return propagateMetadata(S, E->Scalars); 2116 } 2117 case Instruction::GetElementPtr: { 2118 setInsertPointAfterBundle(E->Scalars); 2119 2120 ValueList Op0VL; 2121 for (int i = 0, e = E->Scalars.size(); i < e; ++i) 2122 Op0VL.push_back(cast<GetElementPtrInst>(E->Scalars[i])->getOperand(0)); 2123 2124 Value *Op0 = vectorizeTree(Op0VL); 2125 2126 std::vector<Value *> OpVecs; 2127 for (int j = 1, e = cast<GetElementPtrInst>(VL0)->getNumOperands(); j < e; 2128 ++j) { 2129 ValueList OpVL; 2130 for (int i = 0, e = E->Scalars.size(); i < e; ++i) 2131 OpVL.push_back(cast<GetElementPtrInst>(E->Scalars[i])->getOperand(j)); 2132 2133 Value *OpVec = vectorizeTree(OpVL); 2134 OpVecs.push_back(OpVec); 2135 } 2136 2137 Value *V = Builder.CreateGEP(Op0, OpVecs); 2138 E->VectorizedValue = V; 2139 ++NumVectorInstructions; 2140 2141 if (Instruction *I = dyn_cast<Instruction>(V)) 2142 return propagateMetadata(I, E->Scalars); 2143 2144 return V; 2145 } 2146 case Instruction::Call: { 2147 CallInst *CI = cast<CallInst>(VL0); 2148 setInsertPointAfterBundle(E->Scalars); 2149 Function *FI; 2150 Intrinsic::ID IID = Intrinsic::not_intrinsic; 2151 Value *ScalarArg = nullptr; 2152 if (CI && (FI = CI->getCalledFunction())) { 2153 IID = (Intrinsic::ID) FI->getIntrinsicID(); 2154 } 2155 std::vector<Value *> OpVecs; 2156 for (int j = 0, e = CI->getNumArgOperands(); j < e; ++j) { 2157 ValueList OpVL; 2158 // ctlz,cttz and powi are special intrinsics whose second argument is 2159 // a scalar. This argument should not be vectorized. 2160 if (hasVectorInstrinsicScalarOpd(IID, 1) && j == 1) { 2161 CallInst *CEI = cast<CallInst>(E->Scalars[0]); 2162 ScalarArg = CEI->getArgOperand(j); 2163 OpVecs.push_back(CEI->getArgOperand(j)); 2164 continue; 2165 } 2166 for (int i = 0, e = E->Scalars.size(); i < e; ++i) { 2167 CallInst *CEI = cast<CallInst>(E->Scalars[i]); 2168 OpVL.push_back(CEI->getArgOperand(j)); 2169 } 2170 2171 Value *OpVec = vectorizeTree(OpVL); 2172 DEBUG(dbgs() << "SLP: OpVec[" << j << "]: " << *OpVec << "\n"); 2173 OpVecs.push_back(OpVec); 2174 } 2175 2176 Module *M = F->getParent(); 2177 Intrinsic::ID ID = getIntrinsicIDForCall(CI, TLI); 2178 Type *Tys[] = { VectorType::get(CI->getType(), E->Scalars.size()) }; 2179 Function *CF = Intrinsic::getDeclaration(M, ID, Tys); 2180 Value *V = Builder.CreateCall(CF, OpVecs); 2181 2182 // The scalar argument uses an in-tree scalar so we add the new vectorized 2183 // call to ExternalUses list to make sure that an extract will be 2184 // generated in the future. 2185 if (ScalarArg && ScalarToTreeEntry.count(ScalarArg)) 2186 ExternalUses.push_back(ExternalUser(ScalarArg, cast<User>(V), 0)); 2187 2188 E->VectorizedValue = V; 2189 ++NumVectorInstructions; 2190 return V; 2191 } 2192 case Instruction::ShuffleVector: { 2193 ValueList LHSVL, RHSVL; 2194 for (int i = 0, e = E->Scalars.size(); i < e; ++i) { 2195 LHSVL.push_back(cast<Instruction>(E->Scalars[i])->getOperand(0)); 2196 RHSVL.push_back(cast<Instruction>(E->Scalars[i])->getOperand(1)); 2197 } 2198 setInsertPointAfterBundle(E->Scalars); 2199 2200 Value *LHS = vectorizeTree(LHSVL); 2201 Value *RHS = vectorizeTree(RHSVL); 2202 2203 if (Value *V = alreadyVectorized(E->Scalars)) 2204 return V; 2205 2206 // Create a vector of LHS op1 RHS 2207 BinaryOperator *BinOp0 = cast<BinaryOperator>(VL0); 2208 Value *V0 = Builder.CreateBinOp(BinOp0->getOpcode(), LHS, RHS); 2209 2210 // Create a vector of LHS op2 RHS 2211 Instruction *VL1 = cast<Instruction>(E->Scalars[1]); 2212 BinaryOperator *BinOp1 = cast<BinaryOperator>(VL1); 2213 Value *V1 = Builder.CreateBinOp(BinOp1->getOpcode(), LHS, RHS); 2214 2215 // Create shuffle to take alternate operations from the vector. 2216 // Also, gather up odd and even scalar ops to propagate IR flags to 2217 // each vector operation. 2218 ValueList OddScalars, EvenScalars; 2219 unsigned e = E->Scalars.size(); 2220 SmallVector<Constant *, 8> Mask(e); 2221 for (unsigned i = 0; i < e; ++i) { 2222 if (i & 1) { 2223 Mask[i] = Builder.getInt32(e + i); 2224 OddScalars.push_back(E->Scalars[i]); 2225 } else { 2226 Mask[i] = Builder.getInt32(i); 2227 EvenScalars.push_back(E->Scalars[i]); 2228 } 2229 } 2230 2231 Value *ShuffleMask = ConstantVector::get(Mask); 2232 propagateIRFlags(V0, EvenScalars); 2233 propagateIRFlags(V1, OddScalars); 2234 2235 Value *V = Builder.CreateShuffleVector(V0, V1, ShuffleMask); 2236 E->VectorizedValue = V; 2237 ++NumVectorInstructions; 2238 if (Instruction *I = dyn_cast<Instruction>(V)) 2239 return propagateMetadata(I, E->Scalars); 2240 2241 return V; 2242 } 2243 default: 2244 llvm_unreachable("unknown inst"); 2245 } 2246 return nullptr; 2247 } 2248 2249 Value *BoUpSLP::vectorizeTree() { 2250 2251 // All blocks must be scheduled before any instructions are inserted. 2252 for (auto &BSIter : BlocksSchedules) { 2253 scheduleBlock(BSIter.second.get()); 2254 } 2255 2256 Builder.SetInsertPoint(F->getEntryBlock().begin()); 2257 vectorizeTree(&VectorizableTree[0]); 2258 2259 DEBUG(dbgs() << "SLP: Extracting " << ExternalUses.size() << " values .\n"); 2260 2261 // Extract all of the elements with the external uses. 2262 for (UserList::iterator it = ExternalUses.begin(), e = ExternalUses.end(); 2263 it != e; ++it) { 2264 Value *Scalar = it->Scalar; 2265 llvm::User *User = it->User; 2266 2267 // Skip users that we already RAUW. This happens when one instruction 2268 // has multiple uses of the same value. 2269 if (std::find(Scalar->user_begin(), Scalar->user_end(), User) == 2270 Scalar->user_end()) 2271 continue; 2272 assert(ScalarToTreeEntry.count(Scalar) && "Invalid scalar"); 2273 2274 int Idx = ScalarToTreeEntry[Scalar]; 2275 TreeEntry *E = &VectorizableTree[Idx]; 2276 assert(!E->NeedToGather && "Extracting from a gather list"); 2277 2278 Value *Vec = E->VectorizedValue; 2279 assert(Vec && "Can't find vectorizable value"); 2280 2281 Value *Lane = Builder.getInt32(it->Lane); 2282 // Generate extracts for out-of-tree users. 2283 // Find the insertion point for the extractelement lane. 2284 if (isa<Instruction>(Vec)){ 2285 if (PHINode *PH = dyn_cast<PHINode>(User)) { 2286 for (int i = 0, e = PH->getNumIncomingValues(); i != e; ++i) { 2287 if (PH->getIncomingValue(i) == Scalar) { 2288 Builder.SetInsertPoint(PH->getIncomingBlock(i)->getTerminator()); 2289 Value *Ex = Builder.CreateExtractElement(Vec, Lane); 2290 CSEBlocks.insert(PH->getIncomingBlock(i)); 2291 PH->setOperand(i, Ex); 2292 } 2293 } 2294 } else { 2295 Builder.SetInsertPoint(cast<Instruction>(User)); 2296 Value *Ex = Builder.CreateExtractElement(Vec, Lane); 2297 CSEBlocks.insert(cast<Instruction>(User)->getParent()); 2298 User->replaceUsesOfWith(Scalar, Ex); 2299 } 2300 } else { 2301 Builder.SetInsertPoint(F->getEntryBlock().begin()); 2302 Value *Ex = Builder.CreateExtractElement(Vec, Lane); 2303 CSEBlocks.insert(&F->getEntryBlock()); 2304 User->replaceUsesOfWith(Scalar, Ex); 2305 } 2306 2307 DEBUG(dbgs() << "SLP: Replaced:" << *User << ".\n"); 2308 } 2309 2310 // For each vectorized value: 2311 for (int EIdx = 0, EE = VectorizableTree.size(); EIdx < EE; ++EIdx) { 2312 TreeEntry *Entry = &VectorizableTree[EIdx]; 2313 2314 // For each lane: 2315 for (int Lane = 0, LE = Entry->Scalars.size(); Lane != LE; ++Lane) { 2316 Value *Scalar = Entry->Scalars[Lane]; 2317 // No need to handle users of gathered values. 2318 if (Entry->NeedToGather) 2319 continue; 2320 2321 assert(Entry->VectorizedValue && "Can't find vectorizable value"); 2322 2323 Type *Ty = Scalar->getType(); 2324 if (!Ty->isVoidTy()) { 2325 #ifndef NDEBUG 2326 for (User *U : Scalar->users()) { 2327 DEBUG(dbgs() << "SLP: \tvalidating user:" << *U << ".\n"); 2328 2329 assert((ScalarToTreeEntry.count(U) || 2330 // It is legal to replace users in the ignorelist by undef. 2331 (std::find(UserIgnoreList.begin(), UserIgnoreList.end(), U) != 2332 UserIgnoreList.end())) && 2333 "Replacing out-of-tree value with undef"); 2334 } 2335 #endif 2336 Value *Undef = UndefValue::get(Ty); 2337 Scalar->replaceAllUsesWith(Undef); 2338 } 2339 DEBUG(dbgs() << "SLP: \tErasing scalar:" << *Scalar << ".\n"); 2340 cast<Instruction>(Scalar)->eraseFromParent(); 2341 } 2342 } 2343 2344 Builder.ClearInsertionPoint(); 2345 2346 return VectorizableTree[0].VectorizedValue; 2347 } 2348 2349 void BoUpSLP::optimizeGatherSequence() { 2350 DEBUG(dbgs() << "SLP: Optimizing " << GatherSeq.size() 2351 << " gather sequences instructions.\n"); 2352 // LICM InsertElementInst sequences. 2353 for (SetVector<Instruction *>::iterator it = GatherSeq.begin(), 2354 e = GatherSeq.end(); it != e; ++it) { 2355 InsertElementInst *Insert = dyn_cast<InsertElementInst>(*it); 2356 2357 if (!Insert) 2358 continue; 2359 2360 // Check if this block is inside a loop. 2361 Loop *L = LI->getLoopFor(Insert->getParent()); 2362 if (!L) 2363 continue; 2364 2365 // Check if it has a preheader. 2366 BasicBlock *PreHeader = L->getLoopPreheader(); 2367 if (!PreHeader) 2368 continue; 2369 2370 // If the vector or the element that we insert into it are 2371 // instructions that are defined in this basic block then we can't 2372 // hoist this instruction. 2373 Instruction *CurrVec = dyn_cast<Instruction>(Insert->getOperand(0)); 2374 Instruction *NewElem = dyn_cast<Instruction>(Insert->getOperand(1)); 2375 if (CurrVec && L->contains(CurrVec)) 2376 continue; 2377 if (NewElem && L->contains(NewElem)) 2378 continue; 2379 2380 // We can hoist this instruction. Move it to the pre-header. 2381 Insert->moveBefore(PreHeader->getTerminator()); 2382 } 2383 2384 // Make a list of all reachable blocks in our CSE queue. 2385 SmallVector<const DomTreeNode *, 8> CSEWorkList; 2386 CSEWorkList.reserve(CSEBlocks.size()); 2387 for (BasicBlock *BB : CSEBlocks) 2388 if (DomTreeNode *N = DT->getNode(BB)) { 2389 assert(DT->isReachableFromEntry(N)); 2390 CSEWorkList.push_back(N); 2391 } 2392 2393 // Sort blocks by domination. This ensures we visit a block after all blocks 2394 // dominating it are visited. 2395 std::stable_sort(CSEWorkList.begin(), CSEWorkList.end(), 2396 [this](const DomTreeNode *A, const DomTreeNode *B) { 2397 return DT->properlyDominates(A, B); 2398 }); 2399 2400 // Perform O(N^2) search over the gather sequences and merge identical 2401 // instructions. TODO: We can further optimize this scan if we split the 2402 // instructions into different buckets based on the insert lane. 2403 SmallVector<Instruction *, 16> Visited; 2404 for (auto I = CSEWorkList.begin(), E = CSEWorkList.end(); I != E; ++I) { 2405 assert((I == CSEWorkList.begin() || !DT->dominates(*I, *std::prev(I))) && 2406 "Worklist not sorted properly!"); 2407 BasicBlock *BB = (*I)->getBlock(); 2408 // For all instructions in blocks containing gather sequences: 2409 for (BasicBlock::iterator it = BB->begin(), e = BB->end(); it != e;) { 2410 Instruction *In = it++; 2411 if (!isa<InsertElementInst>(In) && !isa<ExtractElementInst>(In)) 2412 continue; 2413 2414 // Check if we can replace this instruction with any of the 2415 // visited instructions. 2416 for (SmallVectorImpl<Instruction *>::iterator v = Visited.begin(), 2417 ve = Visited.end(); 2418 v != ve; ++v) { 2419 if (In->isIdenticalTo(*v) && 2420 DT->dominates((*v)->getParent(), In->getParent())) { 2421 In->replaceAllUsesWith(*v); 2422 In->eraseFromParent(); 2423 In = nullptr; 2424 break; 2425 } 2426 } 2427 if (In) { 2428 assert(std::find(Visited.begin(), Visited.end(), In) == Visited.end()); 2429 Visited.push_back(In); 2430 } 2431 } 2432 } 2433 CSEBlocks.clear(); 2434 GatherSeq.clear(); 2435 } 2436 2437 // Groups the instructions to a bundle (which is then a single scheduling entity) 2438 // and schedules instructions until the bundle gets ready. 2439 bool BoUpSLP::BlockScheduling::tryScheduleBundle(ArrayRef<Value *> VL, 2440 AliasAnalysis *AA) { 2441 if (isa<PHINode>(VL[0])) 2442 return true; 2443 2444 // Initialize the instruction bundle. 2445 Instruction *OldScheduleEnd = ScheduleEnd; 2446 ScheduleData *PrevInBundle = nullptr; 2447 ScheduleData *Bundle = nullptr; 2448 bool ReSchedule = false; 2449 DEBUG(dbgs() << "SLP: bundle: " << *VL[0] << "\n"); 2450 for (Value *V : VL) { 2451 extendSchedulingRegion(V); 2452 ScheduleData *BundleMember = getScheduleData(V); 2453 assert(BundleMember && 2454 "no ScheduleData for bundle member (maybe not in same basic block)"); 2455 if (BundleMember->IsScheduled) { 2456 // A bundle member was scheduled as single instruction before and now 2457 // needs to be scheduled as part of the bundle. We just get rid of the 2458 // existing schedule. 2459 DEBUG(dbgs() << "SLP: reset schedule because " << *BundleMember 2460 << " was already scheduled\n"); 2461 ReSchedule = true; 2462 } 2463 assert(BundleMember->isSchedulingEntity() && 2464 "bundle member already part of other bundle"); 2465 if (PrevInBundle) { 2466 PrevInBundle->NextInBundle = BundleMember; 2467 } else { 2468 Bundle = BundleMember; 2469 } 2470 BundleMember->UnscheduledDepsInBundle = 0; 2471 Bundle->UnscheduledDepsInBundle += BundleMember->UnscheduledDeps; 2472 2473 // Group the instructions to a bundle. 2474 BundleMember->FirstInBundle = Bundle; 2475 PrevInBundle = BundleMember; 2476 } 2477 if (ScheduleEnd != OldScheduleEnd) { 2478 // The scheduling region got new instructions at the lower end (or it is a 2479 // new region for the first bundle). This makes it necessary to 2480 // recalculate all dependencies. 2481 // It is seldom that this needs to be done a second time after adding the 2482 // initial bundle to the region. 2483 for (auto *I = ScheduleStart; I != ScheduleEnd; I = I->getNextNode()) { 2484 ScheduleData *SD = getScheduleData(I); 2485 SD->clearDependencies(); 2486 } 2487 ReSchedule = true; 2488 } 2489 if (ReSchedule) { 2490 resetSchedule(); 2491 initialFillReadyList(ReadyInsts); 2492 } 2493 2494 DEBUG(dbgs() << "SLP: try schedule bundle " << *Bundle << " in block " 2495 << BB->getName() << "\n"); 2496 2497 calculateDependencies(Bundle, true, AA); 2498 2499 // Now try to schedule the new bundle. As soon as the bundle is "ready" it 2500 // means that there are no cyclic dependencies and we can schedule it. 2501 // Note that's important that we don't "schedule" the bundle yet (see 2502 // cancelScheduling). 2503 while (!Bundle->isReady() && !ReadyInsts.empty()) { 2504 2505 ScheduleData *pickedSD = ReadyInsts.back(); 2506 ReadyInsts.pop_back(); 2507 2508 if (pickedSD->isSchedulingEntity() && pickedSD->isReady()) { 2509 schedule(pickedSD, ReadyInsts); 2510 } 2511 } 2512 return Bundle->isReady(); 2513 } 2514 2515 void BoUpSLP::BlockScheduling::cancelScheduling(ArrayRef<Value *> VL) { 2516 if (isa<PHINode>(VL[0])) 2517 return; 2518 2519 ScheduleData *Bundle = getScheduleData(VL[0]); 2520 DEBUG(dbgs() << "SLP: cancel scheduling of " << *Bundle << "\n"); 2521 assert(!Bundle->IsScheduled && 2522 "Can't cancel bundle which is already scheduled"); 2523 assert(Bundle->isSchedulingEntity() && Bundle->isPartOfBundle() && 2524 "tried to unbundle something which is not a bundle"); 2525 2526 // Un-bundle: make single instructions out of the bundle. 2527 ScheduleData *BundleMember = Bundle; 2528 while (BundleMember) { 2529 assert(BundleMember->FirstInBundle == Bundle && "corrupt bundle links"); 2530 BundleMember->FirstInBundle = BundleMember; 2531 ScheduleData *Next = BundleMember->NextInBundle; 2532 BundleMember->NextInBundle = nullptr; 2533 BundleMember->UnscheduledDepsInBundle = BundleMember->UnscheduledDeps; 2534 if (BundleMember->UnscheduledDepsInBundle == 0) { 2535 ReadyInsts.insert(BundleMember); 2536 } 2537 BundleMember = Next; 2538 } 2539 } 2540 2541 void BoUpSLP::BlockScheduling::extendSchedulingRegion(Value *V) { 2542 if (getScheduleData(V)) 2543 return; 2544 Instruction *I = dyn_cast<Instruction>(V); 2545 assert(I && "bundle member must be an instruction"); 2546 assert(!isa<PHINode>(I) && "phi nodes don't need to be scheduled"); 2547 if (!ScheduleStart) { 2548 // It's the first instruction in the new region. 2549 initScheduleData(I, I->getNextNode(), nullptr, nullptr); 2550 ScheduleStart = I; 2551 ScheduleEnd = I->getNextNode(); 2552 assert(ScheduleEnd && "tried to vectorize a TerminatorInst?"); 2553 DEBUG(dbgs() << "SLP: initialize schedule region to " << *I << "\n"); 2554 return; 2555 } 2556 // Search up and down at the same time, because we don't know if the new 2557 // instruction is above or below the existing scheduling region. 2558 BasicBlock::reverse_iterator UpIter(ScheduleStart); 2559 BasicBlock::reverse_iterator UpperEnd = BB->rend(); 2560 BasicBlock::iterator DownIter(ScheduleEnd); 2561 BasicBlock::iterator LowerEnd = BB->end(); 2562 for (;;) { 2563 if (UpIter != UpperEnd) { 2564 if (&*UpIter == I) { 2565 initScheduleData(I, ScheduleStart, nullptr, FirstLoadStoreInRegion); 2566 ScheduleStart = I; 2567 DEBUG(dbgs() << "SLP: extend schedule region start to " << *I << "\n"); 2568 return; 2569 } 2570 UpIter++; 2571 } 2572 if (DownIter != LowerEnd) { 2573 if (&*DownIter == I) { 2574 initScheduleData(ScheduleEnd, I->getNextNode(), LastLoadStoreInRegion, 2575 nullptr); 2576 ScheduleEnd = I->getNextNode(); 2577 assert(ScheduleEnd && "tried to vectorize a TerminatorInst?"); 2578 DEBUG(dbgs() << "SLP: extend schedule region end to " << *I << "\n"); 2579 return; 2580 } 2581 DownIter++; 2582 } 2583 assert((UpIter != UpperEnd || DownIter != LowerEnd) && 2584 "instruction not found in block"); 2585 } 2586 } 2587 2588 void BoUpSLP::BlockScheduling::initScheduleData(Instruction *FromI, 2589 Instruction *ToI, 2590 ScheduleData *PrevLoadStore, 2591 ScheduleData *NextLoadStore) { 2592 ScheduleData *CurrentLoadStore = PrevLoadStore; 2593 for (Instruction *I = FromI; I != ToI; I = I->getNextNode()) { 2594 ScheduleData *SD = ScheduleDataMap[I]; 2595 if (!SD) { 2596 // Allocate a new ScheduleData for the instruction. 2597 if (ChunkPos >= ChunkSize) { 2598 ScheduleDataChunks.push_back( 2599 llvm::make_unique<ScheduleData[]>(ChunkSize)); 2600 ChunkPos = 0; 2601 } 2602 SD = &(ScheduleDataChunks.back()[ChunkPos++]); 2603 ScheduleDataMap[I] = SD; 2604 SD->Inst = I; 2605 } 2606 assert(!isInSchedulingRegion(SD) && 2607 "new ScheduleData already in scheduling region"); 2608 SD->init(SchedulingRegionID); 2609 2610 if (I->mayReadOrWriteMemory()) { 2611 // Update the linked list of memory accessing instructions. 2612 if (CurrentLoadStore) { 2613 CurrentLoadStore->NextLoadStore = SD; 2614 } else { 2615 FirstLoadStoreInRegion = SD; 2616 } 2617 CurrentLoadStore = SD; 2618 } 2619 } 2620 if (NextLoadStore) { 2621 if (CurrentLoadStore) 2622 CurrentLoadStore->NextLoadStore = NextLoadStore; 2623 } else { 2624 LastLoadStoreInRegion = CurrentLoadStore; 2625 } 2626 } 2627 2628 /// \returns the AA location that is being access by the instruction. 2629 static AliasAnalysis::Location getLocation(Instruction *I, AliasAnalysis *AA) { 2630 if (StoreInst *SI = dyn_cast<StoreInst>(I)) 2631 return AA->getLocation(SI); 2632 if (LoadInst *LI = dyn_cast<LoadInst>(I)) 2633 return AA->getLocation(LI); 2634 return AliasAnalysis::Location(); 2635 } 2636 2637 void BoUpSLP::BlockScheduling::calculateDependencies(ScheduleData *SD, 2638 bool InsertInReadyList, 2639 AliasAnalysis *AA) { 2640 assert(SD->isSchedulingEntity()); 2641 2642 SmallVector<ScheduleData *, 10> WorkList; 2643 WorkList.push_back(SD); 2644 2645 while (!WorkList.empty()) { 2646 ScheduleData *SD = WorkList.back(); 2647 WorkList.pop_back(); 2648 2649 ScheduleData *BundleMember = SD; 2650 while (BundleMember) { 2651 assert(isInSchedulingRegion(BundleMember)); 2652 if (!BundleMember->hasValidDependencies()) { 2653 2654 DEBUG(dbgs() << "SLP: update deps of " << *BundleMember << "\n"); 2655 BundleMember->Dependencies = 0; 2656 BundleMember->resetUnscheduledDeps(); 2657 2658 // Handle def-use chain dependencies. 2659 for (User *U : BundleMember->Inst->users()) { 2660 if (isa<Instruction>(U)) { 2661 ScheduleData *UseSD = getScheduleData(U); 2662 if (UseSD && isInSchedulingRegion(UseSD->FirstInBundle)) { 2663 BundleMember->Dependencies++; 2664 ScheduleData *DestBundle = UseSD->FirstInBundle; 2665 if (!DestBundle->IsScheduled) { 2666 BundleMember->incrementUnscheduledDeps(1); 2667 } 2668 if (!DestBundle->hasValidDependencies()) { 2669 WorkList.push_back(DestBundle); 2670 } 2671 } 2672 } else { 2673 // I'm not sure if this can ever happen. But we need to be safe. 2674 // This lets the instruction/bundle never be scheduled and eventally 2675 // disable vectorization. 2676 BundleMember->Dependencies++; 2677 BundleMember->incrementUnscheduledDeps(1); 2678 } 2679 } 2680 2681 // Handle the memory dependencies. 2682 ScheduleData *DepDest = BundleMember->NextLoadStore; 2683 if (DepDest) { 2684 AliasAnalysis::Location SrcLoc = getLocation(BundleMember->Inst, AA); 2685 bool SrcMayWrite = BundleMember->Inst->mayWriteToMemory(); 2686 2687 while (DepDest) { 2688 assert(isInSchedulingRegion(DepDest)); 2689 if (SrcMayWrite || DepDest->Inst->mayWriteToMemory()) { 2690 AliasAnalysis::Location DstLoc = getLocation(DepDest->Inst, AA); 2691 if (!SrcLoc.Ptr || !DstLoc.Ptr || AA->alias(SrcLoc, DstLoc)) { 2692 DepDest->MemoryDependencies.push_back(BundleMember); 2693 BundleMember->Dependencies++; 2694 ScheduleData *DestBundle = DepDest->FirstInBundle; 2695 if (!DestBundle->IsScheduled) { 2696 BundleMember->incrementUnscheduledDeps(1); 2697 } 2698 if (!DestBundle->hasValidDependencies()) { 2699 WorkList.push_back(DestBundle); 2700 } 2701 } 2702 } 2703 DepDest = DepDest->NextLoadStore; 2704 } 2705 } 2706 } 2707 BundleMember = BundleMember->NextInBundle; 2708 } 2709 if (InsertInReadyList && SD->isReady()) { 2710 ReadyInsts.push_back(SD); 2711 DEBUG(dbgs() << "SLP: gets ready on update: " << *SD->Inst << "\n"); 2712 } 2713 } 2714 } 2715 2716 void BoUpSLP::BlockScheduling::resetSchedule() { 2717 assert(ScheduleStart && 2718 "tried to reset schedule on block which has not been scheduled"); 2719 for (Instruction *I = ScheduleStart; I != ScheduleEnd; I = I->getNextNode()) { 2720 ScheduleData *SD = getScheduleData(I); 2721 assert(isInSchedulingRegion(SD)); 2722 SD->IsScheduled = false; 2723 SD->resetUnscheduledDeps(); 2724 } 2725 ReadyInsts.clear(); 2726 } 2727 2728 void BoUpSLP::scheduleBlock(BlockScheduling *BS) { 2729 2730 if (!BS->ScheduleStart) 2731 return; 2732 2733 DEBUG(dbgs() << "SLP: schedule block " << BS->BB->getName() << "\n"); 2734 2735 BS->resetSchedule(); 2736 2737 // For the real scheduling we use a more sophisticated ready-list: it is 2738 // sorted by the original instruction location. This lets the final schedule 2739 // be as close as possible to the original instruction order. 2740 struct ScheduleDataCompare { 2741 bool operator()(ScheduleData *SD1, ScheduleData *SD2) { 2742 return SD2->SchedulingPriority < SD1->SchedulingPriority; 2743 } 2744 }; 2745 std::set<ScheduleData *, ScheduleDataCompare> ReadyInsts; 2746 2747 // Ensure that all depencency data is updated and fill the ready-list with 2748 // initial instructions. 2749 int Idx = 0; 2750 int NumToSchedule = 0; 2751 for (auto *I = BS->ScheduleStart; I != BS->ScheduleEnd; 2752 I = I->getNextNode()) { 2753 ScheduleData *SD = BS->getScheduleData(I); 2754 assert( 2755 SD->isPartOfBundle() == (ScalarToTreeEntry.count(SD->Inst) != 0) && 2756 "scheduler and vectorizer have different opinion on what is a bundle"); 2757 SD->FirstInBundle->SchedulingPriority = Idx++; 2758 if (SD->isSchedulingEntity()) { 2759 BS->calculateDependencies(SD, false, AA); 2760 NumToSchedule++; 2761 } 2762 } 2763 BS->initialFillReadyList(ReadyInsts); 2764 2765 Instruction *LastScheduledInst = BS->ScheduleEnd; 2766 2767 // Do the "real" scheduling. 2768 while (!ReadyInsts.empty()) { 2769 ScheduleData *picked = *ReadyInsts.begin(); 2770 ReadyInsts.erase(ReadyInsts.begin()); 2771 2772 // Move the scheduled instruction(s) to their dedicated places, if not 2773 // there yet. 2774 ScheduleData *BundleMember = picked; 2775 while (BundleMember) { 2776 Instruction *pickedInst = BundleMember->Inst; 2777 if (LastScheduledInst->getNextNode() != pickedInst) { 2778 BS->BB->getInstList().remove(pickedInst); 2779 BS->BB->getInstList().insert(LastScheduledInst, pickedInst); 2780 } 2781 LastScheduledInst = pickedInst; 2782 BundleMember = BundleMember->NextInBundle; 2783 } 2784 2785 BS->schedule(picked, ReadyInsts); 2786 NumToSchedule--; 2787 } 2788 assert(NumToSchedule == 0 && "could not schedule all instructions"); 2789 2790 // Avoid duplicate scheduling of the block. 2791 BS->ScheduleStart = nullptr; 2792 } 2793 2794 /// The SLPVectorizer Pass. 2795 struct SLPVectorizer : public FunctionPass { 2796 typedef SmallVector<StoreInst *, 8> StoreList; 2797 typedef MapVector<Value *, StoreList> StoreListMap; 2798 2799 /// Pass identification, replacement for typeid 2800 static char ID; 2801 2802 explicit SLPVectorizer() : FunctionPass(ID) { 2803 initializeSLPVectorizerPass(*PassRegistry::getPassRegistry()); 2804 } 2805 2806 ScalarEvolution *SE; 2807 const DataLayout *DL; 2808 TargetTransformInfo *TTI; 2809 TargetLibraryInfo *TLI; 2810 AliasAnalysis *AA; 2811 LoopInfo *LI; 2812 DominatorTree *DT; 2813 2814 bool runOnFunction(Function &F) override { 2815 if (skipOptnoneFunction(F)) 2816 return false; 2817 2818 SE = &getAnalysis<ScalarEvolution>(); 2819 DataLayoutPass *DLP = getAnalysisIfAvailable<DataLayoutPass>(); 2820 DL = DLP ? &DLP->getDataLayout() : nullptr; 2821 TTI = &getAnalysis<TargetTransformInfo>(); 2822 TLI = getAnalysisIfAvailable<TargetLibraryInfo>(); 2823 AA = &getAnalysis<AliasAnalysis>(); 2824 LI = &getAnalysis<LoopInfo>(); 2825 DT = &getAnalysis<DominatorTreeWrapperPass>().getDomTree(); 2826 2827 StoreRefs.clear(); 2828 bool Changed = false; 2829 2830 // If the target claims to have no vector registers don't attempt 2831 // vectorization. 2832 if (!TTI->getNumberOfRegisters(true)) 2833 return false; 2834 2835 // Must have DataLayout. We can't require it because some tests run w/o 2836 // triple. 2837 if (!DL) 2838 return false; 2839 2840 // Don't vectorize when the attribute NoImplicitFloat is used. 2841 if (F.hasFnAttribute(Attribute::NoImplicitFloat)) 2842 return false; 2843 2844 DEBUG(dbgs() << "SLP: Analyzing blocks in " << F.getName() << ".\n"); 2845 2846 // Use the bottom up slp vectorizer to construct chains that start with 2847 // store instructions. 2848 BoUpSLP R(&F, SE, DL, TTI, TLI, AA, LI, DT); 2849 2850 // Scan the blocks in the function in post order. 2851 for (po_iterator<BasicBlock*> it = po_begin(&F.getEntryBlock()), 2852 e = po_end(&F.getEntryBlock()); it != e; ++it) { 2853 BasicBlock *BB = *it; 2854 // Vectorize trees that end at stores. 2855 if (unsigned count = collectStores(BB, R)) { 2856 (void)count; 2857 DEBUG(dbgs() << "SLP: Found " << count << " stores to vectorize.\n"); 2858 Changed |= vectorizeStoreChains(R); 2859 } 2860 2861 // Vectorize trees that end at reductions. 2862 Changed |= vectorizeChainsInBlock(BB, R); 2863 } 2864 2865 if (Changed) { 2866 R.optimizeGatherSequence(); 2867 DEBUG(dbgs() << "SLP: vectorized \"" << F.getName() << "\"\n"); 2868 DEBUG(verifyFunction(F)); 2869 } 2870 return Changed; 2871 } 2872 2873 void getAnalysisUsage(AnalysisUsage &AU) const override { 2874 FunctionPass::getAnalysisUsage(AU); 2875 AU.addRequired<ScalarEvolution>(); 2876 AU.addRequired<AliasAnalysis>(); 2877 AU.addRequired<TargetTransformInfo>(); 2878 AU.addRequired<LoopInfo>(); 2879 AU.addRequired<DominatorTreeWrapperPass>(); 2880 AU.addPreserved<LoopInfo>(); 2881 AU.addPreserved<DominatorTreeWrapperPass>(); 2882 AU.setPreservesCFG(); 2883 } 2884 2885 private: 2886 2887 /// \brief Collect memory references and sort them according to their base 2888 /// object. We sort the stores to their base objects to reduce the cost of the 2889 /// quadratic search on the stores. TODO: We can further reduce this cost 2890 /// if we flush the chain creation every time we run into a memory barrier. 2891 unsigned collectStores(BasicBlock *BB, BoUpSLP &R); 2892 2893 /// \brief Try to vectorize a chain that starts at two arithmetic instrs. 2894 bool tryToVectorizePair(Value *A, Value *B, BoUpSLP &R); 2895 2896 /// \brief Try to vectorize a list of operands. 2897 /// \@param BuildVector A list of users to ignore for the purpose of 2898 /// scheduling and that don't need extracting. 2899 /// \returns true if a value was vectorized. 2900 bool tryToVectorizeList(ArrayRef<Value *> VL, BoUpSLP &R, 2901 ArrayRef<Value *> BuildVector = None, 2902 bool allowReorder = false); 2903 2904 /// \brief Try to vectorize a chain that may start at the operands of \V; 2905 bool tryToVectorize(BinaryOperator *V, BoUpSLP &R); 2906 2907 /// \brief Vectorize the stores that were collected in StoreRefs. 2908 bool vectorizeStoreChains(BoUpSLP &R); 2909 2910 /// \brief Scan the basic block and look for patterns that are likely to start 2911 /// a vectorization chain. 2912 bool vectorizeChainsInBlock(BasicBlock *BB, BoUpSLP &R); 2913 2914 bool vectorizeStoreChain(ArrayRef<Value *> Chain, int CostThreshold, 2915 BoUpSLP &R); 2916 2917 bool vectorizeStores(ArrayRef<StoreInst *> Stores, int costThreshold, 2918 BoUpSLP &R); 2919 private: 2920 StoreListMap StoreRefs; 2921 }; 2922 2923 /// \brief Check that the Values in the slice in VL array are still existent in 2924 /// the WeakVH array. 2925 /// Vectorization of part of the VL array may cause later values in the VL array 2926 /// to become invalid. We track when this has happened in the WeakVH array. 2927 static bool hasValueBeenRAUWed(ArrayRef<Value *> &VL, 2928 SmallVectorImpl<WeakVH> &VH, 2929 unsigned SliceBegin, 2930 unsigned SliceSize) { 2931 for (unsigned i = SliceBegin; i < SliceBegin + SliceSize; ++i) 2932 if (VH[i] != VL[i]) 2933 return true; 2934 2935 return false; 2936 } 2937 2938 bool SLPVectorizer::vectorizeStoreChain(ArrayRef<Value *> Chain, 2939 int CostThreshold, BoUpSLP &R) { 2940 unsigned ChainLen = Chain.size(); 2941 DEBUG(dbgs() << "SLP: Analyzing a store chain of length " << ChainLen 2942 << "\n"); 2943 Type *StoreTy = cast<StoreInst>(Chain[0])->getValueOperand()->getType(); 2944 unsigned Sz = DL->getTypeSizeInBits(StoreTy); 2945 unsigned VF = MinVecRegSize / Sz; 2946 2947 if (!isPowerOf2_32(Sz) || VF < 2) 2948 return false; 2949 2950 // Keep track of values that were deleted by vectorizing in the loop below. 2951 SmallVector<WeakVH, 8> TrackValues(Chain.begin(), Chain.end()); 2952 2953 bool Changed = false; 2954 // Look for profitable vectorizable trees at all offsets, starting at zero. 2955 for (unsigned i = 0, e = ChainLen; i < e; ++i) { 2956 if (i + VF > e) 2957 break; 2958 2959 // Check that a previous iteration of this loop did not delete the Value. 2960 if (hasValueBeenRAUWed(Chain, TrackValues, i, VF)) 2961 continue; 2962 2963 DEBUG(dbgs() << "SLP: Analyzing " << VF << " stores at offset " << i 2964 << "\n"); 2965 ArrayRef<Value *> Operands = Chain.slice(i, VF); 2966 2967 R.buildTree(Operands); 2968 2969 int Cost = R.getTreeCost(); 2970 2971 DEBUG(dbgs() << "SLP: Found cost=" << Cost << " for VF=" << VF << "\n"); 2972 if (Cost < CostThreshold) { 2973 DEBUG(dbgs() << "SLP: Decided to vectorize cost=" << Cost << "\n"); 2974 R.vectorizeTree(); 2975 2976 // Move to the next bundle. 2977 i += VF - 1; 2978 Changed = true; 2979 } 2980 } 2981 2982 return Changed; 2983 } 2984 2985 bool SLPVectorizer::vectorizeStores(ArrayRef<StoreInst *> Stores, 2986 int costThreshold, BoUpSLP &R) { 2987 SetVector<Value *> Heads, Tails; 2988 SmallDenseMap<Value *, Value *> ConsecutiveChain; 2989 2990 // We may run into multiple chains that merge into a single chain. We mark the 2991 // stores that we vectorized so that we don't visit the same store twice. 2992 BoUpSLP::ValueSet VectorizedStores; 2993 bool Changed = false; 2994 2995 // Do a quadratic search on all of the given stores and find 2996 // all of the pairs of stores that follow each other. 2997 for (unsigned i = 0, e = Stores.size(); i < e; ++i) { 2998 for (unsigned j = 0; j < e; ++j) { 2999 if (i == j) 3000 continue; 3001 3002 if (R.isConsecutiveAccess(Stores[i], Stores[j])) { 3003 Tails.insert(Stores[j]); 3004 Heads.insert(Stores[i]); 3005 ConsecutiveChain[Stores[i]] = Stores[j]; 3006 } 3007 } 3008 } 3009 3010 // For stores that start but don't end a link in the chain: 3011 for (SetVector<Value *>::iterator it = Heads.begin(), e = Heads.end(); 3012 it != e; ++it) { 3013 if (Tails.count(*it)) 3014 continue; 3015 3016 // We found a store instr that starts a chain. Now follow the chain and try 3017 // to vectorize it. 3018 BoUpSLP::ValueList Operands; 3019 Value *I = *it; 3020 // Collect the chain into a list. 3021 while (Tails.count(I) || Heads.count(I)) { 3022 if (VectorizedStores.count(I)) 3023 break; 3024 Operands.push_back(I); 3025 // Move to the next value in the chain. 3026 I = ConsecutiveChain[I]; 3027 } 3028 3029 bool Vectorized = vectorizeStoreChain(Operands, costThreshold, R); 3030 3031 // Mark the vectorized stores so that we don't vectorize them again. 3032 if (Vectorized) 3033 VectorizedStores.insert(Operands.begin(), Operands.end()); 3034 Changed |= Vectorized; 3035 } 3036 3037 return Changed; 3038 } 3039 3040 3041 unsigned SLPVectorizer::collectStores(BasicBlock *BB, BoUpSLP &R) { 3042 unsigned count = 0; 3043 StoreRefs.clear(); 3044 for (BasicBlock::iterator it = BB->begin(), e = BB->end(); it != e; ++it) { 3045 StoreInst *SI = dyn_cast<StoreInst>(it); 3046 if (!SI) 3047 continue; 3048 3049 // Don't touch volatile stores. 3050 if (!SI->isSimple()) 3051 continue; 3052 3053 // Check that the pointer points to scalars. 3054 Type *Ty = SI->getValueOperand()->getType(); 3055 if (Ty->isAggregateType() || Ty->isVectorTy()) 3056 continue; 3057 3058 // Find the base pointer. 3059 Value *Ptr = GetUnderlyingObject(SI->getPointerOperand(), DL); 3060 3061 // Save the store locations. 3062 StoreRefs[Ptr].push_back(SI); 3063 count++; 3064 } 3065 return count; 3066 } 3067 3068 bool SLPVectorizer::tryToVectorizePair(Value *A, Value *B, BoUpSLP &R) { 3069 if (!A || !B) 3070 return false; 3071 Value *VL[] = { A, B }; 3072 return tryToVectorizeList(VL, R, None, true); 3073 } 3074 3075 bool SLPVectorizer::tryToVectorizeList(ArrayRef<Value *> VL, BoUpSLP &R, 3076 ArrayRef<Value *> BuildVector, 3077 bool allowReorder) { 3078 if (VL.size() < 2) 3079 return false; 3080 3081 DEBUG(dbgs() << "SLP: Vectorizing a list of length = " << VL.size() << ".\n"); 3082 3083 // Check that all of the parts are scalar instructions of the same type. 3084 Instruction *I0 = dyn_cast<Instruction>(VL[0]); 3085 if (!I0) 3086 return false; 3087 3088 unsigned Opcode0 = I0->getOpcode(); 3089 3090 Type *Ty0 = I0->getType(); 3091 unsigned Sz = DL->getTypeSizeInBits(Ty0); 3092 unsigned VF = MinVecRegSize / Sz; 3093 3094 for (int i = 0, e = VL.size(); i < e; ++i) { 3095 Type *Ty = VL[i]->getType(); 3096 if (Ty->isAggregateType() || Ty->isVectorTy()) 3097 return false; 3098 Instruction *Inst = dyn_cast<Instruction>(VL[i]); 3099 if (!Inst || Inst->getOpcode() != Opcode0) 3100 return false; 3101 } 3102 3103 bool Changed = false; 3104 3105 // Keep track of values that were deleted by vectorizing in the loop below. 3106 SmallVector<WeakVH, 8> TrackValues(VL.begin(), VL.end()); 3107 3108 for (unsigned i = 0, e = VL.size(); i < e; ++i) { 3109 unsigned OpsWidth = 0; 3110 3111 if (i + VF > e) 3112 OpsWidth = e - i; 3113 else 3114 OpsWidth = VF; 3115 3116 if (!isPowerOf2_32(OpsWidth) || OpsWidth < 2) 3117 break; 3118 3119 // Check that a previous iteration of this loop did not delete the Value. 3120 if (hasValueBeenRAUWed(VL, TrackValues, i, OpsWidth)) 3121 continue; 3122 3123 DEBUG(dbgs() << "SLP: Analyzing " << OpsWidth << " operations " 3124 << "\n"); 3125 ArrayRef<Value *> Ops = VL.slice(i, OpsWidth); 3126 3127 ArrayRef<Value *> BuildVectorSlice; 3128 if (!BuildVector.empty()) 3129 BuildVectorSlice = BuildVector.slice(i, OpsWidth); 3130 3131 R.buildTree(Ops, BuildVectorSlice); 3132 // TODO: check if we can allow reordering also for other cases than 3133 // tryToVectorizePair() 3134 if (allowReorder && R.shouldReorder()) { 3135 assert(Ops.size() == 2); 3136 assert(BuildVectorSlice.empty()); 3137 Value *ReorderedOps[] = { Ops[1], Ops[0] }; 3138 R.buildTree(ReorderedOps, None); 3139 } 3140 int Cost = R.getTreeCost(); 3141 3142 if (Cost < -SLPCostThreshold) { 3143 DEBUG(dbgs() << "SLP: Vectorizing list at cost:" << Cost << ".\n"); 3144 Value *VectorizedRoot = R.vectorizeTree(); 3145 3146 // Reconstruct the build vector by extracting the vectorized root. This 3147 // way we handle the case where some elements of the vector are undefined. 3148 // (return (inserelt <4 xi32> (insertelt undef (opd0) 0) (opd1) 2)) 3149 if (!BuildVectorSlice.empty()) { 3150 // The insert point is the last build vector instruction. The vectorized 3151 // root will precede it. This guarantees that we get an instruction. The 3152 // vectorized tree could have been constant folded. 3153 Instruction *InsertAfter = cast<Instruction>(BuildVectorSlice.back()); 3154 unsigned VecIdx = 0; 3155 for (auto &V : BuildVectorSlice) { 3156 IRBuilder<true, NoFolder> Builder( 3157 ++BasicBlock::iterator(InsertAfter)); 3158 InsertElementInst *IE = cast<InsertElementInst>(V); 3159 Instruction *Extract = cast<Instruction>(Builder.CreateExtractElement( 3160 VectorizedRoot, Builder.getInt32(VecIdx++))); 3161 IE->setOperand(1, Extract); 3162 IE->removeFromParent(); 3163 IE->insertAfter(Extract); 3164 InsertAfter = IE; 3165 } 3166 } 3167 // Move to the next bundle. 3168 i += VF - 1; 3169 Changed = true; 3170 } 3171 } 3172 3173 return Changed; 3174 } 3175 3176 bool SLPVectorizer::tryToVectorize(BinaryOperator *V, BoUpSLP &R) { 3177 if (!V) 3178 return false; 3179 3180 // Try to vectorize V. 3181 if (tryToVectorizePair(V->getOperand(0), V->getOperand(1), R)) 3182 return true; 3183 3184 BinaryOperator *A = dyn_cast<BinaryOperator>(V->getOperand(0)); 3185 BinaryOperator *B = dyn_cast<BinaryOperator>(V->getOperand(1)); 3186 // Try to skip B. 3187 if (B && B->hasOneUse()) { 3188 BinaryOperator *B0 = dyn_cast<BinaryOperator>(B->getOperand(0)); 3189 BinaryOperator *B1 = dyn_cast<BinaryOperator>(B->getOperand(1)); 3190 if (tryToVectorizePair(A, B0, R)) { 3191 return true; 3192 } 3193 if (tryToVectorizePair(A, B1, R)) { 3194 return true; 3195 } 3196 } 3197 3198 // Try to skip A. 3199 if (A && A->hasOneUse()) { 3200 BinaryOperator *A0 = dyn_cast<BinaryOperator>(A->getOperand(0)); 3201 BinaryOperator *A1 = dyn_cast<BinaryOperator>(A->getOperand(1)); 3202 if (tryToVectorizePair(A0, B, R)) { 3203 return true; 3204 } 3205 if (tryToVectorizePair(A1, B, R)) { 3206 return true; 3207 } 3208 } 3209 return 0; 3210 } 3211 3212 /// \brief Generate a shuffle mask to be used in a reduction tree. 3213 /// 3214 /// \param VecLen The length of the vector to be reduced. 3215 /// \param NumEltsToRdx The number of elements that should be reduced in the 3216 /// vector. 3217 /// \param IsPairwise Whether the reduction is a pairwise or splitting 3218 /// reduction. A pairwise reduction will generate a mask of 3219 /// <0,2,...> or <1,3,..> while a splitting reduction will generate 3220 /// <2,3, undef,undef> for a vector of 4 and NumElts = 2. 3221 /// \param IsLeft True will generate a mask of even elements, odd otherwise. 3222 static Value *createRdxShuffleMask(unsigned VecLen, unsigned NumEltsToRdx, 3223 bool IsPairwise, bool IsLeft, 3224 IRBuilder<> &Builder) { 3225 assert((IsPairwise || !IsLeft) && "Don't support a <0,1,undef,...> mask"); 3226 3227 SmallVector<Constant *, 32> ShuffleMask( 3228 VecLen, UndefValue::get(Builder.getInt32Ty())); 3229 3230 if (IsPairwise) 3231 // Build a mask of 0, 2, ... (left) or 1, 3, ... (right). 3232 for (unsigned i = 0; i != NumEltsToRdx; ++i) 3233 ShuffleMask[i] = Builder.getInt32(2 * i + !IsLeft); 3234 else 3235 // Move the upper half of the vector to the lower half. 3236 for (unsigned i = 0; i != NumEltsToRdx; ++i) 3237 ShuffleMask[i] = Builder.getInt32(NumEltsToRdx + i); 3238 3239 return ConstantVector::get(ShuffleMask); 3240 } 3241 3242 3243 /// Model horizontal reductions. 3244 /// 3245 /// A horizontal reduction is a tree of reduction operations (currently add and 3246 /// fadd) that has operations that can be put into a vector as its leaf. 3247 /// For example, this tree: 3248 /// 3249 /// mul mul mul mul 3250 /// \ / \ / 3251 /// + + 3252 /// \ / 3253 /// + 3254 /// This tree has "mul" as its reduced values and "+" as its reduction 3255 /// operations. A reduction might be feeding into a store or a binary operation 3256 /// feeding a phi. 3257 /// ... 3258 /// \ / 3259 /// + 3260 /// | 3261 /// phi += 3262 /// 3263 /// Or: 3264 /// ... 3265 /// \ / 3266 /// + 3267 /// | 3268 /// *p = 3269 /// 3270 class HorizontalReduction { 3271 SmallVector<Value *, 16> ReductionOps; 3272 SmallVector<Value *, 32> ReducedVals; 3273 3274 BinaryOperator *ReductionRoot; 3275 PHINode *ReductionPHI; 3276 3277 /// The opcode of the reduction. 3278 unsigned ReductionOpcode; 3279 /// The opcode of the values we perform a reduction on. 3280 unsigned ReducedValueOpcode; 3281 /// The width of one full horizontal reduction operation. 3282 unsigned ReduxWidth; 3283 /// Should we model this reduction as a pairwise reduction tree or a tree that 3284 /// splits the vector in halves and adds those halves. 3285 bool IsPairwiseReduction; 3286 3287 public: 3288 HorizontalReduction() 3289 : ReductionRoot(nullptr), ReductionPHI(nullptr), ReductionOpcode(0), 3290 ReducedValueOpcode(0), ReduxWidth(0), IsPairwiseReduction(false) {} 3291 3292 /// \brief Try to find a reduction tree. 3293 bool matchAssociativeReduction(PHINode *Phi, BinaryOperator *B, 3294 const DataLayout *DL) { 3295 assert((!Phi || 3296 std::find(Phi->op_begin(), Phi->op_end(), B) != Phi->op_end()) && 3297 "Thi phi needs to use the binary operator"); 3298 3299 // We could have a initial reductions that is not an add. 3300 // r *= v1 + v2 + v3 + v4 3301 // In such a case start looking for a tree rooted in the first '+'. 3302 if (Phi) { 3303 if (B->getOperand(0) == Phi) { 3304 Phi = nullptr; 3305 B = dyn_cast<BinaryOperator>(B->getOperand(1)); 3306 } else if (B->getOperand(1) == Phi) { 3307 Phi = nullptr; 3308 B = dyn_cast<BinaryOperator>(B->getOperand(0)); 3309 } 3310 } 3311 3312 if (!B) 3313 return false; 3314 3315 Type *Ty = B->getType(); 3316 if (Ty->isVectorTy()) 3317 return false; 3318 3319 ReductionOpcode = B->getOpcode(); 3320 ReducedValueOpcode = 0; 3321 ReduxWidth = MinVecRegSize / DL->getTypeSizeInBits(Ty); 3322 ReductionRoot = B; 3323 ReductionPHI = Phi; 3324 3325 if (ReduxWidth < 4) 3326 return false; 3327 3328 // We currently only support adds. 3329 if (ReductionOpcode != Instruction::Add && 3330 ReductionOpcode != Instruction::FAdd) 3331 return false; 3332 3333 // Post order traverse the reduction tree starting at B. We only handle true 3334 // trees containing only binary operators. 3335 SmallVector<std::pair<BinaryOperator *, unsigned>, 32> Stack; 3336 Stack.push_back(std::make_pair(B, 0)); 3337 while (!Stack.empty()) { 3338 BinaryOperator *TreeN = Stack.back().first; 3339 unsigned EdgeToVist = Stack.back().second++; 3340 bool IsReducedValue = TreeN->getOpcode() != ReductionOpcode; 3341 3342 // Only handle trees in the current basic block. 3343 if (TreeN->getParent() != B->getParent()) 3344 return false; 3345 3346 // Each tree node needs to have one user except for the ultimate 3347 // reduction. 3348 if (!TreeN->hasOneUse() && TreeN != B) 3349 return false; 3350 3351 // Postorder vist. 3352 if (EdgeToVist == 2 || IsReducedValue) { 3353 if (IsReducedValue) { 3354 // Make sure that the opcodes of the operations that we are going to 3355 // reduce match. 3356 if (!ReducedValueOpcode) 3357 ReducedValueOpcode = TreeN->getOpcode(); 3358 else if (ReducedValueOpcode != TreeN->getOpcode()) 3359 return false; 3360 ReducedVals.push_back(TreeN); 3361 } else { 3362 // We need to be able to reassociate the adds. 3363 if (!TreeN->isAssociative()) 3364 return false; 3365 ReductionOps.push_back(TreeN); 3366 } 3367 // Retract. 3368 Stack.pop_back(); 3369 continue; 3370 } 3371 3372 // Visit left or right. 3373 Value *NextV = TreeN->getOperand(EdgeToVist); 3374 BinaryOperator *Next = dyn_cast<BinaryOperator>(NextV); 3375 if (Next) 3376 Stack.push_back(std::make_pair(Next, 0)); 3377 else if (NextV != Phi) 3378 return false; 3379 } 3380 return true; 3381 } 3382 3383 /// \brief Attempt to vectorize the tree found by 3384 /// matchAssociativeReduction. 3385 bool tryToReduce(BoUpSLP &V, TargetTransformInfo *TTI) { 3386 if (ReducedVals.empty()) 3387 return false; 3388 3389 unsigned NumReducedVals = ReducedVals.size(); 3390 if (NumReducedVals < ReduxWidth) 3391 return false; 3392 3393 Value *VectorizedTree = nullptr; 3394 IRBuilder<> Builder(ReductionRoot); 3395 FastMathFlags Unsafe; 3396 Unsafe.setUnsafeAlgebra(); 3397 Builder.SetFastMathFlags(Unsafe); 3398 unsigned i = 0; 3399 3400 for (; i < NumReducedVals - ReduxWidth + 1; i += ReduxWidth) { 3401 V.buildTree(makeArrayRef(&ReducedVals[i], ReduxWidth), ReductionOps); 3402 3403 // Estimate cost. 3404 int Cost = V.getTreeCost() + getReductionCost(TTI, ReducedVals[i]); 3405 if (Cost >= -SLPCostThreshold) 3406 break; 3407 3408 DEBUG(dbgs() << "SLP: Vectorizing horizontal reduction at cost:" << Cost 3409 << ". (HorRdx)\n"); 3410 3411 // Vectorize a tree. 3412 DebugLoc Loc = cast<Instruction>(ReducedVals[i])->getDebugLoc(); 3413 Value *VectorizedRoot = V.vectorizeTree(); 3414 3415 // Emit a reduction. 3416 Value *ReducedSubTree = emitReduction(VectorizedRoot, Builder); 3417 if (VectorizedTree) { 3418 Builder.SetCurrentDebugLocation(Loc); 3419 VectorizedTree = createBinOp(Builder, ReductionOpcode, VectorizedTree, 3420 ReducedSubTree, "bin.rdx"); 3421 } else 3422 VectorizedTree = ReducedSubTree; 3423 } 3424 3425 if (VectorizedTree) { 3426 // Finish the reduction. 3427 for (; i < NumReducedVals; ++i) { 3428 Builder.SetCurrentDebugLocation( 3429 cast<Instruction>(ReducedVals[i])->getDebugLoc()); 3430 VectorizedTree = createBinOp(Builder, ReductionOpcode, VectorizedTree, 3431 ReducedVals[i]); 3432 } 3433 // Update users. 3434 if (ReductionPHI) { 3435 assert(ReductionRoot && "Need a reduction operation"); 3436 ReductionRoot->setOperand(0, VectorizedTree); 3437 ReductionRoot->setOperand(1, ReductionPHI); 3438 } else 3439 ReductionRoot->replaceAllUsesWith(VectorizedTree); 3440 } 3441 return VectorizedTree != nullptr; 3442 } 3443 3444 private: 3445 3446 /// \brief Calcuate the cost of a reduction. 3447 int getReductionCost(TargetTransformInfo *TTI, Value *FirstReducedVal) { 3448 Type *ScalarTy = FirstReducedVal->getType(); 3449 Type *VecTy = VectorType::get(ScalarTy, ReduxWidth); 3450 3451 int PairwiseRdxCost = TTI->getReductionCost(ReductionOpcode, VecTy, true); 3452 int SplittingRdxCost = TTI->getReductionCost(ReductionOpcode, VecTy, false); 3453 3454 IsPairwiseReduction = PairwiseRdxCost < SplittingRdxCost; 3455 int VecReduxCost = IsPairwiseReduction ? PairwiseRdxCost : SplittingRdxCost; 3456 3457 int ScalarReduxCost = 3458 ReduxWidth * TTI->getArithmeticInstrCost(ReductionOpcode, VecTy); 3459 3460 DEBUG(dbgs() << "SLP: Adding cost " << VecReduxCost - ScalarReduxCost 3461 << " for reduction that starts with " << *FirstReducedVal 3462 << " (It is a " 3463 << (IsPairwiseReduction ? "pairwise" : "splitting") 3464 << " reduction)\n"); 3465 3466 return VecReduxCost - ScalarReduxCost; 3467 } 3468 3469 static Value *createBinOp(IRBuilder<> &Builder, unsigned Opcode, Value *L, 3470 Value *R, const Twine &Name = "") { 3471 if (Opcode == Instruction::FAdd) 3472 return Builder.CreateFAdd(L, R, Name); 3473 return Builder.CreateBinOp((Instruction::BinaryOps)Opcode, L, R, Name); 3474 } 3475 3476 /// \brief Emit a horizontal reduction of the vectorized value. 3477 Value *emitReduction(Value *VectorizedValue, IRBuilder<> &Builder) { 3478 assert(VectorizedValue && "Need to have a vectorized tree node"); 3479 Instruction *ValToReduce = dyn_cast<Instruction>(VectorizedValue); 3480 assert(isPowerOf2_32(ReduxWidth) && 3481 "We only handle power-of-two reductions for now"); 3482 3483 Value *TmpVec = ValToReduce; 3484 for (unsigned i = ReduxWidth / 2; i != 0; i >>= 1) { 3485 if (IsPairwiseReduction) { 3486 Value *LeftMask = 3487 createRdxShuffleMask(ReduxWidth, i, true, true, Builder); 3488 Value *RightMask = 3489 createRdxShuffleMask(ReduxWidth, i, true, false, Builder); 3490 3491 Value *LeftShuf = Builder.CreateShuffleVector( 3492 TmpVec, UndefValue::get(TmpVec->getType()), LeftMask, "rdx.shuf.l"); 3493 Value *RightShuf = Builder.CreateShuffleVector( 3494 TmpVec, UndefValue::get(TmpVec->getType()), (RightMask), 3495 "rdx.shuf.r"); 3496 TmpVec = createBinOp(Builder, ReductionOpcode, LeftShuf, RightShuf, 3497 "bin.rdx"); 3498 } else { 3499 Value *UpperHalf = 3500 createRdxShuffleMask(ReduxWidth, i, false, false, Builder); 3501 Value *Shuf = Builder.CreateShuffleVector( 3502 TmpVec, UndefValue::get(TmpVec->getType()), UpperHalf, "rdx.shuf"); 3503 TmpVec = createBinOp(Builder, ReductionOpcode, TmpVec, Shuf, "bin.rdx"); 3504 } 3505 } 3506 3507 // The result is in the first element of the vector. 3508 return Builder.CreateExtractElement(TmpVec, Builder.getInt32(0)); 3509 } 3510 }; 3511 3512 /// \brief Recognize construction of vectors like 3513 /// %ra = insertelement <4 x float> undef, float %s0, i32 0 3514 /// %rb = insertelement <4 x float> %ra, float %s1, i32 1 3515 /// %rc = insertelement <4 x float> %rb, float %s2, i32 2 3516 /// %rd = insertelement <4 x float> %rc, float %s3, i32 3 3517 /// 3518 /// Returns true if it matches 3519 /// 3520 static bool findBuildVector(InsertElementInst *FirstInsertElem, 3521 SmallVectorImpl<Value *> &BuildVector, 3522 SmallVectorImpl<Value *> &BuildVectorOpds) { 3523 if (!isa<UndefValue>(FirstInsertElem->getOperand(0))) 3524 return false; 3525 3526 InsertElementInst *IE = FirstInsertElem; 3527 while (true) { 3528 BuildVector.push_back(IE); 3529 BuildVectorOpds.push_back(IE->getOperand(1)); 3530 3531 if (IE->use_empty()) 3532 return false; 3533 3534 InsertElementInst *NextUse = dyn_cast<InsertElementInst>(IE->user_back()); 3535 if (!NextUse) 3536 return true; 3537 3538 // If this isn't the final use, make sure the next insertelement is the only 3539 // use. It's OK if the final constructed vector is used multiple times 3540 if (!IE->hasOneUse()) 3541 return false; 3542 3543 IE = NextUse; 3544 } 3545 3546 return false; 3547 } 3548 3549 static bool PhiTypeSorterFunc(Value *V, Value *V2) { 3550 return V->getType() < V2->getType(); 3551 } 3552 3553 bool SLPVectorizer::vectorizeChainsInBlock(BasicBlock *BB, BoUpSLP &R) { 3554 bool Changed = false; 3555 SmallVector<Value *, 4> Incoming; 3556 SmallSet<Value *, 16> VisitedInstrs; 3557 3558 bool HaveVectorizedPhiNodes = true; 3559 while (HaveVectorizedPhiNodes) { 3560 HaveVectorizedPhiNodes = false; 3561 3562 // Collect the incoming values from the PHIs. 3563 Incoming.clear(); 3564 for (BasicBlock::iterator instr = BB->begin(), ie = BB->end(); instr != ie; 3565 ++instr) { 3566 PHINode *P = dyn_cast<PHINode>(instr); 3567 if (!P) 3568 break; 3569 3570 if (!VisitedInstrs.count(P)) 3571 Incoming.push_back(P); 3572 } 3573 3574 // Sort by type. 3575 std::stable_sort(Incoming.begin(), Incoming.end(), PhiTypeSorterFunc); 3576 3577 // Try to vectorize elements base on their type. 3578 for (SmallVector<Value *, 4>::iterator IncIt = Incoming.begin(), 3579 E = Incoming.end(); 3580 IncIt != E;) { 3581 3582 // Look for the next elements with the same type. 3583 SmallVector<Value *, 4>::iterator SameTypeIt = IncIt; 3584 while (SameTypeIt != E && 3585 (*SameTypeIt)->getType() == (*IncIt)->getType()) { 3586 VisitedInstrs.insert(*SameTypeIt); 3587 ++SameTypeIt; 3588 } 3589 3590 // Try to vectorize them. 3591 unsigned NumElts = (SameTypeIt - IncIt); 3592 DEBUG(errs() << "SLP: Trying to vectorize starting at PHIs (" << NumElts << ")\n"); 3593 if (NumElts > 1 && tryToVectorizeList(makeArrayRef(IncIt, NumElts), R)) { 3594 // Success start over because instructions might have been changed. 3595 HaveVectorizedPhiNodes = true; 3596 Changed = true; 3597 break; 3598 } 3599 3600 // Start over at the next instruction of a different type (or the end). 3601 IncIt = SameTypeIt; 3602 } 3603 } 3604 3605 VisitedInstrs.clear(); 3606 3607 for (BasicBlock::iterator it = BB->begin(), e = BB->end(); it != e; it++) { 3608 // We may go through BB multiple times so skip the one we have checked. 3609 if (!VisitedInstrs.insert(it)) 3610 continue; 3611 3612 if (isa<DbgInfoIntrinsic>(it)) 3613 continue; 3614 3615 // Try to vectorize reductions that use PHINodes. 3616 if (PHINode *P = dyn_cast<PHINode>(it)) { 3617 // Check that the PHI is a reduction PHI. 3618 if (P->getNumIncomingValues() != 2) 3619 return Changed; 3620 Value *Rdx = 3621 (P->getIncomingBlock(0) == BB 3622 ? (P->getIncomingValue(0)) 3623 : (P->getIncomingBlock(1) == BB ? P->getIncomingValue(1) 3624 : nullptr)); 3625 // Check if this is a Binary Operator. 3626 BinaryOperator *BI = dyn_cast_or_null<BinaryOperator>(Rdx); 3627 if (!BI) 3628 continue; 3629 3630 // Try to match and vectorize a horizontal reduction. 3631 HorizontalReduction HorRdx; 3632 if (ShouldVectorizeHor && 3633 HorRdx.matchAssociativeReduction(P, BI, DL) && 3634 HorRdx.tryToReduce(R, TTI)) { 3635 Changed = true; 3636 it = BB->begin(); 3637 e = BB->end(); 3638 continue; 3639 } 3640 3641 Value *Inst = BI->getOperand(0); 3642 if (Inst == P) 3643 Inst = BI->getOperand(1); 3644 3645 if (tryToVectorize(dyn_cast<BinaryOperator>(Inst), R)) { 3646 // We would like to start over since some instructions are deleted 3647 // and the iterator may become invalid value. 3648 Changed = true; 3649 it = BB->begin(); 3650 e = BB->end(); 3651 continue; 3652 } 3653 3654 continue; 3655 } 3656 3657 // Try to vectorize horizontal reductions feeding into a store. 3658 if (ShouldStartVectorizeHorAtStore) 3659 if (StoreInst *SI = dyn_cast<StoreInst>(it)) 3660 if (BinaryOperator *BinOp = 3661 dyn_cast<BinaryOperator>(SI->getValueOperand())) { 3662 HorizontalReduction HorRdx; 3663 if (((HorRdx.matchAssociativeReduction(nullptr, BinOp, DL) && 3664 HorRdx.tryToReduce(R, TTI)) || 3665 tryToVectorize(BinOp, R))) { 3666 Changed = true; 3667 it = BB->begin(); 3668 e = BB->end(); 3669 continue; 3670 } 3671 } 3672 3673 // Try to vectorize trees that start at compare instructions. 3674 if (CmpInst *CI = dyn_cast<CmpInst>(it)) { 3675 if (tryToVectorizePair(CI->getOperand(0), CI->getOperand(1), R)) { 3676 Changed = true; 3677 // We would like to start over since some instructions are deleted 3678 // and the iterator may become invalid value. 3679 it = BB->begin(); 3680 e = BB->end(); 3681 continue; 3682 } 3683 3684 for (int i = 0; i < 2; ++i) { 3685 if (BinaryOperator *BI = dyn_cast<BinaryOperator>(CI->getOperand(i))) { 3686 if (tryToVectorizePair(BI->getOperand(0), BI->getOperand(1), R)) { 3687 Changed = true; 3688 // We would like to start over since some instructions are deleted 3689 // and the iterator may become invalid value. 3690 it = BB->begin(); 3691 e = BB->end(); 3692 } 3693 } 3694 } 3695 continue; 3696 } 3697 3698 // Try to vectorize trees that start at insertelement instructions. 3699 if (InsertElementInst *FirstInsertElem = dyn_cast<InsertElementInst>(it)) { 3700 SmallVector<Value *, 16> BuildVector; 3701 SmallVector<Value *, 16> BuildVectorOpds; 3702 if (!findBuildVector(FirstInsertElem, BuildVector, BuildVectorOpds)) 3703 continue; 3704 3705 // Vectorize starting with the build vector operands ignoring the 3706 // BuildVector instructions for the purpose of scheduling and user 3707 // extraction. 3708 if (tryToVectorizeList(BuildVectorOpds, R, BuildVector)) { 3709 Changed = true; 3710 it = BB->begin(); 3711 e = BB->end(); 3712 } 3713 3714 continue; 3715 } 3716 } 3717 3718 return Changed; 3719 } 3720 3721 bool SLPVectorizer::vectorizeStoreChains(BoUpSLP &R) { 3722 bool Changed = false; 3723 // Attempt to sort and vectorize each of the store-groups. 3724 for (StoreListMap::iterator it = StoreRefs.begin(), e = StoreRefs.end(); 3725 it != e; ++it) { 3726 if (it->second.size() < 2) 3727 continue; 3728 3729 DEBUG(dbgs() << "SLP: Analyzing a store chain of length " 3730 << it->second.size() << ".\n"); 3731 3732 // Process the stores in chunks of 16. 3733 for (unsigned CI = 0, CE = it->second.size(); CI < CE; CI+=16) { 3734 unsigned Len = std::min<unsigned>(CE - CI, 16); 3735 Changed |= vectorizeStores(makeArrayRef(&it->second[CI], Len), 3736 -SLPCostThreshold, R); 3737 } 3738 } 3739 return Changed; 3740 } 3741 3742 } // end anonymous namespace 3743 3744 char SLPVectorizer::ID = 0; 3745 static const char lv_name[] = "SLP Vectorizer"; 3746 INITIALIZE_PASS_BEGIN(SLPVectorizer, SV_NAME, lv_name, false, false) 3747 INITIALIZE_AG_DEPENDENCY(AliasAnalysis) 3748 INITIALIZE_AG_DEPENDENCY(TargetTransformInfo) 3749 INITIALIZE_PASS_DEPENDENCY(ScalarEvolution) 3750 INITIALIZE_PASS_DEPENDENCY(LoopSimplify) 3751 INITIALIZE_PASS_END(SLPVectorizer, SV_NAME, lv_name, false, false) 3752 3753 namespace llvm { 3754 Pass *createSLPVectorizerPass() { return new SLPVectorizer(); } 3755 } 3756