//===- LoopDistribute.cpp - Loop Distribution Pass ------------------------===//
//
//                     The LLVM Compiler Infrastructure
//
// This file is distributed under the University of Illinois Open Source
// License. See LICENSE.TXT for details.
//
//===----------------------------------------------------------------------===//
//
// This file implements the Loop Distribution Pass. Its main focus is to
// distribute loops that cannot be vectorized due to dependence cycles. It
// tries to isolate the offending dependences into a new loop allowing
// vectorization of the remaining parts.
//
// For dependence analysis, the pass uses the LoopVectorizer's
// LoopAccessAnalysis. Because this analysis presumes no change in the order of
// memory operations, special care is taken to preserve the lexical order of
// these operations.
//
// Similarly to the Vectorizer, the pass also supports loop versioning to
// run-time disambiguate potentially overlapping arrays.
//
//===----------------------------------------------------------------------===//

#include "llvm/ADT/DepthFirstIterator.h"
#include "llvm/ADT/EquivalenceClasses.h"
#include "llvm/ADT/STLExtras.h"
#include "llvm/ADT/Statistic.h"
#include "llvm/Analysis/LoopAccessAnalysis.h"
#include "llvm/Analysis/LoopInfo.h"
#include "llvm/IR/Dominators.h"
#include "llvm/Pass.h"
#include "llvm/Support/CommandLine.h"
#include "llvm/Support/Debug.h"
#include "llvm/Transforms/Utils/BasicBlockUtils.h"
#include "llvm/Transforms/Utils/Cloning.h"
#include "llvm/Transforms/Utils/LoopVersioning.h"
#include <list>

#define LDIST_NAME "loop-distribute"
#define DEBUG_TYPE LDIST_NAME

using namespace llvm;

static cl::opt<bool>
    LDistVerify("loop-distribute-verify", cl::Hidden,
                cl::desc("Turn on DominatorTree and LoopInfo verification "
                         "after Loop Distribution"),
                cl::init(false));

static cl::opt<bool> DistributeNonIfConvertible(
    "loop-distribute-non-if-convertible", cl::Hidden,
    cl::desc("Whether to distribute into a loop that may not be "
             "if-convertible by the loop vectorizer"),
    cl::init(false));

STATISTIC(NumLoopsDistributed, "Number of loops distributed");

namespace {
/// \brief Maintains the set of instructions of the loop for a partition before
/// cloning. After cloning, it hosts the new loop.
class InstPartition {
  typedef SmallPtrSet<Instruction *, 8> InstructionSet;

public:
  InstPartition(Instruction *I, Loop *L, bool DepCycle = false)
      : DepCycle(DepCycle), OrigLoop(L), ClonedLoop(nullptr) {
    Set.insert(I);
  }

  /// \brief Returns whether this partition contains a dependence cycle.
  bool hasDepCycle() const { return DepCycle; }

  /// \brief Adds an instruction to this partition.
  void add(Instruction *I) { Set.insert(I); }

  /// \brief Collection accessors.
  InstructionSet::iterator begin() { return Set.begin(); }
  InstructionSet::iterator end() { return Set.end(); }
  InstructionSet::const_iterator begin() const { return Set.begin(); }
  InstructionSet::const_iterator end() const { return Set.end(); }
  bool empty() const { return Set.empty(); }

  /// \brief Moves this partition into \p Other. This partition becomes empty
  /// after this.
  void moveTo(InstPartition &Other) {
    Other.Set.insert(Set.begin(), Set.end());
    Set.clear();
    Other.DepCycle |= DepCycle;
  }

  /// \brief Populates the partition with the transitive closure of all the
  /// instructions that the seeded instructions depend on.
  void populateUsedSet() {
    // FIXME: We currently don't use control-dependence but simply include all
    // blocks (possibly empty at the end) and let simplifycfg mostly clean this
    // up.
    for (auto *B : OrigLoop->getBlocks())
      Set.insert(B->getTerminator());

    // Follow the use-def chains to form a transitive closure of all the
    // instructions that the originally seeded instructions depend on.
    SmallVector<Instruction *, 8> Worklist(Set.begin(), Set.end());
    while (!Worklist.empty()) {
      Instruction *I = Worklist.pop_back_val();
      // Insert instructions from the loop that we depend on.
      for (Value *V : I->operand_values()) {
        auto *I = dyn_cast<Instruction>(V);
        if (I && OrigLoop->contains(I->getParent()) && Set.insert(I).second)
          Worklist.push_back(I);
      }
    }
  }
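
  // Illustrative sketch (hypothetical IR, not taken from a test case): if the
  // partition is seeded with the store below, the use-def walk above also
  // pulls in %add, %gep, %i and %i.next because they are defined inside
  // OrigLoop, while %A, %x and %y live outside the loop and are left alone:
  //
  //   %i      = phi i64 [ 0, %ph ], [ %i.next, %body ]
  //   %gep    = getelementptr double, double* %A, i64 %i
  //   %add    = fadd double %x, %y
  //   store double %add, double* %gep
  //   %i.next = add i64 %i, 1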

  /// \brief Clones the original loop.
  ///
  /// Updates LoopInfo and DominatorTree using the information that block \p
  /// LoopDomBB dominates the loop.
  Loop *cloneLoopWithPreheader(BasicBlock *InsertBefore, BasicBlock *LoopDomBB,
                               unsigned Index, LoopInfo *LI,
                               DominatorTree *DT) {
    ClonedLoop = ::cloneLoopWithPreheader(InsertBefore, LoopDomBB, OrigLoop,
                                          VMap, Twine(".ldist") + Twine(Index),
                                          LI, DT, ClonedLoopBlocks);
    return ClonedLoop;
  }

  /// \brief The cloned loop. If this partition is mapped to the original loop,
  /// this is null.
  const Loop *getClonedLoop() const { return ClonedLoop; }

  /// \brief Returns the loop where this partition ends up after distribution.
  /// If this partition is mapped to the original loop, the original loop is
  /// returned.
  const Loop *getDistributedLoop() const {
    return ClonedLoop ? ClonedLoop : OrigLoop;
  }

  /// \brief The VMap that is populated by cloning and then used in
  /// remapInstructions to remap the cloned instructions.
  ValueToValueMapTy &getVMap() { return VMap; }

  /// \brief Remaps the cloned instructions using VMap.
  void remapInstructions() {
    remapInstructionsInBlocks(ClonedLoopBlocks, VMap);
  }

  /// \brief Based on the set of instructions selected for this partition,
  /// removes the unnecessary ones.
  void removeUnusedInsts() {
    SmallVector<Instruction *, 8> Unused;

    for (auto *Block : OrigLoop->getBlocks())
      for (auto &Inst : *Block)
        if (!Set.count(&Inst)) {
          Instruction *NewInst = &Inst;
          if (!VMap.empty())
            NewInst = cast<Instruction>(VMap[NewInst]);

          assert(!isa<BranchInst>(NewInst) &&
                 "Branches are marked used early on");
          Unused.push_back(NewInst);
        }

    // Delete the instructions backwards, as it has a reduced likelihood of
    // having to update as many def-use and use-def chains.
    for (auto I = Unused.rbegin(), E = Unused.rend(); I != E; ++I) {
      auto *Inst = *I;

      if (!Inst->use_empty())
        Inst->replaceAllUsesWith(UndefValue::get(Inst->getType()));
      Inst->eraseFromParent();
    }
  }

  void print() const {
    if (DepCycle)
      dbgs() << " (cycle)\n";
    for (auto *I : Set)
      // Prefix with the block name.
      dbgs() << "  " << I->getParent()->getName() << ":" << *I << "\n";
  }

  void printBlocks() const {
    for (auto *BB : getDistributedLoop()->getBlocks())
      dbgs() << *BB;
  }

private:
  /// \brief Instructions from OrigLoop selected for this partition.
  InstructionSet Set;

  /// \brief Whether this partition contains a dependence cycle.
  bool DepCycle;

  /// \brief The original loop.
  Loop *OrigLoop;

  /// \brief The cloned loop. If this partition is mapped to the original loop,
  /// this is null.
  Loop *ClonedLoop;

  /// \brief The blocks of ClonedLoop including the preheader. If this
  /// partition is mapped to the original loop, this is empty.
  SmallVector<BasicBlock *, 8> ClonedLoopBlocks;

  /// \brief This gets populated once the set of instructions has been
  /// finalized. If this partition is mapped to the original loop, this is not
  /// set.
  ValueToValueMapTy VMap;
};
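
// A partition's lifecycle, in outline (a simplified sketch of how the
// container below drives each InstPartition; not verbatim from the code):
//
//   InstPartition P(SeedInst, L); // seed with a memory instruction
//   P.populateUsedSet();          // pull in the use-def closure + terminators
//   P.cloneLoopWithPreheader(InsertBefore, DomBB, Idx, LI, DT);
//   P.remapInstructions();        // rewrite cloned operands via the VMap
//   P.removeUnusedInsts();        // drop instructions of other partitions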

/// \brief Holds the set of Partitions. It populates them, merges them and then
/// clones the loops.
class InstPartitionContainer {
  typedef DenseMap<Instruction *, int> InstToPartitionIdT;

public:
  InstPartitionContainer(Loop *L, LoopInfo *LI, DominatorTree *DT)
      : L(L), LI(LI), DT(DT) {}

  /// \brief Returns the number of partitions.
  unsigned getSize() const { return PartitionContainer.size(); }

  /// \brief Adds \p Inst into the current partition if that is marked to
  /// contain cycles. Otherwise start a new partition for it.
  void addToCyclicPartition(Instruction *Inst) {
    // If the current partition is non-cyclic, start a new one.
    if (PartitionContainer.empty() || !PartitionContainer.back().hasDepCycle())
      PartitionContainer.emplace_back(Inst, L, /*DepCycle=*/true);
    else
      PartitionContainer.back().add(Inst);
  }

  /// \brief Adds \p Inst into a partition that is not marked to contain
  /// dependence cycles.
  ///
  // Initially we isolate memory instructions into as many partitions as
  // possible, then later we may merge them back together.
  void addToNewNonCyclicPartition(Instruction *Inst) {
    PartitionContainer.emplace_back(Inst, L);
  }

  /// \brief Merges adjacent non-cyclic partitions.
  ///
  /// The idea is that we currently only want to isolate the non-vectorizable
  /// partition. We could later allow more distribution among these partitions
  /// too.
  void mergeAdjacentNonCyclic() {
    mergeAdjacentPartitionsIf(
        [](const InstPartition *P) { return !P->hasDepCycle(); });
  }

  /// \brief If a partition contains only conditional stores, we won't
  /// vectorize it. Try to merge it with a previous cyclic partition.
  void mergeNonIfConvertible() {
    mergeAdjacentPartitionsIf([&](const InstPartition *Partition) {
      if (Partition->hasDepCycle())
        return true;

      // Now, check if all stores are conditional in this partition.
      bool seenStore = false;

      for (auto *Inst : *Partition)
        if (isa<StoreInst>(Inst)) {
          seenStore = true;
          if (!LoopAccessInfo::blockNeedsPredication(Inst->getParent(), L, DT))
            return false;
        }
      return seenStore;
    });
  }

  /// \brief Merges the partitions according to various heuristics.
  void mergeBeforePopulating() {
    mergeAdjacentNonCyclic();
    if (!DistributeNonIfConvertible)
      mergeNonIfConvertible();
  }
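
  // For illustration, if seeding produced the partition sequence
  // [NC0, NC1, C2, NC3] (NC = non-cyclic, C = cyclic), then
  // mergeAdjacentNonCyclic() yields [NC0+NC1, C2, NC3]: only adjacent
  // non-cyclic partitions are merged, so NC3 remains separate and can be
  // vectorized independently of the cyclic partition C2.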

  /// \brief Merges partitions in order to ensure that no loads are duplicated.
  ///
  /// We can't duplicate loads because that could potentially reorder them.
  /// LoopAccessAnalysis provides dependency information with the context that
  /// the order of memory operations is preserved.
  ///
  /// Returns true if any partitions were merged.
  bool mergeToAvoidDuplicatedLoads() {
    typedef DenseMap<Instruction *, InstPartition *> LoadToPartitionT;
    typedef EquivalenceClasses<InstPartition *> ToBeMergedT;

    LoadToPartitionT LoadToPartition;
    ToBeMergedT ToBeMerged;

    // Step through the partitions and create equivalence between partitions
    // that contain the same load. Also put partitions in between them in the
    // same equivalence class to avoid reordering of memory operations.
    for (PartitionContainerT::iterator I = PartitionContainer.begin(),
                                       E = PartitionContainer.end();
         I != E; ++I) {
      auto *PartI = &*I;

      // If a load occurs in two partitions PartI and PartJ, merge all
      // partitions (PartI, PartJ] into PartI.
      for (Instruction *Inst : *PartI)
        if (isa<LoadInst>(Inst)) {
          bool NewElt;
          LoadToPartitionT::iterator LoadToPart;

          std::tie(LoadToPart, NewElt) =
              LoadToPartition.insert(std::make_pair(Inst, PartI));
          if (!NewElt) {
            DEBUG(dbgs() << "Merging partitions due to this load in multiple "
                         << "partitions: " << PartI << ", "
                         << LoadToPart->second << "\n" << *Inst << "\n");

            auto PartJ = I;
            do {
              --PartJ;
              ToBeMerged.unionSets(PartI, &*PartJ);
            } while (&*PartJ != LoadToPart->second);
          }
        }
    }
    if (ToBeMerged.empty())
      return false;

    // Merge the members of an equivalence class into its class leader. This
    // makes the merged-from partitions empty.
    for (ToBeMergedT::iterator I = ToBeMerged.begin(), E = ToBeMerged.end();
         I != E; ++I) {
      if (!I->isLeader())
        continue;

      auto PartI = I->getData();
      for (auto PartJ : make_range(std::next(ToBeMerged.member_begin(I)),
                                   ToBeMerged.member_end())) {
        PartJ->moveTo(*PartI);
      }
    }

    // Remove the empty partitions.
    PartitionContainer.remove_if(
        [](const InstPartition &P) { return P.empty(); });

    return true;
  }

  /// \brief Sets up the mapping from instructions to partitions. If an
  /// instruction is duplicated across multiple partitions, set the entry
  /// to -1.
  void setupPartitionIdOnInstructions() {
    int PartitionID = 0;
    for (const auto &Partition : PartitionContainer) {
      for (Instruction *Inst : Partition) {
        bool NewElt;
        InstToPartitionIdT::iterator Iter;

        std::tie(Iter, NewElt) =
            InstToPartitionId.insert(std::make_pair(Inst, PartitionID));
        if (!NewElt)
          Iter->second = -1;
      }
      ++PartitionID;
    }
  }

  /// \brief Populates each partition with everything that its seeding
  /// instructions require.
  void populateUsedSet() {
    for (auto &P : PartitionContainer)
      P.populateUsedSet();
  }

  /// \brief This performs the main chunk of the work of cloning the loops for
  /// the partitions.
  void cloneLoops(Pass *P) {
    BasicBlock *OrigPH = L->getLoopPreheader();
    // At this point the predecessor of the preheader is either the memcheck
    // block or the top part of the original preheader.
    BasicBlock *Pred = OrigPH->getSinglePredecessor();
    assert(Pred && "Preheader does not have a single predecessor");
    BasicBlock *ExitBlock = L->getExitBlock();
    assert(ExitBlock && "No single exit block");
    Loop *NewLoop;

    assert(!PartitionContainer.empty() && "at least two partitions expected");
    // We're cloning the preheader along with the loop so we already made sure
    // it was empty.
    assert(&*OrigPH->begin() == OrigPH->getTerminator() &&
           "preheader not empty");

    // Create a loop for each partition except the last. Clone the original
    // loop before PH along with adding a preheader for the cloned loop. Then
    // update PH to point to the newly added preheader.
    BasicBlock *TopPH = OrigPH;
    unsigned Index = getSize() - 1;
    for (auto I = std::next(PartitionContainer.rbegin()),
              E = PartitionContainer.rend();
         I != E; ++I, --Index, TopPH = NewLoop->getLoopPreheader()) {
      auto *Part = &*I;

      NewLoop = Part->cloneLoopWithPreheader(TopPH, Pred, Index, LI, DT);

      Part->getVMap()[ExitBlock] = TopPH;
      Part->remapInstructions();
    }
    Pred->getTerminator()->replaceUsesOfWith(OrigPH, TopPH);

    // Now go in forward order and update the immediate dominator for the
    // preheaders with the exiting block of the previous loop. Dominance
    // within the loop is updated in cloneLoopWithPreheader.
    for (auto Curr = PartitionContainer.cbegin(),
              Next = std::next(PartitionContainer.cbegin()),
              E = PartitionContainer.cend();
         Next != E; ++Curr, ++Next)
      DT->changeImmediateDominator(
          Next->getDistributedLoop()->getLoopPreheader(),
          Curr->getDistributedLoop()->getExitingBlock());
  }
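
  // After cloneLoops() with two partitions the CFG is laid out roughly as
  // follows (a sketch; the ".ldist1" suffix comes from the cloning above):
  //
  //   Pred -> PH.ldist1 -> <cloned loop 1> -> PH -> <original loop> -> Exit
  //
  // Each cloned loop's exit is remapped to the next partition's preheader;
  // the last partition reuses the original loop and exit block.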

  /// \brief Removes the dead instructions from the cloned loops.
  void removeUnusedInsts() {
    for (auto &Partition : PartitionContainer)
      Partition.removeUnusedInsts();
  }

  /// \brief For each memory pointer, computes the partition id that the
  /// pointer is used in.
  ///
  /// This returns an array of int where the I-th entry corresponds to the I-th
  /// entry in LAI.getRuntimePointerCheck(). If the pointer is used in multiple
  /// partitions its entry is set to -1.
  SmallVector<int, 8>
  computePartitionSetForPointers(const LoopAccessInfo &LAI) {
    const LoopAccessInfo::RuntimePointerCheck *RtPtrCheck =
        LAI.getRuntimePointerCheck();

    unsigned N = RtPtrCheck->Pointers.size();
    SmallVector<int, 8> PtrToPartitions(N);
    for (unsigned I = 0; I < N; ++I) {
      Value *Ptr = RtPtrCheck->Pointers[I];
      auto Instructions =
          LAI.getInstructionsForAccess(Ptr, RtPtrCheck->IsWritePtr[I]);

      int &Partition = PtrToPartitions[I];
      // First set it to uninitialized.
      Partition = -2;
      for (Instruction *Inst : Instructions) {
        // Note that this could be -1 if Inst is duplicated across multiple
        // partitions.
        int ThisPartition = this->InstToPartitionId[Inst];
        if (Partition == -2)
          Partition = ThisPartition;
        // -1 means belonging to multiple partitions.
        else if (Partition == -1)
          break;
        else if (Partition != (int)ThisPartition)
          Partition = -1;
      }
      assert(Partition != -2 && "Pointer not belonging to any partition");
    }

    return PtrToPartitions;
  }
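
  // For instance, with two partitions a pointer whose accesses all fall in
  // partition 1 gets the entry 1; a pointer accessed from both partitions
  // (or through an instruction duplicated across them) gets -1 and is
  // treated conservatively when forming the run-time checks.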

  void print(raw_ostream &OS) const {
    unsigned Index = 0;
    for (const auto &P : PartitionContainer) {
      OS << "Partition " << Index++ << " (" << &P << "):\n";
      P.print();
    }
  }

  void dump() const { print(dbgs()); }

#ifndef NDEBUG
  friend raw_ostream &operator<<(raw_ostream &OS,
                                 const InstPartitionContainer &Partitions) {
    Partitions.print(OS);
    return OS;
  }
#endif

  void printBlocks() const {
    unsigned Index = 0;
    for (const auto &P : PartitionContainer) {
      dbgs() << "\nPartition " << Index++ << " (" << &P << "):\n";
      P.printBlocks();
    }
  }

private:
  typedef std::list<InstPartition> PartitionContainerT;

  /// \brief List of partitions.
  PartitionContainerT PartitionContainer;

  /// \brief Mapping from Instruction to partition Id. If the instruction
  /// belongs to multiple partitions the entry contains -1.
  InstToPartitionIdT InstToPartitionId;

  Loop *L;
  LoopInfo *LI;
  DominatorTree *DT;

  /// \brief The control structure to merge adjacent partitions if both satisfy
  /// the \p Predicate.
  template <class UnaryPredicate>
  void mergeAdjacentPartitionsIf(UnaryPredicate Predicate) {
    InstPartition *PrevMatch = nullptr;
    for (auto I = PartitionContainer.begin(); I != PartitionContainer.end();) {
      auto DoesMatch = Predicate(&*I);
      if (PrevMatch == nullptr && DoesMatch) {
        PrevMatch = &*I;
        ++I;
      } else if (PrevMatch != nullptr && DoesMatch) {
        I->moveTo(*PrevMatch);
        I = PartitionContainer.erase(I);
      } else {
        PrevMatch = nullptr;
        ++I;
      }
    }
  }
};

/// \brief For each memory instruction, this class maintains the difference
/// between the number of unsafe dependences that start out from this
/// instruction and the number of those that end here.
///
/// By traversing the memory instructions in program order and accumulating
/// this number, we know whether any unsafe dependence crosses over a program
/// point.
class MemoryInstructionDependences {
  typedef MemoryDepChecker::Dependence Dependence;

public:
  struct Entry {
    Instruction *Inst;
    unsigned NumUnsafeDependencesStartOrEnd;

    Entry(Instruction *Inst) : Inst(Inst), NumUnsafeDependencesStartOrEnd(0) {}
  };

  typedef SmallVector<Entry, 8> AccessesType;

  AccessesType::const_iterator begin() const { return Accesses.begin(); }
  AccessesType::const_iterator end() const { return Accesses.end(); }

  MemoryInstructionDependences(
      const SmallVectorImpl<Instruction *> &Instructions,
      const SmallVectorImpl<Dependence> &InterestingDependences) {
    Accesses.append(Instructions.begin(), Instructions.end());

    DEBUG(dbgs() << "Backward dependences:\n");
    for (auto &Dep : InterestingDependences)
      if (Dep.isPossiblyBackward()) {
        // Note that the designations source and destination follow the program
        // order, i.e. source is always first. (The direction is given by the
        // DepType.)
        ++Accesses[Dep.Source].NumUnsafeDependencesStartOrEnd;
        --Accesses[Dep.Destination].NumUnsafeDependencesStartOrEnd;

        DEBUG(Dep.print(dbgs(), 2, Instructions));
      }
  }

private:
  AccessesType Accesses;
};

/// \brief Returns the instructions defined in the loop that are used outside
/// of it.
static SmallVector<Instruction *, 8> findDefsUsedOutsideOfLoop(Loop *L) {
  SmallVector<Instruction *, 8> UsedOutside;

  for (auto *Block : L->getBlocks())
    // FIXME: I believe that this could use copy_if if the Inst reference could
    // be adapted into a pointer.
    for (auto &Inst : *Block) {
      auto Users = Inst.users();
      if (std::any_of(Users.begin(), Users.end(), [&](User *U) {
            auto *Use = cast<Instruction>(U);
            return !L->contains(Use->getParent());
          }))
        UsedOutside.push_back(&Inst);
    }

  return UsedOutside;
}
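
// As a small illustration (hypothetical source, not from a test case), in
//
//   for (i = 0; i < n; i++)
//     sum += A[i];
//   use(sum);
//
// the loop-resident instructions computing 'sum' are returned because their
// value is consumed by 'use' after the loop; such values get PHI nodes when
// the loop is versioned (see LoopVersioning::addPHINodes in processLoop).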

/// \brief The pass class.
class LoopDistribute : public FunctionPass {
public:
  LoopDistribute() : FunctionPass(ID) {
    initializeLoopDistributePass(*PassRegistry::getPassRegistry());
  }

  bool runOnFunction(Function &F) override {
    LI = &getAnalysis<LoopInfoWrapperPass>().getLoopInfo();
    LAA = &getAnalysis<LoopAccessAnalysis>();
    DT = &getAnalysis<DominatorTreeWrapperPass>().getDomTree();

    // Build up a worklist of inner-most loops to distribute. This is
    // necessary as the act of distributing a loop creates new loops and can
    // invalidate iterators across the loops.
    SmallVector<Loop *, 8> Worklist;

    for (Loop *TopLevelLoop : *LI)
      for (Loop *L : depth_first(TopLevelLoop))
        // We only handle inner-most loops.
        if (L->empty())
          Worklist.push_back(L);

    // Now walk the identified inner loops.
    bool Changed = false;
    for (Loop *L : Worklist)
      Changed |= processLoop(L);

    return Changed;
  }

  void getAnalysisUsage(AnalysisUsage &AU) const override {
    AU.addRequired<LoopInfoWrapperPass>();
    AU.addPreserved<LoopInfoWrapperPass>();
    AU.addRequired<LoopAccessAnalysis>();
    AU.addRequired<DominatorTreeWrapperPass>();
    AU.addPreserved<DominatorTreeWrapperPass>();
  }

  static char ID;

private:
  /// \brief Try to distribute an inner-most loop.
  bool processLoop(Loop *L) {
    assert(L->empty() && "Only process inner loops.");

    DEBUG(dbgs() << "\nLDist: In \"" << L->getHeader()->getParent()->getName()
                 << "\" checking " << *L << "\n");

    BasicBlock *PH = L->getLoopPreheader();
    if (!PH) {
      DEBUG(dbgs() << "Skipping; no preheader");
      return false;
    }
    if (!L->getExitBlock()) {
      DEBUG(dbgs() << "Skipping; multiple exit blocks");
      return false;
    }
    // LAA will check that we only have a single exiting block.

    const LoopAccessInfo &LAI = LAA->getInfo(L, ValueToValueMap());

    // Currently, we only distribute to isolate the part of the loop with
    // dependence cycles to enable partial vectorization.
    if (LAI.canVectorizeMemory()) {
      DEBUG(dbgs() << "Skipping; memory operations are safe for vectorization");
      return false;
    }
    auto *InterestingDependences =
        LAI.getDepChecker().getInterestingDependences();
    if (!InterestingDependences || InterestingDependences->empty()) {
      DEBUG(dbgs() << "Skipping; No unsafe dependences to isolate");
      return false;
    }

    InstPartitionContainer Partitions(L, LI, DT);

    // First, go through each memory operation and assign them to consecutive
    // partitions (the order of partitions follows program order). Put those
    // with unsafe dependences into a "cyclic" partition, otherwise put each
    // store in its own "non-cyclic" partition (we'll merge these later).
    //
    // Note that a memory operation (e.g. Load2 below) at a program point that
    // has an unsafe dependence (Store3->Load1) spanning over it must be
    // included in the same cyclic partition as the dependent operations. This
    // is to preserve the original program order after distribution. E.g.:
    //
    //            NumUnsafeDependencesStartOrEnd  NumUnsafeDependencesActive
    //  Load1 -.                1                         0->1
    //  Load2  | /Unsafe/       0                         1
    //  Store3 -'              -1                         1->0
    //  Load4                   0                         0
    //
    // NumUnsafeDependencesActive > 0 indicates this situation and in this case
    // we just keep assigning to the same cyclic partition until
    // NumUnsafeDependencesActive reaches 0.
    const MemoryDepChecker &DepChecker = LAI.getDepChecker();
    MemoryInstructionDependences MID(DepChecker.getMemoryInstructions(),
                                     *InterestingDependences);

    int NumUnsafeDependencesActive = 0;
    for (auto &InstDep : MID) {
      Instruction *I = InstDep.Inst;
      // We update NumUnsafeDependencesActive post-instruction, so catch the
      // start of a dependence directly via NumUnsafeDependencesStartOrEnd.
      if (NumUnsafeDependencesActive ||
          InstDep.NumUnsafeDependencesStartOrEnd > 0)
        Partitions.addToCyclicPartition(I);
      else
        Partitions.addToNewNonCyclicPartition(I);
      NumUnsafeDependencesActive += InstDep.NumUnsafeDependencesStartOrEnd;
      assert(NumUnsafeDependencesActive >= 0 &&
             "Negative number of dependences active");
    }

    // Add partitions for values used outside. These partitions can be out of
    // order from the original program order. This is OK because if the
    // partition uses a load we will merge this partition with the original
    // partition of the load that we set up in the previous loop (see
    // mergeToAvoidDuplicatedLoads).
    auto DefsUsedOutside = findDefsUsedOutsideOfLoop(L);
    for (auto *Inst : DefsUsedOutside)
      Partitions.addToNewNonCyclicPartition(Inst);

    DEBUG(dbgs() << "Seeded partitions:\n" << Partitions);
    if (Partitions.getSize() < 2)
      return false;

    // Run the merge heuristics: Merge non-cyclic adjacent partitions since we
    // should be able to vectorize these together.
    Partitions.mergeBeforePopulating();
    DEBUG(dbgs() << "\nMerged partitions:\n" << Partitions);
    if (Partitions.getSize() < 2)
      return false;

    // Now, populate the partitions with non-memory operations.
    Partitions.populateUsedSet();
    DEBUG(dbgs() << "\nPopulated partitions:\n" << Partitions);
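
    // For example, if population pulled the same load into both partition 0
    // and partition 2, the call below merges partitions 0, 1 and 2 into one:
    // duplicating the load, or separating it from intervening stores, could
    // reorder it relative to accesses that LAA analyzed assuming the original
    // program order.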

    // In order to preserve original lexical order for loads, keep them in the
    // partition that we set up in the MemoryInstructionDependences loop.
    if (Partitions.mergeToAvoidDuplicatedLoads()) {
      DEBUG(dbgs() << "\nPartitions merged to ensure unique loads:\n"
                   << Partitions);
      if (Partitions.getSize() < 2)
        return false;
    }

    DEBUG(dbgs() << "\nDistributing loop: " << *L << "\n");
    // We're done forming the partitions; set up the reverse mapping from
    // instructions to partitions.
    Partitions.setupPartitionIdOnInstructions();

    // To keep things simple have an empty preheader before we version or clone
    // the loop. (Also split if this has no predecessor, i.e. entry, because we
    // rely on PH having a predecessor.)
    if (!PH->getSinglePredecessor() || &*PH->begin() != PH->getTerminator())
      SplitBlock(PH, PH->getTerminator(), DT, LI);

    // If we need run-time checks to disambiguate pointers at run-time, version
    // the loop now.
    auto PtrToPartition = Partitions.computePartitionSetForPointers(LAI);
    LoopVersioning LVer(LAI, L, LI, DT, &PtrToPartition);
    if (LVer.needsRuntimeChecks()) {
      DEBUG(dbgs() << "\nPointers:\n");
      DEBUG(LAI.getRuntimePointerCheck()->print(dbgs(), 0, &PtrToPartition));
      LVer.versionLoop(this);
      LVer.addPHINodes(DefsUsedOutside);
    }

    // Create identical copies of the original loop for each partition and hook
    // them up sequentially.
    Partitions.cloneLoops(this);

    // Now, we remove the instructions from each loop that don't belong to its
    // partition.
    Partitions.removeUnusedInsts();
    DEBUG(dbgs() << "\nAfter removing unused Instrs:\n");
    DEBUG(Partitions.printBlocks());

    if (LDistVerify) {
      LI->verify();
      DT->verifyDomTree();
    }

    ++NumLoopsDistributed;
    return true;
  }

  // Analyses used.
  LoopInfo *LI;
  LoopAccessAnalysis *LAA;
  DominatorTree *DT;
};
} // anonymous namespace

char LoopDistribute::ID;
static const char ldist_name[] = "Loop Distribution";

INITIALIZE_PASS_BEGIN(LoopDistribute, LDIST_NAME, ldist_name, false, false)
INITIALIZE_PASS_DEPENDENCY(LoopInfoWrapperPass)
INITIALIZE_PASS_DEPENDENCY(LoopAccessAnalysis)
INITIALIZE_PASS_DEPENDENCY(DominatorTreeWrapperPass)
INITIALIZE_PASS_END(LoopDistribute, LDIST_NAME, ldist_name, false, false)

namespace llvm {
FunctionPass *createLoopDistributePass() { return new LoopDistribute(); }
}