1 //===- LoopDistribute.cpp - Loop Distribution Pass ------------------------===// 2 // 3 // The LLVM Compiler Infrastructure 4 // 5 // This file is distributed under the University of Illinois Open Source 6 // License. See LICENSE.TXT for details. 7 // 8 //===----------------------------------------------------------------------===// 9 // 10 // This file implements the Loop Distribution Pass. Its main focus is to 11 // distribute loops that cannot be vectorized due to dependence cycles. It 12 // tries to isolate the offending dependences into a new loop allowing 13 // vectorization of the remaining parts. 14 // 15 // For dependence analysis, the pass uses the LoopVectorizer's 16 // LoopAccessAnalysis. Because this analysis presumes no change in the order of 17 // memory operations, special care is taken to preserve the lexical order of 18 // these operations. 19 // 20 // Similarly to the Vectorizer, the pass also supports loop versioning to 21 // run-time disambiguate potentially overlapping arrays. 22 // 23 //===----------------------------------------------------------------------===// 24 25 #include "llvm/Transforms/Scalar/LoopDistribute.h" 26 #include "llvm/ADT/DepthFirstIterator.h" 27 #include "llvm/ADT/EquivalenceClasses.h" 28 #include "llvm/ADT/STLExtras.h" 29 #include "llvm/ADT/Statistic.h" 30 #include "llvm/Analysis/BlockFrequencyInfo.h" 31 #include "llvm/Analysis/GlobalsModRef.h" 32 #include "llvm/Analysis/LoopAccessAnalysis.h" 33 #include "llvm/Analysis/LoopInfo.h" 34 #include "llvm/Analysis/LoopPassManager.h" 35 #include "llvm/Analysis/OptimizationDiagnosticInfo.h" 36 #include "llvm/IR/DiagnosticInfo.h" 37 #include "llvm/IR/Dominators.h" 38 #include "llvm/Pass.h" 39 #include "llvm/Support/CommandLine.h" 40 #include "llvm/Support/Debug.h" 41 #include "llvm/Transforms/Utils/BasicBlockUtils.h" 42 #include "llvm/Transforms/Utils/Cloning.h" 43 #include "llvm/Transforms/Utils/LoopUtils.h" 44 #include "llvm/Transforms/Utils/LoopVersioning.h" 45 #include <list> 46 47 #define LDIST_NAME "loop-distribute" 48 #define DEBUG_TYPE LDIST_NAME 49 50 using namespace llvm; 51 52 static cl::opt<bool> 53 LDistVerify("loop-distribute-verify", cl::Hidden, 54 cl::desc("Turn on DominatorTree and LoopInfo verification " 55 "after Loop Distribution"), 56 cl::init(false)); 57 58 static cl::opt<bool> DistributeNonIfConvertible( 59 "loop-distribute-non-if-convertible", cl::Hidden, 60 cl::desc("Whether to distribute into a loop that may not be " 61 "if-convertible by the loop vectorizer"), 62 cl::init(false)); 63 64 static cl::opt<unsigned> DistributeSCEVCheckThreshold( 65 "loop-distribute-scev-check-threshold", cl::init(8), cl::Hidden, 66 cl::desc("The maximum number of SCEV checks allowed for Loop " 67 "Distribution")); 68 69 static cl::opt<unsigned> PragmaDistributeSCEVCheckThreshold( 70 "loop-distribute-scev-check-threshold-with-pragma", cl::init(128), 71 cl::Hidden, 72 cl::desc( 73 "The maximum number of SCEV checks allowed for Loop " 74 "Distribution for loop marked with #pragma loop distribute(enable)")); 75 76 // Note that the initial value for this depends on whether the pass is invoked 77 // directly or from the optimization pipeline. 78 static cl::opt<bool> EnableLoopDistribute( 79 "enable-loop-distribute", cl::Hidden, 80 cl::desc("Enable the new, experimental LoopDistribution Pass")); 81 82 STATISTIC(NumLoopsDistributed, "Number of loops distributed"); 83 84 namespace { 85 /// \brief Maintains the set of instructions of the loop for a partition before 86 /// cloning. After cloning, it hosts the new loop. 87 class InstPartition { 88 typedef SmallPtrSet<Instruction *, 8> InstructionSet; 89 90 public: 91 InstPartition(Instruction *I, Loop *L, bool DepCycle = false) 92 : DepCycle(DepCycle), OrigLoop(L), ClonedLoop(nullptr) { 93 Set.insert(I); 94 } 95 96 /// \brief Returns whether this partition contains a dependence cycle. 97 bool hasDepCycle() const { return DepCycle; } 98 99 /// \brief Adds an instruction to this partition. 100 void add(Instruction *I) { Set.insert(I); } 101 102 /// \brief Collection accessors. 103 InstructionSet::iterator begin() { return Set.begin(); } 104 InstructionSet::iterator end() { return Set.end(); } 105 InstructionSet::const_iterator begin() const { return Set.begin(); } 106 InstructionSet::const_iterator end() const { return Set.end(); } 107 bool empty() const { return Set.empty(); } 108 109 /// \brief Moves this partition into \p Other. This partition becomes empty 110 /// after this. 111 void moveTo(InstPartition &Other) { 112 Other.Set.insert(Set.begin(), Set.end()); 113 Set.clear(); 114 Other.DepCycle |= DepCycle; 115 } 116 117 /// \brief Populates the partition with a transitive closure of all the 118 /// instructions that the seeded instructions dependent on. 119 void populateUsedSet() { 120 // FIXME: We currently don't use control-dependence but simply include all 121 // blocks (possibly empty at the end) and let simplifycfg mostly clean this 122 // up. 123 for (auto *B : OrigLoop->getBlocks()) 124 Set.insert(B->getTerminator()); 125 126 // Follow the use-def chains to form a transitive closure of all the 127 // instructions that the originally seeded instructions depend on. 128 SmallVector<Instruction *, 8> Worklist(Set.begin(), Set.end()); 129 while (!Worklist.empty()) { 130 Instruction *I = Worklist.pop_back_val(); 131 // Insert instructions from the loop that we depend on. 132 for (Value *V : I->operand_values()) { 133 auto *I = dyn_cast<Instruction>(V); 134 if (I && OrigLoop->contains(I->getParent()) && Set.insert(I).second) 135 Worklist.push_back(I); 136 } 137 } 138 } 139 140 /// \brief Clones the original loop. 141 /// 142 /// Updates LoopInfo and DominatorTree using the information that block \p 143 /// LoopDomBB dominates the loop. 144 Loop *cloneLoopWithPreheader(BasicBlock *InsertBefore, BasicBlock *LoopDomBB, 145 unsigned Index, LoopInfo *LI, 146 DominatorTree *DT) { 147 ClonedLoop = ::cloneLoopWithPreheader(InsertBefore, LoopDomBB, OrigLoop, 148 VMap, Twine(".ldist") + Twine(Index), 149 LI, DT, ClonedLoopBlocks); 150 return ClonedLoop; 151 } 152 153 /// \brief The cloned loop. If this partition is mapped to the original loop, 154 /// this is null. 155 const Loop *getClonedLoop() const { return ClonedLoop; } 156 157 /// \brief Returns the loop where this partition ends up after distribution. 158 /// If this partition is mapped to the original loop then use the block from 159 /// the loop. 160 const Loop *getDistributedLoop() const { 161 return ClonedLoop ? ClonedLoop : OrigLoop; 162 } 163 164 /// \brief The VMap that is populated by cloning and then used in 165 /// remapinstruction to remap the cloned instructions. 166 ValueToValueMapTy &getVMap() { return VMap; } 167 168 /// \brief Remaps the cloned instructions using VMap. 169 void remapInstructions() { 170 remapInstructionsInBlocks(ClonedLoopBlocks, VMap); 171 } 172 173 /// \brief Based on the set of instructions selected for this partition, 174 /// removes the unnecessary ones. 175 void removeUnusedInsts() { 176 SmallVector<Instruction *, 8> Unused; 177 178 for (auto *Block : OrigLoop->getBlocks()) 179 for (auto &Inst : *Block) 180 if (!Set.count(&Inst)) { 181 Instruction *NewInst = &Inst; 182 if (!VMap.empty()) 183 NewInst = cast<Instruction>(VMap[NewInst]); 184 185 assert(!isa<BranchInst>(NewInst) && 186 "Branches are marked used early on"); 187 Unused.push_back(NewInst); 188 } 189 190 // Delete the instructions backwards, as it has a reduced likelihood of 191 // having to update as many def-use and use-def chains. 192 for (auto *Inst : reverse(Unused)) { 193 if (!Inst->use_empty()) 194 Inst->replaceAllUsesWith(UndefValue::get(Inst->getType())); 195 Inst->eraseFromParent(); 196 } 197 } 198 199 void print() const { 200 if (DepCycle) 201 dbgs() << " (cycle)\n"; 202 for (auto *I : Set) 203 // Prefix with the block name. 204 dbgs() << " " << I->getParent()->getName() << ":" << *I << "\n"; 205 } 206 207 void printBlocks() const { 208 for (auto *BB : getDistributedLoop()->getBlocks()) 209 dbgs() << *BB; 210 } 211 212 private: 213 /// \brief Instructions from OrigLoop selected for this partition. 214 InstructionSet Set; 215 216 /// \brief Whether this partition contains a dependence cycle. 217 bool DepCycle; 218 219 /// \brief The original loop. 220 Loop *OrigLoop; 221 222 /// \brief The cloned loop. If this partition is mapped to the original loop, 223 /// this is null. 224 Loop *ClonedLoop; 225 226 /// \brief The blocks of ClonedLoop including the preheader. If this 227 /// partition is mapped to the original loop, this is empty. 228 SmallVector<BasicBlock *, 8> ClonedLoopBlocks; 229 230 /// \brief These gets populated once the set of instructions have been 231 /// finalized. If this partition is mapped to the original loop, these are not 232 /// set. 233 ValueToValueMapTy VMap; 234 }; 235 236 /// \brief Holds the set of Partitions. It populates them, merges them and then 237 /// clones the loops. 238 class InstPartitionContainer { 239 typedef DenseMap<Instruction *, int> InstToPartitionIdT; 240 241 public: 242 InstPartitionContainer(Loop *L, LoopInfo *LI, DominatorTree *DT) 243 : L(L), LI(LI), DT(DT) {} 244 245 /// \brief Returns the number of partitions. 246 unsigned getSize() const { return PartitionContainer.size(); } 247 248 /// \brief Adds \p Inst into the current partition if that is marked to 249 /// contain cycles. Otherwise start a new partition for it. 250 void addToCyclicPartition(Instruction *Inst) { 251 // If the current partition is non-cyclic. Start a new one. 252 if (PartitionContainer.empty() || !PartitionContainer.back().hasDepCycle()) 253 PartitionContainer.emplace_back(Inst, L, /*DepCycle=*/true); 254 else 255 PartitionContainer.back().add(Inst); 256 } 257 258 /// \brief Adds \p Inst into a partition that is not marked to contain 259 /// dependence cycles. 260 /// 261 // Initially we isolate memory instructions into as many partitions as 262 // possible, then later we may merge them back together. 263 void addToNewNonCyclicPartition(Instruction *Inst) { 264 PartitionContainer.emplace_back(Inst, L); 265 } 266 267 /// \brief Merges adjacent non-cyclic partitions. 268 /// 269 /// The idea is that we currently only want to isolate the non-vectorizable 270 /// partition. We could later allow more distribution among these partition 271 /// too. 272 void mergeAdjacentNonCyclic() { 273 mergeAdjacentPartitionsIf( 274 [](const InstPartition *P) { return !P->hasDepCycle(); }); 275 } 276 277 /// \brief If a partition contains only conditional stores, we won't vectorize 278 /// it. Try to merge it with a previous cyclic partition. 279 void mergeNonIfConvertible() { 280 mergeAdjacentPartitionsIf([&](const InstPartition *Partition) { 281 if (Partition->hasDepCycle()) 282 return true; 283 284 // Now, check if all stores are conditional in this partition. 285 bool seenStore = false; 286 287 for (auto *Inst : *Partition) 288 if (isa<StoreInst>(Inst)) { 289 seenStore = true; 290 if (!LoopAccessInfo::blockNeedsPredication(Inst->getParent(), L, DT)) 291 return false; 292 } 293 return seenStore; 294 }); 295 } 296 297 /// \brief Merges the partitions according to various heuristics. 298 void mergeBeforePopulating() { 299 mergeAdjacentNonCyclic(); 300 if (!DistributeNonIfConvertible) 301 mergeNonIfConvertible(); 302 } 303 304 /// \brief Merges partitions in order to ensure that no loads are duplicated. 305 /// 306 /// We can't duplicate loads because that could potentially reorder them. 307 /// LoopAccessAnalysis provides dependency information with the context that 308 /// the order of memory operation is preserved. 309 /// 310 /// Return if any partitions were merged. 311 bool mergeToAvoidDuplicatedLoads() { 312 typedef DenseMap<Instruction *, InstPartition *> LoadToPartitionT; 313 typedef EquivalenceClasses<InstPartition *> ToBeMergedT; 314 315 LoadToPartitionT LoadToPartition; 316 ToBeMergedT ToBeMerged; 317 318 // Step through the partitions and create equivalence between partitions 319 // that contain the same load. Also put partitions in between them in the 320 // same equivalence class to avoid reordering of memory operations. 321 for (PartitionContainerT::iterator I = PartitionContainer.begin(), 322 E = PartitionContainer.end(); 323 I != E; ++I) { 324 auto *PartI = &*I; 325 326 // If a load occurs in two partitions PartI and PartJ, merge all 327 // partitions (PartI, PartJ] into PartI. 328 for (Instruction *Inst : *PartI) 329 if (isa<LoadInst>(Inst)) { 330 bool NewElt; 331 LoadToPartitionT::iterator LoadToPart; 332 333 std::tie(LoadToPart, NewElt) = 334 LoadToPartition.insert(std::make_pair(Inst, PartI)); 335 if (!NewElt) { 336 DEBUG(dbgs() << "Merging partitions due to this load in multiple " 337 << "partitions: " << PartI << ", " 338 << LoadToPart->second << "\n" << *Inst << "\n"); 339 340 auto PartJ = I; 341 do { 342 --PartJ; 343 ToBeMerged.unionSets(PartI, &*PartJ); 344 } while (&*PartJ != LoadToPart->second); 345 } 346 } 347 } 348 if (ToBeMerged.empty()) 349 return false; 350 351 // Merge the member of an equivalence class into its class leader. This 352 // makes the members empty. 353 for (ToBeMergedT::iterator I = ToBeMerged.begin(), E = ToBeMerged.end(); 354 I != E; ++I) { 355 if (!I->isLeader()) 356 continue; 357 358 auto PartI = I->getData(); 359 for (auto PartJ : make_range(std::next(ToBeMerged.member_begin(I)), 360 ToBeMerged.member_end())) { 361 PartJ->moveTo(*PartI); 362 } 363 } 364 365 // Remove the empty partitions. 366 PartitionContainer.remove_if( 367 [](const InstPartition &P) { return P.empty(); }); 368 369 return true; 370 } 371 372 /// \brief Sets up the mapping between instructions to partitions. If the 373 /// instruction is duplicated across multiple partitions, set the entry to -1. 374 void setupPartitionIdOnInstructions() { 375 int PartitionID = 0; 376 for (const auto &Partition : PartitionContainer) { 377 for (Instruction *Inst : Partition) { 378 bool NewElt; 379 InstToPartitionIdT::iterator Iter; 380 381 std::tie(Iter, NewElt) = 382 InstToPartitionId.insert(std::make_pair(Inst, PartitionID)); 383 if (!NewElt) 384 Iter->second = -1; 385 } 386 ++PartitionID; 387 } 388 } 389 390 /// \brief Populates the partition with everything that the seeding 391 /// instructions require. 392 void populateUsedSet() { 393 for (auto &P : PartitionContainer) 394 P.populateUsedSet(); 395 } 396 397 /// \brief This performs the main chunk of the work of cloning the loops for 398 /// the partitions. 399 void cloneLoops() { 400 BasicBlock *OrigPH = L->getLoopPreheader(); 401 // At this point the predecessor of the preheader is either the memcheck 402 // block or the top part of the original preheader. 403 BasicBlock *Pred = OrigPH->getSinglePredecessor(); 404 assert(Pred && "Preheader does not have a single predecessor"); 405 BasicBlock *ExitBlock = L->getExitBlock(); 406 assert(ExitBlock && "No single exit block"); 407 Loop *NewLoop; 408 409 assert(!PartitionContainer.empty() && "at least two partitions expected"); 410 // We're cloning the preheader along with the loop so we already made sure 411 // it was empty. 412 assert(&*OrigPH->begin() == OrigPH->getTerminator() && 413 "preheader not empty"); 414 415 // Create a loop for each partition except the last. Clone the original 416 // loop before PH along with adding a preheader for the cloned loop. Then 417 // update PH to point to the newly added preheader. 418 BasicBlock *TopPH = OrigPH; 419 unsigned Index = getSize() - 1; 420 for (auto I = std::next(PartitionContainer.rbegin()), 421 E = PartitionContainer.rend(); 422 I != E; ++I, --Index, TopPH = NewLoop->getLoopPreheader()) { 423 auto *Part = &*I; 424 425 NewLoop = Part->cloneLoopWithPreheader(TopPH, Pred, Index, LI, DT); 426 427 Part->getVMap()[ExitBlock] = TopPH; 428 Part->remapInstructions(); 429 } 430 Pred->getTerminator()->replaceUsesOfWith(OrigPH, TopPH); 431 432 // Now go in forward order and update the immediate dominator for the 433 // preheaders with the exiting block of the previous loop. Dominance 434 // within the loop is updated in cloneLoopWithPreheader. 435 for (auto Curr = PartitionContainer.cbegin(), 436 Next = std::next(PartitionContainer.cbegin()), 437 E = PartitionContainer.cend(); 438 Next != E; ++Curr, ++Next) 439 DT->changeImmediateDominator( 440 Next->getDistributedLoop()->getLoopPreheader(), 441 Curr->getDistributedLoop()->getExitingBlock()); 442 } 443 444 /// \brief Removes the dead instructions from the cloned loops. 445 void removeUnusedInsts() { 446 for (auto &Partition : PartitionContainer) 447 Partition.removeUnusedInsts(); 448 } 449 450 /// \brief For each memory pointer, it computes the partitionId the pointer is 451 /// used in. 452 /// 453 /// This returns an array of int where the I-th entry corresponds to I-th 454 /// entry in LAI.getRuntimePointerCheck(). If the pointer is used in multiple 455 /// partitions its entry is set to -1. 456 SmallVector<int, 8> 457 computePartitionSetForPointers(const LoopAccessInfo &LAI) { 458 const RuntimePointerChecking *RtPtrCheck = LAI.getRuntimePointerChecking(); 459 460 unsigned N = RtPtrCheck->Pointers.size(); 461 SmallVector<int, 8> PtrToPartitions(N); 462 for (unsigned I = 0; I < N; ++I) { 463 Value *Ptr = RtPtrCheck->Pointers[I].PointerValue; 464 auto Instructions = 465 LAI.getInstructionsForAccess(Ptr, RtPtrCheck->Pointers[I].IsWritePtr); 466 467 int &Partition = PtrToPartitions[I]; 468 // First set it to uninitialized. 469 Partition = -2; 470 for (Instruction *Inst : Instructions) { 471 // Note that this could be -1 if Inst is duplicated across multiple 472 // partitions. 473 int ThisPartition = this->InstToPartitionId[Inst]; 474 if (Partition == -2) 475 Partition = ThisPartition; 476 // -1 means belonging to multiple partitions. 477 else if (Partition == -1) 478 break; 479 else if (Partition != (int)ThisPartition) 480 Partition = -1; 481 } 482 assert(Partition != -2 && "Pointer not belonging to any partition"); 483 } 484 485 return PtrToPartitions; 486 } 487 488 void print(raw_ostream &OS) const { 489 unsigned Index = 0; 490 for (const auto &P : PartitionContainer) { 491 OS << "Partition " << Index++ << " (" << &P << "):\n"; 492 P.print(); 493 } 494 } 495 496 void dump() const { print(dbgs()); } 497 498 #ifndef NDEBUG 499 friend raw_ostream &operator<<(raw_ostream &OS, 500 const InstPartitionContainer &Partitions) { 501 Partitions.print(OS); 502 return OS; 503 } 504 #endif 505 506 void printBlocks() const { 507 unsigned Index = 0; 508 for (const auto &P : PartitionContainer) { 509 dbgs() << "\nPartition " << Index++ << " (" << &P << "):\n"; 510 P.printBlocks(); 511 } 512 } 513 514 private: 515 typedef std::list<InstPartition> PartitionContainerT; 516 517 /// \brief List of partitions. 518 PartitionContainerT PartitionContainer; 519 520 /// \brief Mapping from Instruction to partition Id. If the instruction 521 /// belongs to multiple partitions the entry contains -1. 522 InstToPartitionIdT InstToPartitionId; 523 524 Loop *L; 525 LoopInfo *LI; 526 DominatorTree *DT; 527 528 /// \brief The control structure to merge adjacent partitions if both satisfy 529 /// the \p Predicate. 530 template <class UnaryPredicate> 531 void mergeAdjacentPartitionsIf(UnaryPredicate Predicate) { 532 InstPartition *PrevMatch = nullptr; 533 for (auto I = PartitionContainer.begin(); I != PartitionContainer.end();) { 534 auto DoesMatch = Predicate(&*I); 535 if (PrevMatch == nullptr && DoesMatch) { 536 PrevMatch = &*I; 537 ++I; 538 } else if (PrevMatch != nullptr && DoesMatch) { 539 I->moveTo(*PrevMatch); 540 I = PartitionContainer.erase(I); 541 } else { 542 PrevMatch = nullptr; 543 ++I; 544 } 545 } 546 } 547 }; 548 549 /// \brief For each memory instruction, this class maintains difference of the 550 /// number of unsafe dependences that start out from this instruction minus 551 /// those that end here. 552 /// 553 /// By traversing the memory instructions in program order and accumulating this 554 /// number, we know whether any unsafe dependence crosses over a program point. 555 class MemoryInstructionDependences { 556 typedef MemoryDepChecker::Dependence Dependence; 557 558 public: 559 struct Entry { 560 Instruction *Inst; 561 unsigned NumUnsafeDependencesStartOrEnd; 562 563 Entry(Instruction *Inst) : Inst(Inst), NumUnsafeDependencesStartOrEnd(0) {} 564 }; 565 566 typedef SmallVector<Entry, 8> AccessesType; 567 568 AccessesType::const_iterator begin() const { return Accesses.begin(); } 569 AccessesType::const_iterator end() const { return Accesses.end(); } 570 571 MemoryInstructionDependences( 572 const SmallVectorImpl<Instruction *> &Instructions, 573 const SmallVectorImpl<Dependence> &Dependences) { 574 Accesses.append(Instructions.begin(), Instructions.end()); 575 576 DEBUG(dbgs() << "Backward dependences:\n"); 577 for (auto &Dep : Dependences) 578 if (Dep.isPossiblyBackward()) { 579 // Note that the designations source and destination follow the program 580 // order, i.e. source is always first. (The direction is given by the 581 // DepType.) 582 ++Accesses[Dep.Source].NumUnsafeDependencesStartOrEnd; 583 --Accesses[Dep.Destination].NumUnsafeDependencesStartOrEnd; 584 585 DEBUG(Dep.print(dbgs(), 2, Instructions)); 586 } 587 } 588 589 private: 590 AccessesType Accesses; 591 }; 592 593 /// \brief The actual class performing the per-loop work. 594 class LoopDistributeForLoop { 595 public: 596 LoopDistributeForLoop(Loop *L, Function *F, LoopInfo *LI, DominatorTree *DT, 597 ScalarEvolution *SE, OptimizationRemarkEmitter *ORE) 598 : L(L), F(F), LI(LI), LAI(nullptr), DT(DT), SE(SE), ORE(ORE) { 599 setForced(); 600 } 601 602 /// \brief Try to distribute an inner-most loop. 603 bool processLoop(std::function<const LoopAccessInfo &(Loop &)> &GetLAA) { 604 assert(L->empty() && "Only process inner loops."); 605 606 DEBUG(dbgs() << "\nLDist: In \"" << L->getHeader()->getParent()->getName() 607 << "\" checking " << *L << "\n"); 608 609 if (!L->getExitBlock()) 610 return fail("MultipleExitBlocks", "multiple exit blocks"); 611 if (!L->isLoopSimplifyForm()) 612 return fail("NotLoopSimplifyForm", 613 "loop is not in loop-simplify form"); 614 615 BasicBlock *PH = L->getLoopPreheader(); 616 617 // LAA will check that we only have a single exiting block. 618 LAI = &GetLAA(*L); 619 620 // Currently, we only distribute to isolate the part of the loop with 621 // dependence cycles to enable partial vectorization. 622 if (LAI->canVectorizeMemory()) 623 return fail("MemOpsCanBeVectorized", 624 "memory operations are safe for vectorization"); 625 626 auto *Dependences = LAI->getDepChecker().getDependences(); 627 if (!Dependences || Dependences->empty()) 628 return fail("NoUnsafeDeps", "no unsafe dependences to isolate"); 629 630 InstPartitionContainer Partitions(L, LI, DT); 631 632 // First, go through each memory operation and assign them to consecutive 633 // partitions (the order of partitions follows program order). Put those 634 // with unsafe dependences into "cyclic" partition otherwise put each store 635 // in its own "non-cyclic" partition (we'll merge these later). 636 // 637 // Note that a memory operation (e.g. Load2 below) at a program point that 638 // has an unsafe dependence (Store3->Load1) spanning over it must be 639 // included in the same cyclic partition as the dependent operations. This 640 // is to preserve the original program order after distribution. E.g.: 641 // 642 // NumUnsafeDependencesStartOrEnd NumUnsafeDependencesActive 643 // Load1 -. 1 0->1 644 // Load2 | /Unsafe/ 0 1 645 // Store3 -' -1 1->0 646 // Load4 0 0 647 // 648 // NumUnsafeDependencesActive > 0 indicates this situation and in this case 649 // we just keep assigning to the same cyclic partition until 650 // NumUnsafeDependencesActive reaches 0. 651 const MemoryDepChecker &DepChecker = LAI->getDepChecker(); 652 MemoryInstructionDependences MID(DepChecker.getMemoryInstructions(), 653 *Dependences); 654 655 int NumUnsafeDependencesActive = 0; 656 for (auto &InstDep : MID) { 657 Instruction *I = InstDep.Inst; 658 // We update NumUnsafeDependencesActive post-instruction, catch the 659 // start of a dependence directly via NumUnsafeDependencesStartOrEnd. 660 if (NumUnsafeDependencesActive || 661 InstDep.NumUnsafeDependencesStartOrEnd > 0) 662 Partitions.addToCyclicPartition(I); 663 else 664 Partitions.addToNewNonCyclicPartition(I); 665 NumUnsafeDependencesActive += InstDep.NumUnsafeDependencesStartOrEnd; 666 assert(NumUnsafeDependencesActive >= 0 && 667 "Negative number of dependences active"); 668 } 669 670 // Add partitions for values used outside. These partitions can be out of 671 // order from the original program order. This is OK because if the 672 // partition uses a load we will merge this partition with the original 673 // partition of the load that we set up in the previous loop (see 674 // mergeToAvoidDuplicatedLoads). 675 auto DefsUsedOutside = findDefsUsedOutsideOfLoop(L); 676 for (auto *Inst : DefsUsedOutside) 677 Partitions.addToNewNonCyclicPartition(Inst); 678 679 DEBUG(dbgs() << "Seeded partitions:\n" << Partitions); 680 if (Partitions.getSize() < 2) 681 return fail("CantIsolateUnsafeDeps", 682 "cannot isolate unsafe dependencies"); 683 684 // Run the merge heuristics: Merge non-cyclic adjacent partitions since we 685 // should be able to vectorize these together. 686 Partitions.mergeBeforePopulating(); 687 DEBUG(dbgs() << "\nMerged partitions:\n" << Partitions); 688 if (Partitions.getSize() < 2) 689 return fail("CantIsolateUnsafeDeps", 690 "cannot isolate unsafe dependencies"); 691 692 // Now, populate the partitions with non-memory operations. 693 Partitions.populateUsedSet(); 694 DEBUG(dbgs() << "\nPopulated partitions:\n" << Partitions); 695 696 // In order to preserve original lexical order for loads, keep them in the 697 // partition that we set up in the MemoryInstructionDependences loop. 698 if (Partitions.mergeToAvoidDuplicatedLoads()) { 699 DEBUG(dbgs() << "\nPartitions merged to ensure unique loads:\n" 700 << Partitions); 701 if (Partitions.getSize() < 2) 702 return fail("CantIsolateUnsafeDeps", 703 "cannot isolate unsafe dependencies"); 704 } 705 706 // Don't distribute the loop if we need too many SCEV run-time checks. 707 const SCEVUnionPredicate &Pred = LAI->getPSE().getUnionPredicate(); 708 if (Pred.getComplexity() > (IsForced.getValueOr(false) 709 ? PragmaDistributeSCEVCheckThreshold 710 : DistributeSCEVCheckThreshold)) 711 return fail("TooManySCEVRuntimeChecks", 712 "too many SCEV run-time checks needed.\n"); 713 714 DEBUG(dbgs() << "\nDistributing loop: " << *L << "\n"); 715 // We're done forming the partitions set up the reverse mapping from 716 // instructions to partitions. 717 Partitions.setupPartitionIdOnInstructions(); 718 719 // To keep things simple have an empty preheader before we version or clone 720 // the loop. (Also split if this has no predecessor, i.e. entry, because we 721 // rely on PH having a predecessor.) 722 if (!PH->getSinglePredecessor() || &*PH->begin() != PH->getTerminator()) 723 SplitBlock(PH, PH->getTerminator(), DT, LI); 724 725 // If we need run-time checks, version the loop now. 726 auto PtrToPartition = Partitions.computePartitionSetForPointers(*LAI); 727 const auto *RtPtrChecking = LAI->getRuntimePointerChecking(); 728 const auto &AllChecks = RtPtrChecking->getChecks(); 729 auto Checks = includeOnlyCrossPartitionChecks(AllChecks, PtrToPartition, 730 RtPtrChecking); 731 732 if (!Pred.isAlwaysTrue() || !Checks.empty()) { 733 DEBUG(dbgs() << "\nPointers:\n"); 734 DEBUG(LAI->getRuntimePointerChecking()->printChecks(dbgs(), Checks)); 735 LoopVersioning LVer(*LAI, L, LI, DT, SE, false); 736 LVer.setAliasChecks(std::move(Checks)); 737 LVer.setSCEVChecks(LAI->getPSE().getUnionPredicate()); 738 LVer.versionLoop(DefsUsedOutside); 739 LVer.annotateLoopWithNoAlias(); 740 } 741 742 // Create identical copies of the original loop for each partition and hook 743 // them up sequentially. 744 Partitions.cloneLoops(); 745 746 // Now, we remove the instruction from each loop that don't belong to that 747 // partition. 748 Partitions.removeUnusedInsts(); 749 DEBUG(dbgs() << "\nAfter removing unused Instrs:\n"); 750 DEBUG(Partitions.printBlocks()); 751 752 if (LDistVerify) { 753 LI->verify(*DT); 754 DT->verifyDomTree(); 755 } 756 757 ++NumLoopsDistributed; 758 // Report the success. 759 ORE->emit(OptimizationRemark(LDIST_NAME, "Distribute", L->getStartLoc(), 760 L->getHeader()) 761 << "distributed loop"); 762 return true; 763 } 764 765 /// \brief Provide diagnostics then \return with false. 766 bool fail(StringRef RemarkName, StringRef Message) { 767 LLVMContext &Ctx = F->getContext(); 768 bool Forced = isForced().getValueOr(false); 769 770 DEBUG(dbgs() << "Skipping; " << Message << "\n"); 771 772 // With Rpass-missed report that distribution failed. 773 ORE->emit( 774 OptimizationRemarkMissed(LDIST_NAME, "NotDistributed", L->getStartLoc(), 775 L->getHeader()) 776 << "loop not distributed: use -Rpass-analysis=loop-distribute for more " 777 "info"); 778 779 // With Rpass-analysis report why. This is on by default if distribution 780 // was requested explicitly. 781 ORE->emit(OptimizationRemarkAnalysis( 782 Forced ? OptimizationRemarkAnalysis::AlwaysPrint : LDIST_NAME, 783 RemarkName, L->getStartLoc(), L->getHeader()) 784 << "loop not distributed: " << Message); 785 786 // Also issue a warning if distribution was requested explicitly but it 787 // failed. 788 if (Forced) 789 Ctx.diagnose(DiagnosticInfoOptimizationFailure( 790 *F, L->getStartLoc(), "loop not distributed: failed " 791 "explicitly specified loop distribution")); 792 793 return false; 794 } 795 796 /// \brief Return if distribution forced to be enabled/disabled for the loop. 797 /// 798 /// If the optional has a value, it indicates whether distribution was forced 799 /// to be enabled (true) or disabled (false). If the optional has no value 800 /// distribution was not forced either way. 801 const Optional<bool> &isForced() const { return IsForced; } 802 803 private: 804 /// \brief Filter out checks between pointers from the same partition. 805 /// 806 /// \p PtrToPartition contains the partition number for pointers. Partition 807 /// number -1 means that the pointer is used in multiple partitions. In this 808 /// case we can't safely omit the check. 809 SmallVector<RuntimePointerChecking::PointerCheck, 4> 810 includeOnlyCrossPartitionChecks( 811 const SmallVectorImpl<RuntimePointerChecking::PointerCheck> &AllChecks, 812 const SmallVectorImpl<int> &PtrToPartition, 813 const RuntimePointerChecking *RtPtrChecking) { 814 SmallVector<RuntimePointerChecking::PointerCheck, 4> Checks; 815 816 std::copy_if(AllChecks.begin(), AllChecks.end(), std::back_inserter(Checks), 817 [&](const RuntimePointerChecking::PointerCheck &Check) { 818 for (unsigned PtrIdx1 : Check.first->Members) 819 for (unsigned PtrIdx2 : Check.second->Members) 820 // Only include this check if there is a pair of pointers 821 // that require checking and the pointers fall into 822 // separate partitions. 823 // 824 // (Note that we already know at this point that the two 825 // pointer groups need checking but it doesn't follow 826 // that each pair of pointers within the two groups need 827 // checking as well. 828 // 829 // In other words we don't want to include a check just 830 // because there is a pair of pointers between the two 831 // pointer groups that require checks and a different 832 // pair whose pointers fall into different partitions.) 833 if (RtPtrChecking->needsChecking(PtrIdx1, PtrIdx2) && 834 !RuntimePointerChecking::arePointersInSamePartition( 835 PtrToPartition, PtrIdx1, PtrIdx2)) 836 return true; 837 return false; 838 }); 839 840 return Checks; 841 } 842 843 /// \brief Check whether the loop metadata is forcing distribution to be 844 /// enabled/disabled. 845 void setForced() { 846 Optional<const MDOperand *> Value = 847 findStringMetadataForLoop(L, "llvm.loop.distribute.enable"); 848 if (!Value) 849 return; 850 851 const MDOperand *Op = *Value; 852 assert(Op && mdconst::hasa<ConstantInt>(*Op) && "invalid metadata"); 853 IsForced = mdconst::extract<ConstantInt>(*Op)->getZExtValue(); 854 } 855 856 Loop *L; 857 Function *F; 858 859 // Analyses used. 860 LoopInfo *LI; 861 const LoopAccessInfo *LAI; 862 DominatorTree *DT; 863 ScalarEvolution *SE; 864 OptimizationRemarkEmitter *ORE; 865 866 /// \brief Indicates whether distribution is forced to be enabled/disabled for 867 /// the loop. 868 /// 869 /// If the optional has a value, it indicates whether distribution was forced 870 /// to be enabled (true) or disabled (false). If the optional has no value 871 /// distribution was not forced either way. 872 Optional<bool> IsForced; 873 }; 874 875 /// Shared implementation between new and old PMs. 876 static bool runImpl(Function &F, LoopInfo *LI, DominatorTree *DT, 877 ScalarEvolution *SE, OptimizationRemarkEmitter *ORE, 878 std::function<const LoopAccessInfo &(Loop &)> &GetLAA, 879 bool ProcessAllLoops) { 880 // Build up a worklist of inner-loops to vectorize. This is necessary as the 881 // act of distributing a loop creates new loops and can invalidate iterators 882 // across the loops. 883 SmallVector<Loop *, 8> Worklist; 884 885 for (Loop *TopLevelLoop : *LI) 886 for (Loop *L : depth_first(TopLevelLoop)) 887 // We only handle inner-most loops. 888 if (L->empty()) 889 Worklist.push_back(L); 890 891 // Now walk the identified inner loops. 892 bool Changed = false; 893 for (Loop *L : Worklist) { 894 LoopDistributeForLoop LDL(L, &F, LI, DT, SE, ORE); 895 896 // If distribution was forced for the specific loop to be 897 // enabled/disabled, follow that. Otherwise use the global flag. 898 if (LDL.isForced().getValueOr(ProcessAllLoops)) 899 Changed |= LDL.processLoop(GetLAA); 900 } 901 902 // Process each loop nest in the function. 903 return Changed; 904 } 905 906 /// \brief The pass class. 907 class LoopDistributeLegacy : public FunctionPass { 908 public: 909 /// \p ProcessAllLoopsByDefault specifies whether loop distribution should be 910 /// performed by default. Pass -enable-loop-distribute={0,1} overrides this 911 /// default. We use this to keep LoopDistribution off by default when invoked 912 /// from the optimization pipeline but on when invoked explicitly from opt. 913 LoopDistributeLegacy(bool ProcessAllLoopsByDefault = true) 914 : FunctionPass(ID), ProcessAllLoops(ProcessAllLoopsByDefault) { 915 // The default is set by the caller. 916 if (EnableLoopDistribute.getNumOccurrences() > 0) 917 ProcessAllLoops = EnableLoopDistribute; 918 initializeLoopDistributeLegacyPass(*PassRegistry::getPassRegistry()); 919 } 920 921 bool runOnFunction(Function &F) override { 922 if (skipFunction(F)) 923 return false; 924 925 auto *LI = &getAnalysis<LoopInfoWrapperPass>().getLoopInfo(); 926 auto *LAA = &getAnalysis<LoopAccessLegacyAnalysis>(); 927 auto *DT = &getAnalysis<DominatorTreeWrapperPass>().getDomTree(); 928 auto *SE = &getAnalysis<ScalarEvolutionWrapperPass>().getSE(); 929 auto *ORE = &getAnalysis<OptimizationRemarkEmitterWrapperPass>().getORE(); 930 std::function<const LoopAccessInfo &(Loop &)> GetLAA = 931 [&](Loop &L) -> const LoopAccessInfo & { return LAA->getInfo(&L); }; 932 933 return runImpl(F, LI, DT, SE, ORE, GetLAA, ProcessAllLoops); 934 } 935 936 void getAnalysisUsage(AnalysisUsage &AU) const override { 937 AU.addRequired<ScalarEvolutionWrapperPass>(); 938 AU.addRequired<LoopInfoWrapperPass>(); 939 AU.addPreserved<LoopInfoWrapperPass>(); 940 AU.addRequired<LoopAccessLegacyAnalysis>(); 941 AU.addRequired<DominatorTreeWrapperPass>(); 942 AU.addPreserved<DominatorTreeWrapperPass>(); 943 AU.addRequired<OptimizationRemarkEmitterWrapperPass>(); 944 AU.addPreserved<GlobalsAAWrapperPass>(); 945 } 946 947 static char ID; 948 949 private: 950 /// \brief Whether distribution should be on in this function. The per-loop 951 /// pragma can override this. 952 bool ProcessAllLoops; 953 }; 954 } // anonymous namespace 955 956 PreservedAnalyses LoopDistributePass::run(Function &F, 957 FunctionAnalysisManager &AM) { 958 // FIXME: This does not currently match the behavior from the old PM. 959 // ProcessAllLoops with the old PM defaults to true when invoked from opt and 960 // false when invoked from the optimization pipeline. 961 bool ProcessAllLoops = false; 962 if (EnableLoopDistribute.getNumOccurrences() > 0) 963 ProcessAllLoops = EnableLoopDistribute; 964 965 auto &LI = AM.getResult<LoopAnalysis>(F); 966 auto &DT = AM.getResult<DominatorTreeAnalysis>(F); 967 auto &SE = AM.getResult<ScalarEvolutionAnalysis>(F); 968 auto &ORE = AM.getResult<OptimizationRemarkEmitterAnalysis>(F); 969 970 auto &LAM = AM.getResult<LoopAnalysisManagerFunctionProxy>(F).getManager(); 971 std::function<const LoopAccessInfo &(Loop &)> GetLAA = 972 [&](Loop &L) -> const LoopAccessInfo & { 973 return LAM.getResult<LoopAccessAnalysis>(L); 974 }; 975 976 bool Changed = runImpl(F, &LI, &DT, &SE, &ORE, GetLAA, ProcessAllLoops); 977 if (!Changed) 978 return PreservedAnalyses::all(); 979 PreservedAnalyses PA; 980 PA.preserve<LoopAnalysis>(); 981 PA.preserve<DominatorTreeAnalysis>(); 982 PA.preserve<GlobalsAA>(); 983 return PA; 984 } 985 986 char LoopDistributeLegacy::ID; 987 static const char ldist_name[] = "Loop Distribution"; 988 989 INITIALIZE_PASS_BEGIN(LoopDistributeLegacy, LDIST_NAME, ldist_name, false, 990 false) 991 INITIALIZE_PASS_DEPENDENCY(LoopInfoWrapperPass) 992 INITIALIZE_PASS_DEPENDENCY(LoopAccessLegacyAnalysis) 993 INITIALIZE_PASS_DEPENDENCY(DominatorTreeWrapperPass) 994 INITIALIZE_PASS_DEPENDENCY(ScalarEvolutionWrapperPass) 995 INITIALIZE_PASS_DEPENDENCY(OptimizationRemarkEmitterWrapperPass) 996 INITIALIZE_PASS_END(LoopDistributeLegacy, LDIST_NAME, ldist_name, false, false) 997 998 namespace llvm { 999 FunctionPass *createLoopDistributePass(bool ProcessAllLoopsByDefault) { 1000 return new LoopDistributeLegacy(ProcessAllLoopsByDefault); 1001 } 1002 } 1003