1 //===-- LoopReroll.cpp - Loop rerolling pass ------------------------------===// 2 // 3 // The LLVM Compiler Infrastructure 4 // 5 // This file is distributed under the University of Illinois Open Source 6 // License. See LICENSE.TXT for details. 7 // 8 //===----------------------------------------------------------------------===// 9 // 10 // This pass implements a simple loop reroller. 11 // 12 //===----------------------------------------------------------------------===// 13 14 #include "llvm/Transforms/Scalar.h" 15 #include "llvm/ADT/MapVector.h" 16 #include "llvm/ADT/STLExtras.h" 17 #include "llvm/ADT/SmallBitVector.h" 18 #include "llvm/ADT/SmallSet.h" 19 #include "llvm/ADT/Statistic.h" 20 #include "llvm/Analysis/AliasAnalysis.h" 21 #include "llvm/Analysis/AliasSetTracker.h" 22 #include "llvm/Analysis/LoopPass.h" 23 #include "llvm/Analysis/ScalarEvolution.h" 24 #include "llvm/Analysis/ScalarEvolutionExpander.h" 25 #include "llvm/Analysis/ScalarEvolutionExpressions.h" 26 #include "llvm/Analysis/ValueTracking.h" 27 #include "llvm/IR/DataLayout.h" 28 #include "llvm/IR/Dominators.h" 29 #include "llvm/IR/IntrinsicInst.h" 30 #include "llvm/Support/CommandLine.h" 31 #include "llvm/Support/Debug.h" 32 #include "llvm/Support/raw_ostream.h" 33 #include "llvm/Analysis/TargetLibraryInfo.h" 34 #include "llvm/Transforms/Utils/BasicBlockUtils.h" 35 #include "llvm/Transforms/Utils/Local.h" 36 #include "llvm/Transforms/Utils/LoopUtils.h" 37 38 using namespace llvm; 39 40 #define DEBUG_TYPE "loop-reroll" 41 42 STATISTIC(NumRerolledLoops, "Number of rerolled loops"); 43 44 static cl::opt<unsigned> 45 MaxInc("max-reroll-increment", cl::init(2048), cl::Hidden, 46 cl::desc("The maximum increment for loop rerolling")); 47 48 static cl::opt<unsigned> 49 NumToleratedFailedMatches("reroll-num-tolerated-failed-matches", cl::init(400), 50 cl::Hidden, 51 cl::desc("The maximum number of failures to tolerate" 52 " during fuzzy matching. (default: 400)")); 53 54 // This loop re-rolling transformation aims to transform loops like this: 55 // 56 // int foo(int a); 57 // void bar(int *x) { 58 // for (int i = 0; i < 500; i += 3) { 59 // foo(i); 60 // foo(i+1); 61 // foo(i+2); 62 // } 63 // } 64 // 65 // into a loop like this: 66 // 67 // void bar(int *x) { 68 // for (int i = 0; i < 500; ++i) 69 // foo(i); 70 // } 71 // 72 // It does this by looking for loops that, besides the latch code, are composed 73 // of isomorphic DAGs of instructions, with each DAG rooted at some increment 74 // to the induction variable, and where each DAG is isomorphic to the DAG 75 // rooted at the induction variable (excepting the sub-DAGs which root the 76 // other induction-variable increments). In other words, we're looking for loop 77 // bodies of the form: 78 // 79 // %iv = phi [ (preheader, ...), (body, %iv.next) ] 80 // f(%iv) 81 // %iv.1 = add %iv, 1 <-- a root increment 82 // f(%iv.1) 83 // %iv.2 = add %iv, 2 <-- a root increment 84 // f(%iv.2) 85 // %iv.scale_m_1 = add %iv, scale-1 <-- a root increment 86 // f(%iv.scale_m_1) 87 // ... 88 // %iv.next = add %iv, scale 89 // %cmp = icmp(%iv, ...) 90 // br %cmp, header, exit 91 // 92 // where each f(i) is a set of instructions that, collectively, are a function 93 // only of i (and other loop-invariant values). 94 // 95 // As a special case, we can also reroll loops like this: 96 // 97 // int foo(int); 98 // void bar(int *x) { 99 // for (int i = 0; i < 500; ++i) { 100 // x[3*i] = foo(0); 101 // x[3*i+1] = foo(0); 102 // x[3*i+2] = foo(0); 103 // } 104 // } 105 // 106 // into this: 107 // 108 // void bar(int *x) { 109 // for (int i = 0; i < 1500; ++i) 110 // x[i] = foo(0); 111 // } 112 // 113 // in which case, we're looking for inputs like this: 114 // 115 // %iv = phi [ (preheader, ...), (body, %iv.next) ] 116 // %scaled.iv = mul %iv, scale 117 // f(%scaled.iv) 118 // %scaled.iv.1 = add %scaled.iv, 1 119 // f(%scaled.iv.1) 120 // %scaled.iv.2 = add %scaled.iv, 2 121 // f(%scaled.iv.2) 122 // %scaled.iv.scale_m_1 = add %scaled.iv, scale-1 123 // f(%scaled.iv.scale_m_1) 124 // ... 125 // %iv.next = add %iv, 1 126 // %cmp = icmp(%iv, ...) 127 // br %cmp, header, exit 128 129 namespace { 130 enum IterationLimits { 131 /// The maximum number of iterations that we'll try and reroll. This 132 /// has to be less than 25 in order to fit into a SmallBitVector. 133 IL_MaxRerollIterations = 16, 134 /// The bitvector index used by loop induction variables and other 135 /// instructions that belong to all iterations. 136 IL_All, 137 IL_End 138 }; 139 140 class LoopReroll : public LoopPass { 141 public: 142 static char ID; // Pass ID, replacement for typeid 143 LoopReroll() : LoopPass(ID) { 144 initializeLoopRerollPass(*PassRegistry::getPassRegistry()); 145 } 146 147 bool runOnLoop(Loop *L, LPPassManager &LPM) override; 148 149 void getAnalysisUsage(AnalysisUsage &AU) const override { 150 AU.addRequired<AliasAnalysis>(); 151 AU.addRequired<LoopInfoWrapperPass>(); 152 AU.addPreserved<LoopInfoWrapperPass>(); 153 AU.addRequired<DominatorTreeWrapperPass>(); 154 AU.addPreserved<DominatorTreeWrapperPass>(); 155 AU.addRequired<ScalarEvolution>(); 156 AU.addRequired<TargetLibraryInfoWrapperPass>(); 157 } 158 159 protected: 160 AliasAnalysis *AA; 161 LoopInfo *LI; 162 ScalarEvolution *SE; 163 const DataLayout *DL; 164 TargetLibraryInfo *TLI; 165 DominatorTree *DT; 166 167 typedef SmallVector<Instruction *, 16> SmallInstructionVector; 168 typedef SmallSet<Instruction *, 16> SmallInstructionSet; 169 170 // A chain of isomorphic instructions, indentified by a single-use PHI, 171 // representing a reduction. Only the last value may be used outside the 172 // loop. 173 struct SimpleLoopReduction { 174 SimpleLoopReduction(Instruction *P, Loop *L) 175 : Valid(false), Instructions(1, P) { 176 assert(isa<PHINode>(P) && "First reduction instruction must be a PHI"); 177 add(L); 178 } 179 180 bool valid() const { 181 return Valid; 182 } 183 184 Instruction *getPHI() const { 185 assert(Valid && "Using invalid reduction"); 186 return Instructions.front(); 187 } 188 189 Instruction *getReducedValue() const { 190 assert(Valid && "Using invalid reduction"); 191 return Instructions.back(); 192 } 193 194 Instruction *get(size_t i) const { 195 assert(Valid && "Using invalid reduction"); 196 return Instructions[i+1]; 197 } 198 199 Instruction *operator [] (size_t i) const { return get(i); } 200 201 // The size, ignoring the initial PHI. 202 size_t size() const { 203 assert(Valid && "Using invalid reduction"); 204 return Instructions.size()-1; 205 } 206 207 typedef SmallInstructionVector::iterator iterator; 208 typedef SmallInstructionVector::const_iterator const_iterator; 209 210 iterator begin() { 211 assert(Valid && "Using invalid reduction"); 212 return std::next(Instructions.begin()); 213 } 214 215 const_iterator begin() const { 216 assert(Valid && "Using invalid reduction"); 217 return std::next(Instructions.begin()); 218 } 219 220 iterator end() { return Instructions.end(); } 221 const_iterator end() const { return Instructions.end(); } 222 223 protected: 224 bool Valid; 225 SmallInstructionVector Instructions; 226 227 void add(Loop *L); 228 }; 229 230 // The set of all reductions, and state tracking of possible reductions 231 // during loop instruction processing. 232 struct ReductionTracker { 233 typedef SmallVector<SimpleLoopReduction, 16> SmallReductionVector; 234 235 // Add a new possible reduction. 236 void addSLR(SimpleLoopReduction &SLR) { PossibleReds.push_back(SLR); } 237 238 // Setup to track possible reductions corresponding to the provided 239 // rerolling scale. Only reductions with a number of non-PHI instructions 240 // that is divisible by the scale are considered. Three instructions sets 241 // are filled in: 242 // - A set of all possible instructions in eligible reductions. 243 // - A set of all PHIs in eligible reductions 244 // - A set of all reduced values (last instructions) in eligible 245 // reductions. 246 void restrictToScale(uint64_t Scale, 247 SmallInstructionSet &PossibleRedSet, 248 SmallInstructionSet &PossibleRedPHISet, 249 SmallInstructionSet &PossibleRedLastSet) { 250 PossibleRedIdx.clear(); 251 PossibleRedIter.clear(); 252 Reds.clear(); 253 254 for (unsigned i = 0, e = PossibleReds.size(); i != e; ++i) 255 if (PossibleReds[i].size() % Scale == 0) { 256 PossibleRedLastSet.insert(PossibleReds[i].getReducedValue()); 257 PossibleRedPHISet.insert(PossibleReds[i].getPHI()); 258 259 PossibleRedSet.insert(PossibleReds[i].getPHI()); 260 PossibleRedIdx[PossibleReds[i].getPHI()] = i; 261 for (Instruction *J : PossibleReds[i]) { 262 PossibleRedSet.insert(J); 263 PossibleRedIdx[J] = i; 264 } 265 } 266 } 267 268 // The functions below are used while processing the loop instructions. 269 270 // Are the two instructions both from reductions, and furthermore, from 271 // the same reduction? 272 bool isPairInSame(Instruction *J1, Instruction *J2) { 273 DenseMap<Instruction *, int>::iterator J1I = PossibleRedIdx.find(J1); 274 if (J1I != PossibleRedIdx.end()) { 275 DenseMap<Instruction *, int>::iterator J2I = PossibleRedIdx.find(J2); 276 if (J2I != PossibleRedIdx.end() && J1I->second == J2I->second) 277 return true; 278 } 279 280 return false; 281 } 282 283 // The two provided instructions, the first from the base iteration, and 284 // the second from iteration i, form a matched pair. If these are part of 285 // a reduction, record that fact. 286 void recordPair(Instruction *J1, Instruction *J2, unsigned i) { 287 if (PossibleRedIdx.count(J1)) { 288 assert(PossibleRedIdx.count(J2) && 289 "Recording reduction vs. non-reduction instruction?"); 290 291 PossibleRedIter[J1] = 0; 292 PossibleRedIter[J2] = i; 293 294 int Idx = PossibleRedIdx[J1]; 295 assert(Idx == PossibleRedIdx[J2] && 296 "Recording pair from different reductions?"); 297 Reds.insert(Idx); 298 } 299 } 300 301 // The functions below can be called after we've finished processing all 302 // instructions in the loop, and we know which reductions were selected. 303 304 // Is the provided instruction the PHI of a reduction selected for 305 // rerolling? 306 bool isSelectedPHI(Instruction *J) { 307 if (!isa<PHINode>(J)) 308 return false; 309 310 for (DenseSet<int>::iterator RI = Reds.begin(), RIE = Reds.end(); 311 RI != RIE; ++RI) { 312 int i = *RI; 313 if (cast<Instruction>(J) == PossibleReds[i].getPHI()) 314 return true; 315 } 316 317 return false; 318 } 319 320 bool validateSelected(); 321 void replaceSelected(); 322 323 protected: 324 // The vector of all possible reductions (for any scale). 325 SmallReductionVector PossibleReds; 326 327 DenseMap<Instruction *, int> PossibleRedIdx; 328 DenseMap<Instruction *, int> PossibleRedIter; 329 DenseSet<int> Reds; 330 }; 331 332 // A DAGRootSet models an induction variable being used in a rerollable 333 // loop. For example, 334 // 335 // x[i*3+0] = y1 336 // x[i*3+1] = y2 337 // x[i*3+2] = y3 338 // 339 // Base instruction -> i*3 340 // +---+----+ 341 // / | \ 342 // ST[y1] +1 +2 <-- Roots 343 // | | 344 // ST[y2] ST[y3] 345 // 346 // There may be multiple DAGRoots, for example: 347 // 348 // x[i*2+0] = ... (1) 349 // x[i*2+1] = ... (1) 350 // x[i*2+4] = ... (2) 351 // x[i*2+5] = ... (2) 352 // x[(i+1234)*2+5678] = ... (3) 353 // x[(i+1234)*2+5679] = ... (3) 354 // 355 // The loop will be rerolled by adding a new loop induction variable, 356 // one for the Base instruction in each DAGRootSet. 357 // 358 struct DAGRootSet { 359 Instruction *BaseInst; 360 SmallInstructionVector Roots; 361 // The instructions between IV and BaseInst (but not including BaseInst). 362 SmallInstructionSet SubsumedInsts; 363 }; 364 365 // The set of all DAG roots, and state tracking of all roots 366 // for a particular induction variable. 367 struct DAGRootTracker { 368 DAGRootTracker(LoopReroll *Parent, Loop *L, Instruction *IV, 369 ScalarEvolution *SE, AliasAnalysis *AA, 370 TargetLibraryInfo *TLI, const DataLayout *DL) 371 : Parent(Parent), L(L), SE(SE), AA(AA), TLI(TLI), 372 DL(DL), IV(IV) { 373 } 374 375 /// Stage 1: Find all the DAG roots for the induction variable. 376 bool findRoots(); 377 /// Stage 2: Validate if the found roots are valid. 378 bool validate(ReductionTracker &Reductions); 379 /// Stage 3: Assuming validate() returned true, perform the 380 /// replacement. 381 /// @param IterCount The maximum iteration count of L. 382 void replace(const SCEV *IterCount); 383 384 protected: 385 typedef MapVector<Instruction*, SmallBitVector> UsesTy; 386 387 bool findRootsRecursive(Instruction *IVU, 388 SmallInstructionSet SubsumedInsts); 389 bool findRootsBase(Instruction *IVU, SmallInstructionSet SubsumedInsts); 390 bool collectPossibleRoots(Instruction *Base, 391 std::map<int64_t,Instruction*> &Roots); 392 393 bool collectUsedInstructions(SmallInstructionSet &PossibleRedSet); 394 void collectInLoopUserSet(const SmallInstructionVector &Roots, 395 const SmallInstructionSet &Exclude, 396 const SmallInstructionSet &Final, 397 DenseSet<Instruction *> &Users); 398 void collectInLoopUserSet(Instruction *Root, 399 const SmallInstructionSet &Exclude, 400 const SmallInstructionSet &Final, 401 DenseSet<Instruction *> &Users); 402 403 UsesTy::iterator nextInstr(int Val, UsesTy &In, 404 const SmallInstructionSet &Exclude, 405 UsesTy::iterator *StartI=nullptr); 406 bool isBaseInst(Instruction *I); 407 bool isRootInst(Instruction *I); 408 bool instrDependsOn(Instruction *I, 409 UsesTy::iterator Start, 410 UsesTy::iterator End); 411 412 LoopReroll *Parent; 413 414 // Members of Parent, replicated here for brevity. 415 Loop *L; 416 ScalarEvolution *SE; 417 AliasAnalysis *AA; 418 TargetLibraryInfo *TLI; 419 const DataLayout *DL; 420 421 // The loop induction variable. 422 Instruction *IV; 423 // Loop step amount. 424 uint64_t Inc; 425 // Loop reroll count; if Inc == 1, this records the scaling applied 426 // to the indvar: a[i*2+0] = ...; a[i*2+1] = ... ; 427 // If Inc is not 1, Scale = Inc. 428 uint64_t Scale; 429 // The roots themselves. 430 SmallVector<DAGRootSet,16> RootSets; 431 // All increment instructions for IV. 432 SmallInstructionVector LoopIncs; 433 // Map of all instructions in the loop (in order) to the iterations 434 // they are used in (or specially, IL_All for instructions 435 // used in the loop increment mechanism). 436 UsesTy Uses; 437 }; 438 439 void collectPossibleIVs(Loop *L, SmallInstructionVector &PossibleIVs); 440 void collectPossibleReductions(Loop *L, 441 ReductionTracker &Reductions); 442 bool reroll(Instruction *IV, Loop *L, BasicBlock *Header, const SCEV *IterCount, 443 ReductionTracker &Reductions); 444 }; 445 } 446 447 char LoopReroll::ID = 0; 448 INITIALIZE_PASS_BEGIN(LoopReroll, "loop-reroll", "Reroll loops", false, false) 449 INITIALIZE_AG_DEPENDENCY(AliasAnalysis) 450 INITIALIZE_PASS_DEPENDENCY(LoopInfoWrapperPass) 451 INITIALIZE_PASS_DEPENDENCY(DominatorTreeWrapperPass) 452 INITIALIZE_PASS_DEPENDENCY(ScalarEvolution) 453 INITIALIZE_PASS_DEPENDENCY(TargetLibraryInfoWrapperPass) 454 INITIALIZE_PASS_END(LoopReroll, "loop-reroll", "Reroll loops", false, false) 455 456 Pass *llvm::createLoopRerollPass() { 457 return new LoopReroll; 458 } 459 460 // Returns true if the provided instruction is used outside the given loop. 461 // This operates like Instruction::isUsedOutsideOfBlock, but considers PHIs in 462 // non-loop blocks to be outside the loop. 463 static bool hasUsesOutsideLoop(Instruction *I, Loop *L) { 464 for (User *U : I->users()) { 465 if (!L->contains(cast<Instruction>(U))) 466 return true; 467 } 468 return false; 469 } 470 471 // Collect the list of loop induction variables with respect to which it might 472 // be possible to reroll the loop. 473 void LoopReroll::collectPossibleIVs(Loop *L, 474 SmallInstructionVector &PossibleIVs) { 475 BasicBlock *Header = L->getHeader(); 476 for (BasicBlock::iterator I = Header->begin(), 477 IE = Header->getFirstInsertionPt(); I != IE; ++I) { 478 if (!isa<PHINode>(I)) 479 continue; 480 if (!I->getType()->isIntegerTy()) 481 continue; 482 483 if (const SCEVAddRecExpr *PHISCEV = 484 dyn_cast<SCEVAddRecExpr>(SE->getSCEV(I))) { 485 if (PHISCEV->getLoop() != L) 486 continue; 487 if (!PHISCEV->isAffine()) 488 continue; 489 if (const SCEVConstant *IncSCEV = 490 dyn_cast<SCEVConstant>(PHISCEV->getStepRecurrence(*SE))) { 491 if (!IncSCEV->getValue()->getValue().isStrictlyPositive()) 492 continue; 493 if (IncSCEV->getValue()->uge(MaxInc)) 494 continue; 495 496 DEBUG(dbgs() << "LRR: Possible IV: " << *I << " = " << 497 *PHISCEV << "\n"); 498 PossibleIVs.push_back(I); 499 } 500 } 501 } 502 } 503 504 // Add the remainder of the reduction-variable chain to the instruction vector 505 // (the initial PHINode has already been added). If successful, the object is 506 // marked as valid. 507 void LoopReroll::SimpleLoopReduction::add(Loop *L) { 508 assert(!Valid && "Cannot add to an already-valid chain"); 509 510 // The reduction variable must be a chain of single-use instructions 511 // (including the PHI), except for the last value (which is used by the PHI 512 // and also outside the loop). 513 Instruction *C = Instructions.front(); 514 if (C->user_empty()) 515 return; 516 517 do { 518 C = cast<Instruction>(*C->user_begin()); 519 if (C->hasOneUse()) { 520 if (!C->isBinaryOp()) 521 return; 522 523 if (!(isa<PHINode>(Instructions.back()) || 524 C->isSameOperationAs(Instructions.back()))) 525 return; 526 527 Instructions.push_back(C); 528 } 529 } while (C->hasOneUse()); 530 531 if (Instructions.size() < 2 || 532 !C->isSameOperationAs(Instructions.back()) || 533 C->use_empty()) 534 return; 535 536 // C is now the (potential) last instruction in the reduction chain. 537 for (User *U : C->users()) { 538 // The only in-loop user can be the initial PHI. 539 if (L->contains(cast<Instruction>(U))) 540 if (cast<Instruction>(U) != Instructions.front()) 541 return; 542 } 543 544 Instructions.push_back(C); 545 Valid = true; 546 } 547 548 // Collect the vector of possible reduction variables. 549 void LoopReroll::collectPossibleReductions(Loop *L, 550 ReductionTracker &Reductions) { 551 BasicBlock *Header = L->getHeader(); 552 for (BasicBlock::iterator I = Header->begin(), 553 IE = Header->getFirstInsertionPt(); I != IE; ++I) { 554 if (!isa<PHINode>(I)) 555 continue; 556 if (!I->getType()->isSingleValueType()) 557 continue; 558 559 SimpleLoopReduction SLR(I, L); 560 if (!SLR.valid()) 561 continue; 562 563 DEBUG(dbgs() << "LRR: Possible reduction: " << *I << " (with " << 564 SLR.size() << " chained instructions)\n"); 565 Reductions.addSLR(SLR); 566 } 567 } 568 569 // Collect the set of all users of the provided root instruction. This set of 570 // users contains not only the direct users of the root instruction, but also 571 // all users of those users, and so on. There are two exceptions: 572 // 573 // 1. Instructions in the set of excluded instructions are never added to the 574 // use set (even if they are users). This is used, for example, to exclude 575 // including root increments in the use set of the primary IV. 576 // 577 // 2. Instructions in the set of final instructions are added to the use set 578 // if they are users, but their users are not added. This is used, for 579 // example, to prevent a reduction update from forcing all later reduction 580 // updates into the use set. 581 void LoopReroll::DAGRootTracker::collectInLoopUserSet( 582 Instruction *Root, const SmallInstructionSet &Exclude, 583 const SmallInstructionSet &Final, 584 DenseSet<Instruction *> &Users) { 585 SmallInstructionVector Queue(1, Root); 586 while (!Queue.empty()) { 587 Instruction *I = Queue.pop_back_val(); 588 if (!Users.insert(I).second) 589 continue; 590 591 if (!Final.count(I)) 592 for (Use &U : I->uses()) { 593 Instruction *User = cast<Instruction>(U.getUser()); 594 if (PHINode *PN = dyn_cast<PHINode>(User)) { 595 // Ignore "wrap-around" uses to PHIs of this loop's header. 596 if (PN->getIncomingBlock(U) == L->getHeader()) 597 continue; 598 } 599 600 if (L->contains(User) && !Exclude.count(User)) { 601 Queue.push_back(User); 602 } 603 } 604 605 // We also want to collect single-user "feeder" values. 606 for (User::op_iterator OI = I->op_begin(), 607 OIE = I->op_end(); OI != OIE; ++OI) { 608 if (Instruction *Op = dyn_cast<Instruction>(*OI)) 609 if (Op->hasOneUse() && L->contains(Op) && !Exclude.count(Op) && 610 !Final.count(Op)) 611 Queue.push_back(Op); 612 } 613 } 614 } 615 616 // Collect all of the users of all of the provided root instructions (combined 617 // into a single set). 618 void LoopReroll::DAGRootTracker::collectInLoopUserSet( 619 const SmallInstructionVector &Roots, 620 const SmallInstructionSet &Exclude, 621 const SmallInstructionSet &Final, 622 DenseSet<Instruction *> &Users) { 623 for (SmallInstructionVector::const_iterator I = Roots.begin(), 624 IE = Roots.end(); I != IE; ++I) 625 collectInLoopUserSet(*I, Exclude, Final, Users); 626 } 627 628 static bool isSimpleLoadStore(Instruction *I) { 629 if (LoadInst *LI = dyn_cast<LoadInst>(I)) 630 return LI->isSimple(); 631 if (StoreInst *SI = dyn_cast<StoreInst>(I)) 632 return SI->isSimple(); 633 if (MemIntrinsic *MI = dyn_cast<MemIntrinsic>(I)) 634 return !MI->isVolatile(); 635 return false; 636 } 637 638 /// Return true if IVU is a "simple" arithmetic operation. 639 /// This is used for narrowing the search space for DAGRoots; only arithmetic 640 /// and GEPs can be part of a DAGRoot. 641 static bool isSimpleArithmeticOp(User *IVU) { 642 if (Instruction *I = dyn_cast<Instruction>(IVU)) { 643 switch (I->getOpcode()) { 644 default: return false; 645 case Instruction::Add: 646 case Instruction::Sub: 647 case Instruction::Mul: 648 case Instruction::Shl: 649 case Instruction::AShr: 650 case Instruction::LShr: 651 case Instruction::GetElementPtr: 652 case Instruction::Trunc: 653 case Instruction::ZExt: 654 case Instruction::SExt: 655 return true; 656 } 657 } 658 return false; 659 } 660 661 static bool isLoopIncrement(User *U, Instruction *IV) { 662 BinaryOperator *BO = dyn_cast<BinaryOperator>(U); 663 if (!BO || BO->getOpcode() != Instruction::Add) 664 return false; 665 666 for (auto *UU : BO->users()) { 667 PHINode *PN = dyn_cast<PHINode>(UU); 668 if (PN && PN == IV) 669 return true; 670 } 671 return false; 672 } 673 674 bool LoopReroll::DAGRootTracker:: 675 collectPossibleRoots(Instruction *Base, std::map<int64_t,Instruction*> &Roots) { 676 SmallInstructionVector BaseUsers; 677 678 for (auto *I : Base->users()) { 679 ConstantInt *CI = nullptr; 680 681 if (isLoopIncrement(I, IV)) { 682 LoopIncs.push_back(cast<Instruction>(I)); 683 continue; 684 } 685 686 // The root nodes must be either GEPs, ORs or ADDs. 687 if (auto *BO = dyn_cast<BinaryOperator>(I)) { 688 if (BO->getOpcode() == Instruction::Add || 689 BO->getOpcode() == Instruction::Or) 690 CI = dyn_cast<ConstantInt>(BO->getOperand(1)); 691 } else if (auto *GEP = dyn_cast<GetElementPtrInst>(I)) { 692 Value *LastOperand = GEP->getOperand(GEP->getNumOperands()-1); 693 CI = dyn_cast<ConstantInt>(LastOperand); 694 } 695 696 if (!CI) { 697 if (Instruction *II = dyn_cast<Instruction>(I)) { 698 BaseUsers.push_back(II); 699 continue; 700 } else { 701 DEBUG(dbgs() << "LRR: Aborting due to non-instruction: " << *I << "\n"); 702 return false; 703 } 704 } 705 706 int64_t V = CI->getValue().getSExtValue(); 707 if (Roots.find(V) != Roots.end()) 708 // No duplicates, please. 709 return false; 710 711 // FIXME: Add support for negative values. 712 if (V < 0) { 713 DEBUG(dbgs() << "LRR: Aborting due to negative value: " << V << "\n"); 714 return false; 715 } 716 717 Roots[V] = cast<Instruction>(I); 718 } 719 720 if (Roots.empty()) 721 return false; 722 723 // If we found non-loop-inc, non-root users of Base, assume they are 724 // for the zeroth root index. This is because "add %a, 0" gets optimized 725 // away. 726 if (BaseUsers.size()) { 727 if (Roots.find(0) != Roots.end()) { 728 DEBUG(dbgs() << "LRR: Multiple roots found for base - aborting!\n"); 729 return false; 730 } 731 Roots[0] = Base; 732 } 733 734 // Calculate the number of users of the base, or lowest indexed, iteration. 735 unsigned NumBaseUses = BaseUsers.size(); 736 if (NumBaseUses == 0) 737 NumBaseUses = Roots.begin()->second->getNumUses(); 738 739 // Check that every node has the same number of users. 740 for (auto &KV : Roots) { 741 if (KV.first == 0) 742 continue; 743 if (KV.second->getNumUses() != NumBaseUses) { 744 DEBUG(dbgs() << "LRR: Aborting - Root and Base #users not the same: " 745 << "#Base=" << NumBaseUses << ", #Root=" << 746 KV.second->getNumUses() << "\n"); 747 return false; 748 } 749 } 750 751 return true; 752 } 753 754 bool LoopReroll::DAGRootTracker:: 755 findRootsRecursive(Instruction *I, SmallInstructionSet SubsumedInsts) { 756 // Does the user look like it could be part of a root set? 757 // All its users must be simple arithmetic ops. 758 if (I->getNumUses() > IL_MaxRerollIterations) 759 return false; 760 761 if ((I->getOpcode() == Instruction::Mul || 762 I->getOpcode() == Instruction::PHI) && 763 I != IV && 764 findRootsBase(I, SubsumedInsts)) 765 return true; 766 767 SubsumedInsts.insert(I); 768 769 for (User *V : I->users()) { 770 Instruction *I = dyn_cast<Instruction>(V); 771 if (std::find(LoopIncs.begin(), LoopIncs.end(), I) != LoopIncs.end()) 772 continue; 773 774 if (!I || !isSimpleArithmeticOp(I) || 775 !findRootsRecursive(I, SubsumedInsts)) 776 return false; 777 } 778 return true; 779 } 780 781 bool LoopReroll::DAGRootTracker:: 782 findRootsBase(Instruction *IVU, SmallInstructionSet SubsumedInsts) { 783 784 // The base instruction needs to be a multiply so 785 // that we can erase it. 786 if (IVU->getOpcode() != Instruction::Mul && 787 IVU->getOpcode() != Instruction::PHI) 788 return false; 789 790 std::map<int64_t, Instruction*> V; 791 if (!collectPossibleRoots(IVU, V)) 792 return false; 793 794 // If we didn't get a root for index zero, then IVU must be 795 // subsumed. 796 if (V.find(0) == V.end()) 797 SubsumedInsts.insert(IVU); 798 799 // Partition the vector into monotonically increasing indexes. 800 DAGRootSet DRS; 801 DRS.BaseInst = nullptr; 802 803 for (auto &KV : V) { 804 if (!DRS.BaseInst) { 805 DRS.BaseInst = KV.second; 806 DRS.SubsumedInsts = SubsumedInsts; 807 } else if (DRS.Roots.empty()) { 808 DRS.Roots.push_back(KV.second); 809 } else if (V.find(KV.first - 1) != V.end()) { 810 DRS.Roots.push_back(KV.second); 811 } else { 812 // Linear sequence terminated. 813 RootSets.push_back(DRS); 814 DRS.BaseInst = KV.second; 815 DRS.SubsumedInsts = SubsumedInsts; 816 DRS.Roots.clear(); 817 } 818 } 819 RootSets.push_back(DRS); 820 821 return true; 822 } 823 824 bool LoopReroll::DAGRootTracker::findRoots() { 825 826 const SCEVAddRecExpr *RealIVSCEV = cast<SCEVAddRecExpr>(SE->getSCEV(IV)); 827 Inc = cast<SCEVConstant>(RealIVSCEV->getOperand(1))-> 828 getValue()->getZExtValue(); 829 830 assert(RootSets.empty() && "Unclean state!"); 831 if (Inc == 1) { 832 for (auto *IVU : IV->users()) { 833 if (isLoopIncrement(IVU, IV)) 834 LoopIncs.push_back(cast<Instruction>(IVU)); 835 } 836 if (!findRootsRecursive(IV, SmallInstructionSet())) 837 return false; 838 LoopIncs.push_back(IV); 839 } else { 840 if (!findRootsBase(IV, SmallInstructionSet())) 841 return false; 842 } 843 844 // Ensure all sets have the same size. 845 if (RootSets.empty()) { 846 DEBUG(dbgs() << "LRR: Aborting because no root sets found!\n"); 847 return false; 848 } 849 for (auto &V : RootSets) { 850 if (V.Roots.empty() || V.Roots.size() != RootSets[0].Roots.size()) { 851 DEBUG(dbgs() 852 << "LRR: Aborting because not all root sets have the same size\n"); 853 return false; 854 } 855 } 856 857 // And ensure all loop iterations are consecutive. We rely on std::map 858 // providing ordered traversal. 859 for (auto &V : RootSets) { 860 const auto *ADR = dyn_cast<SCEVAddRecExpr>(SE->getSCEV(V.BaseInst)); 861 if (!ADR) 862 return false; 863 864 // Consider a DAGRootSet with N-1 roots (so N different values including 865 // BaseInst). 866 // Define d = Roots[0] - BaseInst, which should be the same as 867 // Roots[I] - Roots[I-1] for all I in [1..N). 868 // Define D = BaseInst@J - BaseInst@J-1, where "@J" means the value at the 869 // loop iteration J. 870 // 871 // Now, For the loop iterations to be consecutive: 872 // D = d * N 873 874 unsigned N = V.Roots.size() + 1; 875 const SCEV *StepSCEV = SE->getMinusSCEV(SE->getSCEV(V.Roots[0]), ADR); 876 const SCEV *ScaleSCEV = SE->getConstant(StepSCEV->getType(), N); 877 if (ADR->getStepRecurrence(*SE) != SE->getMulExpr(StepSCEV, ScaleSCEV)) { 878 DEBUG(dbgs() << "LRR: Aborting because iterations are not consecutive\n"); 879 return false; 880 } 881 } 882 Scale = RootSets[0].Roots.size() + 1; 883 884 if (Scale > IL_MaxRerollIterations) { 885 DEBUG(dbgs() << "LRR: Aborting - too many iterations found. " 886 << "#Found=" << Scale << ", #Max=" << IL_MaxRerollIterations 887 << "\n"); 888 return false; 889 } 890 891 DEBUG(dbgs() << "LRR: Successfully found roots: Scale=" << Scale << "\n"); 892 893 return true; 894 } 895 896 bool LoopReroll::DAGRootTracker::collectUsedInstructions(SmallInstructionSet &PossibleRedSet) { 897 // Populate the MapVector with all instructions in the block, in order first, 898 // so we can iterate over the contents later in perfect order. 899 for (auto &I : *L->getHeader()) { 900 Uses[&I].resize(IL_End); 901 } 902 903 SmallInstructionSet Exclude; 904 for (auto &DRS : RootSets) { 905 Exclude.insert(DRS.Roots.begin(), DRS.Roots.end()); 906 Exclude.insert(DRS.SubsumedInsts.begin(), DRS.SubsumedInsts.end()); 907 Exclude.insert(DRS.BaseInst); 908 } 909 Exclude.insert(LoopIncs.begin(), LoopIncs.end()); 910 911 for (auto &DRS : RootSets) { 912 DenseSet<Instruction*> VBase; 913 collectInLoopUserSet(DRS.BaseInst, Exclude, PossibleRedSet, VBase); 914 for (auto *I : VBase) { 915 Uses[I].set(0); 916 } 917 918 unsigned Idx = 1; 919 for (auto *Root : DRS.Roots) { 920 DenseSet<Instruction*> V; 921 collectInLoopUserSet(Root, Exclude, PossibleRedSet, V); 922 923 // While we're here, check the use sets are the same size. 924 if (V.size() != VBase.size()) { 925 DEBUG(dbgs() << "LRR: Aborting - use sets are different sizes\n"); 926 return false; 927 } 928 929 for (auto *I : V) { 930 Uses[I].set(Idx); 931 } 932 ++Idx; 933 } 934 935 // Make sure our subsumed instructions are remembered too. 936 for (auto *I : DRS.SubsumedInsts) { 937 Uses[I].set(IL_All); 938 } 939 } 940 941 // Make sure the loop increments are also accounted for. 942 943 Exclude.clear(); 944 for (auto &DRS : RootSets) { 945 Exclude.insert(DRS.Roots.begin(), DRS.Roots.end()); 946 Exclude.insert(DRS.SubsumedInsts.begin(), DRS.SubsumedInsts.end()); 947 Exclude.insert(DRS.BaseInst); 948 } 949 950 DenseSet<Instruction*> V; 951 collectInLoopUserSet(LoopIncs, Exclude, PossibleRedSet, V); 952 for (auto *I : V) { 953 Uses[I].set(IL_All); 954 } 955 956 return true; 957 958 } 959 960 /// Get the next instruction in "In" that is a member of set Val. 961 /// Start searching from StartI, and do not return anything in Exclude. 962 /// If StartI is not given, start from In.begin(). 963 LoopReroll::DAGRootTracker::UsesTy::iterator 964 LoopReroll::DAGRootTracker::nextInstr(int Val, UsesTy &In, 965 const SmallInstructionSet &Exclude, 966 UsesTy::iterator *StartI) { 967 UsesTy::iterator I = StartI ? *StartI : In.begin(); 968 while (I != In.end() && (I->second.test(Val) == 0 || 969 Exclude.count(I->first) != 0)) 970 ++I; 971 return I; 972 } 973 974 bool LoopReroll::DAGRootTracker::isBaseInst(Instruction *I) { 975 for (auto &DRS : RootSets) { 976 if (DRS.BaseInst == I) 977 return true; 978 } 979 return false; 980 } 981 982 bool LoopReroll::DAGRootTracker::isRootInst(Instruction *I) { 983 for (auto &DRS : RootSets) { 984 if (std::find(DRS.Roots.begin(), DRS.Roots.end(), I) != DRS.Roots.end()) 985 return true; 986 } 987 return false; 988 } 989 990 /// Return true if instruction I depends on any instruction between 991 /// Start and End. 992 bool LoopReroll::DAGRootTracker::instrDependsOn(Instruction *I, 993 UsesTy::iterator Start, 994 UsesTy::iterator End) { 995 for (auto *U : I->users()) { 996 for (auto It = Start; It != End; ++It) 997 if (U == It->first) 998 return true; 999 } 1000 return false; 1001 } 1002 1003 bool LoopReroll::DAGRootTracker::validate(ReductionTracker &Reductions) { 1004 // We now need to check for equivalence of the use graph of each root with 1005 // that of the primary induction variable (excluding the roots). Our goal 1006 // here is not to solve the full graph isomorphism problem, but rather to 1007 // catch common cases without a lot of work. As a result, we will assume 1008 // that the relative order of the instructions in each unrolled iteration 1009 // is the same (although we will not make an assumption about how the 1010 // different iterations are intermixed). Note that while the order must be 1011 // the same, the instructions may not be in the same basic block. 1012 1013 // An array of just the possible reductions for this scale factor. When we 1014 // collect the set of all users of some root instructions, these reduction 1015 // instructions are treated as 'final' (their uses are not considered). 1016 // This is important because we don't want the root use set to search down 1017 // the reduction chain. 1018 SmallInstructionSet PossibleRedSet; 1019 SmallInstructionSet PossibleRedLastSet; 1020 SmallInstructionSet PossibleRedPHISet; 1021 Reductions.restrictToScale(Scale, PossibleRedSet, 1022 PossibleRedPHISet, PossibleRedLastSet); 1023 1024 // Populate "Uses" with where each instruction is used. 1025 if (!collectUsedInstructions(PossibleRedSet)) 1026 return false; 1027 1028 // Make sure we mark the reduction PHIs as used in all iterations. 1029 for (auto *I : PossibleRedPHISet) { 1030 Uses[I].set(IL_All); 1031 } 1032 1033 // Make sure all instructions in the loop are in one and only one 1034 // set. 1035 for (auto &KV : Uses) { 1036 if (KV.second.count() != 1) { 1037 DEBUG(dbgs() << "LRR: Aborting - instruction is not used in 1 iteration: " 1038 << *KV.first << " (#uses=" << KV.second.count() << ")\n"); 1039 return false; 1040 } 1041 } 1042 1043 DEBUG( 1044 for (auto &KV : Uses) { 1045 dbgs() << "LRR: " << KV.second.find_first() << "\t" << *KV.first << "\n"; 1046 } 1047 ); 1048 1049 for (unsigned Iter = 1; Iter < Scale; ++Iter) { 1050 // In addition to regular aliasing information, we need to look for 1051 // instructions from later (future) iterations that have side effects 1052 // preventing us from reordering them past other instructions with side 1053 // effects. 1054 bool FutureSideEffects = false; 1055 AliasSetTracker AST(*AA); 1056 // The map between instructions in f(%iv.(i+1)) and f(%iv). 1057 DenseMap<Value *, Value *> BaseMap; 1058 1059 // Compare iteration Iter to the base. 1060 SmallInstructionSet Visited; 1061 auto BaseIt = nextInstr(0, Uses, Visited); 1062 auto RootIt = nextInstr(Iter, Uses, Visited); 1063 auto LastRootIt = Uses.begin(); 1064 1065 while (BaseIt != Uses.end() && RootIt != Uses.end()) { 1066 Instruction *BaseInst = BaseIt->first; 1067 Instruction *RootInst = RootIt->first; 1068 1069 // Skip over the IV or root instructions; only match their users. 1070 bool Continue = false; 1071 if (isBaseInst(BaseInst)) { 1072 Visited.insert(BaseInst); 1073 BaseIt = nextInstr(0, Uses, Visited); 1074 Continue = true; 1075 } 1076 if (isRootInst(RootInst)) { 1077 LastRootIt = RootIt; 1078 Visited.insert(RootInst); 1079 RootIt = nextInstr(Iter, Uses, Visited); 1080 Continue = true; 1081 } 1082 if (Continue) continue; 1083 1084 if (!BaseInst->isSameOperationAs(RootInst)) { 1085 // Last chance saloon. We don't try and solve the full isomorphism 1086 // problem, but try and at least catch the case where two instructions 1087 // *of different types* are round the wrong way. We won't be able to 1088 // efficiently tell, given two ADD instructions, which way around we 1089 // should match them, but given an ADD and a SUB, we can at least infer 1090 // which one is which. 1091 // 1092 // This should allow us to deal with a greater subset of the isomorphism 1093 // problem. It does however change a linear algorithm into a quadratic 1094 // one, so limit the number of probes we do. 1095 auto TryIt = RootIt; 1096 unsigned N = NumToleratedFailedMatches; 1097 while (TryIt != Uses.end() && 1098 !BaseInst->isSameOperationAs(TryIt->first) && 1099 N--) { 1100 ++TryIt; 1101 TryIt = nextInstr(Iter, Uses, Visited, &TryIt); 1102 } 1103 1104 if (TryIt == Uses.end() || TryIt == RootIt || 1105 instrDependsOn(TryIt->first, RootIt, TryIt)) { 1106 DEBUG(dbgs() << "LRR: iteration root match failed at " << *BaseInst << 1107 " vs. " << *RootInst << "\n"); 1108 return false; 1109 } 1110 1111 RootIt = TryIt; 1112 RootInst = TryIt->first; 1113 } 1114 1115 // All instructions between the last root and this root 1116 // may belong to some other iteration. If they belong to a 1117 // future iteration, then they're dangerous to alias with. 1118 // 1119 // Note that because we allow a limited amount of flexibility in the order 1120 // that we visit nodes, LastRootIt might be *before* RootIt, in which 1121 // case we've already checked this set of instructions so we shouldn't 1122 // do anything. 1123 for (; LastRootIt < RootIt; ++LastRootIt) { 1124 Instruction *I = LastRootIt->first; 1125 if (LastRootIt->second.find_first() < (int)Iter) 1126 continue; 1127 if (I->mayWriteToMemory()) 1128 AST.add(I); 1129 // Note: This is specifically guarded by a check on isa<PHINode>, 1130 // which while a valid (somewhat arbitrary) micro-optimization, is 1131 // needed because otherwise isSafeToSpeculativelyExecute returns 1132 // false on PHI nodes. 1133 if (!isa<PHINode>(I) && !isSimpleLoadStore(I) && 1134 !isSafeToSpeculativelyExecute(I, DL)) 1135 // Intervening instructions cause side effects. 1136 FutureSideEffects = true; 1137 } 1138 1139 // Make sure that this instruction, which is in the use set of this 1140 // root instruction, does not also belong to the base set or the set of 1141 // some other root instruction. 1142 if (RootIt->second.count() > 1) { 1143 DEBUG(dbgs() << "LRR: iteration root match failed at " << *BaseInst << 1144 " vs. " << *RootInst << " (prev. case overlap)\n"); 1145 return false; 1146 } 1147 1148 // Make sure that we don't alias with any instruction in the alias set 1149 // tracker. If we do, then we depend on a future iteration, and we 1150 // can't reroll. 1151 if (RootInst->mayReadFromMemory()) 1152 for (auto &K : AST) { 1153 if (K.aliasesUnknownInst(RootInst, *AA)) { 1154 DEBUG(dbgs() << "LRR: iteration root match failed at " << *BaseInst << 1155 " vs. " << *RootInst << " (depends on future store)\n"); 1156 return false; 1157 } 1158 } 1159 1160 // If we've past an instruction from a future iteration that may have 1161 // side effects, and this instruction might also, then we can't reorder 1162 // them, and this matching fails. As an exception, we allow the alias 1163 // set tracker to handle regular (simple) load/store dependencies. 1164 if (FutureSideEffects && 1165 ((!isSimpleLoadStore(BaseInst) && 1166 !isSafeToSpeculativelyExecute(BaseInst, DL)) || 1167 (!isSimpleLoadStore(RootInst) && 1168 !isSafeToSpeculativelyExecute(RootInst, DL)))) { 1169 DEBUG(dbgs() << "LRR: iteration root match failed at " << *BaseInst << 1170 " vs. " << *RootInst << 1171 " (side effects prevent reordering)\n"); 1172 return false; 1173 } 1174 1175 // For instructions that are part of a reduction, if the operation is 1176 // associative, then don't bother matching the operands (because we 1177 // already know that the instructions are isomorphic, and the order 1178 // within the iteration does not matter). For non-associative reductions, 1179 // we do need to match the operands, because we need to reject 1180 // out-of-order instructions within an iteration! 1181 // For example (assume floating-point addition), we need to reject this: 1182 // x += a[i]; x += b[i]; 1183 // x += a[i+1]; x += b[i+1]; 1184 // x += b[i+2]; x += a[i+2]; 1185 bool InReduction = Reductions.isPairInSame(BaseInst, RootInst); 1186 1187 if (!(InReduction && BaseInst->isAssociative())) { 1188 bool Swapped = false, SomeOpMatched = false; 1189 for (unsigned j = 0; j < BaseInst->getNumOperands(); ++j) { 1190 Value *Op2 = RootInst->getOperand(j); 1191 1192 // If this is part of a reduction (and the operation is not 1193 // associatve), then we match all operands, but not those that are 1194 // part of the reduction. 1195 if (InReduction) 1196 if (Instruction *Op2I = dyn_cast<Instruction>(Op2)) 1197 if (Reductions.isPairInSame(RootInst, Op2I)) 1198 continue; 1199 1200 DenseMap<Value *, Value *>::iterator BMI = BaseMap.find(Op2); 1201 if (BMI != BaseMap.end()) { 1202 Op2 = BMI->second; 1203 } else { 1204 for (auto &DRS : RootSets) { 1205 if (DRS.Roots[Iter-1] == (Instruction*) Op2) { 1206 Op2 = DRS.BaseInst; 1207 break; 1208 } 1209 } 1210 } 1211 1212 if (BaseInst->getOperand(Swapped ? unsigned(!j) : j) != Op2) { 1213 // If we've not already decided to swap the matched operands, and 1214 // we've not already matched our first operand (note that we could 1215 // have skipped matching the first operand because it is part of a 1216 // reduction above), and the instruction is commutative, then try 1217 // the swapped match. 1218 if (!Swapped && BaseInst->isCommutative() && !SomeOpMatched && 1219 BaseInst->getOperand(!j) == Op2) { 1220 Swapped = true; 1221 } else { 1222 DEBUG(dbgs() << "LRR: iteration root match failed at " << *BaseInst 1223 << " vs. " << *RootInst << " (operand " << j << ")\n"); 1224 return false; 1225 } 1226 } 1227 1228 SomeOpMatched = true; 1229 } 1230 } 1231 1232 if ((!PossibleRedLastSet.count(BaseInst) && 1233 hasUsesOutsideLoop(BaseInst, L)) || 1234 (!PossibleRedLastSet.count(RootInst) && 1235 hasUsesOutsideLoop(RootInst, L))) { 1236 DEBUG(dbgs() << "LRR: iteration root match failed at " << *BaseInst << 1237 " vs. " << *RootInst << " (uses outside loop)\n"); 1238 return false; 1239 } 1240 1241 Reductions.recordPair(BaseInst, RootInst, Iter); 1242 BaseMap.insert(std::make_pair(RootInst, BaseInst)); 1243 1244 LastRootIt = RootIt; 1245 Visited.insert(BaseInst); 1246 Visited.insert(RootInst); 1247 BaseIt = nextInstr(0, Uses, Visited); 1248 RootIt = nextInstr(Iter, Uses, Visited); 1249 } 1250 assert (BaseIt == Uses.end() && RootIt == Uses.end() && 1251 "Mismatched set sizes!"); 1252 } 1253 1254 DEBUG(dbgs() << "LRR: Matched all iteration increments for " << 1255 *IV << "\n"); 1256 1257 return true; 1258 } 1259 1260 void LoopReroll::DAGRootTracker::replace(const SCEV *IterCount) { 1261 BasicBlock *Header = L->getHeader(); 1262 // Remove instructions associated with non-base iterations. 1263 for (BasicBlock::reverse_iterator J = Header->rbegin(); 1264 J != Header->rend();) { 1265 unsigned I = Uses[&*J].find_first(); 1266 if (I > 0 && I < IL_All) { 1267 Instruction *D = &*J; 1268 DEBUG(dbgs() << "LRR: removing: " << *D << "\n"); 1269 D->eraseFromParent(); 1270 continue; 1271 } 1272 1273 ++J; 1274 } 1275 1276 // We need to create a new induction variable for each different BaseInst. 1277 for (auto &DRS : RootSets) { 1278 // Insert the new induction variable. 1279 const SCEVAddRecExpr *RealIVSCEV = 1280 cast<SCEVAddRecExpr>(SE->getSCEV(DRS.BaseInst)); 1281 const SCEV *Start = RealIVSCEV->getStart(); 1282 const SCEVAddRecExpr *H = cast<SCEVAddRecExpr> 1283 (SE->getAddRecExpr(Start, 1284 SE->getConstant(RealIVSCEV->getType(), 1), 1285 L, SCEV::FlagAnyWrap)); 1286 { // Limit the lifetime of SCEVExpander. 1287 SCEVExpander Expander(*SE, "reroll"); 1288 Value *NewIV = Expander.expandCodeFor(H, IV->getType(), Header->begin()); 1289 1290 for (auto &KV : Uses) { 1291 if (KV.second.find_first() == 0) 1292 KV.first->replaceUsesOfWith(DRS.BaseInst, NewIV); 1293 } 1294 1295 if (BranchInst *BI = dyn_cast<BranchInst>(Header->getTerminator())) { 1296 // FIXME: Why do we need this check? 1297 if (Uses[BI].find_first() == IL_All) { 1298 const SCEV *ICSCEV = RealIVSCEV->evaluateAtIteration(IterCount, *SE); 1299 1300 // Iteration count SCEV minus 1 1301 const SCEV *ICMinus1SCEV = 1302 SE->getMinusSCEV(ICSCEV, SE->getConstant(ICSCEV->getType(), 1)); 1303 1304 Value *ICMinus1; // Iteration count minus 1 1305 if (isa<SCEVConstant>(ICMinus1SCEV)) { 1306 ICMinus1 = Expander.expandCodeFor(ICMinus1SCEV, NewIV->getType(), BI); 1307 } else { 1308 BasicBlock *Preheader = L->getLoopPreheader(); 1309 if (!Preheader) 1310 Preheader = InsertPreheaderForLoop(L, Parent); 1311 1312 ICMinus1 = Expander.expandCodeFor(ICMinus1SCEV, NewIV->getType(), 1313 Preheader->getTerminator()); 1314 } 1315 1316 Value *Cond = 1317 new ICmpInst(BI, CmpInst::ICMP_EQ, NewIV, ICMinus1, "exitcond"); 1318 BI->setCondition(Cond); 1319 1320 if (BI->getSuccessor(1) != Header) 1321 BI->swapSuccessors(); 1322 } 1323 } 1324 } 1325 } 1326 1327 SimplifyInstructionsInBlock(Header, DL, TLI); 1328 DeleteDeadPHIs(Header, TLI); 1329 } 1330 1331 // Validate the selected reductions. All iterations must have an isomorphic 1332 // part of the reduction chain and, for non-associative reductions, the chain 1333 // entries must appear in order. 1334 bool LoopReroll::ReductionTracker::validateSelected() { 1335 // For a non-associative reduction, the chain entries must appear in order. 1336 for (DenseSet<int>::iterator RI = Reds.begin(), RIE = Reds.end(); 1337 RI != RIE; ++RI) { 1338 int i = *RI; 1339 int PrevIter = 0, BaseCount = 0, Count = 0; 1340 for (Instruction *J : PossibleReds[i]) { 1341 // Note that all instructions in the chain must have been found because 1342 // all instructions in the function must have been assigned to some 1343 // iteration. 1344 int Iter = PossibleRedIter[J]; 1345 if (Iter != PrevIter && Iter != PrevIter + 1 && 1346 !PossibleReds[i].getReducedValue()->isAssociative()) { 1347 DEBUG(dbgs() << "LRR: Out-of-order non-associative reduction: " << 1348 J << "\n"); 1349 return false; 1350 } 1351 1352 if (Iter != PrevIter) { 1353 if (Count != BaseCount) { 1354 DEBUG(dbgs() << "LRR: Iteration " << PrevIter << 1355 " reduction use count " << Count << 1356 " is not equal to the base use count " << 1357 BaseCount << "\n"); 1358 return false; 1359 } 1360 1361 Count = 0; 1362 } 1363 1364 ++Count; 1365 if (Iter == 0) 1366 ++BaseCount; 1367 1368 PrevIter = Iter; 1369 } 1370 } 1371 1372 return true; 1373 } 1374 1375 // For all selected reductions, remove all parts except those in the first 1376 // iteration (and the PHI). Replace outside uses of the reduced value with uses 1377 // of the first-iteration reduced value (in other words, reroll the selected 1378 // reductions). 1379 void LoopReroll::ReductionTracker::replaceSelected() { 1380 // Fixup reductions to refer to the last instruction associated with the 1381 // first iteration (not the last). 1382 for (DenseSet<int>::iterator RI = Reds.begin(), RIE = Reds.end(); 1383 RI != RIE; ++RI) { 1384 int i = *RI; 1385 int j = 0; 1386 for (int e = PossibleReds[i].size(); j != e; ++j) 1387 if (PossibleRedIter[PossibleReds[i][j]] != 0) { 1388 --j; 1389 break; 1390 } 1391 1392 // Replace users with the new end-of-chain value. 1393 SmallInstructionVector Users; 1394 for (User *U : PossibleReds[i].getReducedValue()->users()) { 1395 Users.push_back(cast<Instruction>(U)); 1396 } 1397 1398 for (SmallInstructionVector::iterator J = Users.begin(), 1399 JE = Users.end(); J != JE; ++J) 1400 (*J)->replaceUsesOfWith(PossibleReds[i].getReducedValue(), 1401 PossibleReds[i][j]); 1402 } 1403 } 1404 1405 // Reroll the provided loop with respect to the provided induction variable. 1406 // Generally, we're looking for a loop like this: 1407 // 1408 // %iv = phi [ (preheader, ...), (body, %iv.next) ] 1409 // f(%iv) 1410 // %iv.1 = add %iv, 1 <-- a root increment 1411 // f(%iv.1) 1412 // %iv.2 = add %iv, 2 <-- a root increment 1413 // f(%iv.2) 1414 // %iv.scale_m_1 = add %iv, scale-1 <-- a root increment 1415 // f(%iv.scale_m_1) 1416 // ... 1417 // %iv.next = add %iv, scale 1418 // %cmp = icmp(%iv, ...) 1419 // br %cmp, header, exit 1420 // 1421 // Notably, we do not require that f(%iv), f(%iv.1), etc. be isolated groups of 1422 // instructions. In other words, the instructions in f(%iv), f(%iv.1), etc. can 1423 // be intermixed with eachother. The restriction imposed by this algorithm is 1424 // that the relative order of the isomorphic instructions in f(%iv), f(%iv.1), 1425 // etc. be the same. 1426 // 1427 // First, we collect the use set of %iv, excluding the other increment roots. 1428 // This gives us f(%iv). Then we iterate over the loop instructions (scale-1) 1429 // times, having collected the use set of f(%iv.(i+1)), during which we: 1430 // - Ensure that the next unmatched instruction in f(%iv) is isomorphic to 1431 // the next unmatched instruction in f(%iv.(i+1)). 1432 // - Ensure that both matched instructions don't have any external users 1433 // (with the exception of last-in-chain reduction instructions). 1434 // - Track the (aliasing) write set, and other side effects, of all 1435 // instructions that belong to future iterations that come before the matched 1436 // instructions. If the matched instructions read from that write set, then 1437 // f(%iv) or f(%iv.(i+1)) has some dependency on instructions in 1438 // f(%iv.(j+1)) for some j > i, and we cannot reroll the loop. Similarly, 1439 // if any of these future instructions had side effects (could not be 1440 // speculatively executed), and so do the matched instructions, when we 1441 // cannot reorder those side-effect-producing instructions, and rerolling 1442 // fails. 1443 // 1444 // Finally, we make sure that all loop instructions are either loop increment 1445 // roots, belong to simple latch code, parts of validated reductions, part of 1446 // f(%iv) or part of some f(%iv.i). If all of that is true (and all reductions 1447 // have been validated), then we reroll the loop. 1448 bool LoopReroll::reroll(Instruction *IV, Loop *L, BasicBlock *Header, 1449 const SCEV *IterCount, 1450 ReductionTracker &Reductions) { 1451 DAGRootTracker DAGRoots(this, L, IV, SE, AA, TLI, DL); 1452 1453 if (!DAGRoots.findRoots()) 1454 return false; 1455 DEBUG(dbgs() << "LRR: Found all root induction increments for: " << 1456 *IV << "\n"); 1457 1458 if (!DAGRoots.validate(Reductions)) 1459 return false; 1460 if (!Reductions.validateSelected()) 1461 return false; 1462 // At this point, we've validated the rerolling, and we're committed to 1463 // making changes! 1464 1465 Reductions.replaceSelected(); 1466 DAGRoots.replace(IterCount); 1467 1468 ++NumRerolledLoops; 1469 return true; 1470 } 1471 1472 bool LoopReroll::runOnLoop(Loop *L, LPPassManager &LPM) { 1473 if (skipOptnoneFunction(L)) 1474 return false; 1475 1476 AA = &getAnalysis<AliasAnalysis>(); 1477 LI = &getAnalysis<LoopInfoWrapperPass>().getLoopInfo(); 1478 SE = &getAnalysis<ScalarEvolution>(); 1479 TLI = &getAnalysis<TargetLibraryInfoWrapperPass>().getTLI(); 1480 DL = &L->getHeader()->getModule()->getDataLayout(); 1481 DT = &getAnalysis<DominatorTreeWrapperPass>().getDomTree(); 1482 1483 BasicBlock *Header = L->getHeader(); 1484 DEBUG(dbgs() << "LRR: F[" << Header->getParent()->getName() << 1485 "] Loop %" << Header->getName() << " (" << 1486 L->getNumBlocks() << " block(s))\n"); 1487 1488 bool Changed = false; 1489 1490 // For now, we'll handle only single BB loops. 1491 if (L->getNumBlocks() > 1) 1492 return Changed; 1493 1494 if (!SE->hasLoopInvariantBackedgeTakenCount(L)) 1495 return Changed; 1496 1497 const SCEV *LIBETC = SE->getBackedgeTakenCount(L); 1498 const SCEV *IterCount = 1499 SE->getAddExpr(LIBETC, SE->getConstant(LIBETC->getType(), 1)); 1500 DEBUG(dbgs() << "LRR: iteration count = " << *IterCount << "\n"); 1501 1502 // First, we need to find the induction variable with respect to which we can 1503 // reroll (there may be several possible options). 1504 SmallInstructionVector PossibleIVs; 1505 collectPossibleIVs(L, PossibleIVs); 1506 1507 if (PossibleIVs.empty()) { 1508 DEBUG(dbgs() << "LRR: No possible IVs found\n"); 1509 return Changed; 1510 } 1511 1512 ReductionTracker Reductions; 1513 collectPossibleReductions(L, Reductions); 1514 1515 // For each possible IV, collect the associated possible set of 'root' nodes 1516 // (i+1, i+2, etc.). 1517 for (SmallInstructionVector::iterator I = PossibleIVs.begin(), 1518 IE = PossibleIVs.end(); I != IE; ++I) 1519 if (reroll(*I, L, Header, IterCount, Reductions)) { 1520 Changed = true; 1521 break; 1522 } 1523 1524 return Changed; 1525 } 1526