//===- LoopStrengthReduce.cpp - Strength Reduce IVs in Loops --------------===//
//
//                     The LLVM Compiler Infrastructure
//
// This file is distributed under the University of Illinois Open Source
// License. See LICENSE.TXT for details.
//
//===----------------------------------------------------------------------===//
//
// This transformation analyzes and transforms the induction variables (and
// computations derived from them) into forms suitable for efficient execution
// on the target.
//
// This pass performs a strength reduction on array references inside loops
// that have the loop induction variable as one or more of their components;
// it rewrites expressions to take advantage of scaled-index addressing modes
// available on the target, and it performs a variety of other optimizations
// related to loop induction variables.
//
// Terminology note: this code has a lot of handling for "post-increment" or
// "post-inc" users. This is not talking about post-increment addressing modes;
// it is instead talking about code like this:
//
//   %i = phi [ 0, %entry ], [ %i.next, %latch ]
//   ...
//   %i.next = add %i, 1
//   %c = icmp eq %i.next, %n
//
// The SCEV for %i is {0,+,1}<%L>. The SCEV for %i.next is {1,+,1}<%L>, however
// it's useful to think about these as the same register, with some uses using
// the value of the register before the add and some using it after. In this
// example, the icmp is a post-increment user, since it uses %i.next, which is
// the value of the induction variable after the increment. The other common
// case of post-increment users is users outside the loop.
//
// TODO: More sophistication in the way Formulae are generated and filtered.
//
// TODO: Handle multiple loops at a time.
//
// TODO: Should the addressing mode BaseGV be changed to a ConstantExpr instead
//       of a GlobalValue?
//
// TODO: When truncation is free, truncate ICmp users' operands to make it a
//       smaller encoding (on x86 at least).
//
// TODO: When a negated register is used by an add (such as in a list of
//       multiple base registers, or as the increment expression in an addrec),
//       we may not actually need both reg and (-1 * reg) in registers; the
//       negation can be implemented by using a sub instead of an add. The
//       lack of support for taking this into consideration when making
//       register pressure decisions is partly worked around by the "Special"
//       use kind.
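//
//       For illustration (hypothetical registers): a formula such as
//       reg(%base) + (-1 * reg(%i)) could be lowered as "sub %base, %i"
//       rather than materializing a separate register holding (-1 * %i) and
//       adding it.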
53 // 54 //===----------------------------------------------------------------------===// 55 56 #include "llvm/Transforms/Scalar/LoopStrengthReduce.h" 57 #include "llvm/ADT/APInt.h" 58 #include "llvm/ADT/DenseMap.h" 59 #include "llvm/ADT/DenseSet.h" 60 #include "llvm/ADT/Hashing.h" 61 #include "llvm/ADT/PointerIntPair.h" 62 #include "llvm/ADT/STLExtras.h" 63 #include "llvm/ADT/SetVector.h" 64 #include "llvm/ADT/SmallBitVector.h" 65 #include "llvm/ADT/SmallPtrSet.h" 66 #include "llvm/ADT/SmallSet.h" 67 #include "llvm/ADT/SmallVector.h" 68 #include "llvm/ADT/iterator_range.h" 69 #include "llvm/Analysis/IVUsers.h" 70 #include "llvm/Analysis/LoopAnalysisManager.h" 71 #include "llvm/Analysis/LoopInfo.h" 72 #include "llvm/Analysis/LoopPass.h" 73 #include "llvm/Analysis/ScalarEvolution.h" 74 #include "llvm/Analysis/ScalarEvolutionExpander.h" 75 #include "llvm/Analysis/ScalarEvolutionExpressions.h" 76 #include "llvm/Analysis/ScalarEvolutionNormalization.h" 77 #include "llvm/Analysis/TargetTransformInfo.h" 78 #include "llvm/Transforms/Utils/Local.h" 79 #include "llvm/Config/llvm-config.h" 80 #include "llvm/IR/BasicBlock.h" 81 #include "llvm/IR/Constant.h" 82 #include "llvm/IR/Constants.h" 83 #include "llvm/IR/DerivedTypes.h" 84 #include "llvm/IR/Dominators.h" 85 #include "llvm/IR/GlobalValue.h" 86 #include "llvm/IR/IRBuilder.h" 87 #include "llvm/IR/InstrTypes.h" 88 #include "llvm/IR/Instruction.h" 89 #include "llvm/IR/Instructions.h" 90 #include "llvm/IR/IntrinsicInst.h" 91 #include "llvm/IR/Intrinsics.h" 92 #include "llvm/IR/Module.h" 93 #include "llvm/IR/OperandTraits.h" 94 #include "llvm/IR/Operator.h" 95 #include "llvm/IR/PassManager.h" 96 #include "llvm/IR/Type.h" 97 #include "llvm/IR/Use.h" 98 #include "llvm/IR/User.h" 99 #include "llvm/IR/Value.h" 100 #include "llvm/IR/ValueHandle.h" 101 #include "llvm/Pass.h" 102 #include "llvm/Support/Casting.h" 103 #include "llvm/Support/CommandLine.h" 104 #include "llvm/Support/Compiler.h" 105 #include "llvm/Support/Debug.h" 106 #include "llvm/Support/ErrorHandling.h" 107 #include "llvm/Support/MathExtras.h" 108 #include "llvm/Support/raw_ostream.h" 109 #include "llvm/Transforms/Scalar.h" 110 #include "llvm/Transforms/Utils.h" 111 #include "llvm/Transforms/Utils/BasicBlockUtils.h" 112 #include <algorithm> 113 #include <cassert> 114 #include <cstddef> 115 #include <cstdint> 116 #include <cstdlib> 117 #include <iterator> 118 #include <limits> 119 #include <map> 120 #include <utility> 121 122 using namespace llvm; 123 124 #define DEBUG_TYPE "loop-reduce" 125 126 /// MaxIVUsers is an arbitrary threshold that provides an early opportunity for 127 /// bail out. This threshold is far beyond the number of users that LSR can 128 /// conceivably solve, so it should not affect generated code, but catches the 129 /// worst cases before LSR burns too much compile time and stack space. 130 static const unsigned MaxIVUsers = 200; 131 132 // Temporary flag to cleanup congruent phis after LSR phi expansion. 133 // It's currently disabled until we can determine whether it's truly useful or 134 // not. The flag should be removed after the v3.0 release. 135 // This is now needed for ivchains. 136 static cl::opt<bool> EnablePhiElim( 137 "enable-lsr-phielim", cl::Hidden, cl::init(true), 138 cl::desc("Enable LSR phi elimination")); 139 140 // The flag adds instruction count to solutions cost comparision. 
141 static cl::opt<bool> InsnsCost( 142 "lsr-insns-cost", cl::Hidden, cl::init(true), 143 cl::desc("Add instruction count to a LSR cost model")); 144 145 // Flag to choose how to narrow complex lsr solution 146 static cl::opt<bool> LSRExpNarrow( 147 "lsr-exp-narrow", cl::Hidden, cl::init(false), 148 cl::desc("Narrow LSR complex solution using" 149 " expectation of registers number")); 150 151 // Flag to narrow search space by filtering non-optimal formulae with 152 // the same ScaledReg and Scale. 153 static cl::opt<bool> FilterSameScaledReg( 154 "lsr-filter-same-scaled-reg", cl::Hidden, cl::init(true), 155 cl::desc("Narrow LSR search space by filtering non-optimal formulae" 156 " with the same ScaledReg and Scale")); 157 158 static cl::opt<unsigned> ComplexityLimit( 159 "lsr-complexity-limit", cl::Hidden, 160 cl::init(std::numeric_limits<uint16_t>::max()), 161 cl::desc("LSR search space complexity limit")); 162 163 #ifndef NDEBUG 164 // Stress test IV chain generation. 165 static cl::opt<bool> StressIVChain( 166 "stress-ivchain", cl::Hidden, cl::init(false), 167 cl::desc("Stress test LSR IV chains")); 168 #else 169 static bool StressIVChain = false; 170 #endif 171 172 namespace { 173 174 struct MemAccessTy { 175 /// Used in situations where the accessed memory type is unknown. 176 static const unsigned UnknownAddressSpace = 177 std::numeric_limits<unsigned>::max(); 178 179 Type *MemTy = nullptr; 180 unsigned AddrSpace = UnknownAddressSpace; 181 182 MemAccessTy() = default; 183 MemAccessTy(Type *Ty, unsigned AS) : MemTy(Ty), AddrSpace(AS) {} 184 185 bool operator==(MemAccessTy Other) const { 186 return MemTy == Other.MemTy && AddrSpace == Other.AddrSpace; 187 } 188 189 bool operator!=(MemAccessTy Other) const { return !(*this == Other); } 190 191 static MemAccessTy getUnknown(LLVMContext &Ctx, 192 unsigned AS = UnknownAddressSpace) { 193 return MemAccessTy(Type::getVoidTy(Ctx), AS); 194 } 195 196 Type *getType() { return MemTy; } 197 }; 198 199 /// This class holds data which is used to order reuse candidates. 200 class RegSortData { 201 public: 202 /// This represents the set of LSRUse indices which reference 203 /// a particular register. 204 SmallBitVector UsedByIndices; 205 206 void print(raw_ostream &OS) const; 207 void dump() const; 208 }; 209 210 } // end anonymous namespace 211 212 #if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP) 213 void RegSortData::print(raw_ostream &OS) const { 214 OS << "[NumUses=" << UsedByIndices.count() << ']'; 215 } 216 217 LLVM_DUMP_METHOD void RegSortData::dump() const { 218 print(errs()); errs() << '\n'; 219 } 220 #endif 221 222 namespace { 223 224 /// Map register candidates to information about how they are used. 
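/// For illustration (the register and use numbers here are hypothetical):
/// after countRegister({0,+,1}<%L>, 0) and countRegister({0,+,1}<%L>, 2),
/// the UsedByIndices bit vector for {0,+,1}<%L> has bits 0 and 2 set, so
/// isRegUsedByUsesOtherThan({0,+,1}<%L>, 0) returns true because use #2
/// also references that register.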
225 class RegUseTracker { 226 using RegUsesTy = DenseMap<const SCEV *, RegSortData>; 227 228 RegUsesTy RegUsesMap; 229 SmallVector<const SCEV *, 16> RegSequence; 230 231 public: 232 void countRegister(const SCEV *Reg, size_t LUIdx); 233 void dropRegister(const SCEV *Reg, size_t LUIdx); 234 void swapAndDropUse(size_t LUIdx, size_t LastLUIdx); 235 236 bool isRegUsedByUsesOtherThan(const SCEV *Reg, size_t LUIdx) const; 237 238 const SmallBitVector &getUsedByIndices(const SCEV *Reg) const; 239 240 void clear(); 241 242 using iterator = SmallVectorImpl<const SCEV *>::iterator; 243 using const_iterator = SmallVectorImpl<const SCEV *>::const_iterator; 244 245 iterator begin() { return RegSequence.begin(); } 246 iterator end() { return RegSequence.end(); } 247 const_iterator begin() const { return RegSequence.begin(); } 248 const_iterator end() const { return RegSequence.end(); } 249 }; 250 251 } // end anonymous namespace 252 253 void 254 RegUseTracker::countRegister(const SCEV *Reg, size_t LUIdx) { 255 std::pair<RegUsesTy::iterator, bool> Pair = 256 RegUsesMap.insert(std::make_pair(Reg, RegSortData())); 257 RegSortData &RSD = Pair.first->second; 258 if (Pair.second) 259 RegSequence.push_back(Reg); 260 RSD.UsedByIndices.resize(std::max(RSD.UsedByIndices.size(), LUIdx + 1)); 261 RSD.UsedByIndices.set(LUIdx); 262 } 263 264 void 265 RegUseTracker::dropRegister(const SCEV *Reg, size_t LUIdx) { 266 RegUsesTy::iterator It = RegUsesMap.find(Reg); 267 assert(It != RegUsesMap.end()); 268 RegSortData &RSD = It->second; 269 assert(RSD.UsedByIndices.size() > LUIdx); 270 RSD.UsedByIndices.reset(LUIdx); 271 } 272 273 void 274 RegUseTracker::swapAndDropUse(size_t LUIdx, size_t LastLUIdx) { 275 assert(LUIdx <= LastLUIdx); 276 277 // Update RegUses. The data structure is not optimized for this purpose; 278 // we must iterate through it and update each of the bit vectors. 279 for (auto &Pair : RegUsesMap) { 280 SmallBitVector &UsedByIndices = Pair.second.UsedByIndices; 281 if (LUIdx < UsedByIndices.size()) 282 UsedByIndices[LUIdx] = 283 LastLUIdx < UsedByIndices.size() ? UsedByIndices[LastLUIdx] : false; 284 UsedByIndices.resize(std::min(UsedByIndices.size(), LastLUIdx)); 285 } 286 } 287 288 bool 289 RegUseTracker::isRegUsedByUsesOtherThan(const SCEV *Reg, size_t LUIdx) const { 290 RegUsesTy::const_iterator I = RegUsesMap.find(Reg); 291 if (I == RegUsesMap.end()) 292 return false; 293 const SmallBitVector &UsedByIndices = I->second.UsedByIndices; 294 int i = UsedByIndices.find_first(); 295 if (i == -1) return false; 296 if ((size_t)i != LUIdx) return true; 297 return UsedByIndices.find_next(i) != -1; 298 } 299 300 const SmallBitVector &RegUseTracker::getUsedByIndices(const SCEV *Reg) const { 301 RegUsesTy::const_iterator I = RegUsesMap.find(Reg); 302 assert(I != RegUsesMap.end() && "Unknown register!"); 303 return I->second.UsedByIndices; 304 } 305 306 void RegUseTracker::clear() { 307 RegUsesMap.clear(); 308 RegSequence.clear(); 309 } 310 311 namespace { 312 313 /// This class holds information that describes a formula for computing 314 /// satisfying a use. It may include broken-out immediates and scaled registers. 315 struct Formula { 316 /// Global base address used for complex addressing. 317 GlobalValue *BaseGV = nullptr; 318 319 /// Base offset for complex addressing. 320 int64_t BaseOffset = 0; 321 322 /// Whether any complex addressing has a base register. 323 bool HasBaseReg = false; 324 325 /// The scale of any complex addressing. 
  int64_t Scale = 0;

  /// The list of "base" registers for this use. When this is non-empty, the
  /// canonical representation of a formula is
  /// 1. BaseRegs.size > 1 implies ScaledReg != NULL and
  /// 2. ScaledReg != NULL implies Scale != 1 || !BaseRegs.empty().
  /// 3. The reg containing recurrent expr related with current loop in the
  ///    formula should be put in the ScaledReg.
  /// #1 enforces that the scaled register is always used when at least two
  /// registers are needed by the formula: e.g., reg1 + reg2 is reg1 + 1 * reg2.
  /// #2 enforces that 1 * reg is reg.
  /// #3 ensures invariant regs with respect to current loop can be combined
  ///    together in LSR codegen.
  /// This invariant can be temporarily broken while building a formula.
  /// However, every formula inserted into the LSRInstance must be in canonical
  /// form.
  SmallVector<const SCEV *, 4> BaseRegs;

  /// The 'scaled' register for this use. This should be non-null when Scale is
  /// not zero.
  const SCEV *ScaledReg = nullptr;

  /// An additional constant offset which is added near the use. This requires
  /// a temporary register, but the offset itself can live in an add immediate
  /// field rather than a register.
  int64_t UnfoldedOffset = 0;

  Formula() = default;

  void initialMatch(const SCEV *S, Loop *L, ScalarEvolution &SE);

  bool isCanonical(const Loop &L) const;

  void canonicalize(const Loop &L);

  bool unscale();

  bool hasZeroEnd() const;

  size_t getNumRegs() const;
  Type *getType() const;

  void deleteBaseReg(const SCEV *&S);

  bool referencesReg(const SCEV *S) const;
  bool hasRegsUsedByUsesOtherThan(size_t LUIdx,
                                  const RegUseTracker &RegUses) const;

  void print(raw_ostream &OS) const;
  void dump() const;
};

} // end anonymous namespace

/// Recursion helper for initialMatch.
static void DoInitialMatch(const SCEV *S, Loop *L,
                           SmallVectorImpl<const SCEV *> &Good,
                           SmallVectorImpl<const SCEV *> &Bad,
                           ScalarEvolution &SE) {
  // Collect expressions which properly dominate the loop header.
  if (SE.properlyDominates(S, L->getHeader())) {
    Good.push_back(S);
    return;
  }

  // Look at add operands.
  if (const SCEVAddExpr *Add = dyn_cast<SCEVAddExpr>(S)) {
    for (const SCEV *S : Add->operands())
      DoInitialMatch(S, L, Good, Bad, SE);
    return;
  }

  // Look at addrec operands.
  if (const SCEVAddRecExpr *AR = dyn_cast<SCEVAddRecExpr>(S))
    if (!AR->getStart()->isZero() && AR->isAffine()) {
      DoInitialMatch(AR->getStart(), L, Good, Bad, SE);
      DoInitialMatch(SE.getAddRecExpr(SE.getConstant(AR->getType(), 0),
                                      AR->getStepRecurrence(SE),
                                      // FIXME: AR->getNoWrapFlags()
                                      AR->getLoop(), SCEV::FlagAnyWrap),
                     L, Good, Bad, SE);
      return;
    }

  // Handle a multiplication by -1 (negation) if it didn't fold.
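  // For example (with a hypothetical loop-invariant value %x), an unfolded
  // -1 * %x is handled by matching %x by itself and then re-applying the
  // multiplication by -1 to whatever lands in the "Good" and "Bad" lists.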
411 if (const SCEVMulExpr *Mul = dyn_cast<SCEVMulExpr>(S)) 412 if (Mul->getOperand(0)->isAllOnesValue()) { 413 SmallVector<const SCEV *, 4> Ops(Mul->op_begin()+1, Mul->op_end()); 414 const SCEV *NewMul = SE.getMulExpr(Ops); 415 416 SmallVector<const SCEV *, 4> MyGood; 417 SmallVector<const SCEV *, 4> MyBad; 418 DoInitialMatch(NewMul, L, MyGood, MyBad, SE); 419 const SCEV *NegOne = SE.getSCEV(ConstantInt::getAllOnesValue( 420 SE.getEffectiveSCEVType(NewMul->getType()))); 421 for (const SCEV *S : MyGood) 422 Good.push_back(SE.getMulExpr(NegOne, S)); 423 for (const SCEV *S : MyBad) 424 Bad.push_back(SE.getMulExpr(NegOne, S)); 425 return; 426 } 427 428 // Ok, we can't do anything interesting. Just stuff the whole thing into a 429 // register and hope for the best. 430 Bad.push_back(S); 431 } 432 433 /// Incorporate loop-variant parts of S into this Formula, attempting to keep 434 /// all loop-invariant and loop-computable values in a single base register. 435 void Formula::initialMatch(const SCEV *S, Loop *L, ScalarEvolution &SE) { 436 SmallVector<const SCEV *, 4> Good; 437 SmallVector<const SCEV *, 4> Bad; 438 DoInitialMatch(S, L, Good, Bad, SE); 439 if (!Good.empty()) { 440 const SCEV *Sum = SE.getAddExpr(Good); 441 if (!Sum->isZero()) 442 BaseRegs.push_back(Sum); 443 HasBaseReg = true; 444 } 445 if (!Bad.empty()) { 446 const SCEV *Sum = SE.getAddExpr(Bad); 447 if (!Sum->isZero()) 448 BaseRegs.push_back(Sum); 449 HasBaseReg = true; 450 } 451 canonicalize(*L); 452 } 453 454 /// Check whether or not this formula satisfies the canonical 455 /// representation. 456 /// \see Formula::BaseRegs. 457 bool Formula::isCanonical(const Loop &L) const { 458 if (!ScaledReg) 459 return BaseRegs.size() <= 1; 460 461 if (Scale != 1) 462 return true; 463 464 if (Scale == 1 && BaseRegs.empty()) 465 return false; 466 467 const SCEVAddRecExpr *SAR = dyn_cast<const SCEVAddRecExpr>(ScaledReg); 468 if (SAR && SAR->getLoop() == &L) 469 return true; 470 471 // If ScaledReg is not a recurrent expr, or it is but its loop is not current 472 // loop, meanwhile BaseRegs contains a recurrent expr reg related with current 473 // loop, we want to swap the reg in BaseRegs with ScaledReg. 474 auto I = 475 find_if(make_range(BaseRegs.begin(), BaseRegs.end()), [&](const SCEV *S) { 476 return isa<const SCEVAddRecExpr>(S) && 477 (cast<SCEVAddRecExpr>(S)->getLoop() == &L); 478 }); 479 return I == BaseRegs.end(); 480 } 481 482 /// Helper method to morph a formula into its canonical representation. 483 /// \see Formula::BaseRegs. 484 /// Every formula having more than one base register, must use the ScaledReg 485 /// field. Otherwise, we would have to do special cases everywhere in LSR 486 /// to treat reg1 + reg2 + ... the same way as reg1 + 1*reg2 + ... 487 /// On the other hand, 1*reg should be canonicalized into reg. 488 void Formula::canonicalize(const Loop &L) { 489 if (isCanonical(L)) 490 return; 491 // So far we did not need this case. This is easy to implement but it is 492 // useless to maintain dead code. Beside it could hurt compile time. 493 assert(!BaseRegs.empty() && "1*reg => reg, should not be needed."); 494 495 // Keep the invariant sum in BaseRegs and one of the variant sum in ScaledReg. 496 if (!ScaledReg) { 497 ScaledReg = BaseRegs.back(); 498 BaseRegs.pop_back(); 499 Scale = 1; 500 } 501 502 // If ScaledReg is an invariant with respect to L, find the reg from 503 // BaseRegs containing the recurrent expr related with Loop L. Swap the 504 // reg with ScaledReg. 
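  // For example (hypothetical registers): with ScaledReg = %n (invariant in
  // L) and BaseRegs = { {0,+,1}<%L> }, the swap below yields
  // ScaledReg = {0,+,1}<%L> and BaseRegs = { %n }, satisfying rule #3 of the
  // canonical form described on Formula::BaseRegs.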
  const SCEVAddRecExpr *SAR = dyn_cast<const SCEVAddRecExpr>(ScaledReg);
  if (!SAR || SAR->getLoop() != &L) {
    auto I = find_if(make_range(BaseRegs.begin(), BaseRegs.end()),
                     [&](const SCEV *S) {
                       return isa<const SCEVAddRecExpr>(S) &&
                              (cast<SCEVAddRecExpr>(S)->getLoop() == &L);
                     });
    if (I != BaseRegs.end())
      std::swap(ScaledReg, *I);
  }
}

/// Get rid of the scale in the formula.
/// In other words, this method morphs reg1 + 1*reg2 into reg1 + reg2.
/// \return true if it was possible to get rid of the scale, false otherwise.
/// \note After this operation the formula may not be in the canonical form.
bool Formula::unscale() {
  if (Scale != 1)
    return false;
  Scale = 0;
  BaseRegs.push_back(ScaledReg);
  ScaledReg = nullptr;
  return true;
}

bool Formula::hasZeroEnd() const {
  if (UnfoldedOffset || BaseOffset)
    return false;
  if (BaseRegs.size() != 1 || ScaledReg)
    return false;
  return true;
}

/// Return the total number of register operands used by this formula. This
/// does not include register uses implied by non-constant addrec strides.
size_t Formula::getNumRegs() const {
  return !!ScaledReg + BaseRegs.size();
}

/// Return the type of this formula, if it has one, or null otherwise. This
/// type is meaningless except for the bit size.
Type *Formula::getType() const {
  return !BaseRegs.empty() ? BaseRegs.front()->getType() :
         ScaledReg ? ScaledReg->getType() :
         BaseGV ? BaseGV->getType() :
         nullptr;
}

/// Delete the given base reg from the BaseRegs list.
void Formula::deleteBaseReg(const SCEV *&S) {
  if (&S != &BaseRegs.back())
    std::swap(S, BaseRegs.back());
  BaseRegs.pop_back();
}

/// Test if this formula references the given register.
bool Formula::referencesReg(const SCEV *S) const {
  return S == ScaledReg || is_contained(BaseRegs, S);
}

/// Test whether this formula uses registers which are used by uses other than
/// the use with the given index.
567 bool Formula::hasRegsUsedByUsesOtherThan(size_t LUIdx, 568 const RegUseTracker &RegUses) const { 569 if (ScaledReg) 570 if (RegUses.isRegUsedByUsesOtherThan(ScaledReg, LUIdx)) 571 return true; 572 for (const SCEV *BaseReg : BaseRegs) 573 if (RegUses.isRegUsedByUsesOtherThan(BaseReg, LUIdx)) 574 return true; 575 return false; 576 } 577 578 #if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP) 579 void Formula::print(raw_ostream &OS) const { 580 bool First = true; 581 if (BaseGV) { 582 if (!First) OS << " + "; else First = false; 583 BaseGV->printAsOperand(OS, /*PrintType=*/false); 584 } 585 if (BaseOffset != 0) { 586 if (!First) OS << " + "; else First = false; 587 OS << BaseOffset; 588 } 589 for (const SCEV *BaseReg : BaseRegs) { 590 if (!First) OS << " + "; else First = false; 591 OS << "reg(" << *BaseReg << ')'; 592 } 593 if (HasBaseReg && BaseRegs.empty()) { 594 if (!First) OS << " + "; else First = false; 595 OS << "**error: HasBaseReg**"; 596 } else if (!HasBaseReg && !BaseRegs.empty()) { 597 if (!First) OS << " + "; else First = false; 598 OS << "**error: !HasBaseReg**"; 599 } 600 if (Scale != 0) { 601 if (!First) OS << " + "; else First = false; 602 OS << Scale << "*reg("; 603 if (ScaledReg) 604 OS << *ScaledReg; 605 else 606 OS << "<unknown>"; 607 OS << ')'; 608 } 609 if (UnfoldedOffset != 0) { 610 if (!First) OS << " + "; 611 OS << "imm(" << UnfoldedOffset << ')'; 612 } 613 } 614 615 LLVM_DUMP_METHOD void Formula::dump() const { 616 print(errs()); errs() << '\n'; 617 } 618 #endif 619 620 /// Return true if the given addrec can be sign-extended without changing its 621 /// value. 622 static bool isAddRecSExtable(const SCEVAddRecExpr *AR, ScalarEvolution &SE) { 623 Type *WideTy = 624 IntegerType::get(SE.getContext(), SE.getTypeSizeInBits(AR->getType()) + 1); 625 return isa<SCEVAddRecExpr>(SE.getSignExtendExpr(AR, WideTy)); 626 } 627 628 /// Return true if the given add can be sign-extended without changing its 629 /// value. 630 static bool isAddSExtable(const SCEVAddExpr *A, ScalarEvolution &SE) { 631 Type *WideTy = 632 IntegerType::get(SE.getContext(), SE.getTypeSizeInBits(A->getType()) + 1); 633 return isa<SCEVAddExpr>(SE.getSignExtendExpr(A, WideTy)); 634 } 635 636 /// Return true if the given mul can be sign-extended without changing its 637 /// value. 638 static bool isMulSExtable(const SCEVMulExpr *M, ScalarEvolution &SE) { 639 Type *WideTy = 640 IntegerType::get(SE.getContext(), 641 SE.getTypeSizeInBits(M->getType()) * M->getNumOperands()); 642 return isa<SCEVMulExpr>(SE.getSignExtendExpr(M, WideTy)); 643 } 644 645 /// Return an expression for LHS /s RHS, if it can be determined and if the 646 /// remainder is known to be zero, or null otherwise. If IgnoreSignificantBits 647 /// is true, expressions like (X * Y) /s Y are simplified to Y, ignoring that 648 /// the multiplication may overflow, which is useful when the result will be 649 /// used in a context where the most significant bits are ignored. 650 static const SCEV *getExactSDiv(const SCEV *LHS, const SCEV *RHS, 651 ScalarEvolution &SE, 652 bool IgnoreSignificantBits = false) { 653 // Handle the trivial case, which works for any SCEV type. 654 if (LHS == RHS) 655 return SE.getConstant(LHS->getType(), 1); 656 657 // Handle a few RHS special cases. 658 const SCEVConstant *RC = dyn_cast<SCEVConstant>(RHS); 659 if (RC) { 660 const APInt &RA = RC->getAPInt(); 661 // Handle x /s -1 as x * -1, to give ScalarEvolution a chance to do 662 // some folding. 
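    // For example (hypothetical operands), (8 * %x) /s -1 is rewritten as
    // (8 * %x) * -1, which ScalarEvolution folds to (-8 * %x).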
663 if (RA.isAllOnesValue()) 664 return SE.getMulExpr(LHS, RC); 665 // Handle x /s 1 as x. 666 if (RA == 1) 667 return LHS; 668 } 669 670 // Check for a division of a constant by a constant. 671 if (const SCEVConstant *C = dyn_cast<SCEVConstant>(LHS)) { 672 if (!RC) 673 return nullptr; 674 const APInt &LA = C->getAPInt(); 675 const APInt &RA = RC->getAPInt(); 676 if (LA.srem(RA) != 0) 677 return nullptr; 678 return SE.getConstant(LA.sdiv(RA)); 679 } 680 681 // Distribute the sdiv over addrec operands, if the addrec doesn't overflow. 682 if (const SCEVAddRecExpr *AR = dyn_cast<SCEVAddRecExpr>(LHS)) { 683 if ((IgnoreSignificantBits || isAddRecSExtable(AR, SE)) && AR->isAffine()) { 684 const SCEV *Step = getExactSDiv(AR->getStepRecurrence(SE), RHS, SE, 685 IgnoreSignificantBits); 686 if (!Step) return nullptr; 687 const SCEV *Start = getExactSDiv(AR->getStart(), RHS, SE, 688 IgnoreSignificantBits); 689 if (!Start) return nullptr; 690 // FlagNW is independent of the start value, step direction, and is 691 // preserved with smaller magnitude steps. 692 // FIXME: AR->getNoWrapFlags(SCEV::FlagNW) 693 return SE.getAddRecExpr(Start, Step, AR->getLoop(), SCEV::FlagAnyWrap); 694 } 695 return nullptr; 696 } 697 698 // Distribute the sdiv over add operands, if the add doesn't overflow. 699 if (const SCEVAddExpr *Add = dyn_cast<SCEVAddExpr>(LHS)) { 700 if (IgnoreSignificantBits || isAddSExtable(Add, SE)) { 701 SmallVector<const SCEV *, 8> Ops; 702 for (const SCEV *S : Add->operands()) { 703 const SCEV *Op = getExactSDiv(S, RHS, SE, IgnoreSignificantBits); 704 if (!Op) return nullptr; 705 Ops.push_back(Op); 706 } 707 return SE.getAddExpr(Ops); 708 } 709 return nullptr; 710 } 711 712 // Check for a multiply operand that we can pull RHS out of. 713 if (const SCEVMulExpr *Mul = dyn_cast<SCEVMulExpr>(LHS)) { 714 if (IgnoreSignificantBits || isMulSExtable(Mul, SE)) { 715 SmallVector<const SCEV *, 4> Ops; 716 bool Found = false; 717 for (const SCEV *S : Mul->operands()) { 718 if (!Found) 719 if (const SCEV *Q = getExactSDiv(S, RHS, SE, 720 IgnoreSignificantBits)) { 721 S = Q; 722 Found = true; 723 } 724 Ops.push_back(S); 725 } 726 return Found ? SE.getMulExpr(Ops) : nullptr; 727 } 728 return nullptr; 729 } 730 731 // Otherwise we don't know. 732 return nullptr; 733 } 734 735 /// If S involves the addition of a constant integer value, return that integer 736 /// value, and mutate S to point to a new SCEV with that value excluded. 737 static int64_t ExtractImmediate(const SCEV *&S, ScalarEvolution &SE) { 738 if (const SCEVConstant *C = dyn_cast<SCEVConstant>(S)) { 739 if (C->getAPInt().getMinSignedBits() <= 64) { 740 S = SE.getConstant(C->getType(), 0); 741 return C->getValue()->getSExtValue(); 742 } 743 } else if (const SCEVAddExpr *Add = dyn_cast<SCEVAddExpr>(S)) { 744 SmallVector<const SCEV *, 8> NewOps(Add->op_begin(), Add->op_end()); 745 int64_t Result = ExtractImmediate(NewOps.front(), SE); 746 if (Result != 0) 747 S = SE.getAddExpr(NewOps); 748 return Result; 749 } else if (const SCEVAddRecExpr *AR = dyn_cast<SCEVAddRecExpr>(S)) { 750 SmallVector<const SCEV *, 8> NewOps(AR->op_begin(), AR->op_end()); 751 int64_t Result = ExtractImmediate(NewOps.front(), SE); 752 if (Result != 0) 753 S = SE.getAddRecExpr(NewOps, AR->getLoop(), 754 // FIXME: AR->getNoWrapFlags(SCEV::FlagNW) 755 SCEV::FlagAnyWrap); 756 return Result; 757 } 758 return 0; 759 } 760 761 /// If S involves the addition of a GlobalValue address, return that symbol, and 762 /// mutate S to point to a new SCEV with that value excluded. 
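/// For example (with a hypothetical global @g), if @g appears as an add
/// operand of S, it can be returned here and stripped from S, allowing a
/// formula to carry it in its BaseGV field instead of in a register.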
763 static GlobalValue *ExtractSymbol(const SCEV *&S, ScalarEvolution &SE) { 764 if (const SCEVUnknown *U = dyn_cast<SCEVUnknown>(S)) { 765 if (GlobalValue *GV = dyn_cast<GlobalValue>(U->getValue())) { 766 S = SE.getConstant(GV->getType(), 0); 767 return GV; 768 } 769 } else if (const SCEVAddExpr *Add = dyn_cast<SCEVAddExpr>(S)) { 770 SmallVector<const SCEV *, 8> NewOps(Add->op_begin(), Add->op_end()); 771 GlobalValue *Result = ExtractSymbol(NewOps.back(), SE); 772 if (Result) 773 S = SE.getAddExpr(NewOps); 774 return Result; 775 } else if (const SCEVAddRecExpr *AR = dyn_cast<SCEVAddRecExpr>(S)) { 776 SmallVector<const SCEV *, 8> NewOps(AR->op_begin(), AR->op_end()); 777 GlobalValue *Result = ExtractSymbol(NewOps.front(), SE); 778 if (Result) 779 S = SE.getAddRecExpr(NewOps, AR->getLoop(), 780 // FIXME: AR->getNoWrapFlags(SCEV::FlagNW) 781 SCEV::FlagAnyWrap); 782 return Result; 783 } 784 return nullptr; 785 } 786 787 /// Returns true if the specified instruction is using the specified value as an 788 /// address. 789 static bool isAddressUse(const TargetTransformInfo &TTI, 790 Instruction *Inst, Value *OperandVal) { 791 bool isAddress = isa<LoadInst>(Inst); 792 if (StoreInst *SI = dyn_cast<StoreInst>(Inst)) { 793 if (SI->getPointerOperand() == OperandVal) 794 isAddress = true; 795 } else if (IntrinsicInst *II = dyn_cast<IntrinsicInst>(Inst)) { 796 // Addressing modes can also be folded into prefetches and a variety 797 // of intrinsics. 798 switch (II->getIntrinsicID()) { 799 case Intrinsic::memset: 800 case Intrinsic::prefetch: 801 if (II->getArgOperand(0) == OperandVal) 802 isAddress = true; 803 break; 804 case Intrinsic::memmove: 805 case Intrinsic::memcpy: 806 if (II->getArgOperand(0) == OperandVal || 807 II->getArgOperand(1) == OperandVal) 808 isAddress = true; 809 break; 810 default: { 811 MemIntrinsicInfo IntrInfo; 812 if (TTI.getTgtMemIntrinsic(II, IntrInfo)) { 813 if (IntrInfo.PtrVal == OperandVal) 814 isAddress = true; 815 } 816 } 817 } 818 } else if (AtomicRMWInst *RMW = dyn_cast<AtomicRMWInst>(Inst)) { 819 if (RMW->getPointerOperand() == OperandVal) 820 isAddress = true; 821 } else if (AtomicCmpXchgInst *CmpX = dyn_cast<AtomicCmpXchgInst>(Inst)) { 822 if (CmpX->getPointerOperand() == OperandVal) 823 isAddress = true; 824 } 825 return isAddress; 826 } 827 828 /// Return the type of the memory being accessed. 
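/// For example (hypothetical IR), for "store i32 %v, i32 addrspace(1)* %p"
/// this returns the stored type i32 together with address space 1, taken
/// from the store's pointer operand.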
829 static MemAccessTy getAccessType(const TargetTransformInfo &TTI, 830 Instruction *Inst, Value *OperandVal) { 831 MemAccessTy AccessTy(Inst->getType(), MemAccessTy::UnknownAddressSpace); 832 if (const StoreInst *SI = dyn_cast<StoreInst>(Inst)) { 833 AccessTy.MemTy = SI->getOperand(0)->getType(); 834 AccessTy.AddrSpace = SI->getPointerAddressSpace(); 835 } else if (const LoadInst *LI = dyn_cast<LoadInst>(Inst)) { 836 AccessTy.AddrSpace = LI->getPointerAddressSpace(); 837 } else if (const AtomicRMWInst *RMW = dyn_cast<AtomicRMWInst>(Inst)) { 838 AccessTy.AddrSpace = RMW->getPointerAddressSpace(); 839 } else if (const AtomicCmpXchgInst *CmpX = dyn_cast<AtomicCmpXchgInst>(Inst)) { 840 AccessTy.AddrSpace = CmpX->getPointerAddressSpace(); 841 } else if (IntrinsicInst *II = dyn_cast<IntrinsicInst>(Inst)) { 842 switch (II->getIntrinsicID()) { 843 case Intrinsic::prefetch: 844 case Intrinsic::memset: 845 AccessTy.AddrSpace = II->getArgOperand(0)->getType()->getPointerAddressSpace(); 846 AccessTy.MemTy = OperandVal->getType(); 847 break; 848 case Intrinsic::memmove: 849 case Intrinsic::memcpy: 850 AccessTy.AddrSpace = OperandVal->getType()->getPointerAddressSpace(); 851 AccessTy.MemTy = OperandVal->getType(); 852 break; 853 default: { 854 MemIntrinsicInfo IntrInfo; 855 if (TTI.getTgtMemIntrinsic(II, IntrInfo) && IntrInfo.PtrVal) { 856 AccessTy.AddrSpace 857 = IntrInfo.PtrVal->getType()->getPointerAddressSpace(); 858 } 859 860 break; 861 } 862 } 863 } 864 865 // All pointers have the same requirements, so canonicalize them to an 866 // arbitrary pointer type to minimize variation. 867 if (PointerType *PTy = dyn_cast<PointerType>(AccessTy.MemTy)) 868 AccessTy.MemTy = PointerType::get(IntegerType::get(PTy->getContext(), 1), 869 PTy->getAddressSpace()); 870 871 return AccessTy; 872 } 873 874 /// Return true if this AddRec is already a phi in its loop. 875 static bool isExistingPhi(const SCEVAddRecExpr *AR, ScalarEvolution &SE) { 876 for (PHINode &PN : AR->getLoop()->getHeader()->phis()) { 877 if (SE.isSCEVable(PN.getType()) && 878 (SE.getEffectiveSCEVType(PN.getType()) == 879 SE.getEffectiveSCEVType(AR->getType())) && 880 SE.getSCEV(&PN) == AR) 881 return true; 882 } 883 return false; 884 } 885 886 /// Check if expanding this expression is likely to incur significant cost. This 887 /// is tricky because SCEV doesn't track which expressions are actually computed 888 /// by the current IR. 889 /// 890 /// We currently allow expansion of IV increments that involve adds, 891 /// multiplication by constants, and AddRecs from existing phis. 892 /// 893 /// TODO: Allow UDivExpr if we can find an existing IV increment that is an 894 /// obvious multiple of the UDivExpr. 
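/// For example (hypothetical IR), expanding {%base,+,4}<%L> is treated as
/// cheap when a phi in the loop header already computes that recurrence,
/// while an expression containing a UDivExpr is conservatively reported as
/// high cost.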
895 static bool isHighCostExpansion(const SCEV *S, 896 SmallPtrSetImpl<const SCEV*> &Processed, 897 ScalarEvolution &SE) { 898 // Zero/One operand expressions 899 switch (S->getSCEVType()) { 900 case scUnknown: 901 case scConstant: 902 return false; 903 case scTruncate: 904 return isHighCostExpansion(cast<SCEVTruncateExpr>(S)->getOperand(), 905 Processed, SE); 906 case scZeroExtend: 907 return isHighCostExpansion(cast<SCEVZeroExtendExpr>(S)->getOperand(), 908 Processed, SE); 909 case scSignExtend: 910 return isHighCostExpansion(cast<SCEVSignExtendExpr>(S)->getOperand(), 911 Processed, SE); 912 } 913 914 if (!Processed.insert(S).second) 915 return false; 916 917 if (const SCEVAddExpr *Add = dyn_cast<SCEVAddExpr>(S)) { 918 for (const SCEV *S : Add->operands()) { 919 if (isHighCostExpansion(S, Processed, SE)) 920 return true; 921 } 922 return false; 923 } 924 925 if (const SCEVMulExpr *Mul = dyn_cast<SCEVMulExpr>(S)) { 926 if (Mul->getNumOperands() == 2) { 927 // Multiplication by a constant is ok 928 if (isa<SCEVConstant>(Mul->getOperand(0))) 929 return isHighCostExpansion(Mul->getOperand(1), Processed, SE); 930 931 // If we have the value of one operand, check if an existing 932 // multiplication already generates this expression. 933 if (const SCEVUnknown *U = dyn_cast<SCEVUnknown>(Mul->getOperand(1))) { 934 Value *UVal = U->getValue(); 935 for (User *UR : UVal->users()) { 936 // If U is a constant, it may be used by a ConstantExpr. 937 Instruction *UI = dyn_cast<Instruction>(UR); 938 if (UI && UI->getOpcode() == Instruction::Mul && 939 SE.isSCEVable(UI->getType())) { 940 return SE.getSCEV(UI) == Mul; 941 } 942 } 943 } 944 } 945 } 946 947 if (const SCEVAddRecExpr *AR = dyn_cast<SCEVAddRecExpr>(S)) { 948 if (isExistingPhi(AR, SE)) 949 return false; 950 } 951 952 // Fow now, consider any other type of expression (div/mul/min/max) high cost. 953 return true; 954 } 955 956 /// If any of the instructions in the specified set are trivially dead, delete 957 /// them and see if this makes any of their operands subsequently dead. 958 static bool 959 DeleteTriviallyDeadInstructions(SmallVectorImpl<WeakTrackingVH> &DeadInsts) { 960 bool Changed = false; 961 962 while (!DeadInsts.empty()) { 963 Value *V = DeadInsts.pop_back_val(); 964 Instruction *I = dyn_cast_or_null<Instruction>(V); 965 966 if (!I || !isInstructionTriviallyDead(I)) 967 continue; 968 969 for (Use &O : I->operands()) 970 if (Instruction *U = dyn_cast<Instruction>(O)) { 971 O = nullptr; 972 if (U->use_empty()) 973 DeadInsts.emplace_back(U); 974 } 975 976 I->eraseFromParent(); 977 Changed = true; 978 } 979 980 return Changed; 981 } 982 983 namespace { 984 985 class LSRUse; 986 987 } // end anonymous namespace 988 989 /// Check if the addressing mode defined by \p F is completely 990 /// folded in \p LU at isel time. 991 /// This includes address-mode folding and special icmp tricks. 992 /// This function returns true if \p LU can accommodate what \p F 993 /// defines and up to 1 base + 1 scaled + offset. 994 /// In other words, if \p F has several base registers, this function may 995 /// still return true. Therefore, users still need to account for 996 /// additional base registers and/or unfolded offsets to derive an 997 /// accurate cost model. 998 static bool isAMCompletelyFolded(const TargetTransformInfo &TTI, 999 const LSRUse &LU, const Formula &F); 1000 1001 // Get the cost of the scaling factor used in F for LU. 
1002 static unsigned getScalingFactorCost(const TargetTransformInfo &TTI, 1003 const LSRUse &LU, const Formula &F, 1004 const Loop &L); 1005 1006 namespace { 1007 1008 /// This class is used to measure and compare candidate formulae. 1009 class Cost { 1010 TargetTransformInfo::LSRCost C; 1011 1012 public: 1013 Cost() { 1014 C.Insns = 0; 1015 C.NumRegs = 0; 1016 C.AddRecCost = 0; 1017 C.NumIVMuls = 0; 1018 C.NumBaseAdds = 0; 1019 C.ImmCost = 0; 1020 C.SetupCost = 0; 1021 C.ScaleCost = 0; 1022 } 1023 1024 bool isLess(Cost &Other, const TargetTransformInfo &TTI); 1025 1026 void Lose(); 1027 1028 #ifndef NDEBUG 1029 // Once any of the metrics loses, they must all remain losers. 1030 bool isValid() { 1031 return ((C.Insns | C.NumRegs | C.AddRecCost | C.NumIVMuls | C.NumBaseAdds 1032 | C.ImmCost | C.SetupCost | C.ScaleCost) != ~0u) 1033 || ((C.Insns & C.NumRegs & C.AddRecCost & C.NumIVMuls & C.NumBaseAdds 1034 & C.ImmCost & C.SetupCost & C.ScaleCost) == ~0u); 1035 } 1036 #endif 1037 1038 bool isLoser() { 1039 assert(isValid() && "invalid cost"); 1040 return C.NumRegs == ~0u; 1041 } 1042 1043 void RateFormula(const TargetTransformInfo &TTI, 1044 const Formula &F, 1045 SmallPtrSetImpl<const SCEV *> &Regs, 1046 const DenseSet<const SCEV *> &VisitedRegs, 1047 const Loop *L, 1048 ScalarEvolution &SE, DominatorTree &DT, 1049 const LSRUse &LU, 1050 SmallPtrSetImpl<const SCEV *> *LoserRegs = nullptr); 1051 1052 void print(raw_ostream &OS) const; 1053 void dump() const; 1054 1055 private: 1056 void RateRegister(const SCEV *Reg, 1057 SmallPtrSetImpl<const SCEV *> &Regs, 1058 const Loop *L, 1059 ScalarEvolution &SE, DominatorTree &DT, 1060 const TargetTransformInfo &TTI); 1061 void RatePrimaryRegister(const SCEV *Reg, 1062 SmallPtrSetImpl<const SCEV *> &Regs, 1063 const Loop *L, 1064 ScalarEvolution &SE, DominatorTree &DT, 1065 SmallPtrSetImpl<const SCEV *> *LoserRegs, 1066 const TargetTransformInfo &TTI); 1067 }; 1068 1069 /// An operand value in an instruction which is to be replaced with some 1070 /// equivalent, possibly strength-reduced, replacement. 1071 struct LSRFixup { 1072 /// The instruction which will be updated. 1073 Instruction *UserInst = nullptr; 1074 1075 /// The operand of the instruction which will be replaced. The operand may be 1076 /// used more than once; every instance will be replaced. 1077 Value *OperandValToReplace = nullptr; 1078 1079 /// If this user is to use the post-incremented value of an induction 1080 /// variable, this set is non-empty and holds the loops associated with the 1081 /// induction variable. 1082 PostIncLoopSet PostIncLoops; 1083 1084 /// A constant offset to be added to the LSRUse expression. This allows 1085 /// multiple fixups to share the same LSRUse with different offsets, for 1086 /// example in an unrolled loop. 1087 int64_t Offset = 0; 1088 1089 LSRFixup() = default; 1090 1091 bool isUseFullyOutsideLoop(const Loop *L) const; 1092 1093 void print(raw_ostream &OS) const; 1094 void dump() const; 1095 }; 1096 1097 /// A DenseMapInfo implementation for holding DenseMaps and DenseSets of sorted 1098 /// SmallVectors of const SCEV*. 
1099 struct UniquifierDenseMapInfo { 1100 static SmallVector<const SCEV *, 4> getEmptyKey() { 1101 SmallVector<const SCEV *, 4> V; 1102 V.push_back(reinterpret_cast<const SCEV *>(-1)); 1103 return V; 1104 } 1105 1106 static SmallVector<const SCEV *, 4> getTombstoneKey() { 1107 SmallVector<const SCEV *, 4> V; 1108 V.push_back(reinterpret_cast<const SCEV *>(-2)); 1109 return V; 1110 } 1111 1112 static unsigned getHashValue(const SmallVector<const SCEV *, 4> &V) { 1113 return static_cast<unsigned>(hash_combine_range(V.begin(), V.end())); 1114 } 1115 1116 static bool isEqual(const SmallVector<const SCEV *, 4> &LHS, 1117 const SmallVector<const SCEV *, 4> &RHS) { 1118 return LHS == RHS; 1119 } 1120 }; 1121 1122 /// This class holds the state that LSR keeps for each use in IVUsers, as well 1123 /// as uses invented by LSR itself. It includes information about what kinds of 1124 /// things can be folded into the user, information about the user itself, and 1125 /// information about how the use may be satisfied. TODO: Represent multiple 1126 /// users of the same expression in common? 1127 class LSRUse { 1128 DenseSet<SmallVector<const SCEV *, 4>, UniquifierDenseMapInfo> Uniquifier; 1129 1130 public: 1131 /// An enum for a kind of use, indicating what types of scaled and immediate 1132 /// operands it might support. 1133 enum KindType { 1134 Basic, ///< A normal use, with no folding. 1135 Special, ///< A special case of basic, allowing -1 scales. 1136 Address, ///< An address use; folding according to TargetLowering 1137 ICmpZero ///< An equality icmp with both operands folded into one. 1138 // TODO: Add a generic icmp too? 1139 }; 1140 1141 using SCEVUseKindPair = PointerIntPair<const SCEV *, 2, KindType>; 1142 1143 KindType Kind; 1144 MemAccessTy AccessTy; 1145 1146 /// The list of operands which are to be replaced. 1147 SmallVector<LSRFixup, 8> Fixups; 1148 1149 /// Keep track of the min and max offsets of the fixups. 1150 int64_t MinOffset = std::numeric_limits<int64_t>::max(); 1151 int64_t MaxOffset = std::numeric_limits<int64_t>::min(); 1152 1153 /// This records whether all of the fixups using this LSRUse are outside of 1154 /// the loop, in which case some special-case heuristics may be used. 1155 bool AllFixupsOutsideLoop = true; 1156 1157 /// RigidFormula is set to true to guarantee that this use will be associated 1158 /// with a single formula--the one that initially matched. Some SCEV 1159 /// expressions cannot be expanded. This allows LSR to consider the registers 1160 /// used by those expressions without the need to expand them later after 1161 /// changing the formula. 1162 bool RigidFormula = false; 1163 1164 /// This records the widest use type for any fixup using this 1165 /// LSRUse. FindUseWithSimilarFormula can't consider uses with different max 1166 /// fixup widths to be equivalent, because the narrower one may be relying on 1167 /// the implicit truncation to truncate away bogus bits. 1168 Type *WidestFixupType = nullptr; 1169 1170 /// A list of ways to build a value that can satisfy this user. After the 1171 /// list is populated, one of these is selected heuristically and used to 1172 /// formulate a replacement for OperandValToReplace in UserInst. 1173 SmallVector<Formula, 12> Formulae; 1174 1175 /// The set of register candidates used by all formulae in this LSRUse. 
1176 SmallPtrSet<const SCEV *, 4> Regs; 1177 1178 LSRUse(KindType K, MemAccessTy AT) : Kind(K), AccessTy(AT) {} 1179 1180 LSRFixup &getNewFixup() { 1181 Fixups.push_back(LSRFixup()); 1182 return Fixups.back(); 1183 } 1184 1185 void pushFixup(LSRFixup &f) { 1186 Fixups.push_back(f); 1187 if (f.Offset > MaxOffset) 1188 MaxOffset = f.Offset; 1189 if (f.Offset < MinOffset) 1190 MinOffset = f.Offset; 1191 } 1192 1193 bool HasFormulaWithSameRegs(const Formula &F) const; 1194 float getNotSelectedProbability(const SCEV *Reg) const; 1195 bool InsertFormula(const Formula &F, const Loop &L); 1196 void DeleteFormula(Formula &F); 1197 void RecomputeRegs(size_t LUIdx, RegUseTracker &Reguses); 1198 1199 void print(raw_ostream &OS) const; 1200 void dump() const; 1201 }; 1202 1203 } // end anonymous namespace 1204 1205 static bool isAMCompletelyFolded(const TargetTransformInfo &TTI, 1206 LSRUse::KindType Kind, MemAccessTy AccessTy, 1207 GlobalValue *BaseGV, int64_t BaseOffset, 1208 bool HasBaseReg, int64_t Scale, 1209 Instruction *Fixup = nullptr); 1210 1211 /// Tally up interesting quantities from the given register. 1212 void Cost::RateRegister(const SCEV *Reg, 1213 SmallPtrSetImpl<const SCEV *> &Regs, 1214 const Loop *L, 1215 ScalarEvolution &SE, DominatorTree &DT, 1216 const TargetTransformInfo &TTI) { 1217 if (const SCEVAddRecExpr *AR = dyn_cast<SCEVAddRecExpr>(Reg)) { 1218 // If this is an addrec for another loop, it should be an invariant 1219 // with respect to L since L is the innermost loop (at least 1220 // for now LSR only handles innermost loops). 1221 if (AR->getLoop() != L) { 1222 // If the AddRec exists, consider it's register free and leave it alone. 1223 if (isExistingPhi(AR, SE)) 1224 return; 1225 1226 // It is bad to allow LSR for current loop to add induction variables 1227 // for its sibling loops. 1228 if (!AR->getLoop()->contains(L)) { 1229 Lose(); 1230 return; 1231 } 1232 1233 // Otherwise, it will be an invariant with respect to Loop L. 1234 ++C.NumRegs; 1235 return; 1236 } 1237 1238 unsigned LoopCost = 1; 1239 if (TTI.shouldFavorPostInc()) { 1240 const SCEV *LoopStep = AR->getStepRecurrence(SE); 1241 if (isa<SCEVConstant>(LoopStep)) { 1242 // Check if a post-indexed load/store can be used. 1243 if (TTI.isIndexedLoadLegal(TTI.MIM_PostInc, AR->getType()) || 1244 TTI.isIndexedStoreLegal(TTI.MIM_PostInc, AR->getType())) { 1245 const SCEV *LoopStart = AR->getStart(); 1246 if (!isa<SCEVConstant>(LoopStart) && 1247 SE.isLoopInvariant(LoopStart, L)) 1248 LoopCost = 0; 1249 } 1250 } 1251 } 1252 C.AddRecCost += LoopCost; 1253 1254 // Add the step value register, if it needs one. 1255 // TODO: The non-affine case isn't precisely modeled here. 1256 if (!AR->isAffine() || !isa<SCEVConstant>(AR->getOperand(1))) { 1257 if (!Regs.count(AR->getOperand(1))) { 1258 RateRegister(AR->getOperand(1), Regs, L, SE, DT, TTI); 1259 if (isLoser()) 1260 return; 1261 } 1262 } 1263 } 1264 ++C.NumRegs; 1265 1266 // Rough heuristic; favor registers which don't require extra setup 1267 // instructions in the preheader. 1268 if (!isa<SCEVUnknown>(Reg) && 1269 !isa<SCEVConstant>(Reg) && 1270 !(isa<SCEVAddRecExpr>(Reg) && 1271 (isa<SCEVUnknown>(cast<SCEVAddRecExpr>(Reg)->getStart()) || 1272 isa<SCEVConstant>(cast<SCEVAddRecExpr>(Reg)->getStart())))) 1273 ++C.SetupCost; 1274 1275 C.NumIVMuls += isa<SCEVMulExpr>(Reg) && 1276 SE.hasComputableLoopEvolution(Reg, L); 1277 } 1278 1279 /// Record this register in the set. If we haven't seen it before, rate 1280 /// it. 
Optional LoserRegs provides a way to declare any formula that refers to 1281 /// one of those regs an instant loser. 1282 void Cost::RatePrimaryRegister(const SCEV *Reg, 1283 SmallPtrSetImpl<const SCEV *> &Regs, 1284 const Loop *L, 1285 ScalarEvolution &SE, DominatorTree &DT, 1286 SmallPtrSetImpl<const SCEV *> *LoserRegs, 1287 const TargetTransformInfo &TTI) { 1288 if (LoserRegs && LoserRegs->count(Reg)) { 1289 Lose(); 1290 return; 1291 } 1292 if (Regs.insert(Reg).second) { 1293 RateRegister(Reg, Regs, L, SE, DT, TTI); 1294 if (LoserRegs && isLoser()) 1295 LoserRegs->insert(Reg); 1296 } 1297 } 1298 1299 void Cost::RateFormula(const TargetTransformInfo &TTI, 1300 const Formula &F, 1301 SmallPtrSetImpl<const SCEV *> &Regs, 1302 const DenseSet<const SCEV *> &VisitedRegs, 1303 const Loop *L, 1304 ScalarEvolution &SE, DominatorTree &DT, 1305 const LSRUse &LU, 1306 SmallPtrSetImpl<const SCEV *> *LoserRegs) { 1307 assert(F.isCanonical(*L) && "Cost is accurate only for canonical formula"); 1308 // Tally up the registers. 1309 unsigned PrevAddRecCost = C.AddRecCost; 1310 unsigned PrevNumRegs = C.NumRegs; 1311 unsigned PrevNumBaseAdds = C.NumBaseAdds; 1312 if (const SCEV *ScaledReg = F.ScaledReg) { 1313 if (VisitedRegs.count(ScaledReg)) { 1314 Lose(); 1315 return; 1316 } 1317 RatePrimaryRegister(ScaledReg, Regs, L, SE, DT, LoserRegs, TTI); 1318 if (isLoser()) 1319 return; 1320 } 1321 for (const SCEV *BaseReg : F.BaseRegs) { 1322 if (VisitedRegs.count(BaseReg)) { 1323 Lose(); 1324 return; 1325 } 1326 RatePrimaryRegister(BaseReg, Regs, L, SE, DT, LoserRegs, TTI); 1327 if (isLoser()) 1328 return; 1329 } 1330 1331 // Determine how many (unfolded) adds we'll need inside the loop. 1332 size_t NumBaseParts = F.getNumRegs(); 1333 if (NumBaseParts > 1) 1334 // Do not count the base and a possible second register if the target 1335 // allows to fold 2 registers. 1336 C.NumBaseAdds += 1337 NumBaseParts - (1 + (F.Scale && isAMCompletelyFolded(TTI, LU, F))); 1338 C.NumBaseAdds += (F.UnfoldedOffset != 0); 1339 1340 // Accumulate non-free scaling amounts. 1341 C.ScaleCost += getScalingFactorCost(TTI, LU, F, *L); 1342 1343 // Tally up the non-zero immediates. 1344 for (const LSRFixup &Fixup : LU.Fixups) { 1345 int64_t O = Fixup.Offset; 1346 int64_t Offset = (uint64_t)O + F.BaseOffset; 1347 if (F.BaseGV) 1348 C.ImmCost += 64; // Handle symbolic values conservatively. 1349 // TODO: This should probably be the pointer size. 1350 else if (Offset != 0) 1351 C.ImmCost += APInt(64, Offset, true).getMinSignedBits(); 1352 1353 // Check with target if this offset with this instruction is 1354 // specifically not supported. 1355 if (LU.Kind == LSRUse::Address && Offset != 0 && 1356 !isAMCompletelyFolded(TTI, LSRUse::Address, LU.AccessTy, F.BaseGV, 1357 Offset, F.HasBaseReg, F.Scale, Fixup.UserInst)) 1358 C.NumBaseAdds++; 1359 } 1360 1361 // If we don't count instruction cost exit here. 1362 if (!InsnsCost) { 1363 assert(isValid() && "invalid cost"); 1364 return; 1365 } 1366 1367 // Treat every new register that exceeds TTI.getNumberOfRegisters() - 1 as 1368 // additional instruction (at least fill). 1369 unsigned TTIRegNum = TTI.getNumberOfRegisters(false) - 1; 1370 if (C.NumRegs > TTIRegNum) { 1371 // Cost already exceeded TTIRegNum, then only newly added register can add 1372 // new instructions. 
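    // For example (hypothetical numbers), with TTIRegNum = 7: going from 9
    // registers to 11 charges 2 extra instructions (11 - 9), while going
    // from 5 to 9 also charges 2 (9 - 7), since only registers beyond the
    // target's budget are assumed to need extra fills.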
1373 if (PrevNumRegs > TTIRegNum) 1374 C.Insns += (C.NumRegs - PrevNumRegs); 1375 else 1376 C.Insns += (C.NumRegs - TTIRegNum); 1377 } 1378 1379 // If ICmpZero formula ends with not 0, it could not be replaced by 1380 // just add or sub. We'll need to compare final result of AddRec. 1381 // That means we'll need an additional instruction. But if the target can 1382 // macro-fuse a compare with a branch, don't count this extra instruction. 1383 // For -10 + {0, +, 1}: 1384 // i = i + 1; 1385 // cmp i, 10 1386 // 1387 // For {-10, +, 1}: 1388 // i = i + 1; 1389 if (LU.Kind == LSRUse::ICmpZero && !F.hasZeroEnd() && !TTI.canMacroFuseCmp()) 1390 C.Insns++; 1391 // Each new AddRec adds 1 instruction to calculation. 1392 C.Insns += (C.AddRecCost - PrevAddRecCost); 1393 1394 // BaseAdds adds instructions for unfolded registers. 1395 if (LU.Kind != LSRUse::ICmpZero) 1396 C.Insns += C.NumBaseAdds - PrevNumBaseAdds; 1397 assert(isValid() && "invalid cost"); 1398 } 1399 1400 /// Set this cost to a losing value. 1401 void Cost::Lose() { 1402 C.Insns = std::numeric_limits<unsigned>::max(); 1403 C.NumRegs = std::numeric_limits<unsigned>::max(); 1404 C.AddRecCost = std::numeric_limits<unsigned>::max(); 1405 C.NumIVMuls = std::numeric_limits<unsigned>::max(); 1406 C.NumBaseAdds = std::numeric_limits<unsigned>::max(); 1407 C.ImmCost = std::numeric_limits<unsigned>::max(); 1408 C.SetupCost = std::numeric_limits<unsigned>::max(); 1409 C.ScaleCost = std::numeric_limits<unsigned>::max(); 1410 } 1411 1412 /// Choose the lower cost. 1413 bool Cost::isLess(Cost &Other, const TargetTransformInfo &TTI) { 1414 if (InsnsCost.getNumOccurrences() > 0 && InsnsCost && 1415 C.Insns != Other.C.Insns) 1416 return C.Insns < Other.C.Insns; 1417 return TTI.isLSRCostLess(C, Other.C); 1418 } 1419 1420 #if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP) 1421 void Cost::print(raw_ostream &OS) const { 1422 if (InsnsCost) 1423 OS << C.Insns << " instruction" << (C.Insns == 1 ? " " : "s "); 1424 OS << C.NumRegs << " reg" << (C.NumRegs == 1 ? "" : "s"); 1425 if (C.AddRecCost != 0) 1426 OS << ", with addrec cost " << C.AddRecCost; 1427 if (C.NumIVMuls != 0) 1428 OS << ", plus " << C.NumIVMuls << " IV mul" 1429 << (C.NumIVMuls == 1 ? "" : "s"); 1430 if (C.NumBaseAdds != 0) 1431 OS << ", plus " << C.NumBaseAdds << " base add" 1432 << (C.NumBaseAdds == 1 ? "" : "s"); 1433 if (C.ScaleCost != 0) 1434 OS << ", plus " << C.ScaleCost << " scale cost"; 1435 if (C.ImmCost != 0) 1436 OS << ", plus " << C.ImmCost << " imm cost"; 1437 if (C.SetupCost != 0) 1438 OS << ", plus " << C.SetupCost << " setup cost"; 1439 } 1440 1441 LLVM_DUMP_METHOD void Cost::dump() const { 1442 print(errs()); errs() << '\n'; 1443 } 1444 #endif 1445 1446 /// Test whether this fixup always uses its value outside of the given loop. 1447 bool LSRFixup::isUseFullyOutsideLoop(const Loop *L) const { 1448 // PHI nodes use their value in their incoming blocks. 1449 if (const PHINode *PN = dyn_cast<PHINode>(UserInst)) { 1450 for (unsigned i = 0, e = PN->getNumIncomingValues(); i != e; ++i) 1451 if (PN->getIncomingValue(i) == OperandValToReplace && 1452 L->contains(PN->getIncomingBlock(i))) 1453 return false; 1454 return true; 1455 } 1456 1457 return !L->contains(UserInst); 1458 } 1459 1460 #if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP) 1461 void LSRFixup::print(raw_ostream &OS) const { 1462 OS << "UserInst="; 1463 // Store is common and interesting enough to be worth special-casing. 
1464 if (StoreInst *Store = dyn_cast<StoreInst>(UserInst)) { 1465 OS << "store "; 1466 Store->getOperand(0)->printAsOperand(OS, /*PrintType=*/false); 1467 } else if (UserInst->getType()->isVoidTy()) 1468 OS << UserInst->getOpcodeName(); 1469 else 1470 UserInst->printAsOperand(OS, /*PrintType=*/false); 1471 1472 OS << ", OperandValToReplace="; 1473 OperandValToReplace->printAsOperand(OS, /*PrintType=*/false); 1474 1475 for (const Loop *PIL : PostIncLoops) { 1476 OS << ", PostIncLoop="; 1477 PIL->getHeader()->printAsOperand(OS, /*PrintType=*/false); 1478 } 1479 1480 if (Offset != 0) 1481 OS << ", Offset=" << Offset; 1482 } 1483 1484 LLVM_DUMP_METHOD void LSRFixup::dump() const { 1485 print(errs()); errs() << '\n'; 1486 } 1487 #endif 1488 1489 /// Test whether this use as a formula which has the same registers as the given 1490 /// formula. 1491 bool LSRUse::HasFormulaWithSameRegs(const Formula &F) const { 1492 SmallVector<const SCEV *, 4> Key = F.BaseRegs; 1493 if (F.ScaledReg) Key.push_back(F.ScaledReg); 1494 // Unstable sort by host order ok, because this is only used for uniquifying. 1495 llvm::sort(Key); 1496 return Uniquifier.count(Key); 1497 } 1498 1499 /// The function returns a probability of selecting formula without Reg. 1500 float LSRUse::getNotSelectedProbability(const SCEV *Reg) const { 1501 unsigned FNum = 0; 1502 for (const Formula &F : Formulae) 1503 if (F.referencesReg(Reg)) 1504 FNum++; 1505 return ((float)(Formulae.size() - FNum)) / Formulae.size(); 1506 } 1507 1508 /// If the given formula has not yet been inserted, add it to the list, and 1509 /// return true. Return false otherwise. The formula must be in canonical form. 1510 bool LSRUse::InsertFormula(const Formula &F, const Loop &L) { 1511 assert(F.isCanonical(L) && "Invalid canonical representation"); 1512 1513 if (!Formulae.empty() && RigidFormula) 1514 return false; 1515 1516 SmallVector<const SCEV *, 4> Key = F.BaseRegs; 1517 if (F.ScaledReg) Key.push_back(F.ScaledReg); 1518 // Unstable sort by host order ok, because this is only used for uniquifying. 1519 llvm::sort(Key); 1520 1521 if (!Uniquifier.insert(Key).second) 1522 return false; 1523 1524 // Using a register to hold the value of 0 is not profitable. 1525 assert((!F.ScaledReg || !F.ScaledReg->isZero()) && 1526 "Zero allocated in a scaled register!"); 1527 #ifndef NDEBUG 1528 for (const SCEV *BaseReg : F.BaseRegs) 1529 assert(!BaseReg->isZero() && "Zero allocated in a base register!"); 1530 #endif 1531 1532 // Add the formula to the list. 1533 Formulae.push_back(F); 1534 1535 // Record registers now being used by this use. 1536 Regs.insert(F.BaseRegs.begin(), F.BaseRegs.end()); 1537 if (F.ScaledReg) 1538 Regs.insert(F.ScaledReg); 1539 1540 return true; 1541 } 1542 1543 /// Remove the given formula from this use's list. 1544 void LSRUse::DeleteFormula(Formula &F) { 1545 if (&F != &Formulae.back()) 1546 std::swap(F, Formulae.back()); 1547 Formulae.pop_back(); 1548 } 1549 1550 /// Recompute the Regs field, and update RegUses. 1551 void LSRUse::RecomputeRegs(size_t LUIdx, RegUseTracker &RegUses) { 1552 // Now that we've filtered out some formulae, recompute the Regs set. 1553 SmallPtrSet<const SCEV *, 4> OldRegs = std::move(Regs); 1554 Regs.clear(); 1555 for (const Formula &F : Formulae) { 1556 if (F.ScaledReg) Regs.insert(F.ScaledReg); 1557 Regs.insert(F.BaseRegs.begin(), F.BaseRegs.end()); 1558 } 1559 1560 // Update the RegTracker. 
1561 for (const SCEV *S : OldRegs) 1562 if (!Regs.count(S)) 1563 RegUses.dropRegister(S, LUIdx); 1564 } 1565 1566 #if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP) 1567 void LSRUse::print(raw_ostream &OS) const { 1568 OS << "LSR Use: Kind="; 1569 switch (Kind) { 1570 case Basic: OS << "Basic"; break; 1571 case Special: OS << "Special"; break; 1572 case ICmpZero: OS << "ICmpZero"; break; 1573 case Address: 1574 OS << "Address of "; 1575 if (AccessTy.MemTy->isPointerTy()) 1576 OS << "pointer"; // the full pointer type could be really verbose 1577 else { 1578 OS << *AccessTy.MemTy; 1579 } 1580 1581 OS << " in addrspace(" << AccessTy.AddrSpace << ')'; 1582 } 1583 1584 OS << ", Offsets={"; 1585 bool NeedComma = false; 1586 for (const LSRFixup &Fixup : Fixups) { 1587 if (NeedComma) OS << ','; 1588 OS << Fixup.Offset; 1589 NeedComma = true; 1590 } 1591 OS << '}'; 1592 1593 if (AllFixupsOutsideLoop) 1594 OS << ", all-fixups-outside-loop"; 1595 1596 if (WidestFixupType) 1597 OS << ", widest fixup type: " << *WidestFixupType; 1598 } 1599 1600 LLVM_DUMP_METHOD void LSRUse::dump() const { 1601 print(errs()); errs() << '\n'; 1602 } 1603 #endif 1604 1605 static bool isAMCompletelyFolded(const TargetTransformInfo &TTI, 1606 LSRUse::KindType Kind, MemAccessTy AccessTy, 1607 GlobalValue *BaseGV, int64_t BaseOffset, 1608 bool HasBaseReg, int64_t Scale, 1609 Instruction *Fixup/*= nullptr*/) { 1610 switch (Kind) { 1611 case LSRUse::Address: 1612 return TTI.isLegalAddressingMode(AccessTy.MemTy, BaseGV, BaseOffset, 1613 HasBaseReg, Scale, AccessTy.AddrSpace, Fixup); 1614 1615 case LSRUse::ICmpZero: 1616 // There's not even a target hook for querying whether it would be legal to 1617 // fold a GV into an ICmp. 1618 if (BaseGV) 1619 return false; 1620 1621 // ICmp only has two operands; don't allow more than two non-trivial parts. 1622 if (Scale != 0 && HasBaseReg && BaseOffset != 0) 1623 return false; 1624 1625 // ICmp only supports no scale or a -1 scale, as we can "fold" a -1 scale by 1626 // putting the scaled register in the other operand of the icmp. 1627 if (Scale != 0 && Scale != -1) 1628 return false; 1629 1630 // If we have low-level target information, ask the target if it can fold an 1631 // integer immediate on an icmp. 1632 if (BaseOffset != 0) { 1633 // We have one of: 1634 // ICmpZero BaseReg + BaseOffset => ICmp BaseReg, -BaseOffset 1635 // ICmpZero -1*ScaleReg + BaseOffset => ICmp ScaleReg, BaseOffset 1636 // Offs is the ICmp immediate. 1637 if (Scale == 0) 1638 // The cast does the right thing with 1639 // std::numeric_limits<int64_t>::min(). 1640 BaseOffset = -(uint64_t)BaseOffset; 1641 return TTI.isLegalICmpImmediate(BaseOffset); 1642 } 1643 1644 // ICmpZero BaseReg + -1*ScaleReg => ICmp BaseReg, ScaleReg 1645 return true; 1646 1647 case LSRUse::Basic: 1648 // Only handle single-register values. 1649 return !BaseGV && Scale == 0 && BaseOffset == 0; 1650 1651 case LSRUse::Special: 1652 // Special case Basic to handle -1 scales. 1653 return !BaseGV && (Scale == 0 || Scale == -1) && BaseOffset == 0; 1654 } 1655 1656 llvm_unreachable("Invalid LSRUse Kind!"); 1657 } 1658 1659 static bool isAMCompletelyFolded(const TargetTransformInfo &TTI, 1660 int64_t MinOffset, int64_t MaxOffset, 1661 LSRUse::KindType Kind, MemAccessTy AccessTy, 1662 GlobalValue *BaseGV, int64_t BaseOffset, 1663 bool HasBaseReg, int64_t Scale) { 1664 // Check for overflow. 
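  // For example (hypothetical values), BaseOffset = INT64_MAX with
  // MinOffset = 1 wraps the sum below BaseOffset even though MinOffset is
  // positive; the mismatch between the comparison and the sign of MinOffset
  // flags the signed overflow and we reject the combination.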
1665 if (((int64_t)((uint64_t)BaseOffset + MinOffset) > BaseOffset) !=
1666 (MinOffset > 0))
1667 return false;
1668 MinOffset = (uint64_t)BaseOffset + MinOffset;
1669 if (((int64_t)((uint64_t)BaseOffset + MaxOffset) > BaseOffset) !=
1670 (MaxOffset > 0))
1671 return false;
1672 MaxOffset = (uint64_t)BaseOffset + MaxOffset;
1673
1674 return isAMCompletelyFolded(TTI, Kind, AccessTy, BaseGV, MinOffset,
1675 HasBaseReg, Scale) &&
1676 isAMCompletelyFolded(TTI, Kind, AccessTy, BaseGV, MaxOffset,
1677 HasBaseReg, Scale);
1678 }
1679
1680 static bool isAMCompletelyFolded(const TargetTransformInfo &TTI,
1681 int64_t MinOffset, int64_t MaxOffset,
1682 LSRUse::KindType Kind, MemAccessTy AccessTy,
1683 const Formula &F, const Loop &L) {
1684 // For the purpose of isAMCompletelyFolded, either having a canonical formula
1685 // or a scale not equal to zero is correct.
1686 // Problems may arise from non-canonical formulae having a scale == 0.
1687 // Strictly speaking, it would be best to rely solely on canonical formulae.
1688 // However, when we generate the scaled formulae, we first check that the
1689 // scaling factor is profitable before computing the actual ScaledReg, for
1690 // compile-time's sake.
1691 assert((F.isCanonical(L) || F.Scale != 0));
1692 return isAMCompletelyFolded(TTI, MinOffset, MaxOffset, Kind, AccessTy,
1693 F.BaseGV, F.BaseOffset, F.HasBaseReg, F.Scale);
1694 }
1695
1696 /// Test whether we know how to expand the current formula.
1697 static bool isLegalUse(const TargetTransformInfo &TTI, int64_t MinOffset,
1698 int64_t MaxOffset, LSRUse::KindType Kind,
1699 MemAccessTy AccessTy, GlobalValue *BaseGV,
1700 int64_t BaseOffset, bool HasBaseReg, int64_t Scale) {
1701 // We know how to expand completely foldable formulae.
1702 return isAMCompletelyFolded(TTI, MinOffset, MaxOffset, Kind, AccessTy, BaseGV,
1703 BaseOffset, HasBaseReg, Scale) ||
1704 // Or formulae that use a base register produced by a sum of base
1705 // registers.
1706 (Scale == 1 &&
1707 isAMCompletelyFolded(TTI, MinOffset, MaxOffset, Kind, AccessTy,
1708 BaseGV, BaseOffset, true, 0));
1709 }
1710
1711 static bool isLegalUse(const TargetTransformInfo &TTI, int64_t MinOffset,
1712 int64_t MaxOffset, LSRUse::KindType Kind,
1713 MemAccessTy AccessTy, const Formula &F) {
1714 return isLegalUse(TTI, MinOffset, MaxOffset, Kind, AccessTy, F.BaseGV,
1715 F.BaseOffset, F.HasBaseReg, F.Scale);
1716 }
1717
1718 static bool isAMCompletelyFolded(const TargetTransformInfo &TTI,
1719 const LSRUse &LU, const Formula &F) {
1720 // Target may want to look at the user instructions.
1721 if (LU.Kind == LSRUse::Address && TTI.LSRWithInstrQueries()) {
1722 for (const LSRFixup &Fixup : LU.Fixups)
1723 if (!isAMCompletelyFolded(TTI, LSRUse::Address, LU.AccessTy, F.BaseGV,
1724 (F.BaseOffset + Fixup.Offset), F.HasBaseReg,
1725 F.Scale, Fixup.UserInst))
1726 return false;
1727 return true;
1728 }
1729
1730 return isAMCompletelyFolded(TTI, LU.MinOffset, LU.MaxOffset, LU.Kind,
1731 LU.AccessTy, F.BaseGV, F.BaseOffset, F.HasBaseReg,
1732 F.Scale);
1733 }
1734
1735 static unsigned getScalingFactorCost(const TargetTransformInfo &TTI,
1736 const LSRUse &LU, const Formula &F,
1737 const Loop &L) {
1738 if (!F.Scale)
1739 return 0;
1740
1741 // If the use is not completely folded in that instruction, we will have to
1742 // pay an extra cost only for scale != 1.
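// (A scale of 1 degenerates to an extra unscaled base register, so it incurs
// no additional cost even when the addressing mode is not folded.)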
1743 if (!isAMCompletelyFolded(TTI, LU.MinOffset, LU.MaxOffset, LU.Kind, 1744 LU.AccessTy, F, L)) 1745 return F.Scale != 1; 1746 1747 switch (LU.Kind) { 1748 case LSRUse::Address: { 1749 // Check the scaling factor cost with both the min and max offsets. 1750 int ScaleCostMinOffset = TTI.getScalingFactorCost( 1751 LU.AccessTy.MemTy, F.BaseGV, F.BaseOffset + LU.MinOffset, F.HasBaseReg, 1752 F.Scale, LU.AccessTy.AddrSpace); 1753 int ScaleCostMaxOffset = TTI.getScalingFactorCost( 1754 LU.AccessTy.MemTy, F.BaseGV, F.BaseOffset + LU.MaxOffset, F.HasBaseReg, 1755 F.Scale, LU.AccessTy.AddrSpace); 1756 1757 assert(ScaleCostMinOffset >= 0 && ScaleCostMaxOffset >= 0 && 1758 "Legal addressing mode has an illegal cost!"); 1759 return std::max(ScaleCostMinOffset, ScaleCostMaxOffset); 1760 } 1761 case LSRUse::ICmpZero: 1762 case LSRUse::Basic: 1763 case LSRUse::Special: 1764 // The use is completely folded, i.e., everything is folded into the 1765 // instruction. 1766 return 0; 1767 } 1768 1769 llvm_unreachable("Invalid LSRUse Kind!"); 1770 } 1771 1772 static bool isAlwaysFoldable(const TargetTransformInfo &TTI, 1773 LSRUse::KindType Kind, MemAccessTy AccessTy, 1774 GlobalValue *BaseGV, int64_t BaseOffset, 1775 bool HasBaseReg) { 1776 // Fast-path: zero is always foldable. 1777 if (BaseOffset == 0 && !BaseGV) return true; 1778 1779 // Conservatively, create an address with an immediate and a 1780 // base and a scale. 1781 int64_t Scale = Kind == LSRUse::ICmpZero ? -1 : 1; 1782 1783 // Canonicalize a scale of 1 to a base register if the formula doesn't 1784 // already have a base register. 1785 if (!HasBaseReg && Scale == 1) { 1786 Scale = 0; 1787 HasBaseReg = true; 1788 } 1789 1790 return isAMCompletelyFolded(TTI, Kind, AccessTy, BaseGV, BaseOffset, 1791 HasBaseReg, Scale); 1792 } 1793 1794 static bool isAlwaysFoldable(const TargetTransformInfo &TTI, 1795 ScalarEvolution &SE, int64_t MinOffset, 1796 int64_t MaxOffset, LSRUse::KindType Kind, 1797 MemAccessTy AccessTy, const SCEV *S, 1798 bool HasBaseReg) { 1799 // Fast-path: zero is always foldable. 1800 if (S->isZero()) return true; 1801 1802 // Conservatively, create an address with an immediate and a 1803 // base and a scale. 1804 int64_t BaseOffset = ExtractImmediate(S, SE); 1805 GlobalValue *BaseGV = ExtractSymbol(S, SE); 1806 1807 // If there's anything else involved, it's not foldable. 1808 if (!S->isZero()) return false; 1809 1810 // Fast-path: zero is always foldable. 1811 if (BaseOffset == 0 && !BaseGV) return true; 1812 1813 // Conservatively, create an address with an immediate and a 1814 // base and a scale. 1815 int64_t Scale = Kind == LSRUse::ICmpZero ? -1 : 1; 1816 1817 return isAMCompletelyFolded(TTI, MinOffset, MaxOffset, Kind, AccessTy, BaseGV, 1818 BaseOffset, HasBaseReg, Scale); 1819 } 1820 1821 namespace { 1822 1823 /// An individual increment in a Chain of IV increments. Relate an IV user to 1824 /// an expression that computes the IV it uses from the IV used by the previous 1825 /// link in the Chain. 1826 /// 1827 /// For the head of a chain, IncExpr holds the absolute SCEV expression for the 1828 /// original IVOperand. The head of the chain's IVOperand is only valid during 1829 /// chain collection, before LSR replaces IV users. During chain generation, 1830 /// IncExpr can be used to find the new IVOperand that computes the same 1831 /// expression. 
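/// For example (illustrative), for chained loads from p, p+4 and p+8, each
/// non-head link's IncExpr is the +4 step relative to the previous link, while
/// the head's IncExpr is the expression for the original p operand itself.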
1832 struct IVInc { 1833 Instruction *UserInst; 1834 Value* IVOperand; 1835 const SCEV *IncExpr; 1836 1837 IVInc(Instruction *U, Value *O, const SCEV *E) 1838 : UserInst(U), IVOperand(O), IncExpr(E) {} 1839 }; 1840 1841 // The list of IV increments in program order. We typically add the head of a 1842 // chain without finding subsequent links. 1843 struct IVChain { 1844 SmallVector<IVInc, 1> Incs; 1845 const SCEV *ExprBase = nullptr; 1846 1847 IVChain() = default; 1848 IVChain(const IVInc &Head, const SCEV *Base) 1849 : Incs(1, Head), ExprBase(Base) {} 1850 1851 using const_iterator = SmallVectorImpl<IVInc>::const_iterator; 1852 1853 // Return the first increment in the chain. 1854 const_iterator begin() const { 1855 assert(!Incs.empty()); 1856 return std::next(Incs.begin()); 1857 } 1858 const_iterator end() const { 1859 return Incs.end(); 1860 } 1861 1862 // Returns true if this chain contains any increments. 1863 bool hasIncs() const { return Incs.size() >= 2; } 1864 1865 // Add an IVInc to the end of this chain. 1866 void add(const IVInc &X) { Incs.push_back(X); } 1867 1868 // Returns the last UserInst in the chain. 1869 Instruction *tailUserInst() const { return Incs.back().UserInst; } 1870 1871 // Returns true if IncExpr can be profitably added to this chain. 1872 bool isProfitableIncrement(const SCEV *OperExpr, 1873 const SCEV *IncExpr, 1874 ScalarEvolution&); 1875 }; 1876 1877 /// Helper for CollectChains to track multiple IV increment uses. Distinguish 1878 /// between FarUsers that definitely cross IV increments and NearUsers that may 1879 /// be used between IV increments. 1880 struct ChainUsers { 1881 SmallPtrSet<Instruction*, 4> FarUsers; 1882 SmallPtrSet<Instruction*, 4> NearUsers; 1883 }; 1884 1885 /// This class holds state for the main loop strength reduction logic. 1886 class LSRInstance { 1887 IVUsers &IU; 1888 ScalarEvolution &SE; 1889 DominatorTree &DT; 1890 LoopInfo &LI; 1891 const TargetTransformInfo &TTI; 1892 Loop *const L; 1893 bool Changed = false; 1894 1895 /// This is the insert position that the current loop's induction variable 1896 /// increment should be placed. In simple loops, this is the latch block's 1897 /// terminator. But in more complicated cases, this is a position which will 1898 /// dominate all the in-loop post-increment users. 1899 Instruction *IVIncInsertPos = nullptr; 1900 1901 /// Interesting factors between use strides. 1902 /// 1903 /// We explicitly use a SetVector which contains a SmallSet, instead of the 1904 /// default, a SmallDenseSet, because we need to use the full range of 1905 /// int64_ts, and there's currently no good way of doing that with 1906 /// SmallDenseSet. 1907 SetVector<int64_t, SmallVector<int64_t, 8>, SmallSet<int64_t, 8>> Factors; 1908 1909 /// Interesting use types, to facilitate truncation reuse. 1910 SmallSetVector<Type *, 4> Types; 1911 1912 /// The list of interesting uses. 1913 SmallVector<LSRUse, 16> Uses; 1914 1915 /// Track which uses use which register candidates. 1916 RegUseTracker RegUses; 1917 1918 // Limit the number of chains to avoid quadratic behavior. We don't expect to 1919 // have more than a few IV increment chains in a loop. Missing a Chain falls 1920 // back to normal LSR behavior for those uses. 1921 static const unsigned MaxChains = 8; 1922 1923 /// IV users can form a chain of IV increments. 1924 SmallVector<IVChain, MaxChains> IVChainVec; 1925 1926 /// IV users that belong to profitable IVChains. 
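/// (The set records the specific IV operand Uses; fixups whose operand is in
/// this set are skipped when collecting the initial formulae.)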
1927 SmallPtrSet<Use*, MaxChains> IVIncSet; 1928 1929 void OptimizeShadowIV(); 1930 bool FindIVUserForCond(ICmpInst *Cond, IVStrideUse *&CondUse); 1931 ICmpInst *OptimizeMax(ICmpInst *Cond, IVStrideUse* &CondUse); 1932 void OptimizeLoopTermCond(); 1933 1934 void ChainInstruction(Instruction *UserInst, Instruction *IVOper, 1935 SmallVectorImpl<ChainUsers> &ChainUsersVec); 1936 void FinalizeChain(IVChain &Chain); 1937 void CollectChains(); 1938 void GenerateIVChain(const IVChain &Chain, SCEVExpander &Rewriter, 1939 SmallVectorImpl<WeakTrackingVH> &DeadInsts); 1940 1941 void CollectInterestingTypesAndFactors(); 1942 void CollectFixupsAndInitialFormulae(); 1943 1944 // Support for sharing of LSRUses between LSRFixups. 1945 using UseMapTy = DenseMap<LSRUse::SCEVUseKindPair, size_t>; 1946 UseMapTy UseMap; 1947 1948 bool reconcileNewOffset(LSRUse &LU, int64_t NewOffset, bool HasBaseReg, 1949 LSRUse::KindType Kind, MemAccessTy AccessTy); 1950 1951 std::pair<size_t, int64_t> getUse(const SCEV *&Expr, LSRUse::KindType Kind, 1952 MemAccessTy AccessTy); 1953 1954 void DeleteUse(LSRUse &LU, size_t LUIdx); 1955 1956 LSRUse *FindUseWithSimilarFormula(const Formula &F, const LSRUse &OrigLU); 1957 1958 void InsertInitialFormula(const SCEV *S, LSRUse &LU, size_t LUIdx); 1959 void InsertSupplementalFormula(const SCEV *S, LSRUse &LU, size_t LUIdx); 1960 void CountRegisters(const Formula &F, size_t LUIdx); 1961 bool InsertFormula(LSRUse &LU, unsigned LUIdx, const Formula &F); 1962 1963 void CollectLoopInvariantFixupsAndFormulae(); 1964 1965 void GenerateReassociations(LSRUse &LU, unsigned LUIdx, Formula Base, 1966 unsigned Depth = 0); 1967 1968 void GenerateReassociationsImpl(LSRUse &LU, unsigned LUIdx, 1969 const Formula &Base, unsigned Depth, 1970 size_t Idx, bool IsScaledReg = false); 1971 void GenerateCombinations(LSRUse &LU, unsigned LUIdx, Formula Base); 1972 void GenerateSymbolicOffsetsImpl(LSRUse &LU, unsigned LUIdx, 1973 const Formula &Base, size_t Idx, 1974 bool IsScaledReg = false); 1975 void GenerateSymbolicOffsets(LSRUse &LU, unsigned LUIdx, Formula Base); 1976 void GenerateConstantOffsetsImpl(LSRUse &LU, unsigned LUIdx, 1977 const Formula &Base, 1978 const SmallVectorImpl<int64_t> &Worklist, 1979 size_t Idx, bool IsScaledReg = false); 1980 void GenerateConstantOffsets(LSRUse &LU, unsigned LUIdx, Formula Base); 1981 void GenerateICmpZeroScales(LSRUse &LU, unsigned LUIdx, Formula Base); 1982 void GenerateScales(LSRUse &LU, unsigned LUIdx, Formula Base); 1983 void GenerateTruncates(LSRUse &LU, unsigned LUIdx, Formula Base); 1984 void GenerateCrossUseConstantOffsets(); 1985 void GenerateAllReuseFormulae(); 1986 1987 void FilterOutUndesirableDedicatedRegisters(); 1988 1989 size_t EstimateSearchSpaceComplexity() const; 1990 void NarrowSearchSpaceByDetectingSupersets(); 1991 void NarrowSearchSpaceByCollapsingUnrolledCode(); 1992 void NarrowSearchSpaceByRefilteringUndesirableDedicatedRegisters(); 1993 void NarrowSearchSpaceByFilterFormulaWithSameScaledReg(); 1994 void NarrowSearchSpaceByDeletingCostlyFormulas(); 1995 void NarrowSearchSpaceByPickingWinnerRegs(); 1996 void NarrowSearchSpaceUsingHeuristics(); 1997 1998 void SolveRecurse(SmallVectorImpl<const Formula *> &Solution, 1999 Cost &SolutionCost, 2000 SmallVectorImpl<const Formula *> &Workspace, 2001 const Cost &CurCost, 2002 const SmallPtrSet<const SCEV *, 16> &CurRegs, 2003 DenseSet<const SCEV *> &VisitedRegs) const; 2004 void Solve(SmallVectorImpl<const Formula *> &Solution) const; 2005 2006 BasicBlock::iterator 2007 
HoistInsertPosition(BasicBlock::iterator IP, 2008 const SmallVectorImpl<Instruction *> &Inputs) const; 2009 BasicBlock::iterator 2010 AdjustInsertPositionForExpand(BasicBlock::iterator IP, 2011 const LSRFixup &LF, 2012 const LSRUse &LU, 2013 SCEVExpander &Rewriter) const; 2014 2015 Value *Expand(const LSRUse &LU, const LSRFixup &LF, const Formula &F, 2016 BasicBlock::iterator IP, SCEVExpander &Rewriter, 2017 SmallVectorImpl<WeakTrackingVH> &DeadInsts) const; 2018 void RewriteForPHI(PHINode *PN, const LSRUse &LU, const LSRFixup &LF, 2019 const Formula &F, SCEVExpander &Rewriter, 2020 SmallVectorImpl<WeakTrackingVH> &DeadInsts) const; 2021 void Rewrite(const LSRUse &LU, const LSRFixup &LF, const Formula &F, 2022 SCEVExpander &Rewriter, 2023 SmallVectorImpl<WeakTrackingVH> &DeadInsts) const; 2024 void ImplementSolution(const SmallVectorImpl<const Formula *> &Solution); 2025 2026 public: 2027 LSRInstance(Loop *L, IVUsers &IU, ScalarEvolution &SE, DominatorTree &DT, 2028 LoopInfo &LI, const TargetTransformInfo &TTI); 2029 2030 bool getChanged() const { return Changed; } 2031 2032 void print_factors_and_types(raw_ostream &OS) const; 2033 void print_fixups(raw_ostream &OS) const; 2034 void print_uses(raw_ostream &OS) const; 2035 void print(raw_ostream &OS) const; 2036 void dump() const; 2037 }; 2038 2039 } // end anonymous namespace 2040 2041 /// If IV is used in a int-to-float cast inside the loop then try to eliminate 2042 /// the cast operation. 2043 void LSRInstance::OptimizeShadowIV() { 2044 const SCEV *BackedgeTakenCount = SE.getBackedgeTakenCount(L); 2045 if (isa<SCEVCouldNotCompute>(BackedgeTakenCount)) 2046 return; 2047 2048 for (IVUsers::const_iterator UI = IU.begin(), E = IU.end(); 2049 UI != E; /* empty */) { 2050 IVUsers::const_iterator CandidateUI = UI; 2051 ++UI; 2052 Instruction *ShadowUse = CandidateUI->getUser(); 2053 Type *DestTy = nullptr; 2054 bool IsSigned = false; 2055 2056 /* If shadow use is a int->float cast then insert a second IV 2057 to eliminate this cast. 2058 2059 for (unsigned i = 0; i < n; ++i) 2060 foo((double)i); 2061 2062 is transformed into 2063 2064 double d = 0.0; 2065 for (unsigned i = 0; i < n; ++i, ++d) 2066 foo(d); 2067 */ 2068 if (UIToFPInst *UCast = dyn_cast<UIToFPInst>(CandidateUI->getUser())) { 2069 IsSigned = false; 2070 DestTy = UCast->getDestTy(); 2071 } 2072 else if (SIToFPInst *SCast = dyn_cast<SIToFPInst>(CandidateUI->getUser())) { 2073 IsSigned = true; 2074 DestTy = SCast->getDestTy(); 2075 } 2076 if (!DestTy) continue; 2077 2078 // If target does not support DestTy natively then do not apply 2079 // this transformation. 2080 if (!TTI.isTypeLegal(DestTy)) continue; 2081 2082 PHINode *PH = dyn_cast<PHINode>(ShadowUse->getOperand(0)); 2083 if (!PH) continue; 2084 if (PH->getNumIncomingValues() != 2) continue; 2085 2086 // If the calculation in integers overflows, the result in FP type will 2087 // differ. 
So we only can do this transformation if we are guaranteed to not 2088 // deal with overflowing values 2089 const SCEVAddRecExpr *AR = dyn_cast<SCEVAddRecExpr>(SE.getSCEV(PH)); 2090 if (!AR) continue; 2091 if (IsSigned && !AR->hasNoSignedWrap()) continue; 2092 if (!IsSigned && !AR->hasNoUnsignedWrap()) continue; 2093 2094 Type *SrcTy = PH->getType(); 2095 int Mantissa = DestTy->getFPMantissaWidth(); 2096 if (Mantissa == -1) continue; 2097 if ((int)SE.getTypeSizeInBits(SrcTy) > Mantissa) 2098 continue; 2099 2100 unsigned Entry, Latch; 2101 if (PH->getIncomingBlock(0) == L->getLoopPreheader()) { 2102 Entry = 0; 2103 Latch = 1; 2104 } else { 2105 Entry = 1; 2106 Latch = 0; 2107 } 2108 2109 ConstantInt *Init = dyn_cast<ConstantInt>(PH->getIncomingValue(Entry)); 2110 if (!Init) continue; 2111 Constant *NewInit = ConstantFP::get(DestTy, IsSigned ? 2112 (double)Init->getSExtValue() : 2113 (double)Init->getZExtValue()); 2114 2115 BinaryOperator *Incr = 2116 dyn_cast<BinaryOperator>(PH->getIncomingValue(Latch)); 2117 if (!Incr) continue; 2118 if (Incr->getOpcode() != Instruction::Add 2119 && Incr->getOpcode() != Instruction::Sub) 2120 continue; 2121 2122 /* Initialize new IV, double d = 0.0 in above example. */ 2123 ConstantInt *C = nullptr; 2124 if (Incr->getOperand(0) == PH) 2125 C = dyn_cast<ConstantInt>(Incr->getOperand(1)); 2126 else if (Incr->getOperand(1) == PH) 2127 C = dyn_cast<ConstantInt>(Incr->getOperand(0)); 2128 else 2129 continue; 2130 2131 if (!C) continue; 2132 2133 // Ignore negative constants, as the code below doesn't handle them 2134 // correctly. TODO: Remove this restriction. 2135 if (!C->getValue().isStrictlyPositive()) continue; 2136 2137 /* Add new PHINode. */ 2138 PHINode *NewPH = PHINode::Create(DestTy, 2, "IV.S.", PH); 2139 2140 /* create new increment. '++d' in above example. */ 2141 Constant *CFP = ConstantFP::get(DestTy, C->getZExtValue()); 2142 BinaryOperator *NewIncr = 2143 BinaryOperator::Create(Incr->getOpcode() == Instruction::Add ? 2144 Instruction::FAdd : Instruction::FSub, 2145 NewPH, CFP, "IV.S.next.", Incr); 2146 2147 NewPH->addIncoming(NewInit, PH->getIncomingBlock(Entry)); 2148 NewPH->addIncoming(NewIncr, PH->getIncomingBlock(Latch)); 2149 2150 /* Remove cast operation */ 2151 ShadowUse->replaceAllUsesWith(NewPH); 2152 ShadowUse->eraseFromParent(); 2153 Changed = true; 2154 break; 2155 } 2156 } 2157 2158 /// If Cond has an operand that is an expression of an IV, set the IV user and 2159 /// stride information and return true, otherwise return false. 2160 bool LSRInstance::FindIVUserForCond(ICmpInst *Cond, IVStrideUse *&CondUse) { 2161 for (IVStrideUse &U : IU) 2162 if (U.getUser() == Cond) { 2163 // NOTE: we could handle setcc instructions with multiple uses here, but 2164 // InstCombine does it as well for simple uses, it's not clear that it 2165 // occurs enough in real life to handle. 2166 CondUse = &U; 2167 return true; 2168 } 2169 return false; 2170 } 2171 2172 /// Rewrite the loop's terminating condition if it uses a max computation. 2173 /// 2174 /// This is a narrow solution to a specific, but acute, problem. For loops 2175 /// like this: 2176 /// 2177 /// i = 0; 2178 /// do { 2179 /// p[i] = 0.0; 2180 /// } while (++i < n); 2181 /// 2182 /// the trip count isn't just 'n', because 'n' might not be positive. And 2183 /// unfortunately this can come up even for loops where the user didn't use 2184 /// a C do-while loop. 
For example, seemingly well-behaved top-test loops 2185 /// will commonly be lowered like this: 2186 /// 2187 /// if (n > 0) { 2188 /// i = 0; 2189 /// do { 2190 /// p[i] = 0.0; 2191 /// } while (++i < n); 2192 /// } 2193 /// 2194 /// and then it's possible for subsequent optimization to obscure the if 2195 /// test in such a way that indvars can't find it. 2196 /// 2197 /// When indvars can't find the if test in loops like this, it creates a 2198 /// max expression, which allows it to give the loop a canonical 2199 /// induction variable: 2200 /// 2201 /// i = 0; 2202 /// max = n < 1 ? 1 : n; 2203 /// do { 2204 /// p[i] = 0.0; 2205 /// } while (++i != max); 2206 /// 2207 /// Canonical induction variables are necessary because the loop passes 2208 /// are designed around them. The most obvious example of this is the 2209 /// LoopInfo analysis, which doesn't remember trip count values. It 2210 /// expects to be able to rediscover the trip count each time it is 2211 /// needed, and it does this using a simple analysis that only succeeds if 2212 /// the loop has a canonical induction variable. 2213 /// 2214 /// However, when it comes time to generate code, the maximum operation 2215 /// can be quite costly, especially if it's inside of an outer loop. 2216 /// 2217 /// This function solves this problem by detecting this type of loop and 2218 /// rewriting their conditions from ICMP_NE back to ICMP_SLT, and deleting 2219 /// the instructions for the maximum computation. 2220 ICmpInst *LSRInstance::OptimizeMax(ICmpInst *Cond, IVStrideUse* &CondUse) { 2221 // Check that the loop matches the pattern we're looking for. 2222 if (Cond->getPredicate() != CmpInst::ICMP_EQ && 2223 Cond->getPredicate() != CmpInst::ICMP_NE) 2224 return Cond; 2225 2226 SelectInst *Sel = dyn_cast<SelectInst>(Cond->getOperand(1)); 2227 if (!Sel || !Sel->hasOneUse()) return Cond; 2228 2229 const SCEV *BackedgeTakenCount = SE.getBackedgeTakenCount(L); 2230 if (isa<SCEVCouldNotCompute>(BackedgeTakenCount)) 2231 return Cond; 2232 const SCEV *One = SE.getConstant(BackedgeTakenCount->getType(), 1); 2233 2234 // Add one to the backedge-taken count to get the trip count. 2235 const SCEV *IterationCount = SE.getAddExpr(One, BackedgeTakenCount); 2236 if (IterationCount != SE.getSCEV(Sel)) return Cond; 2237 2238 // Check for a max calculation that matches the pattern. There's no check 2239 // for ICMP_ULE here because the comparison would be with zero, which 2240 // isn't interesting. 2241 CmpInst::Predicate Pred = ICmpInst::BAD_ICMP_PREDICATE; 2242 const SCEVNAryExpr *Max = nullptr; 2243 if (const SCEVSMaxExpr *S = dyn_cast<SCEVSMaxExpr>(BackedgeTakenCount)) { 2244 Pred = ICmpInst::ICMP_SLE; 2245 Max = S; 2246 } else if (const SCEVSMaxExpr *S = dyn_cast<SCEVSMaxExpr>(IterationCount)) { 2247 Pred = ICmpInst::ICMP_SLT; 2248 Max = S; 2249 } else if (const SCEVUMaxExpr *U = dyn_cast<SCEVUMaxExpr>(IterationCount)) { 2250 Pred = ICmpInst::ICMP_ULT; 2251 Max = U; 2252 } else { 2253 // No match; bail. 2254 return Cond; 2255 } 2256 2257 // To handle a max with more than two operands, this optimization would 2258 // require additional checking and setup. 2259 if (Max->getNumOperands() != 2) 2260 return Cond; 2261 2262 const SCEV *MaxLHS = Max->getOperand(0); 2263 const SCEV *MaxRHS = Max->getOperand(1); 2264 2265 // ScalarEvolution canonicalizes constants to the left. For < and >, look 2266 // for a comparison with 1. For <= and >=, a comparison with zero. 2267 if (!MaxLHS || 2268 (ICmpInst::isTrueWhenEqual(Pred) ? 
!MaxLHS->isZero() : (MaxLHS != One))) 2269 return Cond; 2270 2271 // Check the relevant induction variable for conformance to 2272 // the pattern. 2273 const SCEV *IV = SE.getSCEV(Cond->getOperand(0)); 2274 const SCEVAddRecExpr *AR = dyn_cast<SCEVAddRecExpr>(IV); 2275 if (!AR || !AR->isAffine() || 2276 AR->getStart() != One || 2277 AR->getStepRecurrence(SE) != One) 2278 return Cond; 2279 2280 assert(AR->getLoop() == L && 2281 "Loop condition operand is an addrec in a different loop!"); 2282 2283 // Check the right operand of the select, and remember it, as it will 2284 // be used in the new comparison instruction. 2285 Value *NewRHS = nullptr; 2286 if (ICmpInst::isTrueWhenEqual(Pred)) { 2287 // Look for n+1, and grab n. 2288 if (AddOperator *BO = dyn_cast<AddOperator>(Sel->getOperand(1))) 2289 if (ConstantInt *BO1 = dyn_cast<ConstantInt>(BO->getOperand(1))) 2290 if (BO1->isOne() && SE.getSCEV(BO->getOperand(0)) == MaxRHS) 2291 NewRHS = BO->getOperand(0); 2292 if (AddOperator *BO = dyn_cast<AddOperator>(Sel->getOperand(2))) 2293 if (ConstantInt *BO1 = dyn_cast<ConstantInt>(BO->getOperand(1))) 2294 if (BO1->isOne() && SE.getSCEV(BO->getOperand(0)) == MaxRHS) 2295 NewRHS = BO->getOperand(0); 2296 if (!NewRHS) 2297 return Cond; 2298 } else if (SE.getSCEV(Sel->getOperand(1)) == MaxRHS) 2299 NewRHS = Sel->getOperand(1); 2300 else if (SE.getSCEV(Sel->getOperand(2)) == MaxRHS) 2301 NewRHS = Sel->getOperand(2); 2302 else if (const SCEVUnknown *SU = dyn_cast<SCEVUnknown>(MaxRHS)) 2303 NewRHS = SU->getValue(); 2304 else 2305 // Max doesn't match expected pattern. 2306 return Cond; 2307 2308 // Determine the new comparison opcode. It may be signed or unsigned, 2309 // and the original comparison may be either equality or inequality. 2310 if (Cond->getPredicate() == CmpInst::ICMP_EQ) 2311 Pred = CmpInst::getInversePredicate(Pred); 2312 2313 // Ok, everything looks ok to change the condition into an SLT or SGE and 2314 // delete the max calculation. 2315 ICmpInst *NewCond = 2316 new ICmpInst(Cond, Pred, Cond->getOperand(0), NewRHS, "scmp"); 2317 2318 // Delete the max calculation instructions. 2319 Cond->replaceAllUsesWith(NewCond); 2320 CondUse->setUser(NewCond); 2321 Instruction *Cmp = cast<Instruction>(Sel->getOperand(0)); 2322 Cond->eraseFromParent(); 2323 Sel->eraseFromParent(); 2324 if (Cmp->use_empty()) 2325 Cmp->eraseFromParent(); 2326 return NewCond; 2327 } 2328 2329 /// Change loop terminating condition to use the postinc iv when possible. 2330 void 2331 LSRInstance::OptimizeLoopTermCond() { 2332 SmallPtrSet<Instruction *, 4> PostIncs; 2333 2334 // We need a different set of heuristics for rotated and non-rotated loops. 2335 // If a loop is rotated then the latch is also the backedge, so inserting 2336 // post-inc expressions just before the latch is ideal. To reduce live ranges 2337 // it also makes sense to rewrite terminating conditions to use post-inc 2338 // expressions. 2339 // 2340 // If the loop is not rotated then the latch is not a backedge; the latch 2341 // check is done in the loop head. Adding post-inc expressions before the 2342 // latch will cause overlapping live-ranges of pre-inc and post-inc expressions 2343 // in the loop body. In this case we do *not* want to use post-inc expressions 2344 // in the latch check, and we want to insert post-inc expressions before 2345 // the backedge. 
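// Whether the loop is rotated is determined below by checking whether the
// latch block is also an exiting block.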
2346 BasicBlock *LatchBlock = L->getLoopLatch(); 2347 SmallVector<BasicBlock*, 8> ExitingBlocks; 2348 L->getExitingBlocks(ExitingBlocks); 2349 if (llvm::all_of(ExitingBlocks, [&LatchBlock](const BasicBlock *BB) { 2350 return LatchBlock != BB; 2351 })) { 2352 // The backedge doesn't exit the loop; treat this as a head-tested loop. 2353 IVIncInsertPos = LatchBlock->getTerminator(); 2354 return; 2355 } 2356 2357 // Otherwise treat this as a rotated loop. 2358 for (BasicBlock *ExitingBlock : ExitingBlocks) { 2359 // Get the terminating condition for the loop if possible. If we 2360 // can, we want to change it to use a post-incremented version of its 2361 // induction variable, to allow coalescing the live ranges for the IV into 2362 // one register value. 2363 2364 BranchInst *TermBr = dyn_cast<BranchInst>(ExitingBlock->getTerminator()); 2365 if (!TermBr) 2366 continue; 2367 // FIXME: Overly conservative, termination condition could be an 'or' etc.. 2368 if (TermBr->isUnconditional() || !isa<ICmpInst>(TermBr->getCondition())) 2369 continue; 2370 2371 // Search IVUsesByStride to find Cond's IVUse if there is one. 2372 IVStrideUse *CondUse = nullptr; 2373 ICmpInst *Cond = cast<ICmpInst>(TermBr->getCondition()); 2374 if (!FindIVUserForCond(Cond, CondUse)) 2375 continue; 2376 2377 // If the trip count is computed in terms of a max (due to ScalarEvolution 2378 // being unable to find a sufficient guard, for example), change the loop 2379 // comparison to use SLT or ULT instead of NE. 2380 // One consequence of doing this now is that it disrupts the count-down 2381 // optimization. That's not always a bad thing though, because in such 2382 // cases it may still be worthwhile to avoid a max. 2383 Cond = OptimizeMax(Cond, CondUse); 2384 2385 // If this exiting block dominates the latch block, it may also use 2386 // the post-inc value if it won't be shared with other uses. 2387 // Check for dominance. 2388 if (!DT.dominates(ExitingBlock, LatchBlock)) 2389 continue; 2390 2391 // Conservatively avoid trying to use the post-inc value in non-latch 2392 // exits if there may be pre-inc users in intervening blocks. 2393 if (LatchBlock != ExitingBlock) 2394 for (IVUsers::const_iterator UI = IU.begin(), E = IU.end(); UI != E; ++UI) 2395 // Test if the use is reachable from the exiting block. This dominator 2396 // query is a conservative approximation of reachability. 2397 if (&*UI != CondUse && 2398 !DT.properlyDominates(UI->getUser()->getParent(), ExitingBlock)) { 2399 // Conservatively assume there may be reuse if the quotient of their 2400 // strides could be a legal scale. 2401 const SCEV *A = IU.getStride(*CondUse, L); 2402 const SCEV *B = IU.getStride(*UI, L); 2403 if (!A || !B) continue; 2404 if (SE.getTypeSizeInBits(A->getType()) != 2405 SE.getTypeSizeInBits(B->getType())) { 2406 if (SE.getTypeSizeInBits(A->getType()) > 2407 SE.getTypeSizeInBits(B->getType())) 2408 B = SE.getSignExtendExpr(B, A->getType()); 2409 else 2410 A = SE.getSignExtendExpr(A, B->getType()); 2411 } 2412 if (const SCEVConstant *D = 2413 dyn_cast_or_null<SCEVConstant>(getExactSDiv(B, A, SE))) { 2414 const ConstantInt *C = D->getValue(); 2415 // Stride of one or negative one can have reuse with non-addresses. 2416 if (C->isOne() || C->isMinusOne()) 2417 goto decline_post_inc; 2418 // Avoid weird situations. 2419 if (C->getValue().getMinSignedBits() >= 64 || 2420 C->getValue().isMinSignedValue()) 2421 goto decline_post_inc; 2422 // Check for possible scaled-address reuse. 
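// If the stride ratio would be a legal address scale for this access (in
// either sign), assume the pre-inc value may be reused there and decline
// the post-inc transformation.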
2423 if (isAddressUse(TTI, UI->getUser(), UI->getOperandValToReplace())) { 2424 MemAccessTy AccessTy = getAccessType( 2425 TTI, UI->getUser(), UI->getOperandValToReplace()); 2426 int64_t Scale = C->getSExtValue(); 2427 if (TTI.isLegalAddressingMode(AccessTy.MemTy, /*BaseGV=*/nullptr, 2428 /*BaseOffset=*/0, 2429 /*HasBaseReg=*/false, Scale, 2430 AccessTy.AddrSpace)) 2431 goto decline_post_inc; 2432 Scale = -Scale; 2433 if (TTI.isLegalAddressingMode(AccessTy.MemTy, /*BaseGV=*/nullptr, 2434 /*BaseOffset=*/0, 2435 /*HasBaseReg=*/false, Scale, 2436 AccessTy.AddrSpace)) 2437 goto decline_post_inc; 2438 } 2439 } 2440 } 2441 2442 LLVM_DEBUG(dbgs() << " Change loop exiting icmp to use postinc iv: " 2443 << *Cond << '\n'); 2444 2445 // It's possible for the setcc instruction to be anywhere in the loop, and 2446 // possible for it to have multiple users. If it is not immediately before 2447 // the exiting block branch, move it. 2448 if (&*++BasicBlock::iterator(Cond) != TermBr) { 2449 if (Cond->hasOneUse()) { 2450 Cond->moveBefore(TermBr); 2451 } else { 2452 // Clone the terminating condition and insert into the loopend. 2453 ICmpInst *OldCond = Cond; 2454 Cond = cast<ICmpInst>(Cond->clone()); 2455 Cond->setName(L->getHeader()->getName() + ".termcond"); 2456 ExitingBlock->getInstList().insert(TermBr->getIterator(), Cond); 2457 2458 // Clone the IVUse, as the old use still exists! 2459 CondUse = &IU.AddUser(Cond, CondUse->getOperandValToReplace()); 2460 TermBr->replaceUsesOfWith(OldCond, Cond); 2461 } 2462 } 2463 2464 // If we get to here, we know that we can transform the setcc instruction to 2465 // use the post-incremented version of the IV, allowing us to coalesce the 2466 // live ranges for the IV correctly. 2467 CondUse->transformToPostInc(L); 2468 Changed = true; 2469 2470 PostIncs.insert(Cond); 2471 decline_post_inc:; 2472 } 2473 2474 // Determine an insertion point for the loop induction variable increment. It 2475 // must dominate all the post-inc comparisons we just set up, and it must 2476 // dominate the loop latch edge. 2477 IVIncInsertPos = L->getLoopLatch()->getTerminator(); 2478 for (Instruction *Inst : PostIncs) { 2479 BasicBlock *BB = 2480 DT.findNearestCommonDominator(IVIncInsertPos->getParent(), 2481 Inst->getParent()); 2482 if (BB == Inst->getParent()) 2483 IVIncInsertPos = Inst; 2484 else if (BB != IVIncInsertPos->getParent()) 2485 IVIncInsertPos = BB->getTerminator(); 2486 } 2487 } 2488 2489 /// Determine if the given use can accommodate a fixup at the given offset and 2490 /// other details. If so, update the use and return true. 2491 bool LSRInstance::reconcileNewOffset(LSRUse &LU, int64_t NewOffset, 2492 bool HasBaseReg, LSRUse::KindType Kind, 2493 MemAccessTy AccessTy) { 2494 int64_t NewMinOffset = LU.MinOffset; 2495 int64_t NewMaxOffset = LU.MaxOffset; 2496 MemAccessTy NewAccessTy = AccessTy; 2497 2498 // Check for a mismatched kind. It's tempting to collapse mismatched kinds to 2499 // something conservative, however this can pessimize in the case that one of 2500 // the uses will have all its uses outside the loop, for example. 2501 if (LU.Kind != Kind) 2502 return false; 2503 2504 // Check for a mismatched access type, and fall back conservatively as needed. 2505 // TODO: Be less conservative when the type is similar and can use the same 2506 // addressing modes. 
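// For address uses whose memory types differ, fall back to an unknown-sized
// access in the same address space.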
2507 if (Kind == LSRUse::Address) {
2508 if (AccessTy.MemTy != LU.AccessTy.MemTy) {
2509 NewAccessTy = MemAccessTy::getUnknown(AccessTy.MemTy->getContext(),
2510 AccessTy.AddrSpace);
2511 }
2512 }
2513
2514 // Conservatively assume HasBaseReg is true for now.
2515 if (NewOffset < LU.MinOffset) {
2516 if (!isAlwaysFoldable(TTI, Kind, NewAccessTy, /*BaseGV=*/nullptr,
2517 LU.MaxOffset - NewOffset, HasBaseReg))
2518 return false;
2519 NewMinOffset = NewOffset;
2520 } else if (NewOffset > LU.MaxOffset) {
2521 if (!isAlwaysFoldable(TTI, Kind, NewAccessTy, /*BaseGV=*/nullptr,
2522 NewOffset - LU.MinOffset, HasBaseReg))
2523 return false;
2524 NewMaxOffset = NewOffset;
2525 }
2526
2527 // Update the use.
2528 LU.MinOffset = NewMinOffset;
2529 LU.MaxOffset = NewMaxOffset;
2530 LU.AccessTy = NewAccessTy;
2531 return true;
2532 }
2533
2534 /// Return an LSRUse index and an offset value for a fixup which needs the given
2535 /// expression, with the given kind and optional access type. Either reuse an
2536 /// existing use or create a new one, as needed.
2537 std::pair<size_t, int64_t> LSRInstance::getUse(const SCEV *&Expr,
2538 LSRUse::KindType Kind,
2539 MemAccessTy AccessTy) {
2540 const SCEV *Copy = Expr;
2541 int64_t Offset = ExtractImmediate(Expr, SE);
2542
2543 // Basic uses can't accept any offset, for example.
2544 if (!isAlwaysFoldable(TTI, Kind, AccessTy, /*BaseGV=*/ nullptr,
2545 Offset, /*HasBaseReg=*/ true)) {
2546 Expr = Copy;
2547 Offset = 0;
2548 }
2549
2550 std::pair<UseMapTy::iterator, bool> P =
2551 UseMap.insert(std::make_pair(LSRUse::SCEVUseKindPair(Expr, Kind), 0));
2552 if (!P.second) {
2553 // A use already existed with this base.
2554 size_t LUIdx = P.first->second;
2555 LSRUse &LU = Uses[LUIdx];
2556 if (reconcileNewOffset(LU, Offset, /*HasBaseReg=*/true, Kind, AccessTy))
2557 // Reuse this use.
2558 return std::make_pair(LUIdx, Offset);
2559 }
2560
2561 // Create a new use.
2562 size_t LUIdx = Uses.size();
2563 P.first->second = LUIdx;
2564 Uses.push_back(LSRUse(Kind, AccessTy));
2565 LSRUse &LU = Uses[LUIdx];
2566
2567 LU.MinOffset = Offset;
2568 LU.MaxOffset = Offset;
2569 return std::make_pair(LUIdx, Offset);
2570 }
2571
2572 /// Delete the given use from the Uses list.
2573 void LSRInstance::DeleteUse(LSRUse &LU, size_t LUIdx) {
2574 if (&LU != &Uses.back())
2575 std::swap(LU, Uses.back());
2576 Uses.pop_back();
2577
2578 // Update RegUses.
2579 RegUses.swapAndDropUse(LUIdx, Uses.size());
2580 }
2581
2582 /// Look for a use distinct from OrigLU which has a formula with the same
2583 /// registers as the given formula.
2584 LSRUse *
2585 LSRInstance::FindUseWithSimilarFormula(const Formula &OrigF,
2586 const LSRUse &OrigLU) {
2587 // Search all uses for the formula. This could be more clever.
2588 for (size_t LUIdx = 0, NumUses = Uses.size(); LUIdx != NumUses; ++LUIdx) {
2589 LSRUse &LU = Uses[LUIdx];
2590 // Check whether this use is close enough to OrigLU, to see whether it's
2591 // worthwhile looking through its formulae.
2592 // Ignore ICmpZero uses because they may contain formulae generated by
2593 // GenerateICmpZeroScales, in which case adding fixup offsets may
2594 // be invalid.
2595 if (&LU != &OrigLU &&
2596 LU.Kind != LSRUse::ICmpZero &&
2597 LU.Kind == OrigLU.Kind && OrigLU.AccessTy == LU.AccessTy &&
2598 LU.WidestFixupType == OrigLU.WidestFixupType &&
2599 LU.HasFormulaWithSameRegs(OrigF)) {
2600 // Scan through this use's formulae.
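// Look for one that matches OrigF in every field other than BaseOffset; only
// an offset-free match makes this use a useful candidate.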
2601 for (const Formula &F : LU.Formulae) { 2602 // Check to see if this formula has the same registers and symbols 2603 // as OrigF. 2604 if (F.BaseRegs == OrigF.BaseRegs && 2605 F.ScaledReg == OrigF.ScaledReg && 2606 F.BaseGV == OrigF.BaseGV && 2607 F.Scale == OrigF.Scale && 2608 F.UnfoldedOffset == OrigF.UnfoldedOffset) { 2609 if (F.BaseOffset == 0) 2610 return &LU; 2611 // This is the formula where all the registers and symbols matched; 2612 // there aren't going to be any others. Since we declined it, we 2613 // can skip the rest of the formulae and proceed to the next LSRUse. 2614 break; 2615 } 2616 } 2617 } 2618 } 2619 2620 // Nothing looked good. 2621 return nullptr; 2622 } 2623 2624 void LSRInstance::CollectInterestingTypesAndFactors() { 2625 SmallSetVector<const SCEV *, 4> Strides; 2626 2627 // Collect interesting types and strides. 2628 SmallVector<const SCEV *, 4> Worklist; 2629 for (const IVStrideUse &U : IU) { 2630 const SCEV *Expr = IU.getExpr(U); 2631 2632 // Collect interesting types. 2633 Types.insert(SE.getEffectiveSCEVType(Expr->getType())); 2634 2635 // Add strides for mentioned loops. 2636 Worklist.push_back(Expr); 2637 do { 2638 const SCEV *S = Worklist.pop_back_val(); 2639 if (const SCEVAddRecExpr *AR = dyn_cast<SCEVAddRecExpr>(S)) { 2640 if (AR->getLoop() == L) 2641 Strides.insert(AR->getStepRecurrence(SE)); 2642 Worklist.push_back(AR->getStart()); 2643 } else if (const SCEVAddExpr *Add = dyn_cast<SCEVAddExpr>(S)) { 2644 Worklist.append(Add->op_begin(), Add->op_end()); 2645 } 2646 } while (!Worklist.empty()); 2647 } 2648 2649 // Compute interesting factors from the set of interesting strides. 2650 for (SmallSetVector<const SCEV *, 4>::const_iterator 2651 I = Strides.begin(), E = Strides.end(); I != E; ++I) 2652 for (SmallSetVector<const SCEV *, 4>::const_iterator NewStrideIter = 2653 std::next(I); NewStrideIter != E; ++NewStrideIter) { 2654 const SCEV *OldStride = *I; 2655 const SCEV *NewStride = *NewStrideIter; 2656 2657 if (SE.getTypeSizeInBits(OldStride->getType()) != 2658 SE.getTypeSizeInBits(NewStride->getType())) { 2659 if (SE.getTypeSizeInBits(OldStride->getType()) > 2660 SE.getTypeSizeInBits(NewStride->getType())) 2661 NewStride = SE.getSignExtendExpr(NewStride, OldStride->getType()); 2662 else 2663 OldStride = SE.getSignExtendExpr(OldStride, NewStride->getType()); 2664 } 2665 if (const SCEVConstant *Factor = 2666 dyn_cast_or_null<SCEVConstant>(getExactSDiv(NewStride, OldStride, 2667 SE, true))) { 2668 if (Factor->getAPInt().getMinSignedBits() <= 64) 2669 Factors.insert(Factor->getAPInt().getSExtValue()); 2670 } else if (const SCEVConstant *Factor = 2671 dyn_cast_or_null<SCEVConstant>(getExactSDiv(OldStride, 2672 NewStride, 2673 SE, true))) { 2674 if (Factor->getAPInt().getMinSignedBits() <= 64) 2675 Factors.insert(Factor->getAPInt().getSExtValue()); 2676 } 2677 } 2678 2679 // If all uses use the same type, don't bother looking for truncation-based 2680 // reuse. 2681 if (Types.size() == 1) 2682 Types.clear(); 2683 2684 LLVM_DEBUG(print_factors_and_types(dbgs())); 2685 } 2686 2687 /// Helper for CollectChains that finds an IV operand (computed by an AddRec in 2688 /// this loop) within [OI,OE) or returns OE. If IVUsers mapped Instructions to 2689 /// IVStrideUses, we could partially skip this. 
2690 static User::op_iterator
2691 findIVOperand(User::op_iterator OI, User::op_iterator OE,
2692 Loop *L, ScalarEvolution &SE) {
2693 for(; OI != OE; ++OI) {
2694 if (Instruction *Oper = dyn_cast<Instruction>(*OI)) {
2695 if (!SE.isSCEVable(Oper->getType()))
2696 continue;
2697
2698 if (const SCEVAddRecExpr *AR =
2699 dyn_cast<SCEVAddRecExpr>(SE.getSCEV(Oper))) {
2700 if (AR->getLoop() == L)
2701 break;
2702 }
2703 }
2704 }
2705 return OI;
2706 }
2707
2708 /// IVChain logic must consistently peek base TruncInst operands, so wrap it in
2709 /// a convenient helper.
2710 static Value *getWideOperand(Value *Oper) {
2711 if (TruncInst *Trunc = dyn_cast<TruncInst>(Oper))
2712 return Trunc->getOperand(0);
2713 return Oper;
2714 }
2715
2716 /// Return true if we allow an IV chain to include both types.
2717 static bool isCompatibleIVType(Value *LVal, Value *RVal) {
2718 Type *LType = LVal->getType();
2719 Type *RType = RVal->getType();
2720 return (LType == RType) || (LType->isPointerTy() && RType->isPointerTy() &&
2721 // Different address spaces mean (possibly)
2722 // different types of the pointer implementation,
2723 // e.g. i16 vs i32, so disallow that.
2724 (LType->getPointerAddressSpace() ==
2725 RType->getPointerAddressSpace()));
2726 }
2727
2728 /// Return an approximation of this SCEV expression's "base", or NULL for any
2729 /// constant. Returning the expression itself is conservative. Returning a
2730 /// deeper subexpression is more precise and valid as long as it isn't less
2731 /// complex than another subexpression. For expressions involving multiple
2732 /// unscaled values, we need to return the pointer-type SCEVUnknown. This avoids
2733 /// forming chains across objects, such as: PrevOper==a[i], IVOper==b[i],
2734 /// IVInc==b-a.
2735 ///
2736 /// Since SCEVUnknown is the rightmost type, and pointers are the rightmost
2737 /// SCEVUnknown, we simply return the rightmost SCEV operand.
2738 static const SCEV *getExprBase(const SCEV *S) {
2739 switch (S->getSCEVType()) {
2740 default: // including scUnknown.
2741 return S;
2742 case scConstant:
2743 return nullptr;
2744 case scTruncate:
2745 return getExprBase(cast<SCEVTruncateExpr>(S)->getOperand());
2746 case scZeroExtend:
2747 return getExprBase(cast<SCEVZeroExtendExpr>(S)->getOperand());
2748 case scSignExtend:
2749 return getExprBase(cast<SCEVSignExtendExpr>(S)->getOperand());
2750 case scAddExpr: {
2751 // Skip over scaled operands (scMulExpr) to follow add operands as long as
2752 // there's nothing more complex.
2753 // FIXME: not sure if we want to recognize negation.
2754 const SCEVAddExpr *Add = cast<SCEVAddExpr>(S);
2755 for (std::reverse_iterator<SCEVAddExpr::op_iterator> I(Add->op_end()),
2756 E(Add->op_begin()); I != E; ++I) {
2757 const SCEV *SubExpr = *I;
2758 if (SubExpr->getSCEVType() == scAddExpr)
2759 return getExprBase(SubExpr);
2760
2761 if (SubExpr->getSCEVType() != scMulExpr)
2762 return SubExpr;
2763 }
2764 return S; // all operands are scaled, be conservative.
2765 }
2766 case scAddRecExpr:
2767 return getExprBase(cast<SCEVAddRecExpr>(S)->getStart());
2768 }
2769 }
2770
2771 /// Return true if the chain increment is profitable to expand into a loop
2772 /// invariant value, which may require its own register. A profitable chain
2773 /// increment will be an offset relative to the same base. We allow such offsets
2774 /// to potentially be used as a chain increment as long as it's not obviously
2775 /// expensive to expand using real instructions.
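/// For example (illustrative), an increment by a loop-invariant runtime
/// stride is acceptable as long as materializing that stride is cheap.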
2776 bool IVChain::isProfitableIncrement(const SCEV *OperExpr,
2777 const SCEV *IncExpr,
2778 ScalarEvolution &SE) {
2779 // Aggressively form chains when -stress-ivchain.
2780 if (StressIVChain)
2781 return true;
2782
2783 // Do not replace a constant offset from IV head with a nonconstant IV
2784 // increment.
2785 if (!isa<SCEVConstant>(IncExpr)) {
2786 const SCEV *HeadExpr = SE.getSCEV(getWideOperand(Incs[0].IVOperand));
2787 if (isa<SCEVConstant>(SE.getMinusSCEV(OperExpr, HeadExpr)))
2788 return false;
2789 }
2790
2791 SmallPtrSet<const SCEV*, 8> Processed;
2792 return !isHighCostExpansion(IncExpr, Processed, SE);
2793 }
2794
2795 /// Return true if the number of registers needed for the chain is estimated to
2796 /// be less than the number required for the individual IV users. First prohibit
2797 /// any IV users that keep the IV live across increments (the Users set should
2798 /// be empty). Next count the number and type of increments in the chain.
2799 ///
2800 /// Chaining IVs can lead to considerable code bloat if ISEL doesn't
2801 /// effectively use postinc addressing modes. Only consider it profitable if the
2802 /// increments can be computed in fewer registers when chained.
2803 ///
2804 /// TODO: Consider IVInc free if it's already used in other chains.
2805 static bool
2806 isProfitableChain(IVChain &Chain, SmallPtrSetImpl<Instruction*> &Users,
2807 ScalarEvolution &SE, const TargetTransformInfo &TTI) {
2808 if (StressIVChain)
2809 return true;
2810
2811 if (!Chain.hasIncs())
2812 return false;
2813
2814 if (!Users.empty()) {
2815 LLVM_DEBUG(dbgs() << "Chain: " << *Chain.Incs[0].UserInst << " users:\n";
2816 for (Instruction *Inst
2817 : Users) { dbgs() << " " << *Inst << "\n"; });
2818 return false;
2819 }
2820 assert(!Chain.Incs.empty() && "empty IV chains are not allowed");
2821
2822 // The chain itself may require a register, so initialize cost to 1.
2823 int cost = 1;
2824
2825 // A complete chain likely eliminates the need for keeping the original IV in
2826 // a register. LSR does not currently know how to form a complete chain unless
2827 // the header phi already exists.
2828 if (isa<PHINode>(Chain.tailUserInst())
2829 && SE.getSCEV(Chain.tailUserInst()) == Chain.Incs[0].IncExpr) {
2830 --cost;
2831 }
2832 const SCEV *LastIncExpr = nullptr;
2833 unsigned NumConstIncrements = 0;
2834 unsigned NumVarIncrements = 0;
2835 unsigned NumReusedIncrements = 0;
2836 for (const IVInc &Inc : Chain) {
2837 if (Inc.IncExpr->isZero())
2838 continue;
2839
2840 // Incrementing by zero or some constant is neutral. We assume constants can
2841 // be folded into an addressing mode or an add's immediate operand.
2842 if (isa<SCEVConstant>(Inc.IncExpr)) {
2843 ++NumConstIncrements;
2844 continue;
2845 }
2846
2847 if (Inc.IncExpr == LastIncExpr)
2848 ++NumReusedIncrements;
2849 else
2850 ++NumVarIncrements;
2851
2852 LastIncExpr = Inc.IncExpr;
2853 }
2854 // An IV chain with a single increment is handled by LSR's postinc
2855 // uses. However, a chain with multiple increments requires keeping the IV's
2856 // value live longer than it needs to be if chained.
2857 if (NumConstIncrements > 1)
2858 --cost;
2859
2860 // Materializing increment expressions in the preheader that didn't exist in
2861 // the original code may cost a register.
For example, sign-extended array 2862 // indices can produce ridiculous increments like this: 2863 // IV + ((sext i32 (2 * %s) to i64) + (-1 * (sext i32 %s to i64))) 2864 cost += NumVarIncrements; 2865 2866 // Reusing variable increments likely saves a register to hold the multiple of 2867 // the stride. 2868 cost -= NumReusedIncrements; 2869 2870 LLVM_DEBUG(dbgs() << "Chain: " << *Chain.Incs[0].UserInst << " Cost: " << cost 2871 << "\n"); 2872 2873 return cost < 0; 2874 } 2875 2876 /// Add this IV user to an existing chain or make it the head of a new chain. 2877 void LSRInstance::ChainInstruction(Instruction *UserInst, Instruction *IVOper, 2878 SmallVectorImpl<ChainUsers> &ChainUsersVec) { 2879 // When IVs are used as types of varying widths, they are generally converted 2880 // to a wider type with some uses remaining narrow under a (free) trunc. 2881 Value *const NextIV = getWideOperand(IVOper); 2882 const SCEV *const OperExpr = SE.getSCEV(NextIV); 2883 const SCEV *const OperExprBase = getExprBase(OperExpr); 2884 2885 // Visit all existing chains. Check if its IVOper can be computed as a 2886 // profitable loop invariant increment from the last link in the Chain. 2887 unsigned ChainIdx = 0, NChains = IVChainVec.size(); 2888 const SCEV *LastIncExpr = nullptr; 2889 for (; ChainIdx < NChains; ++ChainIdx) { 2890 IVChain &Chain = IVChainVec[ChainIdx]; 2891 2892 // Prune the solution space aggressively by checking that both IV operands 2893 // are expressions that operate on the same unscaled SCEVUnknown. This 2894 // "base" will be canceled by the subsequent getMinusSCEV call. Checking 2895 // first avoids creating extra SCEV expressions. 2896 if (!StressIVChain && Chain.ExprBase != OperExprBase) 2897 continue; 2898 2899 Value *PrevIV = getWideOperand(Chain.Incs.back().IVOperand); 2900 if (!isCompatibleIVType(PrevIV, NextIV)) 2901 continue; 2902 2903 // A phi node terminates a chain. 2904 if (isa<PHINode>(UserInst) && isa<PHINode>(Chain.tailUserInst())) 2905 continue; 2906 2907 // The increment must be loop-invariant so it can be kept in a register. 2908 const SCEV *PrevExpr = SE.getSCEV(PrevIV); 2909 const SCEV *IncExpr = SE.getMinusSCEV(OperExpr, PrevExpr); 2910 if (!SE.isLoopInvariant(IncExpr, L)) 2911 continue; 2912 2913 if (Chain.isProfitableIncrement(OperExpr, IncExpr, SE)) { 2914 LastIncExpr = IncExpr; 2915 break; 2916 } 2917 } 2918 // If we haven't found a chain, create a new one, unless we hit the max. Don't 2919 // bother for phi nodes, because they must be last in the chain. 2920 if (ChainIdx == NChains) { 2921 if (isa<PHINode>(UserInst)) 2922 return; 2923 if (NChains >= MaxChains && !StressIVChain) { 2924 LLVM_DEBUG(dbgs() << "IV Chain Limit\n"); 2925 return; 2926 } 2927 LastIncExpr = OperExpr; 2928 // IVUsers may have skipped over sign/zero extensions. We don't currently 2929 // attempt to form chains involving extensions unless they can be hoisted 2930 // into this loop's AddRec. 2931 if (!isa<SCEVAddRecExpr>(LastIncExpr)) 2932 return; 2933 ++NChains; 2934 IVChainVec.push_back(IVChain(IVInc(UserInst, IVOper, LastIncExpr), 2935 OperExprBase)); 2936 ChainUsersVec.resize(NChains); 2937 LLVM_DEBUG(dbgs() << "IV Chain#" << ChainIdx << " Head: (" << *UserInst 2938 << ") IV=" << *LastIncExpr << "\n"); 2939 } else { 2940 LLVM_DEBUG(dbgs() << "IV Chain#" << ChainIdx << " Inc: (" << *UserInst 2941 << ") IV+" << *LastIncExpr << "\n"); 2942 // Add this IV user to the end of the chain. 
2943 IVChainVec[ChainIdx].add(IVInc(UserInst, IVOper, LastIncExpr));
2944 }
2945 IVChain &Chain = IVChainVec[ChainIdx];
2946
2947 SmallPtrSet<Instruction*,4> &NearUsers = ChainUsersVec[ChainIdx].NearUsers;
2948 // This chain's NearUsers become FarUsers.
2949 if (!LastIncExpr->isZero()) {
2950 ChainUsersVec[ChainIdx].FarUsers.insert(NearUsers.begin(),
2951 NearUsers.end());
2952 NearUsers.clear();
2953 }
2954
2955 // All other uses of IVOperand become near uses of the chain.
2956 // We currently ignore intermediate values within SCEV expressions, assuming
2957 // they will eventually be used by the current chain, or can be computed
2958 // from one of the chain increments. To be more precise we could
2959 // transitively follow its users and only add leaf IV users to the set.
2960 for (User *U : IVOper->users()) {
2961 Instruction *OtherUse = dyn_cast<Instruction>(U);
2962 if (!OtherUse)
2963 continue;
2964 // Uses in the chain will no longer be uses if the chain is formed.
2965 // Include the head of the chain in this iteration (not Chain.begin()).
2966 IVChain::const_iterator IncIter = Chain.Incs.begin();
2967 IVChain::const_iterator IncEnd = Chain.Incs.end();
2968 for( ; IncIter != IncEnd; ++IncIter) {
2969 if (IncIter->UserInst == OtherUse)
2970 break;
2971 }
2972 if (IncIter != IncEnd)
2973 continue;
2974
2975 if (SE.isSCEVable(OtherUse->getType())
2976 && !isa<SCEVUnknown>(SE.getSCEV(OtherUse))
2977 && IU.isIVUserOrOperand(OtherUse)) {
2978 continue;
2979 }
2980 NearUsers.insert(OtherUse);
2981 }
2982
2983 // Since this user is part of the chain, it's no longer considered a use
2984 // of the chain.
2985 ChainUsersVec[ChainIdx].FarUsers.erase(UserInst);
2986 }
2987
2988 /// Populate the vector of Chains.
2989 ///
2990 /// This decreases ILP at the architecture level. Targets with ample registers,
2991 /// multiple memory ports, and no register renaming probably don't want
2992 /// this. However, such targets should probably disable LSR altogether.
2993 ///
2994 /// The job of LSR is to make a reasonable choice of induction variables across
2995 /// the loop. Subsequent passes can easily "unchain" computation exposing more
2996 /// ILP *within the loop* if the target wants it.
2997 ///
2998 /// Finding the best IV chain is potentially a scheduling problem. Since LSR
2999 /// will not reorder memory operations, it will recognize this as a chain, but
3000 /// will generate redundant IV increments. Ideally this would be corrected later
3001 /// by a smart scheduler:
3002 /// = A[i]
3003 /// = A[i+x]
3004 /// A[i] =
3005 /// A[i+x] =
3006 ///
3007 /// TODO: Walk the entire domtree within this loop, not just the path to the
3008 /// loop latch. This will discover chains on side paths, but requires
3009 /// maintaining multiple copies of the Chains state.
3010 void LSRInstance::CollectChains() {
3011 LLVM_DEBUG(dbgs() << "Collecting IV Chains.\n");
3012 SmallVector<ChainUsers, 8> ChainUsersVec;
3013
3014 SmallVector<BasicBlock *,8> LatchPath;
3015 BasicBlock *LoopHeader = L->getHeader();
3016 for (DomTreeNode *Rung = DT.getNode(L->getLoopLatch());
3017 Rung->getBlock() != LoopHeader; Rung = Rung->getIDom()) {
3018 LatchPath.push_back(Rung->getBlock());
3019 }
3020 LatchPath.push_back(LoopHeader);
3021
3022 // Walk the instruction stream from the loop header to the loop latch.
3023 for (BasicBlock *BB : reverse(LatchPath)) {
3024 for (Instruction &I : *BB) {
3025 // Skip instructions that weren't seen by IVUsers analysis.
3026 if (isa<PHINode>(I) || !IU.isIVUserOrOperand(&I))
3027 continue;
3028
3029 // Ignore users that are part of a SCEV expression. This way we only
3030 // consider leaf IV Users. This effectively rediscovers a portion of
3031 // IVUsers analysis but in program order this time.
3032 if (SE.isSCEVable(I.getType()) && !isa<SCEVUnknown>(SE.getSCEV(&I)))
3033 continue;
3034
3035 // Remove this instruction from any NearUsers set it may be in.
3036 for (unsigned ChainIdx = 0, NChains = IVChainVec.size();
3037 ChainIdx < NChains; ++ChainIdx) {
3038 ChainUsersVec[ChainIdx].NearUsers.erase(&I);
3039 }
3040 // Search for operands that can be chained.
3041 SmallPtrSet<Instruction*, 4> UniqueOperands;
3042 User::op_iterator IVOpEnd = I.op_end();
3043 User::op_iterator IVOpIter = findIVOperand(I.op_begin(), IVOpEnd, L, SE);
3044 while (IVOpIter != IVOpEnd) {
3045 Instruction *IVOpInst = cast<Instruction>(*IVOpIter);
3046 if (UniqueOperands.insert(IVOpInst).second)
3047 ChainInstruction(&I, IVOpInst, ChainUsersVec);
3048 IVOpIter = findIVOperand(std::next(IVOpIter), IVOpEnd, L, SE);
3049 }
3050 } // Continue walking down the instructions.
3051 } // Continue walking down the domtree.
3052 // Visit phi backedges to determine if the chain can generate the IV postinc.
3053 for (PHINode &PN : L->getHeader()->phis()) {
3054 if (!SE.isSCEVable(PN.getType()))
3055 continue;
3056
3057 Instruction *IncV =
3058 dyn_cast<Instruction>(PN.getIncomingValueForBlock(L->getLoopLatch()));
3059 if (IncV)
3060 ChainInstruction(&PN, IncV, ChainUsersVec);
3061 }
3062 // Remove any unprofitable chains.
3063 unsigned ChainIdx = 0;
3064 for (unsigned UsersIdx = 0, NChains = IVChainVec.size();
3065 UsersIdx < NChains; ++UsersIdx) {
3066 if (!isProfitableChain(IVChainVec[UsersIdx],
3067 ChainUsersVec[UsersIdx].FarUsers, SE, TTI))
3068 continue;
3069 // Preserve the chain at UsersIdx.
3070 if (ChainIdx != UsersIdx)
3071 IVChainVec[ChainIdx] = IVChainVec[UsersIdx];
3072 FinalizeChain(IVChainVec[ChainIdx]);
3073 ++ChainIdx;
3074 }
3075 IVChainVec.resize(ChainIdx);
3076 }
3077
3078 void LSRInstance::FinalizeChain(IVChain &Chain) {
3079 assert(!Chain.Incs.empty() && "empty IV chains are not allowed");
3080 LLVM_DEBUG(dbgs() << "Final Chain: " << *Chain.Incs[0].UserInst << "\n");
3081
3082 for (const IVInc &Inc : Chain) {
3083 LLVM_DEBUG(dbgs() << " Inc: " << *Inc.UserInst << "\n");
3084 auto UseI = find(Inc.UserInst->operands(), Inc.IVOperand);
3085 assert(UseI != Inc.UserInst->op_end() && "cannot find IV operand");
3086 IVIncSet.insert(UseI);
3087 }
3088 }
3089
3090 /// Return true if the IVInc can be folded into an addressing mode.
3091 static bool canFoldIVIncExpr(const SCEV *IncExpr, Instruction *UserInst,
3092 Value *Operand, const TargetTransformInfo &TTI) {
3093 const SCEVConstant *IncConst = dyn_cast<SCEVConstant>(IncExpr);
3094 if (!IncConst || !isAddressUse(TTI, UserInst, Operand))
3095 return false;
3096
3097 if (IncConst->getAPInt().getMinSignedBits() > 64)
3098 return false;
3099
3100 MemAccessTy AccessTy = getAccessType(TTI, UserInst, Operand);
3101 int64_t IncOffset = IncConst->getValue()->getSExtValue();
3102 if (!isAlwaysFoldable(TTI, LSRUse::Address, AccessTy, /*BaseGV=*/nullptr,
3103 IncOffset, /*HasBaseReg=*/false))
3104 return false;
3105
3106 return true;
3107 }
3108
3109 /// Generate an add or subtract for each IVInc in a chain to materialize the IV
3110 /// user's operand from the previous IV user's operand.
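/// In other words, once a register holding the chain's current value is found,
/// each subsequent user is rewritten as that value plus the accumulated
/// increment expressions, folding constant increments into addressing modes
/// where the target allows it.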
3111 void LSRInstance::GenerateIVChain(const IVChain &Chain, SCEVExpander &Rewriter, 3112 SmallVectorImpl<WeakTrackingVH> &DeadInsts) { 3113 // Find the new IVOperand for the head of the chain. It may have been replaced 3114 // by LSR. 3115 const IVInc &Head = Chain.Incs[0]; 3116 User::op_iterator IVOpEnd = Head.UserInst->op_end(); 3117 // findIVOperand returns IVOpEnd if it can no longer find a valid IV user. 3118 User::op_iterator IVOpIter = findIVOperand(Head.UserInst->op_begin(), 3119 IVOpEnd, L, SE); 3120 Value *IVSrc = nullptr; 3121 while (IVOpIter != IVOpEnd) { 3122 IVSrc = getWideOperand(*IVOpIter); 3123 3124 // If this operand computes the expression that the chain needs, we may use 3125 // it. (Check this after setting IVSrc which is used below.) 3126 // 3127 // Note that if Head.IncExpr is wider than IVSrc, then this phi is too 3128 // narrow for the chain, so we can no longer use it. We do allow using a 3129 // wider phi, assuming the LSR checked for free truncation. In that case we 3130 // should already have a truncate on this operand such that 3131 // getSCEV(IVSrc) == IncExpr. 3132 if (SE.getSCEV(*IVOpIter) == Head.IncExpr 3133 || SE.getSCEV(IVSrc) == Head.IncExpr) { 3134 break; 3135 } 3136 IVOpIter = findIVOperand(std::next(IVOpIter), IVOpEnd, L, SE); 3137 } 3138 if (IVOpIter == IVOpEnd) { 3139 // Gracefully give up on this chain. 3140 LLVM_DEBUG(dbgs() << "Concealed chain head: " << *Head.UserInst << "\n"); 3141 return; 3142 } 3143 3144 LLVM_DEBUG(dbgs() << "Generate chain at: " << *IVSrc << "\n"); 3145 Type *IVTy = IVSrc->getType(); 3146 Type *IntTy = SE.getEffectiveSCEVType(IVTy); 3147 const SCEV *LeftOverExpr = nullptr; 3148 for (const IVInc &Inc : Chain) { 3149 Instruction *InsertPt = Inc.UserInst; 3150 if (isa<PHINode>(InsertPt)) 3151 InsertPt = L->getLoopLatch()->getTerminator(); 3152 3153 // IVOper will replace the current IV User's operand. IVSrc is the IV 3154 // value currently held in a register. 3155 Value *IVOper = IVSrc; 3156 if (!Inc.IncExpr->isZero()) { 3157 // IncExpr was the result of subtraction of two narrow values, so must 3158 // be signed. 3159 const SCEV *IncExpr = SE.getNoopOrSignExtend(Inc.IncExpr, IntTy); 3160 LeftOverExpr = LeftOverExpr ? 3161 SE.getAddExpr(LeftOverExpr, IncExpr) : IncExpr; 3162 } 3163 if (LeftOverExpr && !LeftOverExpr->isZero()) { 3164 // Expand the IV increment. 3165 Rewriter.clearPostInc(); 3166 Value *IncV = Rewriter.expandCodeFor(LeftOverExpr, IntTy, InsertPt); 3167 const SCEV *IVOperExpr = SE.getAddExpr(SE.getUnknown(IVSrc), 3168 SE.getUnknown(IncV)); 3169 IVOper = Rewriter.expandCodeFor(IVOperExpr, IVTy, InsertPt); 3170 3171 // If an IV increment can't be folded, use it as the next IV value. 3172 if (!canFoldIVIncExpr(LeftOverExpr, Inc.UserInst, Inc.IVOperand, TTI)) { 3173 assert(IVTy == IVOper->getType() && "inconsistent IV increment type"); 3174 IVSrc = IVOper; 3175 LeftOverExpr = nullptr; 3176 } 3177 } 3178 Type *OperTy = Inc.IVOperand->getType(); 3179 if (IVTy != OperTy) { 3180 assert(SE.getTypeSizeInBits(IVTy) >= SE.getTypeSizeInBits(OperTy) && 3181 "cannot extend a chained IV"); 3182 IRBuilder<> Builder(InsertPt); 3183 IVOper = Builder.CreateTruncOrBitCast(IVOper, OperTy, "lsr.chain"); 3184 } 3185 Inc.UserInst->replaceUsesOfWith(Inc.IVOperand, IVOper); 3186 DeadInsts.emplace_back(Inc.IVOperand); 3187 } 3188 // If LSR created a new, wider phi, we may also replace its postinc. We only 3189 // do this if we also found a wide value for the head of the chain. 
3190 if (isa<PHINode>(Chain.tailUserInst())) { 3191 for (PHINode &Phi : L->getHeader()->phis()) { 3192 if (!isCompatibleIVType(&Phi, IVSrc)) 3193 continue; 3194 Instruction *PostIncV = dyn_cast<Instruction>( 3195 Phi.getIncomingValueForBlock(L->getLoopLatch())); 3196 if (!PostIncV || (SE.getSCEV(PostIncV) != SE.getSCEV(IVSrc))) 3197 continue; 3198 Value *IVOper = IVSrc; 3199 Type *PostIncTy = PostIncV->getType(); 3200 if (IVTy != PostIncTy) { 3201 assert(PostIncTy->isPointerTy() && "mixing int/ptr IV types"); 3202 IRBuilder<> Builder(L->getLoopLatch()->getTerminator()); 3203 Builder.SetCurrentDebugLocation(PostIncV->getDebugLoc()); 3204 IVOper = Builder.CreatePointerCast(IVSrc, PostIncTy, "lsr.chain"); 3205 } 3206 Phi.replaceUsesOfWith(PostIncV, IVOper); 3207 DeadInsts.emplace_back(PostIncV); 3208 } 3209 } 3210 } 3211 3212 void LSRInstance::CollectFixupsAndInitialFormulae() { 3213 for (const IVStrideUse &U : IU) { 3214 Instruction *UserInst = U.getUser(); 3215 // Skip IV users that are part of profitable IV Chains. 3216 User::op_iterator UseI = 3217 find(UserInst->operands(), U.getOperandValToReplace()); 3218 assert(UseI != UserInst->op_end() && "cannot find IV operand"); 3219 if (IVIncSet.count(UseI)) { 3220 LLVM_DEBUG(dbgs() << "Use is in profitable chain: " << **UseI << '\n'); 3221 continue; 3222 } 3223 3224 LSRUse::KindType Kind = LSRUse::Basic; 3225 MemAccessTy AccessTy; 3226 if (isAddressUse(TTI, UserInst, U.getOperandValToReplace())) { 3227 Kind = LSRUse::Address; 3228 AccessTy = getAccessType(TTI, UserInst, U.getOperandValToReplace()); 3229 } 3230 3231 const SCEV *S = IU.getExpr(U); 3232 PostIncLoopSet TmpPostIncLoops = U.getPostIncLoops(); 3233 3234 // Equality (== and !=) ICmps are special. We can rewrite (i == N) as 3235 // (N - i == 0), and this allows (N - i) to be the expression that we work 3236 // with rather than just N or i, so we can consider the register 3237 // requirements for both N and i at the same time. Limiting this code to 3238 // equality icmps is not a problem because all interesting loops use 3239 // equality icmps, thanks to IndVarSimplify. 3240 if (ICmpInst *CI = dyn_cast<ICmpInst>(UserInst)) 3241 if (CI->isEquality()) { 3242 // Swap the operands if needed to put the OperandValToReplace on the 3243 // left, for consistency. 3244 Value *NV = CI->getOperand(1); 3245 if (NV == U.getOperandValToReplace()) { 3246 CI->setOperand(1, CI->getOperand(0)); 3247 CI->setOperand(0, NV); 3248 NV = CI->getOperand(1); 3249 Changed = true; 3250 } 3251 3252 // x == y --> x - y == 0 3253 const SCEV *N = SE.getSCEV(NV); 3254 if (SE.isLoopInvariant(N, L) && isSafeToExpand(N, SE)) { 3255 // S is normalized, so normalize N before folding it into S 3256 // to keep the result normalized. 3257 N = normalizeForPostIncUse(N, TmpPostIncLoops, SE); 3258 Kind = LSRUse::ICmpZero; 3259 S = SE.getMinusSCEV(N, S); 3260 } 3261 3262 // -1 and the negations of all interesting strides (except the negation 3263 // of -1) are now also interesting. 3264 for (size_t i = 0, e = Factors.size(); i != e; ++i) 3265 if (Factors[i] != -1) 3266 Factors.insert(-(uint64_t)Factors[i]); 3267 Factors.insert(-1); 3268 } 3269 3270 // Get or create an LSRUse. 3271 std::pair<size_t, int64_t> P = getUse(S, Kind, AccessTy); 3272 size_t LUIdx = P.first; 3273 int64_t Offset = P.second; 3274 LSRUse &LU = Uses[LUIdx]; 3275 3276 // Record the fixup. 
3277 LSRFixup &LF = LU.getNewFixup(); 3278 LF.UserInst = UserInst; 3279 LF.OperandValToReplace = U.getOperandValToReplace(); 3280 LF.PostIncLoops = TmpPostIncLoops; 3281 LF.Offset = Offset; 3282 LU.AllFixupsOutsideLoop &= LF.isUseFullyOutsideLoop(L); 3283 3284 if (!LU.WidestFixupType || 3285 SE.getTypeSizeInBits(LU.WidestFixupType) < 3286 SE.getTypeSizeInBits(LF.OperandValToReplace->getType())) 3287 LU.WidestFixupType = LF.OperandValToReplace->getType(); 3288 3289 // If this is the first use of this LSRUse, give it a formula. 3290 if (LU.Formulae.empty()) { 3291 InsertInitialFormula(S, LU, LUIdx); 3292 CountRegisters(LU.Formulae.back(), LUIdx); 3293 } 3294 } 3295 3296 LLVM_DEBUG(print_fixups(dbgs())); 3297 } 3298 3299 /// Insert a formula for the given expression into the given use, separating out 3300 /// loop-variant portions from loop-invariant and loop-computable portions. 3301 void 3302 LSRInstance::InsertInitialFormula(const SCEV *S, LSRUse &LU, size_t LUIdx) { 3303 // Mark uses whose expressions cannot be expanded. 3304 if (!isSafeToExpand(S, SE)) 3305 LU.RigidFormula = true; 3306 3307 Formula F; 3308 F.initialMatch(S, L, SE); 3309 bool Inserted = InsertFormula(LU, LUIdx, F); 3310 assert(Inserted && "Initial formula already exists!"); (void)Inserted; 3311 } 3312 3313 /// Insert a simple single-register formula for the given expression into the 3314 /// given use. 3315 void 3316 LSRInstance::InsertSupplementalFormula(const SCEV *S, 3317 LSRUse &LU, size_t LUIdx) { 3318 Formula F; 3319 F.BaseRegs.push_back(S); 3320 F.HasBaseReg = true; 3321 bool Inserted = InsertFormula(LU, LUIdx, F); 3322 assert(Inserted && "Supplemental formula already exists!"); (void)Inserted; 3323 } 3324 3325 /// Note which registers are used by the given formula, updating RegUses. 3326 void LSRInstance::CountRegisters(const Formula &F, size_t LUIdx) { 3327 if (F.ScaledReg) 3328 RegUses.countRegister(F.ScaledReg, LUIdx); 3329 for (const SCEV *BaseReg : F.BaseRegs) 3330 RegUses.countRegister(BaseReg, LUIdx); 3331 } 3332 3333 /// If the given formula has not yet been inserted, add it to the list, and 3334 /// return true. Return false otherwise. 3335 bool LSRInstance::InsertFormula(LSRUse &LU, unsigned LUIdx, const Formula &F) { 3336 // Do not insert formula that we will not be able to expand. 3337 assert(isLegalUse(TTI, LU.MinOffset, LU.MaxOffset, LU.Kind, LU.AccessTy, F) && 3338 "Formula is illegal"); 3339 3340 if (!LU.InsertFormula(F, *L)) 3341 return false; 3342 3343 CountRegisters(F, LUIdx); 3344 return true; 3345 } 3346 3347 /// Check for other uses of loop-invariant values which we're tracking. These 3348 /// other uses will pin these values in registers, making them less profitable 3349 /// for elimination. 3350 /// TODO: This currently misses non-constant addrec step registers. 3351 /// TODO: Should this give more weight to users inside the loop? 
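///
/// For illustration (hypothetical IR): if a loop-invariant %n feeds both the
/// loop's exit compare and an instruction after the loop, the outside use
/// keeps %n live in a register anyway, so formulae that reuse that register
/// can be preferable to formulae that fold %n away entirely.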
3352 void 3353 LSRInstance::CollectLoopInvariantFixupsAndFormulae() { 3354 SmallVector<const SCEV *, 8> Worklist(RegUses.begin(), RegUses.end()); 3355 SmallPtrSet<const SCEV *, 32> Visited; 3356 3357 while (!Worklist.empty()) { 3358 const SCEV *S = Worklist.pop_back_val(); 3359 3360 // Don't process the same SCEV twice 3361 if (!Visited.insert(S).second) 3362 continue; 3363 3364 if (const SCEVNAryExpr *N = dyn_cast<SCEVNAryExpr>(S)) 3365 Worklist.append(N->op_begin(), N->op_end()); 3366 else if (const SCEVCastExpr *C = dyn_cast<SCEVCastExpr>(S)) 3367 Worklist.push_back(C->getOperand()); 3368 else if (const SCEVUDivExpr *D = dyn_cast<SCEVUDivExpr>(S)) { 3369 Worklist.push_back(D->getLHS()); 3370 Worklist.push_back(D->getRHS()); 3371 } else if (const SCEVUnknown *US = dyn_cast<SCEVUnknown>(S)) { 3372 const Value *V = US->getValue(); 3373 if (const Instruction *Inst = dyn_cast<Instruction>(V)) { 3374 // Look for instructions defined outside the loop. 3375 if (L->contains(Inst)) continue; 3376 } else if (isa<UndefValue>(V)) 3377 // Undef doesn't have a live range, so it doesn't matter. 3378 continue; 3379 for (const Use &U : V->uses()) { 3380 const Instruction *UserInst = dyn_cast<Instruction>(U.getUser()); 3381 // Ignore non-instructions. 3382 if (!UserInst) 3383 continue; 3384 // Ignore instructions in other functions (as can happen with 3385 // Constants). 3386 if (UserInst->getParent()->getParent() != L->getHeader()->getParent()) 3387 continue; 3388 // Ignore instructions not dominated by the loop. 3389 const BasicBlock *UseBB = !isa<PHINode>(UserInst) ? 3390 UserInst->getParent() : 3391 cast<PHINode>(UserInst)->getIncomingBlock( 3392 PHINode::getIncomingValueNumForOperand(U.getOperandNo())); 3393 if (!DT.dominates(L->getHeader(), UseBB)) 3394 continue; 3395 // Don't bother if the instruction is in a BB which ends in an EHPad. 3396 if (UseBB->getTerminator()->isEHPad()) 3397 continue; 3398 // Don't bother rewriting PHIs in catchswitch blocks. 3399 if (isa<CatchSwitchInst>(UserInst->getParent()->getTerminator())) 3400 continue; 3401 // Ignore uses which are part of other SCEV expressions, to avoid 3402 // analyzing them multiple times. 3403 if (SE.isSCEVable(UserInst->getType())) { 3404 const SCEV *UserS = SE.getSCEV(const_cast<Instruction *>(UserInst)); 3405 // If the user is a no-op, look through to its uses. 3406 if (!isa<SCEVUnknown>(UserS)) 3407 continue; 3408 if (UserS == US) { 3409 Worklist.push_back( 3410 SE.getUnknown(const_cast<Instruction *>(UserInst))); 3411 continue; 3412 } 3413 } 3414 // Ignore icmp instructions which are already being analyzed. 
3415 if (const ICmpInst *ICI = dyn_cast<ICmpInst>(UserInst)) { 3416 unsigned OtherIdx = !U.getOperandNo(); 3417 Value *OtherOp = const_cast<Value *>(ICI->getOperand(OtherIdx)); 3418 if (SE.hasComputableLoopEvolution(SE.getSCEV(OtherOp), L)) 3419 continue; 3420 } 3421 3422 std::pair<size_t, int64_t> P = getUse( 3423 S, LSRUse::Basic, MemAccessTy()); 3424 size_t LUIdx = P.first; 3425 int64_t Offset = P.second; 3426 LSRUse &LU = Uses[LUIdx]; 3427 LSRFixup &LF = LU.getNewFixup(); 3428 LF.UserInst = const_cast<Instruction *>(UserInst); 3429 LF.OperandValToReplace = U; 3430 LF.Offset = Offset; 3431 LU.AllFixupsOutsideLoop &= LF.isUseFullyOutsideLoop(L); 3432 if (!LU.WidestFixupType || 3433 SE.getTypeSizeInBits(LU.WidestFixupType) < 3434 SE.getTypeSizeInBits(LF.OperandValToReplace->getType())) 3435 LU.WidestFixupType = LF.OperandValToReplace->getType(); 3436 InsertSupplementalFormula(US, LU, LUIdx); 3437 CountRegisters(LU.Formulae.back(), Uses.size() - 1); 3438 break; 3439 } 3440 } 3441 } 3442 } 3443 3444 /// Split S into subexpressions which can be pulled out into separate 3445 /// registers. If C is non-null, multiply each subexpression by C. 3446 /// 3447 /// Return remainder expression after factoring the subexpressions captured by 3448 /// Ops. If Ops is complete, return NULL. 3449 static const SCEV *CollectSubexprs(const SCEV *S, const SCEVConstant *C, 3450 SmallVectorImpl<const SCEV *> &Ops, 3451 const Loop *L, 3452 ScalarEvolution &SE, 3453 unsigned Depth = 0) { 3454 // Arbitrarily cap recursion to protect compile time. 3455 if (Depth >= 3) 3456 return S; 3457 3458 if (const SCEVAddExpr *Add = dyn_cast<SCEVAddExpr>(S)) { 3459 // Break out add operands. 3460 for (const SCEV *S : Add->operands()) { 3461 const SCEV *Remainder = CollectSubexprs(S, C, Ops, L, SE, Depth+1); 3462 if (Remainder) 3463 Ops.push_back(C ? SE.getMulExpr(C, Remainder) : Remainder); 3464 } 3465 return nullptr; 3466 } else if (const SCEVAddRecExpr *AR = dyn_cast<SCEVAddRecExpr>(S)) { 3467 // Split a non-zero base out of an addrec. 3468 if (AR->getStart()->isZero() || !AR->isAffine()) 3469 return S; 3470 3471 const SCEV *Remainder = CollectSubexprs(AR->getStart(), 3472 C, Ops, L, SE, Depth+1); 3473 // Split the non-zero AddRec unless it is part of a nested recurrence that 3474 // does not pertain to this loop. 3475 if (Remainder && (AR->getLoop() == L || !isa<SCEVAddRecExpr>(Remainder))) { 3476 Ops.push_back(C ? SE.getMulExpr(C, Remainder) : Remainder); 3477 Remainder = nullptr; 3478 } 3479 if (Remainder != AR->getStart()) { 3480 if (!Remainder) 3481 Remainder = SE.getConstant(AR->getType(), 0); 3482 return SE.getAddRecExpr(Remainder, 3483 AR->getStepRecurrence(SE), 3484 AR->getLoop(), 3485 //FIXME: AR->getNoWrapFlags(SCEV::FlagNW) 3486 SCEV::FlagAnyWrap); 3487 } 3488 } else if (const SCEVMulExpr *Mul = dyn_cast<SCEVMulExpr>(S)) { 3489 // Break (C * (a + b + c)) into C*a + C*b + C*c. 3490 if (Mul->getNumOperands() != 2) 3491 return S; 3492 if (const SCEVConstant *Op0 = 3493 dyn_cast<SCEVConstant>(Mul->getOperand(0))) { 3494 C = C ? cast<SCEVConstant>(SE.getMulExpr(C, Op0)) : Op0; 3495 const SCEV *Remainder = 3496 CollectSubexprs(Mul->getOperand(1), C, Ops, L, SE, Depth+1); 3497 if (Remainder) 3498 Ops.push_back(SE.getMulExpr(C, Remainder)); 3499 return nullptr; 3500 } 3501 } 3502 return S; 3503 } 3504 3505 /// Return true if the SCEV represents a value that may end up as a 3506 /// post-increment operation. 
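///
/// For illustration (hypothetical target): with post-indexed loads available,
/// an address use of {%base,+,4}<%L> can become a single "load, then advance
/// the address by 4" operation, so the addrec may be realized as a
/// post-increment access.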
3507 static bool mayUsePostIncMode(const TargetTransformInfo &TTI, 3508 LSRUse &LU, const SCEV *S, const Loop *L, 3509 ScalarEvolution &SE) { 3510 if (LU.Kind != LSRUse::Address || 3511 !LU.AccessTy.getType()->isIntOrIntVectorTy()) 3512 return false; 3513 const SCEVAddRecExpr *AR = dyn_cast<SCEVAddRecExpr>(S); 3514 if (!AR) 3515 return false; 3516 const SCEV *LoopStep = AR->getStepRecurrence(SE); 3517 if (!isa<SCEVConstant>(LoopStep)) 3518 return false; 3519 if (LU.AccessTy.getType()->getScalarSizeInBits() != 3520 LoopStep->getType()->getScalarSizeInBits()) 3521 return false; 3522 // Check if a post-indexed load/store can be used. 3523 if (TTI.isIndexedLoadLegal(TTI.MIM_PostInc, AR->getType()) || 3524 TTI.isIndexedStoreLegal(TTI.MIM_PostInc, AR->getType())) { 3525 const SCEV *LoopStart = AR->getStart(); 3526 if (!isa<SCEVConstant>(LoopStart) && SE.isLoopInvariant(LoopStart, L)) 3527 return true; 3528 } 3529 return false; 3530 } 3531 3532 /// Helper function for LSRInstance::GenerateReassociations. 3533 void LSRInstance::GenerateReassociationsImpl(LSRUse &LU, unsigned LUIdx, 3534 const Formula &Base, 3535 unsigned Depth, size_t Idx, 3536 bool IsScaledReg) { 3537 const SCEV *BaseReg = IsScaledReg ? Base.ScaledReg : Base.BaseRegs[Idx]; 3538 // Don't generate reassociations for the base register of a value that 3539 // may generate a post-increment operator. The reason is that the 3540 // reassociations cause extra base+register formula to be created, 3541 // and possibly chosen, but the post-increment is more efficient. 3542 if (TTI.shouldFavorPostInc() && mayUsePostIncMode(TTI, LU, BaseReg, L, SE)) 3543 return; 3544 SmallVector<const SCEV *, 8> AddOps; 3545 const SCEV *Remainder = CollectSubexprs(BaseReg, nullptr, AddOps, L, SE); 3546 if (Remainder) 3547 AddOps.push_back(Remainder); 3548 3549 if (AddOps.size() == 1) 3550 return; 3551 3552 for (SmallVectorImpl<const SCEV *>::const_iterator J = AddOps.begin(), 3553 JE = AddOps.end(); 3554 J != JE; ++J) { 3555 // Loop-variant "unknown" values are uninteresting; we won't be able to 3556 // do anything meaningful with them. 3557 if (isa<SCEVUnknown>(*J) && !SE.isLoopInvariant(*J, L)) 3558 continue; 3559 3560 // Don't pull a constant into a register if the constant could be folded 3561 // into an immediate field. 3562 if (isAlwaysFoldable(TTI, SE, LU.MinOffset, LU.MaxOffset, LU.Kind, 3563 LU.AccessTy, *J, Base.getNumRegs() > 1)) 3564 continue; 3565 3566 // Collect all operands except *J. 3567 SmallVector<const SCEV *, 8> InnerAddOps( 3568 ((const SmallVector<const SCEV *, 8> &)AddOps).begin(), J); 3569 InnerAddOps.append(std::next(J), 3570 ((const SmallVector<const SCEV *, 8> &)AddOps).end()); 3571 3572 // Don't leave just a constant behind in a register if the constant could 3573 // be folded into an immediate field. 3574 if (InnerAddOps.size() == 1 && 3575 isAlwaysFoldable(TTI, SE, LU.MinOffset, LU.MaxOffset, LU.Kind, 3576 LU.AccessTy, InnerAddOps[0], Base.getNumRegs() > 1)) 3577 continue; 3578 3579 const SCEV *InnerSum = SE.getAddExpr(InnerAddOps); 3580 if (InnerSum->isZero()) 3581 continue; 3582 Formula F = Base; 3583 3584 // Add the remaining pieces of the add back into the new formula. 
    const SCEVConstant *InnerSumSC = dyn_cast<SCEVConstant>(InnerSum);
    if (InnerSumSC && SE.getTypeSizeInBits(InnerSumSC->getType()) <= 64 &&
        TTI.isLegalAddImmediate((uint64_t)F.UnfoldedOffset +
                                InnerSumSC->getValue()->getZExtValue())) {
      F.UnfoldedOffset =
          (uint64_t)F.UnfoldedOffset + InnerSumSC->getValue()->getZExtValue();
      if (IsScaledReg)
        F.ScaledReg = nullptr;
      else
        F.BaseRegs.erase(F.BaseRegs.begin() + Idx);
    } else if (IsScaledReg)
      F.ScaledReg = InnerSum;
    else
      F.BaseRegs[Idx] = InnerSum;

    // Add J as its own register, or an unfolded immediate.
    const SCEVConstant *SC = dyn_cast<SCEVConstant>(*J);
    if (SC && SE.getTypeSizeInBits(SC->getType()) <= 64 &&
        TTI.isLegalAddImmediate((uint64_t)F.UnfoldedOffset +
                                SC->getValue()->getZExtValue()))
      F.UnfoldedOffset =
          (uint64_t)F.UnfoldedOffset + SC->getValue()->getZExtValue();
    else
      F.BaseRegs.push_back(*J);
    // We may have changed the number of registers in the base regs; adjust the
    // formula accordingly.
    F.canonicalize(*L);

    if (InsertFormula(LU, LUIdx, F))
      // If that formula hadn't been seen before, recurse to find more like
      // it.
      // Add Log16(AddOps.size()) (the same as Log2_32(AddOps.size()) >> 2) to
      // Depth, because Depth alone is not enough to bound compile time.
      // This means that every time AddOps.size() grows past 16^x, we add
      // another x to Depth.
      GenerateReassociations(LU, LUIdx, LU.Formulae.back(),
                             Depth + 1 + (Log2_32(AddOps.size()) >> 2));
  }
}

/// Split out subexpressions from adds and the bases of addrecs.
void LSRInstance::GenerateReassociations(LSRUse &LU, unsigned LUIdx,
                                         Formula Base, unsigned Depth) {
  assert(Base.isCanonical(*L) && "Input must be in the canonical form");
  // Arbitrarily cap recursion to protect compile time.
  if (Depth >= 3)
    return;

  for (size_t i = 0, e = Base.BaseRegs.size(); i != e; ++i)
    GenerateReassociationsImpl(LU, LUIdx, Base, Depth, i);

  if (Base.Scale == 1)
    GenerateReassociationsImpl(LU, LUIdx, Base, Depth,
                               /* Idx */ -1, /* IsScaledReg */ true);
}

/// Generate a formula consisting of all of the loop-dominating registers added
/// into a single register.
void LSRInstance::GenerateCombinations(LSRUse &LU, unsigned LUIdx,
                                       Formula Base) {
  // This method is only interesting on a plurality of registers.
  if (Base.BaseRegs.size() + (Base.Scale == 1) +
      (Base.UnfoldedOffset != 0) <= 1)
    return;

  // Flatten the representation, i.e., reg1 + 1*reg2 => reg1 + reg2, before
  // processing the formula.
  Base.unscale();
  SmallVector<const SCEV *, 4> Ops;
  Formula NewBase = Base;
  NewBase.BaseRegs.clear();
  Type *CombinedIntegerType = nullptr;
  for (const SCEV *BaseReg : Base.BaseRegs) {
    if (SE.properlyDominates(BaseReg, L->getHeader()) &&
        !SE.hasComputableLoopEvolution(BaseReg, L)) {
      if (!CombinedIntegerType)
        CombinedIntegerType = SE.getEffectiveSCEVType(BaseReg->getType());
      Ops.push_back(BaseReg);
    } else
      NewBase.BaseRegs.push_back(BaseReg);
  }

  // If no register is relevant, we're done.
  if (Ops.size() == 0)
    return;

  // Utility function for generating the required variants of the combined
  // registers.
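  // For illustration (hypothetical values): if Ops == {x, y} and
  // NewBase.UnfoldedOffset == 4, the calls below produce one formula that uses
  // a combined register for (x + y) and another that uses one for (x + y + 4).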
3674 auto GenerateFormula = [&](const SCEV *Sum) { 3675 Formula F = NewBase; 3676 3677 // TODO: If Sum is zero, it probably means ScalarEvolution missed an 3678 // opportunity to fold something. For now, just ignore such cases 3679 // rather than proceed with zero in a register. 3680 if (Sum->isZero()) 3681 return; 3682 3683 F.BaseRegs.push_back(Sum); 3684 F.canonicalize(*L); 3685 (void)InsertFormula(LU, LUIdx, F); 3686 }; 3687 3688 // If we collected at least two registers, generate a formula combining them. 3689 if (Ops.size() > 1) { 3690 SmallVector<const SCEV *, 4> OpsCopy(Ops); // Don't let SE modify Ops. 3691 GenerateFormula(SE.getAddExpr(OpsCopy)); 3692 } 3693 3694 // If we have an unfolded offset, generate a formula combining it with the 3695 // registers collected. 3696 if (NewBase.UnfoldedOffset) { 3697 assert(CombinedIntegerType && "Missing a type for the unfolded offset"); 3698 Ops.push_back(SE.getConstant(CombinedIntegerType, NewBase.UnfoldedOffset, 3699 true)); 3700 NewBase.UnfoldedOffset = 0; 3701 GenerateFormula(SE.getAddExpr(Ops)); 3702 } 3703 } 3704 3705 /// Helper function for LSRInstance::GenerateSymbolicOffsets. 3706 void LSRInstance::GenerateSymbolicOffsetsImpl(LSRUse &LU, unsigned LUIdx, 3707 const Formula &Base, size_t Idx, 3708 bool IsScaledReg) { 3709 const SCEV *G = IsScaledReg ? Base.ScaledReg : Base.BaseRegs[Idx]; 3710 GlobalValue *GV = ExtractSymbol(G, SE); 3711 if (G->isZero() || !GV) 3712 return; 3713 Formula F = Base; 3714 F.BaseGV = GV; 3715 if (!isLegalUse(TTI, LU.MinOffset, LU.MaxOffset, LU.Kind, LU.AccessTy, F)) 3716 return; 3717 if (IsScaledReg) 3718 F.ScaledReg = G; 3719 else 3720 F.BaseRegs[Idx] = G; 3721 (void)InsertFormula(LU, LUIdx, F); 3722 } 3723 3724 /// Generate reuse formulae using symbolic offsets. 3725 void LSRInstance::GenerateSymbolicOffsets(LSRUse &LU, unsigned LUIdx, 3726 Formula Base) { 3727 // We can't add a symbolic offset if the address already contains one. 3728 if (Base.BaseGV) return; 3729 3730 for (size_t i = 0, e = Base.BaseRegs.size(); i != e; ++i) 3731 GenerateSymbolicOffsetsImpl(LU, LUIdx, Base, i); 3732 if (Base.Scale == 1) 3733 GenerateSymbolicOffsetsImpl(LU, LUIdx, Base, /* Idx */ -1, 3734 /* IsScaledReg */ true); 3735 } 3736 3737 /// Helper function for LSRInstance::GenerateConstantOffsets. 3738 void LSRInstance::GenerateConstantOffsetsImpl( 3739 LSRUse &LU, unsigned LUIdx, const Formula &Base, 3740 const SmallVectorImpl<int64_t> &Worklist, size_t Idx, bool IsScaledReg) { 3741 const SCEV *G = IsScaledReg ? Base.ScaledReg : Base.BaseRegs[Idx]; 3742 for (int64_t Offset : Worklist) { 3743 Formula F = Base; 3744 F.BaseOffset = (uint64_t)Base.BaseOffset - Offset; 3745 if (isLegalUse(TTI, LU.MinOffset - Offset, LU.MaxOffset - Offset, LU.Kind, 3746 LU.AccessTy, F)) { 3747 // Add the offset to the base register. 3748 const SCEV *NewG = SE.getAddExpr(SE.getConstant(G->getType(), Offset), G); 3749 // If it cancelled out, drop the base register, otherwise update it. 
      if (NewG->isZero()) {
        if (IsScaledReg) {
          F.Scale = 0;
          F.ScaledReg = nullptr;
        } else
          F.deleteBaseReg(F.BaseRegs[Idx]);
        F.canonicalize(*L);
      } else if (IsScaledReg)
        F.ScaledReg = NewG;
      else
        F.BaseRegs[Idx] = NewG;

      (void)InsertFormula(LU, LUIdx, F);
    }
  }

  int64_t Imm = ExtractImmediate(G, SE);
  if (G->isZero() || Imm == 0)
    return;
  Formula F = Base;
  F.BaseOffset = (uint64_t)F.BaseOffset + Imm;
  if (!isLegalUse(TTI, LU.MinOffset, LU.MaxOffset, LU.Kind, LU.AccessTy, F))
    return;
  if (IsScaledReg)
    F.ScaledReg = G;
  else
    F.BaseRegs[Idx] = G;
  (void)InsertFormula(LU, LUIdx, F);
}

/// Generate reuse formulae using constant offsets.
void LSRInstance::GenerateConstantOffsets(LSRUse &LU, unsigned LUIdx,
                                          Formula Base) {
  // TODO: For now, just add the min and max offset, because it usually isn't
  // worthwhile looking at everything in between.
  SmallVector<int64_t, 2> Worklist;
  Worklist.push_back(LU.MinOffset);
  if (LU.MaxOffset != LU.MinOffset)
    Worklist.push_back(LU.MaxOffset);

  for (size_t i = 0, e = Base.BaseRegs.size(); i != e; ++i)
    GenerateConstantOffsetsImpl(LU, LUIdx, Base, Worklist, i);
  if (Base.Scale == 1)
    GenerateConstantOffsetsImpl(LU, LUIdx, Base, Worklist, /* Idx */ -1,
                                /* IsScaledReg */ true);
}

/// For ICmpZero, check to see if we can scale up the comparison. For example,
/// x == y -> x*c == y*c.
void LSRInstance::GenerateICmpZeroScales(LSRUse &LU, unsigned LUIdx,
                                         Formula Base) {
  if (LU.Kind != LSRUse::ICmpZero) return;

  // Determine the integer type for the base formula.
  Type *IntTy = Base.getType();
  if (!IntTy) return;
  if (SE.getTypeSizeInBits(IntTy) > 64) return;

  // Don't do this if there is more than one offset.
  if (LU.MinOffset != LU.MaxOffset) return;

  // Check if the transformation is valid. It is illegal to multiply a pointer.
  if (Base.ScaledReg && Base.ScaledReg->getType()->isPointerTy())
    return;
  for (const SCEV *BaseReg : Base.BaseRegs)
    if (BaseReg->getType()->isPointerTy())
      return;
  assert(!Base.BaseGV && "ICmpZero use is not legal!");

  // Check each interesting stride.
  for (int64_t Factor : Factors) {
    // Check that the multiplication doesn't overflow.
    if (Base.BaseOffset == std::numeric_limits<int64_t>::min() && Factor == -1)
      continue;
    int64_t NewBaseOffset = (uint64_t)Base.BaseOffset * Factor;
    if (NewBaseOffset / Factor != Base.BaseOffset)
      continue;
    // If the offset will be truncated at this use, check that it is in bounds.
    if (!IntTy->isPointerTy() &&
        !ConstantInt::isValueValidForType(IntTy, NewBaseOffset))
      continue;

    // Check that multiplying with the use offset doesn't overflow.
    int64_t Offset = LU.MinOffset;
    if (Offset == std::numeric_limits<int64_t>::min() && Factor == -1)
      continue;
    Offset = (uint64_t)Offset * Factor;
    if (Offset / Factor != LU.MinOffset)
      continue;
    // If the offset will be truncated at this use, check that it is in bounds.
    if (!IntTy->isPointerTy() &&
        !ConstantInt::isValueValidForType(IntTy, Offset))
      continue;

    Formula F = Base;
    F.BaseOffset = NewBaseOffset;

    // Check that this scale is legal.
3848 if (!isLegalUse(TTI, Offset, Offset, LU.Kind, LU.AccessTy, F)) 3849 continue; 3850 3851 // Compensate for the use having MinOffset built into it. 3852 F.BaseOffset = (uint64_t)F.BaseOffset + Offset - LU.MinOffset; 3853 3854 const SCEV *FactorS = SE.getConstant(IntTy, Factor); 3855 3856 // Check that multiplying with each base register doesn't overflow. 3857 for (size_t i = 0, e = F.BaseRegs.size(); i != e; ++i) { 3858 F.BaseRegs[i] = SE.getMulExpr(F.BaseRegs[i], FactorS); 3859 if (getExactSDiv(F.BaseRegs[i], FactorS, SE) != Base.BaseRegs[i]) 3860 goto next; 3861 } 3862 3863 // Check that multiplying with the scaled register doesn't overflow. 3864 if (F.ScaledReg) { 3865 F.ScaledReg = SE.getMulExpr(F.ScaledReg, FactorS); 3866 if (getExactSDiv(F.ScaledReg, FactorS, SE) != Base.ScaledReg) 3867 continue; 3868 } 3869 3870 // Check that multiplying with the unfolded offset doesn't overflow. 3871 if (F.UnfoldedOffset != 0) { 3872 if (F.UnfoldedOffset == std::numeric_limits<int64_t>::min() && 3873 Factor == -1) 3874 continue; 3875 F.UnfoldedOffset = (uint64_t)F.UnfoldedOffset * Factor; 3876 if (F.UnfoldedOffset / Factor != Base.UnfoldedOffset) 3877 continue; 3878 // If the offset will be truncated, check that it is in bounds. 3879 if (!IntTy->isPointerTy() && 3880 !ConstantInt::isValueValidForType(IntTy, F.UnfoldedOffset)) 3881 continue; 3882 } 3883 3884 // If we make it here and it's legal, add it. 3885 (void)InsertFormula(LU, LUIdx, F); 3886 next:; 3887 } 3888 } 3889 3890 /// Generate stride factor reuse formulae by making use of scaled-offset address 3891 /// modes, for example. 3892 void LSRInstance::GenerateScales(LSRUse &LU, unsigned LUIdx, Formula Base) { 3893 // Determine the integer type for the base formula. 3894 Type *IntTy = Base.getType(); 3895 if (!IntTy) return; 3896 3897 // If this Formula already has a scaled register, we can't add another one. 3898 // Try to unscale the formula to generate a better scale. 3899 if (Base.Scale != 0 && !Base.unscale()) 3900 return; 3901 3902 assert(Base.Scale == 0 && "unscale did not did its job!"); 3903 3904 // Check each interesting stride. 3905 for (int64_t Factor : Factors) { 3906 Base.Scale = Factor; 3907 Base.HasBaseReg = Base.BaseRegs.size() > 1; 3908 // Check whether this scale is going to be legal. 3909 if (!isLegalUse(TTI, LU.MinOffset, LU.MaxOffset, LU.Kind, LU.AccessTy, 3910 Base)) { 3911 // As a special-case, handle special out-of-loop Basic users specially. 3912 // TODO: Reconsider this special case. 3913 if (LU.Kind == LSRUse::Basic && 3914 isLegalUse(TTI, LU.MinOffset, LU.MaxOffset, LSRUse::Special, 3915 LU.AccessTy, Base) && 3916 LU.AllFixupsOutsideLoop) 3917 LU.Kind = LSRUse::Special; 3918 else 3919 continue; 3920 } 3921 // For an ICmpZero, negating a solitary base register won't lead to 3922 // new solutions. 3923 if (LU.Kind == LSRUse::ICmpZero && 3924 !Base.HasBaseReg && Base.BaseOffset == 0 && !Base.BaseGV) 3925 continue; 3926 // For each addrec base reg, if its loop is current loop, apply the scale. 3927 for (size_t i = 0, e = Base.BaseRegs.size(); i != e; ++i) { 3928 const SCEVAddRecExpr *AR = dyn_cast<SCEVAddRecExpr>(Base.BaseRegs[i]); 3929 if (AR && (AR->getLoop() == L || LU.AllFixupsOutsideLoop)) { 3930 const SCEV *FactorS = SE.getConstant(IntTy, Factor); 3931 if (FactorS->isZero()) 3932 continue; 3933 // Divide out the factor, ignoring high bits, since we'll be 3934 // scaling the value back up in the end. 
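        // For illustration (hypothetical values): with Factor == 4 and a base
        // reg of {0,+,8}<%L>, the quotient {0,+,2}<%L> becomes the ScaledReg,
        // so the formula models the same value as 4 * reg({0,+,2}<%L>).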
3935 if (const SCEV *Quotient = getExactSDiv(AR, FactorS, SE, true)) { 3936 // TODO: This could be optimized to avoid all the copying. 3937 Formula F = Base; 3938 F.ScaledReg = Quotient; 3939 F.deleteBaseReg(F.BaseRegs[i]); 3940 // The canonical representation of 1*reg is reg, which is already in 3941 // Base. In that case, do not try to insert the formula, it will be 3942 // rejected anyway. 3943 if (F.Scale == 1 && (F.BaseRegs.empty() || 3944 (AR->getLoop() != L && LU.AllFixupsOutsideLoop))) 3945 continue; 3946 // If AllFixupsOutsideLoop is true and F.Scale is 1, we may generate 3947 // non canonical Formula with ScaledReg's loop not being L. 3948 if (F.Scale == 1 && LU.AllFixupsOutsideLoop) 3949 F.canonicalize(*L); 3950 (void)InsertFormula(LU, LUIdx, F); 3951 } 3952 } 3953 } 3954 } 3955 } 3956 3957 /// Generate reuse formulae from different IV types. 3958 void LSRInstance::GenerateTruncates(LSRUse &LU, unsigned LUIdx, Formula Base) { 3959 // Don't bother truncating symbolic values. 3960 if (Base.BaseGV) return; 3961 3962 // Determine the integer type for the base formula. 3963 Type *DstTy = Base.getType(); 3964 if (!DstTy) return; 3965 DstTy = SE.getEffectiveSCEVType(DstTy); 3966 3967 for (Type *SrcTy : Types) { 3968 if (SrcTy != DstTy && TTI.isTruncateFree(SrcTy, DstTy)) { 3969 Formula F = Base; 3970 3971 if (F.ScaledReg) F.ScaledReg = SE.getAnyExtendExpr(F.ScaledReg, SrcTy); 3972 for (const SCEV *&BaseReg : F.BaseRegs) 3973 BaseReg = SE.getAnyExtendExpr(BaseReg, SrcTy); 3974 3975 // TODO: This assumes we've done basic processing on all uses and 3976 // have an idea what the register usage is. 3977 if (!F.hasRegsUsedByUsesOtherThan(LUIdx, RegUses)) 3978 continue; 3979 3980 F.canonicalize(*L); 3981 (void)InsertFormula(LU, LUIdx, F); 3982 } 3983 } 3984 } 3985 3986 namespace { 3987 3988 /// Helper class for GenerateCrossUseConstantOffsets. It's used to defer 3989 /// modifications so that the search phase doesn't have to worry about the data 3990 /// structures moving underneath it. 3991 struct WorkItem { 3992 size_t LUIdx; 3993 int64_t Imm; 3994 const SCEV *OrigReg; 3995 3996 WorkItem(size_t LI, int64_t I, const SCEV *R) 3997 : LUIdx(LI), Imm(I), OrigReg(R) {} 3998 3999 void print(raw_ostream &OS) const; 4000 void dump() const; 4001 }; 4002 4003 } // end anonymous namespace 4004 4005 #if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP) 4006 void WorkItem::print(raw_ostream &OS) const { 4007 OS << "in formulae referencing " << *OrigReg << " in use " << LUIdx 4008 << " , add offset " << Imm; 4009 } 4010 4011 LLVM_DUMP_METHOD void WorkItem::dump() const { 4012 print(errs()); errs() << '\n'; 4013 } 4014 #endif 4015 4016 /// Look for registers which are a constant distance apart and try to form reuse 4017 /// opportunities between them. 4018 void LSRInstance::GenerateCrossUseConstantOffsets() { 4019 // Group the registers by their value without any added constant offset. 4020 using ImmMapTy = std::map<int64_t, const SCEV *>; 4021 4022 DenseMap<const SCEV *, ImmMapTy> Map; 4023 DenseMap<const SCEV *, SmallBitVector> UsedByIndicesMap; 4024 SmallVector<const SCEV *, 8> Sequence; 4025 for (const SCEV *Use : RegUses) { 4026 const SCEV *Reg = Use; // Make a copy for ExtractImmediate to modify. 
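    // For illustration (hypothetical SCEVs): ExtractImmediate splits {A + 4}
    // into Reg == A and Imm == 4, so registers A, A+4, and A+8 all land in the
    // Map bucket keyed by A and become cross-use reuse candidates below.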
4027 int64_t Imm = ExtractImmediate(Reg, SE); 4028 auto Pair = Map.insert(std::make_pair(Reg, ImmMapTy())); 4029 if (Pair.second) 4030 Sequence.push_back(Reg); 4031 Pair.first->second.insert(std::make_pair(Imm, Use)); 4032 UsedByIndicesMap[Reg] |= RegUses.getUsedByIndices(Use); 4033 } 4034 4035 // Now examine each set of registers with the same base value. Build up 4036 // a list of work to do and do the work in a separate step so that we're 4037 // not adding formulae and register counts while we're searching. 4038 SmallVector<WorkItem, 32> WorkItems; 4039 SmallSet<std::pair<size_t, int64_t>, 32> UniqueItems; 4040 for (const SCEV *Reg : Sequence) { 4041 const ImmMapTy &Imms = Map.find(Reg)->second; 4042 4043 // It's not worthwhile looking for reuse if there's only one offset. 4044 if (Imms.size() == 1) 4045 continue; 4046 4047 LLVM_DEBUG(dbgs() << "Generating cross-use offsets for " << *Reg << ':'; 4048 for (const auto &Entry 4049 : Imms) dbgs() 4050 << ' ' << Entry.first; 4051 dbgs() << '\n'); 4052 4053 // Examine each offset. 4054 for (ImmMapTy::const_iterator J = Imms.begin(), JE = Imms.end(); 4055 J != JE; ++J) { 4056 const SCEV *OrigReg = J->second; 4057 4058 int64_t JImm = J->first; 4059 const SmallBitVector &UsedByIndices = RegUses.getUsedByIndices(OrigReg); 4060 4061 if (!isa<SCEVConstant>(OrigReg) && 4062 UsedByIndicesMap[Reg].count() == 1) { 4063 LLVM_DEBUG(dbgs() << "Skipping cross-use reuse for " << *OrigReg 4064 << '\n'); 4065 continue; 4066 } 4067 4068 // Conservatively examine offsets between this orig reg a few selected 4069 // other orig regs. 4070 ImmMapTy::const_iterator OtherImms[] = { 4071 Imms.begin(), std::prev(Imms.end()), 4072 Imms.lower_bound((Imms.begin()->first + std::prev(Imms.end())->first) / 4073 2) 4074 }; 4075 for (size_t i = 0, e = array_lengthof(OtherImms); i != e; ++i) { 4076 ImmMapTy::const_iterator M = OtherImms[i]; 4077 if (M == J || M == JE) continue; 4078 4079 // Compute the difference between the two. 4080 int64_t Imm = (uint64_t)JImm - M->first; 4081 for (unsigned LUIdx : UsedByIndices.set_bits()) 4082 // Make a memo of this use, offset, and register tuple. 4083 if (UniqueItems.insert(std::make_pair(LUIdx, Imm)).second) 4084 WorkItems.push_back(WorkItem(LUIdx, Imm, OrigReg)); 4085 } 4086 } 4087 } 4088 4089 Map.clear(); 4090 Sequence.clear(); 4091 UsedByIndicesMap.clear(); 4092 UniqueItems.clear(); 4093 4094 // Now iterate through the worklist and add new formulae. 4095 for (const WorkItem &WI : WorkItems) { 4096 size_t LUIdx = WI.LUIdx; 4097 LSRUse &LU = Uses[LUIdx]; 4098 int64_t Imm = WI.Imm; 4099 const SCEV *OrigReg = WI.OrigReg; 4100 4101 Type *IntTy = SE.getEffectiveSCEVType(OrigReg->getType()); 4102 const SCEV *NegImmS = SE.getSCEV(ConstantInt::get(IntTy, -(uint64_t)Imm)); 4103 unsigned BitWidth = SE.getTypeSizeInBits(IntTy); 4104 4105 // TODO: Use a more targeted data structure. 4106 for (size_t L = 0, LE = LU.Formulae.size(); L != LE; ++L) { 4107 Formula F = LU.Formulae[L]; 4108 // FIXME: The code for the scaled and unscaled registers looks 4109 // very similar but slightly different. Investigate if they 4110 // could be merged. That way, we would not have to unscale the 4111 // Formula. 4112 F.unscale(); 4113 // Use the immediate in the scaled register. 4114 if (F.ScaledReg == OrigReg) { 4115 int64_t Offset = (uint64_t)F.BaseOffset + Imm * (uint64_t)F.Scale; 4116 // Don't create 50 + reg(-50). 
4117 if (F.referencesReg(SE.getSCEV( 4118 ConstantInt::get(IntTy, -(uint64_t)Offset)))) 4119 continue; 4120 Formula NewF = F; 4121 NewF.BaseOffset = Offset; 4122 if (!isLegalUse(TTI, LU.MinOffset, LU.MaxOffset, LU.Kind, LU.AccessTy, 4123 NewF)) 4124 continue; 4125 NewF.ScaledReg = SE.getAddExpr(NegImmS, NewF.ScaledReg); 4126 4127 // If the new scale is a constant in a register, and adding the constant 4128 // value to the immediate would produce a value closer to zero than the 4129 // immediate itself, then the formula isn't worthwhile. 4130 if (const SCEVConstant *C = dyn_cast<SCEVConstant>(NewF.ScaledReg)) 4131 if (C->getValue()->isNegative() != (NewF.BaseOffset < 0) && 4132 (C->getAPInt().abs() * APInt(BitWidth, F.Scale)) 4133 .ule(std::abs(NewF.BaseOffset))) 4134 continue; 4135 4136 // OK, looks good. 4137 NewF.canonicalize(*this->L); 4138 (void)InsertFormula(LU, LUIdx, NewF); 4139 } else { 4140 // Use the immediate in a base register. 4141 for (size_t N = 0, NE = F.BaseRegs.size(); N != NE; ++N) { 4142 const SCEV *BaseReg = F.BaseRegs[N]; 4143 if (BaseReg != OrigReg) 4144 continue; 4145 Formula NewF = F; 4146 NewF.BaseOffset = (uint64_t)NewF.BaseOffset + Imm; 4147 if (!isLegalUse(TTI, LU.MinOffset, LU.MaxOffset, 4148 LU.Kind, LU.AccessTy, NewF)) { 4149 if (TTI.shouldFavorPostInc() && 4150 mayUsePostIncMode(TTI, LU, OrigReg, this->L, SE)) 4151 continue; 4152 if (!TTI.isLegalAddImmediate((uint64_t)NewF.UnfoldedOffset + Imm)) 4153 continue; 4154 NewF = F; 4155 NewF.UnfoldedOffset = (uint64_t)NewF.UnfoldedOffset + Imm; 4156 } 4157 NewF.BaseRegs[N] = SE.getAddExpr(NegImmS, BaseReg); 4158 4159 // If the new formula has a constant in a register, and adding the 4160 // constant value to the immediate would produce a value closer to 4161 // zero than the immediate itself, then the formula isn't worthwhile. 4162 for (const SCEV *NewReg : NewF.BaseRegs) 4163 if (const SCEVConstant *C = dyn_cast<SCEVConstant>(NewReg)) 4164 if ((C->getAPInt() + NewF.BaseOffset) 4165 .abs() 4166 .slt(std::abs(NewF.BaseOffset)) && 4167 (C->getAPInt() + NewF.BaseOffset).countTrailingZeros() >= 4168 countTrailingZeros<uint64_t>(NewF.BaseOffset)) 4169 goto skip_formula; 4170 4171 // Ok, looks good. 4172 NewF.canonicalize(*this->L); 4173 (void)InsertFormula(LU, LUIdx, NewF); 4174 break; 4175 skip_formula:; 4176 } 4177 } 4178 } 4179 } 4180 } 4181 4182 /// Generate formulae for each use. 4183 void 4184 LSRInstance::GenerateAllReuseFormulae() { 4185 // This is split into multiple loops so that hasRegsUsedByUsesOtherThan 4186 // queries are more precise. 
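  // (For instance, GenerateTruncates runs in the last loop, so its
  // hasRegsUsedByUsesOtherThan check already sees the registers that the
  // earlier generators created for every other use.)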
4187 for (size_t LUIdx = 0, NumUses = Uses.size(); LUIdx != NumUses; ++LUIdx) { 4188 LSRUse &LU = Uses[LUIdx]; 4189 for (size_t i = 0, f = LU.Formulae.size(); i != f; ++i) 4190 GenerateReassociations(LU, LUIdx, LU.Formulae[i]); 4191 for (size_t i = 0, f = LU.Formulae.size(); i != f; ++i) 4192 GenerateCombinations(LU, LUIdx, LU.Formulae[i]); 4193 } 4194 for (size_t LUIdx = 0, NumUses = Uses.size(); LUIdx != NumUses; ++LUIdx) { 4195 LSRUse &LU = Uses[LUIdx]; 4196 for (size_t i = 0, f = LU.Formulae.size(); i != f; ++i) 4197 GenerateSymbolicOffsets(LU, LUIdx, LU.Formulae[i]); 4198 for (size_t i = 0, f = LU.Formulae.size(); i != f; ++i) 4199 GenerateConstantOffsets(LU, LUIdx, LU.Formulae[i]); 4200 for (size_t i = 0, f = LU.Formulae.size(); i != f; ++i) 4201 GenerateICmpZeroScales(LU, LUIdx, LU.Formulae[i]); 4202 for (size_t i = 0, f = LU.Formulae.size(); i != f; ++i) 4203 GenerateScales(LU, LUIdx, LU.Formulae[i]); 4204 } 4205 for (size_t LUIdx = 0, NumUses = Uses.size(); LUIdx != NumUses; ++LUIdx) { 4206 LSRUse &LU = Uses[LUIdx]; 4207 for (size_t i = 0, f = LU.Formulae.size(); i != f; ++i) 4208 GenerateTruncates(LU, LUIdx, LU.Formulae[i]); 4209 } 4210 4211 GenerateCrossUseConstantOffsets(); 4212 4213 LLVM_DEBUG(dbgs() << "\n" 4214 "After generating reuse formulae:\n"; 4215 print_uses(dbgs())); 4216 } 4217 4218 /// If there are multiple formulae with the same set of registers used 4219 /// by other uses, pick the best one and delete the others. 4220 void LSRInstance::FilterOutUndesirableDedicatedRegisters() { 4221 DenseSet<const SCEV *> VisitedRegs; 4222 SmallPtrSet<const SCEV *, 16> Regs; 4223 SmallPtrSet<const SCEV *, 16> LoserRegs; 4224 #ifndef NDEBUG 4225 bool ChangedFormulae = false; 4226 #endif 4227 4228 // Collect the best formula for each unique set of shared registers. This 4229 // is reset for each use. 4230 using BestFormulaeTy = 4231 DenseMap<SmallVector<const SCEV *, 4>, size_t, UniquifierDenseMapInfo>; 4232 4233 BestFormulaeTy BestFormulae; 4234 4235 for (size_t LUIdx = 0, NumUses = Uses.size(); LUIdx != NumUses; ++LUIdx) { 4236 LSRUse &LU = Uses[LUIdx]; 4237 LLVM_DEBUG(dbgs() << "Filtering for use "; LU.print(dbgs()); 4238 dbgs() << '\n'); 4239 4240 bool Any = false; 4241 for (size_t FIdx = 0, NumForms = LU.Formulae.size(); 4242 FIdx != NumForms; ++FIdx) { 4243 Formula &F = LU.Formulae[FIdx]; 4244 4245 // Some formulas are instant losers. For example, they may depend on 4246 // nonexistent AddRecs from other loops. These need to be filtered 4247 // immediately, otherwise heuristics could choose them over others leading 4248 // to an unsatisfactory solution. Passing LoserRegs into RateFormula here 4249 // avoids the need to recompute this information across formulae using the 4250 // same bad AddRec. Passing LoserRegs is also essential unless we remove 4251 // the corresponding bad register from the Regs set. 4252 Cost CostF; 4253 Regs.clear(); 4254 CostF.RateFormula(TTI, F, Regs, VisitedRegs, L, SE, DT, LU, &LoserRegs); 4255 if (CostF.isLoser()) { 4256 // During initial formula generation, undesirable formulae are generated 4257 // by uses within other loops that have some non-trivial address mode or 4258 // use the postinc form of the IV. LSR needs to provide these formulae 4259 // as the basis of rediscovering the desired formula that uses an AddRec 4260 // corresponding to the existing phi. Once all formulae have been 4261 // generated, these initial losers may be pruned. 
4262 LLVM_DEBUG(dbgs() << " Filtering loser "; F.print(dbgs()); 4263 dbgs() << "\n"); 4264 } 4265 else { 4266 SmallVector<const SCEV *, 4> Key; 4267 for (const SCEV *Reg : F.BaseRegs) { 4268 if (RegUses.isRegUsedByUsesOtherThan(Reg, LUIdx)) 4269 Key.push_back(Reg); 4270 } 4271 if (F.ScaledReg && 4272 RegUses.isRegUsedByUsesOtherThan(F.ScaledReg, LUIdx)) 4273 Key.push_back(F.ScaledReg); 4274 // Unstable sort by host order ok, because this is only used for 4275 // uniquifying. 4276 llvm::sort(Key); 4277 4278 std::pair<BestFormulaeTy::const_iterator, bool> P = 4279 BestFormulae.insert(std::make_pair(Key, FIdx)); 4280 if (P.second) 4281 continue; 4282 4283 Formula &Best = LU.Formulae[P.first->second]; 4284 4285 Cost CostBest; 4286 Regs.clear(); 4287 CostBest.RateFormula(TTI, Best, Regs, VisitedRegs, L, SE, DT, LU); 4288 if (CostF.isLess(CostBest, TTI)) 4289 std::swap(F, Best); 4290 LLVM_DEBUG(dbgs() << " Filtering out formula "; F.print(dbgs()); 4291 dbgs() << "\n" 4292 " in favor of formula "; 4293 Best.print(dbgs()); dbgs() << '\n'); 4294 } 4295 #ifndef NDEBUG 4296 ChangedFormulae = true; 4297 #endif 4298 LU.DeleteFormula(F); 4299 --FIdx; 4300 --NumForms; 4301 Any = true; 4302 } 4303 4304 // Now that we've filtered out some formulae, recompute the Regs set. 4305 if (Any) 4306 LU.RecomputeRegs(LUIdx, RegUses); 4307 4308 // Reset this to prepare for the next use. 4309 BestFormulae.clear(); 4310 } 4311 4312 LLVM_DEBUG(if (ChangedFormulae) { 4313 dbgs() << "\n" 4314 "After filtering out undesirable candidates:\n"; 4315 print_uses(dbgs()); 4316 }); 4317 } 4318 4319 /// Estimate the worst-case number of solutions the solver might have to 4320 /// consider. It almost never considers this many solutions because it prune the 4321 /// search space, but the pruning isn't always sufficient. 4322 size_t LSRInstance::EstimateSearchSpaceComplexity() const { 4323 size_t Power = 1; 4324 for (const LSRUse &LU : Uses) { 4325 size_t FSize = LU.Formulae.size(); 4326 if (FSize >= ComplexityLimit) { 4327 Power = ComplexityLimit; 4328 break; 4329 } 4330 Power *= FSize; 4331 if (Power >= ComplexityLimit) 4332 break; 4333 } 4334 return Power; 4335 } 4336 4337 /// When one formula uses a superset of the registers of another formula, it 4338 /// won't help reduce register pressure (though it may not necessarily hurt 4339 /// register pressure); remove it to simplify the system. 4340 void LSRInstance::NarrowSearchSpaceByDetectingSupersets() { 4341 if (EstimateSearchSpaceComplexity() >= ComplexityLimit) { 4342 LLVM_DEBUG(dbgs() << "The search space is too complex.\n"); 4343 4344 LLVM_DEBUG(dbgs() << "Narrowing the search space by eliminating formulae " 4345 "which use a superset of registers used by other " 4346 "formulae.\n"); 4347 4348 for (size_t LUIdx = 0, NumUses = Uses.size(); LUIdx != NumUses; ++LUIdx) { 4349 LSRUse &LU = Uses[LUIdx]; 4350 bool Any = false; 4351 for (size_t i = 0, e = LU.Formulae.size(); i != e; ++i) { 4352 Formula &F = LU.Formulae[i]; 4353 // Look for a formula with a constant or GV in a register. If the use 4354 // also has a formula with that same value in an immediate field, 4355 // delete the one that uses a register. 
4356 for (SmallVectorImpl<const SCEV *>::const_iterator 4357 I = F.BaseRegs.begin(), E = F.BaseRegs.end(); I != E; ++I) { 4358 if (const SCEVConstant *C = dyn_cast<SCEVConstant>(*I)) { 4359 Formula NewF = F; 4360 NewF.BaseOffset += C->getValue()->getSExtValue(); 4361 NewF.BaseRegs.erase(NewF.BaseRegs.begin() + 4362 (I - F.BaseRegs.begin())); 4363 if (LU.HasFormulaWithSameRegs(NewF)) { 4364 LLVM_DEBUG(dbgs() << " Deleting "; F.print(dbgs()); 4365 dbgs() << '\n'); 4366 LU.DeleteFormula(F); 4367 --i; 4368 --e; 4369 Any = true; 4370 break; 4371 } 4372 } else if (const SCEVUnknown *U = dyn_cast<SCEVUnknown>(*I)) { 4373 if (GlobalValue *GV = dyn_cast<GlobalValue>(U->getValue())) 4374 if (!F.BaseGV) { 4375 Formula NewF = F; 4376 NewF.BaseGV = GV; 4377 NewF.BaseRegs.erase(NewF.BaseRegs.begin() + 4378 (I - F.BaseRegs.begin())); 4379 if (LU.HasFormulaWithSameRegs(NewF)) { 4380 LLVM_DEBUG(dbgs() << " Deleting "; F.print(dbgs()); 4381 dbgs() << '\n'); 4382 LU.DeleteFormula(F); 4383 --i; 4384 --e; 4385 Any = true; 4386 break; 4387 } 4388 } 4389 } 4390 } 4391 } 4392 if (Any) 4393 LU.RecomputeRegs(LUIdx, RegUses); 4394 } 4395 4396 LLVM_DEBUG(dbgs() << "After pre-selection:\n"; print_uses(dbgs())); 4397 } 4398 } 4399 4400 /// When there are many registers for expressions like A, A+1, A+2, etc., 4401 /// allocate a single register for them. 4402 void LSRInstance::NarrowSearchSpaceByCollapsingUnrolledCode() { 4403 if (EstimateSearchSpaceComplexity() < ComplexityLimit) 4404 return; 4405 4406 LLVM_DEBUG( 4407 dbgs() << "The search space is too complex.\n" 4408 "Narrowing the search space by assuming that uses separated " 4409 "by a constant offset will use the same registers.\n"); 4410 4411 // This is especially useful for unrolled loops. 4412 4413 for (size_t LUIdx = 0, NumUses = Uses.size(); LUIdx != NumUses; ++LUIdx) { 4414 LSRUse &LU = Uses[LUIdx]; 4415 for (const Formula &F : LU.Formulae) { 4416 if (F.BaseOffset == 0 || (F.Scale != 0 && F.Scale != 1)) 4417 continue; 4418 4419 LSRUse *LUThatHas = FindUseWithSimilarFormula(F, LU); 4420 if (!LUThatHas) 4421 continue; 4422 4423 if (!reconcileNewOffset(*LUThatHas, F.BaseOffset, /*HasBaseReg=*/ false, 4424 LU.Kind, LU.AccessTy)) 4425 continue; 4426 4427 LLVM_DEBUG(dbgs() << " Deleting use "; LU.print(dbgs()); dbgs() << '\n'); 4428 4429 LUThatHas->AllFixupsOutsideLoop &= LU.AllFixupsOutsideLoop; 4430 4431 // Transfer the fixups of LU to LUThatHas. 4432 for (LSRFixup &Fixup : LU.Fixups) { 4433 Fixup.Offset += F.BaseOffset; 4434 LUThatHas->pushFixup(Fixup); 4435 LLVM_DEBUG(dbgs() << "New fixup has offset " << Fixup.Offset << '\n'); 4436 } 4437 4438 // Delete formulae from the new use which are no longer legal. 4439 bool Any = false; 4440 for (size_t i = 0, e = LUThatHas->Formulae.size(); i != e; ++i) { 4441 Formula &F = LUThatHas->Formulae[i]; 4442 if (!isLegalUse(TTI, LUThatHas->MinOffset, LUThatHas->MaxOffset, 4443 LUThatHas->Kind, LUThatHas->AccessTy, F)) { 4444 LLVM_DEBUG(dbgs() << " Deleting "; F.print(dbgs()); dbgs() << '\n'); 4445 LUThatHas->DeleteFormula(F); 4446 --i; 4447 --e; 4448 Any = true; 4449 } 4450 } 4451 4452 if (Any) 4453 LUThatHas->RecomputeRegs(LUThatHas - &Uses.front(), RegUses); 4454 4455 // Delete the old use. 
      DeleteUse(LU, LUIdx);
      --LUIdx;
      --NumUses;
      break;
    }
  }

  LLVM_DEBUG(dbgs() << "After pre-selection:\n"; print_uses(dbgs()));
}

/// Call FilterOutUndesirableDedicatedRegisters again, if necessary, now that
/// we've done more filtering, as it may be able to find more formulae to
/// eliminate.
void LSRInstance::NarrowSearchSpaceByRefilteringUndesirableDedicatedRegisters(){
  if (EstimateSearchSpaceComplexity() >= ComplexityLimit) {
    LLVM_DEBUG(dbgs() << "The search space is too complex.\n");

    LLVM_DEBUG(dbgs() << "Narrowing the search space by re-filtering out "
                         "undesirable dedicated registers.\n");

    FilterOutUndesirableDedicatedRegisters();

    LLVM_DEBUG(dbgs() << "After pre-selection:\n"; print_uses(dbgs()));
  }
}

/// If an LSRUse has multiple formulae with the same ScaledReg and Scale, pick
/// the best one and delete the others.
/// This narrowing heuristic keeps as many formulae with different Scale and
/// ScaledReg pairs as possible while narrowing the search space.
/// The benefit is that a formulae set with more Scale and ScaledReg variations
/// is more likely to yield a better solution than one in which every formula
/// has the same Scale and ScaledReg. The winner-picking-register heuristic
/// tends to keep the formulae with the same Scale and ScaledReg and filter the
/// others, and we want to avoid that if possible.
void LSRInstance::NarrowSearchSpaceByFilterFormulaWithSameScaledReg() {
  if (EstimateSearchSpaceComplexity() < ComplexityLimit)
    return;

  LLVM_DEBUG(
      dbgs() << "The search space is too complex.\n"
                "Narrowing the search space by choosing the best Formula "
                "from the Formulae with the same Scale and ScaledReg.\n");

  // Map the "Scale * ScaledReg" pair to the best formula of the current LSRUse.
  using BestFormulaeTy = DenseMap<std::pair<const SCEV *, int64_t>, size_t>;

  BestFormulaeTy BestFormulae;
#ifndef NDEBUG
  bool ChangedFormulae = false;
#endif
  DenseSet<const SCEV *> VisitedRegs;
  SmallPtrSet<const SCEV *, 16> Regs;

  for (size_t LUIdx = 0, NumUses = Uses.size(); LUIdx != NumUses; ++LUIdx) {
    LSRUse &LU = Uses[LUIdx];
    LLVM_DEBUG(dbgs() << "Filtering for use "; LU.print(dbgs());
               dbgs() << '\n');

    // Return true if Formula FA is better than Formula FB.
    auto IsBetterThan = [&](Formula &FA, Formula &FB) {
      // First we will try to choose the Formula with fewer new registers.
      // For a register used by the current Formula, the more the register is
      // shared among LSRUses, the less we increase the register number
      // counter of the formula.
      size_t FARegNum = 0;
      for (const SCEV *Reg : FA.BaseRegs) {
        const SmallBitVector &UsedByIndices = RegUses.getUsedByIndices(Reg);
        FARegNum += (NumUses - UsedByIndices.count() + 1);
      }
      size_t FBRegNum = 0;
      for (const SCEV *Reg : FB.BaseRegs) {
        const SmallBitVector &UsedByIndices = RegUses.getUsedByIndices(Reg);
        FBRegNum += (NumUses - UsedByIndices.count() + 1);
      }
      if (FARegNum != FBRegNum)
        return FARegNum < FBRegNum;

      // If the new register numbers are the same, choose the Formula with
      // the lower Cost.
      Cost CostFA, CostFB;
      Regs.clear();
      CostFA.RateFormula(TTI, FA, Regs, VisitedRegs, L, SE, DT, LU);
      Regs.clear();
      CostFB.RateFormula(TTI, FB, Regs, VisitedRegs, L, SE, DT, LU);
      return CostFA.isLess(CostFB, TTI);
    };

    bool Any = false;
    for (size_t FIdx = 0, NumForms = LU.Formulae.size(); FIdx != NumForms;
         ++FIdx) {
      Formula &F = LU.Formulae[FIdx];
      if (!F.ScaledReg)
        continue;
      auto P = BestFormulae.insert({{F.ScaledReg, F.Scale}, FIdx});
      if (P.second)
        continue;

      Formula &Best = LU.Formulae[P.first->second];
      if (IsBetterThan(F, Best))
        std::swap(F, Best);
      LLVM_DEBUG(dbgs() << " Filtering out formula "; F.print(dbgs());
                 dbgs() << "\n"
                           " in favor of formula ";
                 Best.print(dbgs()); dbgs() << '\n');
#ifndef NDEBUG
      ChangedFormulae = true;
#endif
      LU.DeleteFormula(F);
      --FIdx;
      --NumForms;
      Any = true;
    }
    if (Any)
      LU.RecomputeRegs(LUIdx, RegUses);

    // Reset this to prepare for the next use.
    BestFormulae.clear();
  }

  LLVM_DEBUG(if (ChangedFormulae) {
    dbgs() << "\n"
              "After filtering out undesirable candidates:\n";
    print_uses(dbgs());
  });
}

/// Delete formulas with a high expected number of registers.
/// Assuming we don't yet know which formula will be selected for each use (all
/// clearly inefficient ones have already been deleted), compute for each
/// register the probability of it not being selected.
/// For example,
/// Use1:
///  reg(a) + reg({0,+,1})
///  reg(a) + reg({-1,+,1}) + 1
///  reg({a,+,1})
/// Use2:
///  reg(b) + reg({0,+,1})
///  reg(b) + reg({-1,+,1}) + 1
///  reg({b,+,1})
/// Use3:
///  reg(c) + reg(b) + reg({0,+,1})
///  reg(c) + reg({b,+,1})
///
/// Probability of not selecting
///                 Use1    Use2    Use3
/// reg(a)         (1/3) *   1   *   1
/// reg(b)           1   * (1/3) * (1/2)
/// reg({0,+,1})   (2/3) * (2/3) * (1/2)
/// reg({-1,+,1})  (2/3) * (2/3) *   1
/// reg({a,+,1})   (2/3) *   1   *   1
/// reg({b,+,1})     1   * (2/3) * (2/3)
/// reg(c)           1   *   1   *   0
///
/// Now compute the expected number of registers for each formula.
/// Note that for each use we exclude the probability of not selecting for that
/// use. For example, for Use1 the contribution of reg(a) is just 1 * 1
/// (excluding the probability 1/3 of not selecting for Use1).
/// Use1:
///  reg(a) + reg({0,+,1})          1 + 1/3       -- to be deleted
///  reg(a) + reg({-1,+,1}) + 1     1 + 4/9       -- to be deleted
///  reg({a,+,1})                   1
/// Use2:
///  reg(b) + reg({0,+,1})          1/2 + 1/3     -- to be deleted
///  reg(b) + reg({-1,+,1}) + 1     1/2 + 2/3     -- to be deleted
///  reg({b,+,1})                   2/3
/// Use3:
///  reg(c) + reg(b) + reg({0,+,1}) 1 + 1/3 + 4/9 -- to be deleted
///  reg(c) + reg({b,+,1})          1 + 2/3
void LSRInstance::NarrowSearchSpaceByDeletingCostlyFormulas() {
  if (EstimateSearchSpaceComplexity() < ComplexityLimit)
    return;
  // Ok, we have too many formulae on our hands to conveniently handle.
  // Use a rough heuristic to thin out the list.

  // Set of Regs which will certainly be used in the final solution, i.e.,
  // registers that appear in every formula of some use (in the example above
  // this is reg(c)). We can skip them in the calculations.
4634 SmallPtrSet<const SCEV *, 4> UniqRegs; 4635 LLVM_DEBUG(dbgs() << "The search space is too complex.\n"); 4636 4637 // Map each register to probability of not selecting 4638 DenseMap <const SCEV *, float> RegNumMap; 4639 for (const SCEV *Reg : RegUses) { 4640 if (UniqRegs.count(Reg)) 4641 continue; 4642 float PNotSel = 1; 4643 for (const LSRUse &LU : Uses) { 4644 if (!LU.Regs.count(Reg)) 4645 continue; 4646 float P = LU.getNotSelectedProbability(Reg); 4647 if (P != 0.0) 4648 PNotSel *= P; 4649 else 4650 UniqRegs.insert(Reg); 4651 } 4652 RegNumMap.insert(std::make_pair(Reg, PNotSel)); 4653 } 4654 4655 LLVM_DEBUG( 4656 dbgs() << "Narrowing the search space by deleting costly formulas\n"); 4657 4658 // Delete formulas where registers number expectation is high. 4659 for (size_t LUIdx = 0, NumUses = Uses.size(); LUIdx != NumUses; ++LUIdx) { 4660 LSRUse &LU = Uses[LUIdx]; 4661 // If nothing to delete - continue. 4662 if (LU.Formulae.size() < 2) 4663 continue; 4664 // This is temporary solution to test performance. Float should be 4665 // replaced with round independent type (based on integers) to avoid 4666 // different results for different target builds. 4667 float FMinRegNum = LU.Formulae[0].getNumRegs(); 4668 float FMinARegNum = LU.Formulae[0].getNumRegs(); 4669 size_t MinIdx = 0; 4670 for (size_t i = 0, e = LU.Formulae.size(); i != e; ++i) { 4671 Formula &F = LU.Formulae[i]; 4672 float FRegNum = 0; 4673 float FARegNum = 0; 4674 for (const SCEV *BaseReg : F.BaseRegs) { 4675 if (UniqRegs.count(BaseReg)) 4676 continue; 4677 FRegNum += RegNumMap[BaseReg] / LU.getNotSelectedProbability(BaseReg); 4678 if (isa<SCEVAddRecExpr>(BaseReg)) 4679 FARegNum += 4680 RegNumMap[BaseReg] / LU.getNotSelectedProbability(BaseReg); 4681 } 4682 if (const SCEV *ScaledReg = F.ScaledReg) { 4683 if (!UniqRegs.count(ScaledReg)) { 4684 FRegNum += 4685 RegNumMap[ScaledReg] / LU.getNotSelectedProbability(ScaledReg); 4686 if (isa<SCEVAddRecExpr>(ScaledReg)) 4687 FARegNum += 4688 RegNumMap[ScaledReg] / LU.getNotSelectedProbability(ScaledReg); 4689 } 4690 } 4691 if (FMinRegNum > FRegNum || 4692 (FMinRegNum == FRegNum && FMinARegNum > FARegNum)) { 4693 FMinRegNum = FRegNum; 4694 FMinARegNum = FARegNum; 4695 MinIdx = i; 4696 } 4697 } 4698 LLVM_DEBUG(dbgs() << " The formula "; LU.Formulae[MinIdx].print(dbgs()); 4699 dbgs() << " with min reg num " << FMinRegNum << '\n'); 4700 if (MinIdx != 0) 4701 std::swap(LU.Formulae[MinIdx], LU.Formulae[0]); 4702 while (LU.Formulae.size() != 1) { 4703 LLVM_DEBUG(dbgs() << " Deleting "; LU.Formulae.back().print(dbgs()); 4704 dbgs() << '\n'); 4705 LU.Formulae.pop_back(); 4706 } 4707 LU.RecomputeRegs(LUIdx, RegUses); 4708 assert(LU.Formulae.size() == 1 && "Should be exactly 1 min regs formula"); 4709 Formula &F = LU.Formulae[0]; 4710 LLVM_DEBUG(dbgs() << " Leaving only "; F.print(dbgs()); dbgs() << '\n'); 4711 // When we choose the formula, the regs become unique. 4712 UniqRegs.insert(F.BaseRegs.begin(), F.BaseRegs.end()); 4713 if (F.ScaledReg) 4714 UniqRegs.insert(F.ScaledReg); 4715 } 4716 LLVM_DEBUG(dbgs() << "After pre-selection:\n"; print_uses(dbgs())); 4717 } 4718 4719 /// Pick a register which seems likely to be profitable, and then in any use 4720 /// which has any reference to that register, delete all formulae which do not 4721 /// reference that register. 4722 void LSRInstance::NarrowSearchSpaceByPickingWinnerRegs() { 4723 // With all other options exhausted, loop until the system is simple 4724 // enough to handle. 
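  // For example (illustrative numbers): if reg({0,+,1}) is referenced by five
  // of six uses and no untaken register is referenced by more, it is picked
  // first, and every formula in those five uses that does not mention it is
  // deleted.  Repeating this quickly collapses the search space, at the cost
  // of committing to the most-shared registers up front.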
4725 SmallPtrSet<const SCEV *, 4> Taken; 4726 while (EstimateSearchSpaceComplexity() >= ComplexityLimit) { 4727 // Ok, we have too many of formulae on our hands to conveniently handle. 4728 // Use a rough heuristic to thin out the list. 4729 LLVM_DEBUG(dbgs() << "The search space is too complex.\n"); 4730 4731 // Pick the register which is used by the most LSRUses, which is likely 4732 // to be a good reuse register candidate. 4733 const SCEV *Best = nullptr; 4734 unsigned BestNum = 0; 4735 for (const SCEV *Reg : RegUses) { 4736 if (Taken.count(Reg)) 4737 continue; 4738 if (!Best) { 4739 Best = Reg; 4740 BestNum = RegUses.getUsedByIndices(Reg).count(); 4741 } else { 4742 unsigned Count = RegUses.getUsedByIndices(Reg).count(); 4743 if (Count > BestNum) { 4744 Best = Reg; 4745 BestNum = Count; 4746 } 4747 } 4748 } 4749 4750 LLVM_DEBUG(dbgs() << "Narrowing the search space by assuming " << *Best 4751 << " will yield profitable reuse.\n"); 4752 Taken.insert(Best); 4753 4754 // In any use with formulae which references this register, delete formulae 4755 // which don't reference it. 4756 for (size_t LUIdx = 0, NumUses = Uses.size(); LUIdx != NumUses; ++LUIdx) { 4757 LSRUse &LU = Uses[LUIdx]; 4758 if (!LU.Regs.count(Best)) continue; 4759 4760 bool Any = false; 4761 for (size_t i = 0, e = LU.Formulae.size(); i != e; ++i) { 4762 Formula &F = LU.Formulae[i]; 4763 if (!F.referencesReg(Best)) { 4764 LLVM_DEBUG(dbgs() << " Deleting "; F.print(dbgs()); dbgs() << '\n'); 4765 LU.DeleteFormula(F); 4766 --e; 4767 --i; 4768 Any = true; 4769 assert(e != 0 && "Use has no formulae left! Is Regs inconsistent?"); 4770 continue; 4771 } 4772 } 4773 4774 if (Any) 4775 LU.RecomputeRegs(LUIdx, RegUses); 4776 } 4777 4778 LLVM_DEBUG(dbgs() << "After pre-selection:\n"; print_uses(dbgs())); 4779 } 4780 } 4781 4782 /// If there are an extraordinary number of formulae to choose from, use some 4783 /// rough heuristics to prune down the number of formulae. This keeps the main 4784 /// solver from taking an extraordinary amount of time in some worst-case 4785 /// scenarios. 4786 void LSRInstance::NarrowSearchSpaceUsingHeuristics() { 4787 NarrowSearchSpaceByDetectingSupersets(); 4788 NarrowSearchSpaceByCollapsingUnrolledCode(); 4789 NarrowSearchSpaceByRefilteringUndesirableDedicatedRegisters(); 4790 if (FilterSameScaledReg) 4791 NarrowSearchSpaceByFilterFormulaWithSameScaledReg(); 4792 if (LSRExpNarrow) 4793 NarrowSearchSpaceByDeletingCostlyFormulas(); 4794 else 4795 NarrowSearchSpaceByPickingWinnerRegs(); 4796 } 4797 4798 /// This is the recursive solver. 4799 void LSRInstance::SolveRecurse(SmallVectorImpl<const Formula *> &Solution, 4800 Cost &SolutionCost, 4801 SmallVectorImpl<const Formula *> &Workspace, 4802 const Cost &CurCost, 4803 const SmallPtrSet<const SCEV *, 16> &CurRegs, 4804 DenseSet<const SCEV *> &VisitedRegs) const { 4805 // Some ideas: 4806 // - prune more: 4807 // - use more aggressive filtering 4808 // - sort the formula so that the most profitable solutions are found first 4809 // - sort the uses too 4810 // - search faster: 4811 // - don't compute a cost, and then compare. compare while computing a cost 4812 // and bail early. 4813 // - track register sets with SmallBitVector 4814 4815 const LSRUse &LU = Uses[Workspace.size()]; 4816 4817 // If this use references any register that's already a part of the 4818 // in-progress solution, consider it a requirement that a formula must 4819 // reference that register in order to be considered. This prunes out 4820 // unprofitable searching. 
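  // The recursion below is a depth-first search choosing one formula per use,
  // pruned in two ways: formulae that do not cover every required register
  // are skipped, and partial solutions already costlier than the best known
  // complete solution are abandoned.  In the worst case this is still
  // exponential in the number of uses, which is why
  // NarrowSearchSpaceUsingHeuristics tries to keep the estimated complexity
  // below ComplexityLimit beforehand.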
4821 SmallSetVector<const SCEV *, 4> ReqRegs; 4822 for (const SCEV *S : CurRegs) 4823 if (LU.Regs.count(S)) 4824 ReqRegs.insert(S); 4825 4826 SmallPtrSet<const SCEV *, 16> NewRegs; 4827 Cost NewCost; 4828 for (const Formula &F : LU.Formulae) { 4829 // Ignore formulae which may not be ideal in terms of register reuse of 4830 // ReqRegs. The formula should use all required registers before 4831 // introducing new ones. 4832 int NumReqRegsToFind = std::min(F.getNumRegs(), ReqRegs.size()); 4833 for (const SCEV *Reg : ReqRegs) { 4834 if ((F.ScaledReg && F.ScaledReg == Reg) || 4835 is_contained(F.BaseRegs, Reg)) { 4836 --NumReqRegsToFind; 4837 if (NumReqRegsToFind == 0) 4838 break; 4839 } 4840 } 4841 if (NumReqRegsToFind != 0) { 4842 // If none of the formulae satisfied the required registers, then we could 4843 // clear ReqRegs and try again. Currently, we simply give up in this case. 4844 continue; 4845 } 4846 4847 // Evaluate the cost of the current formula. If it's already worse than 4848 // the current best, prune the search at that point. 4849 NewCost = CurCost; 4850 NewRegs = CurRegs; 4851 NewCost.RateFormula(TTI, F, NewRegs, VisitedRegs, L, SE, DT, LU); 4852 if (NewCost.isLess(SolutionCost, TTI)) { 4853 Workspace.push_back(&F); 4854 if (Workspace.size() != Uses.size()) { 4855 SolveRecurse(Solution, SolutionCost, Workspace, NewCost, 4856 NewRegs, VisitedRegs); 4857 if (F.getNumRegs() == 1 && Workspace.size() == 1) 4858 VisitedRegs.insert(F.ScaledReg ? F.ScaledReg : F.BaseRegs[0]); 4859 } else { 4860 LLVM_DEBUG(dbgs() << "New best at "; NewCost.print(dbgs()); 4861 dbgs() << ".\n Regs:"; for (const SCEV *S 4862 : NewRegs) dbgs() 4863 << ' ' << *S; 4864 dbgs() << '\n'); 4865 4866 SolutionCost = NewCost; 4867 Solution = Workspace; 4868 } 4869 Workspace.pop_back(); 4870 } 4871 } 4872 } 4873 4874 /// Choose one formula from each use. Return the results in the given Solution 4875 /// vector. 4876 void LSRInstance::Solve(SmallVectorImpl<const Formula *> &Solution) const { 4877 SmallVector<const Formula *, 8> Workspace; 4878 Cost SolutionCost; 4879 SolutionCost.Lose(); 4880 Cost CurCost; 4881 SmallPtrSet<const SCEV *, 16> CurRegs; 4882 DenseSet<const SCEV *> VisitedRegs; 4883 Workspace.reserve(Uses.size()); 4884 4885 // SolveRecurse does all the work. 4886 SolveRecurse(Solution, SolutionCost, Workspace, CurCost, 4887 CurRegs, VisitedRegs); 4888 if (Solution.empty()) { 4889 LLVM_DEBUG(dbgs() << "\nNo Satisfactory Solution\n"); 4890 return; 4891 } 4892 4893 // Ok, we've now made all our decisions. 4894 LLVM_DEBUG(dbgs() << "\n" 4895 "The chosen solution requires "; 4896 SolutionCost.print(dbgs()); dbgs() << ":\n"; 4897 for (size_t i = 0, e = Uses.size(); i != e; ++i) { 4898 dbgs() << " "; 4899 Uses[i].print(dbgs()); 4900 dbgs() << "\n" 4901 " "; 4902 Solution[i]->print(dbgs()); 4903 dbgs() << '\n'; 4904 }); 4905 4906 assert(Solution.size() == Uses.size() && "Malformed solution!"); 4907 } 4908 4909 /// Helper for AdjustInsertPositionForExpand. Climb up the dominator tree far as 4910 /// we can go while still being dominated by the input positions. This helps 4911 /// canonicalize the insert position, which encourages sharing. 
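/// For example, two fixups in different arms of a diamond whose inputs all
/// dominate the diamond's entry block will typically both have their insert
/// positions hoisted to (or above) that entry block, letting a later
/// expansion reuse the instructions emitted for an earlier one instead of
/// duplicating them.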
4912 BasicBlock::iterator 4913 LSRInstance::HoistInsertPosition(BasicBlock::iterator IP, 4914 const SmallVectorImpl<Instruction *> &Inputs) 4915 const { 4916 Instruction *Tentative = &*IP; 4917 while (true) { 4918 bool AllDominate = true; 4919 Instruction *BetterPos = nullptr; 4920 // Don't bother attempting to insert before a catchswitch, their basic block 4921 // cannot have other non-PHI instructions. 4922 if (isa<CatchSwitchInst>(Tentative)) 4923 return IP; 4924 4925 for (Instruction *Inst : Inputs) { 4926 if (Inst == Tentative || !DT.dominates(Inst, Tentative)) { 4927 AllDominate = false; 4928 break; 4929 } 4930 // Attempt to find an insert position in the middle of the block, 4931 // instead of at the end, so that it can be used for other expansions. 4932 if (Tentative->getParent() == Inst->getParent() && 4933 (!BetterPos || !DT.dominates(Inst, BetterPos))) 4934 BetterPos = &*std::next(BasicBlock::iterator(Inst)); 4935 } 4936 if (!AllDominate) 4937 break; 4938 if (BetterPos) 4939 IP = BetterPos->getIterator(); 4940 else 4941 IP = Tentative->getIterator(); 4942 4943 const Loop *IPLoop = LI.getLoopFor(IP->getParent()); 4944 unsigned IPLoopDepth = IPLoop ? IPLoop->getLoopDepth() : 0; 4945 4946 BasicBlock *IDom; 4947 for (DomTreeNode *Rung = DT.getNode(IP->getParent()); ; ) { 4948 if (!Rung) return IP; 4949 Rung = Rung->getIDom(); 4950 if (!Rung) return IP; 4951 IDom = Rung->getBlock(); 4952 4953 // Don't climb into a loop though. 4954 const Loop *IDomLoop = LI.getLoopFor(IDom); 4955 unsigned IDomDepth = IDomLoop ? IDomLoop->getLoopDepth() : 0; 4956 if (IDomDepth <= IPLoopDepth && 4957 (IDomDepth != IPLoopDepth || IDomLoop == IPLoop)) 4958 break; 4959 } 4960 4961 Tentative = IDom->getTerminator(); 4962 } 4963 4964 return IP; 4965 } 4966 4967 /// Determine an input position which will be dominated by the operands and 4968 /// which will dominate the result. 4969 BasicBlock::iterator 4970 LSRInstance::AdjustInsertPositionForExpand(BasicBlock::iterator LowestIP, 4971 const LSRFixup &LF, 4972 const LSRUse &LU, 4973 SCEVExpander &Rewriter) const { 4974 // Collect some instructions which must be dominated by the 4975 // expanding replacement. These must be dominated by any operands that 4976 // will be required in the expansion. 4977 SmallVector<Instruction *, 4> Inputs; 4978 if (Instruction *I = dyn_cast<Instruction>(LF.OperandValToReplace)) 4979 Inputs.push_back(I); 4980 if (LU.Kind == LSRUse::ICmpZero) 4981 if (Instruction *I = 4982 dyn_cast<Instruction>(cast<ICmpInst>(LF.UserInst)->getOperand(1))) 4983 Inputs.push_back(I); 4984 if (LF.PostIncLoops.count(L)) { 4985 if (LF.isUseFullyOutsideLoop(L)) 4986 Inputs.push_back(L->getLoopLatch()->getTerminator()); 4987 else 4988 Inputs.push_back(IVIncInsertPos); 4989 } 4990 // The expansion must also be dominated by the increment positions of any 4991 // loops it for which it is using post-inc mode. 4992 for (const Loop *PIL : LF.PostIncLoops) { 4993 if (PIL == L) continue; 4994 4995 // Be dominated by the loop exit. 
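      // Concretely, the anchor added below is the terminator of the nearest
      // common dominator of all of PIL's exiting blocks, which the expansion
      // must be dominated by so that PIL's post-increment value is available.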
4996 SmallVector<BasicBlock *, 4> ExitingBlocks; 4997 PIL->getExitingBlocks(ExitingBlocks); 4998 if (!ExitingBlocks.empty()) { 4999 BasicBlock *BB = ExitingBlocks[0]; 5000 for (unsigned i = 1, e = ExitingBlocks.size(); i != e; ++i) 5001 BB = DT.findNearestCommonDominator(BB, ExitingBlocks[i]); 5002 Inputs.push_back(BB->getTerminator()); 5003 } 5004 } 5005 5006 assert(!isa<PHINode>(LowestIP) && !LowestIP->isEHPad() 5007 && !isa<DbgInfoIntrinsic>(LowestIP) && 5008 "Insertion point must be a normal instruction"); 5009 5010 // Then, climb up the immediate dominator tree as far as we can go while 5011 // still being dominated by the input positions. 5012 BasicBlock::iterator IP = HoistInsertPosition(LowestIP, Inputs); 5013 5014 // Don't insert instructions before PHI nodes. 5015 while (isa<PHINode>(IP)) ++IP; 5016 5017 // Ignore landingpad instructions. 5018 while (IP->isEHPad()) ++IP; 5019 5020 // Ignore debug intrinsics. 5021 while (isa<DbgInfoIntrinsic>(IP)) ++IP; 5022 5023 // Set IP below instructions recently inserted by SCEVExpander. This keeps the 5024 // IP consistent across expansions and allows the previously inserted 5025 // instructions to be reused by subsequent expansion. 5026 while (Rewriter.isInsertedInstruction(&*IP) && IP != LowestIP) 5027 ++IP; 5028 5029 return IP; 5030 } 5031 5032 /// Emit instructions for the leading candidate expression for this LSRUse (this 5033 /// is called "expanding"). 5034 Value *LSRInstance::Expand(const LSRUse &LU, const LSRFixup &LF, 5035 const Formula &F, BasicBlock::iterator IP, 5036 SCEVExpander &Rewriter, 5037 SmallVectorImpl<WeakTrackingVH> &DeadInsts) const { 5038 if (LU.RigidFormula) 5039 return LF.OperandValToReplace; 5040 5041 // Determine an input position which will be dominated by the operands and 5042 // which will dominate the result. 5043 IP = AdjustInsertPositionForExpand(IP, LF, LU, Rewriter); 5044 Rewriter.setInsertPoint(&*IP); 5045 5046 // Inform the Rewriter if we have a post-increment use, so that it can 5047 // perform an advantageous expansion. 5048 Rewriter.setPostInc(LF.PostIncLoops); 5049 5050 // This is the type that the user actually needs. 5051 Type *OpTy = LF.OperandValToReplace->getType(); 5052 // This will be the type that we'll initially expand to. 5053 Type *Ty = F.getType(); 5054 if (!Ty) 5055 // No type known; just expand directly to the ultimate type. 5056 Ty = OpTy; 5057 else if (SE.getEffectiveSCEVType(Ty) == SE.getEffectiveSCEVType(OpTy)) 5058 // Expand directly to the ultimate type if it's the right size. 5059 Ty = OpTy; 5060 // This is the type to do integer arithmetic in. 5061 Type *IntTy = SE.getEffectiveSCEVType(Ty); 5062 5063 // Build up a list of operands to add together to form the full base. 5064 SmallVector<const SCEV *, 8> Ops; 5065 5066 // Expand the BaseRegs portion. 5067 for (const SCEV *Reg : F.BaseRegs) { 5068 assert(!Reg->isZero() && "Zero allocated in a base register!"); 5069 5070 // If we're expanding for a post-inc user, make the post-inc adjustment. 5071 Reg = denormalizeForPostIncUse(Reg, LF.PostIncLoops, SE); 5072 Ops.push_back(SE.getUnknown(Rewriter.expandCodeFor(Reg, nullptr))); 5073 } 5074 5075 // Expand the ScaledReg portion. 5076 Value *ICmpScaledV = nullptr; 5077 if (F.Scale != 0) { 5078 const SCEV *ScaledS = F.ScaledReg; 5079 5080 // If we're expanding for a post-inc user, make the post-inc adjustment. 
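    // (Denormalization rewrites an addrec such as {A,+,Step}<%PIL> into
    // {A+Step,+,Step}<%PIL> for each loop in PostIncLoops, so the expanded
    // value corresponds to the register after its increment rather than
    // before it.)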
5081 PostIncLoopSet &Loops = const_cast<PostIncLoopSet &>(LF.PostIncLoops); 5082 ScaledS = denormalizeForPostIncUse(ScaledS, Loops, SE); 5083 5084 if (LU.Kind == LSRUse::ICmpZero) { 5085 // Expand ScaleReg as if it was part of the base regs. 5086 if (F.Scale == 1) 5087 Ops.push_back( 5088 SE.getUnknown(Rewriter.expandCodeFor(ScaledS, nullptr))); 5089 else { 5090 // An interesting way of "folding" with an icmp is to use a negated 5091 // scale, which we'll implement by inserting it into the other operand 5092 // of the icmp. 5093 assert(F.Scale == -1 && 5094 "The only scale supported by ICmpZero uses is -1!"); 5095 ICmpScaledV = Rewriter.expandCodeFor(ScaledS, nullptr); 5096 } 5097 } else { 5098 // Otherwise just expand the scaled register and an explicit scale, 5099 // which is expected to be matched as part of the address. 5100 5101 // Flush the operand list to suppress SCEVExpander hoisting address modes. 5102 // Unless the addressing mode will not be folded. 5103 if (!Ops.empty() && LU.Kind == LSRUse::Address && 5104 isAMCompletelyFolded(TTI, LU, F)) { 5105 Value *FullV = Rewriter.expandCodeFor(SE.getAddExpr(Ops), nullptr); 5106 Ops.clear(); 5107 Ops.push_back(SE.getUnknown(FullV)); 5108 } 5109 ScaledS = SE.getUnknown(Rewriter.expandCodeFor(ScaledS, nullptr)); 5110 if (F.Scale != 1) 5111 ScaledS = 5112 SE.getMulExpr(ScaledS, SE.getConstant(ScaledS->getType(), F.Scale)); 5113 Ops.push_back(ScaledS); 5114 } 5115 } 5116 5117 // Expand the GV portion. 5118 if (F.BaseGV) { 5119 // Flush the operand list to suppress SCEVExpander hoisting. 5120 if (!Ops.empty()) { 5121 Value *FullV = Rewriter.expandCodeFor(SE.getAddExpr(Ops), Ty); 5122 Ops.clear(); 5123 Ops.push_back(SE.getUnknown(FullV)); 5124 } 5125 Ops.push_back(SE.getUnknown(F.BaseGV)); 5126 } 5127 5128 // Flush the operand list to suppress SCEVExpander hoisting of both folded and 5129 // unfolded offsets. LSR assumes they both live next to their uses. 5130 if (!Ops.empty()) { 5131 Value *FullV = Rewriter.expandCodeFor(SE.getAddExpr(Ops), Ty); 5132 Ops.clear(); 5133 Ops.push_back(SE.getUnknown(FullV)); 5134 } 5135 5136 // Expand the immediate portion. 5137 int64_t Offset = (uint64_t)F.BaseOffset + LF.Offset; 5138 if (Offset != 0) { 5139 if (LU.Kind == LSRUse::ICmpZero) { 5140 // The other interesting way of "folding" with an ICmpZero is to use a 5141 // negated immediate. 5142 if (!ICmpScaledV) 5143 ICmpScaledV = ConstantInt::get(IntTy, -(uint64_t)Offset); 5144 else { 5145 Ops.push_back(SE.getUnknown(ICmpScaledV)); 5146 ICmpScaledV = ConstantInt::get(IntTy, Offset); 5147 } 5148 } else { 5149 // Just add the immediate values. These again are expected to be matched 5150 // as part of the address. 5151 Ops.push_back(SE.getUnknown(ConstantInt::getSigned(IntTy, Offset))); 5152 } 5153 } 5154 5155 // Expand the unfolded offset portion. 5156 int64_t UnfoldedOffset = F.UnfoldedOffset; 5157 if (UnfoldedOffset != 0) { 5158 // Just add the immediate values. 5159 Ops.push_back(SE.getUnknown(ConstantInt::getSigned(IntTy, 5160 UnfoldedOffset))); 5161 } 5162 5163 // Emit instructions summing all the operands. 5164 const SCEV *FullS = Ops.empty() ? 5165 SE.getConstant(IntTy, 0) : 5166 SE.getAddExpr(Ops); 5167 Value *FullV = Rewriter.expandCodeFor(FullS, Ty); 5168 5169 // We're done expanding now, so reset the rewriter. 5170 Rewriter.clearPostInc(); 5171 5172 // An ICmpZero Formula represents an ICmp which we're handling as a 5173 // comparison against zero. Now that we've expanded an expression for that 5174 // form, update the ICmp's other operand. 
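  // Roughly: rather than materializing the full subtraction and comparing it
  // with zero, the expanded remainder stays as operand 0 of the icmp while
  // the part that carried a -1 scale (or the negated immediate offset) is
  // moved to operand 1, so `icmp eq (X - Y), 0` effectively becomes
  // `icmp eq X, Y`.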
5175 if (LU.Kind == LSRUse::ICmpZero) { 5176 ICmpInst *CI = cast<ICmpInst>(LF.UserInst); 5177 DeadInsts.emplace_back(CI->getOperand(1)); 5178 assert(!F.BaseGV && "ICmp does not support folding a global value and " 5179 "a scale at the same time!"); 5180 if (F.Scale == -1) { 5181 if (ICmpScaledV->getType() != OpTy) { 5182 Instruction *Cast = 5183 CastInst::Create(CastInst::getCastOpcode(ICmpScaledV, false, 5184 OpTy, false), 5185 ICmpScaledV, OpTy, "tmp", CI); 5186 ICmpScaledV = Cast; 5187 } 5188 CI->setOperand(1, ICmpScaledV); 5189 } else { 5190 // A scale of 1 means that the scale has been expanded as part of the 5191 // base regs. 5192 assert((F.Scale == 0 || F.Scale == 1) && 5193 "ICmp does not support folding a global value and " 5194 "a scale at the same time!"); 5195 Constant *C = ConstantInt::getSigned(SE.getEffectiveSCEVType(OpTy), 5196 -(uint64_t)Offset); 5197 if (C->getType() != OpTy) 5198 C = ConstantExpr::getCast(CastInst::getCastOpcode(C, false, 5199 OpTy, false), 5200 C, OpTy); 5201 5202 CI->setOperand(1, C); 5203 } 5204 } 5205 5206 return FullV; 5207 } 5208 5209 /// Helper for Rewrite. PHI nodes are special because the use of their operands 5210 /// effectively happens in their predecessor blocks, so the expression may need 5211 /// to be expanded in multiple places. 5212 void LSRInstance::RewriteForPHI( 5213 PHINode *PN, const LSRUse &LU, const LSRFixup &LF, const Formula &F, 5214 SCEVExpander &Rewriter, SmallVectorImpl<WeakTrackingVH> &DeadInsts) const { 5215 DenseMap<BasicBlock *, Value *> Inserted; 5216 for (unsigned i = 0, e = PN->getNumIncomingValues(); i != e; ++i) 5217 if (PN->getIncomingValue(i) == LF.OperandValToReplace) { 5218 BasicBlock *BB = PN->getIncomingBlock(i); 5219 5220 // If this is a critical edge, split the edge so that we do not insert 5221 // the code on all predecessor/successor paths. We do this unless this 5222 // is the canonical backedge for this loop, which complicates post-inc 5223 // users. 5224 if (e != 1 && BB->getTerminator()->getNumSuccessors() > 1 && 5225 !isa<IndirectBrInst>(BB->getTerminator()) && 5226 !isa<CatchSwitchInst>(BB->getTerminator())) { 5227 BasicBlock *Parent = PN->getParent(); 5228 Loop *PNLoop = LI.getLoopFor(Parent); 5229 if (!PNLoop || Parent != PNLoop->getHeader()) { 5230 // Split the critical edge. 5231 BasicBlock *NewBB = nullptr; 5232 if (!Parent->isLandingPad()) { 5233 NewBB = SplitCriticalEdge(BB, Parent, 5234 CriticalEdgeSplittingOptions(&DT, &LI) 5235 .setMergeIdenticalEdges() 5236 .setDontDeleteUselessPHIs()); 5237 } else { 5238 SmallVector<BasicBlock*, 2> NewBBs; 5239 SplitLandingPadPredecessors(Parent, BB, "", "", NewBBs, &DT, &LI); 5240 NewBB = NewBBs[0]; 5241 } 5242 // If NewBB==NULL, then SplitCriticalEdge refused to split because all 5243 // phi predecessors are identical. The simple thing to do is skip 5244 // splitting in this case rather than complicate the API. 5245 if (NewBB) { 5246 // If PN is outside of the loop and BB is in the loop, we want to 5247 // move the block to be immediately before the PHI block, not 5248 // immediately after BB. 5249 if (L->contains(BB) && !L->contains(PN)) 5250 NewBB->moveBefore(PN->getParent()); 5251 5252 // Splitting the edge can reduce the number of PHI entries we have. 
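            // (SplitCriticalEdge was invoked with setMergeIdenticalEdges(),
            // so duplicate edges from BB may have been folded into the new
            // block; recompute the bounds before continuing.)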
5253 e = PN->getNumIncomingValues(); 5254 BB = NewBB; 5255 i = PN->getBasicBlockIndex(BB); 5256 } 5257 } 5258 } 5259 5260 std::pair<DenseMap<BasicBlock *, Value *>::iterator, bool> Pair = 5261 Inserted.insert(std::make_pair(BB, static_cast<Value *>(nullptr))); 5262 if (!Pair.second) 5263 PN->setIncomingValue(i, Pair.first->second); 5264 else { 5265 Value *FullV = Expand(LU, LF, F, BB->getTerminator()->getIterator(), 5266 Rewriter, DeadInsts); 5267 5268 // If this is reuse-by-noop-cast, insert the noop cast. 5269 Type *OpTy = LF.OperandValToReplace->getType(); 5270 if (FullV->getType() != OpTy) 5271 FullV = 5272 CastInst::Create(CastInst::getCastOpcode(FullV, false, 5273 OpTy, false), 5274 FullV, LF.OperandValToReplace->getType(), 5275 "tmp", BB->getTerminator()); 5276 5277 PN->setIncomingValue(i, FullV); 5278 Pair.first->second = FullV; 5279 } 5280 } 5281 } 5282 5283 /// Emit instructions for the leading candidate expression for this LSRUse (this 5284 /// is called "expanding"), and update the UserInst to reference the newly 5285 /// expanded value. 5286 void LSRInstance::Rewrite(const LSRUse &LU, const LSRFixup &LF, 5287 const Formula &F, SCEVExpander &Rewriter, 5288 SmallVectorImpl<WeakTrackingVH> &DeadInsts) const { 5289 // First, find an insertion point that dominates UserInst. For PHI nodes, 5290 // find the nearest block which dominates all the relevant uses. 5291 if (PHINode *PN = dyn_cast<PHINode>(LF.UserInst)) { 5292 RewriteForPHI(PN, LU, LF, F, Rewriter, DeadInsts); 5293 } else { 5294 Value *FullV = 5295 Expand(LU, LF, F, LF.UserInst->getIterator(), Rewriter, DeadInsts); 5296 5297 // If this is reuse-by-noop-cast, insert the noop cast. 5298 Type *OpTy = LF.OperandValToReplace->getType(); 5299 if (FullV->getType() != OpTy) { 5300 Instruction *Cast = 5301 CastInst::Create(CastInst::getCastOpcode(FullV, false, OpTy, false), 5302 FullV, OpTy, "tmp", LF.UserInst); 5303 FullV = Cast; 5304 } 5305 5306 // Update the user. ICmpZero is handled specially here (for now) because 5307 // Expand may have updated one of the operands of the icmp already, and 5308 // its new value may happen to be equal to LF.OperandValToReplace, in 5309 // which case doing replaceUsesOfWith leads to replacing both operands 5310 // with the same value. TODO: Reorganize this. 5311 if (LU.Kind == LSRUse::ICmpZero) 5312 LF.UserInst->setOperand(0, FullV); 5313 else 5314 LF.UserInst->replaceUsesOfWith(LF.OperandValToReplace, FullV); 5315 } 5316 5317 DeadInsts.emplace_back(LF.OperandValToReplace); 5318 } 5319 5320 /// Rewrite all the fixup locations with new values, following the chosen 5321 /// solution. 5322 void LSRInstance::ImplementSolution( 5323 const SmallVectorImpl<const Formula *> &Solution) { 5324 // Keep track of instructions we may have made dead, so that 5325 // we can remove them after we are done working. 5326 SmallVector<WeakTrackingVH, 16> DeadInsts; 5327 5328 SCEVExpander Rewriter(SE, L->getHeader()->getModule()->getDataLayout(), 5329 "lsr"); 5330 #ifndef NDEBUG 5331 Rewriter.setDebugType(DEBUG_TYPE); 5332 #endif 5333 Rewriter.disableCanonicalMode(); 5334 Rewriter.enableLSRMode(); 5335 Rewriter.setIVIncInsertPos(L, IVIncInsertPos); 5336 5337 // Mark phi nodes that terminate chains so the expander tries to reuse them. 5338 for (const IVChain &Chain : IVChainVec) { 5339 if (PHINode *PN = dyn_cast<PHINode>(Chain.tailUserInst())) 5340 Rewriter.setChainedPhi(PN); 5341 } 5342 5343 // Expand the new value definitions and update the users. 
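  // Each use was assigned exactly one formula by Solve(); every fixup of that
  // use is rewritten with it, and then any IV chains are materialized.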
5344 for (size_t LUIdx = 0, NumUses = Uses.size(); LUIdx != NumUses; ++LUIdx) 5345 for (const LSRFixup &Fixup : Uses[LUIdx].Fixups) { 5346 Rewrite(Uses[LUIdx], Fixup, *Solution[LUIdx], Rewriter, DeadInsts); 5347 Changed = true; 5348 } 5349 5350 for (const IVChain &Chain : IVChainVec) { 5351 GenerateIVChain(Chain, Rewriter, DeadInsts); 5352 Changed = true; 5353 } 5354 // Clean up after ourselves. This must be done before deleting any 5355 // instructions. 5356 Rewriter.clear(); 5357 5358 Changed |= DeleteTriviallyDeadInstructions(DeadInsts); 5359 } 5360 5361 LSRInstance::LSRInstance(Loop *L, IVUsers &IU, ScalarEvolution &SE, 5362 DominatorTree &DT, LoopInfo &LI, 5363 const TargetTransformInfo &TTI) 5364 : IU(IU), SE(SE), DT(DT), LI(LI), TTI(TTI), L(L) { 5365 // If LoopSimplify form is not available, stay out of trouble. 5366 if (!L->isLoopSimplifyForm()) 5367 return; 5368 5369 // If there's no interesting work to be done, bail early. 5370 if (IU.empty()) return; 5371 5372 // If there's too much analysis to be done, bail early. We won't be able to 5373 // model the problem anyway. 5374 unsigned NumUsers = 0; 5375 for (const IVStrideUse &U : IU) { 5376 if (++NumUsers > MaxIVUsers) { 5377 (void)U; 5378 LLVM_DEBUG(dbgs() << "LSR skipping loop, too many IV Users in " << U 5379 << "\n"); 5380 return; 5381 } 5382 // Bail out if we have a PHI on an EHPad that gets a value from a 5383 // CatchSwitchInst. Because the CatchSwitchInst cannot be split, there is 5384 // no good place to stick any instructions. 5385 if (auto *PN = dyn_cast<PHINode>(U.getUser())) { 5386 auto *FirstNonPHI = PN->getParent()->getFirstNonPHI(); 5387 if (isa<FuncletPadInst>(FirstNonPHI) || 5388 isa<CatchSwitchInst>(FirstNonPHI)) 5389 for (BasicBlock *PredBB : PN->blocks()) 5390 if (isa<CatchSwitchInst>(PredBB->getFirstNonPHI())) 5391 return; 5392 } 5393 } 5394 5395 #ifndef NDEBUG 5396 // All dominating loops must have preheaders, or SCEVExpander may not be able 5397 // to materialize an AddRecExpr whose Start is an outer AddRecExpr. 5398 // 5399 // IVUsers analysis should only create users that are dominated by simple loop 5400 // headers. Since this loop should dominate all of its users, its user list 5401 // should be empty if this loop itself is not within a simple loop nest. 5402 for (DomTreeNode *Rung = DT.getNode(L->getLoopPreheader()); 5403 Rung; Rung = Rung->getIDom()) { 5404 BasicBlock *BB = Rung->getBlock(); 5405 const Loop *DomLoop = LI.getLoopFor(BB); 5406 if (DomLoop && DomLoop->getHeader() == BB) { 5407 assert(DomLoop->getLoopPreheader() && "LSR needs a simplified loop nest"); 5408 } 5409 } 5410 #endif // DEBUG 5411 5412 LLVM_DEBUG(dbgs() << "\nLSR on loop "; 5413 L->getHeader()->printAsOperand(dbgs(), /*PrintType=*/false); 5414 dbgs() << ":\n"); 5415 5416 // First, perform some low-level loop optimizations. 5417 OptimizeShadowIV(); 5418 OptimizeLoopTermCond(); 5419 5420 // If loop preparation eliminates all interesting IV users, bail. 5421 if (IU.empty()) return; 5422 5423 // Skip nested loops until we can model them better with formulae. 5424 if (!L->empty()) { 5425 LLVM_DEBUG(dbgs() << "LSR skipping outer loop " << *L << "\n"); 5426 return; 5427 } 5428 5429 // Start collecting data and preparing for the solver. 
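  // From here the pass proceeds in phases: collect IV chains, fixups, and
  // initial formulae; generate additional candidate formulae by reuse;
  // filter and narrow the search space; run the solver; and finally rewrite
  // the IR according to the chosen solution.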
5430 CollectChains(); 5431 CollectInterestingTypesAndFactors(); 5432 CollectFixupsAndInitialFormulae(); 5433 CollectLoopInvariantFixupsAndFormulae(); 5434 5435 if (Uses.empty()) 5436 return; 5437 5438 LLVM_DEBUG(dbgs() << "LSR found " << Uses.size() << " uses:\n"; 5439 print_uses(dbgs())); 5440 5441 // Now use the reuse data to generate a bunch of interesting ways 5442 // to formulate the values needed for the uses. 5443 GenerateAllReuseFormulae(); 5444 5445 FilterOutUndesirableDedicatedRegisters(); 5446 NarrowSearchSpaceUsingHeuristics(); 5447 5448 SmallVector<const Formula *, 8> Solution; 5449 Solve(Solution); 5450 5451 // Release memory that is no longer needed. 5452 Factors.clear(); 5453 Types.clear(); 5454 RegUses.clear(); 5455 5456 if (Solution.empty()) 5457 return; 5458 5459 #ifndef NDEBUG 5460 // Formulae should be legal. 5461 for (const LSRUse &LU : Uses) { 5462 for (const Formula &F : LU.Formulae) 5463 assert(isLegalUse(TTI, LU.MinOffset, LU.MaxOffset, LU.Kind, LU.AccessTy, 5464 F) && "Illegal formula generated!"); 5465 }; 5466 #endif 5467 5468 // Now that we've decided what we want, make it so. 5469 ImplementSolution(Solution); 5470 } 5471 5472 #if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP) 5473 void LSRInstance::print_factors_and_types(raw_ostream &OS) const { 5474 if (Factors.empty() && Types.empty()) return; 5475 5476 OS << "LSR has identified the following interesting factors and types: "; 5477 bool First = true; 5478 5479 for (int64_t Factor : Factors) { 5480 if (!First) OS << ", "; 5481 First = false; 5482 OS << '*' << Factor; 5483 } 5484 5485 for (Type *Ty : Types) { 5486 if (!First) OS << ", "; 5487 First = false; 5488 OS << '(' << *Ty << ')'; 5489 } 5490 OS << '\n'; 5491 } 5492 5493 void LSRInstance::print_fixups(raw_ostream &OS) const { 5494 OS << "LSR is examining the following fixup sites:\n"; 5495 for (const LSRUse &LU : Uses) 5496 for (const LSRFixup &LF : LU.Fixups) { 5497 dbgs() << " "; 5498 LF.print(OS); 5499 OS << '\n'; 5500 } 5501 } 5502 5503 void LSRInstance::print_uses(raw_ostream &OS) const { 5504 OS << "LSR is examining the following uses:\n"; 5505 for (const LSRUse &LU : Uses) { 5506 dbgs() << " "; 5507 LU.print(OS); 5508 OS << '\n'; 5509 for (const Formula &F : LU.Formulae) { 5510 OS << " "; 5511 F.print(OS); 5512 OS << '\n'; 5513 } 5514 } 5515 } 5516 5517 void LSRInstance::print(raw_ostream &OS) const { 5518 print_factors_and_types(OS); 5519 print_fixups(OS); 5520 print_uses(OS); 5521 } 5522 5523 LLVM_DUMP_METHOD void LSRInstance::dump() const { 5524 print(errs()); errs() << '\n'; 5525 } 5526 #endif 5527 5528 namespace { 5529 5530 class LoopStrengthReduce : public LoopPass { 5531 public: 5532 static char ID; // Pass ID, replacement for typeid 5533 5534 LoopStrengthReduce(); 5535 5536 private: 5537 bool runOnLoop(Loop *L, LPPassManager &LPM) override; 5538 void getAnalysisUsage(AnalysisUsage &AU) const override; 5539 }; 5540 5541 } // end anonymous namespace 5542 5543 LoopStrengthReduce::LoopStrengthReduce() : LoopPass(ID) { 5544 initializeLoopStrengthReducePass(*PassRegistry::getPassRegistry()); 5545 } 5546 5547 void LoopStrengthReduce::getAnalysisUsage(AnalysisUsage &AU) const { 5548 // We split critical edges, so we change the CFG. However, we do update 5549 // many analyses if they are around. 
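  // (The edge splitting in RewriteForPHI is performed through utilities that
  // are handed the DominatorTree and LoopInfo, so those analyses are kept up
  // to date and can be declared preserved here.)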
5550 AU.addPreservedID(LoopSimplifyID); 5551 5552 AU.addRequired<LoopInfoWrapperPass>(); 5553 AU.addPreserved<LoopInfoWrapperPass>(); 5554 AU.addRequiredID(LoopSimplifyID); 5555 AU.addRequired<DominatorTreeWrapperPass>(); 5556 AU.addPreserved<DominatorTreeWrapperPass>(); 5557 AU.addRequired<ScalarEvolutionWrapperPass>(); 5558 AU.addPreserved<ScalarEvolutionWrapperPass>(); 5559 // Requiring LoopSimplify a second time here prevents IVUsers from running 5560 // twice, since LoopSimplify was invalidated by running ScalarEvolution. 5561 AU.addRequiredID(LoopSimplifyID); 5562 AU.addRequired<IVUsersWrapperPass>(); 5563 AU.addPreserved<IVUsersWrapperPass>(); 5564 AU.addRequired<TargetTransformInfoWrapperPass>(); 5565 } 5566 5567 static bool ReduceLoopStrength(Loop *L, IVUsers &IU, ScalarEvolution &SE, 5568 DominatorTree &DT, LoopInfo &LI, 5569 const TargetTransformInfo &TTI) { 5570 bool Changed = false; 5571 5572 // Run the main LSR transformation. 5573 Changed |= LSRInstance(L, IU, SE, DT, LI, TTI).getChanged(); 5574 5575 // Remove any extra phis created by processing inner loops. 5576 Changed |= DeleteDeadPHIs(L->getHeader()); 5577 if (EnablePhiElim && L->isLoopSimplifyForm()) { 5578 SmallVector<WeakTrackingVH, 16> DeadInsts; 5579 const DataLayout &DL = L->getHeader()->getModule()->getDataLayout(); 5580 SCEVExpander Rewriter(SE, DL, "lsr"); 5581 #ifndef NDEBUG 5582 Rewriter.setDebugType(DEBUG_TYPE); 5583 #endif 5584 unsigned numFolded = Rewriter.replaceCongruentIVs(L, &DT, DeadInsts, &TTI); 5585 if (numFolded) { 5586 Changed = true; 5587 DeleteTriviallyDeadInstructions(DeadInsts); 5588 DeleteDeadPHIs(L->getHeader()); 5589 } 5590 } 5591 return Changed; 5592 } 5593 5594 bool LoopStrengthReduce::runOnLoop(Loop *L, LPPassManager & /*LPM*/) { 5595 if (skipLoop(L)) 5596 return false; 5597 5598 auto &IU = getAnalysis<IVUsersWrapperPass>().getIU(); 5599 auto &SE = getAnalysis<ScalarEvolutionWrapperPass>().getSE(); 5600 auto &DT = getAnalysis<DominatorTreeWrapperPass>().getDomTree(); 5601 auto &LI = getAnalysis<LoopInfoWrapperPass>().getLoopInfo(); 5602 const auto &TTI = getAnalysis<TargetTransformInfoWrapperPass>().getTTI( 5603 *L->getHeader()->getParent()); 5604 return ReduceLoopStrength(L, IU, SE, DT, LI, TTI); 5605 } 5606 5607 PreservedAnalyses LoopStrengthReducePass::run(Loop &L, LoopAnalysisManager &AM, 5608 LoopStandardAnalysisResults &AR, 5609 LPMUpdater &) { 5610 if (!ReduceLoopStrength(&L, AM.getResult<IVUsersAnalysis>(L, AR), AR.SE, 5611 AR.DT, AR.LI, AR.TTI)) 5612 return PreservedAnalyses::all(); 5613 5614 return getLoopPassPreservedAnalyses(); 5615 } 5616 5617 char LoopStrengthReduce::ID = 0; 5618 5619 INITIALIZE_PASS_BEGIN(LoopStrengthReduce, "loop-reduce", 5620 "Loop Strength Reduction", false, false) 5621 INITIALIZE_PASS_DEPENDENCY(TargetTransformInfoWrapperPass) 5622 INITIALIZE_PASS_DEPENDENCY(DominatorTreeWrapperPass) 5623 INITIALIZE_PASS_DEPENDENCY(ScalarEvolutionWrapperPass) 5624 INITIALIZE_PASS_DEPENDENCY(IVUsersWrapperPass) 5625 INITIALIZE_PASS_DEPENDENCY(LoopInfoWrapperPass) 5626 INITIALIZE_PASS_DEPENDENCY(LoopSimplify) 5627 INITIALIZE_PASS_END(LoopStrengthReduce, "loop-reduce", 5628 "Loop Strength Reduction", false, false) 5629 5630 Pass *llvm::createLoopStrengthReducePass() { return new LoopStrengthReduce(); } 5631
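// For quick experimentation the pass can be run in isolation with opt; the
// exact spelling depends on which pass manager the opt binary defaults to,
// e.g.:
//
//   opt -loop-reduce -S input.ll                  (legacy pass manager)
//   opt -passes='loop(loop-reduce)' -S input.ll   (new pass manager)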