1 //===-- LoopIdiomRecognize.cpp - Loop idiom recognition -------------------===// 2 // 3 // The LLVM Compiler Infrastructure 4 // 5 // This file is distributed under the University of Illinois Open Source 6 // License. See LICENSE.TXT for details. 7 // 8 //===----------------------------------------------------------------------===// 9 // 10 // This pass implements an idiom recognizer that transforms simple loops into a 11 // non-loop form. In cases that this kicks in, it can be a significant 12 // performance win. 13 // 14 //===----------------------------------------------------------------------===// 15 // 16 // TODO List: 17 // 18 // Future loop memory idioms to recognize: 19 // memcmp, memmove, strlen, etc. 20 // Future floating point idioms to recognize in -ffast-math mode: 21 // fpowi 22 // Future integer operation idioms to recognize: 23 // ctpop, ctlz, cttz 24 // 25 // Beware that isel's default lowering for ctpop is highly inefficient for 26 // i64 and larger types when i64 is legal and the value has few bits set. It 27 // would be good to enhance isel to emit a loop for ctpop in this case. 28 // 29 // This could recognize common matrix multiplies and dot product idioms and 30 // replace them with calls to BLAS (if linked in??). 31 // 32 //===----------------------------------------------------------------------===// 33 34 #include "llvm/Transforms/Scalar.h" 35 #include "llvm/ADT/MapVector.h" 36 #include "llvm/ADT/SetVector.h" 37 #include "llvm/ADT/Statistic.h" 38 #include "llvm/Analysis/AliasAnalysis.h" 39 #include "llvm/Analysis/BasicAliasAnalysis.h" 40 #include "llvm/Analysis/GlobalsModRef.h" 41 #include "llvm/Analysis/LoopPass.h" 42 #include "llvm/Analysis/LoopAccessAnalysis.h" 43 #include "llvm/Analysis/ScalarEvolutionAliasAnalysis.h" 44 #include "llvm/Analysis/ScalarEvolutionExpander.h" 45 #include "llvm/Analysis/ScalarEvolutionExpressions.h" 46 #include "llvm/Analysis/TargetLibraryInfo.h" 47 #include "llvm/Analysis/TargetTransformInfo.h" 48 #include "llvm/Analysis/ValueTracking.h" 49 #include "llvm/IR/DataLayout.h" 50 #include "llvm/IR/Dominators.h" 51 #include "llvm/IR/IRBuilder.h" 52 #include "llvm/IR/IntrinsicInst.h" 53 #include "llvm/IR/Module.h" 54 #include "llvm/Support/Debug.h" 55 #include "llvm/Support/raw_ostream.h" 56 #include "llvm/Transforms/Utils/Local.h" 57 using namespace llvm; 58 59 #define DEBUG_TYPE "loop-idiom" 60 61 STATISTIC(NumMemSet, "Number of memset's formed from loop stores"); 62 STATISTIC(NumMemCpy, "Number of memcpy's formed from loop load+stores"); 63 64 namespace { 65 66 class LoopIdiomRecognize : public LoopPass { 67 Loop *CurLoop; 68 AliasAnalysis *AA; 69 DominatorTree *DT; 70 LoopInfo *LI; 71 ScalarEvolution *SE; 72 TargetLibraryInfo *TLI; 73 const TargetTransformInfo *TTI; 74 const DataLayout *DL; 75 76 public: 77 static char ID; 78 explicit LoopIdiomRecognize() : LoopPass(ID) { 79 initializeLoopIdiomRecognizePass(*PassRegistry::getPassRegistry()); 80 } 81 82 bool runOnLoop(Loop *L, LPPassManager &LPM) override; 83 84 /// This transformation requires natural loop information & requires that 85 /// loop preheaders be inserted into the CFG. 86 /// 87 void getAnalysisUsage(AnalysisUsage &AU) const override { 88 AU.addRequired<LoopInfoWrapperPass>(); 89 AU.addPreserved<LoopInfoWrapperPass>(); 90 AU.addRequiredID(LoopSimplifyID); 91 AU.addPreservedID(LoopSimplifyID); 92 AU.addRequiredID(LCSSAID); 93 AU.addPreservedID(LCSSAID); 94 AU.addRequired<AAResultsWrapperPass>(); 95 AU.addPreserved<AAResultsWrapperPass>(); 96 AU.addRequired<ScalarEvolutionWrapperPass>(); 97 AU.addPreserved<ScalarEvolutionWrapperPass>(); 98 AU.addPreserved<SCEVAAWrapperPass>(); 99 AU.addRequired<DominatorTreeWrapperPass>(); 100 AU.addPreserved<DominatorTreeWrapperPass>(); 101 AU.addRequired<TargetLibraryInfoWrapperPass>(); 102 AU.addRequired<TargetTransformInfoWrapperPass>(); 103 AU.addPreserved<BasicAAWrapperPass>(); 104 AU.addPreserved<GlobalsAAWrapperPass>(); 105 } 106 107 private: 108 typedef SmallVector<StoreInst *, 8> StoreList; 109 typedef MapVector<Value *, StoreList> StoreListMap; 110 StoreListMap StoreRefsForMemset; 111 StoreListMap StoreRefsForMemsetPattern; 112 StoreList StoreRefsForMemcpy; 113 bool HasMemset; 114 bool HasMemsetPattern; 115 bool HasMemcpy; 116 117 /// \name Countable Loop Idiom Handling 118 /// @{ 119 120 bool runOnCountableLoop(); 121 bool runOnLoopBlock(BasicBlock *BB, const SCEV *BECount, 122 SmallVectorImpl<BasicBlock *> &ExitBlocks); 123 124 void collectStores(BasicBlock *BB); 125 bool isLegalStore(StoreInst *SI, bool &ForMemset, bool &ForMemsetPattern, 126 bool &ForMemcpy); 127 bool processLoopStores(SmallVectorImpl<StoreInst *> &SL, const SCEV *BECount, 128 bool ForMemset); 129 bool processLoopMemSet(MemSetInst *MSI, const SCEV *BECount); 130 131 bool processLoopStridedStore(Value *DestPtr, unsigned StoreSize, 132 unsigned StoreAlignment, Value *StoredVal, 133 Instruction *TheStore, 134 SmallPtrSetImpl<Instruction *> &Stores, 135 const SCEVAddRecExpr *Ev, const SCEV *BECount, 136 bool NegStride); 137 bool processLoopStoreOfLoopLoad(StoreInst *SI, const SCEV *BECount); 138 139 /// @} 140 /// \name Noncountable Loop Idiom Handling 141 /// @{ 142 143 bool runOnNoncountableLoop(); 144 145 bool recognizePopcount(); 146 void transformLoopToPopcount(BasicBlock *PreCondBB, Instruction *CntInst, 147 PHINode *CntPhi, Value *Var); 148 149 /// @} 150 }; 151 152 } // End anonymous namespace. 153 154 char LoopIdiomRecognize::ID = 0; 155 INITIALIZE_PASS_BEGIN(LoopIdiomRecognize, "loop-idiom", "Recognize loop idioms", 156 false, false) 157 INITIALIZE_PASS_DEPENDENCY(LoopInfoWrapperPass) 158 INITIALIZE_PASS_DEPENDENCY(DominatorTreeWrapperPass) 159 INITIALIZE_PASS_DEPENDENCY(LoopSimplify) 160 INITIALIZE_PASS_DEPENDENCY(LCSSA) 161 INITIALIZE_PASS_DEPENDENCY(ScalarEvolutionWrapperPass) 162 INITIALIZE_PASS_DEPENDENCY(TargetLibraryInfoWrapperPass) 163 INITIALIZE_PASS_DEPENDENCY(BasicAAWrapperPass) 164 INITIALIZE_PASS_DEPENDENCY(AAResultsWrapperPass) 165 INITIALIZE_PASS_DEPENDENCY(GlobalsAAWrapperPass) 166 INITIALIZE_PASS_DEPENDENCY(SCEVAAWrapperPass) 167 INITIALIZE_PASS_DEPENDENCY(TargetTransformInfoWrapperPass) 168 INITIALIZE_PASS_END(LoopIdiomRecognize, "loop-idiom", "Recognize loop idioms", 169 false, false) 170 171 Pass *llvm::createLoopIdiomPass() { return new LoopIdiomRecognize(); } 172 173 /// deleteDeadInstruction - Delete this instruction. Before we do, go through 174 /// and zero out all the operands of this instruction. If any of them become 175 /// dead, delete them and the computation tree that feeds them. 176 /// 177 static void deleteDeadInstruction(Instruction *I, 178 const TargetLibraryInfo *TLI) { 179 SmallVector<Value *, 16> Operands(I->value_op_begin(), I->value_op_end()); 180 I->replaceAllUsesWith(UndefValue::get(I->getType())); 181 I->eraseFromParent(); 182 for (Value *Op : Operands) 183 RecursivelyDeleteTriviallyDeadInstructions(Op, TLI); 184 } 185 186 //===----------------------------------------------------------------------===// 187 // 188 // Implementation of LoopIdiomRecognize 189 // 190 //===----------------------------------------------------------------------===// 191 192 bool LoopIdiomRecognize::runOnLoop(Loop *L, LPPassManager &LPM) { 193 if (skipOptnoneFunction(L)) 194 return false; 195 196 CurLoop = L; 197 // If the loop could not be converted to canonical form, it must have an 198 // indirectbr in it, just give up. 199 if (!L->getLoopPreheader()) 200 return false; 201 202 // Disable loop idiom recognition if the function's name is a common idiom. 203 StringRef Name = L->getHeader()->getParent()->getName(); 204 if (Name == "memset" || Name == "memcpy") 205 return false; 206 207 AA = &getAnalysis<AAResultsWrapperPass>().getAAResults(); 208 DT = &getAnalysis<DominatorTreeWrapperPass>().getDomTree(); 209 LI = &getAnalysis<LoopInfoWrapperPass>().getLoopInfo(); 210 SE = &getAnalysis<ScalarEvolutionWrapperPass>().getSE(); 211 TLI = &getAnalysis<TargetLibraryInfoWrapperPass>().getTLI(); 212 TTI = &getAnalysis<TargetTransformInfoWrapperPass>().getTTI( 213 *CurLoop->getHeader()->getParent()); 214 DL = &CurLoop->getHeader()->getModule()->getDataLayout(); 215 216 HasMemset = TLI->has(LibFunc::memset); 217 HasMemsetPattern = TLI->has(LibFunc::memset_pattern16); 218 HasMemcpy = TLI->has(LibFunc::memcpy); 219 220 if (HasMemset || HasMemsetPattern || HasMemcpy) 221 if (SE->hasLoopInvariantBackedgeTakenCount(L)) 222 return runOnCountableLoop(); 223 224 return runOnNoncountableLoop(); 225 } 226 227 bool LoopIdiomRecognize::runOnCountableLoop() { 228 const SCEV *BECount = SE->getBackedgeTakenCount(CurLoop); 229 assert(!isa<SCEVCouldNotCompute>(BECount) && 230 "runOnCountableLoop() called on a loop without a predictable" 231 "backedge-taken count"); 232 233 // If this loop executes exactly one time, then it should be peeled, not 234 // optimized by this pass. 235 if (const SCEVConstant *BECst = dyn_cast<SCEVConstant>(BECount)) 236 if (BECst->getAPInt() == 0) 237 return false; 238 239 SmallVector<BasicBlock *, 8> ExitBlocks; 240 CurLoop->getUniqueExitBlocks(ExitBlocks); 241 242 DEBUG(dbgs() << "loop-idiom Scanning: F[" 243 << CurLoop->getHeader()->getParent()->getName() << "] Loop %" 244 << CurLoop->getHeader()->getName() << "\n"); 245 246 bool MadeChange = false; 247 // Scan all the blocks in the loop that are not in subloops. 248 for (auto *BB : CurLoop->getBlocks()) { 249 // Ignore blocks in subloops. 250 if (LI->getLoopFor(BB) != CurLoop) 251 continue; 252 253 MadeChange |= runOnLoopBlock(BB, BECount, ExitBlocks); 254 } 255 return MadeChange; 256 } 257 258 static unsigned getStoreSizeInBytes(StoreInst *SI, const DataLayout *DL) { 259 uint64_t SizeInBits = DL->getTypeSizeInBits(SI->getValueOperand()->getType()); 260 assert(((SizeInBits & 7) || (SizeInBits >> 32) == 0) && 261 "Don't overflow unsigned."); 262 return (unsigned)SizeInBits >> 3; 263 } 264 265 static unsigned getStoreStride(const SCEVAddRecExpr *StoreEv) { 266 const SCEVConstant *ConstStride = cast<SCEVConstant>(StoreEv->getOperand(1)); 267 return ConstStride->getAPInt().getZExtValue(); 268 } 269 270 /// getMemSetPatternValue - If a strided store of the specified value is safe to 271 /// turn into a memset_pattern16, return a ConstantArray of 16 bytes that should 272 /// be passed in. Otherwise, return null. 273 /// 274 /// Note that we don't ever attempt to use memset_pattern8 or 4, because these 275 /// just replicate their input array and then pass on to memset_pattern16. 276 static Constant *getMemSetPatternValue(Value *V, const DataLayout *DL) { 277 // If the value isn't a constant, we can't promote it to being in a constant 278 // array. We could theoretically do a store to an alloca or something, but 279 // that doesn't seem worthwhile. 280 Constant *C = dyn_cast<Constant>(V); 281 if (!C) 282 return nullptr; 283 284 // Only handle simple values that are a power of two bytes in size. 285 uint64_t Size = DL->getTypeSizeInBits(V->getType()); 286 if (Size == 0 || (Size & 7) || (Size & (Size - 1))) 287 return nullptr; 288 289 // Don't care enough about darwin/ppc to implement this. 290 if (DL->isBigEndian()) 291 return nullptr; 292 293 // Convert to size in bytes. 294 Size /= 8; 295 296 // TODO: If CI is larger than 16-bytes, we can try slicing it in half to see 297 // if the top and bottom are the same (e.g. for vectors and large integers). 298 if (Size > 16) 299 return nullptr; 300 301 // If the constant is exactly 16 bytes, just use it. 302 if (Size == 16) 303 return C; 304 305 // Otherwise, we'll use an array of the constants. 306 unsigned ArraySize = 16 / Size; 307 ArrayType *AT = ArrayType::get(V->getType(), ArraySize); 308 return ConstantArray::get(AT, std::vector<Constant *>(ArraySize, C)); 309 } 310 311 bool LoopIdiomRecognize::isLegalStore(StoreInst *SI, bool &ForMemset, 312 bool &ForMemsetPattern, bool &ForMemcpy) { 313 // Don't touch volatile stores. 314 if (!SI->isSimple()) 315 return false; 316 317 Value *StoredVal = SI->getValueOperand(); 318 Value *StorePtr = SI->getPointerOperand(); 319 320 // Reject stores that are so large that they overflow an unsigned. 321 uint64_t SizeInBits = DL->getTypeSizeInBits(StoredVal->getType()); 322 if ((SizeInBits & 7) || (SizeInBits >> 32) != 0) 323 return false; 324 325 // See if the pointer expression is an AddRec like {base,+,1} on the current 326 // loop, which indicates a strided store. If we have something else, it's a 327 // random store we can't handle. 328 const SCEVAddRecExpr *StoreEv = 329 dyn_cast<SCEVAddRecExpr>(SE->getSCEV(StorePtr)); 330 if (!StoreEv || StoreEv->getLoop() != CurLoop || !StoreEv->isAffine()) 331 return false; 332 333 // Check to see if we have a constant stride. 334 if (!isa<SCEVConstant>(StoreEv->getOperand(1))) 335 return false; 336 337 // See if the store can be turned into a memset. 338 339 // If the stored value is a byte-wise value (like i32 -1), then it may be 340 // turned into a memset of i8 -1, assuming that all the consecutive bytes 341 // are stored. A store of i32 0x01020304 can never be turned into a memset, 342 // but it can be turned into memset_pattern if the target supports it. 343 Value *SplatValue = isBytewiseValue(StoredVal); 344 Constant *PatternValue = nullptr; 345 346 // If we're allowed to form a memset, and the stored value would be 347 // acceptable for memset, use it. 348 if (HasMemset && SplatValue && 349 // Verify that the stored value is loop invariant. If not, we can't 350 // promote the memset. 351 CurLoop->isLoopInvariant(SplatValue)) { 352 // It looks like we can use SplatValue. 353 ForMemset = true; 354 return true; 355 } else if (HasMemsetPattern && 356 // Don't create memset_pattern16s with address spaces. 357 StorePtr->getType()->getPointerAddressSpace() == 0 && 358 (PatternValue = getMemSetPatternValue(StoredVal, DL))) { 359 // It looks like we can use PatternValue! 360 ForMemsetPattern = true; 361 return true; 362 } 363 364 // Otherwise, see if the store can be turned into a memcpy. 365 if (HasMemcpy) { 366 // Check to see if the stride matches the size of the store. If so, then we 367 // know that every byte is touched in the loop. 368 unsigned Stride = getStoreStride(StoreEv); 369 unsigned StoreSize = getStoreSizeInBytes(SI, DL); 370 if (StoreSize != Stride && StoreSize != -Stride) 371 return false; 372 373 // The store must be feeding a non-volatile load. 374 LoadInst *LI = dyn_cast<LoadInst>(SI->getValueOperand()); 375 if (!LI || !LI->isSimple()) 376 return false; 377 378 // See if the pointer expression is an AddRec like {base,+,1} on the current 379 // loop, which indicates a strided load. If we have something else, it's a 380 // random load we can't handle. 381 const SCEVAddRecExpr *LoadEv = 382 dyn_cast<SCEVAddRecExpr>(SE->getSCEV(LI->getPointerOperand())); 383 if (!LoadEv || LoadEv->getLoop() != CurLoop || !LoadEv->isAffine()) 384 return false; 385 386 // The store and load must share the same stride. 387 if (StoreEv->getOperand(1) != LoadEv->getOperand(1)) 388 return false; 389 390 // Success. This store can be converted into a memcpy. 391 ForMemcpy = true; 392 return true; 393 } 394 // This store can't be transformed into a memset/memcpy. 395 return false; 396 } 397 398 void LoopIdiomRecognize::collectStores(BasicBlock *BB) { 399 StoreRefsForMemset.clear(); 400 StoreRefsForMemsetPattern.clear(); 401 StoreRefsForMemcpy.clear(); 402 for (Instruction &I : *BB) { 403 StoreInst *SI = dyn_cast<StoreInst>(&I); 404 if (!SI) 405 continue; 406 407 bool ForMemset = false; 408 bool ForMemsetPattern = false; 409 bool ForMemcpy = false; 410 // Make sure this is a strided store with a constant stride. 411 if (!isLegalStore(SI, ForMemset, ForMemsetPattern, ForMemcpy)) 412 continue; 413 414 // Save the store locations. 415 if (ForMemset) { 416 // Find the base pointer. 417 Value *Ptr = GetUnderlyingObject(SI->getPointerOperand(), *DL); 418 StoreRefsForMemset[Ptr].push_back(SI); 419 } else if (ForMemsetPattern) { 420 // Find the base pointer. 421 Value *Ptr = GetUnderlyingObject(SI->getPointerOperand(), *DL); 422 StoreRefsForMemsetPattern[Ptr].push_back(SI); 423 } else if (ForMemcpy) 424 StoreRefsForMemcpy.push_back(SI); 425 } 426 } 427 428 /// runOnLoopBlock - Process the specified block, which lives in a counted loop 429 /// with the specified backedge count. This block is known to be in the current 430 /// loop and not in any subloops. 431 bool LoopIdiomRecognize::runOnLoopBlock( 432 BasicBlock *BB, const SCEV *BECount, 433 SmallVectorImpl<BasicBlock *> &ExitBlocks) { 434 // We can only promote stores in this block if they are unconditionally 435 // executed in the loop. For a block to be unconditionally executed, it has 436 // to dominate all the exit blocks of the loop. Verify this now. 437 for (unsigned i = 0, e = ExitBlocks.size(); i != e; ++i) 438 if (!DT->dominates(BB, ExitBlocks[i])) 439 return false; 440 441 bool MadeChange = false; 442 // Look for store instructions, which may be optimized to memset/memcpy. 443 collectStores(BB); 444 445 // Look for a single store or sets of stores with a common base, which can be 446 // optimized into a memset (memset_pattern). The latter most commonly happens 447 // with structs and handunrolled loops. 448 for (auto &SL : StoreRefsForMemset) 449 MadeChange |= processLoopStores(SL.second, BECount, true); 450 451 for (auto &SL : StoreRefsForMemsetPattern) 452 MadeChange |= processLoopStores(SL.second, BECount, false); 453 454 // Optimize the store into a memcpy, if it feeds an similarly strided load. 455 for (auto &SI : StoreRefsForMemcpy) 456 MadeChange |= processLoopStoreOfLoopLoad(SI, BECount); 457 458 for (BasicBlock::iterator I = BB->begin(), E = BB->end(); I != E;) { 459 Instruction *Inst = &*I++; 460 // Look for memset instructions, which may be optimized to a larger memset. 461 if (MemSetInst *MSI = dyn_cast<MemSetInst>(Inst)) { 462 WeakVH InstPtr(&*I); 463 if (!processLoopMemSet(MSI, BECount)) 464 continue; 465 MadeChange = true; 466 467 // If processing the memset invalidated our iterator, start over from the 468 // top of the block. 469 if (!InstPtr) 470 I = BB->begin(); 471 continue; 472 } 473 } 474 475 return MadeChange; 476 } 477 478 /// processLoopStores - See if this store(s) can be promoted to a memset. 479 bool LoopIdiomRecognize::processLoopStores(SmallVectorImpl<StoreInst *> &SL, 480 const SCEV *BECount, 481 bool ForMemset) { 482 // Try to find consecutive stores that can be transformed into memsets. 483 SetVector<StoreInst *> Heads, Tails; 484 SmallDenseMap<StoreInst *, StoreInst *> ConsecutiveChain; 485 486 // Do a quadratic search on all of the given stores and find 487 // all of the pairs of stores that follow each other. 488 SmallVector<unsigned, 16> IndexQueue; 489 for (unsigned i = 0, e = SL.size(); i < e; ++i) { 490 assert(SL[i]->isSimple() && "Expected only non-volatile stores."); 491 492 Value *FirstStoredVal = SL[i]->getValueOperand(); 493 Value *FirstStorePtr = SL[i]->getPointerOperand(); 494 const SCEVAddRecExpr *FirstStoreEv = 495 cast<SCEVAddRecExpr>(SE->getSCEV(FirstStorePtr)); 496 unsigned FirstStride = getStoreStride(FirstStoreEv); 497 unsigned FirstStoreSize = getStoreSizeInBytes(SL[i], DL); 498 499 // See if we can optimize just this store in isolation. 500 if (FirstStride == FirstStoreSize || FirstStride == -FirstStoreSize) { 501 Heads.insert(SL[i]); 502 continue; 503 } 504 505 Value *FirstSplatValue = nullptr; 506 Constant *FirstPatternValue = nullptr; 507 508 if (ForMemset) 509 FirstSplatValue = isBytewiseValue(FirstStoredVal); 510 else 511 FirstPatternValue = getMemSetPatternValue(FirstStoredVal, DL); 512 513 assert((FirstSplatValue || FirstPatternValue) && 514 "Expected either splat value or pattern value."); 515 516 IndexQueue.clear(); 517 // If a store has multiple consecutive store candidates, search Stores 518 // array according to the sequence: from i+1 to e, then from i-1 to 0. 519 // This is because usually pairing with immediate succeeding or preceding 520 // candidate create the best chance to find memset opportunity. 521 unsigned j = 0; 522 for (j = i + 1; j < e; ++j) 523 IndexQueue.push_back(j); 524 for (j = i; j > 0; --j) 525 IndexQueue.push_back(j - 1); 526 527 for (auto &k : IndexQueue) { 528 assert(SL[k]->isSimple() && "Expected only non-volatile stores."); 529 Value *SecondStorePtr = SL[k]->getPointerOperand(); 530 const SCEVAddRecExpr *SecondStoreEv = 531 cast<SCEVAddRecExpr>(SE->getSCEV(SecondStorePtr)); 532 unsigned SecondStride = getStoreStride(SecondStoreEv); 533 534 if (FirstStride != SecondStride) 535 continue; 536 537 Value *SecondStoredVal = SL[k]->getValueOperand(); 538 Value *SecondSplatValue = nullptr; 539 Constant *SecondPatternValue = nullptr; 540 541 if (ForMemset) 542 SecondSplatValue = isBytewiseValue(SecondStoredVal); 543 else 544 SecondPatternValue = getMemSetPatternValue(SecondStoredVal, DL); 545 546 assert((SecondSplatValue || SecondPatternValue) && 547 "Expected either splat value or pattern value."); 548 549 if (isConsecutiveAccess(SL[i], SL[k], *DL, *SE, false)) { 550 if (ForMemset) { 551 if (FirstSplatValue != SecondSplatValue) 552 continue; 553 } else { 554 if (FirstPatternValue != SecondPatternValue) 555 continue; 556 } 557 Tails.insert(SL[k]); 558 Heads.insert(SL[i]); 559 ConsecutiveChain[SL[i]] = SL[k]; 560 break; 561 } 562 } 563 } 564 565 // We may run into multiple chains that merge into a single chain. We mark the 566 // stores that we transformed so that we don't visit the same store twice. 567 SmallPtrSet<Value *, 16> TransformedStores; 568 bool Changed = false; 569 570 // For stores that start but don't end a link in the chain: 571 for (SetVector<StoreInst *>::iterator it = Heads.begin(), e = Heads.end(); 572 it != e; ++it) { 573 if (Tails.count(*it)) 574 continue; 575 576 // We found a store instr that starts a chain. Now follow the chain and try 577 // to transform it. 578 SmallPtrSet<Instruction *, 8> AdjacentStores; 579 StoreInst *I = *it; 580 581 StoreInst *HeadStore = I; 582 unsigned StoreSize = 0; 583 584 // Collect the chain into a list. 585 while (Tails.count(I) || Heads.count(I)) { 586 if (TransformedStores.count(I)) 587 break; 588 AdjacentStores.insert(I); 589 590 StoreSize += getStoreSizeInBytes(I, DL); 591 // Move to the next value in the chain. 592 I = ConsecutiveChain[I]; 593 } 594 595 Value *StoredVal = HeadStore->getValueOperand(); 596 Value *StorePtr = HeadStore->getPointerOperand(); 597 const SCEVAddRecExpr *StoreEv = cast<SCEVAddRecExpr>(SE->getSCEV(StorePtr)); 598 unsigned Stride = getStoreStride(StoreEv); 599 600 // Check to see if the stride matches the size of the stores. If so, then 601 // we know that every byte is touched in the loop. 602 if (StoreSize != Stride && StoreSize != -Stride) 603 continue; 604 605 bool NegStride = StoreSize == -Stride; 606 607 if (processLoopStridedStore(StorePtr, StoreSize, HeadStore->getAlignment(), 608 StoredVal, HeadStore, AdjacentStores, StoreEv, 609 BECount, NegStride)) { 610 TransformedStores.insert(AdjacentStores.begin(), AdjacentStores.end()); 611 Changed = true; 612 } 613 } 614 615 return Changed; 616 } 617 618 /// processLoopMemSet - See if this memset can be promoted to a large memset. 619 bool LoopIdiomRecognize::processLoopMemSet(MemSetInst *MSI, 620 const SCEV *BECount) { 621 // We can only handle non-volatile memsets with a constant size. 622 if (MSI->isVolatile() || !isa<ConstantInt>(MSI->getLength())) 623 return false; 624 625 // If we're not allowed to hack on memset, we fail. 626 if (!TLI->has(LibFunc::memset)) 627 return false; 628 629 Value *Pointer = MSI->getDest(); 630 631 // See if the pointer expression is an AddRec like {base,+,1} on the current 632 // loop, which indicates a strided store. If we have something else, it's a 633 // random store we can't handle. 634 const SCEVAddRecExpr *Ev = dyn_cast<SCEVAddRecExpr>(SE->getSCEV(Pointer)); 635 if (!Ev || Ev->getLoop() != CurLoop || !Ev->isAffine()) 636 return false; 637 638 // Reject memsets that are so large that they overflow an unsigned. 639 uint64_t SizeInBytes = cast<ConstantInt>(MSI->getLength())->getZExtValue(); 640 if ((SizeInBytes >> 32) != 0) 641 return false; 642 643 // Check to see if the stride matches the size of the memset. If so, then we 644 // know that every byte is touched in the loop. 645 const SCEVConstant *Stride = dyn_cast<SCEVConstant>(Ev->getOperand(1)); 646 647 // TODO: Could also handle negative stride here someday, that will require the 648 // validity check in mayLoopAccessLocation to be updated though. 649 if (!Stride || MSI->getLength() != Stride->getValue()) 650 return false; 651 652 // Verify that the memset value is loop invariant. If not, we can't promote 653 // the memset. 654 Value *SplatValue = MSI->getValue(); 655 if (!SplatValue || !CurLoop->isLoopInvariant(SplatValue)) 656 return false; 657 658 SmallPtrSet<Instruction *, 1> MSIs; 659 MSIs.insert(MSI); 660 return processLoopStridedStore(Pointer, (unsigned)SizeInBytes, 661 MSI->getAlignment(), SplatValue, MSI, MSIs, Ev, 662 BECount, /*NegStride=*/false); 663 } 664 665 /// mayLoopAccessLocation - Return true if the specified loop might access the 666 /// specified pointer location, which is a loop-strided access. The 'Access' 667 /// argument specifies what the verboten forms of access are (read or write). 668 static bool 669 mayLoopAccessLocation(Value *Ptr, ModRefInfo Access, Loop *L, 670 const SCEV *BECount, unsigned StoreSize, 671 AliasAnalysis &AA, 672 SmallPtrSetImpl<Instruction *> &IgnoredStores) { 673 // Get the location that may be stored across the loop. Since the access is 674 // strided positively through memory, we say that the modified location starts 675 // at the pointer and has infinite size. 676 uint64_t AccessSize = MemoryLocation::UnknownSize; 677 678 // If the loop iterates a fixed number of times, we can refine the access size 679 // to be exactly the size of the memset, which is (BECount+1)*StoreSize 680 if (const SCEVConstant *BECst = dyn_cast<SCEVConstant>(BECount)) 681 AccessSize = (BECst->getValue()->getZExtValue() + 1) * StoreSize; 682 683 // TODO: For this to be really effective, we have to dive into the pointer 684 // operand in the store. Store to &A[i] of 100 will always return may alias 685 // with store of &A[100], we need to StoreLoc to be "A" with size of 100, 686 // which will then no-alias a store to &A[100]. 687 MemoryLocation StoreLoc(Ptr, AccessSize); 688 689 for (Loop::block_iterator BI = L->block_begin(), E = L->block_end(); BI != E; 690 ++BI) 691 for (BasicBlock::iterator I = (*BI)->begin(), E = (*BI)->end(); I != E; ++I) 692 if (IgnoredStores.count(&*I) == 0 && 693 (AA.getModRefInfo(&*I, StoreLoc) & Access)) 694 return true; 695 696 return false; 697 } 698 699 // If we have a negative stride, Start refers to the end of the memory location 700 // we're trying to memset. Therefore, we need to recompute the base pointer, 701 // which is just Start - BECount*Size. 702 static const SCEV *getStartForNegStride(const SCEV *Start, const SCEV *BECount, 703 Type *IntPtr, unsigned StoreSize, 704 ScalarEvolution *SE) { 705 const SCEV *Index = SE->getTruncateOrZeroExtend(BECount, IntPtr); 706 if (StoreSize != 1) 707 Index = SE->getMulExpr(Index, SE->getConstant(IntPtr, StoreSize), 708 SCEV::FlagNUW); 709 return SE->getMinusSCEV(Start, Index); 710 } 711 712 /// processLoopStridedStore - We see a strided store of some value. If we can 713 /// transform this into a memset or memset_pattern in the loop preheader, do so. 714 bool LoopIdiomRecognize::processLoopStridedStore( 715 Value *DestPtr, unsigned StoreSize, unsigned StoreAlignment, 716 Value *StoredVal, Instruction *TheStore, 717 SmallPtrSetImpl<Instruction *> &Stores, const SCEVAddRecExpr *Ev, 718 const SCEV *BECount, bool NegStride) { 719 Value *SplatValue = isBytewiseValue(StoredVal); 720 Constant *PatternValue = nullptr; 721 722 if (!SplatValue) 723 PatternValue = getMemSetPatternValue(StoredVal, DL); 724 725 assert((SplatValue || PatternValue) && 726 "Expected either splat value or pattern value."); 727 728 // The trip count of the loop and the base pointer of the addrec SCEV is 729 // guaranteed to be loop invariant, which means that it should dominate the 730 // header. This allows us to insert code for it in the preheader. 731 unsigned DestAS = DestPtr->getType()->getPointerAddressSpace(); 732 BasicBlock *Preheader = CurLoop->getLoopPreheader(); 733 IRBuilder<> Builder(Preheader->getTerminator()); 734 SCEVExpander Expander(*SE, *DL, "loop-idiom"); 735 736 Type *DestInt8PtrTy = Builder.getInt8PtrTy(DestAS); 737 Type *IntPtr = Builder.getIntPtrTy(*DL, DestAS); 738 739 const SCEV *Start = Ev->getStart(); 740 // Handle negative strided loops. 741 if (NegStride) 742 Start = getStartForNegStride(Start, BECount, IntPtr, StoreSize, SE); 743 744 // Okay, we have a strided store "p[i]" of a splattable value. We can turn 745 // this into a memset in the loop preheader now if we want. However, this 746 // would be unsafe to do if there is anything else in the loop that may read 747 // or write to the aliased location. Check for any overlap by generating the 748 // base pointer and checking the region. 749 Value *BasePtr = 750 Expander.expandCodeFor(Start, DestInt8PtrTy, Preheader->getTerminator()); 751 if (mayLoopAccessLocation(BasePtr, MRI_ModRef, CurLoop, BECount, StoreSize, 752 *AA, Stores)) { 753 Expander.clear(); 754 // If we generated new code for the base pointer, clean up. 755 RecursivelyDeleteTriviallyDeadInstructions(BasePtr, TLI); 756 return false; 757 } 758 759 // Okay, everything looks good, insert the memset. 760 761 // The # stored bytes is (BECount+1)*Size. Expand the trip count out to 762 // pointer size if it isn't already. 763 BECount = SE->getTruncateOrZeroExtend(BECount, IntPtr); 764 765 const SCEV *NumBytesS = 766 SE->getAddExpr(BECount, SE->getOne(IntPtr), SCEV::FlagNUW); 767 if (StoreSize != 1) { 768 NumBytesS = SE->getMulExpr(NumBytesS, SE->getConstant(IntPtr, StoreSize), 769 SCEV::FlagNUW); 770 } 771 772 Value *NumBytes = 773 Expander.expandCodeFor(NumBytesS, IntPtr, Preheader->getTerminator()); 774 775 CallInst *NewCall; 776 if (SplatValue) { 777 NewCall = 778 Builder.CreateMemSet(BasePtr, SplatValue, NumBytes, StoreAlignment); 779 } else { 780 // Everything is emitted in default address space 781 Type *Int8PtrTy = DestInt8PtrTy; 782 783 Module *M = TheStore->getModule(); 784 Value *MSP = 785 M->getOrInsertFunction("memset_pattern16", Builder.getVoidTy(), 786 Int8PtrTy, Int8PtrTy, IntPtr, (void *)nullptr); 787 788 // Otherwise we should form a memset_pattern16. PatternValue is known to be 789 // an constant array of 16-bytes. Plop the value into a mergable global. 790 GlobalVariable *GV = new GlobalVariable(*M, PatternValue->getType(), true, 791 GlobalValue::PrivateLinkage, 792 PatternValue, ".memset_pattern"); 793 GV->setUnnamedAddr(true); // Ok to merge these. 794 GV->setAlignment(16); 795 Value *PatternPtr = ConstantExpr::getBitCast(GV, Int8PtrTy); 796 NewCall = Builder.CreateCall(MSP, {BasePtr, PatternPtr, NumBytes}); 797 } 798 799 DEBUG(dbgs() << " Formed memset: " << *NewCall << "\n" 800 << " from store to: " << *Ev << " at: " << *TheStore << "\n"); 801 NewCall->setDebugLoc(TheStore->getDebugLoc()); 802 803 // Okay, the memset has been formed. Zap the original store and anything that 804 // feeds into it. 805 for (auto *I : Stores) 806 deleteDeadInstruction(I, TLI); 807 ++NumMemSet; 808 return true; 809 } 810 811 /// If the stored value is a strided load in the same loop with the same stride 812 /// this may be transformable into a memcpy. This kicks in for stuff like 813 /// for (i) A[i] = B[i]; 814 bool LoopIdiomRecognize::processLoopStoreOfLoopLoad(StoreInst *SI, 815 const SCEV *BECount) { 816 assert(SI->isSimple() && "Expected only non-volatile stores."); 817 818 Value *StorePtr = SI->getPointerOperand(); 819 const SCEVAddRecExpr *StoreEv = cast<SCEVAddRecExpr>(SE->getSCEV(StorePtr)); 820 unsigned Stride = getStoreStride(StoreEv); 821 unsigned StoreSize = getStoreSizeInBytes(SI, DL); 822 bool NegStride = StoreSize == -Stride; 823 824 // The store must be feeding a non-volatile load. 825 LoadInst *LI = cast<LoadInst>(SI->getValueOperand()); 826 assert(LI->isSimple() && "Expected only non-volatile stores."); 827 828 // See if the pointer expression is an AddRec like {base,+,1} on the current 829 // loop, which indicates a strided load. If we have something else, it's a 830 // random load we can't handle. 831 const SCEVAddRecExpr *LoadEv = 832 cast<SCEVAddRecExpr>(SE->getSCEV(LI->getPointerOperand())); 833 834 // The trip count of the loop and the base pointer of the addrec SCEV is 835 // guaranteed to be loop invariant, which means that it should dominate the 836 // header. This allows us to insert code for it in the preheader. 837 BasicBlock *Preheader = CurLoop->getLoopPreheader(); 838 IRBuilder<> Builder(Preheader->getTerminator()); 839 SCEVExpander Expander(*SE, *DL, "loop-idiom"); 840 841 const SCEV *StrStart = StoreEv->getStart(); 842 unsigned StrAS = SI->getPointerAddressSpace(); 843 Type *IntPtrTy = Builder.getIntPtrTy(*DL, StrAS); 844 845 // Handle negative strided loops. 846 if (NegStride) 847 StrStart = getStartForNegStride(StrStart, BECount, IntPtrTy, StoreSize, SE); 848 849 // Okay, we have a strided store "p[i]" of a loaded value. We can turn 850 // this into a memcpy in the loop preheader now if we want. However, this 851 // would be unsafe to do if there is anything else in the loop that may read 852 // or write the memory region we're storing to. This includes the load that 853 // feeds the stores. Check for an alias by generating the base address and 854 // checking everything. 855 Value *StoreBasePtr = Expander.expandCodeFor( 856 StrStart, Builder.getInt8PtrTy(StrAS), Preheader->getTerminator()); 857 858 SmallPtrSet<Instruction *, 1> Stores; 859 Stores.insert(SI); 860 if (mayLoopAccessLocation(StoreBasePtr, MRI_ModRef, CurLoop, BECount, 861 StoreSize, *AA, Stores)) { 862 Expander.clear(); 863 // If we generated new code for the base pointer, clean up. 864 RecursivelyDeleteTriviallyDeadInstructions(StoreBasePtr, TLI); 865 return false; 866 } 867 868 const SCEV *LdStart = LoadEv->getStart(); 869 unsigned LdAS = LI->getPointerAddressSpace(); 870 871 // Handle negative strided loops. 872 if (NegStride) 873 LdStart = getStartForNegStride(LdStart, BECount, IntPtrTy, StoreSize, SE); 874 875 // For a memcpy, we have to make sure that the input array is not being 876 // mutated by the loop. 877 Value *LoadBasePtr = Expander.expandCodeFor( 878 LdStart, Builder.getInt8PtrTy(LdAS), Preheader->getTerminator()); 879 880 if (mayLoopAccessLocation(LoadBasePtr, MRI_Mod, CurLoop, BECount, StoreSize, 881 *AA, Stores)) { 882 Expander.clear(); 883 // If we generated new code for the base pointer, clean up. 884 RecursivelyDeleteTriviallyDeadInstructions(LoadBasePtr, TLI); 885 RecursivelyDeleteTriviallyDeadInstructions(StoreBasePtr, TLI); 886 return false; 887 } 888 889 // Okay, everything is safe, we can transform this! 890 891 // The # stored bytes is (BECount+1)*Size. Expand the trip count out to 892 // pointer size if it isn't already. 893 BECount = SE->getTruncateOrZeroExtend(BECount, IntPtrTy); 894 895 const SCEV *NumBytesS = 896 SE->getAddExpr(BECount, SE->getOne(IntPtrTy), SCEV::FlagNUW); 897 if (StoreSize != 1) 898 NumBytesS = SE->getMulExpr(NumBytesS, SE->getConstant(IntPtrTy, StoreSize), 899 SCEV::FlagNUW); 900 901 Value *NumBytes = 902 Expander.expandCodeFor(NumBytesS, IntPtrTy, Preheader->getTerminator()); 903 904 CallInst *NewCall = 905 Builder.CreateMemCpy(StoreBasePtr, LoadBasePtr, NumBytes, 906 std::min(SI->getAlignment(), LI->getAlignment())); 907 NewCall->setDebugLoc(SI->getDebugLoc()); 908 909 DEBUG(dbgs() << " Formed memcpy: " << *NewCall << "\n" 910 << " from load ptr=" << *LoadEv << " at: " << *LI << "\n" 911 << " from store ptr=" << *StoreEv << " at: " << *SI << "\n"); 912 913 // Okay, the memcpy has been formed. Zap the original store and anything that 914 // feeds into it. 915 deleteDeadInstruction(SI, TLI); 916 ++NumMemCpy; 917 return true; 918 } 919 920 bool LoopIdiomRecognize::runOnNoncountableLoop() { 921 return recognizePopcount(); 922 } 923 924 /// Check if the given conditional branch is based on the comparison between 925 /// a variable and zero, and if the variable is non-zero, the control yields to 926 /// the loop entry. If the branch matches the behavior, the variable involved 927 /// in the comparion is returned. This function will be called to see if the 928 /// precondition and postcondition of the loop are in desirable form. 929 static Value *matchCondition(BranchInst *BI, BasicBlock *LoopEntry) { 930 if (!BI || !BI->isConditional()) 931 return nullptr; 932 933 ICmpInst *Cond = dyn_cast<ICmpInst>(BI->getCondition()); 934 if (!Cond) 935 return nullptr; 936 937 ConstantInt *CmpZero = dyn_cast<ConstantInt>(Cond->getOperand(1)); 938 if (!CmpZero || !CmpZero->isZero()) 939 return nullptr; 940 941 ICmpInst::Predicate Pred = Cond->getPredicate(); 942 if ((Pred == ICmpInst::ICMP_NE && BI->getSuccessor(0) == LoopEntry) || 943 (Pred == ICmpInst::ICMP_EQ && BI->getSuccessor(1) == LoopEntry)) 944 return Cond->getOperand(0); 945 946 return nullptr; 947 } 948 949 /// Return true iff the idiom is detected in the loop. 950 /// 951 /// Additionally: 952 /// 1) \p CntInst is set to the instruction counting the population bit. 953 /// 2) \p CntPhi is set to the corresponding phi node. 954 /// 3) \p Var is set to the value whose population bits are being counted. 955 /// 956 /// The core idiom we are trying to detect is: 957 /// \code 958 /// if (x0 != 0) 959 /// goto loop-exit // the precondition of the loop 960 /// cnt0 = init-val; 961 /// do { 962 /// x1 = phi (x0, x2); 963 /// cnt1 = phi(cnt0, cnt2); 964 /// 965 /// cnt2 = cnt1 + 1; 966 /// ... 967 /// x2 = x1 & (x1 - 1); 968 /// ... 969 /// } while(x != 0); 970 /// 971 /// loop-exit: 972 /// \endcode 973 static bool detectPopcountIdiom(Loop *CurLoop, BasicBlock *PreCondBB, 974 Instruction *&CntInst, PHINode *&CntPhi, 975 Value *&Var) { 976 // step 1: Check to see if the look-back branch match this pattern: 977 // "if (a!=0) goto loop-entry". 978 BasicBlock *LoopEntry; 979 Instruction *DefX2, *CountInst; 980 Value *VarX1, *VarX0; 981 PHINode *PhiX, *CountPhi; 982 983 DefX2 = CountInst = nullptr; 984 VarX1 = VarX0 = nullptr; 985 PhiX = CountPhi = nullptr; 986 LoopEntry = *(CurLoop->block_begin()); 987 988 // step 1: Check if the loop-back branch is in desirable form. 989 { 990 if (Value *T = matchCondition( 991 dyn_cast<BranchInst>(LoopEntry->getTerminator()), LoopEntry)) 992 DefX2 = dyn_cast<Instruction>(T); 993 else 994 return false; 995 } 996 997 // step 2: detect instructions corresponding to "x2 = x1 & (x1 - 1)" 998 { 999 if (!DefX2 || DefX2->getOpcode() != Instruction::And) 1000 return false; 1001 1002 BinaryOperator *SubOneOp; 1003 1004 if ((SubOneOp = dyn_cast<BinaryOperator>(DefX2->getOperand(0)))) 1005 VarX1 = DefX2->getOperand(1); 1006 else { 1007 VarX1 = DefX2->getOperand(0); 1008 SubOneOp = dyn_cast<BinaryOperator>(DefX2->getOperand(1)); 1009 } 1010 if (!SubOneOp) 1011 return false; 1012 1013 Instruction *SubInst = cast<Instruction>(SubOneOp); 1014 ConstantInt *Dec = dyn_cast<ConstantInt>(SubInst->getOperand(1)); 1015 if (!Dec || 1016 !((SubInst->getOpcode() == Instruction::Sub && Dec->isOne()) || 1017 (SubInst->getOpcode() == Instruction::Add && 1018 Dec->isAllOnesValue()))) { 1019 return false; 1020 } 1021 } 1022 1023 // step 3: Check the recurrence of variable X 1024 { 1025 PhiX = dyn_cast<PHINode>(VarX1); 1026 if (!PhiX || 1027 (PhiX->getOperand(0) != DefX2 && PhiX->getOperand(1) != DefX2)) { 1028 return false; 1029 } 1030 } 1031 1032 // step 4: Find the instruction which count the population: cnt2 = cnt1 + 1 1033 { 1034 CountInst = nullptr; 1035 for (BasicBlock::iterator Iter = LoopEntry->getFirstNonPHI()->getIterator(), 1036 IterE = LoopEntry->end(); 1037 Iter != IterE; Iter++) { 1038 Instruction *Inst = &*Iter; 1039 if (Inst->getOpcode() != Instruction::Add) 1040 continue; 1041 1042 ConstantInt *Inc = dyn_cast<ConstantInt>(Inst->getOperand(1)); 1043 if (!Inc || !Inc->isOne()) 1044 continue; 1045 1046 PHINode *Phi = dyn_cast<PHINode>(Inst->getOperand(0)); 1047 if (!Phi || Phi->getParent() != LoopEntry) 1048 continue; 1049 1050 // Check if the result of the instruction is live of the loop. 1051 bool LiveOutLoop = false; 1052 for (User *U : Inst->users()) { 1053 if ((cast<Instruction>(U))->getParent() != LoopEntry) { 1054 LiveOutLoop = true; 1055 break; 1056 } 1057 } 1058 1059 if (LiveOutLoop) { 1060 CountInst = Inst; 1061 CountPhi = Phi; 1062 break; 1063 } 1064 } 1065 1066 if (!CountInst) 1067 return false; 1068 } 1069 1070 // step 5: check if the precondition is in this form: 1071 // "if (x != 0) goto loop-head ; else goto somewhere-we-don't-care;" 1072 { 1073 auto *PreCondBr = dyn_cast<BranchInst>(PreCondBB->getTerminator()); 1074 Value *T = matchCondition(PreCondBr, CurLoop->getLoopPreheader()); 1075 if (T != PhiX->getOperand(0) && T != PhiX->getOperand(1)) 1076 return false; 1077 1078 CntInst = CountInst; 1079 CntPhi = CountPhi; 1080 Var = T; 1081 } 1082 1083 return true; 1084 } 1085 1086 /// Recognizes a population count idiom in a non-countable loop. 1087 /// 1088 /// If detected, transforms the relevant code to issue the popcount intrinsic 1089 /// function call, and returns true; otherwise, returns false. 1090 bool LoopIdiomRecognize::recognizePopcount() { 1091 if (TTI->getPopcntSupport(32) != TargetTransformInfo::PSK_FastHardware) 1092 return false; 1093 1094 // Counting population are usually conducted by few arithmetic instructions. 1095 // Such instructions can be easily "absorbed" by vacant slots in a 1096 // non-compact loop. Therefore, recognizing popcount idiom only makes sense 1097 // in a compact loop. 1098 1099 // Give up if the loop has multiple blocks or multiple backedges. 1100 if (CurLoop->getNumBackEdges() != 1 || CurLoop->getNumBlocks() != 1) 1101 return false; 1102 1103 BasicBlock *LoopBody = *(CurLoop->block_begin()); 1104 if (LoopBody->size() >= 20) { 1105 // The loop is too big, bail out. 1106 return false; 1107 } 1108 1109 // It should have a preheader containing nothing but an unconditional branch. 1110 BasicBlock *PH = CurLoop->getLoopPreheader(); 1111 if (!PH) 1112 return false; 1113 if (&PH->front() != PH->getTerminator()) 1114 return false; 1115 auto *EntryBI = dyn_cast<BranchInst>(PH->getTerminator()); 1116 if (!EntryBI || EntryBI->isConditional()) 1117 return false; 1118 1119 // It should have a precondition block where the generated popcount instrinsic 1120 // function can be inserted. 1121 auto *PreCondBB = PH->getSinglePredecessor(); 1122 if (!PreCondBB) 1123 return false; 1124 auto *PreCondBI = dyn_cast<BranchInst>(PreCondBB->getTerminator()); 1125 if (!PreCondBI || PreCondBI->isUnconditional()) 1126 return false; 1127 1128 Instruction *CntInst; 1129 PHINode *CntPhi; 1130 Value *Val; 1131 if (!detectPopcountIdiom(CurLoop, PreCondBB, CntInst, CntPhi, Val)) 1132 return false; 1133 1134 transformLoopToPopcount(PreCondBB, CntInst, CntPhi, Val); 1135 return true; 1136 } 1137 1138 static CallInst *createPopcntIntrinsic(IRBuilder<> &IRBuilder, Value *Val, 1139 DebugLoc DL) { 1140 Value *Ops[] = {Val}; 1141 Type *Tys[] = {Val->getType()}; 1142 1143 Module *M = IRBuilder.GetInsertBlock()->getParent()->getParent(); 1144 Value *Func = Intrinsic::getDeclaration(M, Intrinsic::ctpop, Tys); 1145 CallInst *CI = IRBuilder.CreateCall(Func, Ops); 1146 CI->setDebugLoc(DL); 1147 1148 return CI; 1149 } 1150 1151 void LoopIdiomRecognize::transformLoopToPopcount(BasicBlock *PreCondBB, 1152 Instruction *CntInst, 1153 PHINode *CntPhi, Value *Var) { 1154 BasicBlock *PreHead = CurLoop->getLoopPreheader(); 1155 auto *PreCondBr = dyn_cast<BranchInst>(PreCondBB->getTerminator()); 1156 const DebugLoc DL = CntInst->getDebugLoc(); 1157 1158 // Assuming before transformation, the loop is following: 1159 // if (x) // the precondition 1160 // do { cnt++; x &= x - 1; } while(x); 1161 1162 // Step 1: Insert the ctpop instruction at the end of the precondition block 1163 IRBuilder<> Builder(PreCondBr); 1164 Value *PopCnt, *PopCntZext, *NewCount, *TripCnt; 1165 { 1166 PopCnt = createPopcntIntrinsic(Builder, Var, DL); 1167 NewCount = PopCntZext = 1168 Builder.CreateZExtOrTrunc(PopCnt, cast<IntegerType>(CntPhi->getType())); 1169 1170 if (NewCount != PopCnt) 1171 (cast<Instruction>(NewCount))->setDebugLoc(DL); 1172 1173 // TripCnt is exactly the number of iterations the loop has 1174 TripCnt = NewCount; 1175 1176 // If the population counter's initial value is not zero, insert Add Inst. 1177 Value *CntInitVal = CntPhi->getIncomingValueForBlock(PreHead); 1178 ConstantInt *InitConst = dyn_cast<ConstantInt>(CntInitVal); 1179 if (!InitConst || !InitConst->isZero()) { 1180 NewCount = Builder.CreateAdd(NewCount, CntInitVal); 1181 (cast<Instruction>(NewCount))->setDebugLoc(DL); 1182 } 1183 } 1184 1185 // Step 2: Replace the precondition from "if (x == 0) goto loop-exit" to 1186 // "if (NewCount == 0) loop-exit". Without this change, the intrinsic 1187 // function would be partial dead code, and downstream passes will drag 1188 // it back from the precondition block to the preheader. 1189 { 1190 ICmpInst *PreCond = cast<ICmpInst>(PreCondBr->getCondition()); 1191 1192 Value *Opnd0 = PopCntZext; 1193 Value *Opnd1 = ConstantInt::get(PopCntZext->getType(), 0); 1194 if (PreCond->getOperand(0) != Var) 1195 std::swap(Opnd0, Opnd1); 1196 1197 ICmpInst *NewPreCond = cast<ICmpInst>( 1198 Builder.CreateICmp(PreCond->getPredicate(), Opnd0, Opnd1)); 1199 PreCondBr->setCondition(NewPreCond); 1200 1201 RecursivelyDeleteTriviallyDeadInstructions(PreCond, TLI); 1202 } 1203 1204 // Step 3: Note that the population count is exactly the trip count of the 1205 // loop in question, which enable us to to convert the loop from noncountable 1206 // loop into a countable one. The benefit is twofold: 1207 // 1208 // - If the loop only counts population, the entire loop becomes dead after 1209 // the transformation. It is a lot easier to prove a countable loop dead 1210 // than to prove a noncountable one. (In some C dialects, an infinite loop 1211 // isn't dead even if it computes nothing useful. In general, DCE needs 1212 // to prove a noncountable loop finite before safely delete it.) 1213 // 1214 // - If the loop also performs something else, it remains alive. 1215 // Since it is transformed to countable form, it can be aggressively 1216 // optimized by some optimizations which are in general not applicable 1217 // to a noncountable loop. 1218 // 1219 // After this step, this loop (conceptually) would look like following: 1220 // newcnt = __builtin_ctpop(x); 1221 // t = newcnt; 1222 // if (x) 1223 // do { cnt++; x &= x-1; t--) } while (t > 0); 1224 BasicBlock *Body = *(CurLoop->block_begin()); 1225 { 1226 auto *LbBr = dyn_cast<BranchInst>(Body->getTerminator()); 1227 ICmpInst *LbCond = cast<ICmpInst>(LbBr->getCondition()); 1228 Type *Ty = TripCnt->getType(); 1229 1230 PHINode *TcPhi = PHINode::Create(Ty, 2, "tcphi", &Body->front()); 1231 1232 Builder.SetInsertPoint(LbCond); 1233 Instruction *TcDec = cast<Instruction>( 1234 Builder.CreateSub(TcPhi, ConstantInt::get(Ty, 1), 1235 "tcdec", false, true)); 1236 1237 TcPhi->addIncoming(TripCnt, PreHead); 1238 TcPhi->addIncoming(TcDec, Body); 1239 1240 CmpInst::Predicate Pred = 1241 (LbBr->getSuccessor(0) == Body) ? CmpInst::ICMP_UGT : CmpInst::ICMP_SLE; 1242 LbCond->setPredicate(Pred); 1243 LbCond->setOperand(0, TcDec); 1244 LbCond->setOperand(1, ConstantInt::get(Ty, 0)); 1245 } 1246 1247 // Step 4: All the references to the original population counter outside 1248 // the loop are replaced with the NewCount -- the value returned from 1249 // __builtin_ctpop(). 1250 CntInst->replaceUsesOutsideBlock(NewCount, Body); 1251 1252 // step 5: Forget the "non-computable" trip-count SCEV associated with the 1253 // loop. The loop would otherwise not be deleted even if it becomes empty. 1254 SE->forgetLoop(CurLoop); 1255 } 1256