//===- LoopIdiomRecognize.cpp - Loop idiom recognition --------------------===//
//
// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
// See https://llvm.org/LICENSE.txt for license information.
// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
//
//===----------------------------------------------------------------------===//
//
// This pass implements an idiom recognizer that transforms simple loops into a
// non-loop form. In cases where this kicks in, it can be a significant
// performance win.
//
// If compiling for code size, we avoid idiom recognition if the resulting
// code could be larger than the code for the original loop. One way this could
// happen is if the loop is not removable after idiom recognition due to the
// presence of non-idiom instructions. The initial implementation of the
// heuristics applies to idioms in multi-block loops.
//
//===----------------------------------------------------------------------===//
//
// TODO List:
//
// Future loop memory idioms to recognize:
//   memcmp, memmove, strlen, etc.
// Future floating point idioms to recognize in -ffast-math mode:
//   fpowi
// Future integer operation idioms to recognize:
//   ctpop
//
// Beware that isel's default lowering for ctpop is highly inefficient for
// i64 and larger types when i64 is legal and the value has few bits set. It
// would be good to enhance isel to emit a loop for ctpop in this case.
//
// This could recognize common matrix multiplies and dot product idioms and
// replace them with calls to BLAS (if linked in??).
//
//===----------------------------------------------------------------------===//

#include "llvm/Transforms/Scalar/LoopIdiomRecognize.h"
#include "llvm/ADT/APInt.h"
#include "llvm/ADT/ArrayRef.h"
#include "llvm/ADT/DenseMap.h"
#include "llvm/ADT/MapVector.h"
#include "llvm/ADT/SetVector.h"
#include "llvm/ADT/SmallPtrSet.h"
#include "llvm/ADT/SmallVector.h"
#include "llvm/ADT/Statistic.h"
#include "llvm/ADT/StringRef.h"
#include "llvm/Analysis/AliasAnalysis.h"
#include "llvm/Analysis/CmpInstAnalysis.h"
#include "llvm/Analysis/LoopAccessAnalysis.h"
#include "llvm/Analysis/LoopInfo.h"
#include "llvm/Analysis/LoopPass.h"
#include "llvm/Analysis/MemoryLocation.h"
#include "llvm/Analysis/MemorySSA.h"
#include "llvm/Analysis/MemorySSAUpdater.h"
#include "llvm/Analysis/MustExecute.h"
#include "llvm/Analysis/OptimizationRemarkEmitter.h"
#include "llvm/Analysis/ScalarEvolution.h"
#include "llvm/Analysis/ScalarEvolutionExpressions.h"
#include "llvm/Analysis/TargetLibraryInfo.h"
#include "llvm/Analysis/TargetTransformInfo.h"
#include "llvm/Analysis/ValueTracking.h"
#include "llvm/IR/Attributes.h"
#include "llvm/IR/BasicBlock.h"
#include "llvm/IR/Constant.h"
#include "llvm/IR/Constants.h"
#include "llvm/IR/DataLayout.h"
#include "llvm/IR/DebugLoc.h"
#include "llvm/IR/DerivedTypes.h"
#include "llvm/IR/Dominators.h"
#include "llvm/IR/GlobalValue.h"
#include "llvm/IR/GlobalVariable.h"
#include "llvm/IR/IRBuilder.h"
#include "llvm/IR/InstrTypes.h"
#include "llvm/IR/Instruction.h"
#include "llvm/IR/Instructions.h"
#include "llvm/IR/IntrinsicInst.h"
#include "llvm/IR/Intrinsics.h"
#include "llvm/IR/LLVMContext.h"
#include "llvm/IR/Module.h"
#include "llvm/IR/PassManager.h"
#include "llvm/IR/PatternMatch.h"
#include "llvm/IR/Type.h"
"llvm/IR/Type.h" 85 #include "llvm/IR/User.h" 86 #include "llvm/IR/Value.h" 87 #include "llvm/IR/ValueHandle.h" 88 #include "llvm/InitializePasses.h" 89 #include "llvm/Pass.h" 90 #include "llvm/Support/Casting.h" 91 #include "llvm/Support/CommandLine.h" 92 #include "llvm/Support/Debug.h" 93 #include "llvm/Support/InstructionCost.h" 94 #include "llvm/Support/raw_ostream.h" 95 #include "llvm/Transforms/Scalar.h" 96 #include "llvm/Transforms/Utils/BuildLibCalls.h" 97 #include "llvm/Transforms/Utils/Local.h" 98 #include "llvm/Transforms/Utils/LoopUtils.h" 99 #include "llvm/Transforms/Utils/ScalarEvolutionExpander.h" 100 #include <algorithm> 101 #include <cassert> 102 #include <cstdint> 103 #include <utility> 104 #include <vector> 105 106 using namespace llvm; 107 108 #define DEBUG_TYPE "loop-idiom" 109 110 STATISTIC(NumMemSet, "Number of memset's formed from loop stores"); 111 STATISTIC(NumMemCpy, "Number of memcpy's formed from loop load+stores"); 112 STATISTIC( 113 NumShiftUntilBitTest, 114 "Number of uncountable loops recognized as 'shift until bitttest' idiom"); 115 116 bool DisableLIRP::All; 117 static cl::opt<bool, true> 118 DisableLIRPAll("disable-" DEBUG_TYPE "-all", 119 cl::desc("Options to disable Loop Idiom Recognize Pass."), 120 cl::location(DisableLIRP::All), cl::init(false), 121 cl::ReallyHidden); 122 123 bool DisableLIRP::Memset; 124 static cl::opt<bool, true> 125 DisableLIRPMemset("disable-" DEBUG_TYPE "-memset", 126 cl::desc("Proceed with loop idiom recognize pass, but do " 127 "not convert loop(s) to memset."), 128 cl::location(DisableLIRP::Memset), cl::init(false), 129 cl::ReallyHidden); 130 131 bool DisableLIRP::Memcpy; 132 static cl::opt<bool, true> 133 DisableLIRPMemcpy("disable-" DEBUG_TYPE "-memcpy", 134 cl::desc("Proceed with loop idiom recognize pass, but do " 135 "not convert loop(s) to memcpy."), 136 cl::location(DisableLIRP::Memcpy), cl::init(false), 137 cl::ReallyHidden); 138 139 static cl::opt<bool> UseLIRCodeSizeHeurs( 140 "use-lir-code-size-heurs", 141 cl::desc("Use loop idiom recognition code size heuristics when compiling" 142 "with -Os/-Oz"), 143 cl::init(true), cl::Hidden); 144 145 namespace { 146 147 class LoopIdiomRecognize { 148 Loop *CurLoop = nullptr; 149 AliasAnalysis *AA; 150 DominatorTree *DT; 151 LoopInfo *LI; 152 ScalarEvolution *SE; 153 TargetLibraryInfo *TLI; 154 const TargetTransformInfo *TTI; 155 const DataLayout *DL; 156 OptimizationRemarkEmitter &ORE; 157 bool ApplyCodeSizeHeuristics; 158 std::unique_ptr<MemorySSAUpdater> MSSAU; 159 160 public: 161 explicit LoopIdiomRecognize(AliasAnalysis *AA, DominatorTree *DT, 162 LoopInfo *LI, ScalarEvolution *SE, 163 TargetLibraryInfo *TLI, 164 const TargetTransformInfo *TTI, MemorySSA *MSSA, 165 const DataLayout *DL, 166 OptimizationRemarkEmitter &ORE) 167 : AA(AA), DT(DT), LI(LI), SE(SE), TLI(TLI), TTI(TTI), DL(DL), ORE(ORE) { 168 if (MSSA) 169 MSSAU = std::make_unique<MemorySSAUpdater>(MSSA); 170 } 171 172 bool runOnLoop(Loop *L); 173 174 private: 175 using StoreList = SmallVector<StoreInst *, 8>; 176 using StoreListMap = MapVector<Value *, StoreList>; 177 178 StoreListMap StoreRefsForMemset; 179 StoreListMap StoreRefsForMemsetPattern; 180 StoreList StoreRefsForMemcpy; 181 bool HasMemset; 182 bool HasMemsetPattern; 183 bool HasMemcpy; 184 185 /// Return code for isLegalStore() 186 enum LegalStoreKind { 187 None = 0, 188 Memset, 189 MemsetPattern, 190 Memcpy, 191 UnorderedAtomicMemcpy, 192 DontUse // Dummy retval never to be used. Allows catching errors in retval 193 // handling. 
  };

  /// \name Countable Loop Idiom Handling
  /// @{

  bool runOnCountableLoop();
  bool runOnLoopBlock(BasicBlock *BB, const SCEV *BECount,
                      SmallVectorImpl<BasicBlock *> &ExitBlocks);

  void collectStores(BasicBlock *BB);
  LegalStoreKind isLegalStore(StoreInst *SI);
  enum class ForMemset { No, Yes };
  bool processLoopStores(SmallVectorImpl<StoreInst *> &SL, const SCEV *BECount,
                         ForMemset For);

  template <typename MemInst>
  bool processLoopMemIntrinsic(
      BasicBlock *BB,
      bool (LoopIdiomRecognize::*Processor)(MemInst *, const SCEV *),
      const SCEV *BECount);
  bool processLoopMemCpy(MemCpyInst *MCI, const SCEV *BECount);
  bool processLoopMemSet(MemSetInst *MSI, const SCEV *BECount);

  bool processLoopStridedStore(Value *DestPtr, unsigned StoreSize,
                               MaybeAlign StoreAlignment, Value *StoredVal,
                               Instruction *TheStore,
                               SmallPtrSetImpl<Instruction *> &Stores,
                               const SCEVAddRecExpr *Ev, const SCEV *BECount,
                               bool NegStride, bool IsLoopMemset = false);
  bool processLoopStoreOfLoopLoad(StoreInst *SI, const SCEV *BECount);
  bool processLoopStoreOfLoopLoad(Value *DestPtr, Value *SourcePtr,
                                  unsigned StoreSize, MaybeAlign StoreAlign,
                                  MaybeAlign LoadAlign, Instruction *TheStore,
                                  Instruction *TheLoad,
                                  const SCEVAddRecExpr *StoreEv,
                                  const SCEVAddRecExpr *LoadEv,
                                  const SCEV *BECount);
  bool avoidLIRForMultiBlockLoop(bool IsMemset = false,
                                 bool IsLoopMemset = false);

  /// @}
  /// \name Noncountable Loop Idiom Handling
  /// @{

  bool runOnNoncountableLoop();

  bool recognizePopcount();
  void transformLoopToPopcount(BasicBlock *PreCondBB, Instruction *CntInst,
                               PHINode *CntPhi, Value *Var);
  bool recognizeAndInsertFFS(); /// Find First Set: ctlz or cttz
  void transformLoopToCountable(Intrinsic::ID IntrinID, BasicBlock *PreCondBB,
                                Instruction *CntInst, PHINode *CntPhi,
                                Value *Var, Instruction *DefX,
                                const DebugLoc &DL, bool ZeroCheck,
                                bool IsCntPhiUsedOutsideLoop);

  bool recognizeShiftUntilBitTest();

  /// @}
};

class LoopIdiomRecognizeLegacyPass : public LoopPass {
public:
  static char ID;

  explicit LoopIdiomRecognizeLegacyPass() : LoopPass(ID) {
    initializeLoopIdiomRecognizeLegacyPassPass(
        *PassRegistry::getPassRegistry());
  }

  bool runOnLoop(Loop *L, LPPassManager &LPM) override {
    if (DisableLIRP::All)
      return false;

    if (skipLoop(L))
      return false;

    AliasAnalysis *AA = &getAnalysis<AAResultsWrapperPass>().getAAResults();
    DominatorTree *DT = &getAnalysis<DominatorTreeWrapperPass>().getDomTree();
    LoopInfo *LI = &getAnalysis<LoopInfoWrapperPass>().getLoopInfo();
    ScalarEvolution *SE = &getAnalysis<ScalarEvolutionWrapperPass>().getSE();
    TargetLibraryInfo *TLI =
        &getAnalysis<TargetLibraryInfoWrapperPass>().getTLI(
            *L->getHeader()->getParent());
    const TargetTransformInfo *TTI =
        &getAnalysis<TargetTransformInfoWrapperPass>().getTTI(
            *L->getHeader()->getParent());
    const DataLayout *DL = &L->getHeader()->getModule()->getDataLayout();
    auto *MSSAAnalysis = getAnalysisIfAvailable<MemorySSAWrapperPass>();
    MemorySSA *MSSA = nullptr;
    if (MSSAAnalysis)
      MSSA = &MSSAAnalysis->getMSSA();

    // For the old PM, we can't use OptimizationRemarkEmitter as an analysis
    // pass. Function analyses need to be preserved across loop transformations
    // but ORE cannot be preserved (see comment before the pass definition).
    OptimizationRemarkEmitter ORE(L->getHeader()->getParent());

    LoopIdiomRecognize LIR(AA, DT, LI, SE, TLI, TTI, MSSA, DL, ORE);
    return LIR.runOnLoop(L);
  }

  /// This transformation requires natural loop information & requires that
  /// loop preheaders be inserted into the CFG.
  void getAnalysisUsage(AnalysisUsage &AU) const override {
    AU.addRequired<TargetLibraryInfoWrapperPass>();
    AU.addRequired<TargetTransformInfoWrapperPass>();
    AU.addPreserved<MemorySSAWrapperPass>();
    getLoopAnalysisUsage(AU);
  }
};

} // end anonymous namespace

char LoopIdiomRecognizeLegacyPass::ID = 0;

PreservedAnalyses LoopIdiomRecognizePass::run(Loop &L, LoopAnalysisManager &AM,
                                              LoopStandardAnalysisResults &AR,
                                              LPMUpdater &) {
  if (DisableLIRP::All)
    return PreservedAnalyses::all();

  const auto *DL = &L.getHeader()->getModule()->getDataLayout();

  // For the new PM, we also can't use OptimizationRemarkEmitter as an analysis
  // pass. Function analyses need to be preserved across loop transformations
  // but ORE cannot be preserved (see comment before the pass definition).
  OptimizationRemarkEmitter ORE(L.getHeader()->getParent());

  LoopIdiomRecognize LIR(&AR.AA, &AR.DT, &AR.LI, &AR.SE, &AR.TLI, &AR.TTI,
                         AR.MSSA, DL, ORE);
  if (!LIR.runOnLoop(&L))
    return PreservedAnalyses::all();

  auto PA = getLoopPassPreservedAnalyses();
  if (AR.MSSA)
    PA.preserve<MemorySSAAnalysis>();
  return PA;
}

INITIALIZE_PASS_BEGIN(LoopIdiomRecognizeLegacyPass, "loop-idiom",
                      "Recognize loop idioms", false, false)
INITIALIZE_PASS_DEPENDENCY(LoopPass)
INITIALIZE_PASS_DEPENDENCY(TargetLibraryInfoWrapperPass)
INITIALIZE_PASS_DEPENDENCY(TargetTransformInfoWrapperPass)
INITIALIZE_PASS_END(LoopIdiomRecognizeLegacyPass, "loop-idiom",
                    "Recognize loop idioms", false, false)

Pass *llvm::createLoopIdiomPass() { return new LoopIdiomRecognizeLegacyPass(); }

static void deleteDeadInstruction(Instruction *I) {
  I->replaceAllUsesWith(UndefValue::get(I->getType()));
  I->eraseFromParent();
}

//===----------------------------------------------------------------------===//
//
//          Implementation of LoopIdiomRecognize
//
//===----------------------------------------------------------------------===//

bool LoopIdiomRecognize::runOnLoop(Loop *L) {
  CurLoop = L;
  // If the loop could not be converted to canonical form, it must have an
  // indirectbr in it, just give up.
  if (!L->getLoopPreheader())
    return false;

  // Disable loop idiom recognition if the function's name is a common idiom.
  StringRef Name = L->getHeader()->getParent()->getName();
  if (Name == "memset" || Name == "memcpy")
    return false;

  // Determine if code size heuristics need to be applied.
  ApplyCodeSizeHeuristics =
      L->getHeader()->getParent()->hasOptSize() && UseLIRCodeSizeHeurs;

  HasMemset = TLI->has(LibFunc_memset);
  HasMemsetPattern = TLI->has(LibFunc_memset_pattern16);
  HasMemcpy = TLI->has(LibFunc_memcpy);

  if (HasMemset || HasMemsetPattern || HasMemcpy)
    if (SE->hasLoopInvariantBackedgeTakenCount(L))
      return runOnCountableLoop();

  return runOnNoncountableLoop();
}

bool LoopIdiomRecognize::runOnCountableLoop() {
  const SCEV *BECount = SE->getBackedgeTakenCount(CurLoop);
  assert(!isa<SCEVCouldNotCompute>(BECount) &&
         "runOnCountableLoop() called on a loop without a predictable "
         "backedge-taken count");

  // If this loop executes exactly one time, then it should be peeled, not
  // optimized by this pass.
  if (const SCEVConstant *BECst = dyn_cast<SCEVConstant>(BECount))
    if (BECst->getAPInt() == 0)
      return false;

  SmallVector<BasicBlock *, 8> ExitBlocks;
  CurLoop->getUniqueExitBlocks(ExitBlocks);

  LLVM_DEBUG(dbgs() << DEBUG_TYPE " Scanning: F["
                    << CurLoop->getHeader()->getParent()->getName()
                    << "] Countable Loop %" << CurLoop->getHeader()->getName()
                    << "\n");

  // The following transforms hoist stores/memsets into the loop pre-header.
  // Give up if the loop has instructions that may throw.
  SimpleLoopSafetyInfo SafetyInfo;
  SafetyInfo.computeLoopSafetyInfo(CurLoop);
  if (SafetyInfo.anyBlockMayThrow())
    return false;

  bool MadeChange = false;

  // Scan all the blocks in the loop that are not in subloops.
  for (auto *BB : CurLoop->getBlocks()) {
    // Ignore blocks in subloops.
    if (LI->getLoopFor(BB) != CurLoop)
      continue;

    MadeChange |= runOnLoopBlock(BB, BECount, ExitBlocks);
  }
  return MadeChange;
}

static APInt getStoreStride(const SCEVAddRecExpr *StoreEv) {
  const SCEVConstant *ConstStride = cast<SCEVConstant>(StoreEv->getOperand(1));
  return ConstStride->getAPInt();
}

/// getMemSetPatternValue - If a strided store of the specified value is safe to
/// turn into a memset_pattern16, return a ConstantArray of 16 bytes that should
/// be passed in. Otherwise, return null.
///
/// Note that we don't ever attempt to use memset_pattern8 or 4, because these
/// just replicate their input array and then pass on to memset_pattern16.
static Constant *getMemSetPatternValue(Value *V, const DataLayout *DL) {
  // FIXME: This could check for UndefValue because it can be merged into any
  // other valid pattern.

  // If the value isn't a constant, we can't promote it to being in a constant
  // array. We could theoretically do a store to an alloca or something, but
  // that doesn't seem worthwhile.
  Constant *C = dyn_cast<Constant>(V);
  if (!C)
    return nullptr;

  // Only handle simple values that are a power of two bytes in size.
  uint64_t Size = DL->getTypeSizeInBits(V->getType());
  if (Size == 0 || (Size & 7) || (Size & (Size - 1)))
    return nullptr;

  // Don't care enough about darwin/ppc to implement this.
  if (DL->isBigEndian())
    return nullptr;

  // Convert to size in bytes.
  Size /= 8;

  // TODO: If CI is larger than 16 bytes, we can try slicing it in half to see
  // if the top and bottom are the same (e.g. for vectors and large integers).
  if (Size > 16)
    return nullptr;

  // If the constant is exactly 16 bytes, just use it.
  if (Size == 16)
    return C;

  // Otherwise, we'll use an array of the constants.
  unsigned ArraySize = 16 / Size;
  ArrayType *AT = ArrayType::get(V->getType(), ArraySize);
  return ConstantArray::get(AT, std::vector<Constant *>(ArraySize, C));
}

LoopIdiomRecognize::LegalStoreKind
LoopIdiomRecognize::isLegalStore(StoreInst *SI) {
  // Don't touch volatile stores.
  if (SI->isVolatile())
    return LegalStoreKind::None;
  // We only want simple or unordered-atomic stores.
  if (!SI->isUnordered())
    return LegalStoreKind::None;

  // Avoid merging nontemporal stores.
  if (SI->getMetadata(LLVMContext::MD_nontemporal))
    return LegalStoreKind::None;

  Value *StoredVal = SI->getValueOperand();
  Value *StorePtr = SI->getPointerOperand();

  // Don't convert stores of non-integral pointer types to memsets (which
  // store integers).
  if (DL->isNonIntegralPointerType(StoredVal->getType()->getScalarType()))
    return LegalStoreKind::None;

  // Reject stores that are so large that they overflow an unsigned.
  // When storing out scalable vectors we bail out for now, since the code
  // below currently only works for constant strides.
  TypeSize SizeInBits = DL->getTypeSizeInBits(StoredVal->getType());
  if (SizeInBits.isScalable() || (SizeInBits.getFixedSize() & 7) ||
      (SizeInBits.getFixedSize() >> 32) != 0)
    return LegalStoreKind::None;

  // See if the pointer expression is an AddRec like {base,+,1} on the current
  // loop, which indicates a strided store. If we have something else, it's a
  // random store we can't handle.
  const SCEVAddRecExpr *StoreEv =
      dyn_cast<SCEVAddRecExpr>(SE->getSCEV(StorePtr));
  if (!StoreEv || StoreEv->getLoop() != CurLoop || !StoreEv->isAffine())
    return LegalStoreKind::None;

  // Check to see if we have a constant stride.
  if (!isa<SCEVConstant>(StoreEv->getOperand(1)))
    return LegalStoreKind::None;

  // See if the store can be turned into a memset.

  // If the stored value is a byte-wise value (like i32 -1), then it may be
  // turned into a memset of i8 -1, assuming that all the consecutive bytes
  // are stored. A store of i32 0x01020304 can never be turned into a memset,
  // but it can be turned into memset_pattern if the target supports it.
  Value *SplatValue = isBytewiseValue(StoredVal, *DL);

  // Note: memset and memset_pattern on unordered-atomic stores is not yet
  // supported.
  bool UnorderedAtomic = SI->isUnordered() && !SI->isSimple();

  // If we're allowed to form a memset, and the stored value would be
  // acceptable for memset, use it.
  if (!UnorderedAtomic && HasMemset && SplatValue && !DisableLIRP::Memset &&
      // Verify that the stored value is loop invariant. If not, we can't
      // promote the memset.
      CurLoop->isLoopInvariant(SplatValue)) {
    // It looks like we can use SplatValue.
    return LegalStoreKind::Memset;
  }
  if (!UnorderedAtomic && HasMemsetPattern && !DisableLIRP::Memset &&
      // Don't create memset_pattern16s with address spaces.
      StorePtr->getType()->getPointerAddressSpace() == 0 &&
      getMemSetPatternValue(StoredVal, DL)) {
    // It looks like we can use PatternValue!
    return LegalStoreKind::MemsetPattern;
  }

  // Otherwise, see if the store can be turned into a memcpy.
  if (HasMemcpy && !DisableLIRP::Memcpy) {
    // Check to see if the stride matches the size of the store. If so, then we
    // know that every byte is touched in the loop.
    APInt Stride = getStoreStride(StoreEv);
    unsigned StoreSize = DL->getTypeStoreSize(SI->getValueOperand()->getType());
    if (StoreSize != Stride && StoreSize != -Stride)
      return LegalStoreKind::None;

    // The store must be feeding a non-volatile load.
    LoadInst *LI = dyn_cast<LoadInst>(SI->getValueOperand());

    // Only allow non-volatile loads.
    if (!LI || LI->isVolatile())
      return LegalStoreKind::None;
    // Only allow simple or unordered-atomic loads.
    if (!LI->isUnordered())
      return LegalStoreKind::None;

    // See if the pointer expression is an AddRec like {base,+,1} on the current
    // loop, which indicates a strided load. If we have something else, it's a
    // random load we can't handle.
    const SCEVAddRecExpr *LoadEv =
        dyn_cast<SCEVAddRecExpr>(SE->getSCEV(LI->getPointerOperand()));
    if (!LoadEv || LoadEv->getLoop() != CurLoop || !LoadEv->isAffine())
      return LegalStoreKind::None;

    // The store and load must share the same stride.
    if (StoreEv->getOperand(1) != LoadEv->getOperand(1))
      return LegalStoreKind::None;

    // Success. This store can be converted into a memcpy.
    UnorderedAtomic = UnorderedAtomic || LI->isAtomic();
    return UnorderedAtomic ? LegalStoreKind::UnorderedAtomicMemcpy
                           : LegalStoreKind::Memcpy;
  }
  // This store can't be transformed into a memset/memcpy.
  return LegalStoreKind::None;
}

void LoopIdiomRecognize::collectStores(BasicBlock *BB) {
  StoreRefsForMemset.clear();
  StoreRefsForMemsetPattern.clear();
  StoreRefsForMemcpy.clear();
  for (Instruction &I : *BB) {
    StoreInst *SI = dyn_cast<StoreInst>(&I);
    if (!SI)
      continue;

    // Make sure this is a strided store with a constant stride.
    switch (isLegalStore(SI)) {
    case LegalStoreKind::None:
      // Nothing to do.
      break;
    case LegalStoreKind::Memset: {
      // Find the base pointer.
      Value *Ptr = getUnderlyingObject(SI->getPointerOperand());
      StoreRefsForMemset[Ptr].push_back(SI);
    } break;
    case LegalStoreKind::MemsetPattern: {
      // Find the base pointer.
      Value *Ptr = getUnderlyingObject(SI->getPointerOperand());
      StoreRefsForMemsetPattern[Ptr].push_back(SI);
    } break;
    case LegalStoreKind::Memcpy:
    case LegalStoreKind::UnorderedAtomicMemcpy:
      StoreRefsForMemcpy.push_back(SI);
      break;
    default:
      assert(false && "unhandled return value");
      break;
    }
  }
}

/// runOnLoopBlock - Process the specified block, which lives in a counted loop
/// with the specified backedge count. This block is known to be in the current
/// loop and not in any subloops.
bool LoopIdiomRecognize::runOnLoopBlock(
    BasicBlock *BB, const SCEV *BECount,
    SmallVectorImpl<BasicBlock *> &ExitBlocks) {
  // We can only promote stores in this block if they are unconditionally
  // executed in the loop. For a block to be unconditionally executed, it has
  // to dominate all the exit blocks of the loop. Verify this now.
  for (unsigned i = 0, e = ExitBlocks.size(); i != e; ++i)
    if (!DT->dominates(BB, ExitBlocks[i]))
      return false;

  bool MadeChange = false;
  // Look for store instructions, which may be optimized to memset/memcpy.
  collectStores(BB);

  // Look for a single store or sets of stores with a common base, which can be
  // optimized into a memset (memset_pattern). The latter most commonly happens
  // with structs and hand-unrolled loops.
  for (auto &SL : StoreRefsForMemset)
    MadeChange |= processLoopStores(SL.second, BECount, ForMemset::Yes);

  for (auto &SL : StoreRefsForMemsetPattern)
    MadeChange |= processLoopStores(SL.second, BECount, ForMemset::No);

  // Optimize the store into a memcpy, if it feeds a similarly strided load.
  for (auto &SI : StoreRefsForMemcpy)
    MadeChange |= processLoopStoreOfLoopLoad(SI, BECount);

  MadeChange |= processLoopMemIntrinsic<MemCpyInst>(
      BB, &LoopIdiomRecognize::processLoopMemCpy, BECount);
  MadeChange |= processLoopMemIntrinsic<MemSetInst>(
      BB, &LoopIdiomRecognize::processLoopMemSet, BECount);

  return MadeChange;
}

/// See if these stores can be promoted to a memset.
bool LoopIdiomRecognize::processLoopStores(SmallVectorImpl<StoreInst *> &SL,
                                           const SCEV *BECount, ForMemset For) {
  // Try to find consecutive stores that can be transformed into memsets.
  SetVector<StoreInst *> Heads, Tails;
  SmallDenseMap<StoreInst *, StoreInst *> ConsecutiveChain;

  // Do a quadratic search on all of the given stores and find
  // all of the pairs of stores that follow each other.
  SmallVector<unsigned, 16> IndexQueue;
  for (unsigned i = 0, e = SL.size(); i < e; ++i) {
    assert(SL[i]->isSimple() && "Expected only non-volatile stores.");

    Value *FirstStoredVal = SL[i]->getValueOperand();
    Value *FirstStorePtr = SL[i]->getPointerOperand();
    const SCEVAddRecExpr *FirstStoreEv =
        cast<SCEVAddRecExpr>(SE->getSCEV(FirstStorePtr));
    APInt FirstStride = getStoreStride(FirstStoreEv);
    unsigned FirstStoreSize =
        DL->getTypeStoreSize(SL[i]->getValueOperand()->getType());

    // See if we can optimize just this store in isolation.
    if (FirstStride == FirstStoreSize || -FirstStride == FirstStoreSize) {
      Heads.insert(SL[i]);
      continue;
    }

    Value *FirstSplatValue = nullptr;
    Constant *FirstPatternValue = nullptr;

    if (For == ForMemset::Yes)
      FirstSplatValue = isBytewiseValue(FirstStoredVal, *DL);
    else
      FirstPatternValue = getMemSetPatternValue(FirstStoredVal, DL);

    assert((FirstSplatValue || FirstPatternValue) &&
           "Expected either splat value or pattern value.");

    IndexQueue.clear();
    // If a store has multiple consecutive store candidates, search the stores
    // array according to the sequence: from i+1 to e, then from i-1 to 0.
    // This is because pairing with the immediately succeeding or preceding
    // candidate usually creates the best chance to find a memset opportunity.
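    // For illustration (not part of the original comment): with five candidate
    // stores S0..S4 and i == 2, the code below visits the remaining candidates
    // in the order 3, 4, 1, 0.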
    unsigned j = 0;
    for (j = i + 1; j < e; ++j)
      IndexQueue.push_back(j);
    for (j = i; j > 0; --j)
      IndexQueue.push_back(j - 1);

    for (auto &k : IndexQueue) {
      assert(SL[k]->isSimple() && "Expected only non-volatile stores.");
      Value *SecondStorePtr = SL[k]->getPointerOperand();
      const SCEVAddRecExpr *SecondStoreEv =
          cast<SCEVAddRecExpr>(SE->getSCEV(SecondStorePtr));
      APInt SecondStride = getStoreStride(SecondStoreEv);

      if (FirstStride != SecondStride)
        continue;

      Value *SecondStoredVal = SL[k]->getValueOperand();
      Value *SecondSplatValue = nullptr;
      Constant *SecondPatternValue = nullptr;

      if (For == ForMemset::Yes)
        SecondSplatValue = isBytewiseValue(SecondStoredVal, *DL);
      else
        SecondPatternValue = getMemSetPatternValue(SecondStoredVal, DL);

      assert((SecondSplatValue || SecondPatternValue) &&
             "Expected either splat value or pattern value.");

      if (isConsecutiveAccess(SL[i], SL[k], *DL, *SE, false)) {
        if (For == ForMemset::Yes) {
          if (isa<UndefValue>(FirstSplatValue))
            FirstSplatValue = SecondSplatValue;
          if (FirstSplatValue != SecondSplatValue)
            continue;
        } else {
          if (isa<UndefValue>(FirstPatternValue))
            FirstPatternValue = SecondPatternValue;
          if (FirstPatternValue != SecondPatternValue)
            continue;
        }
        Tails.insert(SL[k]);
        Heads.insert(SL[i]);
        ConsecutiveChain[SL[i]] = SL[k];
        break;
      }
    }
  }

  // We may run into multiple chains that merge into a single chain. We mark the
  // stores that we transformed so that we don't visit the same store twice.
  SmallPtrSet<Value *, 16> TransformedStores;
  bool Changed = false;

  // For stores that start but don't end a link in the chain:
  for (SetVector<StoreInst *>::iterator it = Heads.begin(), e = Heads.end();
       it != e; ++it) {
    if (Tails.count(*it))
      continue;

    // We found a store instr that starts a chain. Now follow the chain and try
    // to transform it.
    SmallPtrSet<Instruction *, 8> AdjacentStores;
    StoreInst *I = *it;

    StoreInst *HeadStore = I;
    unsigned StoreSize = 0;

    // Collect the chain into a list.
    while (Tails.count(I) || Heads.count(I)) {
      if (TransformedStores.count(I))
        break;
      AdjacentStores.insert(I);

      StoreSize += DL->getTypeStoreSize(I->getValueOperand()->getType());
      // Move to the next value in the chain.
      I = ConsecutiveChain[I];
    }

    Value *StoredVal = HeadStore->getValueOperand();
    Value *StorePtr = HeadStore->getPointerOperand();
    const SCEVAddRecExpr *StoreEv = cast<SCEVAddRecExpr>(SE->getSCEV(StorePtr));
    APInt Stride = getStoreStride(StoreEv);

    // Check to see if the stride matches the size of the stores. If so, then
    // we know that every byte is touched in the loop.
    if (StoreSize != Stride && StoreSize != -Stride)
      continue;

    bool NegStride = StoreSize == -Stride;

    if (processLoopStridedStore(StorePtr, StoreSize,
                                MaybeAlign(HeadStore->getAlignment()),
                                StoredVal, HeadStore, AdjacentStores, StoreEv,
                                BECount, NegStride)) {
      TransformedStores.insert(AdjacentStores.begin(), AdjacentStores.end());
      Changed = true;
    }
  }

  return Changed;
}

/// processLoopMemIntrinsic - Template function for calling different processor
/// functions based on mem intrinsic type.
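///
/// For example (illustrative only), runOnLoopBlock above instantiates this as:
/// \code
///   MadeChange |= processLoopMemIntrinsic<MemSetInst>(
///       BB, &LoopIdiomRecognize::processLoopMemSet, BECount);
/// \endcode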
template <typename MemInst>
bool LoopIdiomRecognize::processLoopMemIntrinsic(
    BasicBlock *BB,
    bool (LoopIdiomRecognize::*Processor)(MemInst *, const SCEV *),
    const SCEV *BECount) {
  bool MadeChange = false;
  for (BasicBlock::iterator I = BB->begin(), E = BB->end(); I != E;) {
    Instruction *Inst = &*I++;
    // Look for memory instructions, which may be optimized to a larger one.
    if (MemInst *MI = dyn_cast<MemInst>(Inst)) {
      WeakTrackingVH InstPtr(&*I);
      if (!(this->*Processor)(MI, BECount))
        continue;
      MadeChange = true;

      // If processing the instruction invalidated our iterator, start over
      // from the top of the block.
      if (!InstPtr)
        I = BB->begin();
    }
  }
  return MadeChange;
}

/// processLoopMemCpy - See if this memcpy can be promoted to a large memcpy.
bool LoopIdiomRecognize::processLoopMemCpy(MemCpyInst *MCI,
                                           const SCEV *BECount) {
  // We can only handle non-volatile memcpys with a constant size.
  if (MCI->isVolatile() || !isa<ConstantInt>(MCI->getLength()))
    return false;

  // If we're not allowed to hack on memcpy, we fail.
  if (!HasMemcpy || DisableLIRP::Memcpy)
    return false;

  Value *Dest = MCI->getDest();
  Value *Source = MCI->getSource();
  if (!Dest || !Source)
    return false;

  // See if the load and store pointer expressions are AddRec like {base,+,1} on
  // the current loop, which indicates a strided load and store. If we have
  // something else, it's a random load or store we can't handle.
  const SCEVAddRecExpr *StoreEv = dyn_cast<SCEVAddRecExpr>(SE->getSCEV(Dest));
  if (!StoreEv || StoreEv->getLoop() != CurLoop || !StoreEv->isAffine())
    return false;
  const SCEVAddRecExpr *LoadEv = dyn_cast<SCEVAddRecExpr>(SE->getSCEV(Source));
  if (!LoadEv || LoadEv->getLoop() != CurLoop || !LoadEv->isAffine())
    return false;

  // Reject memcpys that are so large that they overflow an unsigned.
  uint64_t SizeInBytes = cast<ConstantInt>(MCI->getLength())->getZExtValue();
  if ((SizeInBytes >> 32) != 0)
    return false;

  // Check if the stride matches the size of the memcpy. If so, then we know
  // that every byte is touched in the loop.
  const SCEVConstant *StoreStride =
      dyn_cast<SCEVConstant>(StoreEv->getOperand(1));
  const SCEVConstant *LoadStride =
      dyn_cast<SCEVConstant>(LoadEv->getOperand(1));
  if (!StoreStride || !LoadStride)
    return false;

  APInt StoreStrideValue = StoreStride->getAPInt();
  APInt LoadStrideValue = LoadStride->getAPInt();
  // Huge stride value - give up.
  if (StoreStrideValue.getBitWidth() > 64 || LoadStrideValue.getBitWidth() > 64)
    return false;

  if (SizeInBytes != StoreStrideValue && SizeInBytes != -StoreStrideValue) {
    ORE.emit([&]() {
      return OptimizationRemarkMissed(DEBUG_TYPE, "SizeStrideUnequal", MCI)
             << ore::NV("Inst", "memcpy") << " in "
             << ore::NV("Function", MCI->getFunction())
             << " function will not be hoisted: "
             << ore::NV("Reason", "memcpy size is not equal to stride");
    });
    return false;
  }

  int64_t StoreStrideInt = StoreStrideValue.getSExtValue();
  int64_t LoadStrideInt = LoadStrideValue.getSExtValue();
  // Check if the load stride matches the store stride.
  if (StoreStrideInt != LoadStrideInt)
    return false;

  return processLoopStoreOfLoopLoad(Dest, Source, (unsigned)SizeInBytes,
                                    MCI->getDestAlign(), MCI->getSourceAlign(),
                                    MCI, MCI, StoreEv, LoadEv, BECount);
}

/// processLoopMemSet - See if this memset can be promoted to a large memset.
bool LoopIdiomRecognize::processLoopMemSet(MemSetInst *MSI,
                                           const SCEV *BECount) {
  // We can only handle non-volatile memsets with a constant size.
  if (MSI->isVolatile() || !isa<ConstantInt>(MSI->getLength()))
    return false;

  // If we're not allowed to hack on memset, we fail.
  if (!HasMemset || DisableLIRP::Memset)
    return false;

  Value *Pointer = MSI->getDest();

  // See if the pointer expression is an AddRec like {base,+,1} on the current
  // loop, which indicates a strided store. If we have something else, it's a
  // random store we can't handle.
  const SCEVAddRecExpr *Ev = dyn_cast<SCEVAddRecExpr>(SE->getSCEV(Pointer));
  if (!Ev || Ev->getLoop() != CurLoop || !Ev->isAffine())
    return false;

  // Reject memsets that are so large that they overflow an unsigned.
  uint64_t SizeInBytes = cast<ConstantInt>(MSI->getLength())->getZExtValue();
  if ((SizeInBytes >> 32) != 0)
    return false;

  // Check to see if the stride matches the size of the memset. If so, then we
  // know that every byte is touched in the loop.
  const SCEVConstant *ConstStride = dyn_cast<SCEVConstant>(Ev->getOperand(1));
  if (!ConstStride)
    return false;

  APInt Stride = ConstStride->getAPInt();
  if (SizeInBytes != Stride && SizeInBytes != -Stride)
    return false;

  // Verify that the memset value is loop invariant. If not, we can't promote
  // the memset.
  Value *SplatValue = MSI->getValue();
  if (!SplatValue || !CurLoop->isLoopInvariant(SplatValue))
    return false;

  SmallPtrSet<Instruction *, 1> MSIs;
  MSIs.insert(MSI);
  bool NegStride = SizeInBytes == -Stride;
  return processLoopStridedStore(
      Pointer, (unsigned)SizeInBytes, MaybeAlign(MSI->getDestAlignment()),
      SplatValue, MSI, MSIs, Ev, BECount, NegStride, /*IsLoopMemset=*/true);
}

/// mayLoopAccessLocation - Return true if the specified loop might access the
/// specified pointer location, which is a loop-strided access. The 'Access'
/// argument specifies what the verboten forms of access are (read or write).
static bool
mayLoopAccessLocation(Value *Ptr, ModRefInfo Access, Loop *L,
                      const SCEV *BECount, unsigned StoreSize,
                      AliasAnalysis &AA,
                      SmallPtrSetImpl<Instruction *> &IgnoredStores) {
  // Get the location that may be stored across the loop. Since the access is
  // strided positively through memory, we say that the modified location starts
  // at the pointer and has infinite size.
  LocationSize AccessSize = LocationSize::afterPointer();

  // If the loop iterates a fixed number of times, we can refine the access size
  // to be exactly the size of the memset, which is (BECount+1)*StoreSize.
  if (const SCEVConstant *BECst = dyn_cast<SCEVConstant>(BECount))
    AccessSize = LocationSize::precise((BECst->getValue()->getZExtValue() + 1) *
                                       StoreSize);

  // TODO: For this to be really effective, we have to dive into the pointer
  // operand in the store. A store to &A[i] of 100 will always return may-alias
  // with a store to &A[100]; we need StoreLoc to be "A" with a size of 100,
  // which will then no-alias a store to &A[100].
  MemoryLocation StoreLoc(Ptr, AccessSize);

  for (Loop::block_iterator BI = L->block_begin(), E = L->block_end(); BI != E;
       ++BI)
    for (Instruction &I : **BI)
      if (IgnoredStores.count(&I) == 0 &&
          isModOrRefSet(
              intersectModRef(AA.getModRefInfo(&I, StoreLoc), Access)))
        return true;

  return false;
}

// If we have a negative stride, Start refers to the end of the memory location
// we're trying to memset. Therefore, we need to recompute the base pointer,
// which is just Start - BECount*Size.
static const SCEV *getStartForNegStride(const SCEV *Start, const SCEV *BECount,
                                        Type *IntPtr, unsigned StoreSize,
                                        ScalarEvolution *SE) {
  const SCEV *Index = SE->getTruncateOrZeroExtend(BECount, IntPtr);
  if (StoreSize != 1)
    Index = SE->getMulExpr(Index, SE->getConstant(IntPtr, StoreSize),
                           SCEV::FlagNUW);
  return SE->getMinusSCEV(Start, Index);
}

/// Compute the number of bytes as a SCEV from the backedge taken count.
///
/// This also maps the SCEV into the provided type and tries to handle the
/// computation in a way that will fold cleanly.
static const SCEV *getNumBytes(const SCEV *BECount, Type *IntPtr,
                               unsigned StoreSize, Loop *CurLoop,
                               const DataLayout *DL, ScalarEvolution *SE) {
  const SCEV *NumBytesS;
  // The # stored bytes is (BECount+1)*Size. Expand the trip count out to
  // pointer size if it isn't already.
  //
  // If we're going to need to zero extend the BE count, check if we can add
  // one to it prior to zero extending without overflow. Provided this is safe,
  // it allows better simplification of the +1.
  if (DL->getTypeSizeInBits(BECount->getType()).getFixedSize() <
          DL->getTypeSizeInBits(IntPtr).getFixedSize() &&
      SE->isLoopEntryGuardedByCond(
          CurLoop, ICmpInst::ICMP_NE, BECount,
          SE->getNegativeSCEV(SE->getOne(BECount->getType())))) {
    NumBytesS = SE->getZeroExtendExpr(
        SE->getAddExpr(BECount, SE->getOne(BECount->getType()), SCEV::FlagNUW),
        IntPtr);
  } else {
    NumBytesS = SE->getAddExpr(SE->getTruncateOrZeroExtend(BECount, IntPtr),
                               SE->getOne(IntPtr), SCEV::FlagNUW);
  }

  // And scale it based on the store size.
  if (StoreSize != 1) {
    NumBytesS = SE->getMulExpr(NumBytesS, SE->getConstant(IntPtr, StoreSize),
                               SCEV::FlagNUW);
  }
  return NumBytesS;
}

/// processLoopStridedStore - We see a strided store of some value. If we can
/// transform this into a memset or memset_pattern in the loop preheader, do so.
bool LoopIdiomRecognize::processLoopStridedStore(
    Value *DestPtr, unsigned StoreSize, MaybeAlign StoreAlignment,
    Value *StoredVal, Instruction *TheStore,
    SmallPtrSetImpl<Instruction *> &Stores, const SCEVAddRecExpr *Ev,
    const SCEV *BECount, bool NegStride, bool IsLoopMemset) {
  Value *SplatValue = isBytewiseValue(StoredVal, *DL);
  Constant *PatternValue = nullptr;

  if (!SplatValue)
    PatternValue = getMemSetPatternValue(StoredVal, DL);

  assert((SplatValue || PatternValue) &&
         "Expected either splat value or pattern value.");

  // The trip count of the loop and the base pointer of the addrec SCEV is
  // guaranteed to be loop invariant, which means that it should dominate the
  // header. This allows us to insert code for it in the preheader.
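  //
  // For example (an illustrative sketch; the actual IR is produced by the
  // SCEVExpander/IRBuilder calls below, and the byte count is scaled by the
  // store size):
  //
  //   for (i = 0; i != n; ++i)
  //     p[i] = 0;
  // =>
  //   memset(p, 0, n);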
  unsigned DestAS = DestPtr->getType()->getPointerAddressSpace();
  BasicBlock *Preheader = CurLoop->getLoopPreheader();
  IRBuilder<> Builder(Preheader->getTerminator());
  SCEVExpander Expander(*SE, *DL, "loop-idiom");
  SCEVExpanderCleaner ExpCleaner(Expander, *DT);

  Type *DestInt8PtrTy = Builder.getInt8PtrTy(DestAS);
  Type *IntIdxTy = DL->getIndexType(DestPtr->getType());

  bool Changed = false;
  const SCEV *Start = Ev->getStart();
  // Handle negative strided loops.
  if (NegStride)
    Start = getStartForNegStride(Start, BECount, IntIdxTy, StoreSize, SE);

  // TODO: ideally we should still be able to generate memset if SCEV expander
  // is taught to generate the dependencies at the latest point.
  if (!isSafeToExpand(Start, *SE))
    return Changed;

  // Okay, we have a strided store "p[i]" of a splattable value. We can turn
  // this into a memset in the loop preheader now if we want. However, this
  // would be unsafe to do if there is anything else in the loop that may read
  // or write to the aliased location. Check for any overlap by generating the
  // base pointer and checking the region.
  Value *BasePtr =
      Expander.expandCodeFor(Start, DestInt8PtrTy, Preheader->getTerminator());

  // From here on out, conservatively report to the pass manager that we've
  // changed the IR, even if we later clean up these added instructions. There
  // may be structural differences e.g. in the order of use lists not accounted
  // for in just a textual dump of the IR. This is written as a variable, even
  // though statically all the places this dominates could be replaced with
  // 'true', with the hope that anyone trying to be clever / "more precise" with
  // the return value will read this comment, and leave them alone.
  Changed = true;

  if (mayLoopAccessLocation(BasePtr, ModRefInfo::ModRef, CurLoop, BECount,
                            StoreSize, *AA, Stores))
    return Changed;

  if (avoidLIRForMultiBlockLoop(/*IsMemset=*/true, IsLoopMemset))
    return Changed;

  // Okay, everything looks good, insert the memset.

  const SCEV *NumBytesS =
      getNumBytes(BECount, IntIdxTy, StoreSize, CurLoop, DL, SE);

  // TODO: ideally we should still be able to generate memset if SCEV expander
  // is taught to generate the dependencies at the latest point.
  if (!isSafeToExpand(NumBytesS, *SE))
    return Changed;

  Value *NumBytes =
      Expander.expandCodeFor(NumBytesS, IntIdxTy, Preheader->getTerminator());

  CallInst *NewCall;
  if (SplatValue) {
    NewCall = Builder.CreateMemSet(BasePtr, SplatValue, NumBytes,
                                   MaybeAlign(StoreAlignment));
  } else {
    // Everything is emitted in the default address space.
    Type *Int8PtrTy = DestInt8PtrTy;

    Module *M = TheStore->getModule();
    StringRef FuncName = "memset_pattern16";
    FunctionCallee MSP = M->getOrInsertFunction(FuncName, Builder.getVoidTy(),
                                                Int8PtrTy, Int8PtrTy, IntIdxTy);
    inferLibFuncAttributes(M, FuncName, *TLI);

    // Otherwise we should form a memset_pattern16. PatternValue is known to be
    // a constant array of 16 bytes. Plop the value into a mergable global.
    GlobalVariable *GV = new GlobalVariable(*M, PatternValue->getType(), true,
                                            GlobalValue::PrivateLinkage,
                                            PatternValue, ".memset_pattern");
    GV->setUnnamedAddr(GlobalValue::UnnamedAddr::Global); // Ok to merge these.
    GV->setAlignment(Align(16));
    Value *PatternPtr = ConstantExpr::getBitCast(GV, Int8PtrTy);
    NewCall = Builder.CreateCall(MSP, {BasePtr, PatternPtr, NumBytes});
  }
  NewCall->setDebugLoc(TheStore->getDebugLoc());

  if (MSSAU) {
    MemoryAccess *NewMemAcc = MSSAU->createMemoryAccessInBB(
        NewCall, nullptr, NewCall->getParent(), MemorySSA::BeforeTerminator);
    MSSAU->insertDef(cast<MemoryDef>(NewMemAcc), true);
  }

  LLVM_DEBUG(dbgs() << "  Formed memset: " << *NewCall << "\n"
                    << "    from store to: " << *Ev << " at: " << *TheStore
                    << "\n");

  ORE.emit([&]() {
    return OptimizationRemark(DEBUG_TYPE, "ProcessLoopStridedStore",
                              NewCall->getDebugLoc(), Preheader)
           << "Transformed loop-strided store in "
           << ore::NV("Function", TheStore->getFunction())
           << " function into a call to "
           << ore::NV("NewFunction", NewCall->getCalledFunction())
           << "() intrinsic";
  });

  // Okay, the memset has been formed. Zap the original store and anything that
  // feeds into it.
  for (auto *I : Stores) {
    if (MSSAU)
      MSSAU->removeMemoryAccess(I, true);
    deleteDeadInstruction(I);
  }
  if (MSSAU && VerifyMemorySSA)
    MSSAU->getMemorySSA()->verifyMemorySSA();
  ++NumMemSet;
  ExpCleaner.markResultUsed();
  return true;
}

/// If the stored value is a strided load in the same loop with the same stride
/// this may be transformable into a memcpy. This kicks in for stuff like
///   for (i) A[i] = B[i];
bool LoopIdiomRecognize::processLoopStoreOfLoopLoad(StoreInst *SI,
                                                    const SCEV *BECount) {
  assert(SI->isUnordered() && "Expected only non-volatile non-ordered stores.");

  Value *StorePtr = SI->getPointerOperand();
  const SCEVAddRecExpr *StoreEv = cast<SCEVAddRecExpr>(SE->getSCEV(StorePtr));
  unsigned StoreSize = DL->getTypeStoreSize(SI->getValueOperand()->getType());

  // The store must be feeding a non-volatile load.
  LoadInst *LI = cast<LoadInst>(SI->getValueOperand());
  assert(LI->isUnordered() && "Expected only non-volatile non-ordered loads.");

  // See if the pointer expression is an AddRec like {base,+,1} on the current
  // loop, which indicates a strided load. If we have something else, it's a
  // random load we can't handle.
  Value *LoadPtr = LI->getPointerOperand();
  const SCEVAddRecExpr *LoadEv = cast<SCEVAddRecExpr>(SE->getSCEV(LoadPtr));
  return processLoopStoreOfLoopLoad(StorePtr, LoadPtr, StoreSize,
                                    SI->getAlign(), LI->getAlign(), SI, LI,
                                    StoreEv, LoadEv, BECount);
}

bool LoopIdiomRecognize::processLoopStoreOfLoopLoad(
    Value *DestPtr, Value *SourcePtr, unsigned StoreSize, MaybeAlign StoreAlign,
    MaybeAlign LoadAlign, Instruction *TheStore, Instruction *TheLoad,
    const SCEVAddRecExpr *StoreEv, const SCEVAddRecExpr *LoadEv,
    const SCEV *BECount) {
  // The trip count of the loop and the base pointer of the addrec SCEV is
  // guaranteed to be loop invariant, which means that it should dominate the
  // header. This allows us to insert code for it in the preheader.
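  //
  // For example (an illustrative sketch; the actual IR is produced by the
  // SCEVExpander/IRBuilder calls below, and the byte count is scaled by the
  // store size):
  //
  //   for (i = 0; i != n; ++i)
  //     A[i] = B[i];
  // =>
  //   memcpy(A, B, n);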
  BasicBlock *Preheader = CurLoop->getLoopPreheader();
  IRBuilder<> Builder(Preheader->getTerminator());
  SCEVExpander Expander(*SE, *DL, "loop-idiom");

  SCEVExpanderCleaner ExpCleaner(Expander, *DT);

  bool Changed = false;
  const SCEV *StrStart = StoreEv->getStart();
  unsigned StrAS = DestPtr->getType()->getPointerAddressSpace();
  Type *IntIdxTy = Builder.getIntNTy(DL->getIndexSizeInBits(StrAS));

  APInt Stride = getStoreStride(StoreEv);
  bool NegStride = StoreSize == -Stride;

  // Handle negative strided loops.
  if (NegStride)
    StrStart = getStartForNegStride(StrStart, BECount, IntIdxTy, StoreSize, SE);

  // Okay, we have a strided store "p[i]" of a loaded value. We can turn
  // this into a memcpy in the loop preheader now if we want. However, this
  // would be unsafe to do if there is anything else in the loop that may read
  // or write the memory region we're storing to. This includes the load that
  // feeds the stores. Check for an alias by generating the base address and
  // checking everything.
  Value *StoreBasePtr = Expander.expandCodeFor(
      StrStart, Builder.getInt8PtrTy(StrAS), Preheader->getTerminator());

  // From here on out, conservatively report to the pass manager that we've
  // changed the IR, even if we later clean up these added instructions. There
  // may be structural differences e.g. in the order of use lists not accounted
  // for in just a textual dump of the IR. This is written as a variable, even
  // though statically all the places this dominates could be replaced with
  // 'true', with the hope that anyone trying to be clever / "more precise" with
  // the return value will read this comment, and leave them alone.
  Changed = true;

  SmallPtrSet<Instruction *, 1> Stores;
  Stores.insert(TheStore);

  bool IsMemCpy = isa<MemCpyInst>(TheStore);
  const StringRef InstRemark = IsMemCpy ? "memcpy" : "load and store";

  if (mayLoopAccessLocation(StoreBasePtr, ModRefInfo::ModRef, CurLoop, BECount,
                            StoreSize, *AA, Stores)) {
    ORE.emit([&]() {
      return OptimizationRemarkMissed(DEBUG_TYPE, "LoopMayAccessStore",
                                      TheStore)
             << ore::NV("Inst", InstRemark) << " in "
             << ore::NV("Function", TheStore->getFunction())
             << " function will not be hoisted: "
             << ore::NV("Reason", "The loop may access store location");
    });
    return Changed;
  }

  const SCEV *LdStart = LoadEv->getStart();
  unsigned LdAS = SourcePtr->getType()->getPointerAddressSpace();

  // Handle negative strided loops.
  if (NegStride)
    LdStart = getStartForNegStride(LdStart, BECount, IntIdxTy, StoreSize, SE);

  // For a memcpy, we have to make sure that the input array is not being
  // mutated by the loop.
  Value *LoadBasePtr = Expander.expandCodeFor(
      LdStart, Builder.getInt8PtrTy(LdAS), Preheader->getTerminator());

  // If the store is a memcpy instruction, we must check if it will write to
  // the load memory locations. So remove it from the ignored stores.
  if (IsMemCpy)
    Stores.erase(TheStore);
  if (mayLoopAccessLocation(LoadBasePtr, ModRefInfo::Mod, CurLoop, BECount,
                            StoreSize, *AA, Stores)) {
    ORE.emit([&]() {
      return OptimizationRemarkMissed(DEBUG_TYPE, "LoopMayAccessLoad", TheLoad)
             << ore::NV("Inst", InstRemark) << " in "
             << ore::NV("Function", TheStore->getFunction())
             << " function will not be hoisted: "
             << ore::NV("Reason", "The loop may access load location");
    });
    return Changed;
  }

  if (avoidLIRForMultiBlockLoop())
    return Changed;

  // Okay, everything is safe, we can transform this!

  const SCEV *NumBytesS =
      getNumBytes(BECount, IntIdxTy, StoreSize, CurLoop, DL, SE);

  Value *NumBytes =
      Expander.expandCodeFor(NumBytesS, IntIdxTy, Preheader->getTerminator());

  CallInst *NewCall = nullptr;
  // Check whether to generate an unordered atomic memcpy:
  //  If the load or store are atomic, then they must necessarily be unordered
  //  by previous checks.
  if (!TheStore->isAtomic() && !TheLoad->isAtomic())
    NewCall = Builder.CreateMemCpy(StoreBasePtr, StoreAlign, LoadBasePtr,
                                   LoadAlign, NumBytes);
  else {
    // We cannot allow unaligned ops for unordered load/store, so reject
    // anything where the alignment isn't at least the element size.
    assert((StoreAlign.hasValue() && LoadAlign.hasValue()) &&
           "Expect unordered load/store to have align.");
    if (StoreAlign.getValue() < StoreSize || LoadAlign.getValue() < StoreSize)
      return Changed;

    // If the element.atomic memcpy is not lowered into explicit
    // loads/stores later, then it will be lowered into an element-size
    // specific lib call. If the lib call doesn't exist for our store size, then
    // we shouldn't generate the memcpy.
    if (StoreSize > TTI->getAtomicMemIntrinsicMaxElementSize())
      return Changed;

    // Create the call.
    // Note that unordered atomic loads/stores are *required* by the spec to
    // have an alignment but non-atomic loads/stores may not.
    NewCall = Builder.CreateElementUnorderedAtomicMemCpy(
        StoreBasePtr, StoreAlign.getValue(), LoadBasePtr, LoadAlign.getValue(),
        NumBytes, StoreSize);
  }
  NewCall->setDebugLoc(TheStore->getDebugLoc());

  if (MSSAU) {
    MemoryAccess *NewMemAcc = MSSAU->createMemoryAccessInBB(
        NewCall, nullptr, NewCall->getParent(), MemorySSA::BeforeTerminator);
    MSSAU->insertDef(cast<MemoryDef>(NewMemAcc), true);
  }

  LLVM_DEBUG(dbgs() << "  Formed memcpy: " << *NewCall << "\n"
                    << "    from load ptr=" << *LoadEv << " at: " << *TheLoad
                    << "\n"
                    << "    from store ptr=" << *StoreEv << " at: " << *TheStore
                    << "\n");

  ORE.emit([&]() {
    return OptimizationRemark(DEBUG_TYPE, "ProcessLoopStoreOfLoopLoad",
                              NewCall->getDebugLoc(), Preheader)
           << "Formed a call to "
           << ore::NV("NewFunction", NewCall->getCalledFunction())
           << "() intrinsic from " << ore::NV("Inst", InstRemark)
           << " instruction in " << ore::NV("Function", TheStore->getFunction())
           << " function";
  });

  // Okay, the memcpy has been formed. Zap the original store and anything that
  // feeds into it.
  if (MSSAU)
    MSSAU->removeMemoryAccess(TheStore, true);
  deleteDeadInstruction(TheStore);
  if (MSSAU && VerifyMemorySSA)
    MSSAU->getMemorySSA()->verifyMemorySSA();
  ++NumMemCpy;
  ExpCleaner.markResultUsed();
  return true;
}

// When compiling for code size we avoid idiom recognition for a multi-block
// loop unless it is a loop_memset idiom or a memset/memcpy idiom in a nested
// loop.
bool LoopIdiomRecognize::avoidLIRForMultiBlockLoop(bool IsMemset,
                                                   bool IsLoopMemset) {
  if (ApplyCodeSizeHeuristics && CurLoop->getNumBlocks() > 1) {
    if (CurLoop->isOutermost() && (!IsMemset || !IsLoopMemset)) {
      LLVM_DEBUG(dbgs() << " " << CurLoop->getHeader()->getParent()->getName()
                        << " : LIR " << (IsMemset ? "Memset" : "Memcpy")
                        << " avoided: multi-block top-level loop\n");
      return true;
    }
  }

  return false;
}

bool LoopIdiomRecognize::runOnNoncountableLoop() {
  LLVM_DEBUG(dbgs() << DEBUG_TYPE " Scanning: F["
                    << CurLoop->getHeader()->getParent()->getName()
                    << "] Noncountable Loop %"
                    << CurLoop->getHeader()->getName() << "\n");

  return recognizePopcount() || recognizeAndInsertFFS() ||
         recognizeShiftUntilBitTest();
}

/// Check if the given conditional branch is based on a comparison between
/// a variable and zero, and if the variable is non-zero (or zero, when
/// JmpOnZero is true) the control yields to the loop entry. If the branch
/// matches the behavior, the variable involved in the comparison is returned.
/// This function will be called to see if the precondition and postcondition
/// of the loop are in desirable form.
static Value *matchCondition(BranchInst *BI, BasicBlock *LoopEntry,
                             bool JmpOnZero = false) {
  if (!BI || !BI->isConditional())
    return nullptr;

  ICmpInst *Cond = dyn_cast<ICmpInst>(BI->getCondition());
  if (!Cond)
    return nullptr;

  ConstantInt *CmpZero = dyn_cast<ConstantInt>(Cond->getOperand(1));
  if (!CmpZero || !CmpZero->isZero())
    return nullptr;

  BasicBlock *TrueSucc = BI->getSuccessor(0);
  BasicBlock *FalseSucc = BI->getSuccessor(1);
  if (JmpOnZero)
    std::swap(TrueSucc, FalseSucc);

  ICmpInst::Predicate Pred = Cond->getPredicate();
  if ((Pred == ICmpInst::ICMP_NE && TrueSucc == LoopEntry) ||
      (Pred == ICmpInst::ICMP_EQ && FalseSucc == LoopEntry))
    return Cond->getOperand(0);

  return nullptr;
}

// Check if the recurrence variable `VarX` is in the right form to create
// the idiom. Returns the value coerced to a PHINode if so.
static PHINode *getRecurrenceVar(Value *VarX, Instruction *DefX,
                                 BasicBlock *LoopEntry) {
  auto *PhiX = dyn_cast<PHINode>(VarX);
  if (PhiX && PhiX->getParent() == LoopEntry &&
      (PhiX->getOperand(0) == DefX || PhiX->getOperand(1) == DefX))
    return PhiX;
  return nullptr;
}

/// Return true iff the idiom is detected in the loop.
///
/// Additionally:
/// 1) \p CntInst is set to the instruction counting the population bit.
/// 2) \p CntPhi is set to the corresponding phi node.
/// 3) \p Var is set to the value whose population bits are being counted.
///
/// The core idiom we are trying to detect is:
/// \code
///    if (x0 != 0)
///      goto loop-exit // the precondition of the loop
///    cnt0 = init-val;
///    do {
///       x1 = phi (x0, x2);
///       cnt1 = phi(cnt0, cnt2);
///
///       cnt2 = cnt1 + 1;
///        ...
///       x2 = x1 & (x1 - 1);
///        ...
///    } while(x != 0);
///
/// loop-exit:
/// \endcode
static bool detectPopcountIdiom(Loop *CurLoop, BasicBlock *PreCondBB,
                                Instruction *&CntInst, PHINode *&CntPhi,
                                Value *&Var) {
  // step 1: Check to see if the loop-back branch matches this pattern:
  //    "if (a!=0) goto loop-entry".
  BasicBlock *LoopEntry;
  Instruction *DefX2, *CountInst;
  Value *VarX1, *VarX0;
  PHINode *PhiX, *CountPhi;

  DefX2 = CountInst = nullptr;
  VarX1 = VarX0 = nullptr;
  PhiX = CountPhi = nullptr;
  LoopEntry = *(CurLoop->block_begin());

  // step 1: Check if the loop-back branch is in desirable form.
  {
    if (Value *T = matchCondition(
            dyn_cast<BranchInst>(LoopEntry->getTerminator()), LoopEntry))
      DefX2 = dyn_cast<Instruction>(T);
    else
      return false;
  }

  // step 2: detect instructions corresponding to "x2 = x1 & (x1 - 1)"
  {
    if (!DefX2 || DefX2->getOpcode() != Instruction::And)
      return false;

    BinaryOperator *SubOneOp;

    if ((SubOneOp = dyn_cast<BinaryOperator>(DefX2->getOperand(0))))
      VarX1 = DefX2->getOperand(1);
    else {
      VarX1 = DefX2->getOperand(0);
      SubOneOp = dyn_cast<BinaryOperator>(DefX2->getOperand(1));
    }
    if (!SubOneOp || SubOneOp->getOperand(0) != VarX1)
      return false;

    ConstantInt *Dec = dyn_cast<ConstantInt>(SubOneOp->getOperand(1));
    if (!Dec ||
        !((SubOneOp->getOpcode() == Instruction::Sub && Dec->isOne()) ||
          (SubOneOp->getOpcode() == Instruction::Add &&
           Dec->isMinusOne()))) {
      return false;
    }
  }

  // step 3: Check the recurrence of variable X
  PhiX = getRecurrenceVar(VarX1, DefX2, LoopEntry);
  if (!PhiX)
    return false;

  // step 4: Find the instruction which counts the population: cnt2 = cnt1 + 1
  {
    CountInst = nullptr;
    for (BasicBlock::iterator Iter = LoopEntry->getFirstNonPHI()->getIterator(),
                              IterE = LoopEntry->end();
         Iter != IterE; Iter++) {
      Instruction *Inst = &*Iter;
      if (Inst->getOpcode() != Instruction::Add)
        continue;

      ConstantInt *Inc = dyn_cast<ConstantInt>(Inst->getOperand(1));
      if (!Inc || !Inc->isOne())
        continue;

      PHINode *Phi = getRecurrenceVar(Inst->getOperand(0), Inst, LoopEntry);
      if (!Phi)
        continue;

      // Check if the result of the instruction is live out of the loop.
      bool LiveOutLoop = false;
      for (User *U : Inst->users()) {
        if ((cast<Instruction>(U))->getParent() != LoopEntry) {
          LiveOutLoop = true;
          break;
        }
      }

      if (LiveOutLoop) {
        CountInst = Inst;
        CountPhi = Phi;
        break;
      }
    }

    if (!CountInst)
      return false;
  }

  // step 5: check if the precondition is in this form:
  //   "if (x != 0) goto loop-head ; else goto somewhere-we-don't-care;"
  {
    auto *PreCondBr = dyn_cast<BranchInst>(PreCondBB->getTerminator());
    Value *T = matchCondition(PreCondBr, CurLoop->getLoopPreheader());
    if (T != PhiX->getOperand(0) && T != PhiX->getOperand(1))
      return false;

    CntInst = CountInst;
    CntPhi = CountPhi;
    Var = T;
  }

  return true;
}

/// Return true if the idiom is detected in the loop.
///
/// Additionally:
/// 1) \p CntInst is set to the instruction counting towards the CTLZ result,
///    or nullptr if there is no such instruction.
/// 2) \p CntPhi is set to the corresponding phi node,
///    or nullptr if there is no such phi node.
/// 3) \p Var is set to the value whose CTLZ could be used.
/// 4) \p DefX is set to the instruction calculating the loop exit condition.
///
/// The core idiom we are trying to detect is:
/// \code
///    if (x0 == 0)
///      goto loop-exit // the precondition of the loop
///    cnt0 = init-val;
///    do {
///       x = phi (x0, x.next);   // PhiX
///       cnt = phi(cnt0, cnt.next);
///
///       cnt.next = cnt + 1;
///        ...
///       x.next = x >> 1;   // DefX
///        ...
///    } while(x.next != 0);
///
/// loop-exit:
/// \endcode
static bool detectShiftUntilZeroIdiom(Loop *CurLoop, const DataLayout &DL,
                                      Intrinsic::ID &IntrinID, Value *&InitX,
                                      Instruction *&CntInst, PHINode *&CntPhi,
                                      Instruction *&DefX) {
  BasicBlock *LoopEntry;
  Value *VarX = nullptr;

  DefX = nullptr;
  CntInst = nullptr;
  CntPhi = nullptr;
  LoopEntry = *(CurLoop->block_begin());

  // step 1: Check if the loop-back branch is in desirable form.
  if (Value *T = matchCondition(
          dyn_cast<BranchInst>(LoopEntry->getTerminator()), LoopEntry))
    DefX = dyn_cast<Instruction>(T);
  else
    return false;

  // step 2: detect instructions corresponding to "x.next = x >> 1 or x << 1"
  if (!DefX || !DefX->isShift())
    return false;
  IntrinID = DefX->getOpcode() == Instruction::Shl ? Intrinsic::cttz :
                                                     Intrinsic::ctlz;
  ConstantInt *Shft = dyn_cast<ConstantInt>(DefX->getOperand(1));
  if (!Shft || !Shft->isOne())
    return false;
  VarX = DefX->getOperand(0);

  // step 3: Check the recurrence of variable X
  PHINode *PhiX = getRecurrenceVar(VarX, DefX, LoopEntry);
  if (!PhiX)
    return false;

  InitX = PhiX->getIncomingValueForBlock(CurLoop->getLoopPreheader());

  // Make sure the initial value can't be negative; otherwise the ashr in the
  // loop might never reach zero, which would make the loop infinite.
  if (DefX->getOpcode() == Instruction::AShr && !isKnownNonNegative(InitX, DL))
    return false;

  // step 4: Find the instruction which updates the count: cnt.next = cnt + 1
  //         or cnt.next = cnt + -1.
  // TODO: We can skip this step. If the loop trip count is known (CTLZ),
  //       then all uses of "cnt.next" could be optimized to the trip count
  //       plus "cnt0". Currently it is not optimized.
  //       This step could be used to detect a POPCNT instruction:
  //       cnt.next = cnt + (x.next & 1)
  for (BasicBlock::iterator Iter = LoopEntry->getFirstNonPHI()->getIterator(),
                            IterE = LoopEntry->end();
       Iter != IterE; Iter++) {
    Instruction *Inst = &*Iter;
    if (Inst->getOpcode() != Instruction::Add)
      continue;

    ConstantInt *Inc = dyn_cast<ConstantInt>(Inst->getOperand(1));
    if (!Inc || (!Inc->isOne() && !Inc->isMinusOne()))
      continue;

    PHINode *Phi = getRecurrenceVar(Inst->getOperand(0), Inst, LoopEntry);
    if (!Phi)
      continue;

    CntInst = Inst;
    CntPhi = Phi;
    break;
  }
  if (!CntInst)
    return false;

  return true;
}

/// Recognize CTLZ or CTTZ idiom in a non-countable loop and convert the loop
/// to a countable one (with a CTLZ / CTTZ based trip count). Returns true if
/// such a trip count was inserted; otherwise, returns false.
bool LoopIdiomRecognize::recognizeAndInsertFFS() {
  // Give up if the loop has multiple blocks or multiple backedges.
  if (CurLoop->getNumBackEdges() != 1 || CurLoop->getNumBlocks() != 1)
    return false;

  Intrinsic::ID IntrinID;
  Value *InitX;
  Instruction *DefX = nullptr;
  PHINode *CntPhi = nullptr;
  Instruction *CntInst = nullptr;
  // Helps decide if the transformation is profitable. For the ShiftUntilZero
  // idiom, the canonical loop is always 6 instructions.
  size_t IdiomCanonicalSize = 6;

  if (!detectShiftUntilZeroIdiom(CurLoop, *DL, IntrinID, InitX,
                                 CntInst, CntPhi, DefX))
    return false;

  bool IsCntPhiUsedOutsideLoop = false;
  for (User *U : CntPhi->users())
    if (!CurLoop->contains(cast<Instruction>(U))) {
      IsCntPhiUsedOutsideLoop = true;
      break;
    }
  bool IsCntInstUsedOutsideLoop = false;
  for (User *U : CntInst->users())
    if (!CurLoop->contains(cast<Instruction>(U))) {
      IsCntInstUsedOutsideLoop = true;
      break;
    }
  // If both CntInst and CntPhi are used outside the loop the profitability
  // is questionable.
  if (IsCntInstUsedOutsideLoop && IsCntPhiUsedOutsideLoop)
    return false;

  // For some CPUs the result of the CTLZ(X) intrinsic is undefined when X is
  // 0. If we cannot guarantee X != 0, we need to check for this case when
  // expanding the intrinsic.
  bool ZeroCheck = false;
  // It is safe to assume the Preheader exists, as it was checked in the
  // parent function runOnLoop.
  BasicBlock *PH = CurLoop->getLoopPreheader();

  // If we are using the count instruction outside the loop, make sure we
  // have a zero check as a precondition. Without the check the loop would run
  // one iteration before any check of the input value. This means inputs of
  // 0 and 1 would behave identically in the original loop, and thus the
  // rewritten count would not match the original for an input of 0 unless a
  // preceding zero check guarantees that case never reaches the loop.
  if (!IsCntPhiUsedOutsideLoop) {
    auto *PreCondBB = PH->getSinglePredecessor();
    if (!PreCondBB)
      return false;
    auto *PreCondBI = dyn_cast<BranchInst>(PreCondBB->getTerminator());
    if (!PreCondBI)
      return false;
    if (matchCondition(PreCondBI, PH) != InitX)
      return false;
    ZeroCheck = true;
  }

  // Check if the CTLZ / CTTZ intrinsic is profitable. Assume it is always
  // profitable if we delete the loop.
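
  // Illustrative summary of the gate below: the transformation proceeds if
  // the loop header is exactly the 6-instruction canonical idiom listed next,
  // or if TTI reports the ctlz/cttz intrinsic as costing no more than a basic
  // instruction (TCC_Basic).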

  // the loop has only 6 instructions:
  //  %n.addr.0 = phi [ %n, %entry ], [ %shr, %while.cond ]
  //  %i.0 = phi [ %i0, %entry ], [ %inc, %while.cond ]
  //  %shr = ashr %n.addr.0, 1
  //  %tobool = icmp eq %shr, 0
  //  %inc = add nsw %i.0, 1
  //  br i1 %tobool

  const Value *Args[] = {InitX,
                         ConstantInt::getBool(InitX->getContext(), ZeroCheck)};

  // @llvm.dbg.* intrinsics don't count, as they have no semantic effect.
  auto InstWithoutDebugIt = CurLoop->getHeader()->instructionsWithoutDebug();
  uint32_t HeaderSize =
      std::distance(InstWithoutDebugIt.begin(), InstWithoutDebugIt.end());

  IntrinsicCostAttributes Attrs(IntrinID, InitX->getType(), Args);
  InstructionCost Cost = TTI->getIntrinsicInstrCost(
      Attrs, TargetTransformInfo::TCK_SizeAndLatency);
  if (HeaderSize != IdiomCanonicalSize &&
      Cost > TargetTransformInfo::TCC_Basic)
    return false;

  transformLoopToCountable(IntrinID, PH, CntInst, CntPhi, InitX, DefX,
                           DefX->getDebugLoc(), ZeroCheck,
                           IsCntPhiUsedOutsideLoop);
  return true;
}

/// Recognizes a population count idiom in a non-countable loop.
///
/// If detected, transforms the relevant code to issue the popcount intrinsic
/// function call, and returns true; otherwise, returns false.
bool LoopIdiomRecognize::recognizePopcount() {
  if (TTI->getPopcntSupport(32) != TargetTransformInfo::PSK_FastHardware)
    return false;

  // Counting population is usually done with only a few arithmetic
  // instructions. Such instructions can be easily "absorbed" by vacant slots
  // in a non-compact loop. Therefore, recognizing the popcount idiom only
  // makes sense in a compact loop.

  // Give up if the loop has multiple blocks or multiple backedges.
  if (CurLoop->getNumBackEdges() != 1 || CurLoop->getNumBlocks() != 1)
    return false;

  BasicBlock *LoopBody = *(CurLoop->block_begin());
  if (LoopBody->size() >= 20) {
    // The loop is too big, bail out.
    return false;
  }

  // It should have a preheader containing nothing but an unconditional branch.
  BasicBlock *PH = CurLoop->getLoopPreheader();
  if (!PH || &PH->front() != PH->getTerminator())
    return false;
  auto *EntryBI = dyn_cast<BranchInst>(PH->getTerminator());
  if (!EntryBI || EntryBI->isConditional())
    return false;

  // It should have a precondition block where the generated popcount intrinsic
  // function can be inserted.
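  // Roughly, the CFG shape expected from here on is:
  //   PreCondBB:  br i1 (%x != 0), label %preheader, label %exit
  //   preheader:  br label %loop
  //   loop:       ... the popcount idiom ...
  // (block names are illustrative); the precondition block is the single
  // predecessor of the preheader and must end in a conditional branch.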
  auto *PreCondBB = PH->getSinglePredecessor();
  if (!PreCondBB)
    return false;
  auto *PreCondBI = dyn_cast<BranchInst>(PreCondBB->getTerminator());
  if (!PreCondBI || PreCondBI->isUnconditional())
    return false;

  Instruction *CntInst;
  PHINode *CntPhi;
  Value *Val;
  if (!detectPopcountIdiom(CurLoop, PreCondBB, CntInst, CntPhi, Val))
    return false;

  transformLoopToPopcount(PreCondBB, CntInst, CntPhi, Val);
  return true;
}

static CallInst *createPopcntIntrinsic(IRBuilder<> &IRBuilder, Value *Val,
                                       const DebugLoc &DL) {
  Value *Ops[] = {Val};
  Type *Tys[] = {Val->getType()};

  Module *M = IRBuilder.GetInsertBlock()->getParent()->getParent();
  Function *Func = Intrinsic::getDeclaration(M, Intrinsic::ctpop, Tys);
  CallInst *CI = IRBuilder.CreateCall(Func, Ops);
  CI->setDebugLoc(DL);

  return CI;
}

static CallInst *createFFSIntrinsic(IRBuilder<> &IRBuilder, Value *Val,
                                    const DebugLoc &DL, bool ZeroCheck,
                                    Intrinsic::ID IID) {
  Value *Ops[] = {Val, IRBuilder.getInt1(ZeroCheck)};
  Type *Tys[] = {Val->getType()};

  Module *M = IRBuilder.GetInsertBlock()->getParent()->getParent();
  Function *Func = Intrinsic::getDeclaration(M, IID, Tys);
  CallInst *CI = IRBuilder.CreateCall(Func, Ops);
  CI->setDebugLoc(DL);

  return CI;
}

/// Transform the following loop (Using CTLZ, CTTZ is similar):
/// loop:
///   CntPhi = PHI [Cnt0, CntInst]
///   PhiX = PHI [InitX, DefX]
///   CntInst = CntPhi + 1
///   DefX = PhiX >> 1
///   LOOP_BODY
///   Br: loop if (DefX != 0)
/// Use(CntPhi) or Use(CntInst)
///
/// Into:
/// If CntPhi used outside the loop:
///   CountPrev = BitWidth(InitX) - CTLZ(InitX >> 1)
///   Count = CountPrev + 1
/// else
///   Count = BitWidth(InitX) - CTLZ(InitX)
/// loop:
///   CntPhi = PHI [Cnt0, CntInst]
///   PhiX = PHI [InitX, DefX]
///   PhiCount = PHI [Count, Dec]
///   CntInst = CntPhi + 1
///   DefX = PhiX >> 1
///   Dec = PhiCount - 1
///   LOOP_BODY
///   Br: loop if (Dec != 0)
/// Use(CountPrev + Cnt0) // Use(CntPhi)
/// or
/// Use(Count + Cnt0) // Use(CntInst)
///
/// If LOOP_BODY is empty the loop will be deleted.
/// If CntInst and DefX are not used in LOOP_BODY they will be removed.
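/// (Informally, and only as a sketch: for a source loop like
///   `do { cnt++; x >>= 1; } while (x != 0);`
/// the final count becomes `cnt0 + bitwidth(x) - ctlz(x)` computed before the
/// loop, and any remaining loop body is driven by the explicit countdown
/// introduced below, which is what makes the loop countable.)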
void LoopIdiomRecognize::transformLoopToCountable(
    Intrinsic::ID IntrinID, BasicBlock *Preheader, Instruction *CntInst,
    PHINode *CntPhi, Value *InitX, Instruction *DefX, const DebugLoc &DL,
    bool ZeroCheck, bool IsCntPhiUsedOutsideLoop) {
  BranchInst *PreheaderBr = cast<BranchInst>(Preheader->getTerminator());

  // Step 1: Insert the CTLZ/CTTZ instruction at the end of the preheader block
  IRBuilder<> Builder(PreheaderBr);
  Builder.SetCurrentDebugLocation(DL);

  // If there are no uses of CntPhi, create:
  //   Count = BitWidth - CTLZ(InitX);
  //   NewCount = Count;
  // If there are uses of CntPhi, create:
  //   NewCount = BitWidth - CTLZ(InitX >> 1);
  //   Count = NewCount + 1;
  Value *InitXNext;
  if (IsCntPhiUsedOutsideLoop) {
    if (DefX->getOpcode() == Instruction::AShr)
      InitXNext = Builder.CreateAShr(InitX, 1);
    else if (DefX->getOpcode() == Instruction::LShr)
      InitXNext = Builder.CreateLShr(InitX, 1);
    else if (DefX->getOpcode() == Instruction::Shl) // cttz
      InitXNext = Builder.CreateShl(InitX, 1);
    else
      llvm_unreachable("Unexpected opcode!");
  } else
    InitXNext = InitX;
  Value *Count =
      createFFSIntrinsic(Builder, InitXNext, DL, ZeroCheck, IntrinID);
  Type *CountTy = Count->getType();
  Count = Builder.CreateSub(
      ConstantInt::get(CountTy, CountTy->getIntegerBitWidth()), Count);
  Value *NewCount = Count;
  if (IsCntPhiUsedOutsideLoop)
    Count = Builder.CreateAdd(Count, ConstantInt::get(CountTy, 1));

  NewCount = Builder.CreateZExtOrTrunc(NewCount, CntInst->getType());

  Value *CntInitVal = CntPhi->getIncomingValueForBlock(Preheader);
  if (cast<ConstantInt>(CntInst->getOperand(1))->isOne()) {
    // If the counter was being incremented in the loop, add NewCount to the
    // counter's initial value, but only if the initial value is not zero.
    ConstantInt *InitConst = dyn_cast<ConstantInt>(CntInitVal);
    if (!InitConst || !InitConst->isZero())
      NewCount = Builder.CreateAdd(NewCount, CntInitVal);
  } else {
    // If the count was being decremented in the loop, subtract NewCount from
    // the counter's initial value.
    NewCount = Builder.CreateSub(CntInitVal, NewCount);
  }

  // Step 2: Insert new IV and loop condition:
  //   loop:
  //     ...
  //     PhiCount = PHI [Count, Dec]
  //     ...
  //     Dec = PhiCount - 1
  //     ...
  //     Br: loop if (Dec != 0)
  BasicBlock *Body = *(CurLoop->block_begin());
  auto *LbBr = cast<BranchInst>(Body->getTerminator());
  ICmpInst *LbCond = cast<ICmpInst>(LbBr->getCondition());

  PHINode *TcPhi = PHINode::Create(CountTy, 2, "tcphi", &Body->front());

  Builder.SetInsertPoint(LbCond);
  Instruction *TcDec = cast<Instruction>(Builder.CreateSub(
      TcPhi, ConstantInt::get(CountTy, 1), "tcdec", false, true));

  TcPhi->addIncoming(Count, Preheader);
  TcPhi->addIncoming(TcDec, Body);

  CmpInst::Predicate Pred =
      (LbBr->getSuccessor(0) == Body) ? CmpInst::ICMP_NE : CmpInst::ICMP_EQ;
  LbCond->setPredicate(Pred);
  LbCond->setOperand(0, TcDec);
  LbCond->setOperand(1, ConstantInt::get(CountTy, 0));

  // Step 3: All the references to the original counter outside
  //   the loop are replaced with the NewCount
  if (IsCntPhiUsedOutsideLoop)
    CntPhi->replaceUsesOutsideBlock(NewCount, Body);
  else
    CntInst->replaceUsesOutsideBlock(NewCount, Body);

  // Step 4: Forget the "non-computable" trip-count SCEV associated with the
  //   loop. The loop would otherwise not be deleted even if it becomes empty.
  SE->forgetLoop(CurLoop);
}

void LoopIdiomRecognize::transformLoopToPopcount(BasicBlock *PreCondBB,
                                                 Instruction *CntInst,
                                                 PHINode *CntPhi, Value *Var) {
  BasicBlock *PreHead = CurLoop->getLoopPreheader();
  auto *PreCondBr = cast<BranchInst>(PreCondBB->getTerminator());
  const DebugLoc &DL = CntInst->getDebugLoc();

  // Assume that before the transformation the loop looks like:
  //   if (x) // the precondition
  //     do { cnt++; x &= x - 1; } while(x);

  // Step 1: Insert the ctpop instruction at the end of the precondition block
  IRBuilder<> Builder(PreCondBr);
  Value *PopCnt, *PopCntZext, *NewCount, *TripCnt;
  {
    PopCnt = createPopcntIntrinsic(Builder, Var, DL);
    NewCount = PopCntZext =
        Builder.CreateZExtOrTrunc(PopCnt, cast<IntegerType>(CntPhi->getType()));

    if (NewCount != PopCnt)
      (cast<Instruction>(NewCount))->setDebugLoc(DL);

    // TripCnt is exactly the number of iterations the loop has
    TripCnt = NewCount;

    // If the population counter's initial value is not zero, insert Add Inst.
    Value *CntInitVal = CntPhi->getIncomingValueForBlock(PreHead);
    ConstantInt *InitConst = dyn_cast<ConstantInt>(CntInitVal);
    if (!InitConst || !InitConst->isZero()) {
      NewCount = Builder.CreateAdd(NewCount, CntInitVal);
      (cast<Instruction>(NewCount))->setDebugLoc(DL);
    }
  }

  // Step 2: Replace the precondition from "if (x == 0) goto loop-exit" with
  //   "if (NewCount == 0) goto loop-exit". Without this change, the intrinsic
  //   function would be partially dead code, and downstream passes would drag
  //   it back from the precondition block to the preheader.
  {
    ICmpInst *PreCond = cast<ICmpInst>(PreCondBr->getCondition());

    Value *Opnd0 = PopCntZext;
    Value *Opnd1 = ConstantInt::get(PopCntZext->getType(), 0);
    if (PreCond->getOperand(0) != Var)
      std::swap(Opnd0, Opnd1);

    ICmpInst *NewPreCond = cast<ICmpInst>(
        Builder.CreateICmp(PreCond->getPredicate(), Opnd0, Opnd1));
    PreCondBr->setCondition(NewPreCond);

    RecursivelyDeleteTriviallyDeadInstructions(PreCond, TLI);
  }

  // Step 3: Note that the population count is exactly the trip count of the
  //   loop in question, which enables us to convert the loop from a
  //   noncountable loop into a countable one. The benefit is twofold:
  //
  //  - If the loop only counts population, the entire loop becomes dead after
  //    the transformation. It is a lot easier to prove a countable loop dead
  //    than to prove a noncountable one. (In some C dialects, an infinite loop
  //    isn't dead even if it computes nothing useful. In general, DCE needs
  //    to prove a noncountable loop finite before safely deleting it.)
  //
  //  - If the loop also performs something else, it remains alive.
  //    Since it is transformed to a countable form, it can be aggressively
  //    optimized by some optimizations which are in general not applicable
  //    to a noncountable loop.
  //
  // After this step, this loop (conceptually) would look like the following:
  //   newcnt = __builtin_ctpop(x);
  //   t = newcnt;
  //   if (x)
  //     do { cnt++; x &= x - 1; t--; } while (t > 0);
  BasicBlock *Body = *(CurLoop->block_begin());
  {
    auto *LbBr = cast<BranchInst>(Body->getTerminator());
    ICmpInst *LbCond = cast<ICmpInst>(LbBr->getCondition());
    Type *Ty = TripCnt->getType();

    PHINode *TcPhi = PHINode::Create(Ty, 2, "tcphi", &Body->front());

    Builder.SetInsertPoint(LbCond);
    Instruction *TcDec = cast<Instruction>(
        Builder.CreateSub(TcPhi, ConstantInt::get(Ty, 1),
                          "tcdec", false, true));

    TcPhi->addIncoming(TripCnt, PreHead);
    TcPhi->addIncoming(TcDec, Body);

    CmpInst::Predicate Pred =
        (LbBr->getSuccessor(0) == Body) ? CmpInst::ICMP_UGT : CmpInst::ICMP_SLE;
    LbCond->setPredicate(Pred);
    LbCond->setOperand(0, TcDec);
    LbCond->setOperand(1, ConstantInt::get(Ty, 0));
  }

  // Step 4: All the references to the original population counter outside
  //   the loop are replaced with the NewCount -- the value returned from
  //   __builtin_ctpop().
  CntInst->replaceUsesOutsideBlock(NewCount, Body);

  // Step 5: Forget the "non-computable" trip-count SCEV associated with the
  //   loop. The loop would otherwise not be deleted even if it becomes empty.
  SE->forgetLoop(CurLoop);
}

/// Match loop-invariant value.
template <typename SubPattern_t> struct match_LoopInvariant {
  SubPattern_t SubPattern;
  const Loop *L;

  match_LoopInvariant(const SubPattern_t &SP, const Loop *L)
      : SubPattern(SP), L(L) {}

  template <typename ITy> bool match(ITy *V) {
    return L->isLoopInvariant(V) && SubPattern.match(V);
  }
};

/// Matches if the value is loop-invariant.
template <typename Ty>
inline match_LoopInvariant<Ty> m_LoopInvariant(const Ty &M, const Loop *L) {
  return match_LoopInvariant<Ty>(M, L);
}

/// Return true if the idiom is detected in the loop.
///
/// The core idiom we are trying to detect is:
/// \code
///   entry:
///     <...>
///     %bitmask = shl i32 1, %bitpos
///     br label %loop
///
///   loop:
///     %x.curr = phi i32 [ %x, %entry ], [ %x.next, %loop ]
///     %x.curr.bitmasked = and i32 %x.curr, %bitmask
///     %x.curr.isbitunset = icmp eq i32 %x.curr.bitmasked, 0
///     %x.next = shl i32 %x.curr, 1
///     <...>
///     br i1 %x.curr.isbitunset, label %loop, label %end
///
///   end:
///     %x.curr.res = phi i32 [ %x.curr, %loop ] <...>
///     %x.next.res = phi i32 [ %x.next, %loop ] <...>
///     <...>
/// \endcode
static bool detectShiftUntilBitTestIdiom(Loop *CurLoop, Value *&BaseX,
                                         Value *&BitMask, Value *&BitPos,
                                         Value *&CurrX, Instruction *&NextX) {
  LLVM_DEBUG(dbgs() << DEBUG_TYPE
                    " Performing shift-until-bittest idiom detection.\n");

  // Give up if the loop has multiple blocks or multiple backedges.
  if (CurLoop->getNumBlocks() != 1 || CurLoop->getNumBackEdges() != 1) {
    LLVM_DEBUG(dbgs() << DEBUG_TYPE " Bad block/backedge count.\n");
    return false;
  }

  BasicBlock *LoopHeaderBB = CurLoop->getHeader();
  BasicBlock *LoopPreheaderBB = CurLoop->getLoopPreheader();
  assert(LoopPreheaderBB && "There is always a loop preheader.");

  using namespace PatternMatch;

  // Step 1: Check if the loop backedge is in desirable form.

  ICmpInst::Predicate Pred;
  Value *CmpLHS, *CmpRHS;
  BasicBlock *TrueBB, *FalseBB;
  if (!match(LoopHeaderBB->getTerminator(),
             m_Br(m_ICmp(Pred, m_Value(CmpLHS), m_Value(CmpRHS)),
                  m_BasicBlock(TrueBB), m_BasicBlock(FalseBB)))) {
    LLVM_DEBUG(dbgs() << DEBUG_TYPE " Bad backedge structure.\n");
    return false;
  }

  // Step 2: Check if the backedge's condition is in desirable form.

  auto MatchVariableBitMask = [&]() {
    return ICmpInst::isEquality(Pred) && match(CmpRHS, m_Zero()) &&
           match(CmpLHS,
                 m_c_And(m_Value(CurrX),
                         m_CombineAnd(
                             m_Value(BitMask),
                             m_LoopInvariant(m_Shl(m_One(), m_Value(BitPos)),
                                             CurLoop))));
  };
  auto MatchConstantBitMask = [&]() {
    return ICmpInst::isEquality(Pred) && match(CmpRHS, m_Zero()) &&
           match(CmpLHS, m_And(m_Value(CurrX),
                               m_CombineAnd(m_Value(BitMask), m_Power2()))) &&
           (BitPos = ConstantExpr::getExactLogBase2(cast<Constant>(BitMask)));
  };
  auto MatchDecomposableConstantBitMask = [&]() {
    APInt Mask;
    return llvm::decomposeBitTestICmp(CmpLHS, CmpRHS, Pred, CurrX, Mask) &&
           ICmpInst::isEquality(Pred) && Mask.isPowerOf2() &&
           (BitMask = ConstantInt::get(CurrX->getType(), Mask)) &&
           (BitPos = ConstantInt::get(CurrX->getType(), Mask.logBase2()));
  };

  if (!MatchVariableBitMask() && !MatchConstantBitMask() &&
      !MatchDecomposableConstantBitMask()) {
    LLVM_DEBUG(dbgs() << DEBUG_TYPE " Bad backedge comparison.\n");
    return false;
  }

  // Step 3: Check if the recurrence is in desirable form.
  auto *CurrXPN = dyn_cast<PHINode>(CurrX);
  if (!CurrXPN || CurrXPN->getParent() != LoopHeaderBB) {
    LLVM_DEBUG(dbgs() << DEBUG_TYPE " Not an expected PHI node.\n");
    return false;
  }

  BaseX = CurrXPN->getIncomingValueForBlock(LoopPreheaderBB);
  NextX =
      dyn_cast<Instruction>(CurrXPN->getIncomingValueForBlock(LoopHeaderBB));

  if (!NextX || !match(NextX, m_Shl(m_Specific(CurrX), m_One()))) {
    // FIXME: support right-shift?
    LLVM_DEBUG(dbgs() << DEBUG_TYPE " Bad recurrence.\n");
    return false;
  }

  // Step 4: Check if the backedge's destinations are in desirable form.

  assert(ICmpInst::isEquality(Pred) &&
         "Should only get equality predicates here.");

  // cmp-br is commutative, so canonicalize to a single variant.
  if (Pred != ICmpInst::Predicate::ICMP_EQ) {
    Pred = ICmpInst::getInversePredicate(Pred);
    std::swap(TrueBB, FalseBB);
  }

  // We expect to exit loop when comparison yields false,
  // so when it yields true we should branch back to loop header.
  if (TrueBB != LoopHeaderBB) {
    LLVM_DEBUG(dbgs() << DEBUG_TYPE " Bad backedge flow.\n");
    return false;
  }

  // Okay, idiom checks out.
  return true;
}

/// Look for the following loop:
/// \code
///   entry:
///     <...>
///     %bitmask = shl i32 1, %bitpos
///     br label %loop
///
///   loop:
///     %x.curr = phi i32 [ %x, %entry ], [ %x.next, %loop ]
///     %x.curr.bitmasked = and i32 %x.curr, %bitmask
///     %x.curr.isbitunset = icmp eq i32 %x.curr.bitmasked, 0
///     %x.next = shl i32 %x.curr, 1
///     <...>
///     br i1 %x.curr.isbitunset, label %loop, label %end
///
///   end:
///     %x.curr.res = phi i32 [ %x.curr, %loop ] <...>
///     %x.next.res = phi i32 [ %x.next, %loop ] <...>
///     <...>
/// \endcode
///
/// And transform it into:
/// \code
///   entry:
///     %bitmask = shl i32 1, %bitpos
///     %lowbitmask = add i32 %bitmask, -1
///     %mask = or i32 %lowbitmask, %bitmask
///     %x.masked = and i32 %x, %mask
///     %x.masked.numleadingzeros = call i32 @llvm.ctlz.i32(i32 %x.masked,
///                                                         i1 true)
///     %x.masked.numactivebits = sub i32 32, %x.masked.numleadingzeros
///     %x.masked.leadingonepos = add i32 %x.masked.numactivebits, -1
///     %backedgetakencount = sub i32 %bitpos, %x.masked.leadingonepos
///     %tripcount = add i32 %backedgetakencount, 1
///     %x.curr = shl i32 %x, %backedgetakencount
///     %x.next = shl i32 %x, %tripcount
///     br label %loop
///
///   loop:
///     %loop.iv = phi i32 [ 0, %entry ], [ %loop.iv.next, %loop ]
///     %loop.iv.next = add nuw i32 %loop.iv, 1
///     %loop.ivcheck = icmp eq i32 %loop.iv.next, %tripcount
///     <...>
///     br i1 %loop.ivcheck, label %end, label %loop
///
///   end:
///     %x.curr.res = phi i32 [ %x.curr, %loop ] <...>
///     %x.next.res = phi i32 [ %x.next, %loop ] <...>
///     <...>
/// \endcode
bool LoopIdiomRecognize::recognizeShiftUntilBitTest() {
  bool MadeChange = false;

  Value *X, *BitMask, *BitPos, *XCurr;
  Instruction *XNext;
  if (!detectShiftUntilBitTestIdiom(CurLoop, X, BitMask, BitPos, XCurr,
                                    XNext)) {
    LLVM_DEBUG(dbgs() << DEBUG_TYPE
                      " shift-until-bittest idiom detection failed.\n");
    return MadeChange;
  }
  LLVM_DEBUG(dbgs() << DEBUG_TYPE " shift-until-bittest idiom detected!\n");

  // Ok, it is the idiom we were looking for, we *could* transform this loop,
  // but is it profitable to transform?

  BasicBlock *LoopHeaderBB = CurLoop->getHeader();
  BasicBlock *LoopPreheaderBB = CurLoop->getLoopPreheader();
  assert(LoopPreheaderBB && "There is always a loop preheader.");

  BasicBlock *SuccessorBB = CurLoop->getExitBlock();
  assert(SuccessorBB && "There is only a single successor.");

  IRBuilder<> Builder(LoopPreheaderBB->getTerminator());
  Builder.SetCurrentDebugLocation(cast<Instruction>(XCurr)->getDebugLoc());

  Intrinsic::ID IntrID = Intrinsic::ctlz;
  Type *Ty = X->getType();
  unsigned Bitwidth = Ty->getScalarSizeInBits();

  TargetTransformInfo::TargetCostKind CostKind =
      TargetTransformInfo::TCK_SizeAndLatency;

  // The rewrite is considered to be unprofitable if and only if the
  // intrinsic/shift we'll use are not cheap. Note that we are okay with *just*
  // making the loop countable, even if nothing else changes.
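  // Put differently: proceed only if TTI reports both the ctlz intrinsic and
  // a shift of this type as costing no more than a basic instruction
  // (TCC_Basic); otherwise the checks below bail out.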
  IntrinsicCostAttributes Attrs(
      IntrID, Ty, {UndefValue::get(Ty), /*is_zero_undef=*/Builder.getTrue()});
  InstructionCost Cost = TTI->getIntrinsicInstrCost(Attrs, CostKind);
  if (Cost > TargetTransformInfo::TCC_Basic) {
    LLVM_DEBUG(dbgs() << DEBUG_TYPE
                      " Intrinsic is too costly, not beneficial\n");
    return MadeChange;
  }
  if (TTI->getArithmeticInstrCost(Instruction::Shl, Ty, CostKind) >
      TargetTransformInfo::TCC_Basic) {
    LLVM_DEBUG(dbgs() << DEBUG_TYPE " Shift is too costly, not beneficial\n");
    return MadeChange;
  }

  // Ok, transform appears worthwhile.
  MadeChange = true;

  // Step 1: Compute the loop trip count.

  Value *LowBitMask = Builder.CreateAdd(BitMask, Constant::getAllOnesValue(Ty),
                                        BitPos->getName() + ".lowbitmask");
  Value *Mask =
      Builder.CreateOr(LowBitMask, BitMask, BitPos->getName() + ".mask");
  Value *XMasked = Builder.CreateAnd(X, Mask, X->getName() + ".masked");
  CallInst *XMaskedNumLeadingZeros = Builder.CreateIntrinsic(
      IntrID, Ty, {XMasked, /*is_zero_undef=*/Builder.getTrue()},
      /*FMFSource=*/nullptr, XMasked->getName() + ".numleadingzeros");
  Value *XMaskedNumActiveBits = Builder.CreateSub(
      ConstantInt::get(Ty, Ty->getScalarSizeInBits()), XMaskedNumLeadingZeros,
      XMasked->getName() + ".numactivebits", /*HasNUW=*/true,
      /*HasNSW=*/Bitwidth != 2);
  Value *XMaskedLeadingOnePos =
      Builder.CreateAdd(XMaskedNumActiveBits, Constant::getAllOnesValue(Ty),
                        XMasked->getName() + ".leadingonepos", /*HasNUW=*/false,
                        /*HasNSW=*/Bitwidth > 2);

  Value *LoopBackedgeTakenCount = Builder.CreateSub(
      BitPos, XMaskedLeadingOnePos, CurLoop->getName() + ".backedgetakencount",
      /*HasNUW=*/true, /*HasNSW=*/true);
  // We know the loop's backedge-taken count, but what's the loop's trip count?
  // Note that while NUW is always safe, NSW is only safe for bitwidths != 2.
  Value *LoopTripCount =
      Builder.CreateAdd(LoopBackedgeTakenCount, ConstantInt::get(Ty, 1),
                        CurLoop->getName() + ".tripcount", /*HasNUW=*/true,
                        /*HasNSW=*/Bitwidth != 2);

  // Step 2: Compute the recurrence's final value without a loop.

  // NewX is always safe to compute, because `LoopBackedgeTakenCount`
  // will always be smaller than `bitwidth(X)`, i.e. we never get poison.
  Value *NewX = Builder.CreateShl(X, LoopBackedgeTakenCount);
  NewX->takeName(XCurr);
  if (auto *I = dyn_cast<Instruction>(NewX))
    I->copyIRFlags(XNext, /*IncludeWrapFlags=*/true);

  Value *NewXNext;
  // Rewriting XNext is more complicated, however, because `X << LoopTripCount`
  // will be poison iff `LoopTripCount == bitwidth(X)` (which will happen
  // iff `BitPos` is `bitwidth(x) - 1` and `X` is `1`). So unless we know
  // that isn't the case, we'll need to emit an alternative, safe IR.
  if (XNext->hasNoSignedWrap() || XNext->hasNoUnsignedWrap() ||
      PatternMatch::match(
          BitPos, PatternMatch::m_SpecificInt_ICMP(
                      ICmpInst::ICMP_NE, APInt(Ty->getScalarSizeInBits(),
                                               Ty->getScalarSizeInBits() - 1))))
    NewXNext = Builder.CreateShl(X, LoopTripCount);
  else {
    // Otherwise, just additionally shift by one. It's the smallest solution;
    // alternatively, we could check that NewX is INT_MIN (or that BitPos is
    // bitwidth(X) - 1) and select 0 instead.
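    // (Why this is still correct, roughly: in the problematic full-width case
    // NewX is INT_MIN, so the extra wrapping shift yields 0, which is also
    // what the original flag-less `shl` would have produced on its final
    // iteration.)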
    NewXNext = Builder.CreateShl(NewX, ConstantInt::get(Ty, 1));
  }

  NewXNext->takeName(XNext);
  if (auto *I = dyn_cast<Instruction>(NewXNext))
    I->copyIRFlags(XNext, /*IncludeWrapFlags=*/true);

  // Step 3: Adjust the successor basic block to receive the computed
  //         recurrence's final value instead of the recurrence itself.

  XCurr->replaceUsesOutsideBlock(NewX, LoopHeaderBB);
  XNext->replaceUsesOutsideBlock(NewXNext, LoopHeaderBB);

  // Step 4: Rewrite the loop into a countable form, with canonical IV.

  // The new canonical induction variable.
  Builder.SetInsertPoint(&LoopHeaderBB->front());
  auto *IV = Builder.CreatePHI(Ty, 2, CurLoop->getName() + ".iv");

  // The induction itself.
  // Note that while NUW is always safe, NSW is only safe for bitwidths != 2.
  Builder.SetInsertPoint(LoopHeaderBB->getTerminator());
  auto *IVNext =
      Builder.CreateAdd(IV, ConstantInt::get(Ty, 1), IV->getName() + ".next",
                        /*HasNUW=*/true, /*HasNSW=*/Bitwidth != 2);

  // The loop trip count check.
  auto *IVCheck = Builder.CreateICmpEQ(IVNext, LoopTripCount,
                                       CurLoop->getName() + ".ivcheck");
  Builder.CreateCondBr(IVCheck, SuccessorBB, LoopHeaderBB);
  LoopHeaderBB->getTerminator()->eraseFromParent();

  // Populate the IV PHI.
  IV->addIncoming(ConstantInt::get(Ty, 0), LoopPreheaderBB);
  IV->addIncoming(IVNext, LoopHeaderBB);

  // Step 5: Forget the "non-computable" trip-count SCEV associated with the
  //         loop. The loop would otherwise not be deleted even if it becomes
  //         empty.

  SE->forgetLoop(CurLoop);

  // Other passes will take care of actually deleting the loop if possible.

  LLVM_DEBUG(dbgs() << DEBUG_TYPE " shift-until-bittest idiom optimized!\n");

  ++NumShiftUntilBitTest;
  return MadeChange;
}