1 //===- LoopIdiomRecognize.cpp - Loop idiom recognition --------------------===//
2 //
3 // Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
4 // See https://llvm.org/LICENSE.txt for license information.
5 // SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
6 //
7 //===----------------------------------------------------------------------===//
8 //
9 // This pass implements an idiom recognizer that transforms simple loops into a
10 // non-loop form.  In cases that this kicks in, it can be a significant
11 // performance win.
12 //
13 // If compiling for code size we avoid idiom recognition if the resulting
14 // code could be larger than the code for the original loop. One way this could
15 // happen is if the loop is not removable after idiom recognition due to the
16 // presence of non-idiom instructions. The initial implementation of the
17 // heuristics applies to idioms in multi-block loops.
18 //
19 //===----------------------------------------------------------------------===//
20 //
21 // TODO List:
22 //
23 // Future loop memory idioms to recognize:
24 //   memcmp, memmove, strlen, etc.
25 // Future floating point idioms to recognize in -ffast-math mode:
26 //   fpowi
27 // Future integer operation idioms to recognize:
28 //   ctpop
29 //
30 // Beware that isel's default lowering for ctpop is highly inefficient for
31 // i64 and larger types when i64 is legal and the value has few bits set.  It
32 // would be good to enhance isel to emit a loop for ctpop in this case.
33 //
34 // This could recognize common matrix multiplies and dot product idioms and
35 // replace them with calls to BLAS (if linked in??).
36 //
37 //===----------------------------------------------------------------------===//
38 
39 #include "llvm/Transforms/Scalar/LoopIdiomRecognize.h"
40 #include "llvm/ADT/APInt.h"
41 #include "llvm/ADT/ArrayRef.h"
42 #include "llvm/ADT/DenseMap.h"
43 #include "llvm/ADT/MapVector.h"
44 #include "llvm/ADT/SetVector.h"
45 #include "llvm/ADT/SmallPtrSet.h"
46 #include "llvm/ADT/SmallVector.h"
47 #include "llvm/ADT/Statistic.h"
48 #include "llvm/ADT/StringRef.h"
49 #include "llvm/Analysis/AliasAnalysis.h"
50 #include "llvm/Analysis/CmpInstAnalysis.h"
51 #include "llvm/Analysis/LoopAccessAnalysis.h"
52 #include "llvm/Analysis/LoopInfo.h"
53 #include "llvm/Analysis/LoopPass.h"
54 #include "llvm/Analysis/MemoryLocation.h"
55 #include "llvm/Analysis/MemorySSA.h"
56 #include "llvm/Analysis/MemorySSAUpdater.h"
57 #include "llvm/Analysis/MustExecute.h"
58 #include "llvm/Analysis/OptimizationRemarkEmitter.h"
59 #include "llvm/Analysis/ScalarEvolution.h"
60 #include "llvm/Analysis/ScalarEvolutionExpressions.h"
61 #include "llvm/Analysis/TargetLibraryInfo.h"
62 #include "llvm/Analysis/TargetTransformInfo.h"
63 #include "llvm/Analysis/ValueTracking.h"
64 #include "llvm/IR/Attributes.h"
65 #include "llvm/IR/BasicBlock.h"
66 #include "llvm/IR/Constant.h"
67 #include "llvm/IR/Constants.h"
68 #include "llvm/IR/DataLayout.h"
69 #include "llvm/IR/DebugLoc.h"
70 #include "llvm/IR/DerivedTypes.h"
71 #include "llvm/IR/Dominators.h"
72 #include "llvm/IR/GlobalValue.h"
73 #include "llvm/IR/GlobalVariable.h"
74 #include "llvm/IR/IRBuilder.h"
75 #include "llvm/IR/InstrTypes.h"
76 #include "llvm/IR/Instruction.h"
77 #include "llvm/IR/Instructions.h"
78 #include "llvm/IR/IntrinsicInst.h"
79 #include "llvm/IR/Intrinsics.h"
80 #include "llvm/IR/LLVMContext.h"
81 #include "llvm/IR/Module.h"
82 #include "llvm/IR/PassManager.h"
83 #include "llvm/IR/PatternMatch.h"
84 #include "llvm/IR/Type.h"
85 #include "llvm/IR/User.h"
86 #include "llvm/IR/Value.h"
87 #include "llvm/IR/ValueHandle.h"
88 #include "llvm/InitializePasses.h"
89 #include "llvm/Pass.h"
90 #include "llvm/Support/Casting.h"
91 #include "llvm/Support/CommandLine.h"
92 #include "llvm/Support/Debug.h"
93 #include "llvm/Support/InstructionCost.h"
94 #include "llvm/Support/raw_ostream.h"
95 #include "llvm/Transforms/Scalar.h"
96 #include "llvm/Transforms/Utils/BuildLibCalls.h"
97 #include "llvm/Transforms/Utils/Local.h"
98 #include "llvm/Transforms/Utils/LoopUtils.h"
99 #include "llvm/Transforms/Utils/ScalarEvolutionExpander.h"
100 #include <algorithm>
101 #include <cassert>
102 #include <cstdint>
103 #include <utility>
104 #include <vector>
105 
106 using namespace llvm;
107 
108 #define DEBUG_TYPE "loop-idiom"
109 
110 STATISTIC(NumMemSet, "Number of memset's formed from loop stores");
111 STATISTIC(NumMemCpy, "Number of memcpy's formed from loop load+stores");
112 STATISTIC(
113     NumShiftUntilBitTest,
114     "Number of uncountable loops recognized as 'shift until bitttest' idiom");
115 
116 bool DisableLIRP::All;
117 static cl::opt<bool, true>
118     DisableLIRPAll("disable-" DEBUG_TYPE "-all",
119                    cl::desc("Options to disable Loop Idiom Recognize Pass."),
120                    cl::location(DisableLIRP::All), cl::init(false),
121                    cl::ReallyHidden);
122 
123 bool DisableLIRP::Memset;
124 static cl::opt<bool, true>
125     DisableLIRPMemset("disable-" DEBUG_TYPE "-memset",
126                       cl::desc("Proceed with loop idiom recognize pass, but do "
127                                "not convert loop(s) to memset."),
128                       cl::location(DisableLIRP::Memset), cl::init(false),
129                       cl::ReallyHidden);
130 
131 bool DisableLIRP::Memcpy;
132 static cl::opt<bool, true>
133     DisableLIRPMemcpy("disable-" DEBUG_TYPE "-memcpy",
134                       cl::desc("Proceed with loop idiom recognize pass, but do "
135                                "not convert loop(s) to memcpy."),
136                       cl::location(DisableLIRP::Memcpy), cl::init(false),
137                       cl::ReallyHidden);
138 
139 static cl::opt<bool> UseLIRCodeSizeHeurs(
140     "use-lir-code-size-heurs",
141     cl::desc("Use loop idiom recognition code size heuristics when compiling"
142              "with -Os/-Oz"),
143     cl::init(true), cl::Hidden);
144 
145 namespace {
146 
147 class LoopIdiomRecognize {
148   Loop *CurLoop = nullptr;
149   AliasAnalysis *AA;
150   DominatorTree *DT;
151   LoopInfo *LI;
152   ScalarEvolution *SE;
153   TargetLibraryInfo *TLI;
154   const TargetTransformInfo *TTI;
155   const DataLayout *DL;
156   OptimizationRemarkEmitter &ORE;
157   bool ApplyCodeSizeHeuristics;
158   std::unique_ptr<MemorySSAUpdater> MSSAU;
159 
160 public:
161   explicit LoopIdiomRecognize(AliasAnalysis *AA, DominatorTree *DT,
162                               LoopInfo *LI, ScalarEvolution *SE,
163                               TargetLibraryInfo *TLI,
164                               const TargetTransformInfo *TTI, MemorySSA *MSSA,
165                               const DataLayout *DL,
166                               OptimizationRemarkEmitter &ORE)
167       : AA(AA), DT(DT), LI(LI), SE(SE), TLI(TLI), TTI(TTI), DL(DL), ORE(ORE) {
168     if (MSSA)
169       MSSAU = std::make_unique<MemorySSAUpdater>(MSSA);
170   }
171 
172   bool runOnLoop(Loop *L);
173 
174 private:
175   using StoreList = SmallVector<StoreInst *, 8>;
176   using StoreListMap = MapVector<Value *, StoreList>;
177 
178   StoreListMap StoreRefsForMemset;
179   StoreListMap StoreRefsForMemsetPattern;
180   StoreList StoreRefsForMemcpy;
181   bool HasMemset;
182   bool HasMemsetPattern;
183   bool HasMemcpy;
184 
185   /// Return code for isLegalStore()
186   enum LegalStoreKind {
187     None = 0,
188     Memset,
189     MemsetPattern,
190     Memcpy,
191     UnorderedAtomicMemcpy,
192     DontUse // Dummy retval never to be used. Allows catching errors in retval
193             // handling.
194   };
195 
196   /// \name Countable Loop Idiom Handling
197   /// @{
198 
199   bool runOnCountableLoop();
200   bool runOnLoopBlock(BasicBlock *BB, const SCEV *BECount,
201                       SmallVectorImpl<BasicBlock *> &ExitBlocks);
202 
203   void collectStores(BasicBlock *BB);
204   LegalStoreKind isLegalStore(StoreInst *SI);
205   enum class ForMemset { No, Yes };
206   bool processLoopStores(SmallVectorImpl<StoreInst *> &SL, const SCEV *BECount,
207                          ForMemset For);
208   bool processLoopMemSet(MemSetInst *MSI, const SCEV *BECount);
209 
210   bool processLoopStridedStore(Value *DestPtr, unsigned StoreSize,
211                                MaybeAlign StoreAlignment, Value *StoredVal,
212                                Instruction *TheStore,
213                                SmallPtrSetImpl<Instruction *> &Stores,
214                                const SCEVAddRecExpr *Ev, const SCEV *BECount,
215                                bool NegStride, bool IsLoopMemset = false);
216   bool processLoopStoreOfLoopLoad(StoreInst *SI, const SCEV *BECount);
217   bool processLoopStoreOfLoopLoad(Value *DestPtr, Value *SourcePtr,
218                                   unsigned StoreSize, MaybeAlign StoreAlign,
219                                   MaybeAlign LoadAlign, Instruction *TheStore,
220                                   Instruction *TheLoad,
221                                   const SCEVAddRecExpr *StoreEv,
222                                   const SCEVAddRecExpr *LoadEv,
223                                   const SCEV *BECount);
224   bool avoidLIRForMultiBlockLoop(bool IsMemset = false,
225                                  bool IsLoopMemset = false);
226 
227   /// @}
228   /// \name Noncountable Loop Idiom Handling
229   /// @{
230 
231   bool runOnNoncountableLoop();
232 
233   bool recognizePopcount();
234   void transformLoopToPopcount(BasicBlock *PreCondBB, Instruction *CntInst,
235                                PHINode *CntPhi, Value *Var);
236   bool recognizeAndInsertFFS();  /// Find First Set: ctlz or cttz
237   void transformLoopToCountable(Intrinsic::ID IntrinID, BasicBlock *PreCondBB,
238                                 Instruction *CntInst, PHINode *CntPhi,
239                                 Value *Var, Instruction *DefX,
240                                 const DebugLoc &DL, bool ZeroCheck,
241                                 bool IsCntPhiUsedOutsideLoop);
242 
243   bool recognizeShiftUntilBitTest();
244 
245   /// @}
246 };
247 
248 class LoopIdiomRecognizeLegacyPass : public LoopPass {
249 public:
250   static char ID;
251 
252   explicit LoopIdiomRecognizeLegacyPass() : LoopPass(ID) {
253     initializeLoopIdiomRecognizeLegacyPassPass(
254         *PassRegistry::getPassRegistry());
255   }
256 
257   bool runOnLoop(Loop *L, LPPassManager &LPM) override {
258     if (DisableLIRP::All)
259       return false;
260 
261     if (skipLoop(L))
262       return false;
263 
264     AliasAnalysis *AA = &getAnalysis<AAResultsWrapperPass>().getAAResults();
265     DominatorTree *DT = &getAnalysis<DominatorTreeWrapperPass>().getDomTree();
266     LoopInfo *LI = &getAnalysis<LoopInfoWrapperPass>().getLoopInfo();
267     ScalarEvolution *SE = &getAnalysis<ScalarEvolutionWrapperPass>().getSE();
268     TargetLibraryInfo *TLI =
269         &getAnalysis<TargetLibraryInfoWrapperPass>().getTLI(
270             *L->getHeader()->getParent());
271     const TargetTransformInfo *TTI =
272         &getAnalysis<TargetTransformInfoWrapperPass>().getTTI(
273             *L->getHeader()->getParent());
274     const DataLayout *DL = &L->getHeader()->getModule()->getDataLayout();
275     auto *MSSAAnalysis = getAnalysisIfAvailable<MemorySSAWrapperPass>();
276     MemorySSA *MSSA = nullptr;
277     if (MSSAAnalysis)
278       MSSA = &MSSAAnalysis->getMSSA();
279 
280     // For the old PM, we can't use OptimizationRemarkEmitter as an analysis
281     // pass.  Function analyses need to be preserved across loop transformations
282     // but ORE cannot be preserved (see comment before the pass definition).
283     OptimizationRemarkEmitter ORE(L->getHeader()->getParent());
284 
285     LoopIdiomRecognize LIR(AA, DT, LI, SE, TLI, TTI, MSSA, DL, ORE);
286     return LIR.runOnLoop(L);
287   }
288 
289   /// This transformation requires natural loop information & requires that
290   /// loop preheaders be inserted into the CFG.
291   void getAnalysisUsage(AnalysisUsage &AU) const override {
292     AU.addRequired<TargetLibraryInfoWrapperPass>();
293     AU.addRequired<TargetTransformInfoWrapperPass>();
294     AU.addPreserved<MemorySSAWrapperPass>();
295     getLoopAnalysisUsage(AU);
296   }
297 };
298 
299 } // end anonymous namespace
300 
301 char LoopIdiomRecognizeLegacyPass::ID = 0;
302 
303 PreservedAnalyses LoopIdiomRecognizePass::run(Loop &L, LoopAnalysisManager &AM,
304                                               LoopStandardAnalysisResults &AR,
305                                               LPMUpdater &) {
306   if (DisableLIRP::All)
307     return PreservedAnalyses::all();
308 
309   const auto *DL = &L.getHeader()->getModule()->getDataLayout();
310 
311   // For the new PM, we also can't use OptimizationRemarkEmitter as an analysis
312   // pass.  Function analyses need to be preserved across loop transformations
313   // but ORE cannot be preserved (see comment before the pass definition).
314   OptimizationRemarkEmitter ORE(L.getHeader()->getParent());
315 
316   LoopIdiomRecognize LIR(&AR.AA, &AR.DT, &AR.LI, &AR.SE, &AR.TLI, &AR.TTI,
317                          AR.MSSA, DL, ORE);
318   if (!LIR.runOnLoop(&L))
319     return PreservedAnalyses::all();
320 
321   auto PA = getLoopPassPreservedAnalyses();
322   if (AR.MSSA)
323     PA.preserve<MemorySSAAnalysis>();
324   return PA;
325 }
326 
327 INITIALIZE_PASS_BEGIN(LoopIdiomRecognizeLegacyPass, "loop-idiom",
328                       "Recognize loop idioms", false, false)
329 INITIALIZE_PASS_DEPENDENCY(LoopPass)
330 INITIALIZE_PASS_DEPENDENCY(TargetLibraryInfoWrapperPass)
331 INITIALIZE_PASS_DEPENDENCY(TargetTransformInfoWrapperPass)
332 INITIALIZE_PASS_END(LoopIdiomRecognizeLegacyPass, "loop-idiom",
333                     "Recognize loop idioms", false, false)
334 
335 Pass *llvm::createLoopIdiomPass() { return new LoopIdiomRecognizeLegacyPass(); }
336 
337 static void deleteDeadInstruction(Instruction *I) {
338   I->replaceAllUsesWith(UndefValue::get(I->getType()));
339   I->eraseFromParent();
340 }
341 
342 //===----------------------------------------------------------------------===//
343 //
344 //          Implementation of LoopIdiomRecognize
345 //
346 //===----------------------------------------------------------------------===//
347 
348 bool LoopIdiomRecognize::runOnLoop(Loop *L) {
349   CurLoop = L;
350   // If the loop could not be converted to canonical form, it must have an
351   // indirectbr in it, just give up.
352   if (!L->getLoopPreheader())
353     return false;
354 
355   // Disable loop idiom recognition if the function's name is a common idiom.
356   StringRef Name = L->getHeader()->getParent()->getName();
357   if (Name == "memset" || Name == "memcpy")
358     return false;
359 
360   // Determine if code size heuristics need to be applied.
361   ApplyCodeSizeHeuristics =
362       L->getHeader()->getParent()->hasOptSize() && UseLIRCodeSizeHeurs;
363 
364   HasMemset = TLI->has(LibFunc_memset);
365   HasMemsetPattern = TLI->has(LibFunc_memset_pattern16);
366   HasMemcpy = TLI->has(LibFunc_memcpy);
367 
368   if (HasMemset || HasMemsetPattern || HasMemcpy)
369     if (SE->hasLoopInvariantBackedgeTakenCount(L))
370       return runOnCountableLoop();
371 
372   return runOnNoncountableLoop();
373 }
374 
375 bool LoopIdiomRecognize::runOnCountableLoop() {
376   const SCEV *BECount = SE->getBackedgeTakenCount(CurLoop);
377   assert(!isa<SCEVCouldNotCompute>(BECount) &&
378          "runOnCountableLoop() called on a loop without a predictable"
379          "backedge-taken count");
380 
381   // If this loop executes exactly one time, then it should be peeled, not
382   // optimized by this pass.
383   if (const SCEVConstant *BECst = dyn_cast<SCEVConstant>(BECount))
384     if (BECst->getAPInt() == 0)
385       return false;
386 
387   SmallVector<BasicBlock *, 8> ExitBlocks;
388   CurLoop->getUniqueExitBlocks(ExitBlocks);
389 
390   LLVM_DEBUG(dbgs() << DEBUG_TYPE " Scanning: F["
391                     << CurLoop->getHeader()->getParent()->getName()
392                     << "] Countable Loop %" << CurLoop->getHeader()->getName()
393                     << "\n");
394 
395   // The following transforms hoist stores/memsets into the loop pre-header.
396   // Give up if the loop has instructions that may throw.
397   SimpleLoopSafetyInfo SafetyInfo;
398   SafetyInfo.computeLoopSafetyInfo(CurLoop);
399   if (SafetyInfo.anyBlockMayThrow())
400     return false;
401 
402   bool MadeChange = false;
403 
404   // Scan all the blocks in the loop that are not in subloops.
405   for (auto *BB : CurLoop->getBlocks()) {
406     // Ignore blocks in subloops.
407     if (LI->getLoopFor(BB) != CurLoop)
408       continue;
409 
410     MadeChange |= runOnLoopBlock(BB, BECount, ExitBlocks);
411   }
412   return MadeChange;
413 }
414 
415 static APInt getStoreStride(const SCEVAddRecExpr *StoreEv) {
416   const SCEVConstant *ConstStride = cast<SCEVConstant>(StoreEv->getOperand(1));
417   return ConstStride->getAPInt();
418 }
419 
420 /// getMemSetPatternValue - If a strided store of the specified value is safe to
421 /// turn into a memset_pattern16, return a ConstantArray of 16 bytes that should
422 /// be passed in.  Otherwise, return null.
423 ///
424 /// Note that we don't ever attempt to use memset_pattern8 or 4, because these
425 /// just replicate their input array and then pass on to memset_pattern16.
426 static Constant *getMemSetPatternValue(Value *V, const DataLayout *DL) {
427   // FIXME: This could check for UndefValue because it can be merged into any
428   // other valid pattern.
429 
430   // If the value isn't a constant, we can't promote it to being in a constant
431   // array.  We could theoretically do a store to an alloca or something, but
432   // that doesn't seem worthwhile.
433   Constant *C = dyn_cast<Constant>(V);
434   if (!C)
435     return nullptr;
436 
437   // Only handle simple values that are a power of two bytes in size.
438   uint64_t Size = DL->getTypeSizeInBits(V->getType());
439   if (Size == 0 || (Size & 7) || (Size & (Size - 1)))
440     return nullptr;
441 
442   // Don't care enough about darwin/ppc to implement this.
443   if (DL->isBigEndian())
444     return nullptr;
445 
446   // Convert to size in bytes.
447   Size /= 8;
448 
449   // TODO: If CI is larger than 16-bytes, we can try slicing it in half to see
450   // if the top and bottom are the same (e.g. for vectors and large integers).
451   if (Size > 16)
452     return nullptr;
453 
454   // If the constant is exactly 16 bytes, just use it.
455   if (Size == 16)
456     return C;
457 
458   // Otherwise, we'll use an array of the constants.
459   unsigned ArraySize = 16 / Size;
460   ArrayType *AT = ArrayType::get(V->getType(), ArraySize);
461   return ConstantArray::get(AT, std::vector<Constant *>(ArraySize, C));
462 }
463 
464 LoopIdiomRecognize::LegalStoreKind
465 LoopIdiomRecognize::isLegalStore(StoreInst *SI) {
466   // Don't touch volatile stores.
467   if (SI->isVolatile())
468     return LegalStoreKind::None;
469   // We only want simple or unordered-atomic stores.
470   if (!SI->isUnordered())
471     return LegalStoreKind::None;
472 
473   // Avoid merging nontemporal stores.
474   if (SI->getMetadata(LLVMContext::MD_nontemporal))
475     return LegalStoreKind::None;
476 
477   Value *StoredVal = SI->getValueOperand();
478   Value *StorePtr = SI->getPointerOperand();
479 
480   // Don't convert stores of non-integral pointer types to memsets (which stores
481   // integers).
482   if (DL->isNonIntegralPointerType(StoredVal->getType()->getScalarType()))
483     return LegalStoreKind::None;
484 
485   // Reject stores that are so large that they overflow an unsigned.
486   // When storing out scalable vectors we bail out for now, since the code
487   // below currently only works for constant strides.
488   TypeSize SizeInBits = DL->getTypeSizeInBits(StoredVal->getType());
489   if (SizeInBits.isScalable() || (SizeInBits.getFixedSize() & 7) ||
490       (SizeInBits.getFixedSize() >> 32) != 0)
491     return LegalStoreKind::None;
492 
493   // See if the pointer expression is an AddRec like {base,+,1} on the current
494   // loop, which indicates a strided store.  If we have something else, it's a
495   // random store we can't handle.
496   const SCEVAddRecExpr *StoreEv =
497       dyn_cast<SCEVAddRecExpr>(SE->getSCEV(StorePtr));
498   if (!StoreEv || StoreEv->getLoop() != CurLoop || !StoreEv->isAffine())
499     return LegalStoreKind::None;
500 
501   // Check to see if we have a constant stride.
502   if (!isa<SCEVConstant>(StoreEv->getOperand(1)))
503     return LegalStoreKind::None;
504 
505   // See if the store can be turned into a memset.
506 
507   // If the stored value is a byte-wise value (like i32 -1), then it may be
508   // turned into a memset of i8 -1, assuming that all the consecutive bytes
509   // are stored.  A store of i32 0x01020304 can never be turned into a memset,
510   // but it can be turned into memset_pattern if the target supports it.
511   Value *SplatValue = isBytewiseValue(StoredVal, *DL);
512 
513   // Note: memset and memset_pattern on unordered-atomic is yet not supported
514   bool UnorderedAtomic = SI->isUnordered() && !SI->isSimple();
515 
516   // If we're allowed to form a memset, and the stored value would be
517   // acceptable for memset, use it.
518   if (!UnorderedAtomic && HasMemset && SplatValue && !DisableLIRP::Memset &&
519       // Verify that the stored value is loop invariant.  If not, we can't
520       // promote the memset.
521       CurLoop->isLoopInvariant(SplatValue)) {
522     // It looks like we can use SplatValue.
523     return LegalStoreKind::Memset;
524   }
525   if (!UnorderedAtomic && HasMemsetPattern && !DisableLIRP::Memset &&
526       // Don't create memset_pattern16s with address spaces.
527       StorePtr->getType()->getPointerAddressSpace() == 0 &&
528       getMemSetPatternValue(StoredVal, DL)) {
529     // It looks like we can use PatternValue!
530     return LegalStoreKind::MemsetPattern;
531   }
532 
533   // Otherwise, see if the store can be turned into a memcpy.
534   if (HasMemcpy && !DisableLIRP::Memcpy) {
535     // Check to see if the stride matches the size of the store.  If so, then we
536     // know that every byte is touched in the loop.
537     APInt Stride = getStoreStride(StoreEv);
538     unsigned StoreSize = DL->getTypeStoreSize(SI->getValueOperand()->getType());
539     if (StoreSize != Stride && StoreSize != -Stride)
540       return LegalStoreKind::None;
541 
542     // The store must be feeding a non-volatile load.
543     LoadInst *LI = dyn_cast<LoadInst>(SI->getValueOperand());
544 
545     // Only allow non-volatile loads
546     if (!LI || LI->isVolatile())
547       return LegalStoreKind::None;
548     // Only allow simple or unordered-atomic loads
549     if (!LI->isUnordered())
550       return LegalStoreKind::None;
551 
552     // See if the pointer expression is an AddRec like {base,+,1} on the current
553     // loop, which indicates a strided load.  If we have something else, it's a
554     // random load we can't handle.
555     const SCEVAddRecExpr *LoadEv =
556         dyn_cast<SCEVAddRecExpr>(SE->getSCEV(LI->getPointerOperand()));
557     if (!LoadEv || LoadEv->getLoop() != CurLoop || !LoadEv->isAffine())
558       return LegalStoreKind::None;
559 
560     // The store and load must share the same stride.
561     if (StoreEv->getOperand(1) != LoadEv->getOperand(1))
562       return LegalStoreKind::None;
563 
564     // Success.  This store can be converted into a memcpy.
565     UnorderedAtomic = UnorderedAtomic || LI->isAtomic();
566     return UnorderedAtomic ? LegalStoreKind::UnorderedAtomicMemcpy
567                            : LegalStoreKind::Memcpy;
568   }
569   // This store can't be transformed into a memset/memcpy.
570   return LegalStoreKind::None;
571 }
572 
573 void LoopIdiomRecognize::collectStores(BasicBlock *BB) {
574   StoreRefsForMemset.clear();
575   StoreRefsForMemsetPattern.clear();
576   StoreRefsForMemcpy.clear();
577   for (Instruction &I : *BB) {
578     StoreInst *SI = dyn_cast<StoreInst>(&I);
579     if (!SI)
580       continue;
581 
582     // Make sure this is a strided store with a constant stride.
583     switch (isLegalStore(SI)) {
584     case LegalStoreKind::None:
585       // Nothing to do
586       break;
587     case LegalStoreKind::Memset: {
588       // Find the base pointer.
589       Value *Ptr = getUnderlyingObject(SI->getPointerOperand());
590       StoreRefsForMemset[Ptr].push_back(SI);
591     } break;
592     case LegalStoreKind::MemsetPattern: {
593       // Find the base pointer.
594       Value *Ptr = getUnderlyingObject(SI->getPointerOperand());
595       StoreRefsForMemsetPattern[Ptr].push_back(SI);
596     } break;
597     case LegalStoreKind::Memcpy:
598     case LegalStoreKind::UnorderedAtomicMemcpy:
599       StoreRefsForMemcpy.push_back(SI);
600       break;
601     default:
602       assert(false && "unhandled return value");
603       break;
604     }
605   }
606 }
607 
608 /// runOnLoopBlock - Process the specified block, which lives in a counted loop
609 /// with the specified backedge count.  This block is known to be in the current
610 /// loop and not in any subloops.
611 bool LoopIdiomRecognize::runOnLoopBlock(
612     BasicBlock *BB, const SCEV *BECount,
613     SmallVectorImpl<BasicBlock *> &ExitBlocks) {
614   // We can only promote stores in this block if they are unconditionally
615   // executed in the loop.  For a block to be unconditionally executed, it has
616   // to dominate all the exit blocks of the loop.  Verify this now.
617   for (unsigned i = 0, e = ExitBlocks.size(); i != e; ++i)
618     if (!DT->dominates(BB, ExitBlocks[i]))
619       return false;
620 
621   bool MadeChange = false;
622   // Look for store instructions, which may be optimized to memset/memcpy.
623   collectStores(BB);
624 
625   // Look for a single store or sets of stores with a common base, which can be
626   // optimized into a memset (memset_pattern).  The latter most commonly happens
627   // with structs and handunrolled loops.
628   for (auto &SL : StoreRefsForMemset)
629     MadeChange |= processLoopStores(SL.second, BECount, ForMemset::Yes);
630 
631   for (auto &SL : StoreRefsForMemsetPattern)
632     MadeChange |= processLoopStores(SL.second, BECount, ForMemset::No);
633 
634   // Optimize the store into a memcpy, if it feeds an similarly strided load.
635   for (auto &SI : StoreRefsForMemcpy)
636     MadeChange |= processLoopStoreOfLoopLoad(SI, BECount);
637 
638   for (BasicBlock::iterator I = BB->begin(), E = BB->end(); I != E;) {
639     Instruction *Inst = &*I++;
640     // Look for memset instructions, which may be optimized to a larger memset.
641     if (MemSetInst *MSI = dyn_cast<MemSetInst>(Inst)) {
642       WeakTrackingVH InstPtr(&*I);
643       if (!processLoopMemSet(MSI, BECount))
644         continue;
645       MadeChange = true;
646 
647       // If processing the memset invalidated our iterator, start over from the
648       // top of the block.
649       if (!InstPtr)
650         I = BB->begin();
651       continue;
652     }
653   }
654 
655   return MadeChange;
656 }
657 
658 /// See if this store(s) can be promoted to a memset.
659 bool LoopIdiomRecognize::processLoopStores(SmallVectorImpl<StoreInst *> &SL,
660                                            const SCEV *BECount, ForMemset For) {
661   // Try to find consecutive stores that can be transformed into memsets.
662   SetVector<StoreInst *> Heads, Tails;
663   SmallDenseMap<StoreInst *, StoreInst *> ConsecutiveChain;
664 
665   // Do a quadratic search on all of the given stores and find
666   // all of the pairs of stores that follow each other.
667   SmallVector<unsigned, 16> IndexQueue;
668   for (unsigned i = 0, e = SL.size(); i < e; ++i) {
669     assert(SL[i]->isSimple() && "Expected only non-volatile stores.");
670 
671     Value *FirstStoredVal = SL[i]->getValueOperand();
672     Value *FirstStorePtr = SL[i]->getPointerOperand();
673     const SCEVAddRecExpr *FirstStoreEv =
674         cast<SCEVAddRecExpr>(SE->getSCEV(FirstStorePtr));
675     APInt FirstStride = getStoreStride(FirstStoreEv);
676     unsigned FirstStoreSize = DL->getTypeStoreSize(SL[i]->getValueOperand()->getType());
677 
678     // See if we can optimize just this store in isolation.
679     if (FirstStride == FirstStoreSize || -FirstStride == FirstStoreSize) {
680       Heads.insert(SL[i]);
681       continue;
682     }
683 
684     Value *FirstSplatValue = nullptr;
685     Constant *FirstPatternValue = nullptr;
686 
687     if (For == ForMemset::Yes)
688       FirstSplatValue = isBytewiseValue(FirstStoredVal, *DL);
689     else
690       FirstPatternValue = getMemSetPatternValue(FirstStoredVal, DL);
691 
692     assert((FirstSplatValue || FirstPatternValue) &&
693            "Expected either splat value or pattern value.");
694 
695     IndexQueue.clear();
696     // If a store has multiple consecutive store candidates, search Stores
697     // array according to the sequence: from i+1 to e, then from i-1 to 0.
698     // This is because usually pairing with immediate succeeding or preceding
699     // candidate create the best chance to find memset opportunity.
700     unsigned j = 0;
701     for (j = i + 1; j < e; ++j)
702       IndexQueue.push_back(j);
703     for (j = i; j > 0; --j)
704       IndexQueue.push_back(j - 1);
705 
706     for (auto &k : IndexQueue) {
707       assert(SL[k]->isSimple() && "Expected only non-volatile stores.");
708       Value *SecondStorePtr = SL[k]->getPointerOperand();
709       const SCEVAddRecExpr *SecondStoreEv =
710           cast<SCEVAddRecExpr>(SE->getSCEV(SecondStorePtr));
711       APInt SecondStride = getStoreStride(SecondStoreEv);
712 
713       if (FirstStride != SecondStride)
714         continue;
715 
716       Value *SecondStoredVal = SL[k]->getValueOperand();
717       Value *SecondSplatValue = nullptr;
718       Constant *SecondPatternValue = nullptr;
719 
720       if (For == ForMemset::Yes)
721         SecondSplatValue = isBytewiseValue(SecondStoredVal, *DL);
722       else
723         SecondPatternValue = getMemSetPatternValue(SecondStoredVal, DL);
724 
725       assert((SecondSplatValue || SecondPatternValue) &&
726              "Expected either splat value or pattern value.");
727 
728       if (isConsecutiveAccess(SL[i], SL[k], *DL, *SE, false)) {
729         if (For == ForMemset::Yes) {
730           if (isa<UndefValue>(FirstSplatValue))
731             FirstSplatValue = SecondSplatValue;
732           if (FirstSplatValue != SecondSplatValue)
733             continue;
734         } else {
735           if (isa<UndefValue>(FirstPatternValue))
736             FirstPatternValue = SecondPatternValue;
737           if (FirstPatternValue != SecondPatternValue)
738             continue;
739         }
740         Tails.insert(SL[k]);
741         Heads.insert(SL[i]);
742         ConsecutiveChain[SL[i]] = SL[k];
743         break;
744       }
745     }
746   }
747 
748   // We may run into multiple chains that merge into a single chain. We mark the
749   // stores that we transformed so that we don't visit the same store twice.
750   SmallPtrSet<Value *, 16> TransformedStores;
751   bool Changed = false;
752 
753   // For stores that start but don't end a link in the chain:
754   for (SetVector<StoreInst *>::iterator it = Heads.begin(), e = Heads.end();
755        it != e; ++it) {
756     if (Tails.count(*it))
757       continue;
758 
759     // We found a store instr that starts a chain. Now follow the chain and try
760     // to transform it.
761     SmallPtrSet<Instruction *, 8> AdjacentStores;
762     StoreInst *I = *it;
763 
764     StoreInst *HeadStore = I;
765     unsigned StoreSize = 0;
766 
767     // Collect the chain into a list.
768     while (Tails.count(I) || Heads.count(I)) {
769       if (TransformedStores.count(I))
770         break;
771       AdjacentStores.insert(I);
772 
773       StoreSize += DL->getTypeStoreSize(I->getValueOperand()->getType());
774       // Move to the next value in the chain.
775       I = ConsecutiveChain[I];
776     }
777 
778     Value *StoredVal = HeadStore->getValueOperand();
779     Value *StorePtr = HeadStore->getPointerOperand();
780     const SCEVAddRecExpr *StoreEv = cast<SCEVAddRecExpr>(SE->getSCEV(StorePtr));
781     APInt Stride = getStoreStride(StoreEv);
782 
783     // Check to see if the stride matches the size of the stores.  If so, then
784     // we know that every byte is touched in the loop.
785     if (StoreSize != Stride && StoreSize != -Stride)
786       continue;
787 
788     bool NegStride = StoreSize == -Stride;
789 
790     if (processLoopStridedStore(StorePtr, StoreSize,
791                                 MaybeAlign(HeadStore->getAlignment()),
792                                 StoredVal, HeadStore, AdjacentStores, StoreEv,
793                                 BECount, NegStride)) {
794       TransformedStores.insert(AdjacentStores.begin(), AdjacentStores.end());
795       Changed = true;
796     }
797   }
798 
799   return Changed;
800 }
801 
802 /// processLoopMemSet - See if this memset can be promoted to a large memset.
803 bool LoopIdiomRecognize::processLoopMemSet(MemSetInst *MSI,
804                                            const SCEV *BECount) {
805   // We can only handle non-volatile memsets with a constant size.
806   if (MSI->isVolatile() || !isa<ConstantInt>(MSI->getLength()))
807     return false;
808 
809   // If we're not allowed to hack on memset, we fail.
810   if (!HasMemset)
811     return false;
812 
813   Value *Pointer = MSI->getDest();
814 
815   // See if the pointer expression is an AddRec like {base,+,1} on the current
816   // loop, which indicates a strided store.  If we have something else, it's a
817   // random store we can't handle.
818   const SCEVAddRecExpr *Ev = dyn_cast<SCEVAddRecExpr>(SE->getSCEV(Pointer));
819   if (!Ev || Ev->getLoop() != CurLoop || !Ev->isAffine())
820     return false;
821 
822   // Reject memsets that are so large that they overflow an unsigned.
823   uint64_t SizeInBytes = cast<ConstantInt>(MSI->getLength())->getZExtValue();
824   if ((SizeInBytes >> 32) != 0)
825     return false;
826 
827   // Check to see if the stride matches the size of the memset.  If so, then we
828   // know that every byte is touched in the loop.
829   const SCEVConstant *ConstStride = dyn_cast<SCEVConstant>(Ev->getOperand(1));
830   if (!ConstStride)
831     return false;
832 
833   APInt Stride = ConstStride->getAPInt();
834   if (SizeInBytes != Stride && SizeInBytes != -Stride)
835     return false;
836 
837   // Verify that the memset value is loop invariant.  If not, we can't promote
838   // the memset.
839   Value *SplatValue = MSI->getValue();
840   if (!SplatValue || !CurLoop->isLoopInvariant(SplatValue))
841     return false;
842 
843   SmallPtrSet<Instruction *, 1> MSIs;
844   MSIs.insert(MSI);
845   bool NegStride = SizeInBytes == -Stride;
846   return processLoopStridedStore(
847       Pointer, (unsigned)SizeInBytes, MaybeAlign(MSI->getDestAlignment()),
848       SplatValue, MSI, MSIs, Ev, BECount, NegStride, /*IsLoopMemset=*/true);
849 }
850 
851 /// mayLoopAccessLocation - Return true if the specified loop might access the
852 /// specified pointer location, which is a loop-strided access.  The 'Access'
853 /// argument specifies what the verboten forms of access are (read or write).
854 static bool
855 mayLoopAccessLocation(Value *Ptr, ModRefInfo Access, Loop *L,
856                       const SCEV *BECount, unsigned StoreSize,
857                       AliasAnalysis &AA,
858                       SmallPtrSetImpl<Instruction *> &IgnoredStores) {
859   // Get the location that may be stored across the loop.  Since the access is
860   // strided positively through memory, we say that the modified location starts
861   // at the pointer and has infinite size.
862   LocationSize AccessSize = LocationSize::afterPointer();
863 
864   // If the loop iterates a fixed number of times, we can refine the access size
865   // to be exactly the size of the memset, which is (BECount+1)*StoreSize
866   if (const SCEVConstant *BECst = dyn_cast<SCEVConstant>(BECount))
867     AccessSize = LocationSize::precise((BECst->getValue()->getZExtValue() + 1) *
868                                        StoreSize);
869 
870   // TODO: For this to be really effective, we have to dive into the pointer
871   // operand in the store.  Store to &A[i] of 100 will always return may alias
872   // with store of &A[100], we need to StoreLoc to be "A" with size of 100,
873   // which will then no-alias a store to &A[100].
874   MemoryLocation StoreLoc(Ptr, AccessSize);
875 
876   for (Loop::block_iterator BI = L->block_begin(), E = L->block_end(); BI != E;
877        ++BI)
878     for (Instruction &I : **BI)
879       if (IgnoredStores.count(&I) == 0 &&
880           isModOrRefSet(
881               intersectModRef(AA.getModRefInfo(&I, StoreLoc), Access)))
882         return true;
883 
884   return false;
885 }
886 
887 // If we have a negative stride, Start refers to the end of the memory location
888 // we're trying to memset.  Therefore, we need to recompute the base pointer,
889 // which is just Start - BECount*Size.
890 static const SCEV *getStartForNegStride(const SCEV *Start, const SCEV *BECount,
891                                         Type *IntPtr, unsigned StoreSize,
892                                         ScalarEvolution *SE) {
893   const SCEV *Index = SE->getTruncateOrZeroExtend(BECount, IntPtr);
894   if (StoreSize != 1)
895     Index = SE->getMulExpr(Index, SE->getConstant(IntPtr, StoreSize),
896                            SCEV::FlagNUW);
897   return SE->getMinusSCEV(Start, Index);
898 }
899 
900 /// Compute the number of bytes as a SCEV from the backedge taken count.
901 ///
902 /// This also maps the SCEV into the provided type and tries to handle the
903 /// computation in a way that will fold cleanly.
904 static const SCEV *getNumBytes(const SCEV *BECount, Type *IntPtr,
905                                unsigned StoreSize, Loop *CurLoop,
906                                const DataLayout *DL, ScalarEvolution *SE) {
907   const SCEV *NumBytesS;
908   // The # stored bytes is (BECount+1)*Size.  Expand the trip count out to
909   // pointer size if it isn't already.
910   //
911   // If we're going to need to zero extend the BE count, check if we can add
912   // one to it prior to zero extending without overflow. Provided this is safe,
913   // it allows better simplification of the +1.
914   if (DL->getTypeSizeInBits(BECount->getType()).getFixedSize() <
915           DL->getTypeSizeInBits(IntPtr).getFixedSize() &&
916       SE->isLoopEntryGuardedByCond(
917           CurLoop, ICmpInst::ICMP_NE, BECount,
918           SE->getNegativeSCEV(SE->getOne(BECount->getType())))) {
919     NumBytesS = SE->getZeroExtendExpr(
920         SE->getAddExpr(BECount, SE->getOne(BECount->getType()), SCEV::FlagNUW),
921         IntPtr);
922   } else {
923     NumBytesS = SE->getAddExpr(SE->getTruncateOrZeroExtend(BECount, IntPtr),
924                                SE->getOne(IntPtr), SCEV::FlagNUW);
925   }
926 
927   // And scale it based on the store size.
928   if (StoreSize != 1) {
929     NumBytesS = SE->getMulExpr(NumBytesS, SE->getConstant(IntPtr, StoreSize),
930                                SCEV::FlagNUW);
931   }
932   return NumBytesS;
933 }
934 
935 /// processLoopStridedStore - We see a strided store of some value.  If we can
936 /// transform this into a memset or memset_pattern in the loop preheader, do so.
937 bool LoopIdiomRecognize::processLoopStridedStore(
938     Value *DestPtr, unsigned StoreSize, MaybeAlign StoreAlignment,
939     Value *StoredVal, Instruction *TheStore,
940     SmallPtrSetImpl<Instruction *> &Stores, const SCEVAddRecExpr *Ev,
941     const SCEV *BECount, bool NegStride, bool IsLoopMemset) {
942   Value *SplatValue = isBytewiseValue(StoredVal, *DL);
943   Constant *PatternValue = nullptr;
944 
945   if (!SplatValue)
946     PatternValue = getMemSetPatternValue(StoredVal, DL);
947 
948   assert((SplatValue || PatternValue) &&
949          "Expected either splat value or pattern value.");
950 
951   // The trip count of the loop and the base pointer of the addrec SCEV is
952   // guaranteed to be loop invariant, which means that it should dominate the
953   // header.  This allows us to insert code for it in the preheader.
954   unsigned DestAS = DestPtr->getType()->getPointerAddressSpace();
955   BasicBlock *Preheader = CurLoop->getLoopPreheader();
956   IRBuilder<> Builder(Preheader->getTerminator());
957   SCEVExpander Expander(*SE, *DL, "loop-idiom");
958   SCEVExpanderCleaner ExpCleaner(Expander, *DT);
959 
960   Type *DestInt8PtrTy = Builder.getInt8PtrTy(DestAS);
961   Type *IntIdxTy = DL->getIndexType(DestPtr->getType());
962 
963   bool Changed = false;
964   const SCEV *Start = Ev->getStart();
965   // Handle negative strided loops.
966   if (NegStride)
967     Start = getStartForNegStride(Start, BECount, IntIdxTy, StoreSize, SE);
968 
969   // TODO: ideally we should still be able to generate memset if SCEV expander
970   // is taught to generate the dependencies at the latest point.
971   if (!isSafeToExpand(Start, *SE))
972     return Changed;
973 
974   // Okay, we have a strided store "p[i]" of a splattable value.  We can turn
975   // this into a memset in the loop preheader now if we want.  However, this
976   // would be unsafe to do if there is anything else in the loop that may read
977   // or write to the aliased location.  Check for any overlap by generating the
978   // base pointer and checking the region.
979   Value *BasePtr =
980       Expander.expandCodeFor(Start, DestInt8PtrTy, Preheader->getTerminator());
981 
982   // From here on out, conservatively report to the pass manager that we've
983   // changed the IR, even if we later clean up these added instructions. There
984   // may be structural differences e.g. in the order of use lists not accounted
985   // for in just a textual dump of the IR. This is written as a variable, even
986   // though statically all the places this dominates could be replaced with
987   // 'true', with the hope that anyone trying to be clever / "more precise" with
988   // the return value will read this comment, and leave them alone.
989   Changed = true;
990 
991   if (mayLoopAccessLocation(BasePtr, ModRefInfo::ModRef, CurLoop, BECount,
992                             StoreSize, *AA, Stores))
993     return Changed;
994 
995   if (avoidLIRForMultiBlockLoop(/*IsMemset=*/true, IsLoopMemset))
996     return Changed;
997 
998   // Okay, everything looks good, insert the memset.
999 
1000   const SCEV *NumBytesS =
1001       getNumBytes(BECount, IntIdxTy, StoreSize, CurLoop, DL, SE);
1002 
1003   // TODO: ideally we should still be able to generate memset if SCEV expander
1004   // is taught to generate the dependencies at the latest point.
1005   if (!isSafeToExpand(NumBytesS, *SE))
1006     return Changed;
1007 
1008   Value *NumBytes =
1009       Expander.expandCodeFor(NumBytesS, IntIdxTy, Preheader->getTerminator());
1010 
1011   CallInst *NewCall;
1012   if (SplatValue) {
1013     NewCall = Builder.CreateMemSet(BasePtr, SplatValue, NumBytes,
1014                                    MaybeAlign(StoreAlignment));
1015   } else {
1016     // Everything is emitted in default address space
1017     Type *Int8PtrTy = DestInt8PtrTy;
1018 
1019     Module *M = TheStore->getModule();
1020     StringRef FuncName = "memset_pattern16";
1021     FunctionCallee MSP = M->getOrInsertFunction(FuncName, Builder.getVoidTy(),
1022                                                 Int8PtrTy, Int8PtrTy, IntIdxTy);
1023     inferLibFuncAttributes(M, FuncName, *TLI);
1024 
1025     // Otherwise we should form a memset_pattern16.  PatternValue is known to be
1026     // an constant array of 16-bytes.  Plop the value into a mergable global.
1027     GlobalVariable *GV = new GlobalVariable(*M, PatternValue->getType(), true,
1028                                             GlobalValue::PrivateLinkage,
1029                                             PatternValue, ".memset_pattern");
1030     GV->setUnnamedAddr(GlobalValue::UnnamedAddr::Global); // Ok to merge these.
1031     GV->setAlignment(Align(16));
1032     Value *PatternPtr = ConstantExpr::getBitCast(GV, Int8PtrTy);
1033     NewCall = Builder.CreateCall(MSP, {BasePtr, PatternPtr, NumBytes});
1034   }
1035   NewCall->setDebugLoc(TheStore->getDebugLoc());
1036 
1037   if (MSSAU) {
1038     MemoryAccess *NewMemAcc = MSSAU->createMemoryAccessInBB(
1039         NewCall, nullptr, NewCall->getParent(), MemorySSA::BeforeTerminator);
1040     MSSAU->insertDef(cast<MemoryDef>(NewMemAcc), true);
1041   }
1042 
1043   LLVM_DEBUG(dbgs() << "  Formed memset: " << *NewCall << "\n"
1044                     << "    from store to: " << *Ev << " at: " << *TheStore
1045                     << "\n");
1046 
1047   ORE.emit([&]() {
1048     return OptimizationRemark(DEBUG_TYPE, "ProcessLoopStridedStore",
1049                               NewCall->getDebugLoc(), Preheader)
1050            << "Transformed loop-strided store into a call to "
1051            << ore::NV("NewFunction", NewCall->getCalledFunction())
1052            << "() function";
1053   });
1054 
1055   // Okay, the memset has been formed.  Zap the original store and anything that
1056   // feeds into it.
1057   for (auto *I : Stores) {
1058     if (MSSAU)
1059       MSSAU->removeMemoryAccess(I, true);
1060     deleteDeadInstruction(I);
1061   }
1062   if (MSSAU && VerifyMemorySSA)
1063     MSSAU->getMemorySSA()->verifyMemorySSA();
1064   ++NumMemSet;
1065   ExpCleaner.markResultUsed();
1066   return true;
1067 }
1068 
1069 /// If the stored value is a strided load in the same loop with the same stride
1070 /// this may be transformable into a memcpy.  This kicks in for stuff like
1071 /// for (i) A[i] = B[i];
1072 bool LoopIdiomRecognize::processLoopStoreOfLoopLoad(StoreInst *SI,
1073                                                     const SCEV *BECount) {
1074   assert(SI->isUnordered() && "Expected only non-volatile non-ordered stores.");
1075 
1076   Value *StorePtr = SI->getPointerOperand();
1077   const SCEVAddRecExpr *StoreEv = cast<SCEVAddRecExpr>(SE->getSCEV(StorePtr));
1078   unsigned StoreSize = DL->getTypeStoreSize(SI->getValueOperand()->getType());
1079 
1080   // The store must be feeding a non-volatile load.
1081   LoadInst *LI = cast<LoadInst>(SI->getValueOperand());
1082   assert(LI->isUnordered() && "Expected only non-volatile non-ordered loads.");
1083 
1084   // See if the pointer expression is an AddRec like {base,+,1} on the current
1085   // loop, which indicates a strided load.  If we have something else, it's a
1086   // random load we can't handle.
1087   Value *LoadPtr = LI->getPointerOperand();
1088   const SCEVAddRecExpr *LoadEv = cast<SCEVAddRecExpr>(SE->getSCEV(LoadPtr));
1089   return processLoopStoreOfLoopLoad(StorePtr, LoadPtr, StoreSize,
1090                                     SI->getAlign(), LI->getAlign(), SI, LI,
1091                                     StoreEv, LoadEv, BECount);
1092 }
1093 
1094 bool LoopIdiomRecognize::processLoopStoreOfLoopLoad(
1095     Value *DestPtr, Value *SourcePtr, unsigned StoreSize, MaybeAlign StoreAlign,
1096     MaybeAlign LoadAlign, Instruction *TheStore, Instruction *TheLoad,
1097     const SCEVAddRecExpr *StoreEv, const SCEVAddRecExpr *LoadEv,
1098     const SCEV *BECount) {
1099   // The trip count of the loop and the base pointer of the addrec SCEV is
1100   // guaranteed to be loop invariant, which means that it should dominate the
1101   // header.  This allows us to insert code for it in the preheader.
1102   BasicBlock *Preheader = CurLoop->getLoopPreheader();
1103   IRBuilder<> Builder(Preheader->getTerminator());
1104   SCEVExpander Expander(*SE, *DL, "loop-idiom");
1105 
1106   SCEVExpanderCleaner ExpCleaner(Expander, *DT);
1107 
1108   bool Changed = false;
1109   const SCEV *StrStart = StoreEv->getStart();
1110   unsigned StrAS = DestPtr->getType()->getPointerAddressSpace();
1111   Type *IntIdxTy = Builder.getIntNTy(DL->getIndexSizeInBits(StrAS));
1112 
1113   APInt Stride = getStoreStride(StoreEv);
1114   bool NegStride = StoreSize == -Stride;
1115 
1116   // Handle negative strided loops.
1117   if (NegStride)
1118     StrStart = getStartForNegStride(StrStart, BECount, IntIdxTy, StoreSize, SE);
1119 
1120   // Okay, we have a strided store "p[i]" of a loaded value.  We can turn
1121   // this into a memcpy in the loop preheader now if we want.  However, this
1122   // would be unsafe to do if there is anything else in the loop that may read
1123   // or write the memory region we're storing to.  This includes the load that
1124   // feeds the stores.  Check for an alias by generating the base address and
1125   // checking everything.
1126   Value *StoreBasePtr = Expander.expandCodeFor(
1127       StrStart, Builder.getInt8PtrTy(StrAS), Preheader->getTerminator());
1128 
1129   // From here on out, conservatively report to the pass manager that we've
1130   // changed the IR, even if we later clean up these added instructions. There
1131   // may be structural differences e.g. in the order of use lists not accounted
1132   // for in just a textual dump of the IR. This is written as a variable, even
1133   // though statically all the places this dominates could be replaced with
1134   // 'true', with the hope that anyone trying to be clever / "more precise" with
1135   // the return value will read this comment, and leave them alone.
1136   Changed = true;
1137 
1138   SmallPtrSet<Instruction *, 1> Stores;
1139   Stores.insert(TheStore);
1140   if (mayLoopAccessLocation(StoreBasePtr, ModRefInfo::ModRef, CurLoop, BECount,
1141                             StoreSize, *AA, Stores))
1142     return Changed;
1143 
1144   const SCEV *LdStart = LoadEv->getStart();
1145   unsigned LdAS = SourcePtr->getType()->getPointerAddressSpace();
1146 
1147   // Handle negative strided loops.
1148   if (NegStride)
1149     LdStart = getStartForNegStride(LdStart, BECount, IntIdxTy, StoreSize, SE);
1150 
1151   // For a memcpy, we have to make sure that the input array is not being
1152   // mutated by the loop.
1153   Value *LoadBasePtr = Expander.expandCodeFor(
1154       LdStart, Builder.getInt8PtrTy(LdAS), Preheader->getTerminator());
1155 
1156   if (mayLoopAccessLocation(LoadBasePtr, ModRefInfo::Mod, CurLoop, BECount,
1157                             StoreSize, *AA, Stores))
1158     return Changed;
1159 
1160   if (avoidLIRForMultiBlockLoop())
1161     return Changed;
1162 
1163   // Okay, everything is safe, we can transform this!
1164 
1165   const SCEV *NumBytesS =
1166       getNumBytes(BECount, IntIdxTy, StoreSize, CurLoop, DL, SE);
1167 
1168   Value *NumBytes =
1169       Expander.expandCodeFor(NumBytesS, IntIdxTy, Preheader->getTerminator());
1170 
1171   CallInst *NewCall = nullptr;
1172   // Check whether to generate an unordered atomic memcpy:
1173   //  If the load or store are atomic, then they must necessarily be unordered
1174   //  by previous checks.
1175   if (!TheStore->isAtomic() && !TheLoad->isAtomic())
1176     NewCall = Builder.CreateMemCpy(StoreBasePtr, StoreAlign, LoadBasePtr,
1177                                    LoadAlign, NumBytes);
1178   else {
1179     // We cannot allow unaligned ops for unordered load/store, so reject
1180     // anything where the alignment isn't at least the element size.
1181     assert((StoreAlign.hasValue() && LoadAlign.hasValue()) &&
1182            "Expect unordered load/store to have align.");
1183     if (StoreAlign.getValue() < StoreSize || LoadAlign.getValue() < StoreSize)
1184       return Changed;
1185 
1186     // If the element.atomic memcpy is not lowered into explicit
1187     // loads/stores later, then it will be lowered into an element-size
1188     // specific lib call. If the lib call doesn't exist for our store size, then
1189     // we shouldn't generate the memcpy.
1190     if (StoreSize > TTI->getAtomicMemIntrinsicMaxElementSize())
1191       return Changed;
1192 
1193     // Create the call.
1194     // Note that unordered atomic loads/stores are *required* by the spec to
1195     // have an alignment but non-atomic loads/stores may not.
1196     NewCall = Builder.CreateElementUnorderedAtomicMemCpy(
1197         StoreBasePtr, StoreAlign.getValue(), LoadBasePtr, LoadAlign.getValue(),
1198         NumBytes, StoreSize);
1199   }
1200   NewCall->setDebugLoc(TheStore->getDebugLoc());
1201 
1202   if (MSSAU) {
1203     MemoryAccess *NewMemAcc = MSSAU->createMemoryAccessInBB(
1204         NewCall, nullptr, NewCall->getParent(), MemorySSA::BeforeTerminator);
1205     MSSAU->insertDef(cast<MemoryDef>(NewMemAcc), true);
1206   }
1207 
1208   LLVM_DEBUG(dbgs() << "  Formed memcpy: " << *NewCall << "\n"
1209                     << "    from load ptr=" << *LoadEv << " at: " << *TheLoad
1210                     << "\n"
1211                     << "    from store ptr=" << *StoreEv << " at: " << *TheStore
1212                     << "\n");
1213 
1214   ORE.emit([&]() {
1215     return OptimizationRemark(DEBUG_TYPE, "ProcessLoopStoreOfLoopLoad",
1216                               NewCall->getDebugLoc(), Preheader)
1217            << "Formed a call to "
1218            << ore::NV("NewFunction", NewCall->getCalledFunction())
1219            << "() function";
1220   });
1221 
1222   // Okay, the memcpy has been formed.  Zap the original store and anything that
1223   // feeds into it.
1224   if (MSSAU)
1225     MSSAU->removeMemoryAccess(TheStore, true);
1226   deleteDeadInstruction(TheStore);
1227   if (MSSAU && VerifyMemorySSA)
1228     MSSAU->getMemorySSA()->verifyMemorySSA();
1229   ++NumMemCpy;
1230   ExpCleaner.markResultUsed();
1231   return true;
1232 }
1233 
1234 // When compiling for codesize we avoid idiom recognition for a multi-block loop
1235 // unless it is a loop_memset idiom or a memset/memcpy idiom in a nested loop.
1236 //
1237 bool LoopIdiomRecognize::avoidLIRForMultiBlockLoop(bool IsMemset,
1238                                                    bool IsLoopMemset) {
1239   if (ApplyCodeSizeHeuristics && CurLoop->getNumBlocks() > 1) {
1240     if (CurLoop->isOutermost() && (!IsMemset || !IsLoopMemset)) {
1241       LLVM_DEBUG(dbgs() << "  " << CurLoop->getHeader()->getParent()->getName()
1242                         << " : LIR " << (IsMemset ? "Memset" : "Memcpy")
1243                         << " avoided: multi-block top-level loop\n");
1244       return true;
1245     }
1246   }
1247 
1248   return false;
1249 }
1250 
1251 bool LoopIdiomRecognize::runOnNoncountableLoop() {
1252   LLVM_DEBUG(dbgs() << DEBUG_TYPE " Scanning: F["
1253                     << CurLoop->getHeader()->getParent()->getName()
1254                     << "] Noncountable Loop %"
1255                     << CurLoop->getHeader()->getName() << "\n");
1256 
1257   return recognizePopcount() || recognizeAndInsertFFS() ||
1258          recognizeShiftUntilBitTest();
1259 }
1260 
1261 /// Check if the given conditional branch is based on the comparison between
1262 /// a variable and zero, and if the variable is non-zero or zero (JmpOnZero is
1263 /// true), the control yields to the loop entry. If the branch matches the
1264 /// behavior, the variable involved in the comparison is returned. This function
1265 /// will be called to see if the precondition and postcondition of the loop are
1266 /// in desirable form.
1267 static Value *matchCondition(BranchInst *BI, BasicBlock *LoopEntry,
1268                              bool JmpOnZero = false) {
1269   if (!BI || !BI->isConditional())
1270     return nullptr;
1271 
1272   ICmpInst *Cond = dyn_cast<ICmpInst>(BI->getCondition());
1273   if (!Cond)
1274     return nullptr;
1275 
1276   ConstantInt *CmpZero = dyn_cast<ConstantInt>(Cond->getOperand(1));
1277   if (!CmpZero || !CmpZero->isZero())
1278     return nullptr;
1279 
1280   BasicBlock *TrueSucc = BI->getSuccessor(0);
1281   BasicBlock *FalseSucc = BI->getSuccessor(1);
1282   if (JmpOnZero)
1283     std::swap(TrueSucc, FalseSucc);
1284 
1285   ICmpInst::Predicate Pred = Cond->getPredicate();
1286   if ((Pred == ICmpInst::ICMP_NE && TrueSucc == LoopEntry) ||
1287       (Pred == ICmpInst::ICMP_EQ && FalseSucc == LoopEntry))
1288     return Cond->getOperand(0);
1289 
1290   return nullptr;
1291 }
1292 
1293 // Check if the recurrence variable `VarX` is in the right form to create
1294 // the idiom. Returns the value coerced to a PHINode if so.
1295 static PHINode *getRecurrenceVar(Value *VarX, Instruction *DefX,
1296                                  BasicBlock *LoopEntry) {
1297   auto *PhiX = dyn_cast<PHINode>(VarX);
1298   if (PhiX && PhiX->getParent() == LoopEntry &&
1299       (PhiX->getOperand(0) == DefX || PhiX->getOperand(1) == DefX))
1300     return PhiX;
1301   return nullptr;
1302 }
1303 
1304 /// Return true iff the idiom is detected in the loop.
1305 ///
1306 /// Additionally:
1307 /// 1) \p CntInst is set to the instruction counting the population bit.
1308 /// 2) \p CntPhi is set to the corresponding phi node.
1309 /// 3) \p Var is set to the value whose population bits are being counted.
1310 ///
1311 /// The core idiom we are trying to detect is:
1312 /// \code
1313 ///    if (x0 != 0)
1314 ///      goto loop-exit // the precondition of the loop
1315 ///    cnt0 = init-val;
1316 ///    do {
1317 ///       x1 = phi (x0, x2);
1318 ///       cnt1 = phi(cnt0, cnt2);
1319 ///
1320 ///       cnt2 = cnt1 + 1;
1321 ///        ...
1322 ///       x2 = x1 & (x1 - 1);
1323 ///        ...
1324 ///    } while(x != 0);
1325 ///
1326 /// loop-exit:
1327 /// \endcode
1328 static bool detectPopcountIdiom(Loop *CurLoop, BasicBlock *PreCondBB,
1329                                 Instruction *&CntInst, PHINode *&CntPhi,
1330                                 Value *&Var) {
1331   // step 1: Check to see if the look-back branch match this pattern:
1332   //    "if (a!=0) goto loop-entry".
1333   BasicBlock *LoopEntry;
1334   Instruction *DefX2, *CountInst;
1335   Value *VarX1, *VarX0;
1336   PHINode *PhiX, *CountPhi;
1337 
1338   DefX2 = CountInst = nullptr;
1339   VarX1 = VarX0 = nullptr;
1340   PhiX = CountPhi = nullptr;
1341   LoopEntry = *(CurLoop->block_begin());
1342 
1343   // step 1: Check if the loop-back branch is in desirable form.
1344   {
1345     if (Value *T = matchCondition(
1346             dyn_cast<BranchInst>(LoopEntry->getTerminator()), LoopEntry))
1347       DefX2 = dyn_cast<Instruction>(T);
1348     else
1349       return false;
1350   }
1351 
1352   // step 2: detect instructions corresponding to "x2 = x1 & (x1 - 1)"
1353   {
1354     if (!DefX2 || DefX2->getOpcode() != Instruction::And)
1355       return false;
1356 
1357     BinaryOperator *SubOneOp;
1358 
1359     if ((SubOneOp = dyn_cast<BinaryOperator>(DefX2->getOperand(0))))
1360       VarX1 = DefX2->getOperand(1);
1361     else {
1362       VarX1 = DefX2->getOperand(0);
1363       SubOneOp = dyn_cast<BinaryOperator>(DefX2->getOperand(1));
1364     }
1365     if (!SubOneOp || SubOneOp->getOperand(0) != VarX1)
1366       return false;
1367 
1368     ConstantInt *Dec = dyn_cast<ConstantInt>(SubOneOp->getOperand(1));
1369     if (!Dec ||
1370         !((SubOneOp->getOpcode() == Instruction::Sub && Dec->isOne()) ||
1371           (SubOneOp->getOpcode() == Instruction::Add &&
1372            Dec->isMinusOne()))) {
1373       return false;
1374     }
1375   }
1376 
1377   // step 3: Check the recurrence of variable X
1378   PhiX = getRecurrenceVar(VarX1, DefX2, LoopEntry);
1379   if (!PhiX)
1380     return false;
1381 
1382   // step 4: Find the instruction which count the population: cnt2 = cnt1 + 1
1383   {
1384     CountInst = nullptr;
1385     for (BasicBlock::iterator Iter = LoopEntry->getFirstNonPHI()->getIterator(),
1386                               IterE = LoopEntry->end();
1387          Iter != IterE; Iter++) {
1388       Instruction *Inst = &*Iter;
1389       if (Inst->getOpcode() != Instruction::Add)
1390         continue;
1391 
1392       ConstantInt *Inc = dyn_cast<ConstantInt>(Inst->getOperand(1));
1393       if (!Inc || !Inc->isOne())
1394         continue;
1395 
1396       PHINode *Phi = getRecurrenceVar(Inst->getOperand(0), Inst, LoopEntry);
1397       if (!Phi)
1398         continue;
1399 
1400       // Check if the result of the instruction is live of the loop.
1401       bool LiveOutLoop = false;
1402       for (User *U : Inst->users()) {
1403         if ((cast<Instruction>(U))->getParent() != LoopEntry) {
1404           LiveOutLoop = true;
1405           break;
1406         }
1407       }
1408 
1409       if (LiveOutLoop) {
1410         CountInst = Inst;
1411         CountPhi = Phi;
1412         break;
1413       }
1414     }
1415 
1416     if (!CountInst)
1417       return false;
1418   }
1419 
1420   // step 5: check if the precondition is in this form:
1421   //   "if (x != 0) goto loop-head ; else goto somewhere-we-don't-care;"
1422   {
1423     auto *PreCondBr = dyn_cast<BranchInst>(PreCondBB->getTerminator());
1424     Value *T = matchCondition(PreCondBr, CurLoop->getLoopPreheader());
1425     if (T != PhiX->getOperand(0) && T != PhiX->getOperand(1))
1426       return false;
1427 
1428     CntInst = CountInst;
1429     CntPhi = CountPhi;
1430     Var = T;
1431   }
1432 
1433   return true;
1434 }
1435 
1436 /// Return true if the idiom is detected in the loop.
1437 ///
1438 /// Additionally:
1439 /// 1) \p CntInst is set to the instruction Counting Leading Zeros (CTLZ)
1440 ///       or nullptr if there is no such.
1441 /// 2) \p CntPhi is set to the corresponding phi node
1442 ///       or nullptr if there is no such.
1443 /// 3) \p Var is set to the value whose CTLZ could be used.
1444 /// 4) \p DefX is set to the instruction calculating Loop exit condition.
1445 ///
1446 /// The core idiom we are trying to detect is:
1447 /// \code
1448 ///    if (x0 == 0)
1449 ///      goto loop-exit // the precondition of the loop
1450 ///    cnt0 = init-val;
1451 ///    do {
1452 ///       x = phi (x0, x.next);   //PhiX
1453 ///       cnt = phi(cnt0, cnt.next);
1454 ///
1455 ///       cnt.next = cnt + 1;
1456 ///        ...
1457 ///       x.next = x >> 1;   // DefX
1458 ///        ...
1459 ///    } while(x.next != 0);
1460 ///
1461 /// loop-exit:
1462 /// \endcode
1463 static bool detectShiftUntilZeroIdiom(Loop *CurLoop, const DataLayout &DL,
1464                                       Intrinsic::ID &IntrinID, Value *&InitX,
1465                                       Instruction *&CntInst, PHINode *&CntPhi,
1466                                       Instruction *&DefX) {
1467   BasicBlock *LoopEntry;
1468   Value *VarX = nullptr;
1469 
1470   DefX = nullptr;
1471   CntInst = nullptr;
1472   CntPhi = nullptr;
1473   LoopEntry = *(CurLoop->block_begin());
1474 
1475   // step 1: Check if the loop-back branch is in desirable form.
1476   if (Value *T = matchCondition(
1477           dyn_cast<BranchInst>(LoopEntry->getTerminator()), LoopEntry))
1478     DefX = dyn_cast<Instruction>(T);
1479   else
1480     return false;
1481 
1482   // step 2: detect instructions corresponding to "x.next = x >> 1 or x << 1"
1483   if (!DefX || !DefX->isShift())
1484     return false;
1485   IntrinID = DefX->getOpcode() == Instruction::Shl ? Intrinsic::cttz :
1486                                                      Intrinsic::ctlz;
1487   ConstantInt *Shft = dyn_cast<ConstantInt>(DefX->getOperand(1));
1488   if (!Shft || !Shft->isOne())
1489     return false;
1490   VarX = DefX->getOperand(0);
1491 
1492   // step 3: Check the recurrence of variable X
1493   PHINode *PhiX = getRecurrenceVar(VarX, DefX, LoopEntry);
1494   if (!PhiX)
1495     return false;
1496 
1497   InitX = PhiX->getIncomingValueForBlock(CurLoop->getLoopPreheader());
1498 
1499   // Make sure the initial value can't be negative otherwise the ashr in the
1500   // loop might never reach zero which would make the loop infinite.
1501   if (DefX->getOpcode() == Instruction::AShr && !isKnownNonNegative(InitX, DL))
1502     return false;
1503 
1504   // step 4: Find the instruction which count the CTLZ: cnt.next = cnt + 1
1505   //         or cnt.next = cnt + -1.
1506   // TODO: We can skip the step. If loop trip count is known (CTLZ),
1507   //       then all uses of "cnt.next" could be optimized to the trip count
1508   //       plus "cnt0". Currently it is not optimized.
1509   //       This step could be used to detect POPCNT instruction:
1510   //       cnt.next = cnt + (x.next & 1)
1511   for (BasicBlock::iterator Iter = LoopEntry->getFirstNonPHI()->getIterator(),
1512                             IterE = LoopEntry->end();
1513        Iter != IterE; Iter++) {
1514     Instruction *Inst = &*Iter;
1515     if (Inst->getOpcode() != Instruction::Add)
1516       continue;
1517 
1518     ConstantInt *Inc = dyn_cast<ConstantInt>(Inst->getOperand(1));
1519     if (!Inc || (!Inc->isOne() && !Inc->isMinusOne()))
1520       continue;
1521 
1522     PHINode *Phi = getRecurrenceVar(Inst->getOperand(0), Inst, LoopEntry);
1523     if (!Phi)
1524       continue;
1525 
1526     CntInst = Inst;
1527     CntPhi = Phi;
1528     break;
1529   }
1530   if (!CntInst)
1531     return false;
1532 
1533   return true;
1534 }
1535 
1536 /// Recognize CTLZ or CTTZ idiom in a non-countable loop and convert the loop
1537 /// to countable (with CTLZ / CTTZ trip count). If CTLZ / CTTZ inserted as a new
1538 /// trip count returns true; otherwise, returns false.
1539 bool LoopIdiomRecognize::recognizeAndInsertFFS() {
1540   // Give up if the loop has multiple blocks or multiple backedges.
1541   if (CurLoop->getNumBackEdges() != 1 || CurLoop->getNumBlocks() != 1)
1542     return false;
1543 
1544   Intrinsic::ID IntrinID;
1545   Value *InitX;
1546   Instruction *DefX = nullptr;
1547   PHINode *CntPhi = nullptr;
1548   Instruction *CntInst = nullptr;
1549   // Help decide if transformation is profitable. For ShiftUntilZero idiom,
1550   // this is always 6.
1551   size_t IdiomCanonicalSize = 6;
1552 
1553   if (!detectShiftUntilZeroIdiom(CurLoop, *DL, IntrinID, InitX,
1554                                  CntInst, CntPhi, DefX))
1555     return false;
1556 
1557   bool IsCntPhiUsedOutsideLoop = false;
1558   for (User *U : CntPhi->users())
1559     if (!CurLoop->contains(cast<Instruction>(U))) {
1560       IsCntPhiUsedOutsideLoop = true;
1561       break;
1562     }
1563   bool IsCntInstUsedOutsideLoop = false;
1564   for (User *U : CntInst->users())
1565     if (!CurLoop->contains(cast<Instruction>(U))) {
1566       IsCntInstUsedOutsideLoop = true;
1567       break;
1568     }
1569   // If both CntInst and CntPhi are used outside the loop the profitability
1570   // is questionable.
1571   if (IsCntInstUsedOutsideLoop && IsCntPhiUsedOutsideLoop)
1572     return false;
1573 
1574   // For some CPUs result of CTLZ(X) intrinsic is undefined
1575   // when X is 0. If we can not guarantee X != 0, we need to check this
1576   // when expand.
1577   bool ZeroCheck = false;
1578   // It is safe to assume Preheader exist as it was checked in
1579   // parent function RunOnLoop.
1580   BasicBlock *PH = CurLoop->getLoopPreheader();
1581 
1582   // If we are using the count instruction outside the loop, make sure we
1583   // have a zero check as a precondition. Without the check the loop would run
1584   // one iteration for before any check of the input value. This means 0 and 1
1585   // would have identical behavior in the original loop and thus
1586   if (!IsCntPhiUsedOutsideLoop) {
1587     auto *PreCondBB = PH->getSinglePredecessor();
1588     if (!PreCondBB)
1589       return false;
1590     auto *PreCondBI = dyn_cast<BranchInst>(PreCondBB->getTerminator());
1591     if (!PreCondBI)
1592       return false;
1593     if (matchCondition(PreCondBI, PH) != InitX)
1594       return false;
1595     ZeroCheck = true;
1596   }
1597 
1598   // Check if CTLZ / CTTZ intrinsic is profitable. Assume it is always
1599   // profitable if we delete the loop.
1600 
1601   // the loop has only 6 instructions:
1602   //  %n.addr.0 = phi [ %n, %entry ], [ %shr, %while.cond ]
1603   //  %i.0 = phi [ %i0, %entry ], [ %inc, %while.cond ]
1604   //  %shr = ashr %n.addr.0, 1
1605   //  %tobool = icmp eq %shr, 0
1606   //  %inc = add nsw %i.0, 1
1607   //  br i1 %tobool
1608 
1609   const Value *Args[] = {InitX,
1610                          ConstantInt::getBool(InitX->getContext(), ZeroCheck)};
1611 
1612   // @llvm.dbg doesn't count as they have no semantic effect.
1613   auto InstWithoutDebugIt = CurLoop->getHeader()->instructionsWithoutDebug();
1614   uint32_t HeaderSize =
1615       std::distance(InstWithoutDebugIt.begin(), InstWithoutDebugIt.end());
1616 
1617   IntrinsicCostAttributes Attrs(IntrinID, InitX->getType(), Args);
1618   InstructionCost Cost =
1619     TTI->getIntrinsicInstrCost(Attrs, TargetTransformInfo::TCK_SizeAndLatency);
1620   if (HeaderSize != IdiomCanonicalSize &&
1621       Cost > TargetTransformInfo::TCC_Basic)
1622     return false;
1623 
1624   transformLoopToCountable(IntrinID, PH, CntInst, CntPhi, InitX, DefX,
1625                            DefX->getDebugLoc(), ZeroCheck,
1626                            IsCntPhiUsedOutsideLoop);
1627   return true;
1628 }
1629 
1630 /// Recognizes a population count idiom in a non-countable loop.
1631 ///
1632 /// If detected, transforms the relevant code to issue the popcount intrinsic
1633 /// function call, and returns true; otherwise, returns false.
1634 bool LoopIdiomRecognize::recognizePopcount() {
1635   if (TTI->getPopcntSupport(32) != TargetTransformInfo::PSK_FastHardware)
1636     return false;
1637 
1638   // Counting population are usually conducted by few arithmetic instructions.
1639   // Such instructions can be easily "absorbed" by vacant slots in a
1640   // non-compact loop. Therefore, recognizing popcount idiom only makes sense
1641   // in a compact loop.
1642 
1643   // Give up if the loop has multiple blocks or multiple backedges.
1644   if (CurLoop->getNumBackEdges() != 1 || CurLoop->getNumBlocks() != 1)
1645     return false;
1646 
1647   BasicBlock *LoopBody = *(CurLoop->block_begin());
1648   if (LoopBody->size() >= 20) {
1649     // The loop is too big, bail out.
1650     return false;
1651   }
1652 
1653   // It should have a preheader containing nothing but an unconditional branch.
1654   BasicBlock *PH = CurLoop->getLoopPreheader();
1655   if (!PH || &PH->front() != PH->getTerminator())
1656     return false;
1657   auto *EntryBI = dyn_cast<BranchInst>(PH->getTerminator());
1658   if (!EntryBI || EntryBI->isConditional())
1659     return false;
1660 
1661   // It should have a precondition block where the generated popcount intrinsic
1662   // function can be inserted.
1663   auto *PreCondBB = PH->getSinglePredecessor();
1664   if (!PreCondBB)
1665     return false;
1666   auto *PreCondBI = dyn_cast<BranchInst>(PreCondBB->getTerminator());
1667   if (!PreCondBI || PreCondBI->isUnconditional())
1668     return false;
1669 
1670   Instruction *CntInst;
1671   PHINode *CntPhi;
1672   Value *Val;
1673   if (!detectPopcountIdiom(CurLoop, PreCondBB, CntInst, CntPhi, Val))
1674     return false;
1675 
1676   transformLoopToPopcount(PreCondBB, CntInst, CntPhi, Val);
1677   return true;
1678 }
1679 
1680 static CallInst *createPopcntIntrinsic(IRBuilder<> &IRBuilder, Value *Val,
1681                                        const DebugLoc &DL) {
1682   Value *Ops[] = {Val};
1683   Type *Tys[] = {Val->getType()};
1684 
1685   Module *M = IRBuilder.GetInsertBlock()->getParent()->getParent();
1686   Function *Func = Intrinsic::getDeclaration(M, Intrinsic::ctpop, Tys);
1687   CallInst *CI = IRBuilder.CreateCall(Func, Ops);
1688   CI->setDebugLoc(DL);
1689 
1690   return CI;
1691 }
1692 
1693 static CallInst *createFFSIntrinsic(IRBuilder<> &IRBuilder, Value *Val,
1694                                     const DebugLoc &DL, bool ZeroCheck,
1695                                     Intrinsic::ID IID) {
1696   Value *Ops[] = {Val, IRBuilder.getInt1(ZeroCheck)};
1697   Type *Tys[] = {Val->getType()};
1698 
1699   Module *M = IRBuilder.GetInsertBlock()->getParent()->getParent();
1700   Function *Func = Intrinsic::getDeclaration(M, IID, Tys);
1701   CallInst *CI = IRBuilder.CreateCall(Func, Ops);
1702   CI->setDebugLoc(DL);
1703 
1704   return CI;
1705 }
1706 
1707 /// Transform the following loop (Using CTLZ, CTTZ is similar):
1708 /// loop:
1709 ///   CntPhi = PHI [Cnt0, CntInst]
1710 ///   PhiX = PHI [InitX, DefX]
1711 ///   CntInst = CntPhi + 1
1712 ///   DefX = PhiX >> 1
1713 ///   LOOP_BODY
1714 ///   Br: loop if (DefX != 0)
1715 /// Use(CntPhi) or Use(CntInst)
1716 ///
1717 /// Into:
1718 /// If CntPhi used outside the loop:
1719 ///   CountPrev = BitWidth(InitX) - CTLZ(InitX >> 1)
1720 ///   Count = CountPrev + 1
1721 /// else
1722 ///   Count = BitWidth(InitX) - CTLZ(InitX)
1723 /// loop:
1724 ///   CntPhi = PHI [Cnt0, CntInst]
1725 ///   PhiX = PHI [InitX, DefX]
1726 ///   PhiCount = PHI [Count, Dec]
1727 ///   CntInst = CntPhi + 1
1728 ///   DefX = PhiX >> 1
1729 ///   Dec = PhiCount - 1
1730 ///   LOOP_BODY
1731 ///   Br: loop if (Dec != 0)
1732 /// Use(CountPrev + Cnt0) // Use(CntPhi)
1733 /// or
1734 /// Use(Count + Cnt0) // Use(CntInst)
1735 ///
1736 /// If LOOP_BODY is empty the loop will be deleted.
1737 /// If CntInst and DefX are not used in LOOP_BODY they will be removed.
1738 void LoopIdiomRecognize::transformLoopToCountable(
1739     Intrinsic::ID IntrinID, BasicBlock *Preheader, Instruction *CntInst,
1740     PHINode *CntPhi, Value *InitX, Instruction *DefX, const DebugLoc &DL,
1741     bool ZeroCheck, bool IsCntPhiUsedOutsideLoop) {
1742   BranchInst *PreheaderBr = cast<BranchInst>(Preheader->getTerminator());
1743 
1744   // Step 1: Insert the CTLZ/CTTZ instruction at the end of the preheader block
1745   IRBuilder<> Builder(PreheaderBr);
1746   Builder.SetCurrentDebugLocation(DL);
1747 
1748   // If there are no uses of CntPhi crate:
1749   //   Count = BitWidth - CTLZ(InitX);
1750   //   NewCount = Count;
1751   // If there are uses of CntPhi create:
1752   //   NewCount = BitWidth - CTLZ(InitX >> 1);
1753   //   Count = NewCount + 1;
1754   Value *InitXNext;
1755   if (IsCntPhiUsedOutsideLoop) {
1756     if (DefX->getOpcode() == Instruction::AShr)
1757       InitXNext = Builder.CreateAShr(InitX, 1);
1758     else if (DefX->getOpcode() == Instruction::LShr)
1759       InitXNext = Builder.CreateLShr(InitX, 1);
1760     else if (DefX->getOpcode() == Instruction::Shl) // cttz
1761       InitXNext = Builder.CreateShl(InitX, 1);
1762     else
1763       llvm_unreachable("Unexpected opcode!");
1764   } else
1765     InitXNext = InitX;
1766   Value *Count =
1767       createFFSIntrinsic(Builder, InitXNext, DL, ZeroCheck, IntrinID);
1768   Type *CountTy = Count->getType();
1769   Count = Builder.CreateSub(
1770       ConstantInt::get(CountTy, CountTy->getIntegerBitWidth()), Count);
1771   Value *NewCount = Count;
1772   if (IsCntPhiUsedOutsideLoop)
1773     Count = Builder.CreateAdd(Count, ConstantInt::get(CountTy, 1));
1774 
1775   NewCount = Builder.CreateZExtOrTrunc(NewCount, CntInst->getType());
1776 
1777   Value *CntInitVal = CntPhi->getIncomingValueForBlock(Preheader);
1778   if (cast<ConstantInt>(CntInst->getOperand(1))->isOne()) {
1779     // If the counter was being incremented in the loop, add NewCount to the
1780     // counter's initial value, but only if the initial value is not zero.
1781     ConstantInt *InitConst = dyn_cast<ConstantInt>(CntInitVal);
1782     if (!InitConst || !InitConst->isZero())
1783       NewCount = Builder.CreateAdd(NewCount, CntInitVal);
1784   } else {
1785     // If the count was being decremented in the loop, subtract NewCount from
1786     // the counter's initial value.
1787     NewCount = Builder.CreateSub(CntInitVal, NewCount);
1788   }
1789 
1790   // Step 2: Insert new IV and loop condition:
1791   // loop:
1792   //   ...
1793   //   PhiCount = PHI [Count, Dec]
1794   //   ...
1795   //   Dec = PhiCount - 1
1796   //   ...
1797   //   Br: loop if (Dec != 0)
1798   BasicBlock *Body = *(CurLoop->block_begin());
1799   auto *LbBr = cast<BranchInst>(Body->getTerminator());
1800   ICmpInst *LbCond = cast<ICmpInst>(LbBr->getCondition());
1801 
1802   PHINode *TcPhi = PHINode::Create(CountTy, 2, "tcphi", &Body->front());
1803 
1804   Builder.SetInsertPoint(LbCond);
1805   Instruction *TcDec = cast<Instruction>(Builder.CreateSub(
1806       TcPhi, ConstantInt::get(CountTy, 1), "tcdec", false, true));
1807 
1808   TcPhi->addIncoming(Count, Preheader);
1809   TcPhi->addIncoming(TcDec, Body);
1810 
1811   CmpInst::Predicate Pred =
1812       (LbBr->getSuccessor(0) == Body) ? CmpInst::ICMP_NE : CmpInst::ICMP_EQ;
1813   LbCond->setPredicate(Pred);
1814   LbCond->setOperand(0, TcDec);
1815   LbCond->setOperand(1, ConstantInt::get(CountTy, 0));
1816 
1817   // Step 3: All the references to the original counter outside
1818   //  the loop are replaced with the NewCount
1819   if (IsCntPhiUsedOutsideLoop)
1820     CntPhi->replaceUsesOutsideBlock(NewCount, Body);
1821   else
1822     CntInst->replaceUsesOutsideBlock(NewCount, Body);
1823 
1824   // step 4: Forget the "non-computable" trip-count SCEV associated with the
1825   //   loop. The loop would otherwise not be deleted even if it becomes empty.
1826   SE->forgetLoop(CurLoop);
1827 }
1828 
1829 void LoopIdiomRecognize::transformLoopToPopcount(BasicBlock *PreCondBB,
1830                                                  Instruction *CntInst,
1831                                                  PHINode *CntPhi, Value *Var) {
1832   BasicBlock *PreHead = CurLoop->getLoopPreheader();
1833   auto *PreCondBr = cast<BranchInst>(PreCondBB->getTerminator());
1834   const DebugLoc &DL = CntInst->getDebugLoc();
1835 
1836   // Assuming before transformation, the loop is following:
1837   //  if (x) // the precondition
1838   //     do { cnt++; x &= x - 1; } while(x);
1839 
1840   // Step 1: Insert the ctpop instruction at the end of the precondition block
1841   IRBuilder<> Builder(PreCondBr);
1842   Value *PopCnt, *PopCntZext, *NewCount, *TripCnt;
1843   {
1844     PopCnt = createPopcntIntrinsic(Builder, Var, DL);
1845     NewCount = PopCntZext =
1846         Builder.CreateZExtOrTrunc(PopCnt, cast<IntegerType>(CntPhi->getType()));
1847 
1848     if (NewCount != PopCnt)
1849       (cast<Instruction>(NewCount))->setDebugLoc(DL);
1850 
1851     // TripCnt is exactly the number of iterations the loop has
1852     TripCnt = NewCount;
1853 
1854     // If the population counter's initial value is not zero, insert Add Inst.
1855     Value *CntInitVal = CntPhi->getIncomingValueForBlock(PreHead);
1856     ConstantInt *InitConst = dyn_cast<ConstantInt>(CntInitVal);
1857     if (!InitConst || !InitConst->isZero()) {
1858       NewCount = Builder.CreateAdd(NewCount, CntInitVal);
1859       (cast<Instruction>(NewCount))->setDebugLoc(DL);
1860     }
1861   }
1862 
1863   // Step 2: Replace the precondition from "if (x == 0) goto loop-exit" to
1864   //   "if (NewCount == 0) loop-exit". Without this change, the intrinsic
1865   //   function would be partial dead code, and downstream passes will drag
1866   //   it back from the precondition block to the preheader.
1867   {
1868     ICmpInst *PreCond = cast<ICmpInst>(PreCondBr->getCondition());
1869 
1870     Value *Opnd0 = PopCntZext;
1871     Value *Opnd1 = ConstantInt::get(PopCntZext->getType(), 0);
1872     if (PreCond->getOperand(0) != Var)
1873       std::swap(Opnd0, Opnd1);
1874 
1875     ICmpInst *NewPreCond = cast<ICmpInst>(
1876         Builder.CreateICmp(PreCond->getPredicate(), Opnd0, Opnd1));
1877     PreCondBr->setCondition(NewPreCond);
1878 
1879     RecursivelyDeleteTriviallyDeadInstructions(PreCond, TLI);
1880   }
1881 
1882   // Step 3: Note that the population count is exactly the trip count of the
1883   // loop in question, which enable us to convert the loop from noncountable
1884   // loop into a countable one. The benefit is twofold:
1885   //
1886   //  - If the loop only counts population, the entire loop becomes dead after
1887   //    the transformation. It is a lot easier to prove a countable loop dead
1888   //    than to prove a noncountable one. (In some C dialects, an infinite loop
1889   //    isn't dead even if it computes nothing useful. In general, DCE needs
1890   //    to prove a noncountable loop finite before safely delete it.)
1891   //
1892   //  - If the loop also performs something else, it remains alive.
1893   //    Since it is transformed to countable form, it can be aggressively
1894   //    optimized by some optimizations which are in general not applicable
1895   //    to a noncountable loop.
1896   //
1897   // After this step, this loop (conceptually) would look like following:
1898   //   newcnt = __builtin_ctpop(x);
1899   //   t = newcnt;
1900   //   if (x)
1901   //     do { cnt++; x &= x-1; t--) } while (t > 0);
1902   BasicBlock *Body = *(CurLoop->block_begin());
1903   {
1904     auto *LbBr = cast<BranchInst>(Body->getTerminator());
1905     ICmpInst *LbCond = cast<ICmpInst>(LbBr->getCondition());
1906     Type *Ty = TripCnt->getType();
1907 
1908     PHINode *TcPhi = PHINode::Create(Ty, 2, "tcphi", &Body->front());
1909 
1910     Builder.SetInsertPoint(LbCond);
1911     Instruction *TcDec = cast<Instruction>(
1912         Builder.CreateSub(TcPhi, ConstantInt::get(Ty, 1),
1913                           "tcdec", false, true));
1914 
1915     TcPhi->addIncoming(TripCnt, PreHead);
1916     TcPhi->addIncoming(TcDec, Body);
1917 
1918     CmpInst::Predicate Pred =
1919         (LbBr->getSuccessor(0) == Body) ? CmpInst::ICMP_UGT : CmpInst::ICMP_SLE;
1920     LbCond->setPredicate(Pred);
1921     LbCond->setOperand(0, TcDec);
1922     LbCond->setOperand(1, ConstantInt::get(Ty, 0));
1923   }
1924 
1925   // Step 4: All the references to the original population counter outside
1926   //  the loop are replaced with the NewCount -- the value returned from
1927   //  __builtin_ctpop().
1928   CntInst->replaceUsesOutsideBlock(NewCount, Body);
1929 
1930   // step 5: Forget the "non-computable" trip-count SCEV associated with the
1931   //   loop. The loop would otherwise not be deleted even if it becomes empty.
1932   SE->forgetLoop(CurLoop);
1933 }
1934 
1935 /// Match loop-invariant value.
1936 template <typename SubPattern_t> struct match_LoopInvariant {
1937   SubPattern_t SubPattern;
1938   const Loop *L;
1939 
1940   match_LoopInvariant(const SubPattern_t &SP, const Loop *L)
1941       : SubPattern(SP), L(L) {}
1942 
1943   template <typename ITy> bool match(ITy *V) {
1944     return L->isLoopInvariant(V) && SubPattern.match(V);
1945   }
1946 };
1947 
1948 /// Matches if the value is loop-invariant.
1949 template <typename Ty>
1950 inline match_LoopInvariant<Ty> m_LoopInvariant(const Ty &M, const Loop *L) {
1951   return match_LoopInvariant<Ty>(M, L);
1952 }
1953 
1954 /// Return true if the idiom is detected in the loop.
1955 ///
1956 /// The core idiom we are trying to detect is:
1957 /// \code
1958 ///   entry:
1959 ///     <...>
1960 ///     %bitmask = shl i32 1, %bitpos
1961 ///     br label %loop
1962 ///
1963 ///   loop:
1964 ///     %x.curr = phi i32 [ %x, %entry ], [ %x.next, %loop ]
1965 ///     %x.curr.bitmasked = and i32 %x.curr, %bitmask
1966 ///     %x.curr.isbitunset = icmp eq i32 %x.curr.bitmasked, 0
1967 ///     %x.next = shl i32 %x.curr, 1
1968 ///     <...>
1969 ///     br i1 %x.curr.isbitunset, label %loop, label %end
1970 ///
1971 ///   end:
1972 ///     %x.curr.res = phi i32 [ %x.curr, %loop ] <...>
1973 ///     %x.next.res = phi i32 [ %x.next, %loop ] <...>
1974 ///     <...>
1975 /// \endcode
1976 static bool detectShiftUntilBitTestIdiom(Loop *CurLoop, Value *&BaseX,
1977                                          Value *&BitMask, Value *&BitPos,
1978                                          Value *&CurrX, Instruction *&NextX) {
1979   LLVM_DEBUG(dbgs() << DEBUG_TYPE
1980              " Performing shift-until-bittest idiom detection.\n");
1981 
1982   // Give up if the loop has multiple blocks or multiple backedges.
1983   if (CurLoop->getNumBlocks() != 1 || CurLoop->getNumBackEdges() != 1) {
1984     LLVM_DEBUG(dbgs() << DEBUG_TYPE " Bad block/backedge count.\n");
1985     return false;
1986   }
1987 
1988   BasicBlock *LoopHeaderBB = CurLoop->getHeader();
1989   BasicBlock *LoopPreheaderBB = CurLoop->getLoopPreheader();
1990   assert(LoopPreheaderBB && "There is always a loop preheader.");
1991 
1992   using namespace PatternMatch;
1993 
1994   // Step 1: Check if the loop backedge is in desirable form.
1995 
1996   ICmpInst::Predicate Pred;
1997   Value *CmpLHS, *CmpRHS;
1998   BasicBlock *TrueBB, *FalseBB;
1999   if (!match(LoopHeaderBB->getTerminator(),
2000              m_Br(m_ICmp(Pred, m_Value(CmpLHS), m_Value(CmpRHS)),
2001                   m_BasicBlock(TrueBB), m_BasicBlock(FalseBB)))) {
2002     LLVM_DEBUG(dbgs() << DEBUG_TYPE " Bad backedge structure.\n");
2003     return false;
2004   }
2005 
2006   // Step 2: Check if the backedge's condition is in desirable form.
2007 
2008   auto MatchVariableBitMask = [&]() {
2009     return ICmpInst::isEquality(Pred) && match(CmpRHS, m_Zero()) &&
2010            match(CmpLHS,
2011                  m_c_And(m_Value(CurrX),
2012                          m_CombineAnd(
2013                              m_Value(BitMask),
2014                              m_LoopInvariant(m_Shl(m_One(), m_Value(BitPos)),
2015                                              CurLoop))));
2016   };
2017   auto MatchConstantBitMask = [&]() {
2018     return ICmpInst::isEquality(Pred) && match(CmpRHS, m_Zero()) &&
2019            match(CmpLHS, m_And(m_Value(CurrX),
2020                                m_CombineAnd(m_Value(BitMask), m_Power2()))) &&
2021            (BitPos = ConstantExpr::getExactLogBase2(cast<Constant>(BitMask)));
2022   };
2023   auto MatchDecomposableConstantBitMask = [&]() {
2024     APInt Mask;
2025     return llvm::decomposeBitTestICmp(CmpLHS, CmpRHS, Pred, CurrX, Mask) &&
2026            ICmpInst::isEquality(Pred) && Mask.isPowerOf2() &&
2027            (BitMask = ConstantInt::get(CurrX->getType(), Mask)) &&
2028            (BitPos = ConstantInt::get(CurrX->getType(), Mask.logBase2()));
2029   };
2030 
2031   if (!MatchVariableBitMask() && !MatchConstantBitMask() &&
2032       !MatchDecomposableConstantBitMask()) {
2033     LLVM_DEBUG(dbgs() << DEBUG_TYPE " Bad backedge comparison.\n");
2034     return false;
2035   }
2036 
2037   // Step 3: Check if the recurrence is in desirable form.
2038   auto *CurrXPN = dyn_cast<PHINode>(CurrX);
2039   if (!CurrXPN || CurrXPN->getParent() != LoopHeaderBB) {
2040     LLVM_DEBUG(dbgs() << DEBUG_TYPE " Not an expected PHI node.\n");
2041     return false;
2042   }
2043 
2044   BaseX = CurrXPN->getIncomingValueForBlock(LoopPreheaderBB);
2045   NextX =
2046       dyn_cast<Instruction>(CurrXPN->getIncomingValueForBlock(LoopHeaderBB));
2047 
2048   if (!NextX || !match(NextX, m_Shl(m_Specific(CurrX), m_One()))) {
2049     // FIXME: support right-shift?
2050     LLVM_DEBUG(dbgs() << DEBUG_TYPE " Bad recurrence.\n");
2051     return false;
2052   }
2053 
2054   // Step 4: Check if the backedge's destinations are in desirable form.
2055 
2056   assert(ICmpInst::isEquality(Pred) &&
2057          "Should only get equality predicates here.");
2058 
2059   // cmp-br is commutative, so canonicalize to a single variant.
2060   if (Pred != ICmpInst::Predicate::ICMP_EQ) {
2061     Pred = ICmpInst::getInversePredicate(Pred);
2062     std::swap(TrueBB, FalseBB);
2063   }
2064 
2065   // We expect to exit loop when comparison yields false,
2066   // so when it yields true we should branch back to loop header.
2067   if (TrueBB != LoopHeaderBB) {
2068     LLVM_DEBUG(dbgs() << DEBUG_TYPE " Bad backedge flow.\n");
2069     return false;
2070   }
2071 
2072   // Okay, idiom checks out.
2073   return true;
2074 }
2075 
2076 /// Look for the following loop:
2077 /// \code
2078 ///   entry:
2079 ///     <...>
2080 ///     %bitmask = shl i32 1, %bitpos
2081 ///     br label %loop
2082 ///
2083 ///   loop:
2084 ///     %x.curr = phi i32 [ %x, %entry ], [ %x.next, %loop ]
2085 ///     %x.curr.bitmasked = and i32 %x.curr, %bitmask
2086 ///     %x.curr.isbitunset = icmp eq i32 %x.curr.bitmasked, 0
2087 ///     %x.next = shl i32 %x.curr, 1
2088 ///     <...>
2089 ///     br i1 %x.curr.isbitunset, label %loop, label %end
2090 ///
2091 ///   end:
2092 ///     %x.curr.res = phi i32 [ %x.curr, %loop ] <...>
2093 ///     %x.next.res = phi i32 [ %x.next, %loop ] <...>
2094 ///     <...>
2095 /// \endcode
2096 ///
2097 /// And transform it into:
2098 /// \code
2099 ///   entry:
2100 ///     %bitmask = shl i32 1, %bitpos
2101 ///     %lowbitmask = add i32 %bitmask, -1
2102 ///     %mask = or i32 %lowbitmask, %bitmask
2103 ///     %x.masked = and i32 %x, %mask
2104 ///     %x.masked.numleadingzeros = call i32 @llvm.ctlz.i32(i32 %x.masked,
2105 ///                                                         i1 true)
2106 ///     %x.masked.numactivebits = sub i32 32, %x.masked.numleadingzeros
2107 ///     %x.masked.leadingonepos = add i32 %x.masked.numactivebits, -1
2108 ///     %backedgetakencount = sub i32 %bitpos, %x.masked.leadingonepos
2109 ///     %tripcount = add i32 %backedgetakencount, 1
2110 ///     %x.curr = shl i32 %x, %backedgetakencount
2111 ///     %x.next = shl i32 %x, %tripcount
2112 ///     br label %loop
2113 ///
2114 ///   loop:
2115 ///     %loop.iv = phi i32 [ 0, %entry ], [ %loop.iv.next, %loop ]
2116 ///     %loop.iv.next = add nuw i32 %loop.iv, 1
2117 ///     %loop.ivcheck = icmp eq i32 %loop.iv.next, %tripcount
2118 ///     <...>
2119 ///     br i1 %loop.ivcheck, label %end, label %loop
2120 ///
2121 ///   end:
2122 ///     %x.curr.res = phi i32 [ %x.curr, %loop ] <...>
2123 ///     %x.next.res = phi i32 [ %x.next, %loop ] <...>
2124 ///     <...>
2125 /// \endcode
2126 bool LoopIdiomRecognize::recognizeShiftUntilBitTest() {
2127   bool MadeChange = false;
2128 
2129   Value *X, *BitMask, *BitPos, *XCurr;
2130   Instruction *XNext;
2131   if (!detectShiftUntilBitTestIdiom(CurLoop, X, BitMask, BitPos, XCurr,
2132                                     XNext)) {
2133     LLVM_DEBUG(dbgs() << DEBUG_TYPE
2134                " shift-until-bittest idiom detection failed.\n");
2135     return MadeChange;
2136   }
2137   LLVM_DEBUG(dbgs() << DEBUG_TYPE " shift-until-bittest idiom detected!\n");
2138 
2139   // Ok, it is the idiom we were looking for, we *could* transform this loop,
2140   // but is it profitable to transform?
2141 
2142   BasicBlock *LoopHeaderBB = CurLoop->getHeader();
2143   BasicBlock *LoopPreheaderBB = CurLoop->getLoopPreheader();
2144   assert(LoopPreheaderBB && "There is always a loop preheader.");
2145 
2146   BasicBlock *SuccessorBB = CurLoop->getExitBlock();
2147   assert(LoopPreheaderBB && "There is only a single successor.");
2148 
2149   IRBuilder<> Builder(LoopPreheaderBB->getTerminator());
2150   Builder.SetCurrentDebugLocation(cast<Instruction>(XCurr)->getDebugLoc());
2151 
2152   Intrinsic::ID IntrID = Intrinsic::ctlz;
2153   Type *Ty = X->getType();
2154   unsigned Bitwidth = Ty->getScalarSizeInBits();
2155 
2156   TargetTransformInfo::TargetCostKind CostKind =
2157       TargetTransformInfo::TCK_SizeAndLatency;
2158 
2159   // The rewrite is considered to be unprofitable iff and only iff the
2160   // intrinsic/shift we'll use are not cheap. Note that we are okay with *just*
2161   // making the loop countable, even if nothing else changes.
2162   IntrinsicCostAttributes Attrs(
2163       IntrID, Ty, {UndefValue::get(Ty), /*is_zero_undef=*/Builder.getTrue()});
2164   InstructionCost Cost = TTI->getIntrinsicInstrCost(Attrs, CostKind);
2165   if (Cost > TargetTransformInfo::TCC_Basic) {
2166     LLVM_DEBUG(dbgs() << DEBUG_TYPE
2167                " Intrinsic is too costly, not beneficial\n");
2168     return MadeChange;
2169   }
2170   if (TTI->getArithmeticInstrCost(Instruction::Shl, Ty, CostKind) >
2171       TargetTransformInfo::TCC_Basic) {
2172     LLVM_DEBUG(dbgs() << DEBUG_TYPE " Shift is too costly, not beneficial\n");
2173     return MadeChange;
2174   }
2175 
2176   // Ok, transform appears worthwhile.
2177   MadeChange = true;
2178 
2179   // Step 1: Compute the loop trip count.
2180 
2181   Value *LowBitMask = Builder.CreateAdd(BitMask, Constant::getAllOnesValue(Ty),
2182                                         BitPos->getName() + ".lowbitmask");
2183   Value *Mask =
2184       Builder.CreateOr(LowBitMask, BitMask, BitPos->getName() + ".mask");
2185   Value *XMasked = Builder.CreateAnd(X, Mask, X->getName() + ".masked");
2186   CallInst *XMaskedNumLeadingZeros = Builder.CreateIntrinsic(
2187       IntrID, Ty, {XMasked, /*is_zero_undef=*/Builder.getTrue()},
2188       /*FMFSource=*/nullptr, XMasked->getName() + ".numleadingzeros");
2189   Value *XMaskedNumActiveBits = Builder.CreateSub(
2190       ConstantInt::get(Ty, Ty->getScalarSizeInBits()), XMaskedNumLeadingZeros,
2191       XMasked->getName() + ".numactivebits", /*HasNUW=*/true,
2192       /*HasNSW=*/Bitwidth != 2);
2193   Value *XMaskedLeadingOnePos =
2194       Builder.CreateAdd(XMaskedNumActiveBits, Constant::getAllOnesValue(Ty),
2195                         XMasked->getName() + ".leadingonepos", /*HasNUW=*/false,
2196                         /*HasNSW=*/Bitwidth > 2);
2197 
2198   Value *LoopBackedgeTakenCount = Builder.CreateSub(
2199       BitPos, XMaskedLeadingOnePos, CurLoop->getName() + ".backedgetakencount",
2200       /*HasNUW=*/true, /*HasNSW=*/true);
2201   // We know loop's backedge-taken count, but what's loop's trip count?
2202   // Note that while NUW is always safe, while NSW is only for bitwidths != 2.
2203   Value *LoopTripCount =
2204       Builder.CreateAdd(LoopBackedgeTakenCount, ConstantInt::get(Ty, 1),
2205                         CurLoop->getName() + ".tripcount", /*HasNUW=*/true,
2206                         /*HasNSW=*/Bitwidth != 2);
2207 
2208   // Step 2: Compute the recurrence's final value without a loop.
2209 
2210   // NewX is always safe to compute, because `LoopBackedgeTakenCount`
2211   // will always be smaller than `bitwidth(X)`, i.e. we never get poison.
2212   Value *NewX = Builder.CreateShl(X, LoopBackedgeTakenCount);
2213   NewX->takeName(XCurr);
2214   if (auto *I = dyn_cast<Instruction>(NewX))
2215     I->copyIRFlags(XNext, /*IncludeWrapFlags=*/true);
2216 
2217   Value *NewXNext;
2218   // Rewriting XNext is more complicated, however, because `X << LoopTripCount`
2219   // will be poison iff `LoopTripCount == bitwidth(X)` (which will happen
2220   // iff `BitPos` is `bitwidth(x) - 1` and `X` is `1`). So unless we know
2221   // that isn't the case, we'll need to emit an alternative, safe IR.
2222   if (XNext->hasNoSignedWrap() || XNext->hasNoUnsignedWrap() ||
2223       PatternMatch::match(
2224           BitPos, PatternMatch::m_SpecificInt_ICMP(
2225                       ICmpInst::ICMP_NE, APInt(Ty->getScalarSizeInBits(),
2226                                                Ty->getScalarSizeInBits() - 1))))
2227     NewXNext = Builder.CreateShl(X, LoopTripCount);
2228   else {
2229     // Otherwise, just additionally shift by one. It's the smallest solution,
2230     // alternatively, we could check that NewX is INT_MIN (or BitPos is )
2231     // and select 0 instead.
2232     NewXNext = Builder.CreateShl(NewX, ConstantInt::get(Ty, 1));
2233   }
2234 
2235   NewXNext->takeName(XNext);
2236   if (auto *I = dyn_cast<Instruction>(NewXNext))
2237     I->copyIRFlags(XNext, /*IncludeWrapFlags=*/true);
2238 
2239   // Step 3: Adjust the successor basic block to recieve the computed
2240   //         recurrence's final value instead of the recurrence itself.
2241 
2242   XCurr->replaceUsesOutsideBlock(NewX, LoopHeaderBB);
2243   XNext->replaceUsesOutsideBlock(NewXNext, LoopHeaderBB);
2244 
2245   // Step 4: Rewrite the loop into a countable form, with canonical IV.
2246 
2247   // The new canonical induction variable.
2248   Builder.SetInsertPoint(&LoopHeaderBB->front());
2249   auto *IV = Builder.CreatePHI(Ty, 2, CurLoop->getName() + ".iv");
2250 
2251   // The induction itself.
2252   // Note that while NUW is always safe, while NSW is only for bitwidths != 2.
2253   Builder.SetInsertPoint(LoopHeaderBB->getTerminator());
2254   auto *IVNext =
2255       Builder.CreateAdd(IV, ConstantInt::get(Ty, 1), IV->getName() + ".next",
2256                         /*HasNUW=*/true, /*HasNSW=*/Bitwidth != 2);
2257 
2258   // The loop trip count check.
2259   auto *IVCheck = Builder.CreateICmpEQ(IVNext, LoopTripCount,
2260                                        CurLoop->getName() + ".ivcheck");
2261   Builder.CreateCondBr(IVCheck, SuccessorBB, LoopHeaderBB);
2262   LoopHeaderBB->getTerminator()->eraseFromParent();
2263 
2264   // Populate the IV PHI.
2265   IV->addIncoming(ConstantInt::get(Ty, 0), LoopPreheaderBB);
2266   IV->addIncoming(IVNext, LoopHeaderBB);
2267 
2268   // Step 5: Forget the "non-computable" trip-count SCEV associated with the
2269   //   loop. The loop would otherwise not be deleted even if it becomes empty.
2270 
2271   SE->forgetLoop(CurLoop);
2272 
2273   // Other passes will take care of actually deleting the loop if possible.
2274 
2275   LLVM_DEBUG(dbgs() << DEBUG_TYPE " shift-until-bittest idiom optimized!\n");
2276 
2277   ++NumShiftUntilBitTest;
2278   return MadeChange;
2279 }
2280