1 //===-- PGOMemOPSizeOpt.cpp - Optimizations based on value profiling ===//
2 //
3 // Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
4 // See https://llvm.org/LICENSE.txt for license information.
5 // SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
6 //
7 //===----------------------------------------------------------------------===//
8 //
9 // This file implements the transformation that optimizes memory intrinsics
10 // such as memcpy using the size value profile. When memory intrinsic size
11 // value profile metadata is available, a single memory intrinsic is expanded
12 // to a sequence of guarded specialized versions that are called with the
13 // hottest size(s), for later expansion into more optimal inline sequences.
14 //
15 //===----------------------------------------------------------------------===//
16 
17 #include "llvm/ADT/ArrayRef.h"
18 #include "llvm/ADT/Statistic.h"
19 #include "llvm/ADT/StringRef.h"
20 #include "llvm/ADT/Twine.h"
21 #include "llvm/Analysis/BlockFrequencyInfo.h"
22 #include "llvm/Analysis/DomTreeUpdater.h"
23 #include "llvm/Analysis/GlobalsModRef.h"
24 #include "llvm/Analysis/OptimizationRemarkEmitter.h"
25 #include "llvm/Analysis/TargetLibraryInfo.h"
26 #include "llvm/IR/BasicBlock.h"
27 #include "llvm/IR/DerivedTypes.h"
28 #include "llvm/IR/Dominators.h"
29 #include "llvm/IR/Function.h"
30 #include "llvm/IR/IRBuilder.h"
31 #include "llvm/IR/InstVisitor.h"
32 #include "llvm/IR/Instruction.h"
33 #include "llvm/IR/Instructions.h"
34 #include "llvm/IR/LLVMContext.h"
35 #include "llvm/IR/PassManager.h"
36 #include "llvm/IR/Type.h"
37 #include "llvm/InitializePasses.h"
38 #include "llvm/Pass.h"
39 #include "llvm/PassRegistry.h"
40 #include "llvm/ProfileData/InstrProf.h"
41 #define INSTR_PROF_VALUE_PROF_MEMOP_API
42 #include "llvm/ProfileData/InstrProfData.inc"
43 #include "llvm/Support/Casting.h"
44 #include "llvm/Support/CommandLine.h"
45 #include "llvm/Support/Debug.h"
46 #include "llvm/Support/ErrorHandling.h"
47 #include "llvm/Support/MathExtras.h"
48 #include "llvm/Transforms/Instrumentation.h"
49 #include "llvm/Transforms/Instrumentation/PGOInstrumentation.h"
50 #include "llvm/Transforms/Utils/BasicBlockUtils.h"
51 #include <cassert>
52 #include <cstdint>
53 #include <vector>
54 
55 using namespace llvm;
56 
57 #define DEBUG_TYPE "pgo-memop-opt"
58 
59 STATISTIC(NumOfPGOMemOPOpt, "Number of memop intrinsics optimized.");
60 STATISTIC(NumOfPGOMemOPAnnotate, "Number of memop intrinsics annotated.");
61 
62 // The minimum call count to optimize memory intrinsic calls.
63 static cl::opt<unsigned>
64     MemOPCountThreshold("pgo-memop-count-threshold", cl::Hidden, cl::ZeroOrMore,
65                         cl::init(1000),
66                         cl::desc("The minimum count to optimize memory "
67                                  "intrinsic calls"));
68 
69 // Command line option to disable memory intrinsic optimization. The default is
70 // false. This is for debug purpose.
71 static cl::opt<bool> DisableMemOPOPT("disable-memop-opt", cl::init(false),
72                                      cl::Hidden, cl::desc("Disable optimize"));
73 
74 // The percent threshold to optimize memory intrinsic calls.
75 static cl::opt<unsigned>
76     MemOPPercentThreshold("pgo-memop-percent-threshold", cl::init(40),
77                           cl::Hidden, cl::ZeroOrMore,
78                           cl::desc("The percentage threshold for the "
79                                    "memory intrinsic calls optimization"));
80 
81 // Maximum number of versions for optimizing memory intrinsic call.
82 static cl::opt<unsigned>
83     MemOPMaxVersion("pgo-memop-max-version", cl::init(3), cl::Hidden,
84                     cl::ZeroOrMore,
85                     cl::desc("The max version for the optimized memory "
86                              " intrinsic calls"));
87 
88 // Scale the counts from the annotation using the BB count value.
89 static cl::opt<bool>
90     MemOPScaleCount("pgo-memop-scale-count", cl::init(true), cl::Hidden,
91                     cl::desc("Scale the memop size counts using the basic "
92                              " block count value"));
93 
94 cl::opt<bool>
95     MemOPOptMemcmpBcmp("pgo-memop-optimize-memcmp-bcmp", cl::init(true),
96                        cl::Hidden,
97                        cl::desc("Size-specialize memcmp and bcmp calls"));
98 
99 static cl::opt<unsigned>
100     MemOpMaxOptSize("memop-value-prof-max-opt-size", cl::Hidden, cl::init(128),
101                     cl::desc("Optimize the memop size <= this value"));
102 
103 namespace {
104 class PGOMemOPSizeOptLegacyPass : public FunctionPass {
105 public:
106   static char ID;
107 
108   PGOMemOPSizeOptLegacyPass() : FunctionPass(ID) {
109     initializePGOMemOPSizeOptLegacyPassPass(*PassRegistry::getPassRegistry());
110   }
111 
112   StringRef getPassName() const override { return "PGOMemOPSize"; }
113 
114 private:
115   bool runOnFunction(Function &F) override;
116   void getAnalysisUsage(AnalysisUsage &AU) const override {
117     AU.addRequired<BlockFrequencyInfoWrapperPass>();
118     AU.addRequired<OptimizationRemarkEmitterWrapperPass>();
119     AU.addPreserved<GlobalsAAWrapperPass>();
120     AU.addPreserved<DominatorTreeWrapperPass>();
121     AU.addRequired<TargetLibraryInfoWrapperPass>();
122   }
123 };
124 } // end anonymous namespace
125 
126 char PGOMemOPSizeOptLegacyPass::ID = 0;
127 INITIALIZE_PASS_BEGIN(PGOMemOPSizeOptLegacyPass, "pgo-memop-opt",
128                       "Optimize memory intrinsic using its size value profile",
129                       false, false)
130 INITIALIZE_PASS_DEPENDENCY(BlockFrequencyInfoWrapperPass)
131 INITIALIZE_PASS_DEPENDENCY(TargetLibraryInfoWrapperPass)
132 INITIALIZE_PASS_END(PGOMemOPSizeOptLegacyPass, "pgo-memop-opt",
133                     "Optimize memory intrinsic using its size value profile",
134                     false, false)
135 
136 FunctionPass *llvm::createPGOMemOPSizeOptLegacyPass() {
137   return new PGOMemOPSizeOptLegacyPass();
138 }
139 
140 namespace {
141 
142 static const char *getMIName(const MemIntrinsic *MI) {
143   switch (MI->getIntrinsicID()) {
144   case Intrinsic::memcpy:
145     return "memcpy";
146   case Intrinsic::memmove:
147     return "memmove";
148   case Intrinsic::memset:
149     return "memset";
150   default:
151     return "unknown";
152   }
153 }
154 
155 // A class that abstracts a memop (memcpy, memmove, memset, memcmp and bcmp).
156 struct MemOp {
157   Instruction *I;
158   MemOp(MemIntrinsic *MI) : I(MI) {}
159   MemOp(CallInst *CI) : I(CI) {}
160   MemIntrinsic *asMI() { return dyn_cast<MemIntrinsic>(I); }
161   CallInst *asCI() { return cast<CallInst>(I); }
162   MemOp clone() {
163     if (auto MI = asMI())
164       return MemOp(cast<MemIntrinsic>(MI->clone()));
165     return MemOp(cast<CallInst>(asCI()->clone()));
166   }
167   Value *getLength() {
168     if (auto MI = asMI())
169       return MI->getLength();
170     return asCI()->getArgOperand(2);
171   }
172   void setLength(Value *Length) {
173     if (auto MI = asMI())
174       return MI->setLength(Length);
175     asCI()->setArgOperand(2, Length);
176   }
177   StringRef getFuncName() {
178     if (auto MI = asMI())
179       return MI->getCalledFunction()->getName();
180     return asCI()->getCalledFunction()->getName();
181   }
182   bool isMemmove() {
183     if (auto MI = asMI())
184       if (MI->getIntrinsicID() == Intrinsic::memmove)
185         return true;
186     return false;
187   }
188   bool isMemcmp(TargetLibraryInfo &TLI) {
189     LibFunc Func;
190     if (asMI() == nullptr && TLI.getLibFunc(*asCI(), Func) &&
191         Func == LibFunc_memcmp) {
192       return true;
193     }
194     return false;
195   }
196   bool isBcmp(TargetLibraryInfo &TLI) {
197     LibFunc Func;
198     if (asMI() == nullptr && TLI.getLibFunc(*asCI(), Func) &&
199         Func == LibFunc_bcmp) {
200       return true;
201     }
202     return false;
203   }
204   const char *getName(TargetLibraryInfo &TLI) {
205     if (auto MI = asMI())
206       return getMIName(MI);
207     LibFunc Func;
208     if (TLI.getLibFunc(*asCI(), Func)) {
209       if (Func == LibFunc_memcmp)
210         return "memcmp";
211       if (Func == LibFunc_bcmp)
212         return "bcmp";
213     }
214     llvm_unreachable("Must be MemIntrinsic or memcmp/bcmp CallInst");
215     return nullptr;
216   }
217 };
218 
219 class MemOPSizeOpt : public InstVisitor<MemOPSizeOpt> {
220 public:
221   MemOPSizeOpt(Function &Func, BlockFrequencyInfo &BFI,
222                OptimizationRemarkEmitter &ORE, DominatorTree *DT,
223                TargetLibraryInfo &TLI)
224       : Func(Func), BFI(BFI), ORE(ORE), DT(DT), TLI(TLI), Changed(false) {
225     ValueDataArray =
226         std::make_unique<InstrProfValueData[]>(INSTR_PROF_NUM_BUCKETS);
227   }
228   bool isChanged() const { return Changed; }
229   void perform() {
230     WorkList.clear();
231     visit(Func);
232 
233     for (auto &MO : WorkList) {
234       ++NumOfPGOMemOPAnnotate;
235       if (perform(MO)) {
236         Changed = true;
237         ++NumOfPGOMemOPOpt;
238         LLVM_DEBUG(dbgs() << "MemOP call: " << MO.getFuncName()
239                           << "is Transformed.\n");
240       }
241     }
242   }
243 
244   void visitMemIntrinsic(MemIntrinsic &MI) {
245     Value *Length = MI.getLength();
246     // Not perform on constant length calls.
247     if (isa<ConstantInt>(Length))
248       return;
249     WorkList.push_back(MemOp(&MI));
250   }
251 
252   void visitCallInst(CallInst &CI) {
253     LibFunc Func;
254     if (TLI.getLibFunc(CI, Func) &&
255         (Func == LibFunc_memcmp || Func == LibFunc_bcmp) &&
256         !isa<ConstantInt>(CI.getArgOperand(2))) {
257       WorkList.push_back(MemOp(&CI));
258     }
259   }
260 
261 private:
262   Function &Func;
263   BlockFrequencyInfo &BFI;
264   OptimizationRemarkEmitter &ORE;
265   DominatorTree *DT;
266   TargetLibraryInfo &TLI;
267   bool Changed;
268   std::vector<MemOp> WorkList;
269   // The space to read the profile annotation.
270   std::unique_ptr<InstrProfValueData[]> ValueDataArray;
271   bool perform(MemOp MO);
272 };
273 
274 static bool isProfitable(uint64_t Count, uint64_t TotalCount) {
275   assert(Count <= TotalCount);
276   if (Count < MemOPCountThreshold)
277     return false;
278   if (Count < TotalCount * MemOPPercentThreshold / 100)
279     return false;
280   return true;
281 }
282 
283 static inline uint64_t getScaledCount(uint64_t Count, uint64_t Num,
284                                       uint64_t Denom) {
285   if (!MemOPScaleCount)
286     return Count;
287   bool Overflowed;
288   uint64_t ScaleCount = SaturatingMultiply(Count, Num, &Overflowed);
289   return ScaleCount / Denom;
290 }
291 
292 bool MemOPSizeOpt::perform(MemOp MO) {
293   assert(MO.I);
294   if (MO.isMemmove())
295     return false;
296   if (!MemOPOptMemcmpBcmp && (MO.isMemcmp(TLI) || MO.isBcmp(TLI)))
297     return false;
298 
299   uint32_t NumVals, MaxNumVals = INSTR_PROF_NUM_BUCKETS;
300   uint64_t TotalCount;
301   if (!getValueProfDataFromInst(*MO.I, IPVK_MemOPSize, MaxNumVals,
302                                 ValueDataArray.get(), NumVals, TotalCount))
303     return false;
304 
305   uint64_t ActualCount = TotalCount;
306   uint64_t SavedTotalCount = TotalCount;
307   if (MemOPScaleCount) {
308     auto BBEdgeCount = BFI.getBlockProfileCount(MO.I->getParent());
309     if (!BBEdgeCount)
310       return false;
311     ActualCount = *BBEdgeCount;
312   }
313 
314   ArrayRef<InstrProfValueData> VDs(ValueDataArray.get(), NumVals);
315   LLVM_DEBUG(dbgs() << "Read one memory intrinsic profile with count "
316                     << ActualCount << "\n");
317   LLVM_DEBUG(
318       for (auto &VD
319            : VDs) { dbgs() << "  (" << VD.Value << "," << VD.Count << ")\n"; });
320 
321   if (ActualCount < MemOPCountThreshold)
322     return false;
323   // Skip if the total value profiled count is 0, in which case we can't
324   // scale up the counts properly (and there is no profitable transformation).
325   if (TotalCount == 0)
326     return false;
327 
328   TotalCount = ActualCount;
329   if (MemOPScaleCount)
330     LLVM_DEBUG(dbgs() << "Scale counts: numerator = " << ActualCount
331                       << " denominator = " << SavedTotalCount << "\n");
332 
333   // Keeping track of the count of the default case:
334   uint64_t RemainCount = TotalCount;
335   uint64_t SavedRemainCount = SavedTotalCount;
336   SmallVector<uint64_t, 16> SizeIds;
337   SmallVector<uint64_t, 16> CaseCounts;
338   uint64_t MaxCount = 0;
339   unsigned Version = 0;
340   int64_t LastV = -1;
341   // Default case is in the front -- save the slot here.
342   CaseCounts.push_back(0);
343   SmallVector<InstrProfValueData, 24> RemainingVDs;
344   for (auto I = VDs.begin(), E = VDs.end(); I != E; ++I) {
345     auto &VD = *I;
346     int64_t V = VD.Value;
347     uint64_t C = VD.Count;
348     if (MemOPScaleCount)
349       C = getScaledCount(C, ActualCount, SavedTotalCount);
350 
351     if (!InstrProfIsSingleValRange(V) || V > MemOpMaxOptSize) {
352       RemainingVDs.push_back(VD);
353       continue;
354     }
355 
356     // ValueCounts are sorted on the count. Break at the first un-profitable
357     // value.
358     if (!isProfitable(C, RemainCount)) {
359       RemainingVDs.insert(RemainingVDs.end(), I, E);
360       break;
361     }
362 
363     if (V == LastV) {
364       LLVM_DEBUG(dbgs() << "Invalid Profile Data in Function " << Func.getName()
365                         << ": Two consecutive, identical values in MemOp value"
366                            "counts.\n");
367       return false;
368     }
369 
370     LastV = V;
371 
372     SizeIds.push_back(V);
373     CaseCounts.push_back(C);
374     if (C > MaxCount)
375       MaxCount = C;
376 
377     assert(RemainCount >= C);
378     RemainCount -= C;
379     assert(SavedRemainCount >= VD.Count);
380     SavedRemainCount -= VD.Count;
381 
382     if (++Version >= MemOPMaxVersion && MemOPMaxVersion != 0) {
383       RemainingVDs.insert(RemainingVDs.end(), I + 1, E);
384       break;
385     }
386   }
387 
388   if (Version == 0)
389     return false;
390 
391   CaseCounts[0] = RemainCount;
392   if (RemainCount > MaxCount)
393     MaxCount = RemainCount;
394 
395   uint64_t SumForOpt = TotalCount - RemainCount;
396 
397   LLVM_DEBUG(dbgs() << "Optimize one memory intrinsic call to " << Version
398                     << " Versions (covering " << SumForOpt << " out of "
399                     << TotalCount << ")\n");
400 
401   // mem_op(..., size)
402   // ==>
403   // switch (size) {
404   //   case s1:
405   //      mem_op(..., s1);
406   //      goto merge_bb;
407   //   case s2:
408   //      mem_op(..., s2);
409   //      goto merge_bb;
410   //   ...
411   //   default:
412   //      mem_op(..., size);
413   //      goto merge_bb;
414   // }
415   // merge_bb:
416 
417   BasicBlock *BB = MO.I->getParent();
418   LLVM_DEBUG(dbgs() << "\n\n== Basic Block Before ==\n");
419   LLVM_DEBUG(dbgs() << *BB << "\n");
420   auto OrigBBFreq = BFI.getBlockFreq(BB);
421 
422   BasicBlock *DefaultBB = SplitBlock(BB, MO.I, DT);
423   BasicBlock::iterator It(*MO.I);
424   ++It;
425   assert(It != DefaultBB->end());
426   BasicBlock *MergeBB = SplitBlock(DefaultBB, &(*It), DT);
427   MergeBB->setName("MemOP.Merge");
428   BFI.setBlockFreq(MergeBB, OrigBBFreq.getFrequency());
429   DefaultBB->setName("MemOP.Default");
430 
431   DomTreeUpdater DTU(DT, DomTreeUpdater::UpdateStrategy::Eager);
432   auto &Ctx = Func.getContext();
433   IRBuilder<> IRB(BB);
434   BB->getTerminator()->eraseFromParent();
435   Value *SizeVar = MO.getLength();
436   SwitchInst *SI = IRB.CreateSwitch(SizeVar, DefaultBB, SizeIds.size());
437   Type *MemOpTy = MO.I->getType();
438   PHINode *PHI = nullptr;
439   if (!MemOpTy->isVoidTy()) {
440     // Insert a phi for the return values at the merge block.
441     IRBuilder<> IRBM(MergeBB->getFirstNonPHI());
442     PHI = IRBM.CreatePHI(MemOpTy, SizeIds.size() + 1, "MemOP.RVMerge");
443     MO.I->replaceAllUsesWith(PHI);
444     PHI->addIncoming(MO.I, DefaultBB);
445   }
446 
447   // Clear the value profile data.
448   MO.I->setMetadata(LLVMContext::MD_prof, nullptr);
449   // If all promoted, we don't need the MD.prof metadata.
450   if (SavedRemainCount > 0 || Version != NumVals) {
451     // Otherwise we need update with the un-promoted records back.
452     ArrayRef<InstrProfValueData> RemVDs(RemainingVDs);
453     annotateValueSite(*Func.getParent(), *MO.I, RemVDs, SavedRemainCount,
454                       IPVK_MemOPSize, NumVals);
455   }
456 
457   LLVM_DEBUG(dbgs() << "\n\n== Basic Block After==\n");
458 
459   std::vector<DominatorTree::UpdateType> Updates;
460   if (DT)
461     Updates.reserve(2 * SizeIds.size());
462 
463   for (uint64_t SizeId : SizeIds) {
464     BasicBlock *CaseBB = BasicBlock::Create(
465         Ctx, Twine("MemOP.Case.") + Twine(SizeId), &Func, DefaultBB);
466     MemOp NewMO = MO.clone();
467     // Fix the argument.
468     auto *SizeType = dyn_cast<IntegerType>(NewMO.getLength()->getType());
469     assert(SizeType && "Expected integer type size argument.");
470     ConstantInt *CaseSizeId = ConstantInt::get(SizeType, SizeId);
471     NewMO.setLength(CaseSizeId);
472     CaseBB->getInstList().push_back(NewMO.I);
473     IRBuilder<> IRBCase(CaseBB);
474     IRBCase.CreateBr(MergeBB);
475     SI->addCase(CaseSizeId, CaseBB);
476     if (!MemOpTy->isVoidTy())
477       PHI->addIncoming(NewMO.I, CaseBB);
478     if (DT) {
479       Updates.push_back({DominatorTree::Insert, CaseBB, MergeBB});
480       Updates.push_back({DominatorTree::Insert, BB, CaseBB});
481     }
482     LLVM_DEBUG(dbgs() << *CaseBB << "\n");
483   }
484   DTU.applyUpdates(Updates);
485   Updates.clear();
486 
487   setProfMetadata(Func.getParent(), SI, CaseCounts, MaxCount);
488 
489   LLVM_DEBUG(dbgs() << *BB << "\n");
490   LLVM_DEBUG(dbgs() << *DefaultBB << "\n");
491   LLVM_DEBUG(dbgs() << *MergeBB << "\n");
492 
493   ORE.emit([&]() {
494     using namespace ore;
495     return OptimizationRemark(DEBUG_TYPE, "memopt-opt", MO.I)
496            << "optimized " << NV("Memop", MO.getName(TLI)) << " with count "
497            << NV("Count", SumForOpt) << " out of " << NV("Total", TotalCount)
498            << " for " << NV("Versions", Version) << " versions";
499   });
500 
501   return true;
502 }
503 } // namespace
504 
505 static bool PGOMemOPSizeOptImpl(Function &F, BlockFrequencyInfo &BFI,
506                                 OptimizationRemarkEmitter &ORE,
507                                 DominatorTree *DT, TargetLibraryInfo &TLI) {
508   if (DisableMemOPOPT)
509     return false;
510 
511   if (F.hasFnAttribute(Attribute::OptimizeForSize))
512     return false;
513   MemOPSizeOpt MemOPSizeOpt(F, BFI, ORE, DT, TLI);
514   MemOPSizeOpt.perform();
515   return MemOPSizeOpt.isChanged();
516 }
517 
518 bool PGOMemOPSizeOptLegacyPass::runOnFunction(Function &F) {
519   BlockFrequencyInfo &BFI =
520       getAnalysis<BlockFrequencyInfoWrapperPass>().getBFI();
521   auto &ORE = getAnalysis<OptimizationRemarkEmitterWrapperPass>().getORE();
522   auto *DTWP = getAnalysisIfAvailable<DominatorTreeWrapperPass>();
523   DominatorTree *DT = DTWP ? &DTWP->getDomTree() : nullptr;
524   TargetLibraryInfo &TLI =
525       getAnalysis<TargetLibraryInfoWrapperPass>().getTLI(F);
526   return PGOMemOPSizeOptImpl(F, BFI, ORE, DT, TLI);
527 }
528 
529 namespace llvm {
530 char &PGOMemOPSizeOptID = PGOMemOPSizeOptLegacyPass::ID;
531 
532 PreservedAnalyses PGOMemOPSizeOpt::run(Function &F,
533                                        FunctionAnalysisManager &FAM) {
534   auto &BFI = FAM.getResult<BlockFrequencyAnalysis>(F);
535   auto &ORE = FAM.getResult<OptimizationRemarkEmitterAnalysis>(F);
536   auto *DT = FAM.getCachedResult<DominatorTreeAnalysis>(F);
537   auto &TLI = FAM.getResult<TargetLibraryAnalysis>(F);
538   bool Changed = PGOMemOPSizeOptImpl(F, BFI, ORE, DT, TLI);
539   if (!Changed)
540     return PreservedAnalyses::all();
541   auto PA = PreservedAnalyses();
542   PA.preserve<DominatorTreeAnalysis>();
543   return PA;
544 }
545 } // namespace llvm
546