1 //===-- PGOMemOPSizeOpt.cpp - Optimizations based on value profiling ===//
2 //
3 // Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
4 // See https://llvm.org/LICENSE.txt for license information.
5 // SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
6 //
7 //===----------------------------------------------------------------------===//
8 //
9 // This file implements the transformation that optimizes memory intrinsics
10 // such as memcpy using the size value profile. When memory intrinsic size
11 // value profile metadata is available, a single memory intrinsic is expanded
12 // to a sequence of guarded specialized versions that are called with the
13 // hottest size(s), for later expansion into more optimal inline sequences.
14 //
15 //===----------------------------------------------------------------------===//
16 
17 #include "llvm/ADT/ArrayRef.h"
18 #include "llvm/ADT/Statistic.h"
19 #include "llvm/ADT/StringRef.h"
20 #include "llvm/ADT/Twine.h"
21 #include "llvm/Analysis/BlockFrequencyInfo.h"
22 #include "llvm/Analysis/DomTreeUpdater.h"
23 #include "llvm/Analysis/GlobalsModRef.h"
24 #include "llvm/Analysis/OptimizationRemarkEmitter.h"
25 #include "llvm/Analysis/TargetLibraryInfo.h"
26 #include "llvm/IR/BasicBlock.h"
27 #include "llvm/IR/DerivedTypes.h"
28 #include "llvm/IR/Dominators.h"
29 #include "llvm/IR/Function.h"
30 #include "llvm/IR/IRBuilder.h"
31 #include "llvm/IR/InstVisitor.h"
32 #include "llvm/IR/Instruction.h"
33 #include "llvm/IR/Instructions.h"
34 #include "llvm/IR/LLVMContext.h"
35 #include "llvm/IR/PassManager.h"
36 #include "llvm/IR/Type.h"
37 #include "llvm/InitializePasses.h"
38 #include "llvm/Pass.h"
39 #include "llvm/PassRegistry.h"
40 #include "llvm/ProfileData/InstrProf.h"
41 #define INSTR_PROF_VALUE_PROF_MEMOP_API
42 #include "llvm/ProfileData/InstrProfData.inc"
43 #include "llvm/Support/Casting.h"
44 #include "llvm/Support/CommandLine.h"
45 #include "llvm/Support/Debug.h"
46 #include "llvm/Support/ErrorHandling.h"
47 #include "llvm/Support/MathExtras.h"
48 #include "llvm/Transforms/Instrumentation.h"
49 #include "llvm/Transforms/Instrumentation/PGOInstrumentation.h"
50 #include "llvm/Transforms/Utils/BasicBlockUtils.h"
51 #include <cassert>
52 #include <cstdint>
53 #include <vector>
54 
55 using namespace llvm;
56 
57 #define DEBUG_TYPE "pgo-memop-opt"
58 
59 STATISTIC(NumOfPGOMemOPOpt, "Number of memop intrinsics optimized.");
60 STATISTIC(NumOfPGOMemOPAnnotate, "Number of memop intrinsics annotated.");
61 
62 // The minimum call count to optimize memory intrinsic calls.
63 static cl::opt<unsigned>
64     MemOPCountThreshold("pgo-memop-count-threshold", cl::Hidden, cl::ZeroOrMore,
65                         cl::init(1000),
66                         cl::desc("The minimum count to optimize memory "
67                                  "intrinsic calls"));
68 
69 // Command line option to disable memory intrinsic optimization. The default is
70 // false. This is for debug purpose.
71 static cl::opt<bool> DisableMemOPOPT("disable-memop-opt", cl::init(false),
72                                      cl::Hidden, cl::desc("Disable optimize"));
73 
74 // The percent threshold to optimize memory intrinsic calls.
75 static cl::opt<unsigned>
76     MemOPPercentThreshold("pgo-memop-percent-threshold", cl::init(40),
77                           cl::Hidden, cl::ZeroOrMore,
78                           cl::desc("The percentage threshold for the "
79                                    "memory intrinsic calls optimization"));
80 
81 // Maximum number of versions for optimizing memory intrinsic call.
82 static cl::opt<unsigned>
83     MemOPMaxVersion("pgo-memop-max-version", cl::init(3), cl::Hidden,
84                     cl::ZeroOrMore,
85                     cl::desc("The max version for the optimized memory "
86                              " intrinsic calls"));
87 
88 // Scale the counts from the annotation using the BB count value.
89 static cl::opt<bool>
90     MemOPScaleCount("pgo-memop-scale-count", cl::init(true), cl::Hidden,
91                     cl::desc("Scale the memop size counts using the basic "
92                              " block count value"));
93 
94 cl::opt<bool>
95     MemOPOptMemcmpBcmp("pgo-memop-optimize-memcmp-bcmp", cl::init(true),
96                        cl::Hidden,
97                        cl::desc("Size-specialize memcmp and bcmp calls"));
98 
99 static cl::opt<unsigned>
100     MemOpMaxOptSize("memop-value-prof-max-opt-size", cl::Hidden, cl::init(128),
101                     cl::desc("Optimize the memop size <= this value"));
102 
103 namespace {
104 
105 static const char *getMIName(const MemIntrinsic *MI) {
106   switch (MI->getIntrinsicID()) {
107   case Intrinsic::memcpy:
108     return "memcpy";
109   case Intrinsic::memmove:
110     return "memmove";
111   case Intrinsic::memset:
112     return "memset";
113   default:
114     return "unknown";
115   }
116 }
117 
118 // A class that abstracts a memop (memcpy, memmove, memset, memcmp and bcmp).
119 struct MemOp {
120   Instruction *I;
121   MemOp(MemIntrinsic *MI) : I(MI) {}
122   MemOp(CallInst *CI) : I(CI) {}
123   MemIntrinsic *asMI() { return dyn_cast<MemIntrinsic>(I); }
124   CallInst *asCI() { return cast<CallInst>(I); }
125   MemOp clone() {
126     if (auto MI = asMI())
127       return MemOp(cast<MemIntrinsic>(MI->clone()));
128     return MemOp(cast<CallInst>(asCI()->clone()));
129   }
130   Value *getLength() {
131     if (auto MI = asMI())
132       return MI->getLength();
133     return asCI()->getArgOperand(2);
134   }
135   void setLength(Value *Length) {
136     if (auto MI = asMI())
137       return MI->setLength(Length);
138     asCI()->setArgOperand(2, Length);
139   }
140   StringRef getFuncName() {
141     if (auto MI = asMI())
142       return MI->getCalledFunction()->getName();
143     return asCI()->getCalledFunction()->getName();
144   }
145   bool isMemmove() {
146     if (auto MI = asMI())
147       if (MI->getIntrinsicID() == Intrinsic::memmove)
148         return true;
149     return false;
150   }
151   bool isMemcmp(TargetLibraryInfo &TLI) {
152     LibFunc Func;
153     if (asMI() == nullptr && TLI.getLibFunc(*asCI(), Func) &&
154         Func == LibFunc_memcmp) {
155       return true;
156     }
157     return false;
158   }
159   bool isBcmp(TargetLibraryInfo &TLI) {
160     LibFunc Func;
161     if (asMI() == nullptr && TLI.getLibFunc(*asCI(), Func) &&
162         Func == LibFunc_bcmp) {
163       return true;
164     }
165     return false;
166   }
167   const char *getName(TargetLibraryInfo &TLI) {
168     if (auto MI = asMI())
169       return getMIName(MI);
170     LibFunc Func;
171     if (TLI.getLibFunc(*asCI(), Func)) {
172       if (Func == LibFunc_memcmp)
173         return "memcmp";
174       if (Func == LibFunc_bcmp)
175         return "bcmp";
176     }
177     llvm_unreachable("Must be MemIntrinsic or memcmp/bcmp CallInst");
178     return nullptr;
179   }
180 };
181 
182 class MemOPSizeOpt : public InstVisitor<MemOPSizeOpt> {
183 public:
184   MemOPSizeOpt(Function &Func, BlockFrequencyInfo &BFI,
185                OptimizationRemarkEmitter &ORE, DominatorTree *DT,
186                TargetLibraryInfo &TLI)
187       : Func(Func), BFI(BFI), ORE(ORE), DT(DT), TLI(TLI), Changed(false) {
188     ValueDataArray =
189         std::make_unique<InstrProfValueData[]>(INSTR_PROF_NUM_BUCKETS);
190   }
191   bool isChanged() const { return Changed; }
192   void perform() {
193     WorkList.clear();
194     visit(Func);
195 
196     for (auto &MO : WorkList) {
197       ++NumOfPGOMemOPAnnotate;
198       if (perform(MO)) {
199         Changed = true;
200         ++NumOfPGOMemOPOpt;
201         LLVM_DEBUG(dbgs() << "MemOP call: " << MO.getFuncName()
202                           << "is Transformed.\n");
203       }
204     }
205   }
206 
207   void visitMemIntrinsic(MemIntrinsic &MI) {
208     Value *Length = MI.getLength();
209     // Not perform on constant length calls.
210     if (isa<ConstantInt>(Length))
211       return;
212     WorkList.push_back(MemOp(&MI));
213   }
214 
215   void visitCallInst(CallInst &CI) {
216     LibFunc Func;
217     if (TLI.getLibFunc(CI, Func) &&
218         (Func == LibFunc_memcmp || Func == LibFunc_bcmp) &&
219         !isa<ConstantInt>(CI.getArgOperand(2))) {
220       WorkList.push_back(MemOp(&CI));
221     }
222   }
223 
224 private:
225   Function &Func;
226   BlockFrequencyInfo &BFI;
227   OptimizationRemarkEmitter &ORE;
228   DominatorTree *DT;
229   TargetLibraryInfo &TLI;
230   bool Changed;
231   std::vector<MemOp> WorkList;
232   // The space to read the profile annotation.
233   std::unique_ptr<InstrProfValueData[]> ValueDataArray;
234   bool perform(MemOp MO);
235 };
236 
237 static bool isProfitable(uint64_t Count, uint64_t TotalCount) {
238   assert(Count <= TotalCount);
239   if (Count < MemOPCountThreshold)
240     return false;
241   if (Count < TotalCount * MemOPPercentThreshold / 100)
242     return false;
243   return true;
244 }
245 
246 static inline uint64_t getScaledCount(uint64_t Count, uint64_t Num,
247                                       uint64_t Denom) {
248   if (!MemOPScaleCount)
249     return Count;
250   bool Overflowed;
251   uint64_t ScaleCount = SaturatingMultiply(Count, Num, &Overflowed);
252   return ScaleCount / Denom;
253 }
254 
255 bool MemOPSizeOpt::perform(MemOp MO) {
256   assert(MO.I);
257   if (MO.isMemmove())
258     return false;
259   if (!MemOPOptMemcmpBcmp && (MO.isMemcmp(TLI) || MO.isBcmp(TLI)))
260     return false;
261 
262   uint32_t NumVals, MaxNumVals = INSTR_PROF_NUM_BUCKETS;
263   uint64_t TotalCount;
264   if (!getValueProfDataFromInst(*MO.I, IPVK_MemOPSize, MaxNumVals,
265                                 ValueDataArray.get(), NumVals, TotalCount))
266     return false;
267 
268   uint64_t ActualCount = TotalCount;
269   uint64_t SavedTotalCount = TotalCount;
270   if (MemOPScaleCount) {
271     auto BBEdgeCount = BFI.getBlockProfileCount(MO.I->getParent());
272     if (!BBEdgeCount)
273       return false;
274     ActualCount = *BBEdgeCount;
275   }
276 
277   ArrayRef<InstrProfValueData> VDs(ValueDataArray.get(), NumVals);
278   LLVM_DEBUG(dbgs() << "Read one memory intrinsic profile with count "
279                     << ActualCount << "\n");
280   LLVM_DEBUG(
281       for (auto &VD
282            : VDs) { dbgs() << "  (" << VD.Value << "," << VD.Count << ")\n"; });
283 
284   if (ActualCount < MemOPCountThreshold)
285     return false;
286   // Skip if the total value profiled count is 0, in which case we can't
287   // scale up the counts properly (and there is no profitable transformation).
288   if (TotalCount == 0)
289     return false;
290 
291   TotalCount = ActualCount;
292   if (MemOPScaleCount)
293     LLVM_DEBUG(dbgs() << "Scale counts: numerator = " << ActualCount
294                       << " denominator = " << SavedTotalCount << "\n");
295 
296   // Keeping track of the count of the default case:
297   uint64_t RemainCount = TotalCount;
298   uint64_t SavedRemainCount = SavedTotalCount;
299   SmallVector<uint64_t, 16> SizeIds;
300   SmallVector<uint64_t, 16> CaseCounts;
301   uint64_t MaxCount = 0;
302   unsigned Version = 0;
303   int64_t LastV = -1;
304   // Default case is in the front -- save the slot here.
305   CaseCounts.push_back(0);
306   SmallVector<InstrProfValueData, 24> RemainingVDs;
307   for (auto I = VDs.begin(), E = VDs.end(); I != E; ++I) {
308     auto &VD = *I;
309     int64_t V = VD.Value;
310     uint64_t C = VD.Count;
311     if (MemOPScaleCount)
312       C = getScaledCount(C, ActualCount, SavedTotalCount);
313 
314     if (!InstrProfIsSingleValRange(V) || V > MemOpMaxOptSize) {
315       RemainingVDs.push_back(VD);
316       continue;
317     }
318 
319     // ValueCounts are sorted on the count. Break at the first un-profitable
320     // value.
321     if (!isProfitable(C, RemainCount)) {
322       RemainingVDs.insert(RemainingVDs.end(), I, E);
323       break;
324     }
325 
326     if (V == LastV) {
327       LLVM_DEBUG(dbgs() << "Invalid Profile Data in Function " << Func.getName()
328                         << ": Two consecutive, identical values in MemOp value"
329                            "counts.\n");
330       return false;
331     }
332 
333     LastV = V;
334 
335     SizeIds.push_back(V);
336     CaseCounts.push_back(C);
337     if (C > MaxCount)
338       MaxCount = C;
339 
340     assert(RemainCount >= C);
341     RemainCount -= C;
342     assert(SavedRemainCount >= VD.Count);
343     SavedRemainCount -= VD.Count;
344 
345     if (++Version >= MemOPMaxVersion && MemOPMaxVersion != 0) {
346       RemainingVDs.insert(RemainingVDs.end(), I + 1, E);
347       break;
348     }
349   }
350 
351   if (Version == 0)
352     return false;
353 
354   CaseCounts[0] = RemainCount;
355   if (RemainCount > MaxCount)
356     MaxCount = RemainCount;
357 
358   uint64_t SumForOpt = TotalCount - RemainCount;
359 
360   LLVM_DEBUG(dbgs() << "Optimize one memory intrinsic call to " << Version
361                     << " Versions (covering " << SumForOpt << " out of "
362                     << TotalCount << ")\n");
363 
364   // mem_op(..., size)
365   // ==>
366   // switch (size) {
367   //   case s1:
368   //      mem_op(..., s1);
369   //      goto merge_bb;
370   //   case s2:
371   //      mem_op(..., s2);
372   //      goto merge_bb;
373   //   ...
374   //   default:
375   //      mem_op(..., size);
376   //      goto merge_bb;
377   // }
378   // merge_bb:
379 
380   BasicBlock *BB = MO.I->getParent();
381   LLVM_DEBUG(dbgs() << "\n\n== Basic Block Before ==\n");
382   LLVM_DEBUG(dbgs() << *BB << "\n");
383   auto OrigBBFreq = BFI.getBlockFreq(BB);
384 
385   BasicBlock *DefaultBB = SplitBlock(BB, MO.I, DT);
386   BasicBlock::iterator It(*MO.I);
387   ++It;
388   assert(It != DefaultBB->end());
389   BasicBlock *MergeBB = SplitBlock(DefaultBB, &(*It), DT);
390   MergeBB->setName("MemOP.Merge");
391   BFI.setBlockFreq(MergeBB, OrigBBFreq.getFrequency());
392   DefaultBB->setName("MemOP.Default");
393 
394   DomTreeUpdater DTU(DT, DomTreeUpdater::UpdateStrategy::Eager);
395   auto &Ctx = Func.getContext();
396   IRBuilder<> IRB(BB);
397   BB->getTerminator()->eraseFromParent();
398   Value *SizeVar = MO.getLength();
399   SwitchInst *SI = IRB.CreateSwitch(SizeVar, DefaultBB, SizeIds.size());
400   Type *MemOpTy = MO.I->getType();
401   PHINode *PHI = nullptr;
402   if (!MemOpTy->isVoidTy()) {
403     // Insert a phi for the return values at the merge block.
404     IRBuilder<> IRBM(MergeBB->getFirstNonPHI());
405     PHI = IRBM.CreatePHI(MemOpTy, SizeIds.size() + 1, "MemOP.RVMerge");
406     MO.I->replaceAllUsesWith(PHI);
407     PHI->addIncoming(MO.I, DefaultBB);
408   }
409 
410   // Clear the value profile data.
411   MO.I->setMetadata(LLVMContext::MD_prof, nullptr);
412   // If all promoted, we don't need the MD.prof metadata.
413   if (SavedRemainCount > 0 || Version != NumVals) {
414     // Otherwise we need update with the un-promoted records back.
415     ArrayRef<InstrProfValueData> RemVDs(RemainingVDs);
416     annotateValueSite(*Func.getParent(), *MO.I, RemVDs, SavedRemainCount,
417                       IPVK_MemOPSize, NumVals);
418   }
419 
420   LLVM_DEBUG(dbgs() << "\n\n== Basic Block After==\n");
421 
422   std::vector<DominatorTree::UpdateType> Updates;
423   if (DT)
424     Updates.reserve(2 * SizeIds.size());
425 
426   for (uint64_t SizeId : SizeIds) {
427     BasicBlock *CaseBB = BasicBlock::Create(
428         Ctx, Twine("MemOP.Case.") + Twine(SizeId), &Func, DefaultBB);
429     MemOp NewMO = MO.clone();
430     // Fix the argument.
431     auto *SizeType = dyn_cast<IntegerType>(NewMO.getLength()->getType());
432     assert(SizeType && "Expected integer type size argument.");
433     ConstantInt *CaseSizeId = ConstantInt::get(SizeType, SizeId);
434     NewMO.setLength(CaseSizeId);
435     CaseBB->getInstList().push_back(NewMO.I);
436     IRBuilder<> IRBCase(CaseBB);
437     IRBCase.CreateBr(MergeBB);
438     SI->addCase(CaseSizeId, CaseBB);
439     if (!MemOpTy->isVoidTy())
440       PHI->addIncoming(NewMO.I, CaseBB);
441     if (DT) {
442       Updates.push_back({DominatorTree::Insert, CaseBB, MergeBB});
443       Updates.push_back({DominatorTree::Insert, BB, CaseBB});
444     }
445     LLVM_DEBUG(dbgs() << *CaseBB << "\n");
446   }
447   DTU.applyUpdates(Updates);
448   Updates.clear();
449 
450   setProfMetadata(Func.getParent(), SI, CaseCounts, MaxCount);
451 
452   LLVM_DEBUG(dbgs() << *BB << "\n");
453   LLVM_DEBUG(dbgs() << *DefaultBB << "\n");
454   LLVM_DEBUG(dbgs() << *MergeBB << "\n");
455 
456   ORE.emit([&]() {
457     using namespace ore;
458     return OptimizationRemark(DEBUG_TYPE, "memopt-opt", MO.I)
459            << "optimized " << NV("Memop", MO.getName(TLI)) << " with count "
460            << NV("Count", SumForOpt) << " out of " << NV("Total", TotalCount)
461            << " for " << NV("Versions", Version) << " versions";
462   });
463 
464   return true;
465 }
466 } // namespace
467 
468 static bool PGOMemOPSizeOptImpl(Function &F, BlockFrequencyInfo &BFI,
469                                 OptimizationRemarkEmitter &ORE,
470                                 DominatorTree *DT, TargetLibraryInfo &TLI) {
471   if (DisableMemOPOPT)
472     return false;
473 
474   if (F.hasFnAttribute(Attribute::OptimizeForSize))
475     return false;
476   MemOPSizeOpt MemOPSizeOpt(F, BFI, ORE, DT, TLI);
477   MemOPSizeOpt.perform();
478   return MemOPSizeOpt.isChanged();
479 }
480 
481 PreservedAnalyses PGOMemOPSizeOpt::run(Function &F,
482                                        FunctionAnalysisManager &FAM) {
483   auto &BFI = FAM.getResult<BlockFrequencyAnalysis>(F);
484   auto &ORE = FAM.getResult<OptimizationRemarkEmitterAnalysis>(F);
485   auto *DT = FAM.getCachedResult<DominatorTreeAnalysis>(F);
486   auto &TLI = FAM.getResult<TargetLibraryAnalysis>(F);
487   bool Changed = PGOMemOPSizeOptImpl(F, BFI, ORE, DT, TLI);
488   if (!Changed)
489     return PreservedAnalyses::all();
490   auto PA = PreservedAnalyses();
491   PA.preserve<DominatorTreeAnalysis>();
492   return PA;
493 }
494