1 //===-- PGOMemOPSizeOpt.cpp - Optimizations based on value profiling ===//
2 //
3 // Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
4 // See https://llvm.org/LICENSE.txt for license information.
5 // SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
6 //
7 //===----------------------------------------------------------------------===//
8 //
9 // This file implements the transformation that optimizes memory intrinsics
10 // such as memcpy using the size value profile. When memory intrinsic size
11 // value profile metadata is available, a single memory intrinsic is expanded
12 // to a sequence of guarded specialized versions that are called with the
13 // hottest size(s), for later expansion into more optimal inline sequences.
14 //
15 //===----------------------------------------------------------------------===//
16 
17 #include "llvm/ADT/ArrayRef.h"
18 #include "llvm/ADT/Statistic.h"
19 #include "llvm/ADT/StringRef.h"
20 #include "llvm/ADT/Twine.h"
21 #include "llvm/Analysis/BlockFrequencyInfo.h"
22 #include "llvm/Analysis/DomTreeUpdater.h"
23 #include "llvm/Analysis/OptimizationRemarkEmitter.h"
24 #include "llvm/Analysis/TargetLibraryInfo.h"
25 #include "llvm/IR/BasicBlock.h"
26 #include "llvm/IR/DerivedTypes.h"
27 #include "llvm/IR/Dominators.h"
28 #include "llvm/IR/Function.h"
29 #include "llvm/IR/IRBuilder.h"
30 #include "llvm/IR/InstVisitor.h"
31 #include "llvm/IR/Instruction.h"
32 #include "llvm/IR/Instructions.h"
33 #include "llvm/IR/LLVMContext.h"
34 #include "llvm/IR/PassManager.h"
35 #include "llvm/IR/Type.h"
36 #include "llvm/ProfileData/InstrProf.h"
37 #define INSTR_PROF_VALUE_PROF_MEMOP_API
38 #include "llvm/ProfileData/InstrProfData.inc"
39 #include "llvm/Support/Casting.h"
40 #include "llvm/Support/CommandLine.h"
41 #include "llvm/Support/Debug.h"
42 #include "llvm/Support/ErrorHandling.h"
43 #include "llvm/Support/MathExtras.h"
44 #include "llvm/Transforms/Instrumentation/PGOInstrumentation.h"
45 #include "llvm/Transforms/Utils/BasicBlockUtils.h"
46 #include <cassert>
47 #include <cstdint>
48 #include <vector>
49 
50 using namespace llvm;
51 
52 #define DEBUG_TYPE "pgo-memop-opt"
53 
54 STATISTIC(NumOfPGOMemOPOpt, "Number of memop intrinsics optimized.");
55 STATISTIC(NumOfPGOMemOPAnnotate, "Number of memop intrinsics annotated.");
56 
57 // The minimum call count to optimize memory intrinsic calls.
58 static cl::opt<unsigned>
59     MemOPCountThreshold("pgo-memop-count-threshold", cl::Hidden, cl::ZeroOrMore,
60                         cl::init(1000),
61                         cl::desc("The minimum count to optimize memory "
62                                  "intrinsic calls"));
63 
64 // Command line option to disable memory intrinsic optimization. The default is
65 // false. This is for debug purpose.
66 static cl::opt<bool> DisableMemOPOPT("disable-memop-opt", cl::init(false),
67                                      cl::Hidden, cl::desc("Disable optimize"));
68 
69 // The percent threshold to optimize memory intrinsic calls.
70 static cl::opt<unsigned>
71     MemOPPercentThreshold("pgo-memop-percent-threshold", cl::init(40),
72                           cl::Hidden, cl::ZeroOrMore,
73                           cl::desc("The percentage threshold for the "
74                                    "memory intrinsic calls optimization"));
75 
76 // Maximum number of versions for optimizing memory intrinsic call.
77 static cl::opt<unsigned>
78     MemOPMaxVersion("pgo-memop-max-version", cl::init(3), cl::Hidden,
79                     cl::ZeroOrMore,
80                     cl::desc("The max version for the optimized memory "
81                              " intrinsic calls"));
82 
83 // Scale the counts from the annotation using the BB count value.
84 static cl::opt<bool>
85     MemOPScaleCount("pgo-memop-scale-count", cl::init(true), cl::Hidden,
86                     cl::desc("Scale the memop size counts using the basic "
87                              " block count value"));
88 
89 cl::opt<bool>
90     MemOPOptMemcmpBcmp("pgo-memop-optimize-memcmp-bcmp", cl::init(true),
91                        cl::Hidden,
92                        cl::desc("Size-specialize memcmp and bcmp calls"));
93 
94 static cl::opt<unsigned>
95     MemOpMaxOptSize("memop-value-prof-max-opt-size", cl::Hidden, cl::init(128),
96                     cl::desc("Optimize the memop size <= this value"));
97 
98 namespace {
99 
100 static const char *getMIName(const MemIntrinsic *MI) {
101   switch (MI->getIntrinsicID()) {
102   case Intrinsic::memcpy:
103     return "memcpy";
104   case Intrinsic::memmove:
105     return "memmove";
106   case Intrinsic::memset:
107     return "memset";
108   default:
109     return "unknown";
110   }
111 }
112 
113 // A class that abstracts a memop (memcpy, memmove, memset, memcmp and bcmp).
114 struct MemOp {
115   Instruction *I;
116   MemOp(MemIntrinsic *MI) : I(MI) {}
117   MemOp(CallInst *CI) : I(CI) {}
118   MemIntrinsic *asMI() { return dyn_cast<MemIntrinsic>(I); }
119   CallInst *asCI() { return cast<CallInst>(I); }
120   MemOp clone() {
121     if (auto MI = asMI())
122       return MemOp(cast<MemIntrinsic>(MI->clone()));
123     return MemOp(cast<CallInst>(asCI()->clone()));
124   }
125   Value *getLength() {
126     if (auto MI = asMI())
127       return MI->getLength();
128     return asCI()->getArgOperand(2);
129   }
130   void setLength(Value *Length) {
131     if (auto MI = asMI())
132       return MI->setLength(Length);
133     asCI()->setArgOperand(2, Length);
134   }
135   StringRef getFuncName() {
136     if (auto MI = asMI())
137       return MI->getCalledFunction()->getName();
138     return asCI()->getCalledFunction()->getName();
139   }
140   bool isMemmove() {
141     if (auto MI = asMI())
142       if (MI->getIntrinsicID() == Intrinsic::memmove)
143         return true;
144     return false;
145   }
146   bool isMemcmp(TargetLibraryInfo &TLI) {
147     LibFunc Func;
148     if (asMI() == nullptr && TLI.getLibFunc(*asCI(), Func) &&
149         Func == LibFunc_memcmp) {
150       return true;
151     }
152     return false;
153   }
154   bool isBcmp(TargetLibraryInfo &TLI) {
155     LibFunc Func;
156     if (asMI() == nullptr && TLI.getLibFunc(*asCI(), Func) &&
157         Func == LibFunc_bcmp) {
158       return true;
159     }
160     return false;
161   }
162   const char *getName(TargetLibraryInfo &TLI) {
163     if (auto MI = asMI())
164       return getMIName(MI);
165     LibFunc Func;
166     if (TLI.getLibFunc(*asCI(), Func)) {
167       if (Func == LibFunc_memcmp)
168         return "memcmp";
169       if (Func == LibFunc_bcmp)
170         return "bcmp";
171     }
172     llvm_unreachable("Must be MemIntrinsic or memcmp/bcmp CallInst");
173     return nullptr;
174   }
175 };
176 
177 class MemOPSizeOpt : public InstVisitor<MemOPSizeOpt> {
178 public:
179   MemOPSizeOpt(Function &Func, BlockFrequencyInfo &BFI,
180                OptimizationRemarkEmitter &ORE, DominatorTree *DT,
181                TargetLibraryInfo &TLI)
182       : Func(Func), BFI(BFI), ORE(ORE), DT(DT), TLI(TLI), Changed(false) {
183     ValueDataArray =
184         std::make_unique<InstrProfValueData[]>(INSTR_PROF_NUM_BUCKETS);
185   }
186   bool isChanged() const { return Changed; }
187   void perform() {
188     WorkList.clear();
189     visit(Func);
190 
191     for (auto &MO : WorkList) {
192       ++NumOfPGOMemOPAnnotate;
193       if (perform(MO)) {
194         Changed = true;
195         ++NumOfPGOMemOPOpt;
196         LLVM_DEBUG(dbgs() << "MemOP call: " << MO.getFuncName()
197                           << "is Transformed.\n");
198       }
199     }
200   }
201 
202   void visitMemIntrinsic(MemIntrinsic &MI) {
203     Value *Length = MI.getLength();
204     // Not perform on constant length calls.
205     if (isa<ConstantInt>(Length))
206       return;
207     WorkList.push_back(MemOp(&MI));
208   }
209 
210   void visitCallInst(CallInst &CI) {
211     LibFunc Func;
212     if (TLI.getLibFunc(CI, Func) &&
213         (Func == LibFunc_memcmp || Func == LibFunc_bcmp) &&
214         !isa<ConstantInt>(CI.getArgOperand(2))) {
215       WorkList.push_back(MemOp(&CI));
216     }
217   }
218 
219 private:
220   Function &Func;
221   BlockFrequencyInfo &BFI;
222   OptimizationRemarkEmitter &ORE;
223   DominatorTree *DT;
224   TargetLibraryInfo &TLI;
225   bool Changed;
226   std::vector<MemOp> WorkList;
227   // The space to read the profile annotation.
228   std::unique_ptr<InstrProfValueData[]> ValueDataArray;
229   bool perform(MemOp MO);
230 };
231 
232 static bool isProfitable(uint64_t Count, uint64_t TotalCount) {
233   assert(Count <= TotalCount);
234   if (Count < MemOPCountThreshold)
235     return false;
236   if (Count < TotalCount * MemOPPercentThreshold / 100)
237     return false;
238   return true;
239 }
240 
241 static inline uint64_t getScaledCount(uint64_t Count, uint64_t Num,
242                                       uint64_t Denom) {
243   if (!MemOPScaleCount)
244     return Count;
245   bool Overflowed;
246   uint64_t ScaleCount = SaturatingMultiply(Count, Num, &Overflowed);
247   return ScaleCount / Denom;
248 }
249 
250 bool MemOPSizeOpt::perform(MemOp MO) {
251   assert(MO.I);
252   if (MO.isMemmove())
253     return false;
254   if (!MemOPOptMemcmpBcmp && (MO.isMemcmp(TLI) || MO.isBcmp(TLI)))
255     return false;
256 
257   uint32_t NumVals, MaxNumVals = INSTR_PROF_NUM_BUCKETS;
258   uint64_t TotalCount;
259   if (!getValueProfDataFromInst(*MO.I, IPVK_MemOPSize, MaxNumVals,
260                                 ValueDataArray.get(), NumVals, TotalCount))
261     return false;
262 
263   uint64_t ActualCount = TotalCount;
264   uint64_t SavedTotalCount = TotalCount;
265   if (MemOPScaleCount) {
266     auto BBEdgeCount = BFI.getBlockProfileCount(MO.I->getParent());
267     if (!BBEdgeCount)
268       return false;
269     ActualCount = *BBEdgeCount;
270   }
271 
272   ArrayRef<InstrProfValueData> VDs(ValueDataArray.get(), NumVals);
273   LLVM_DEBUG(dbgs() << "Read one memory intrinsic profile with count "
274                     << ActualCount << "\n");
275   LLVM_DEBUG(
276       for (auto &VD
277            : VDs) { dbgs() << "  (" << VD.Value << "," << VD.Count << ")\n"; });
278 
279   if (ActualCount < MemOPCountThreshold)
280     return false;
281   // Skip if the total value profiled count is 0, in which case we can't
282   // scale up the counts properly (and there is no profitable transformation).
283   if (TotalCount == 0)
284     return false;
285 
286   TotalCount = ActualCount;
287   if (MemOPScaleCount)
288     LLVM_DEBUG(dbgs() << "Scale counts: numerator = " << ActualCount
289                       << " denominator = " << SavedTotalCount << "\n");
290 
291   // Keeping track of the count of the default case:
292   uint64_t RemainCount = TotalCount;
293   uint64_t SavedRemainCount = SavedTotalCount;
294   SmallVector<uint64_t, 16> SizeIds;
295   SmallVector<uint64_t, 16> CaseCounts;
296   uint64_t MaxCount = 0;
297   unsigned Version = 0;
298   int64_t LastV = -1;
299   // Default case is in the front -- save the slot here.
300   CaseCounts.push_back(0);
301   SmallVector<InstrProfValueData, 24> RemainingVDs;
302   for (auto I = VDs.begin(), E = VDs.end(); I != E; ++I) {
303     auto &VD = *I;
304     int64_t V = VD.Value;
305     uint64_t C = VD.Count;
306     if (MemOPScaleCount)
307       C = getScaledCount(C, ActualCount, SavedTotalCount);
308 
309     if (!InstrProfIsSingleValRange(V) || V > MemOpMaxOptSize) {
310       RemainingVDs.push_back(VD);
311       continue;
312     }
313 
314     // ValueCounts are sorted on the count. Break at the first un-profitable
315     // value.
316     if (!isProfitable(C, RemainCount)) {
317       RemainingVDs.insert(RemainingVDs.end(), I, E);
318       break;
319     }
320 
321     if (V == LastV) {
322       LLVM_DEBUG(dbgs() << "Invalid Profile Data in Function " << Func.getName()
323                         << ": Two consecutive, identical values in MemOp value"
324                            "counts.\n");
325       return false;
326     }
327 
328     LastV = V;
329 
330     SizeIds.push_back(V);
331     CaseCounts.push_back(C);
332     if (C > MaxCount)
333       MaxCount = C;
334 
335     assert(RemainCount >= C);
336     RemainCount -= C;
337     assert(SavedRemainCount >= VD.Count);
338     SavedRemainCount -= VD.Count;
339 
340     if (++Version >= MemOPMaxVersion && MemOPMaxVersion != 0) {
341       RemainingVDs.insert(RemainingVDs.end(), I + 1, E);
342       break;
343     }
344   }
345 
346   if (Version == 0)
347     return false;
348 
349   CaseCounts[0] = RemainCount;
350   if (RemainCount > MaxCount)
351     MaxCount = RemainCount;
352 
353   uint64_t SumForOpt = TotalCount - RemainCount;
354 
355   LLVM_DEBUG(dbgs() << "Optimize one memory intrinsic call to " << Version
356                     << " Versions (covering " << SumForOpt << " out of "
357                     << TotalCount << ")\n");
358 
359   // mem_op(..., size)
360   // ==>
361   // switch (size) {
362   //   case s1:
363   //      mem_op(..., s1);
364   //      goto merge_bb;
365   //   case s2:
366   //      mem_op(..., s2);
367   //      goto merge_bb;
368   //   ...
369   //   default:
370   //      mem_op(..., size);
371   //      goto merge_bb;
372   // }
373   // merge_bb:
374 
375   BasicBlock *BB = MO.I->getParent();
376   LLVM_DEBUG(dbgs() << "\n\n== Basic Block Before ==\n");
377   LLVM_DEBUG(dbgs() << *BB << "\n");
378   auto OrigBBFreq = BFI.getBlockFreq(BB);
379 
380   BasicBlock *DefaultBB = SplitBlock(BB, MO.I, DT);
381   BasicBlock::iterator It(*MO.I);
382   ++It;
383   assert(It != DefaultBB->end());
384   BasicBlock *MergeBB = SplitBlock(DefaultBB, &(*It), DT);
385   MergeBB->setName("MemOP.Merge");
386   BFI.setBlockFreq(MergeBB, OrigBBFreq.getFrequency());
387   DefaultBB->setName("MemOP.Default");
388 
389   DomTreeUpdater DTU(DT, DomTreeUpdater::UpdateStrategy::Eager);
390   auto &Ctx = Func.getContext();
391   IRBuilder<> IRB(BB);
392   BB->getTerminator()->eraseFromParent();
393   Value *SizeVar = MO.getLength();
394   SwitchInst *SI = IRB.CreateSwitch(SizeVar, DefaultBB, SizeIds.size());
395   Type *MemOpTy = MO.I->getType();
396   PHINode *PHI = nullptr;
397   if (!MemOpTy->isVoidTy()) {
398     // Insert a phi for the return values at the merge block.
399     IRBuilder<> IRBM(MergeBB->getFirstNonPHI());
400     PHI = IRBM.CreatePHI(MemOpTy, SizeIds.size() + 1, "MemOP.RVMerge");
401     MO.I->replaceAllUsesWith(PHI);
402     PHI->addIncoming(MO.I, DefaultBB);
403   }
404 
405   // Clear the value profile data.
406   MO.I->setMetadata(LLVMContext::MD_prof, nullptr);
407   // If all promoted, we don't need the MD.prof metadata.
408   if (SavedRemainCount > 0 || Version != NumVals) {
409     // Otherwise we need update with the un-promoted records back.
410     ArrayRef<InstrProfValueData> RemVDs(RemainingVDs);
411     annotateValueSite(*Func.getParent(), *MO.I, RemVDs, SavedRemainCount,
412                       IPVK_MemOPSize, NumVals);
413   }
414 
415   LLVM_DEBUG(dbgs() << "\n\n== Basic Block After==\n");
416 
417   std::vector<DominatorTree::UpdateType> Updates;
418   if (DT)
419     Updates.reserve(2 * SizeIds.size());
420 
421   for (uint64_t SizeId : SizeIds) {
422     BasicBlock *CaseBB = BasicBlock::Create(
423         Ctx, Twine("MemOP.Case.") + Twine(SizeId), &Func, DefaultBB);
424     MemOp NewMO = MO.clone();
425     // Fix the argument.
426     auto *SizeType = dyn_cast<IntegerType>(NewMO.getLength()->getType());
427     assert(SizeType && "Expected integer type size argument.");
428     ConstantInt *CaseSizeId = ConstantInt::get(SizeType, SizeId);
429     NewMO.setLength(CaseSizeId);
430     CaseBB->getInstList().push_back(NewMO.I);
431     IRBuilder<> IRBCase(CaseBB);
432     IRBCase.CreateBr(MergeBB);
433     SI->addCase(CaseSizeId, CaseBB);
434     if (!MemOpTy->isVoidTy())
435       PHI->addIncoming(NewMO.I, CaseBB);
436     if (DT) {
437       Updates.push_back({DominatorTree::Insert, CaseBB, MergeBB});
438       Updates.push_back({DominatorTree::Insert, BB, CaseBB});
439     }
440     LLVM_DEBUG(dbgs() << *CaseBB << "\n");
441   }
442   DTU.applyUpdates(Updates);
443   Updates.clear();
444 
445   setProfMetadata(Func.getParent(), SI, CaseCounts, MaxCount);
446 
447   LLVM_DEBUG(dbgs() << *BB << "\n");
448   LLVM_DEBUG(dbgs() << *DefaultBB << "\n");
449   LLVM_DEBUG(dbgs() << *MergeBB << "\n");
450 
451   ORE.emit([&]() {
452     using namespace ore;
453     return OptimizationRemark(DEBUG_TYPE, "memopt-opt", MO.I)
454            << "optimized " << NV("Memop", MO.getName(TLI)) << " with count "
455            << NV("Count", SumForOpt) << " out of " << NV("Total", TotalCount)
456            << " for " << NV("Versions", Version) << " versions";
457   });
458 
459   return true;
460 }
461 } // namespace
462 
463 static bool PGOMemOPSizeOptImpl(Function &F, BlockFrequencyInfo &BFI,
464                                 OptimizationRemarkEmitter &ORE,
465                                 DominatorTree *DT, TargetLibraryInfo &TLI) {
466   if (DisableMemOPOPT)
467     return false;
468 
469   if (F.hasFnAttribute(Attribute::OptimizeForSize))
470     return false;
471   MemOPSizeOpt MemOPSizeOpt(F, BFI, ORE, DT, TLI);
472   MemOPSizeOpt.perform();
473   return MemOPSizeOpt.isChanged();
474 }
475 
476 PreservedAnalyses PGOMemOPSizeOpt::run(Function &F,
477                                        FunctionAnalysisManager &FAM) {
478   auto &BFI = FAM.getResult<BlockFrequencyAnalysis>(F);
479   auto &ORE = FAM.getResult<OptimizationRemarkEmitterAnalysis>(F);
480   auto *DT = FAM.getCachedResult<DominatorTreeAnalysis>(F);
481   auto &TLI = FAM.getResult<TargetLibraryAnalysis>(F);
482   bool Changed = PGOMemOPSizeOptImpl(F, BFI, ORE, DT, TLI);
483   if (!Changed)
484     return PreservedAnalyses::all();
485   auto PA = PreservedAnalyses();
486   PA.preserve<DominatorTreeAnalysis>();
487   return PA;
488 }
489