1 //===-- PGOMemOPSizeOpt.cpp - Optimizations based on value profiling ===//
2 //
3 //                      The LLVM Compiler Infrastructure
4 //
5 // This file is distributed under the University of Illinois Open Source
6 // License. See LICENSE.TXT for details.
7 //
8 //===----------------------------------------------------------------------===//
9 //
10 // This file implements the transformation that optimizes memory intrinsics
11 // such as memcpy using the size value profile. When memory intrinsic size
12 // value profile metadata is available, a single memory intrinsic is expanded
13 // to a sequence of guarded specialized versions that are called with the
14 // hottest size(s), for later expansion into more optimal inline sequences.
15 //
16 //===----------------------------------------------------------------------===//
17 
18 #include "llvm/ADT/ArrayRef.h"
19 #include "llvm/ADT/Statistic.h"
20 #include "llvm/ADT/StringRef.h"
21 #include "llvm/ADT/Twine.h"
22 #include "llvm/Analysis/BlockFrequencyInfo.h"
23 #include "llvm/Analysis/GlobalsModRef.h"
24 #include "llvm/IR/BasicBlock.h"
25 #include "llvm/IR/CallSite.h"
26 #include "llvm/IR/DerivedTypes.h"
27 #include "llvm/IR/DiagnosticInfo.h"
28 #include "llvm/IR/Function.h"
29 #include "llvm/IR/IRBuilder.h"
30 #include "llvm/IR/InstrTypes.h"
31 #include "llvm/IR/Instruction.h"
32 #include "llvm/IR/Instructions.h"
33 #include "llvm/IR/InstVisitor.h"
34 #include "llvm/IR/LLVMContext.h"
35 #include "llvm/IR/PassManager.h"
36 #include "llvm/IR/Type.h"
37 #include "llvm/Pass.h"
38 #include "llvm/PassRegistry.h"
39 #include "llvm/PassSupport.h"
40 #include "llvm/ProfileData/InstrProf.h"
41 #include "llvm/Support/Casting.h"
42 #include "llvm/Support/CommandLine.h"
43 #include "llvm/Support/Debug.h"
44 #include "llvm/Support/ErrorHandling.h"
45 #include "llvm/Support/MathExtras.h"
46 #include "llvm/Transforms/Instrumentation.h"
47 #include "llvm/Transforms/PGOInstrumentation.h"
48 #include "llvm/Transforms/Utils/BasicBlockUtils.h"
49 #include <cassert>
50 #include <cstdint>
51 #include <vector>
52 
53 using namespace llvm;
54 
55 #define DEBUG_TYPE "pgo-memop-opt"
56 
57 STATISTIC(NumOfPGOMemOPOpt, "Number of memop intrinsics optimized.");
58 STATISTIC(NumOfPGOMemOPAnnotate, "Number of memop intrinsics annotated.");
59 
60 // The minimum call count to optimize memory intrinsic calls.
61 static cl::opt<unsigned>
62     MemOPCountThreshold("pgo-memop-count-threshold", cl::Hidden, cl::ZeroOrMore,
63                         cl::init(1000),
64                         cl::desc("The minimum count to optimize memory "
65                                  "intrinsic calls"));
66 
67 // Command line option to disable memory intrinsic optimization. The default is
68 // false. This is for debug purpose.
69 static cl::opt<bool> DisableMemOPOPT("disable-memop-opt", cl::init(false),
70                                      cl::Hidden, cl::desc("Disable optimize"));
71 
72 // The percent threshold to optimize memory intrinsic calls.
73 static cl::opt<unsigned>
74     MemOPPercentThreshold("pgo-memop-percent-threshold", cl::init(40),
75                           cl::Hidden, cl::ZeroOrMore,
76                           cl::desc("The percentage threshold for the "
77                                    "memory intrinsic calls optimization"));
78 
79 // Maximum number of versions for optimizing memory intrinsic call.
80 static cl::opt<unsigned>
81     MemOPMaxVersion("pgo-memop-max-version", cl::init(3), cl::Hidden,
82                     cl::ZeroOrMore,
83                     cl::desc("The max version for the optimized memory "
84                              " intrinsic calls"));
85 
86 // Scale the counts from the annotation using the BB count value.
87 static cl::opt<bool>
88     MemOPScaleCount("pgo-memop-scale-count", cl::init(true), cl::Hidden,
89                     cl::desc("Scale the memop size counts using the basic "
90                              " block count value"));
91 
92 // This option sets the rangge of precise profile memop sizes.
93 extern cl::opt<std::string> MemOPSizeRange;
94 
95 // This option sets the value that groups large memop sizes
96 extern cl::opt<unsigned> MemOPSizeLarge;
97 
98 namespace {
99 class PGOMemOPSizeOptLegacyPass : public FunctionPass {
100 public:
101   static char ID;
102 
103   PGOMemOPSizeOptLegacyPass() : FunctionPass(ID) {
104     initializePGOMemOPSizeOptLegacyPassPass(*PassRegistry::getPassRegistry());
105   }
106 
107   StringRef getPassName() const override { return "PGOMemOPSize"; }
108 
109 private:
110   bool runOnFunction(Function &F) override;
111   void getAnalysisUsage(AnalysisUsage &AU) const override {
112     AU.addRequired<BlockFrequencyInfoWrapperPass>();
113     AU.addPreserved<GlobalsAAWrapperPass>();
114   }
115 };
116 } // end anonymous namespace
117 
118 char PGOMemOPSizeOptLegacyPass::ID = 0;
119 INITIALIZE_PASS_BEGIN(PGOMemOPSizeOptLegacyPass, "pgo-memop-opt",
120                       "Optimize memory intrinsic using its size value profile",
121                       false, false)
122 INITIALIZE_PASS_DEPENDENCY(BlockFrequencyInfoWrapperPass)
123 INITIALIZE_PASS_END(PGOMemOPSizeOptLegacyPass, "pgo-memop-opt",
124                     "Optimize memory intrinsic using its size value profile",
125                     false, false)
126 
127 FunctionPass *llvm::createPGOMemOPSizeOptLegacyPass() {
128   return new PGOMemOPSizeOptLegacyPass();
129 }
130 
131 namespace {
132 class MemOPSizeOpt : public InstVisitor<MemOPSizeOpt> {
133 public:
134   MemOPSizeOpt(Function &Func, BlockFrequencyInfo &BFI)
135       : Func(Func), BFI(BFI), Changed(false) {
136     ValueDataArray =
137         llvm::make_unique<InstrProfValueData[]>(MemOPMaxVersion + 2);
138     // Get the MemOPSize range information from option MemOPSizeRange,
139     getMemOPSizeRangeFromOption(MemOPSizeRange, PreciseRangeStart,
140                                 PreciseRangeLast);
141   }
142   bool isChanged() const { return Changed; }
143   void perform() {
144     WorkList.clear();
145     visit(Func);
146 
147     for (auto &MI : WorkList) {
148       ++NumOfPGOMemOPAnnotate;
149       if (perform(MI)) {
150         Changed = true;
151         ++NumOfPGOMemOPOpt;
152         DEBUG(dbgs() << "MemOP call: " << MI->getCalledFunction()->getName()
153                      << "is Transformed.\n");
154       }
155     }
156   }
157 
158   void visitMemIntrinsic(MemIntrinsic &MI) {
159     Value *Length = MI.getLength();
160     // Not perform on constant length calls.
161     if (dyn_cast<ConstantInt>(Length))
162       return;
163     WorkList.push_back(&MI);
164   }
165 
166 private:
167   Function &Func;
168   BlockFrequencyInfo &BFI;
169   bool Changed;
170   std::vector<MemIntrinsic *> WorkList;
171   // Start of the previse range.
172   int64_t PreciseRangeStart;
173   // Last value of the previse range.
174   int64_t PreciseRangeLast;
175   // The space to read the profile annotation.
176   std::unique_ptr<InstrProfValueData[]> ValueDataArray;
177   bool perform(MemIntrinsic *MI);
178 
179   // This kind shows which group the value falls in. For PreciseValue, we have
180   // the profile count for that value. LargeGroup groups the values that are in
181   // range [LargeValue, +inf). NonLargeGroup groups the rest of values.
182   enum MemOPSizeKind { PreciseValue, NonLargeGroup, LargeGroup };
183 
184   MemOPSizeKind getMemOPSizeKind(int64_t Value) const {
185     if (Value == MemOPSizeLarge && MemOPSizeLarge != 0)
186       return LargeGroup;
187     if (Value == PreciseRangeLast + 1)
188       return NonLargeGroup;
189     return PreciseValue;
190   }
191 };
192 
193 static const char *getMIName(const MemIntrinsic *MI) {
194   switch (MI->getIntrinsicID()) {
195   case Intrinsic::memcpy:
196     return "memcpy";
197   case Intrinsic::memmove:
198     return "memmove";
199   case Intrinsic::memset:
200     return "memset";
201   default:
202     return "unknown";
203   }
204 }
205 
206 static bool isProfitable(uint64_t Count, uint64_t TotalCount) {
207   assert(Count <= TotalCount);
208   if (Count < MemOPCountThreshold)
209     return false;
210   if (Count < TotalCount * MemOPPercentThreshold / 100)
211     return false;
212   return true;
213 }
214 
215 static inline uint64_t getScaledCount(uint64_t Count, uint64_t Num,
216                                       uint64_t Denom) {
217   if (!MemOPScaleCount)
218     return Count;
219   bool Overflowed;
220   uint64_t ScaleCount = SaturatingMultiply(Count, Num, &Overflowed);
221   return ScaleCount / Denom;
222 }
223 
224 bool MemOPSizeOpt::perform(MemIntrinsic *MI) {
225   assert(MI);
226   if (MI->getIntrinsicID() == Intrinsic::memmove)
227     return false;
228 
229   uint32_t NumVals, MaxNumPromotions = MemOPMaxVersion + 2;
230   uint64_t TotalCount;
231   if (!getValueProfDataFromInst(*MI, IPVK_MemOPSize, MaxNumPromotions,
232                                 ValueDataArray.get(), NumVals, TotalCount))
233     return false;
234 
235   uint64_t ActualCount = TotalCount;
236   uint64_t SavedTotalCount = TotalCount;
237   if (MemOPScaleCount) {
238     auto BBEdgeCount = BFI.getBlockProfileCount(MI->getParent());
239     if (!BBEdgeCount)
240       return false;
241     ActualCount = *BBEdgeCount;
242   }
243 
244   ArrayRef<InstrProfValueData> VDs(ValueDataArray.get(), NumVals);
245   DEBUG(dbgs() << "Read one memory intrinsic profile with count " << ActualCount
246                << "\n");
247   DEBUG(
248       for (auto &VD
249            : VDs) { dbgs() << "  (" << VD.Value << "," << VD.Count << ")\n"; });
250 
251   if (ActualCount < MemOPCountThreshold)
252     return false;
253   // Skip if the total value profiled count is 0, in which case we can't
254   // scale up the counts properly (and there is no profitable transformation).
255   if (TotalCount == 0)
256     return false;
257 
258   TotalCount = ActualCount;
259   if (MemOPScaleCount)
260     DEBUG(dbgs() << "Scale counts: numerator = " << ActualCount
261                  << " denominator = " << SavedTotalCount << "\n");
262 
263   // Keeping track of the count of the default case:
264   uint64_t RemainCount = TotalCount;
265   uint64_t SavedRemainCount = SavedTotalCount;
266   SmallVector<uint64_t, 16> SizeIds;
267   SmallVector<uint64_t, 16> CaseCounts;
268   uint64_t MaxCount = 0;
269   unsigned Version = 0;
270   // Default case is in the front -- save the slot here.
271   CaseCounts.push_back(0);
272   for (auto &VD : VDs) {
273     int64_t V = VD.Value;
274     uint64_t C = VD.Count;
275     if (MemOPScaleCount)
276       C = getScaledCount(C, ActualCount, SavedTotalCount);
277 
278     // Only care precise value here.
279     if (getMemOPSizeKind(V) != PreciseValue)
280       continue;
281 
282     // ValueCounts are sorted on the count. Break at the first un-profitable
283     // value.
284     if (!isProfitable(C, RemainCount))
285       break;
286 
287     SizeIds.push_back(V);
288     CaseCounts.push_back(C);
289     if (C > MaxCount)
290       MaxCount = C;
291 
292     assert(RemainCount >= C);
293     RemainCount -= C;
294     assert(SavedRemainCount >= VD.Count);
295     SavedRemainCount -= VD.Count;
296 
297     if (++Version > MemOPMaxVersion && MemOPMaxVersion != 0)
298       break;
299   }
300 
301   if (Version == 0)
302     return false;
303 
304   CaseCounts[0] = RemainCount;
305   if (RemainCount > MaxCount)
306     MaxCount = RemainCount;
307 
308   uint64_t SumForOpt = TotalCount - RemainCount;
309 
310   DEBUG(dbgs() << "Optimize one memory intrinsic call to " << Version
311                << " Versions (covering " << SumForOpt << " out of "
312                << TotalCount << ")\n");
313 
314   // mem_op(..., size)
315   // ==>
316   // switch (size) {
317   //   case s1:
318   //      mem_op(..., s1);
319   //      goto merge_bb;
320   //   case s2:
321   //      mem_op(..., s2);
322   //      goto merge_bb;
323   //   ...
324   //   default:
325   //      mem_op(..., size);
326   //      goto merge_bb;
327   // }
328   // merge_bb:
329 
330   BasicBlock *BB = MI->getParent();
331   DEBUG(dbgs() << "\n\n== Basic Block Before ==\n");
332   DEBUG(dbgs() << *BB << "\n");
333   auto OrigBBFreq = BFI.getBlockFreq(BB);
334 
335   BasicBlock *DefaultBB = SplitBlock(BB, MI);
336   BasicBlock::iterator It(*MI);
337   ++It;
338   assert(It != DefaultBB->end());
339   BasicBlock *MergeBB = SplitBlock(DefaultBB, &(*It));
340   MergeBB->setName("MemOP.Merge");
341   BFI.setBlockFreq(MergeBB, OrigBBFreq.getFrequency());
342   DefaultBB->setName("MemOP.Default");
343 
344   auto &Ctx = Func.getContext();
345   IRBuilder<> IRB(BB);
346   BB->getTerminator()->eraseFromParent();
347   Value *SizeVar = MI->getLength();
348   SwitchInst *SI = IRB.CreateSwitch(SizeVar, DefaultBB, SizeIds.size());
349 
350   // Clear the value profile data.
351   MI->setMetadata(LLVMContext::MD_prof, nullptr);
352   // If all promoted, we don't need the MD.prof metadata.
353   if (SavedRemainCount > 0 || Version != NumVals)
354     // Otherwise we need update with the un-promoted records back.
355     annotateValueSite(*Func.getParent(), *MI, VDs.slice(Version),
356                       SavedRemainCount, IPVK_MemOPSize, NumVals);
357 
358   DEBUG(dbgs() << "\n\n== Basic Block After==\n");
359 
360   for (uint64_t SizeId : SizeIds) {
361     ConstantInt *CaseSizeId = ConstantInt::get(Type::getInt64Ty(Ctx), SizeId);
362     BasicBlock *CaseBB = BasicBlock::Create(
363         Ctx, Twine("MemOP.Case.") + Twine(SizeId), &Func, DefaultBB);
364     Instruction *NewInst = MI->clone();
365     // Fix the argument.
366     dyn_cast<MemIntrinsic>(NewInst)->setLength(CaseSizeId);
367     CaseBB->getInstList().push_back(NewInst);
368     IRBuilder<> IRBCase(CaseBB);
369     IRBCase.CreateBr(MergeBB);
370     SI->addCase(CaseSizeId, CaseBB);
371     DEBUG(dbgs() << *CaseBB << "\n");
372   }
373   setProfMetadata(Func.getParent(), SI, CaseCounts, MaxCount);
374 
375   DEBUG(dbgs() << *BB << "\n");
376   DEBUG(dbgs() << *DefaultBB << "\n");
377   DEBUG(dbgs() << *MergeBB << "\n");
378 
379   emitOptimizationRemark(Func.getContext(), "memop-opt", Func,
380                          MI->getDebugLoc(),
381                          Twine("optimize ") + getMIName(MI) + " with count " +
382                              Twine(SumForOpt) + " out of " + Twine(TotalCount) +
383                              " for " + Twine(Version) + " versions");
384 
385   return true;
386 }
387 } // namespace
388 
389 static bool PGOMemOPSizeOptImpl(Function &F, BlockFrequencyInfo &BFI) {
390   if (DisableMemOPOPT)
391     return false;
392 
393   if (F.hasFnAttribute(Attribute::OptimizeForSize))
394     return false;
395   MemOPSizeOpt MemOPSizeOpt(F, BFI);
396   MemOPSizeOpt.perform();
397   return MemOPSizeOpt.isChanged();
398 }
399 
400 bool PGOMemOPSizeOptLegacyPass::runOnFunction(Function &F) {
401   BlockFrequencyInfo &BFI =
402       getAnalysis<BlockFrequencyInfoWrapperPass>().getBFI();
403   return PGOMemOPSizeOptImpl(F, BFI);
404 }
405 
406 namespace llvm {
407 char &PGOMemOPSizeOptID = PGOMemOPSizeOptLegacyPass::ID;
408 
409 PreservedAnalyses PGOMemOPSizeOpt::run(Function &F,
410                                        FunctionAnalysisManager &FAM) {
411   auto &BFI = FAM.getResult<BlockFrequencyAnalysis>(F);
412   bool Changed = PGOMemOPSizeOptImpl(F, BFI);
413   if (!Changed)
414     return PreservedAnalyses::all();
415   auto PA = PreservedAnalyses();
416   PA.preserve<GlobalsAA>();
417   return PA;
418 }
419 } // namespace llvm
420