1 //===- bolt/Passes/CacheMetrics.cpp - Metrics for instruction cache -------===//
2 //
3 // Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
4 // See https://llvm.org/LICENSE.txt for license information.
5 // SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
6 //
7 //===----------------------------------------------------------------------===//
8 //
9 // This file implements the CacheMetrics class and functions for showing metrics
10 // of cache lines.
11 //
12 //===----------------------------------------------------------------------===//
13 
#include "bolt/Passes/CacheMetrics.h"
#include "bolt/Core/BinaryBasicBlock.h"
#include "bolt/Core/BinaryFunction.h"
#include "llvm/Support/CommandLine.h"
#include <algorithm>
#include <cmath>
#include <limits>
#include <unordered_map>
19 
20 using namespace llvm;
21 using namespace bolt;
22 
namespace opts {

extern cl::OptionCategory BoltOptCategory;

// Tuning knobs for the Ext-TSP score model (weights and distance cut-offs for
// forward/backward jumps) and for the i-TLB model (page size and number of
// cache entries). The cl::opt definitions live in another translation unit;
// they are only declared extern here.
extern cl::opt<double> ForwardWeight;
extern cl::opt<double> BackwardWeight;
extern cl::opt<unsigned> ForwardDistance;
extern cl::opt<unsigned> BackwardDistance;
extern cl::opt<unsigned> ITLBPageSize;
extern cl::opt<unsigned> ITLBEntries;

} // namespace opts
35 
36 namespace {
37 
38 /// Initialize and return a position map for binary basic blocks
39 void extractBasicBlockInfo(
40     const std::vector<BinaryFunction *> &BinaryFunctions,
41     std::unordered_map<BinaryBasicBlock *, uint64_t> &BBAddr,
42     std::unordered_map<BinaryBasicBlock *, uint64_t> &BBSize) {
43 
44   for (BinaryFunction *BF : BinaryFunctions) {
45     const BinaryContext &BC = BF->getBinaryContext();
46     for (BinaryBasicBlock &BB : *BF) {
47       if (BF->isSimple() || BC.HasRelocations) {
48         // Use addresses/sizes as in the output binary
49         BBAddr[&BB] = BB.getOutputAddressRange().first;
50         BBSize[&BB] = BB.getOutputSize();
51       } else {
52         // Output ranges should match the input if the body hasn't changed
53         BBAddr[&BB] = BB.getInputAddressRange().first + BF->getAddress();
54         BBSize[&BB] = BB.getOriginalSize();
55       }
56     }
57   }
58 }
59 
60 /// Calculate TSP metric, which quantifies the number of fallthrough jumps in
61 /// the ordering of basic blocks
62 double
63 calcTSPScore(const std::vector<BinaryFunction *> &BinaryFunctions,
64              const std::unordered_map<BinaryBasicBlock *, uint64_t> &BBAddr,
65              const std::unordered_map<BinaryBasicBlock *, uint64_t> &BBSize) {
66 
67   double Score = 0;
68   for (BinaryFunction *BF : BinaryFunctions) {
69     if (!BF->hasProfile())
70       continue;
71     for (BinaryBasicBlock &SrcBB : *BF) {
72       auto BI = SrcBB.branch_info_begin();
73       for (BinaryBasicBlock *DstBB : SrcBB.successors()) {
74         if (&SrcBB != DstBB &&
75             BI->Count != BinaryBasicBlock::COUNT_NO_PROFILE &&
76             BBAddr.at(&SrcBB) + BBSize.at(&SrcBB) == BBAddr.at(DstBB))
77           Score += BI->Count;
78         ++BI;
79       }
80     }
81   }
82   return Score;
83 }
84 
85 /// Calculate Ext-TSP metric, which quantifies the expected number of i-cache
86 /// misses for a given ordering of basic blocks
87 double calcExtTSPScore(
88     const std::vector<BinaryFunction *> &BinaryFunctions,
89     const std::unordered_map<BinaryBasicBlock *, uint64_t> &BBAddr,
90     const std::unordered_map<BinaryBasicBlock *, uint64_t> &BBSize) {
91 
92   double Score = 0.0;
93   for (BinaryFunction *BF : BinaryFunctions) {
94     if (!BF->hasProfile())
95       continue;
96     for (BinaryBasicBlock &SrcBB : *BF) {
97       auto BI = SrcBB.branch_info_begin();
98       for (BinaryBasicBlock *DstBB : SrcBB.successors()) {
99         if (DstBB != &SrcBB)
100           Score +=
101               CacheMetrics::extTSPScore(BBAddr.at(&SrcBB), BBSize.at(&SrcBB),
102                                         BBAddr.at(DstBB), BI->Count);
103         ++BI;
104       }
105     }
106   }
107   return Score;
108 }
109 
110 using Predecessors = std::vector<std::pair<BinaryFunction *, uint64_t>>;
111 
112 /// Build a simplified version of the call graph: For every function, keep
113 /// its callers and the frequencies of the calls
114 std::unordered_map<const BinaryFunction *, Predecessors>
115 extractFunctionCalls(const std::vector<BinaryFunction *> &BinaryFunctions) {
116   std::unordered_map<const BinaryFunction *, Predecessors> Calls;
117 
118   for (BinaryFunction *SrcFunction : BinaryFunctions) {
119     const BinaryContext &BC = SrcFunction->getBinaryContext();
120     for (BinaryBasicBlock *BB : SrcFunction->layout()) {
121       // Find call instructions and extract target symbols from each one
122       for (MCInst &Inst : *BB) {
123         if (!BC.MIB->isCall(Inst))
124           continue;
125 
126         // Call info
127         const MCSymbol *DstSym = BC.MIB->getTargetSymbol(Inst);
128         uint64_t Count = BB->getKnownExecutionCount();
129         // Ignore calls w/o information
130         if (DstSym == nullptr || Count == 0)
131           continue;
132 
133         const BinaryFunction *DstFunction = BC.getFunctionForSymbol(DstSym);
134         // Ignore recursive calls
135         if (DstFunction == nullptr || DstFunction->layout_empty() ||
136             DstFunction == SrcFunction)
137           continue;
138 
139         // Record the call
140         Calls[DstFunction].emplace_back(SrcFunction, Count);
141       }
142     }
143   }
144   return Calls;
145 }
146 
147 /// Compute expected hit ratio of the i-TLB cache (optimized by HFSortPlus alg).
148 /// Given an assignment of functions to the i-TLB pages), we divide all
149 /// functions calls into two categories:
150 /// - 'short' ones that have a caller-callee distance less than a page;
151 /// - 'long' ones where the distance exceeds a page.
152 /// The short calls are likely to result in a i-TLB cache hit. For the long
153 /// ones, the hit/miss result depends on the 'hotness' of the page (i.e., how
154 /// often the page is accessed). Assuming that functions are sent to the i-TLB
155 /// cache in a random order, the probability that a page is present in the cache
156 /// is proportional to the number of samples corresponding to the functions on
157 /// the page. The following procedure detects short and long calls, and
158 /// estimates the expected number of cache misses for the long ones.
159 double expectedCacheHitRatio(
160     const std::vector<BinaryFunction *> &BinaryFunctions,
161     const std::unordered_map<BinaryBasicBlock *, uint64_t> &BBAddr,
162     const std::unordered_map<BinaryBasicBlock *, uint64_t> &BBSize) {
163 
164   const double PageSize = opts::ITLBPageSize;
165   const uint64_t CacheEntries = opts::ITLBEntries;
166   std::unordered_map<const BinaryFunction *, Predecessors> Calls =
167       extractFunctionCalls(BinaryFunctions);
168   // Compute 'hotness' of the functions
169   double TotalSamples = 0;
170   std::unordered_map<BinaryFunction *, double> FunctionSamples;
171   for (BinaryFunction *BF : BinaryFunctions) {
172     double Samples = 0;
173     for (std::pair<BinaryFunction *, uint64_t> Pair : Calls[BF])
174       Samples += Pair.second;
175     Samples = std::max(Samples, (double)BF->getKnownExecutionCount());
176     FunctionSamples[BF] = Samples;
177     TotalSamples += Samples;
178   }
179 
180   // Compute 'hotness' of the pages
181   std::unordered_map<uint64_t, double> PageSamples;
182   for (BinaryFunction *BF : BinaryFunctions) {
183     if (BF->layout_empty())
184       continue;
185     double Page = BBAddr.at(BF->layout_front()) / PageSize;
186     PageSamples[Page] += FunctionSamples.at(BF);
187   }
188 
189   // Computing the expected number of misses for every function
190   double Misses = 0;
191   for (BinaryFunction *BF : BinaryFunctions) {
192     // Skip the function if it has no samples
193     if (BF->layout_empty() || FunctionSamples.at(BF) == 0.0)
194       continue;
195     double Samples = FunctionSamples.at(BF);
196     double Page = BBAddr.at(BF->layout_front()) / PageSize;
197     // The probability that the page is not present in the cache
198     double MissProb = pow(1.0 - PageSamples[Page] / TotalSamples, CacheEntries);
199 
200     // Processing all callers of the function
201     for (std::pair<BinaryFunction *, uint64_t> Pair : Calls[BF]) {
202       BinaryFunction *SrcFunction = Pair.first;
203       double SrcPage = BBAddr.at(SrcFunction->layout_front()) / PageSize;
204       // Is this a 'long' or a 'short' call?
205       if (Page != SrcPage) {
206         // This is a miss
207         Misses += MissProb * Pair.second;
208       }
209       Samples -= Pair.second;
210     }
211     assert(Samples >= 0.0 && "Function samples computed incorrectly");
212     // The remaining samples likely come from the jitted code
213     Misses += Samples * MissProb;
214   }
215 
216   return 100.0 * (1.0 - Misses / TotalSamples);
217 }
218 
219 } // namespace
220 
221 double CacheMetrics::extTSPScore(uint64_t SrcAddr, uint64_t SrcSize,
222                                  uint64_t DstAddr, uint64_t Count) {
223   assert(Count != BinaryBasicBlock::COUNT_NO_PROFILE);
224 
225   // Fallthrough
226   if (SrcAddr + SrcSize == DstAddr) {
227     // Assume that FallthroughWeight = 1.0 after normalization
228     return static_cast<double>(Count);
229   }
230   // Forward
231   if (SrcAddr + SrcSize < DstAddr) {
232     const uint64_t Dist = DstAddr - (SrcAddr + SrcSize);
233     if (Dist <= opts::ForwardDistance) {
234       double Prob = 1.0 - static_cast<double>(Dist) / opts::ForwardDistance;
235       return opts::ForwardWeight * Prob * Count;
236     }
237     return 0;
238   }
239   // Backward
240   const uint64_t Dist = SrcAddr + SrcSize - DstAddr;
241   if (Dist <= opts::BackwardDistance) {
242     double Prob = 1.0 - static_cast<double>(Dist) / opts::BackwardDistance;
243     return opts::BackwardWeight * Prob * Count;
244   }
245   return 0;
246 }
247 
248 void CacheMetrics::printAll(const std::vector<BinaryFunction *> &BFs) {
249   // Stats related to hot-cold code splitting
250   size_t NumFunctions = 0;
251   size_t NumProfiledFunctions = 0;
252   size_t NumHotFunctions = 0;
253   size_t NumBlocks = 0;
254   size_t NumHotBlocks = 0;
255 
256   size_t TotalCodeMinAddr = std::numeric_limits<size_t>::max();
257   size_t TotalCodeMaxAddr = 0;
258   size_t HotCodeMinAddr = std::numeric_limits<size_t>::max();
259   size_t HotCodeMaxAddr = 0;
260 
261   for (BinaryFunction *BF : BFs) {
262     NumFunctions++;
263     if (BF->hasProfile())
264       NumProfiledFunctions++;
265     if (BF->hasValidIndex())
266       NumHotFunctions++;
267     for (const BinaryBasicBlock &BB : *BF) {
268       NumBlocks++;
269       size_t BBAddrMin = BB.getOutputAddressRange().first;
270       size_t BBAddrMax = BB.getOutputAddressRange().second;
271       TotalCodeMinAddr = std::min(TotalCodeMinAddr, BBAddrMin);
272       TotalCodeMaxAddr = std::max(TotalCodeMaxAddr, BBAddrMax);
273       if (BF->hasValidIndex() && !BB.isCold()) {
274         NumHotBlocks++;
275         HotCodeMinAddr = std::min(HotCodeMinAddr, BBAddrMin);
276         HotCodeMaxAddr = std::max(HotCodeMaxAddr, BBAddrMax);
277       }
278     }
279   }
280 
281   outs() << format("  There are %zu functions;", NumFunctions)
282          << format(" %zu (%.2lf%%) are in the hot section,", NumHotFunctions,
283                    100.0 * NumHotFunctions / NumFunctions)
284          << format(" %zu (%.2lf%%) have profile\n", NumProfiledFunctions,
285                    100.0 * NumProfiledFunctions / NumFunctions);
286   outs() << format("  There are %zu basic blocks;", NumBlocks)
287          << format(" %zu (%.2lf%%) are in the hot section\n", NumHotBlocks,
288                    100.0 * NumHotBlocks / NumBlocks);
289 
290   assert(TotalCodeMinAddr <= TotalCodeMaxAddr && "incorrect output addresses");
291   size_t HotCodeSize = HotCodeMaxAddr - HotCodeMinAddr;
292   size_t TotalCodeSize = TotalCodeMaxAddr - TotalCodeMinAddr;
293 
294   size_t HugePage2MB = 2 << 20;
295   outs() << format("  Hot code takes %.2lf%% of binary (%zu bytes out of %zu, "
296                    "%.2lf huge pages)\n",
297                    100.0 * HotCodeSize / TotalCodeSize, HotCodeSize,
298                    TotalCodeSize, double(HotCodeSize) / HugePage2MB);
299 
300   // Stats related to expected cache performance
301   std::unordered_map<BinaryBasicBlock *, uint64_t> BBAddr;
302   std::unordered_map<BinaryBasicBlock *, uint64_t> BBSize;
303   extractBasicBlockInfo(BFs, BBAddr, BBSize);
304 
305   outs() << "  Expected i-TLB cache hit ratio: "
306          << format("%.2lf%%\n", expectedCacheHitRatio(BFs, BBAddr, BBSize));
307 
308   outs() << "  TSP score: "
309          << format("%.0lf\n", calcTSPScore(BFs, BBAddr, BBSize));
310 
311   outs() << "  ExtTSP score: "
312          << format("%.0lf\n", calcExtTSPScore(BFs, BBAddr, BBSize));
313 }
314