1 //===- bolt/Passes/CacheMetrics.cpp - Metrics for instruction cache -------===//
2 //
3 // Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
4 // See https://llvm.org/LICENSE.txt for license information.
5 // SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
6 //
7 //===----------------------------------------------------------------------===//
8 //
9 // This file implements the CacheMetrics class and functions for showing metrics
10 // of cache lines.
11 //
12 //===----------------------------------------------------------------------===//
13
14 #include "bolt/Passes/CacheMetrics.h"
15 #include "bolt/Core/BinaryBasicBlock.h"
16 #include "bolt/Core/BinaryFunction.h"
17 #include "llvm/Support/CommandLine.h"
18 #include <unordered_map>
19
20 using namespace llvm;
21 using namespace bolt;
22
23 namespace opts {
24
25 extern cl::OptionCategory BoltOptCategory;
26
27 extern cl::opt<double> ForwardWeight;
28 extern cl::opt<double> BackwardWeight;
29 extern cl::opt<unsigned> ForwardDistance;
30 extern cl::opt<unsigned> BackwardDistance;
31 extern cl::opt<unsigned> ITLBPageSize;
32 extern cl::opt<unsigned> ITLBEntries;
33
34 } // namespace opts
35
36 namespace {
37
38 /// Initialize and return a position map for binary basic blocks
extractBasicBlockInfo(const std::vector<BinaryFunction * > & BinaryFunctions,std::unordered_map<BinaryBasicBlock *,uint64_t> & BBAddr,std::unordered_map<BinaryBasicBlock *,uint64_t> & BBSize)39 void extractBasicBlockInfo(
40 const std::vector<BinaryFunction *> &BinaryFunctions,
41 std::unordered_map<BinaryBasicBlock *, uint64_t> &BBAddr,
42 std::unordered_map<BinaryBasicBlock *, uint64_t> &BBSize) {
43
44 for (BinaryFunction *BF : BinaryFunctions) {
45 const BinaryContext &BC = BF->getBinaryContext();
46 for (BinaryBasicBlock &BB : *BF) {
47 if (BF->isSimple() || BC.HasRelocations) {
48 // Use addresses/sizes as in the output binary
49 BBAddr[&BB] = BB.getOutputAddressRange().first;
50 BBSize[&BB] = BB.getOutputSize();
51 } else {
52 // Output ranges should match the input if the body hasn't changed
53 BBAddr[&BB] = BB.getInputAddressRange().first + BF->getAddress();
54 BBSize[&BB] = BB.getOriginalSize();
55 }
56 }
57 }
58 }
59
60 /// Calculate TSP metric, which quantifies the number of fallthrough jumps in
61 /// the ordering of basic blocks
62 double
calcTSPScore(const std::vector<BinaryFunction * > & BinaryFunctions,const std::unordered_map<BinaryBasicBlock *,uint64_t> & BBAddr,const std::unordered_map<BinaryBasicBlock *,uint64_t> & BBSize)63 calcTSPScore(const std::vector<BinaryFunction *> &BinaryFunctions,
64 const std::unordered_map<BinaryBasicBlock *, uint64_t> &BBAddr,
65 const std::unordered_map<BinaryBasicBlock *, uint64_t> &BBSize) {
66
67 double Score = 0;
68 for (BinaryFunction *BF : BinaryFunctions) {
69 if (!BF->hasProfile())
70 continue;
71 for (BinaryBasicBlock &SrcBB : *BF) {
72 auto BI = SrcBB.branch_info_begin();
73 for (BinaryBasicBlock *DstBB : SrcBB.successors()) {
74 if (&SrcBB != DstBB &&
75 BI->Count != BinaryBasicBlock::COUNT_NO_PROFILE &&
76 BBAddr.at(&SrcBB) + BBSize.at(&SrcBB) == BBAddr.at(DstBB))
77 Score += BI->Count;
78 ++BI;
79 }
80 }
81 }
82 return Score;
83 }
84
85 /// Calculate Ext-TSP metric, which quantifies the expected number of i-cache
86 /// misses for a given ordering of basic blocks
calcExtTSPScore(const std::vector<BinaryFunction * > & BinaryFunctions,const std::unordered_map<BinaryBasicBlock *,uint64_t> & BBAddr,const std::unordered_map<BinaryBasicBlock *,uint64_t> & BBSize)87 double calcExtTSPScore(
88 const std::vector<BinaryFunction *> &BinaryFunctions,
89 const std::unordered_map<BinaryBasicBlock *, uint64_t> &BBAddr,
90 const std::unordered_map<BinaryBasicBlock *, uint64_t> &BBSize) {
91
92 double Score = 0.0;
93 for (BinaryFunction *BF : BinaryFunctions) {
94 if (!BF->hasProfile())
95 continue;
96 for (BinaryBasicBlock &SrcBB : *BF) {
97 auto BI = SrcBB.branch_info_begin();
98 for (BinaryBasicBlock *DstBB : SrcBB.successors()) {
99 if (DstBB != &SrcBB)
100 Score +=
101 CacheMetrics::extTSPScore(BBAddr.at(&SrcBB), BBSize.at(&SrcBB),
102 BBAddr.at(DstBB), BI->Count);
103 ++BI;
104 }
105 }
106 }
107 return Score;
108 }
109
110 using Predecessors = std::vector<std::pair<BinaryFunction *, uint64_t>>;
111
112 /// Build a simplified version of the call graph: For every function, keep
113 /// its callers and the frequencies of the calls
114 std::unordered_map<const BinaryFunction *, Predecessors>
extractFunctionCalls(const std::vector<BinaryFunction * > & BinaryFunctions)115 extractFunctionCalls(const std::vector<BinaryFunction *> &BinaryFunctions) {
116 std::unordered_map<const BinaryFunction *, Predecessors> Calls;
117
118 for (BinaryFunction *SrcFunction : BinaryFunctions) {
119 const BinaryContext &BC = SrcFunction->getBinaryContext();
120 for (BinaryBasicBlock *BB : SrcFunction->getLayout().blocks()) {
121 // Find call instructions and extract target symbols from each one
122 for (MCInst &Inst : *BB) {
123 if (!BC.MIB->isCall(Inst))
124 continue;
125
126 // Call info
127 const MCSymbol *DstSym = BC.MIB->getTargetSymbol(Inst);
128 uint64_t Count = BB->getKnownExecutionCount();
129 // Ignore calls w/o information
130 if (DstSym == nullptr || Count == 0)
131 continue;
132
133 const BinaryFunction *DstFunction = BC.getFunctionForSymbol(DstSym);
134 // Ignore recursive calls
135 if (DstFunction == nullptr || DstFunction->getLayout().block_empty() ||
136 DstFunction == SrcFunction)
137 continue;
138
139 // Record the call
140 Calls[DstFunction].emplace_back(SrcFunction, Count);
141 }
142 }
143 }
144 return Calls;
145 }
146
147 /// Compute expected hit ratio of the i-TLB cache (optimized by HFSortPlus alg).
148 /// Given an assignment of functions to the i-TLB pages), we divide all
149 /// functions calls into two categories:
150 /// - 'short' ones that have a caller-callee distance less than a page;
151 /// - 'long' ones where the distance exceeds a page.
152 /// The short calls are likely to result in a i-TLB cache hit. For the long
153 /// ones, the hit/miss result depends on the 'hotness' of the page (i.e., how
154 /// often the page is accessed). Assuming that functions are sent to the i-TLB
155 /// cache in a random order, the probability that a page is present in the cache
156 /// is proportional to the number of samples corresponding to the functions on
157 /// the page. The following procedure detects short and long calls, and
158 /// estimates the expected number of cache misses for the long ones.
expectedCacheHitRatio(const std::vector<BinaryFunction * > & BinaryFunctions,const std::unordered_map<BinaryBasicBlock *,uint64_t> & BBAddr,const std::unordered_map<BinaryBasicBlock *,uint64_t> & BBSize)159 double expectedCacheHitRatio(
160 const std::vector<BinaryFunction *> &BinaryFunctions,
161 const std::unordered_map<BinaryBasicBlock *, uint64_t> &BBAddr,
162 const std::unordered_map<BinaryBasicBlock *, uint64_t> &BBSize) {
163
164 const double PageSize = opts::ITLBPageSize;
165 const uint64_t CacheEntries = opts::ITLBEntries;
166 std::unordered_map<const BinaryFunction *, Predecessors> Calls =
167 extractFunctionCalls(BinaryFunctions);
168 // Compute 'hotness' of the functions
169 double TotalSamples = 0;
170 std::unordered_map<BinaryFunction *, double> FunctionSamples;
171 for (BinaryFunction *BF : BinaryFunctions) {
172 double Samples = 0;
173 for (std::pair<BinaryFunction *, uint64_t> Pair : Calls[BF])
174 Samples += Pair.second;
175 Samples = std::max(Samples, (double)BF->getKnownExecutionCount());
176 FunctionSamples[BF] = Samples;
177 TotalSamples += Samples;
178 }
179
180 // Compute 'hotness' of the pages
181 std::unordered_map<uint64_t, double> PageSamples;
182 for (BinaryFunction *BF : BinaryFunctions) {
183 if (BF->getLayout().block_empty())
184 continue;
185 double Page = BBAddr.at(BF->getLayout().block_front()) / PageSize;
186 PageSamples[Page] += FunctionSamples.at(BF);
187 }
188
189 // Computing the expected number of misses for every function
190 double Misses = 0;
191 for (BinaryFunction *BF : BinaryFunctions) {
192 // Skip the function if it has no samples
193 if (BF->getLayout().block_empty() || FunctionSamples.at(BF) == 0.0)
194 continue;
195 double Samples = FunctionSamples.at(BF);
196 double Page = BBAddr.at(BF->getLayout().block_front()) / PageSize;
197 // The probability that the page is not present in the cache
198 double MissProb = pow(1.0 - PageSamples[Page] / TotalSamples, CacheEntries);
199
200 // Processing all callers of the function
201 for (std::pair<BinaryFunction *, uint64_t> Pair : Calls[BF]) {
202 BinaryFunction *SrcFunction = Pair.first;
203 double SrcPage =
204 BBAddr.at(SrcFunction->getLayout().block_front()) / PageSize;
205 // Is this a 'long' or a 'short' call?
206 if (Page != SrcPage) {
207 // This is a miss
208 Misses += MissProb * Pair.second;
209 }
210 Samples -= Pair.second;
211 }
212 assert(Samples >= 0.0 && "Function samples computed incorrectly");
213 // The remaining samples likely come from the jitted code
214 Misses += Samples * MissProb;
215 }
216
217 return 100.0 * (1.0 - Misses / TotalSamples);
218 }
219
220 } // namespace
221
extTSPScore(uint64_t SrcAddr,uint64_t SrcSize,uint64_t DstAddr,uint64_t Count)222 double CacheMetrics::extTSPScore(uint64_t SrcAddr, uint64_t SrcSize,
223 uint64_t DstAddr, uint64_t Count) {
224 assert(Count != BinaryBasicBlock::COUNT_NO_PROFILE);
225
226 // Fallthrough
227 if (SrcAddr + SrcSize == DstAddr) {
228 // Assume that FallthroughWeight = 1.0 after normalization
229 return static_cast<double>(Count);
230 }
231 // Forward
232 if (SrcAddr + SrcSize < DstAddr) {
233 const uint64_t Dist = DstAddr - (SrcAddr + SrcSize);
234 if (Dist <= opts::ForwardDistance) {
235 double Prob = 1.0 - static_cast<double>(Dist) / opts::ForwardDistance;
236 return opts::ForwardWeight * Prob * Count;
237 }
238 return 0;
239 }
240 // Backward
241 const uint64_t Dist = SrcAddr + SrcSize - DstAddr;
242 if (Dist <= opts::BackwardDistance) {
243 double Prob = 1.0 - static_cast<double>(Dist) / opts::BackwardDistance;
244 return opts::BackwardWeight * Prob * Count;
245 }
246 return 0;
247 }
248
printAll(const std::vector<BinaryFunction * > & BFs)249 void CacheMetrics::printAll(const std::vector<BinaryFunction *> &BFs) {
250 // Stats related to hot-cold code splitting
251 size_t NumFunctions = 0;
252 size_t NumProfiledFunctions = 0;
253 size_t NumHotFunctions = 0;
254 size_t NumBlocks = 0;
255 size_t NumHotBlocks = 0;
256
257 size_t TotalCodeMinAddr = std::numeric_limits<size_t>::max();
258 size_t TotalCodeMaxAddr = 0;
259 size_t HotCodeMinAddr = std::numeric_limits<size_t>::max();
260 size_t HotCodeMaxAddr = 0;
261
262 for (BinaryFunction *BF : BFs) {
263 NumFunctions++;
264 if (BF->hasProfile())
265 NumProfiledFunctions++;
266 if (BF->hasValidIndex())
267 NumHotFunctions++;
268 for (const BinaryBasicBlock &BB : *BF) {
269 NumBlocks++;
270 size_t BBAddrMin = BB.getOutputAddressRange().first;
271 size_t BBAddrMax = BB.getOutputAddressRange().second;
272 TotalCodeMinAddr = std::min(TotalCodeMinAddr, BBAddrMin);
273 TotalCodeMaxAddr = std::max(TotalCodeMaxAddr, BBAddrMax);
274 if (BF->hasValidIndex() && !BB.isCold()) {
275 NumHotBlocks++;
276 HotCodeMinAddr = std::min(HotCodeMinAddr, BBAddrMin);
277 HotCodeMaxAddr = std::max(HotCodeMaxAddr, BBAddrMax);
278 }
279 }
280 }
281
282 outs() << format(" There are %zu functions;", NumFunctions)
283 << format(" %zu (%.2lf%%) are in the hot section,", NumHotFunctions,
284 100.0 * NumHotFunctions / NumFunctions)
285 << format(" %zu (%.2lf%%) have profile\n", NumProfiledFunctions,
286 100.0 * NumProfiledFunctions / NumFunctions);
287 outs() << format(" There are %zu basic blocks;", NumBlocks)
288 << format(" %zu (%.2lf%%) are in the hot section\n", NumHotBlocks,
289 100.0 * NumHotBlocks / NumBlocks);
290
291 assert(TotalCodeMinAddr <= TotalCodeMaxAddr && "incorrect output addresses");
292 size_t HotCodeSize = HotCodeMaxAddr - HotCodeMinAddr;
293 size_t TotalCodeSize = TotalCodeMaxAddr - TotalCodeMinAddr;
294
295 size_t HugePage2MB = 2 << 20;
296 outs() << format(" Hot code takes %.2lf%% of binary (%zu bytes out of %zu, "
297 "%.2lf huge pages)\n",
298 100.0 * HotCodeSize / TotalCodeSize, HotCodeSize,
299 TotalCodeSize, double(HotCodeSize) / HugePage2MB);
300
301 // Stats related to expected cache performance
302 std::unordered_map<BinaryBasicBlock *, uint64_t> BBAddr;
303 std::unordered_map<BinaryBasicBlock *, uint64_t> BBSize;
304 extractBasicBlockInfo(BFs, BBAddr, BBSize);
305
306 outs() << " Expected i-TLB cache hit ratio: "
307 << format("%.2lf%%\n", expectedCacheHitRatio(BFs, BBAddr, BBSize));
308
309 outs() << " TSP score: "
310 << format("%.0lf\n", calcTSPScore(BFs, BBAddr, BBSize));
311
312 outs() << " ExtTSP score: "
313 << format("%.0lf\n", calcExtTSPScore(BFs, BBAddr, BBSize));
314 }
315