12f09f445SMaksim Panchenko //===- bolt/Passes/CacheMetrics.cpp - Metrics for instruction cache -------===//
2a34c753fSRafael Auler //
3a34c753fSRafael Auler // Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
4a34c753fSRafael Auler // See https://llvm.org/LICENSE.txt for license information.
5a34c753fSRafael Auler // SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
6a34c753fSRafael Auler //
7a34c753fSRafael Auler //===----------------------------------------------------------------------===//
8a34c753fSRafael Auler //
92f09f445SMaksim Panchenko // This file implements the CacheMetrics class and functions for showing metrics
102f09f445SMaksim Panchenko // of cache lines.
11a34c753fSRafael Auler //
12a34c753fSRafael Auler //===----------------------------------------------------------------------===//
13a34c753fSRafael Auler
14a34c753fSRafael Auler #include "bolt/Passes/CacheMetrics.h"
15a34c753fSRafael Auler #include "bolt/Core/BinaryBasicBlock.h"
16a34c753fSRafael Auler #include "bolt/Core/BinaryFunction.h"
17a34c753fSRafael Auler #include "llvm/Support/CommandLine.h"
18a34c753fSRafael Auler #include <unordered_map>
19a34c753fSRafael Auler
20a34c753fSRafael Auler using namespace llvm;
21a34c753fSRafael Auler using namespace bolt;
22a34c753fSRafael Auler
23a34c753fSRafael Auler namespace opts {
24a34c753fSRafael Auler
25a34c753fSRafael Auler extern cl::OptionCategory BoltOptCategory;
26a34c753fSRafael Auler
27a34c753fSRafael Auler extern cl::opt<double> ForwardWeight;
28a34c753fSRafael Auler extern cl::opt<double> BackwardWeight;
29a34c753fSRafael Auler extern cl::opt<unsigned> ForwardDistance;
30a34c753fSRafael Auler extern cl::opt<unsigned> BackwardDistance;
31a34c753fSRafael Auler extern cl::opt<unsigned> ITLBPageSize;
32a34c753fSRafael Auler extern cl::opt<unsigned> ITLBEntries;
33a34c753fSRafael Auler
3440c2e0faSMaksim Panchenko } // namespace opts
35a34c753fSRafael Auler
36a34c753fSRafael Auler namespace {
37a34c753fSRafael Auler
38a34c753fSRafael Auler /// Initialize and return a position map for binary basic blocks
extractBasicBlockInfo(const std::vector<BinaryFunction * > & BinaryFunctions,std::unordered_map<BinaryBasicBlock *,uint64_t> & BBAddr,std::unordered_map<BinaryBasicBlock *,uint64_t> & BBSize)39a34c753fSRafael Auler void extractBasicBlockInfo(
40a34c753fSRafael Auler const std::vector<BinaryFunction *> &BinaryFunctions,
41a34c753fSRafael Auler std::unordered_map<BinaryBasicBlock *, uint64_t> &BBAddr,
42a34c753fSRafael Auler std::unordered_map<BinaryBasicBlock *, uint64_t> &BBSize) {
43a34c753fSRafael Auler
44a34c753fSRafael Auler for (BinaryFunction *BF : BinaryFunctions) {
45a34c753fSRafael Auler const BinaryContext &BC = BF->getBinaryContext();
46d55dfeafSFabian Parzefall for (BinaryBasicBlock &BB : *BF) {
47a34c753fSRafael Auler if (BF->isSimple() || BC.HasRelocations) {
48a34c753fSRafael Auler // Use addresses/sizes as in the output binary
49d55dfeafSFabian Parzefall BBAddr[&BB] = BB.getOutputAddressRange().first;
50d55dfeafSFabian Parzefall BBSize[&BB] = BB.getOutputSize();
51a34c753fSRafael Auler } else {
52a34c753fSRafael Auler // Output ranges should match the input if the body hasn't changed
53d55dfeafSFabian Parzefall BBAddr[&BB] = BB.getInputAddressRange().first + BF->getAddress();
54d55dfeafSFabian Parzefall BBSize[&BB] = BB.getOriginalSize();
55a34c753fSRafael Auler }
56a34c753fSRafael Auler }
57a34c753fSRafael Auler }
58a34c753fSRafael Auler }
59a34c753fSRafael Auler
60a34c753fSRafael Auler /// Calculate TSP metric, which quantifies the number of fallthrough jumps in
61a34c753fSRafael Auler /// the ordering of basic blocks
6240c2e0faSMaksim Panchenko double
calcTSPScore(const std::vector<BinaryFunction * > & BinaryFunctions,const std::unordered_map<BinaryBasicBlock *,uint64_t> & BBAddr,const std::unordered_map<BinaryBasicBlock *,uint64_t> & BBSize)6340c2e0faSMaksim Panchenko calcTSPScore(const std::vector<BinaryFunction *> &BinaryFunctions,
64a34c753fSRafael Auler const std::unordered_map<BinaryBasicBlock *, uint64_t> &BBAddr,
65a34c753fSRafael Auler const std::unordered_map<BinaryBasicBlock *, uint64_t> &BBSize) {
66a34c753fSRafael Auler
67a34c753fSRafael Auler double Score = 0;
68a34c753fSRafael Auler for (BinaryFunction *BF : BinaryFunctions) {
69a34c753fSRafael Auler if (!BF->hasProfile())
70a34c753fSRafael Auler continue;
71d55dfeafSFabian Parzefall for (BinaryBasicBlock &SrcBB : *BF) {
72d55dfeafSFabian Parzefall auto BI = SrcBB.branch_info_begin();
73d55dfeafSFabian Parzefall for (BinaryBasicBlock *DstBB : SrcBB.successors()) {
74d55dfeafSFabian Parzefall if (&SrcBB != DstBB &&
75d55dfeafSFabian Parzefall BI->Count != BinaryBasicBlock::COUNT_NO_PROFILE &&
76d55dfeafSFabian Parzefall BBAddr.at(&SrcBB) + BBSize.at(&SrcBB) == BBAddr.at(DstBB))
77a34c753fSRafael Auler Score += BI->Count;
78a34c753fSRafael Auler ++BI;
79a34c753fSRafael Auler }
80a34c753fSRafael Auler }
81a34c753fSRafael Auler }
82a34c753fSRafael Auler return Score;
83a34c753fSRafael Auler }
84a34c753fSRafael Auler
85a34c753fSRafael Auler /// Calculate Ext-TSP metric, which quantifies the expected number of i-cache
86a34c753fSRafael Auler /// misses for a given ordering of basic blocks
calcExtTSPScore(const std::vector<BinaryFunction * > & BinaryFunctions,const std::unordered_map<BinaryBasicBlock *,uint64_t> & BBAddr,const std::unordered_map<BinaryBasicBlock *,uint64_t> & BBSize)87a34c753fSRafael Auler double calcExtTSPScore(
88a34c753fSRafael Auler const std::vector<BinaryFunction *> &BinaryFunctions,
89a34c753fSRafael Auler const std::unordered_map<BinaryBasicBlock *, uint64_t> &BBAddr,
90a34c753fSRafael Auler const std::unordered_map<BinaryBasicBlock *, uint64_t> &BBSize) {
91a34c753fSRafael Auler
92a34c753fSRafael Auler double Score = 0.0;
93a34c753fSRafael Auler for (BinaryFunction *BF : BinaryFunctions) {
94a34c753fSRafael Auler if (!BF->hasProfile())
95a34c753fSRafael Auler continue;
96d55dfeafSFabian Parzefall for (BinaryBasicBlock &SrcBB : *BF) {
97d55dfeafSFabian Parzefall auto BI = SrcBB.branch_info_begin();
98d55dfeafSFabian Parzefall for (BinaryBasicBlock *DstBB : SrcBB.successors()) {
99d55dfeafSFabian Parzefall if (DstBB != &SrcBB)
100d55dfeafSFabian Parzefall Score +=
101d55dfeafSFabian Parzefall CacheMetrics::extTSPScore(BBAddr.at(&SrcBB), BBSize.at(&SrcBB),
10240c2e0faSMaksim Panchenko BBAddr.at(DstBB), BI->Count);
103a34c753fSRafael Auler ++BI;
104a34c753fSRafael Auler }
105a34c753fSRafael Auler }
106a34c753fSRafael Auler }
107a34c753fSRafael Auler return Score;
108a34c753fSRafael Auler }
109a34c753fSRafael Auler
110a34c753fSRafael Auler using Predecessors = std::vector<std::pair<BinaryFunction *, uint64_t>>;
111a34c753fSRafael Auler
112a34c753fSRafael Auler /// Build a simplified version of the call graph: For every function, keep
113a34c753fSRafael Auler /// its callers and the frequencies of the calls
114a34c753fSRafael Auler std::unordered_map<const BinaryFunction *, Predecessors>
extractFunctionCalls(const std::vector<BinaryFunction * > & BinaryFunctions)115a34c753fSRafael Auler extractFunctionCalls(const std::vector<BinaryFunction *> &BinaryFunctions) {
116a34c753fSRafael Auler std::unordered_map<const BinaryFunction *, Predecessors> Calls;
117a34c753fSRafael Auler
118a34c753fSRafael Auler for (BinaryFunction *SrcFunction : BinaryFunctions) {
119a34c753fSRafael Auler const BinaryContext &BC = SrcFunction->getBinaryContext();
120*8477bc67SFabian Parzefall for (BinaryBasicBlock *BB : SrcFunction->getLayout().blocks()) {
121a34c753fSRafael Auler // Find call instructions and extract target symbols from each one
122a34c753fSRafael Auler for (MCInst &Inst : *BB) {
123a34c753fSRafael Auler if (!BC.MIB->isCall(Inst))
124a34c753fSRafael Auler continue;
125a34c753fSRafael Auler
126a34c753fSRafael Auler // Call info
127a34c753fSRafael Auler const MCSymbol *DstSym = BC.MIB->getTargetSymbol(Inst);
128a34c753fSRafael Auler uint64_t Count = BB->getKnownExecutionCount();
129a34c753fSRafael Auler // Ignore calls w/o information
130a34c753fSRafael Auler if (DstSym == nullptr || Count == 0)
131a34c753fSRafael Auler continue;
132a34c753fSRafael Auler
133a34c753fSRafael Auler const BinaryFunction *DstFunction = BC.getFunctionForSymbol(DstSym);
134a34c753fSRafael Auler // Ignore recursive calls
135*8477bc67SFabian Parzefall if (DstFunction == nullptr || DstFunction->getLayout().block_empty() ||
136a34c753fSRafael Auler DstFunction == SrcFunction)
137a34c753fSRafael Auler continue;
138a34c753fSRafael Auler
139a34c753fSRafael Auler // Record the call
140a34c753fSRafael Auler Calls[DstFunction].emplace_back(SrcFunction, Count);
141a34c753fSRafael Auler }
142a34c753fSRafael Auler }
143a34c753fSRafael Auler }
144a34c753fSRafael Auler return Calls;
145a34c753fSRafael Auler }
146a34c753fSRafael Auler
147a34c753fSRafael Auler /// Compute expected hit ratio of the i-TLB cache (optimized by HFSortPlus alg).
148a34c753fSRafael Auler /// Given an assignment of functions to the i-TLB pages), we divide all
149a34c753fSRafael Auler /// functions calls into two categories:
150a34c753fSRafael Auler /// - 'short' ones that have a caller-callee distance less than a page;
151a34c753fSRafael Auler /// - 'long' ones where the distance exceeds a page.
15240c2e0faSMaksim Panchenko /// The short calls are likely to result in a i-TLB cache hit. For the long
15340c2e0faSMaksim Panchenko /// ones, the hit/miss result depends on the 'hotness' of the page (i.e., how
15440c2e0faSMaksim Panchenko /// often the page is accessed). Assuming that functions are sent to the i-TLB
15540c2e0faSMaksim Panchenko /// cache in a random order, the probability that a page is present in the cache
15640c2e0faSMaksim Panchenko /// is proportional to the number of samples corresponding to the functions on
15740c2e0faSMaksim Panchenko /// the page. The following procedure detects short and long calls, and
15840c2e0faSMaksim Panchenko /// estimates the expected number of cache misses for the long ones.
expectedCacheHitRatio(const std::vector<BinaryFunction * > & BinaryFunctions,const std::unordered_map<BinaryBasicBlock *,uint64_t> & BBAddr,const std::unordered_map<BinaryBasicBlock *,uint64_t> & BBSize)159a34c753fSRafael Auler double expectedCacheHitRatio(
160a34c753fSRafael Auler const std::vector<BinaryFunction *> &BinaryFunctions,
161a34c753fSRafael Auler const std::unordered_map<BinaryBasicBlock *, uint64_t> &BBAddr,
162a34c753fSRafael Auler const std::unordered_map<BinaryBasicBlock *, uint64_t> &BBSize) {
163a34c753fSRafael Auler
164a34c753fSRafael Auler const double PageSize = opts::ITLBPageSize;
165a34c753fSRafael Auler const uint64_t CacheEntries = opts::ITLBEntries;
166a34c753fSRafael Auler std::unordered_map<const BinaryFunction *, Predecessors> Calls =
167a34c753fSRafael Auler extractFunctionCalls(BinaryFunctions);
168a34c753fSRafael Auler // Compute 'hotness' of the functions
169a34c753fSRafael Auler double TotalSamples = 0;
170a34c753fSRafael Auler std::unordered_map<BinaryFunction *, double> FunctionSamples;
171a34c753fSRafael Auler for (BinaryFunction *BF : BinaryFunctions) {
172a34c753fSRafael Auler double Samples = 0;
173f92ab6afSAmir Ayupov for (std::pair<BinaryFunction *, uint64_t> Pair : Calls[BF])
174a34c753fSRafael Auler Samples += Pair.second;
175a34c753fSRafael Auler Samples = std::max(Samples, (double)BF->getKnownExecutionCount());
176a34c753fSRafael Auler FunctionSamples[BF] = Samples;
177a34c753fSRafael Auler TotalSamples += Samples;
178a34c753fSRafael Auler }
179a34c753fSRafael Auler
180a34c753fSRafael Auler // Compute 'hotness' of the pages
181a34c753fSRafael Auler std::unordered_map<uint64_t, double> PageSamples;
182a34c753fSRafael Auler for (BinaryFunction *BF : BinaryFunctions) {
183*8477bc67SFabian Parzefall if (BF->getLayout().block_empty())
184a34c753fSRafael Auler continue;
185*8477bc67SFabian Parzefall double Page = BBAddr.at(BF->getLayout().block_front()) / PageSize;
186a34c753fSRafael Auler PageSamples[Page] += FunctionSamples.at(BF);
187a34c753fSRafael Auler }
188a34c753fSRafael Auler
189a34c753fSRafael Auler // Computing the expected number of misses for every function
190a34c753fSRafael Auler double Misses = 0;
191a34c753fSRafael Auler for (BinaryFunction *BF : BinaryFunctions) {
192a34c753fSRafael Auler // Skip the function if it has no samples
193*8477bc67SFabian Parzefall if (BF->getLayout().block_empty() || FunctionSamples.at(BF) == 0.0)
194a34c753fSRafael Auler continue;
195a34c753fSRafael Auler double Samples = FunctionSamples.at(BF);
196*8477bc67SFabian Parzefall double Page = BBAddr.at(BF->getLayout().block_front()) / PageSize;
197a34c753fSRafael Auler // The probability that the page is not present in the cache
198a34c753fSRafael Auler double MissProb = pow(1.0 - PageSamples[Page] / TotalSamples, CacheEntries);
199a34c753fSRafael Auler
200a34c753fSRafael Auler // Processing all callers of the function
201a34c753fSRafael Auler for (std::pair<BinaryFunction *, uint64_t> Pair : Calls[BF]) {
202a34c753fSRafael Auler BinaryFunction *SrcFunction = Pair.first;
203*8477bc67SFabian Parzefall double SrcPage =
204*8477bc67SFabian Parzefall BBAddr.at(SrcFunction->getLayout().block_front()) / PageSize;
205a34c753fSRafael Auler // Is this a 'long' or a 'short' call?
206a34c753fSRafael Auler if (Page != SrcPage) {
207a34c753fSRafael Auler // This is a miss
208a34c753fSRafael Auler Misses += MissProb * Pair.second;
209a34c753fSRafael Auler }
210a34c753fSRafael Auler Samples -= Pair.second;
211a34c753fSRafael Auler }
212a34c753fSRafael Auler assert(Samples >= 0.0 && "Function samples computed incorrectly");
213a34c753fSRafael Auler // The remaining samples likely come from the jitted code
214a34c753fSRafael Auler Misses += Samples * MissProb;
215a34c753fSRafael Auler }
216a34c753fSRafael Auler
217a34c753fSRafael Auler return 100.0 * (1.0 - Misses / TotalSamples);
218a34c753fSRafael Auler }
219a34c753fSRafael Auler
22040c2e0faSMaksim Panchenko } // namespace
221a34c753fSRafael Auler
extTSPScore(uint64_t SrcAddr,uint64_t SrcSize,uint64_t DstAddr,uint64_t Count)22240c2e0faSMaksim Panchenko double CacheMetrics::extTSPScore(uint64_t SrcAddr, uint64_t SrcSize,
22340c2e0faSMaksim Panchenko uint64_t DstAddr, uint64_t Count) {
224a34c753fSRafael Auler assert(Count != BinaryBasicBlock::COUNT_NO_PROFILE);
225a34c753fSRafael Auler
226a34c753fSRafael Auler // Fallthrough
227a34c753fSRafael Auler if (SrcAddr + SrcSize == DstAddr) {
228a34c753fSRafael Auler // Assume that FallthroughWeight = 1.0 after normalization
229a34c753fSRafael Auler return static_cast<double>(Count);
230a34c753fSRafael Auler }
231a34c753fSRafael Auler // Forward
232a34c753fSRafael Auler if (SrcAddr + SrcSize < DstAddr) {
233a34c753fSRafael Auler const uint64_t Dist = DstAddr - (SrcAddr + SrcSize);
234a34c753fSRafael Auler if (Dist <= opts::ForwardDistance) {
235a34c753fSRafael Auler double Prob = 1.0 - static_cast<double>(Dist) / opts::ForwardDistance;
236a34c753fSRafael Auler return opts::ForwardWeight * Prob * Count;
237a34c753fSRafael Auler }
238a34c753fSRafael Auler return 0;
239a34c753fSRafael Auler }
240a34c753fSRafael Auler // Backward
241a34c753fSRafael Auler const uint64_t Dist = SrcAddr + SrcSize - DstAddr;
242a34c753fSRafael Auler if (Dist <= opts::BackwardDistance) {
243a34c753fSRafael Auler double Prob = 1.0 - static_cast<double>(Dist) / opts::BackwardDistance;
244a34c753fSRafael Auler return opts::BackwardWeight * Prob * Count;
245a34c753fSRafael Auler }
246a34c753fSRafael Auler return 0;
247a34c753fSRafael Auler }
248a34c753fSRafael Auler
printAll(const std::vector<BinaryFunction * > & BFs)249a34c753fSRafael Auler void CacheMetrics::printAll(const std::vector<BinaryFunction *> &BFs) {
250a34c753fSRafael Auler // Stats related to hot-cold code splitting
251a34c753fSRafael Auler size_t NumFunctions = 0;
252a34c753fSRafael Auler size_t NumProfiledFunctions = 0;
253a34c753fSRafael Auler size_t NumHotFunctions = 0;
254a34c753fSRafael Auler size_t NumBlocks = 0;
255a34c753fSRafael Auler size_t NumHotBlocks = 0;
256a34c753fSRafael Auler
257a34c753fSRafael Auler size_t TotalCodeMinAddr = std::numeric_limits<size_t>::max();
258a34c753fSRafael Auler size_t TotalCodeMaxAddr = 0;
259a34c753fSRafael Auler size_t HotCodeMinAddr = std::numeric_limits<size_t>::max();
260a34c753fSRafael Auler size_t HotCodeMaxAddr = 0;
261a34c753fSRafael Auler
262a34c753fSRafael Auler for (BinaryFunction *BF : BFs) {
263a34c753fSRafael Auler NumFunctions++;
264a34c753fSRafael Auler if (BF->hasProfile())
265a34c753fSRafael Auler NumProfiledFunctions++;
266a34c753fSRafael Auler if (BF->hasValidIndex())
267a34c753fSRafael Auler NumHotFunctions++;
268d55dfeafSFabian Parzefall for (const BinaryBasicBlock &BB : *BF) {
269a34c753fSRafael Auler NumBlocks++;
270d55dfeafSFabian Parzefall size_t BBAddrMin = BB.getOutputAddressRange().first;
271d55dfeafSFabian Parzefall size_t BBAddrMax = BB.getOutputAddressRange().second;
272a34c753fSRafael Auler TotalCodeMinAddr = std::min(TotalCodeMinAddr, BBAddrMin);
273a34c753fSRafael Auler TotalCodeMaxAddr = std::max(TotalCodeMaxAddr, BBAddrMax);
274d55dfeafSFabian Parzefall if (BF->hasValidIndex() && !BB.isCold()) {
275a34c753fSRafael Auler NumHotBlocks++;
276a34c753fSRafael Auler HotCodeMinAddr = std::min(HotCodeMinAddr, BBAddrMin);
277a34c753fSRafael Auler HotCodeMaxAddr = std::max(HotCodeMaxAddr, BBAddrMax);
278a34c753fSRafael Auler }
279a34c753fSRafael Auler }
280a34c753fSRafael Auler }
281a34c753fSRafael Auler
282a34c753fSRafael Auler outs() << format(" There are %zu functions;", NumFunctions)
28340c2e0faSMaksim Panchenko << format(" %zu (%.2lf%%) are in the hot section,", NumHotFunctions,
28440c2e0faSMaksim Panchenko 100.0 * NumHotFunctions / NumFunctions)
28540c2e0faSMaksim Panchenko << format(" %zu (%.2lf%%) have profile\n", NumProfiledFunctions,
28640c2e0faSMaksim Panchenko 100.0 * NumProfiledFunctions / NumFunctions);
287a34c753fSRafael Auler outs() << format(" There are %zu basic blocks;", NumBlocks)
28840c2e0faSMaksim Panchenko << format(" %zu (%.2lf%%) are in the hot section\n", NumHotBlocks,
28940c2e0faSMaksim Panchenko 100.0 * NumHotBlocks / NumBlocks);
290a34c753fSRafael Auler
291a34c753fSRafael Auler assert(TotalCodeMinAddr <= TotalCodeMaxAddr && "incorrect output addresses");
292a34c753fSRafael Auler size_t HotCodeSize = HotCodeMaxAddr - HotCodeMinAddr;
293a34c753fSRafael Auler size_t TotalCodeSize = TotalCodeMaxAddr - TotalCodeMinAddr;
294a34c753fSRafael Auler
295a34c753fSRafael Auler size_t HugePage2MB = 2 << 20;
29640c2e0faSMaksim Panchenko outs() << format(" Hot code takes %.2lf%% of binary (%zu bytes out of %zu, "
29740c2e0faSMaksim Panchenko "%.2lf huge pages)\n",
29840c2e0faSMaksim Panchenko 100.0 * HotCodeSize / TotalCodeSize, HotCodeSize,
29940c2e0faSMaksim Panchenko TotalCodeSize, double(HotCodeSize) / HugePage2MB);
300a34c753fSRafael Auler
301a34c753fSRafael Auler // Stats related to expected cache performance
302a34c753fSRafael Auler std::unordered_map<BinaryBasicBlock *, uint64_t> BBAddr;
303a34c753fSRafael Auler std::unordered_map<BinaryBasicBlock *, uint64_t> BBSize;
304a34c753fSRafael Auler extractBasicBlockInfo(BFs, BBAddr, BBSize);
305a34c753fSRafael Auler
306a34c753fSRafael Auler outs() << " Expected i-TLB cache hit ratio: "
307a34c753fSRafael Auler << format("%.2lf%%\n", expectedCacheHitRatio(BFs, BBAddr, BBSize));
308a34c753fSRafael Auler
309a34c753fSRafael Auler outs() << " TSP score: "
310a34c753fSRafael Auler << format("%.0lf\n", calcTSPScore(BFs, BBAddr, BBSize));
311a34c753fSRafael Auler
312a34c753fSRafael Auler outs() << " ExtTSP score: "
313a34c753fSRafael Auler << format("%.0lf\n", calcExtTSPScore(BFs, BBAddr, BBSize));
314a34c753fSRafael Auler }
315