//===- bolt/Passes/CacheMetrics.cpp - Metrics for instruction cache -------===//
//
// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
// See https://llvm.org/LICENSE.txt for license information.
// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
//
//===----------------------------------------------------------------------===//
//
// This file implements the CacheMetrics class and functions for showing
// metrics of cache lines.
//
//===----------------------------------------------------------------------===//

#include "bolt/Passes/CacheMetrics.h"
#include "bolt/Core/BinaryBasicBlock.h"
#include "bolt/Core/BinaryFunction.h"
#include "llvm/Support/CommandLine.h"
#include <algorithm>
#include <cassert>
#include <cmath>
#include <limits>
#include <unordered_map>

using namespace llvm;
using namespace bolt;

namespace opts {

extern cl::OptionCategory BoltOptCategory;

extern cl::opt<double> ForwardWeight;
extern cl::opt<double> BackwardWeight;
extern cl::opt<unsigned> ForwardDistance;
extern cl::opt<unsigned> BackwardDistance;
extern cl::opt<unsigned> ITLBPageSize;
extern cl::opt<unsigned> ITLBEntries;

} // namespace opts

namespace {

/// Initialize the address and size maps for binary basic blocks.
void extractBasicBlockInfo(
    const std::vector<BinaryFunction *> &BinaryFunctions,
    std::unordered_map<BinaryBasicBlock *, uint64_t> &BBAddr,
    std::unordered_map<BinaryBasicBlock *, uint64_t> &BBSize) {

  for (BinaryFunction *BF : BinaryFunctions) {
    const BinaryContext &BC = BF->getBinaryContext();
    for (BinaryBasicBlock &BB : *BF) {
      if (BF->isSimple() || BC.HasRelocations) {
        // Use addresses/sizes as in the output binary
        BBAddr[&BB] = BB.getOutputAddressRange().first;
        BBSize[&BB] = BB.getOutputSize();
      } else {
        // Output ranges should match the input if the body hasn't changed
        BBAddr[&BB] = BB.getInputAddressRange().first + BF->getAddress();
        BBSize[&BB] = BB.getOriginalSize();
      }
    }
  }
}

/// Calculate the TSP metric, which quantifies the frequency of executed
/// fallthrough branches for a given ordering of basic blocks.
double
calcTSPScore(const std::vector<BinaryFunction *> &BinaryFunctions,
             const std::unordered_map<BinaryBasicBlock *, uint64_t> &BBAddr,
             const std::unordered_map<BinaryBasicBlock *, uint64_t> &BBSize) {

  double Score = 0;
  for (BinaryFunction *BF : BinaryFunctions) {
    if (!BF->hasProfile())
      continue;
    for (BinaryBasicBlock &SrcBB : *BF) {
      auto BI = SrcBB.branch_info_begin();
      for (BinaryBasicBlock *DstBB : SrcBB.successors()) {
        if (&SrcBB != DstBB &&
            BI->Count != BinaryBasicBlock::COUNT_NO_PROFILE &&
            BBAddr.at(&SrcBB) + BBSize.at(&SrcBB) == BBAddr.at(DstBB))
          Score += BI->Count;
        ++BI;
      }
    }
  }
  return Score;
}
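
// A worked example of the TSP metric (illustrative numbers, not taken from a
// real binary): if block A is placed at address 0x1000 with size 0x20, block B
// is placed at 0x1020, and the A->B branch executes 100 times, then B directly
// follows A in the layout and the edge contributes 100 to the score; a branch
// to any non-adjacent block contributes nothing.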

/// Calculate the Ext-TSP metric, which quantifies the expected i-cache
/// behavior for a given ordering of basic blocks (a higher score corresponds
/// to fewer expected i-cache misses).
double calcExtTSPScore(
    const std::vector<BinaryFunction *> &BinaryFunctions,
    const std::unordered_map<BinaryBasicBlock *, uint64_t> &BBAddr,
    const std::unordered_map<BinaryBasicBlock *, uint64_t> &BBSize) {

  double Score = 0.0;
  for (BinaryFunction *BF : BinaryFunctions) {
    if (!BF->hasProfile())
      continue;
    for (BinaryBasicBlock &SrcBB : *BF) {
      auto BI = SrcBB.branch_info_begin();
      for (BinaryBasicBlock *DstBB : SrcBB.successors()) {
        if (DstBB != &SrcBB)
          Score +=
              CacheMetrics::extTSPScore(BBAddr.at(&SrcBB), BBSize.at(&SrcBB),
                                        BBAddr.at(DstBB), BI->Count);
        ++BI;
      }
    }
  }
  return Score;
}

using Predecessors = std::vector<std::pair<BinaryFunction *, uint64_t>>;

/// Build a simplified version of the call graph: for every function, keep
/// its callers and the frequencies of the calls.
std::unordered_map<const BinaryFunction *, Predecessors>
extractFunctionCalls(const std::vector<BinaryFunction *> &BinaryFunctions) {
  std::unordered_map<const BinaryFunction *, Predecessors> Calls;

  for (BinaryFunction *SrcFunction : BinaryFunctions) {
    const BinaryContext &BC = SrcFunction->getBinaryContext();
    for (BinaryBasicBlock *BB : SrcFunction->getLayout().blocks()) {
      // Find call instructions and extract target symbols from each one
      for (MCInst &Inst : *BB) {
        if (!BC.MIB->isCall(Inst))
          continue;

        // Call info
        const MCSymbol *DstSym = BC.MIB->getTargetSymbol(Inst);
        uint64_t Count = BB->getKnownExecutionCount();
        // Ignore calls w/o information
        if (DstSym == nullptr || Count == 0)
          continue;

        const BinaryFunction *DstFunction = BC.getFunctionForSymbol(DstSym);
        // Ignore calls to unknown or empty functions, and recursive calls
        if (DstFunction == nullptr || DstFunction->getLayout().block_empty() ||
            DstFunction == SrcFunction)
          continue;

        // Record the call, keyed by the callee
        Calls[DstFunction].emplace_back(SrcFunction, Count);
      }
    }
  }
  return Calls;
}
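
// For instance (hypothetical function names and counts): if foo calls qux with
// an execution count of 10 and bar calls qux with a count of 5, the map ends
// up with Calls[qux] == {{foo, 10}, {bar, 5}}; i.e., entries are keyed by the
// callee and list its callers.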

/// Compute the expected hit ratio of the i-TLB cache (optimized by the
/// HFSortPlus algorithm).
/// Given an assignment of functions to i-TLB pages, we divide all function
/// calls into two categories:
/// - 'short' ones that have a caller-callee distance less than a page;
/// - 'long' ones where the distance exceeds a page.
/// The short calls are likely to result in an i-TLB cache hit. For the long
/// ones, the hit/miss result depends on the 'hotness' of the page (i.e., how
/// often the page is accessed). Assuming that functions are sent to the i-TLB
/// cache in a random order, the probability that a page is present in the
/// cache is proportional to the number of samples corresponding to the
/// functions on the page. The following procedure detects short and long
/// calls, and estimates the expected number of cache misses for the long ones.
double expectedCacheHitRatio(
    const std::vector<BinaryFunction *> &BinaryFunctions,
    const std::unordered_map<BinaryBasicBlock *, uint64_t> &BBAddr,
    const std::unordered_map<BinaryBasicBlock *, uint64_t> &BBSize) {

  const double PageSize = opts::ITLBPageSize;
  const uint64_t CacheEntries = opts::ITLBEntries;
  std::unordered_map<const BinaryFunction *, Predecessors> Calls =
      extractFunctionCalls(BinaryFunctions);
  // Compute 'hotness' of the functions
  double TotalSamples = 0;
  std::unordered_map<BinaryFunction *, double> FunctionSamples;
  for (BinaryFunction *BF : BinaryFunctions) {
    double Samples = 0;
    for (std::pair<BinaryFunction *, uint64_t> Pair : Calls[BF])
      Samples += Pair.second;
    Samples = std::max(Samples, (double)BF->getKnownExecutionCount());
    FunctionSamples[BF] = Samples;
    TotalSamples += Samples;
  }

  // Compute 'hotness' of the pages
  std::unordered_map<uint64_t, double> PageSamples;
  for (BinaryFunction *BF : BinaryFunctions) {
    if (BF->getLayout().block_empty())
      continue;
    // Index of the page holding the function's first basic block
    const uint64_t Page = BBAddr.at(BF->getLayout().block_front()) / PageSize;
    PageSamples[Page] += FunctionSamples.at(BF);
  }
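
  // The model behind MissProb below: the i-TLB holds CacheEntries pages, and
  // each entry is treated as independently filled with a page chosen with
  // probability proportional to the page's samples. A page owning a fraction
  // p of all samples is then absent from every entry with probability
  // (1 - p)^CacheEntries; e.g., with p = 0.01 and 16 entries, the miss
  // probability is 0.99^16, roughly 0.85 (illustrative numbers).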

  // Compute the expected number of misses for every function
  double Misses = 0;
  for (BinaryFunction *BF : BinaryFunctions) {
    // Skip the function if it has no samples
    if (BF->getLayout().block_empty() || FunctionSamples.at(BF) == 0.0)
      continue;
    double Samples = FunctionSamples.at(BF);
    const uint64_t Page = BBAddr.at(BF->getLayout().block_front()) / PageSize;
    // The probability that the page is not present in the cache
    const double MissProb =
        pow(1.0 - PageSamples[Page] / TotalSamples, CacheEntries);

    // Process all callers of the function
    for (std::pair<BinaryFunction *, uint64_t> Pair : Calls[BF]) {
      BinaryFunction *SrcFunction = Pair.first;
      const uint64_t SrcPage =
          BBAddr.at(SrcFunction->getLayout().block_front()) / PageSize;
      // Is this a 'long' or a 'short' call?
      if (Page != SrcPage) {
        // This is a long call; it misses with probability MissProb
        Misses += MissProb * Pair.second;
      }
      Samples -= Pair.second;
    }
    assert(Samples >= 0.0 && "Function samples computed incorrectly");
    // The remaining samples likely come from the jitted code
    Misses += Samples * MissProb;
  }

  return 100.0 * (1.0 - Misses / TotalSamples);
}

} // namespace
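
// The contribution of a single jump from SrcAddr (block of size SrcSize) to
// DstAddr, executed Count times, summarizing the cases handled below:
//   - fallthrough (SrcAddr + SrcSize == DstAddr):
//       Count
//   - forward jump, Dist = DstAddr - (SrcAddr + SrcSize) <= ForwardDistance:
//       ForwardWeight * (1 - Dist / ForwardDistance) * Count
//   - backward jump, Dist = SrcAddr + SrcSize - DstAddr <= BackwardDistance:
//       BackwardWeight * (1 - Dist / BackwardDistance) * Count
//   - any jump farther than these thresholds: 0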
double CacheMetrics::extTSPScore(uint64_t SrcAddr, uint64_t SrcSize,
                                 uint64_t DstAddr, uint64_t Count) {
  assert(Count != BinaryBasicBlock::COUNT_NO_PROFILE);

  // Fallthrough
  if (SrcAddr + SrcSize == DstAddr) {
    // Assume that FallthroughWeight = 1.0 after normalization
    return static_cast<double>(Count);
  }
  // Forward
  if (SrcAddr + SrcSize < DstAddr) {
    const uint64_t Dist = DstAddr - (SrcAddr + SrcSize);
    if (Dist <= opts::ForwardDistance) {
      double Prob = 1.0 - static_cast<double>(Dist) / opts::ForwardDistance;
      return opts::ForwardWeight * Prob * Count;
    }
    return 0;
  }
  // Backward
  const uint64_t Dist = SrcAddr + SrcSize - DstAddr;
  if (Dist <= opts::BackwardDistance) {
    double Prob = 1.0 - static_cast<double>(Dist) / opts::BackwardDistance;
    return opts::BackwardWeight * Prob * Count;
  }
  return 0;
}

void CacheMetrics::printAll(const std::vector<BinaryFunction *> &BFs) {
  // Stats related to hot-cold code splitting
  size_t NumFunctions = 0;
  size_t NumProfiledFunctions = 0;
  size_t NumHotFunctions = 0;
  size_t NumBlocks = 0;
  size_t NumHotBlocks = 0;

  size_t TotalCodeMinAddr = std::numeric_limits<size_t>::max();
  size_t TotalCodeMaxAddr = 0;
  size_t HotCodeMinAddr = std::numeric_limits<size_t>::max();
  size_t HotCodeMaxAddr = 0;

  for (BinaryFunction *BF : BFs) {
    NumFunctions++;
    if (BF->hasProfile())
      NumProfiledFunctions++;
    if (BF->hasValidIndex())
      NumHotFunctions++;
    for (const BinaryBasicBlock &BB : *BF) {
      NumBlocks++;
      size_t BBAddrMin = BB.getOutputAddressRange().first;
      size_t BBAddrMax = BB.getOutputAddressRange().second;
      TotalCodeMinAddr = std::min(TotalCodeMinAddr, BBAddrMin);
      TotalCodeMaxAddr = std::max(TotalCodeMaxAddr, BBAddrMax);
      if (BF->hasValidIndex() && !BB.isCold()) {
        NumHotBlocks++;
        HotCodeMinAddr = std::min(HotCodeMinAddr, BBAddrMin);
        HotCodeMaxAddr = std::max(HotCodeMaxAddr, BBAddrMax);
      }
    }
  }

  outs() << format(" There are %zu functions;", NumFunctions)
         << format(" %zu (%.2lf%%) are in the hot section,", NumHotFunctions,
                   100.0 * NumHotFunctions / NumFunctions)
         << format(" %zu (%.2lf%%) have profile\n", NumProfiledFunctions,
                   100.0 * NumProfiledFunctions / NumFunctions);
  outs() << format(" There are %zu basic blocks;", NumBlocks)
         << format(" %zu (%.2lf%%) are in the hot section\n", NumHotBlocks,
                   100.0 * NumHotBlocks / NumBlocks);

  assert(TotalCodeMinAddr <= TotalCodeMaxAddr && "incorrect output addresses");
  size_t HotCodeSize = HotCodeMaxAddr - HotCodeMinAddr;
  size_t TotalCodeSize = TotalCodeMaxAddr - TotalCodeMinAddr;

  size_t HugePage2MB = 2 << 20;
  outs() << format(" Hot code takes %.2lf%% of binary (%zu bytes out of %zu, "
                   "%.2lf huge pages)\n",
                   100.0 * HotCodeSize / TotalCodeSize, HotCodeSize,
                   TotalCodeSize, double(HotCodeSize) / HugePage2MB);

  // Stats related to expected cache performance
  std::unordered_map<BinaryBasicBlock *, uint64_t> BBAddr;
  std::unordered_map<BinaryBasicBlock *, uint64_t> BBSize;
  extractBasicBlockInfo(BFs, BBAddr, BBSize);

  outs() << " Expected i-TLB cache hit ratio: "
         << format("%.2lf%%\n", expectedCacheHitRatio(BFs, BBAddr, BBSize));

  outs() << " TSP score: "
         << format("%.0lf\n", calcTSPScore(BFs, BBAddr, BBSize));

  outs() << " ExtTSP score: "
         << format("%.0lf\n", calcExtTSPScore(BFs, BBAddr, BBSize));
}
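
// Usage sketch (hypothetical call site; the helper name and flag are
// assumptions, not part of this file): in BOLT these metrics are printed over
// the final function order late in the rewriting pipeline, e.g., guarded by
// an option such as -print-cache-metrics:
//   std::vector<BinaryFunction *> SortedFunctions = getSortedFunctions();
//   CacheMetrics::printAll(SortedFunctions);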