//===- bolt/Passes/CacheMetrics.cpp - Metrics for instruction cache -------===//
//
// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
// See https://llvm.org/LICENSE.txt for license information.
// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
//
//===----------------------------------------------------------------------===//
//
// This file implements the CacheMetrics class and functions for computing
// metrics of instruction cache and i-TLB performance.
//
//===----------------------------------------------------------------------===//

#include "bolt/Passes/CacheMetrics.h"
#include "bolt/Core/BinaryBasicBlock.h"
#include "bolt/Core/BinaryFunction.h"
#include "llvm/Support/CommandLine.h"
#include <cmath>
#include <limits>
#include <unordered_map>

using namespace llvm;
using namespace bolt;

namespace opts {

extern cl::OptionCategory BoltOptCategory;

extern cl::opt<double> ForwardWeight;
extern cl::opt<double> BackwardWeight;
extern cl::opt<unsigned> ForwardDistance;
extern cl::opt<unsigned> BackwardDistance;
extern cl::opt<unsigned> ITLBPageSize;
extern cl::opt<unsigned> ITLBEntries;

} // namespace opts

namespace {

/// Populate maps from basic blocks to their output addresses and sizes.
void extractBasicBlockInfo(
    const std::vector<BinaryFunction *> &BinaryFunctions,
    std::unordered_map<BinaryBasicBlock *, uint64_t> &BBAddr,
    std::unordered_map<BinaryBasicBlock *, uint64_t> &BBSize) {

  for (BinaryFunction *BF : BinaryFunctions) {
    const BinaryContext &BC = BF->getBinaryContext();
    for (BinaryBasicBlock *BB : BF->layout()) {
      if (BF->isSimple() || BC.HasRelocations) {
        // Use addresses/sizes as in the output binary
        BBAddr[BB] = BB->getOutputAddressRange().first;
        BBSize[BB] = BB->getOutputSize();
      } else {
        // Output ranges should match the input if the body hasn't changed
        BBAddr[BB] = BB->getInputAddressRange().first + BF->getAddress();
        BBSize[BB] = BB->getOriginalSize();
      }
    }
  }
}

/// Calculate the TSP metric, which quantifies the number of executed
/// fallthrough branches for a given ordering of basic blocks.
double
calcTSPScore(const std::vector<BinaryFunction *> &BinaryFunctions,
             const std::unordered_map<BinaryBasicBlock *, uint64_t> &BBAddr,
             const std::unordered_map<BinaryBasicBlock *, uint64_t> &BBSize) {

  double Score = 0;
  for (BinaryFunction *BF : BinaryFunctions) {
    if (!BF->hasProfile())
      continue;
    for (BinaryBasicBlock *SrcBB : BF->layout()) {
      auto BI = SrcBB->branch_info_begin();
      for (BinaryBasicBlock *DstBB : SrcBB->successors()) {
        if (SrcBB != DstBB && BI->Count != BinaryBasicBlock::COUNT_NO_PROFILE &&
            BBAddr.at(SrcBB) + BBSize.at(SrcBB) == BBAddr.at(DstBB))
          Score += BI->Count;
        ++BI;
      }
    }
  }
  return Score;
}
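
// Illustrative example of the TSP score (all values are made up): if block A
// occupies the byte range [0x100, 0x120), block B starts at address 0x120,
// and the branch A->B has an execution count of 100, then A->B is a
// fallthrough and contributes 100 to the score; a branch to a block starting
// anywhere else contributes nothing, no matter how close it is. Maximizing
// this count-weighted adjacency over all layouts is what makes block
// reordering a variant of the Traveling Salesman Problem, hence the name.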

/// Calculate the Ext-TSP metric, which is a proxy for the i-cache performance
/// of a given ordering of basic blocks: a higher score corresponds to fewer
/// expected i-cache misses.
double calcExtTSPScore(
    const std::vector<BinaryFunction *> &BinaryFunctions,
    const std::unordered_map<BinaryBasicBlock *, uint64_t> &BBAddr,
    const std::unordered_map<BinaryBasicBlock *, uint64_t> &BBSize) {

  double Score = 0.0;
  for (BinaryFunction *BF : BinaryFunctions) {
    if (!BF->hasProfile())
      continue;
    for (BinaryBasicBlock *SrcBB : BF->layout()) {
      auto BI = SrcBB->branch_info_begin();
      for (BinaryBasicBlock *DstBB : SrcBB->successors()) {
        if (DstBB != SrcBB)
          Score += CacheMetrics::extTSPScore(BBAddr.at(SrcBB), BBSize.at(SrcBB),
                                             BBAddr.at(DstBB), BI->Count);
        ++BI;
      }
    }
  }
  return Score;
}

using Predecessors = std::vector<std::pair<BinaryFunction *, uint64_t>>;

/// Build a simplified version of the call graph: for every function, keep
/// its callers and the frequencies of the calls.
std::unordered_map<const BinaryFunction *, Predecessors>
extractFunctionCalls(const std::vector<BinaryFunction *> &BinaryFunctions) {
  std::unordered_map<const BinaryFunction *, Predecessors> Calls;

  for (BinaryFunction *SrcFunction : BinaryFunctions) {
    const BinaryContext &BC = SrcFunction->getBinaryContext();
    for (BinaryBasicBlock *BB : SrcFunction->layout()) {
      // Find call instructions and extract target symbols from each one
      for (MCInst &Inst : *BB) {
        if (!BC.MIB->isCall(Inst))
          continue;

        // Call info
        const MCSymbol *DstSym = BC.MIB->getTargetSymbol(Inst);
        uint64_t Count = BB->getKnownExecutionCount();
        // Ignore calls w/o information
        if (DstSym == nullptr || Count == 0)
          continue;

        const BinaryFunction *DstFunction = BC.getFunctionForSymbol(DstSym);
        // Ignore calls to unknown or empty functions as well as recursion
        if (DstFunction == nullptr || DstFunction->layout_empty() ||
            DstFunction == SrcFunction)
          continue;

        // Record the call
        Calls[DstFunction].emplace_back(SrcFunction, Count);
      }
    }
  }
  return Calls;
}
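
// Note that the map is keyed by the callee: Calls[F] holds the callers of F
// together with their call frequencies. This orientation lets
// expectedCacheHitRatio below estimate the 'hotness' of a function with a
// single pass over Calls[F], summing the incoming call counts.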

/// Compute the expected hit ratio of the i-TLB cache (optimized by the
/// HFSortPlus algorithm). Given an assignment of functions to i-TLB pages, we
/// divide all function calls into two categories:
/// - 'short' ones that have a caller-callee distance less than a page;
/// - 'long' ones where the distance exceeds a page.
/// The short calls are likely to result in an i-TLB cache hit. For the long
/// ones, the hit/miss result depends on the 'hotness' of the page (i.e., how
/// often the page is accessed). Assuming that functions are sent to the i-TLB
/// cache in a random order, the probability that a page is present in the
/// cache is proportional to the number of samples corresponding to the
/// functions on the page. The following procedure detects short and long
/// calls, and estimates the expected number of cache misses for the long ones.
double expectedCacheHitRatio(
    const std::vector<BinaryFunction *> &BinaryFunctions,
    const std::unordered_map<BinaryBasicBlock *, uint64_t> &BBAddr,
    const std::unordered_map<BinaryBasicBlock *, uint64_t> &BBSize) {

  // Use integer page indices so that two functions starting on the same page
  // compare equal below.
  const uint64_t PageSize = opts::ITLBPageSize;
  const uint64_t CacheEntries = opts::ITLBEntries;
  std::unordered_map<const BinaryFunction *, Predecessors> Calls =
      extractFunctionCalls(BinaryFunctions);
  // Compute the 'hotness' of the functions
  double TotalSamples = 0;
  std::unordered_map<BinaryFunction *, double> FunctionSamples;
  for (BinaryFunction *BF : BinaryFunctions) {
    double Samples = 0;
    for (std::pair<BinaryFunction *, uint64_t> Pair : Calls[BF])
      Samples += Pair.second;
    Samples = std::max(Samples, (double)BF->getKnownExecutionCount());
    FunctionSamples[BF] = Samples;
    TotalSamples += Samples;
  }

  // Compute the 'hotness' of the pages
  std::unordered_map<uint64_t, double> PageSamples;
  for (BinaryFunction *BF : BinaryFunctions) {
    if (BF->layout_empty())
      continue;
    const uint64_t Page = BBAddr.at(BF->layout_front()) / PageSize;
    PageSamples[Page] += FunctionSamples.at(BF);
  }
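
  // The model behind MissProb below: each of the CacheEntries i-TLB slots is
  // assumed to hold a page drawn with probability proportional to the page's
  // samples, so a page holding a fraction p of all samples is absent from the
  // cache with probability (1 - p)^CacheEntries. A quick sanity check with
  // made-up numbers: for p = 0.1 and CacheEntries = 16, the miss probability
  // is 0.9^16 ~= 0.185, i.e., a long call into such a page misses roughly
  // once in five executions.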

  // Compute the expected number of misses for every function
  double Misses = 0;
  for (BinaryFunction *BF : BinaryFunctions) {
    // Skip the function if it has no samples
    if (BF->layout_empty() || FunctionSamples.at(BF) == 0.0)
      continue;
    double Samples = FunctionSamples.at(BF);
    const uint64_t Page = BBAddr.at(BF->layout_front()) / PageSize;
    // The probability that the page is not present in the cache
    const double MissProb =
        std::pow(1.0 - PageSamples[Page] / TotalSamples, CacheEntries);

    // Process all callers of the function
    for (std::pair<BinaryFunction *, uint64_t> Pair : Calls[BF]) {
      BinaryFunction *SrcFunction = Pair.first;
      const uint64_t SrcPage =
          BBAddr.at(SrcFunction->layout_front()) / PageSize;
      // Is this a 'long' or a 'short' call?
      if (Page != SrcPage) {
        // This is a miss
        Misses += MissProb * Pair.second;
      }
      Samples -= Pair.second;
    }
    assert(Samples >= 0.0 && "Function samples computed incorrectly");
    // The remaining samples likely come from the jitted code
    Misses += Samples * MissProb;
  }

  return 100.0 * (1.0 - Misses / TotalSamples);
}

} // namespace

double CacheMetrics::extTSPScore(uint64_t SrcAddr, uint64_t SrcSize,
                                 uint64_t DstAddr, uint64_t Count) {
  assert(Count != BinaryBasicBlock::COUNT_NO_PROFILE);

  // Fallthrough
  if (SrcAddr + SrcSize == DstAddr) {
    // Assume that FallthroughWeight = 1.0 after normalization
    return static_cast<double>(Count);
  }
  // Forward
  if (SrcAddr + SrcSize < DstAddr) {
    const uint64_t Dist = DstAddr - (SrcAddr + SrcSize);
    if (Dist <= opts::ForwardDistance) {
      double Prob = 1.0 - static_cast<double>(Dist) / opts::ForwardDistance;
      return opts::ForwardWeight * Prob * Count;
    }
    return 0;
  }
  // Backward
  const uint64_t Dist = SrcAddr + SrcSize - DstAddr;
  if (Dist <= opts::BackwardDistance) {
    double Prob = 1.0 - static_cast<double>(Dist) / opts::BackwardDistance;
    return opts::BackwardWeight * Prob * Count;
  }
  return 0;
}
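
// Illustrative application of the weighting above (addresses and counts are
// made-up values): a branch from a block occupying [0x100, 0x120) to address
// 0x120 is a fallthrough and scores the full Count. A forward jump to 0x160
// has distance 0x40 and scores ForwardWeight * (1 - 0x40 / ForwardDistance) *
// Count, provided 0x40 <= ForwardDistance. Jumps farther than ForwardDistance
// (forward) or BackwardDistance (backward) score zero.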

void CacheMetrics::printAll(const std::vector<BinaryFunction *> &BFs) {
  // Stats related to hot-cold code splitting
  size_t NumFunctions = 0;
  size_t NumProfiledFunctions = 0;
  size_t NumHotFunctions = 0;
  size_t NumBlocks = 0;
  size_t NumHotBlocks = 0;

  size_t TotalCodeMinAddr = std::numeric_limits<size_t>::max();
  size_t TotalCodeMaxAddr = 0;
  size_t HotCodeMinAddr = std::numeric_limits<size_t>::max();
  size_t HotCodeMaxAddr = 0;

  for (BinaryFunction *BF : BFs) {
    NumFunctions++;
    if (BF->hasProfile())
      NumProfiledFunctions++;
    if (BF->hasValidIndex())
      NumHotFunctions++;
    for (BinaryBasicBlock *BB : BF->layout()) {
      NumBlocks++;
      size_t BBAddrMin = BB->getOutputAddressRange().first;
      size_t BBAddrMax = BB->getOutputAddressRange().second;
      TotalCodeMinAddr = std::min(TotalCodeMinAddr, BBAddrMin);
      TotalCodeMaxAddr = std::max(TotalCodeMaxAddr, BBAddrMax);
      if (BF->hasValidIndex() && !BB->isCold()) {
        NumHotBlocks++;
        HotCodeMinAddr = std::min(HotCodeMinAddr, BBAddrMin);
        HotCodeMaxAddr = std::max(HotCodeMaxAddr, BBAddrMax);
      }
    }
  }

  outs() << format("  There are %zu functions;", NumFunctions)
         << format(" %zu (%.2lf%%) are in the hot section,", NumHotFunctions,
                   100.0 * NumHotFunctions / NumFunctions)
         << format(" %zu (%.2lf%%) have profile\n", NumProfiledFunctions,
                   100.0 * NumProfiledFunctions / NumFunctions);
  outs() << format("  There are %zu basic blocks;", NumBlocks)
         << format(" %zu (%.2lf%%) are in the hot section\n", NumHotBlocks,
                   100.0 * NumHotBlocks / NumBlocks);

  assert(TotalCodeMinAddr <= TotalCodeMaxAddr && "incorrect output addresses");
  size_t HotCodeSize = HotCodeMaxAddr - HotCodeMinAddr;
  size_t TotalCodeSize = TotalCodeMaxAddr - TotalCodeMinAddr;

  const size_t HugePage2MB = 2 << 20;
  outs() << format("  Hot code takes %.2lf%% of binary (%zu bytes out of %zu, "
                   "%.2lf huge pages)\n",
                   100.0 * HotCodeSize / TotalCodeSize, HotCodeSize,
                   TotalCodeSize, double(HotCodeSize) / HugePage2MB);

  // Stats related to expected cache performance
  std::unordered_map<BinaryBasicBlock *, uint64_t> BBAddr;
  std::unordered_map<BinaryBasicBlock *, uint64_t> BBSize;
  extractBasicBlockInfo(BFs, BBAddr, BBSize);

  outs() << "  Expected i-TLB cache hit ratio: "
         << format("%.2lf%%\n", expectedCacheHitRatio(BFs, BBAddr, BBSize));

  outs() << "  TSP score: "
         << format("%.0lf\n", calcTSPScore(BFs, BBAddr, BBSize));

  outs() << "  ExtTSP score: "
         << format("%.0lf\n", calcExtTSPScore(BFs, BBAddr, BBSize));
}
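
// For reference, the report printed by printAll has the following shape (the
// numbers are placeholders rather than real measurements):
//   There are <N> functions; <H> (..%) are in the hot section, <P> (..%) have
//   profile
//   There are <B> basic blocks; <HB> (..%) are in the hot section
//   Hot code takes ..% of binary (<X> bytes out of <Y>, <Z> huge pages)
//   Expected i-TLB cache hit ratio: ..%
//   TSP score: <S1>
//   ExtTSP score: <S2>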