1 //===- bolt/Passes/HFSortPlus.cpp - Order functions by hotness ------------===// 2 // 3 // Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. 4 // See https://llvm.org/LICENSE.txt for license information. 5 // SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception 6 // 7 //===----------------------------------------------------------------------===// 8 // 9 // hfsort+ - layout of hot functions with i-TLB cache optimization. 10 // 11 // Given an ordering of hot functions (and hence, their assignment to the 12 // i-TLB pages), we can divide all functions calls Into two categories: 13 // - 'short' ones that have a caller-callee distance less than a page; 14 // - 'long' ones where the distance exceeds a page. 15 // The short calls are likely to result in a i-TLB cache hit. For the long ones, 16 // the hit/miss result depends on the 'hotness' of the page (i.e., how often 17 // the page is accessed). Assuming that functions are sent to the i-TLB cache 18 // in a random order, the probability that a page is present in the cache is 19 // proportional to the number of samples corresponding to the functions on the 20 // page. The following algorithm detects short and long calls, and optimizes 21 // the expected number of cache misses for the long ones. 22 // 23 //===----------------------------------------------------------------------===// 24 25 #include "bolt/Passes/HFSort.h" 26 #include "llvm/Support/CommandLine.h" 27 #include <cmath> 28 #include <set> 29 #include <vector> 30 31 #define DEBUG_TYPE "hfsort" 32 33 using namespace llvm; 34 using namespace bolt; 35 36 namespace opts { 37 38 extern cl::OptionCategory BoltOptCategory; 39 40 cl::opt<unsigned> ITLBPageSize("itlb-page-size", 41 cl::desc("The size of i-tlb cache page"), 42 cl::init(4096), cl::ReallyHidden, 43 cl::cat(BoltOptCategory)); 44 45 cl::opt<unsigned> ITLBEntries("itlb-entries", 46 cl::desc("The number of entries in i-tlb cache"), 47 cl::init(16), cl::ReallyHidden, 48 cl::cat(BoltOptCategory)); 49 50 static cl::opt<unsigned> ITLBDensity("itlb-density", 51 cl::desc("The density of i-tlb cache"), 52 cl::init(4096), cl::ReallyHidden, 53 cl::cat(BoltOptCategory)); 54 55 static cl::opt<double> MergeProbability( 56 "merge-probability", 57 cl::desc("The minimum probability of a call for merging two clusters"), 58 cl::init(0.9), cl::ReallyHidden, cl::cat(BoltOptCategory)); 59 60 static cl::opt<double> ArcThreshold( 61 "arc-threshold", 62 cl::desc("The threshold for ignoring arcs with a small relative weight"), 63 cl::init(0.00000001), cl::ReallyHidden, cl::cat(BoltOptCategory)); 64 65 } // namespace opts 66 67 namespace llvm { 68 namespace bolt { 69 70 using NodeId = CallGraph::NodeId; 71 using Arc = CallGraph::Arc; 72 73 namespace { 74 75 class Edge; 76 using ArcList = std::vector<const Arc *>; 77 78 // A chain (ordered sequence) of nodes (functions) in the call graph 79 class Chain { 80 public: 81 Chain(const Chain &) = delete; 82 Chain(Chain &&) = default; 83 Chain &operator=(const Chain &) = delete; 84 Chain &operator=(Chain &&) = default; 85 86 explicit Chain(size_t Id_, NodeId Node, size_t Samples_, size_t Size_) 87 : Id(Id_), Samples(Samples_), Size(Size_), Nodes(1, Node) {} 88 89 double density() const { return static_cast<double>(Samples) / Size; } 90 91 Edge *getEdge(Chain *Other) const { 92 for (std::pair<Chain *, Edge *> It : Edges) 93 if (It.first == Other) 94 return It.second; 95 return nullptr; 96 } 97 98 void removeEdge(Chain *Other) { 99 auto It = Edges.begin(); 100 while (It != Edges.end()) { 101 if (It->first == Other) { 102 Edges.erase(It); 103 return; 104 } 105 It++; 106 } 107 } 108 109 void addEdge(Chain *Other, Edge *Edge) { Edges.emplace_back(Other, Edge); } 110 111 void merge(Chain *Other) { 112 Nodes.insert(Nodes.end(), Other->Nodes.begin(), Other->Nodes.end()); 113 Samples += Other->Samples; 114 Size += Other->Size; 115 } 116 117 void mergeEdges(Chain *Other); 118 119 void clear() { 120 Nodes.clear(); 121 Edges.clear(); 122 } 123 124 public: 125 size_t Id; 126 uint64_t Samples; 127 uint64_t Size; 128 // Cached score for the chain 129 double Score{0}; 130 // Cached short-calls for the chain 131 double ShortCalls{0}; 132 // Nodes in the chain 133 std::vector<NodeId> Nodes; 134 // Adjacent chains and corresponding edges (lists of arcs) 135 std::vector<std::pair<Chain *, Edge *>> Edges; 136 }; 137 138 // An edge in the call graph representing Arcs between two Chains. 139 // When functions are merged Into chains, the edges are combined too so that 140 // there is always at most one edge between a pair of chains 141 class Edge { 142 public: 143 Edge(const Edge &) = delete; 144 Edge(Edge &&) = default; 145 Edge &operator=(const Edge &) = delete; 146 Edge &operator=(Edge &&) = default; 147 148 explicit Edge(Chain *SrcChain_, Chain *DstChain_, const Arc *A) 149 : SrcChain(SrcChain_), DstChain(DstChain_), Arcs(1, A) {} 150 151 void changeEndpoint(Chain *From, Chain *To) { 152 if (From == SrcChain) 153 SrcChain = To; 154 if (From == DstChain) 155 DstChain = To; 156 } 157 158 void moveArcs(Edge *Other) { 159 Arcs.insert(Arcs.end(), Other->Arcs.begin(), Other->Arcs.end()); 160 Other->Arcs.clear(); 161 } 162 163 void setMergeGain(Chain *PredChain, double ForwardGain, double BackwardGain) { 164 // When forward and backward gains are the same, prioritize merging that 165 // preserves the original order of the functions in the binary 166 if (std::abs(ForwardGain - BackwardGain) < 1e-8) { 167 if (SrcChain->Id < DstChain->Id) { 168 IsGainForward = true; 169 CachedGain = PredChain == SrcChain ? ForwardGain : BackwardGain; 170 } else { 171 IsGainForward = false; 172 CachedGain = PredChain == SrcChain ? BackwardGain : ForwardGain; 173 } 174 } else if (ForwardGain > BackwardGain) { 175 IsGainForward = PredChain == SrcChain; 176 CachedGain = ForwardGain; 177 } else { 178 IsGainForward = PredChain != SrcChain; 179 CachedGain = BackwardGain; 180 } 181 } 182 183 double gain() const { return CachedGain; } 184 185 Chain *predChain() const { return IsGainForward ? SrcChain : DstChain; } 186 187 Chain *succChain() const { return IsGainForward ? DstChain : SrcChain; } 188 189 private: 190 Chain *SrcChain{nullptr}; 191 Chain *DstChain{nullptr}; 192 193 public: 194 // Original arcs in the binary with corresponding execution counts 195 ArcList Arcs; 196 // Cached gain of merging the pair of chains 197 double CachedGain{-1.0}; 198 // Since the gain of merging (Src, Dst) and (Dst, Src) might be different, 199 // we store a flag indicating which of the options results in a higher gain 200 bool IsGainForward; 201 }; 202 203 void Chain::mergeEdges(Chain *Other) { 204 // Update edges adjacent to chain other 205 for (auto EdgeIt : Other->Edges) { 206 Chain *const DstChain = EdgeIt.first; 207 Edge *const DstEdge = EdgeIt.second; 208 Chain *const TargetChain = DstChain == Other ? this : DstChain; 209 210 // Find the corresponding edge in the current chain 211 Edge *CurEdge = getEdge(TargetChain); 212 if (CurEdge == nullptr) { 213 DstEdge->changeEndpoint(Other, this); 214 this->addEdge(TargetChain, DstEdge); 215 if (DstChain != this && DstChain != Other) 216 DstChain->addEdge(this, DstEdge); 217 } else { 218 CurEdge->moveArcs(DstEdge); 219 } 220 // Cleanup leftover edge 221 if (DstChain != Other) 222 DstChain->removeEdge(Other); 223 } 224 } 225 226 class HFSortPlus { 227 public: 228 explicit HFSortPlus(const CallGraph &Cg) : Cg(Cg) { initialize(); } 229 230 /// Run the algorithm and return ordered set of function clusters. 231 std::vector<Cluster> run() { 232 // Pass 1 233 runPassOne(); 234 235 // Pass 2 236 runPassTwo(); 237 238 outs() << "BOLT-INFO: hfsort+ reduced the number of chains from " 239 << Cg.numNodes() << " to " << HotChains.size() << "\n"; 240 241 // Sorting chains by density in decreasing order 242 auto DensityComparator = [](const Chain *L, const Chain *R) { 243 if (L->density() != R->density()) 244 return L->density() > R->density(); 245 // Making sure the comparison is deterministic 246 return L->Id < R->Id; 247 }; 248 std::stable_sort(HotChains.begin(), HotChains.end(), DensityComparator); 249 250 // Return the set of clusters that are left, which are the ones that 251 // didn't get merged (so their first func is its original func) 252 std::vector<Cluster> Clusters; 253 Clusters.reserve(HotChains.size()); 254 for (Chain *Chain : HotChains) 255 Clusters.emplace_back(Cluster(Chain->Nodes, Cg)); 256 return Clusters; 257 } 258 259 private: 260 /// Initialize the set of active chains, function id to chain mapping, 261 /// total number of samples and function addresses. 262 void initialize() { 263 OutWeight.resize(Cg.numNodes(), 0); 264 InWeight.resize(Cg.numNodes(), 0); 265 AllChains.reserve(Cg.numNodes()); 266 HotChains.reserve(Cg.numNodes()); 267 NodeChain.resize(Cg.numNodes(), nullptr); 268 Addr.resize(Cg.numNodes(), 0); 269 270 // Initialize chains 271 for (NodeId F = 0; F < Cg.numNodes(); ++F) { 272 AllChains.emplace_back(F, F, Cg.samples(F), Cg.size(F)); 273 HotChains.push_back(&AllChains.back()); 274 NodeChain[F] = &AllChains.back(); 275 TotalSamples += Cg.samples(F); 276 for (NodeId Succ : Cg.successors(F)) { 277 if (F == Succ) 278 continue; 279 const Arc &Arc = *Cg.findArc(F, Succ); 280 OutWeight[F] += Arc.weight(); 281 InWeight[Succ] += Arc.weight(); 282 } 283 } 284 285 AllEdges.reserve(Cg.numArcs()); 286 for (NodeId F = 0; F < Cg.numNodes(); ++F) { 287 for (NodeId Succ : Cg.successors(F)) { 288 if (F == Succ) 289 continue; 290 const Arc &Arc = *Cg.findArc(F, Succ); 291 if (Arc.weight() == 0.0 || 292 Arc.weight() / TotalSamples < opts::ArcThreshold) { 293 continue; 294 } 295 296 Edge *CurEdge = NodeChain[F]->getEdge(NodeChain[Succ]); 297 if (CurEdge != nullptr) { 298 // This edge is already present in the graph 299 assert(NodeChain[Succ]->getEdge(NodeChain[F]) != nullptr); 300 CurEdge->Arcs.push_back(&Arc); 301 } else { 302 // This is a new edge 303 AllEdges.emplace_back(NodeChain[F], NodeChain[Succ], &Arc); 304 NodeChain[F]->addEdge(NodeChain[Succ], &AllEdges.back()); 305 NodeChain[Succ]->addEdge(NodeChain[F], &AllEdges.back()); 306 } 307 } 308 } 309 310 for (Chain *&Chain : HotChains) { 311 Chain->ShortCalls = shortCalls(Chain); 312 Chain->Score = score(Chain); 313 } 314 } 315 316 /// The probability that a page with a given density is not in the cache. 317 /// 318 /// Assume that the hot functions are called in a random order; then the 319 /// probability of an i-TLB page being accessed after a function call is 320 /// p = pageSamples / TotalSamples. The probability that the page is not 321 /// accessed is (1 - p), and the probability that it is not in the cache 322 /// (i.e. not accessed during the last kCacheEntries function calls) 323 /// is (1 - p)^kCacheEntries 324 double missProbability(double ChainDensity) const { 325 double PageSamples = ChainDensity * opts::ITLBDensity; 326 327 if (PageSamples >= TotalSamples) 328 return 0; 329 330 double P = PageSamples / TotalSamples; 331 return pow(1.0 - P, double(opts::ITLBEntries)); 332 } 333 334 /// The expected number of calls on different i-TLB pages for an arc of the 335 /// call graph with a specified weight 336 double expectedCalls(uint64_t SrcAddr, uint64_t DstAddr, 337 double Weight) const { 338 uint64_t Dist = SrcAddr >= DstAddr ? SrcAddr - DstAddr : DstAddr - SrcAddr; 339 if (Dist >= opts::ITLBPageSize) 340 return 0; 341 342 double D = double(Dist) / double(opts::ITLBPageSize); 343 // Increasing the importance of shorter calls 344 return (1.0 - D * D) * Weight; 345 } 346 347 /// The expected number of calls within a given chain with both endpoints on 348 /// the same cache page 349 double shortCalls(Chain *Chain) const { 350 Edge *Edge = Chain->getEdge(Chain); 351 if (Edge == nullptr) 352 return 0; 353 354 double Calls = 0; 355 for (const Arc *Arc : Edge->Arcs) { 356 uint64_t SrcAddr = Addr[Arc->src()] + uint64_t(Arc->avgCallOffset()); 357 uint64_t DstAddr = Addr[Arc->dst()]; 358 Calls += expectedCalls(SrcAddr, DstAddr, Arc->weight()); 359 } 360 return Calls; 361 } 362 363 /// The number of calls between the two chains with both endpoints on 364 /// the same i-TLB page, assuming that a given pair of chains gets merged 365 double shortCalls(Chain *ChainPred, Chain *ChainSucc, Edge *Edge) const { 366 double Calls = 0; 367 for (const Arc *Arc : Edge->Arcs) { 368 Chain *SrcChain = NodeChain[Arc->src()]; 369 uint64_t SrcAddr; 370 uint64_t DstAddr; 371 if (SrcChain == ChainPred) { 372 SrcAddr = Addr[Arc->src()] + uint64_t(Arc->avgCallOffset()); 373 DstAddr = Addr[Arc->dst()] + ChainPred->Size; 374 } else { 375 SrcAddr = 376 Addr[Arc->src()] + uint64_t(Arc->avgCallOffset()) + ChainPred->Size; 377 DstAddr = Addr[Arc->dst()]; 378 } 379 Calls += expectedCalls(SrcAddr, DstAddr, Arc->weight()); 380 } 381 382 Calls += ChainPred->ShortCalls; 383 Calls += ChainSucc->ShortCalls; 384 385 return Calls; 386 } 387 388 double score(Chain *Chain) const { 389 double LongCalls = Chain->Samples - Chain->ShortCalls; 390 return LongCalls * missProbability(Chain->density()); 391 } 392 393 /// The gain of merging two chains. 394 /// 395 /// We assume that the final chains are sorted by their density, and hence 396 /// every chain is likely to be adjacent with chains of the same density. 397 /// Thus, the 'hotness' of every chain can be estimated by density*pageSize, 398 /// which is used to compute the probability of cache misses for long calls 399 /// of a given chain. 400 /// The result is also scaled by the size of the resulting chain in order to 401 /// increase the chance of merging short chains, which is helpful for 402 /// the i-cache performance. 403 double mergeGain(Chain *ChainPred, Chain *ChainSucc, Edge *Edge) const { 404 // Cache misses on the chains before merging 405 double CurScore = ChainPred->Score + ChainSucc->Score; 406 407 // Cache misses on the merged chain 408 double LongCalls = ChainPred->Samples + ChainSucc->Samples - 409 shortCalls(ChainPred, ChainSucc, Edge); 410 const double MergedSamples = ChainPred->Samples + ChainSucc->Samples; 411 const double MergedSize = ChainPred->Size + ChainSucc->Size; 412 double NewScore = LongCalls * missProbability(MergedSamples / MergedSize); 413 414 double Gain = CurScore - NewScore; 415 // Scale the result to increase the importance of merging short chains 416 Gain /= std::min(ChainPred->Size, ChainSucc->Size); 417 418 return Gain; 419 } 420 421 /// Run the first optimization pass of the algorithm: 422 /// Merge chains that call each other with a high probability. 423 void runPassOne() { 424 // Find candidate pairs of chains for merging 425 std::vector<const Arc *> ArcsToMerge; 426 for (Chain *ChainPred : HotChains) { 427 NodeId F = ChainPred->Nodes.back(); 428 for (NodeId Succ : Cg.successors(F)) { 429 if (F == Succ) 430 continue; 431 432 const Arc &Arc = *Cg.findArc(F, Succ); 433 if (Arc.weight() == 0.0 || 434 Arc.weight() / TotalSamples < opts::ArcThreshold) 435 continue; 436 437 const double CallsFromPred = OutWeight[F]; 438 const double CallsToSucc = InWeight[Succ]; 439 const double CallsPredSucc = Arc.weight(); 440 441 // Probability that the first chain is calling the second one 442 const double ProbOut = 443 CallsFromPred > 0 ? CallsPredSucc / CallsFromPred : 0; 444 assert(0.0 <= ProbOut && ProbOut <= 1.0 && "incorrect out-probability"); 445 446 // Probability that the second chain is called From the first one 447 const double ProbIn = CallsToSucc > 0 ? CallsPredSucc / CallsToSucc : 0; 448 assert(0.0 <= ProbIn && ProbIn <= 1.0 && "incorrect in-probability"); 449 450 if (std::min(ProbOut, ProbIn) >= opts::MergeProbability) 451 ArcsToMerge.push_back(&Arc); 452 } 453 } 454 455 // Sort the pairs by the weight in reverse order 456 std::sort( 457 ArcsToMerge.begin(), ArcsToMerge.end(), 458 [](const Arc *L, const Arc *R) { return L->weight() > R->weight(); }); 459 460 // Merge the pairs of chains 461 for (const Arc *Arc : ArcsToMerge) { 462 Chain *ChainPred = NodeChain[Arc->src()]; 463 Chain *ChainSucc = NodeChain[Arc->dst()]; 464 if (ChainPred == ChainSucc) 465 continue; 466 if (ChainPred->Nodes.back() == Arc->src() && 467 ChainSucc->Nodes.front() == Arc->dst()) 468 mergeChains(ChainPred, ChainSucc); 469 } 470 } 471 472 /// Run the second optimization pass of the hfsort+ algorithm: 473 /// Merge pairs of chains while there is an improvement in the 474 /// expected cache miss ratio. 475 void runPassTwo() { 476 // Creating a priority queue containing all edges ordered by the merge gain 477 auto GainComparator = [](Edge *L, Edge *R) { 478 if (std::abs(L->gain() - R->gain()) > 1e-8) 479 return L->gain() > R->gain(); 480 481 // Making sure the comparison is deterministic 482 if (L->predChain()->Id != R->predChain()->Id) 483 return L->predChain()->Id < R->predChain()->Id; 484 485 return L->succChain()->Id < R->succChain()->Id; 486 }; 487 std::set<Edge *, decltype(GainComparator)> Queue(GainComparator); 488 489 // Inserting the edges Into the queue 490 for (Chain *ChainPred : HotChains) { 491 for (auto EdgeIt : ChainPred->Edges) { 492 Chain *ChainSucc = EdgeIt.first; 493 Edge *ChainEdge = EdgeIt.second; 494 // Ignore loop edges 495 if (ChainPred == ChainSucc) 496 continue; 497 // Ignore already processed edges 498 if (ChainEdge->gain() != -1.0) 499 continue; 500 501 // Compute the gain of merging the two chains 502 auto ForwardGain = mergeGain(ChainPred, ChainSucc, ChainEdge); 503 auto BackwardGain = mergeGain(ChainSucc, ChainPred, ChainEdge); 504 ChainEdge->setMergeGain(ChainPred, ForwardGain, BackwardGain); 505 if (ChainEdge->gain() > 0.0) 506 Queue.insert(ChainEdge); 507 } 508 } 509 510 // Merge the chains while the gain of merging is positive 511 while (!Queue.empty()) { 512 // Extract the best (top) edge for merging 513 Edge *It = *Queue.begin(); 514 Queue.erase(Queue.begin()); 515 Edge *BestEdge = It; 516 Chain *BestChainPred = BestEdge->predChain(); 517 Chain *BestChainSucc = BestEdge->succChain(); 518 if (BestChainPred == BestChainSucc || BestEdge->gain() <= 0.0) 519 continue; 520 521 // Remove outdated edges 522 for (std::pair<Chain *, Edge *> EdgeIt : BestChainPred->Edges) 523 Queue.erase(EdgeIt.second); 524 for (std::pair<Chain *, Edge *> EdgeIt : BestChainSucc->Edges) 525 Queue.erase(EdgeIt.second); 526 527 // Merge the best pair of chains 528 mergeChains(BestChainPred, BestChainSucc); 529 530 // Insert newly created edges Into the queue 531 for (auto EdgeIt : BestChainPred->Edges) { 532 Chain *ChainSucc = EdgeIt.first; 533 Edge *ChainEdge = EdgeIt.second; 534 // Ignore loop edges 535 if (BestChainPred == ChainSucc) 536 continue; 537 538 // Compute the gain of merging the two chains 539 auto ForwardGain = mergeGain(BestChainPred, ChainSucc, ChainEdge); 540 auto BackwardGain = mergeGain(ChainSucc, BestChainPred, ChainEdge); 541 ChainEdge->setMergeGain(BestChainPred, ForwardGain, BackwardGain); 542 if (ChainEdge->gain() > 0.0) 543 Queue.insert(ChainEdge); 544 } 545 } 546 } 547 548 /// Merge chain From into chain Into and update the list of active chains. 549 void mergeChains(Chain *Into, Chain *From) { 550 assert(Into != From && "cannot merge a chain with itself"); 551 Into->merge(From); 552 553 // Update the chains and addresses for functions merged from From 554 size_t CurAddr = 0; 555 for (NodeId F : Into->Nodes) { 556 NodeChain[F] = Into; 557 Addr[F] = CurAddr; 558 CurAddr += Cg.size(F); 559 } 560 561 // Merge edges 562 Into->mergeEdges(From); 563 From->clear(); 564 565 // Update cached scores for the new chain 566 Into->ShortCalls = shortCalls(Into); 567 Into->Score = score(Into); 568 569 // Remove chain From From the list of active chains 570 auto it = std::remove(HotChains.begin(), HotChains.end(), From); 571 HotChains.erase(it, HotChains.end()); 572 } 573 574 private: 575 // The call graph 576 const CallGraph &Cg; 577 578 // All chains of functions 579 std::vector<Chain> AllChains; 580 581 // Active chains. The vector gets updated at runtime when chains are merged 582 std::vector<Chain *> HotChains; 583 584 // All edges between chains 585 std::vector<Edge> AllEdges; 586 587 // Node_id => chain 588 std::vector<Chain *> NodeChain; 589 590 // Current address of the function From the beginning of its chain 591 std::vector<uint64_t> Addr; 592 593 // Total weight of outgoing arcs for each function 594 std::vector<double> OutWeight; 595 596 // Total weight of incoming arcs for each function 597 std::vector<double> InWeight; 598 // The total number of samples in the graph 599 double TotalSamples{0}; 600 }; 601 602 } // end anonymous namespace 603 604 std::vector<Cluster> hfsortPlus(CallGraph &Cg) { 605 // It is required that the sum of incoming arc weights is not greater 606 // than the number of samples for every function. 607 // Ensuring the call graph obeys the property before running the algorithm. 608 Cg.adjustArcWeights(); 609 return HFSortPlus(Cg).run(); 610 } 611 612 } // namespace bolt 613 } // namespace llvm 614