1 //===- bolt/Passes/HFSortPlus.cpp - Order functions by hotness ------------===// 2 // 3 // Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. 4 // See https://llvm.org/LICENSE.txt for license information. 5 // SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception 6 // 7 //===----------------------------------------------------------------------===// 8 // 9 // hfsort+ - layout of hot functions with i-TLB cache optimization. 10 // 11 // Given an ordering of hot functions (and hence, their assignment to the 12 // i-TLB pages), we can divide all functions calls Into two categories: 13 // - 'short' ones that have a caller-callee distance less than a page; 14 // - 'long' ones where the distance exceeds a page. 15 // The short calls are likely to result in a i-TLB cache hit. For the long ones, 16 // the hit/miss result depends on the 'hotness' of the page (i.e., how often 17 // the page is accessed). Assuming that functions are sent to the i-TLB cache 18 // in a random order, the probability that a page is present in the cache is 19 // proportional to the number of samples corresponding to the functions on the 20 // page. The following algorithm detects short and long calls, and optimizes 21 // the expected number of cache misses for the long ones. 22 // 23 //===----------------------------------------------------------------------===// 24 25 #include "bolt/Passes/HFSort.h" 26 #include "llvm/Support/CommandLine.h" 27 #include <cmath> 28 #include <set> 29 #include <vector> 30 31 #define DEBUG_TYPE "hfsort" 32 33 using namespace llvm; 34 using namespace bolt; 35 36 namespace opts { 37 38 extern cl::OptionCategory BoltOptCategory; 39 40 cl::opt<unsigned> 41 ITLBPageSize("itlb-page-size", 42 cl::desc("The size of i-tlb cache page"), 43 cl::init(4096), 44 cl::ReallyHidden, 45 cl::ZeroOrMore, 46 cl::cat(BoltOptCategory)); 47 48 cl::opt<unsigned> 49 ITLBEntries("itlb-entries", 50 cl::desc("The number of entries in i-tlb cache"), 51 cl::init(16), 52 cl::ReallyHidden, 53 cl::ZeroOrMore, 54 cl::cat(BoltOptCategory)); 55 56 static cl::opt<unsigned> 57 ITLBDensity("itlb-density", 58 cl::desc("The density of i-tlb cache"), 59 cl::init(4096), 60 cl::ReallyHidden, 61 cl::ZeroOrMore, 62 cl::cat(BoltOptCategory)); 63 64 static cl::opt<double> 65 MergeProbability("merge-probability", 66 cl::desc("The minimum probability of a call for merging two clusters"), 67 cl::init(0.9), 68 cl::ReallyHidden, 69 cl::ZeroOrMore, 70 cl::cat(BoltOptCategory)); 71 72 static cl::opt<double> 73 ArcThreshold("arc-threshold", 74 cl::desc("The threshold for ignoring arcs with a small relative weight"), 75 cl::init(0.00000001), 76 cl::ReallyHidden, 77 cl::ZeroOrMore, 78 cl::cat(BoltOptCategory)); 79 80 } // namespace opts 81 82 namespace llvm { 83 namespace bolt { 84 85 using NodeId = CallGraph::NodeId; 86 using Arc = CallGraph::Arc; 87 88 namespace { 89 90 class Edge; 91 using ArcList = std::vector<const Arc *>; 92 93 // A chain (ordered sequence) of nodes (functions) in the call graph 94 class Chain { 95 public: 96 Chain(const Chain &) = delete; 97 Chain(Chain &&) = default; 98 Chain &operator=(const Chain &) = delete; 99 Chain &operator=(Chain &&) = default; 100 101 explicit Chain(size_t Id_, NodeId Node, size_t Samples_, size_t Size_) 102 : Id(Id_), Samples(Samples_), Size(Size_), Nodes(1, Node) {} 103 104 double density() const { return static_cast<double>(Samples) / Size; } 105 106 Edge *getEdge(Chain *Other) const { 107 for (std::pair<Chain *, Edge *> It : Edges) 108 if (It.first == Other) 109 return It.second; 110 return nullptr; 111 } 112 113 void removeEdge(Chain *Other) { 114 auto It = Edges.begin(); 115 while (It != Edges.end()) { 116 if (It->first == Other) { 117 Edges.erase(It); 118 return; 119 } 120 It++; 121 } 122 } 123 124 void addEdge(Chain *Other, Edge *Edge) { Edges.emplace_back(Other, Edge); } 125 126 void merge(Chain *Other) { 127 Nodes.insert(Nodes.end(), Other->Nodes.begin(), Other->Nodes.end()); 128 Samples += Other->Samples; 129 Size += Other->Size; 130 } 131 132 void mergeEdges(Chain *Other); 133 134 void clear() { 135 Nodes.clear(); 136 Edges.clear(); 137 } 138 139 public: 140 size_t Id; 141 uint64_t Samples; 142 uint64_t Size; 143 // Cached score for the chain 144 double Score{0}; 145 // Cached short-calls for the chain 146 double ShortCalls{0}; 147 // Nodes in the chain 148 std::vector<NodeId> Nodes; 149 // Adjacent chains and corresponding edges (lists of arcs) 150 std::vector<std::pair<Chain *, Edge *>> Edges; 151 }; 152 153 // An edge in the call graph representing Arcs between two Chains. 154 // When functions are merged Into chains, the edges are combined too so that 155 // there is always at most one edge between a pair of chains 156 class Edge { 157 public: 158 Edge(const Edge &) = delete; 159 Edge(Edge &&) = default; 160 Edge &operator=(const Edge &) = delete; 161 Edge &operator=(Edge &&) = default; 162 163 explicit Edge(Chain *SrcChain_, Chain *DstChain_, const Arc *A) 164 : SrcChain(SrcChain_), DstChain(DstChain_), Arcs(1, A) {} 165 166 void changeEndpoint(Chain *From, Chain *To) { 167 if (From == SrcChain) 168 SrcChain = To; 169 if (From == DstChain) 170 DstChain = To; 171 } 172 173 void moveArcs(Edge *Other) { 174 Arcs.insert(Arcs.end(), Other->Arcs.begin(), Other->Arcs.end()); 175 Other->Arcs.clear(); 176 } 177 178 void setMergeGain(Chain *PredChain, double ForwardGain, double BackwardGain) { 179 // When forward and backward gains are the same, prioritize merging that 180 // preserves the original order of the functions in the binary 181 if (std::abs(ForwardGain - BackwardGain) < 1e-8) { 182 if (SrcChain->Id < DstChain->Id) { 183 IsGainForward = true; 184 CachedGain = PredChain == SrcChain ? ForwardGain : BackwardGain; 185 } else { 186 IsGainForward = false; 187 CachedGain = PredChain == SrcChain ? BackwardGain : ForwardGain; 188 } 189 } else if (ForwardGain > BackwardGain) { 190 IsGainForward = PredChain == SrcChain; 191 CachedGain = ForwardGain; 192 } else { 193 IsGainForward = PredChain != SrcChain; 194 CachedGain = BackwardGain; 195 } 196 } 197 198 double gain() const { return CachedGain; } 199 200 Chain *predChain() const { return IsGainForward ? SrcChain : DstChain; } 201 202 Chain *succChain() const { return IsGainForward ? DstChain : SrcChain; } 203 204 private: 205 Chain *SrcChain{nullptr}; 206 Chain *DstChain{nullptr}; 207 208 public: 209 // Original arcs in the binary with corresponding execution counts 210 ArcList Arcs; 211 // Cached gain of merging the pair of chains 212 double CachedGain{-1.0}; 213 // Since the gain of merging (Src, Dst) and (Dst, Src) might be different, 214 // we store a flag indicating which of the options results in a higher gain 215 bool IsGainForward; 216 }; 217 218 void Chain::mergeEdges(Chain *Other) { 219 // Update edges adjacent to chain other 220 for (auto EdgeIt : Other->Edges) { 221 Chain *const DstChain = EdgeIt.first; 222 Edge *const DstEdge = EdgeIt.second; 223 Chain *const TargetChain = DstChain == Other ? this : DstChain; 224 225 // Find the corresponding edge in the current chain 226 Edge *CurEdge = getEdge(TargetChain); 227 if (CurEdge == nullptr) { 228 DstEdge->changeEndpoint(Other, this); 229 this->addEdge(TargetChain, DstEdge); 230 if (DstChain != this && DstChain != Other) 231 DstChain->addEdge(this, DstEdge); 232 } else { 233 CurEdge->moveArcs(DstEdge); 234 } 235 // Cleanup leftover edge 236 if (DstChain != Other) 237 DstChain->removeEdge(Other); 238 } 239 } 240 241 class HFSortPlus { 242 public: 243 explicit HFSortPlus(const CallGraph &Cg) : Cg(Cg) { initialize(); } 244 245 /// Run the algorithm and return ordered set of function clusters. 246 std::vector<Cluster> run() { 247 // Pass 1 248 runPassOne(); 249 250 // Pass 2 251 runPassTwo(); 252 253 outs() << "BOLT-INFO: hfsort+ reduced the number of chains from " 254 << Cg.numNodes() << " to " << HotChains.size() << "\n"; 255 256 // Sorting chains by density in decreasing order 257 auto DensityComparator = [](const Chain *L, const Chain *R) { 258 if (L->density() != R->density()) 259 return L->density() > R->density(); 260 // Making sure the comparison is deterministic 261 return L->Id < R->Id; 262 }; 263 std::stable_sort(HotChains.begin(), HotChains.end(), DensityComparator); 264 265 // Return the set of clusters that are left, which are the ones that 266 // didn't get merged (so their first func is its original func) 267 std::vector<Cluster> Clusters; 268 Clusters.reserve(HotChains.size()); 269 for (Chain *Chain : HotChains) 270 Clusters.emplace_back(Cluster(Chain->Nodes, Cg)); 271 return Clusters; 272 } 273 274 private: 275 /// Initialize the set of active chains, function id to chain mapping, 276 /// total number of samples and function addresses. 277 void initialize() { 278 OutWeight.resize(Cg.numNodes(), 0); 279 InWeight.resize(Cg.numNodes(), 0); 280 AllChains.reserve(Cg.numNodes()); 281 HotChains.reserve(Cg.numNodes()); 282 NodeChain.resize(Cg.numNodes(), nullptr); 283 Addr.resize(Cg.numNodes(), 0); 284 285 // Initialize chains 286 for (NodeId F = 0; F < Cg.numNodes(); ++F) { 287 AllChains.emplace_back(F, F, Cg.samples(F), Cg.size(F)); 288 HotChains.push_back(&AllChains.back()); 289 NodeChain[F] = &AllChains.back(); 290 TotalSamples += Cg.samples(F); 291 for (NodeId Succ : Cg.successors(F)) { 292 if (F == Succ) 293 continue; 294 const Arc &Arc = *Cg.findArc(F, Succ); 295 OutWeight[F] += Arc.weight(); 296 InWeight[Succ] += Arc.weight(); 297 } 298 } 299 300 AllEdges.reserve(Cg.numArcs()); 301 for (NodeId F = 0; F < Cg.numNodes(); ++F) { 302 for (NodeId Succ : Cg.successors(F)) { 303 if (F == Succ) 304 continue; 305 const Arc &Arc = *Cg.findArc(F, Succ); 306 if (Arc.weight() == 0.0 || 307 Arc.weight() / TotalSamples < opts::ArcThreshold) { 308 continue; 309 } 310 311 Edge *CurEdge = NodeChain[F]->getEdge(NodeChain[Succ]); 312 if (CurEdge != nullptr) { 313 // This edge is already present in the graph 314 assert(NodeChain[Succ]->getEdge(NodeChain[F]) != nullptr); 315 CurEdge->Arcs.push_back(&Arc); 316 } else { 317 // This is a new edge 318 AllEdges.emplace_back(NodeChain[F], NodeChain[Succ], &Arc); 319 NodeChain[F]->addEdge(NodeChain[Succ], &AllEdges.back()); 320 NodeChain[Succ]->addEdge(NodeChain[F], &AllEdges.back()); 321 } 322 } 323 } 324 325 for (Chain *&Chain : HotChains) { 326 Chain->ShortCalls = shortCalls(Chain); 327 Chain->Score = score(Chain); 328 } 329 } 330 331 /// The probability that a page with a given density is not in the cache. 332 /// 333 /// Assume that the hot functions are called in a random order; then the 334 /// probability of an i-TLB page being accessed after a function call is 335 /// p = pageSamples / TotalSamples. The probability that the page is not 336 /// accessed is (1 - p), and the probability that it is not in the cache 337 /// (i.e. not accessed during the last kCacheEntries function calls) 338 /// is (1 - p)^kCacheEntries 339 double missProbability(double ChainDensity) const { 340 double PageSamples = ChainDensity * opts::ITLBDensity; 341 342 if (PageSamples >= TotalSamples) 343 return 0; 344 345 double P = PageSamples / TotalSamples; 346 return pow(1.0 - P, double(opts::ITLBEntries)); 347 } 348 349 /// The expected number of calls on different i-TLB pages for an arc of the 350 /// call graph with a specified weight 351 double expectedCalls(uint64_t SrcAddr, uint64_t DstAddr, 352 double Weight) const { 353 uint64_t Dist = SrcAddr >= DstAddr ? SrcAddr - DstAddr : DstAddr - SrcAddr; 354 if (Dist >= opts::ITLBPageSize) 355 return 0; 356 357 double D = double(Dist) / double(opts::ITLBPageSize); 358 // Increasing the importance of shorter calls 359 return (1.0 - D * D) * Weight; 360 } 361 362 /// The expected number of calls within a given chain with both endpoints on 363 /// the same cache page 364 double shortCalls(Chain *Chain) const { 365 Edge *Edge = Chain->getEdge(Chain); 366 if (Edge == nullptr) 367 return 0; 368 369 double Calls = 0; 370 for (const Arc *Arc : Edge->Arcs) { 371 uint64_t SrcAddr = Addr[Arc->src()] + uint64_t(Arc->avgCallOffset()); 372 uint64_t DstAddr = Addr[Arc->dst()]; 373 Calls += expectedCalls(SrcAddr, DstAddr, Arc->weight()); 374 } 375 return Calls; 376 } 377 378 /// The number of calls between the two chains with both endpoints on 379 /// the same i-TLB page, assuming that a given pair of chains gets merged 380 double shortCalls(Chain *ChainPred, Chain *ChainSucc, Edge *Edge) const { 381 double Calls = 0; 382 for (const Arc *Arc : Edge->Arcs) { 383 Chain *SrcChain = NodeChain[Arc->src()]; 384 uint64_t SrcAddr; 385 uint64_t DstAddr; 386 if (SrcChain == ChainPred) { 387 SrcAddr = Addr[Arc->src()] + uint64_t(Arc->avgCallOffset()); 388 DstAddr = Addr[Arc->dst()] + ChainPred->Size; 389 } else { 390 SrcAddr = 391 Addr[Arc->src()] + uint64_t(Arc->avgCallOffset()) + ChainPred->Size; 392 DstAddr = Addr[Arc->dst()]; 393 } 394 Calls += expectedCalls(SrcAddr, DstAddr, Arc->weight()); 395 } 396 397 Calls += ChainPred->ShortCalls; 398 Calls += ChainSucc->ShortCalls; 399 400 return Calls; 401 } 402 403 double score(Chain *Chain) const { 404 double LongCalls = Chain->Samples - Chain->ShortCalls; 405 return LongCalls * missProbability(Chain->density()); 406 } 407 408 /// The gain of merging two chains. 409 /// 410 /// We assume that the final chains are sorted by their density, and hence 411 /// every chain is likely to be adjacent with chains of the same density. 412 /// Thus, the 'hotness' of every chain can be estimated by density*pageSize, 413 /// which is used to compute the probability of cache misses for long calls 414 /// of a given chain. 415 /// The result is also scaled by the size of the resulting chain in order to 416 /// increase the chance of merging short chains, which is helpful for 417 /// the i-cache performance. 418 double mergeGain(Chain *ChainPred, Chain *ChainSucc, Edge *Edge) const { 419 // Cache misses on the chains before merging 420 double CurScore = ChainPred->Score + ChainSucc->Score; 421 422 // Cache misses on the merged chain 423 double LongCalls = ChainPred->Samples + ChainSucc->Samples - 424 shortCalls(ChainPred, ChainSucc, Edge); 425 const double MergedSamples = ChainPred->Samples + ChainSucc->Samples; 426 const double MergedSize = ChainPred->Size + ChainSucc->Size; 427 double NewScore = LongCalls * missProbability(MergedSamples / MergedSize); 428 429 double Gain = CurScore - NewScore; 430 // Scale the result to increase the importance of merging short chains 431 Gain /= std::min(ChainPred->Size, ChainSucc->Size); 432 433 return Gain; 434 } 435 436 /// Run the first optimization pass of the algorithm: 437 /// Merge chains that call each other with a high probability. 438 void runPassOne() { 439 // Find candidate pairs of chains for merging 440 std::vector<const Arc *> ArcsToMerge; 441 for (Chain *ChainPred : HotChains) { 442 NodeId F = ChainPred->Nodes.back(); 443 for (NodeId Succ : Cg.successors(F)) { 444 if (F == Succ) 445 continue; 446 447 const Arc &Arc = *Cg.findArc(F, Succ); 448 if (Arc.weight() == 0.0 || 449 Arc.weight() / TotalSamples < opts::ArcThreshold) 450 continue; 451 452 const double CallsFromPred = OutWeight[F]; 453 const double CallsToSucc = InWeight[Succ]; 454 const double CallsPredSucc = Arc.weight(); 455 456 // Probability that the first chain is calling the second one 457 const double ProbOut = 458 CallsFromPred > 0 ? CallsPredSucc / CallsFromPred : 0; 459 assert(0.0 <= ProbOut && ProbOut <= 1.0 && "incorrect out-probability"); 460 461 // Probability that the second chain is called From the first one 462 const double ProbIn = CallsToSucc > 0 ? CallsPredSucc / CallsToSucc : 0; 463 assert(0.0 <= ProbIn && ProbIn <= 1.0 && "incorrect in-probability"); 464 465 if (std::min(ProbOut, ProbIn) >= opts::MergeProbability) 466 ArcsToMerge.push_back(&Arc); 467 } 468 } 469 470 // Sort the pairs by the weight in reverse order 471 std::sort( 472 ArcsToMerge.begin(), ArcsToMerge.end(), 473 [](const Arc *L, const Arc *R) { return L->weight() > R->weight(); }); 474 475 // Merge the pairs of chains 476 for (const Arc *Arc : ArcsToMerge) { 477 Chain *ChainPred = NodeChain[Arc->src()]; 478 Chain *ChainSucc = NodeChain[Arc->dst()]; 479 if (ChainPred == ChainSucc) 480 continue; 481 if (ChainPred->Nodes.back() == Arc->src() && 482 ChainSucc->Nodes.front() == Arc->dst()) 483 mergeChains(ChainPred, ChainSucc); 484 } 485 } 486 487 /// Run the second optimization pass of the hfsort+ algorithm: 488 /// Merge pairs of chains while there is an improvement in the 489 /// expected cache miss ratio. 490 void runPassTwo() { 491 // Creating a priority queue containing all edges ordered by the merge gain 492 auto GainComparator = [](Edge *L, Edge *R) { 493 if (std::abs(L->gain() - R->gain()) > 1e-8) 494 return L->gain() > R->gain(); 495 496 // Making sure the comparison is deterministic 497 if (L->predChain()->Id != R->predChain()->Id) 498 return L->predChain()->Id < R->predChain()->Id; 499 500 return L->succChain()->Id < R->succChain()->Id; 501 }; 502 std::set<Edge *, decltype(GainComparator)> Queue(GainComparator); 503 504 // Inserting the edges Into the queue 505 for (Chain *ChainPred : HotChains) { 506 for (auto EdgeIt : ChainPred->Edges) { 507 Chain *ChainSucc = EdgeIt.first; 508 Edge *ChainEdge = EdgeIt.second; 509 // Ignore loop edges 510 if (ChainPred == ChainSucc) 511 continue; 512 // Ignore already processed edges 513 if (ChainEdge->gain() != -1.0) 514 continue; 515 516 // Compute the gain of merging the two chains 517 auto ForwardGain = mergeGain(ChainPred, ChainSucc, ChainEdge); 518 auto BackwardGain = mergeGain(ChainSucc, ChainPred, ChainEdge); 519 ChainEdge->setMergeGain(ChainPred, ForwardGain, BackwardGain); 520 if (ChainEdge->gain() > 0.0) 521 Queue.insert(ChainEdge); 522 } 523 } 524 525 // Merge the chains while the gain of merging is positive 526 while (!Queue.empty()) { 527 // Extract the best (top) edge for merging 528 Edge *It = *Queue.begin(); 529 Queue.erase(Queue.begin()); 530 Edge *BestEdge = It; 531 Chain *BestChainPred = BestEdge->predChain(); 532 Chain *BestChainSucc = BestEdge->succChain(); 533 if (BestChainPred == BestChainSucc || BestEdge->gain() <= 0.0) 534 continue; 535 536 // Remove outdated edges 537 for (std::pair<Chain *, Edge *> EdgeIt : BestChainPred->Edges) 538 Queue.erase(EdgeIt.second); 539 for (std::pair<Chain *, Edge *> EdgeIt : BestChainSucc->Edges) 540 Queue.erase(EdgeIt.second); 541 542 // Merge the best pair of chains 543 mergeChains(BestChainPred, BestChainSucc); 544 545 // Insert newly created edges Into the queue 546 for (auto EdgeIt : BestChainPred->Edges) { 547 Chain *ChainSucc = EdgeIt.first; 548 Edge *ChainEdge = EdgeIt.second; 549 // Ignore loop edges 550 if (BestChainPred == ChainSucc) 551 continue; 552 553 // Compute the gain of merging the two chains 554 auto ForwardGain = mergeGain(BestChainPred, ChainSucc, ChainEdge); 555 auto BackwardGain = mergeGain(ChainSucc, BestChainPred, ChainEdge); 556 ChainEdge->setMergeGain(BestChainPred, ForwardGain, BackwardGain); 557 if (ChainEdge->gain() > 0.0) 558 Queue.insert(ChainEdge); 559 } 560 } 561 } 562 563 /// Merge chain From into chain Into and update the list of active chains. 564 void mergeChains(Chain *Into, Chain *From) { 565 assert(Into != From && "cannot merge a chain with itself"); 566 Into->merge(From); 567 568 // Update the chains and addresses for functions merged from From 569 size_t CurAddr = 0; 570 for (NodeId F : Into->Nodes) { 571 NodeChain[F] = Into; 572 Addr[F] = CurAddr; 573 CurAddr += Cg.size(F); 574 } 575 576 // Merge edges 577 Into->mergeEdges(From); 578 From->clear(); 579 580 // Update cached scores for the new chain 581 Into->ShortCalls = shortCalls(Into); 582 Into->Score = score(Into); 583 584 // Remove chain From From the list of active chains 585 auto it = std::remove(HotChains.begin(), HotChains.end(), From); 586 HotChains.erase(it, HotChains.end()); 587 } 588 589 private: 590 // The call graph 591 const CallGraph &Cg; 592 593 // All chains of functions 594 std::vector<Chain> AllChains; 595 596 // Active chains. The vector gets updated at runtime when chains are merged 597 std::vector<Chain *> HotChains; 598 599 // All edges between chains 600 std::vector<Edge> AllEdges; 601 602 // Node_id => chain 603 std::vector<Chain *> NodeChain; 604 605 // Current address of the function From the beginning of its chain 606 std::vector<uint64_t> Addr; 607 608 // Total weight of outgoing arcs for each function 609 std::vector<double> OutWeight; 610 611 // Total weight of incoming arcs for each function 612 std::vector<double> InWeight; 613 // The total number of samples in the graph 614 double TotalSamples{0}; 615 }; 616 617 } // end anonymous namespace 618 619 std::vector<Cluster> hfsortPlus(CallGraph &Cg) { 620 // It is required that the sum of incoming arc weights is not greater 621 // than the number of samples for every function. 622 // Ensuring the call graph obeys the property before running the algorithm. 623 Cg.adjustArcWeights(); 624 return HFSortPlus(Cg).run(); 625 } 626 627 } // namespace bolt 628 } // namespace llvm 629