1 //===- bolt/Passes/HFSortPlus.cpp - Order functions by hotness ------------===//
2 //
3 // Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
4 // See https://llvm.org/LICENSE.txt for license information.
5 // SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
6 //
7 //===----------------------------------------------------------------------===//
8 //
9 // hfsort+ - layout of hot functions with i-TLB cache optimization.
10 //
11 // Given an ordering of hot functions (and hence, their assignment to the
12 // i-TLB pages), we can divide all functions calls Into two categories:
13 // - 'short' ones that have a caller-callee distance less than a page;
14 // - 'long' ones where the distance exceeds a page.
15 // The short calls are likely to result in a i-TLB cache hit. For the long ones,
16 // the hit/miss result depends on the 'hotness' of the page (i.e., how often
17 // the page is accessed). Assuming that functions are sent to the i-TLB cache
18 // in a random order, the probability that a page is present in the cache is
19 // proportional to the number of samples corresponding to the functions on the
20 // page. The following algorithm detects short and long calls, and optimizes
21 // the expected number of cache misses for the long ones.
22 //
23 //===----------------------------------------------------------------------===//
24 
25 #include "bolt/Passes/HFSort.h"
26 #include "llvm/Support/CommandLine.h"
27 #include <cmath>
28 #include <set>
29 #include <vector>
30 
31 #define DEBUG_TYPE "hfsort"
32 
33 using namespace llvm;
34 using namespace bolt;
35 
36 namespace opts {
37 
38 extern cl::OptionCategory BoltOptCategory;
39 
40 cl::opt<unsigned>
41 ITLBPageSize("itlb-page-size",
42   cl::desc("The size of i-tlb cache page"),
43   cl::init(4096),
44   cl::ReallyHidden,
45   cl::ZeroOrMore,
46   cl::cat(BoltOptCategory));
47 
48 cl::opt<unsigned>
49 ITLBEntries("itlb-entries",
50   cl::desc("The number of entries in i-tlb cache"),
51   cl::init(16),
52   cl::ReallyHidden,
53   cl::ZeroOrMore,
54   cl::cat(BoltOptCategory));
55 
56 static cl::opt<unsigned>
57 ITLBDensity("itlb-density",
58   cl::desc("The density of i-tlb cache"),
59   cl::init(4096),
60   cl::ReallyHidden,
61   cl::ZeroOrMore,
62   cl::cat(BoltOptCategory));
63 
64 static cl::opt<double>
65 MergeProbability("merge-probability",
66   cl::desc("The minimum probability of a call for merging two clusters"),
67   cl::init(0.9),
68   cl::ReallyHidden,
69   cl::ZeroOrMore,
70   cl::cat(BoltOptCategory));
71 
72 static cl::opt<double>
73 ArcThreshold("arc-threshold",
74   cl::desc("The threshold for ignoring arcs with a small relative weight"),
75   cl::init(0.00000001),
76   cl::ReallyHidden,
77   cl::ZeroOrMore,
78   cl::cat(BoltOptCategory));
79 
80 } // namespace opts
81 
82 namespace llvm {
83 namespace bolt {
84 
85 using NodeId = CallGraph::NodeId;
86 using Arc = CallGraph::Arc;
87 
88 namespace {
89 
90 class Edge;
91 using ArcList = std::vector<const Arc *>;
92 
93 // A chain (ordered sequence) of nodes (functions) in the call graph
94 class Chain {
95 public:
96   Chain(const Chain &) = delete;
97   Chain(Chain &&) = default;
98   Chain &operator=(const Chain &) = delete;
99   Chain &operator=(Chain &&) = default;
100 
101   explicit Chain(size_t Id_, NodeId Node, size_t Samples_, size_t Size_)
102       : Id(Id_), Samples(Samples_), Size(Size_), Nodes(1, Node) {}
103 
104   double density() const { return static_cast<double>(Samples) / Size; }
105 
106   Edge *getEdge(Chain *Other) const {
107     for (std::pair<Chain *, Edge *> It : Edges)
108       if (It.first == Other)
109         return It.second;
110     return nullptr;
111   }
112 
113   void removeEdge(Chain *Other) {
114     auto It = Edges.begin();
115     while (It != Edges.end()) {
116       if (It->first == Other) {
117         Edges.erase(It);
118         return;
119       }
120       It++;
121     }
122   }
123 
124   void addEdge(Chain *Other, Edge *Edge) { Edges.emplace_back(Other, Edge); }
125 
126   void merge(Chain *Other) {
127     Nodes.insert(Nodes.end(), Other->Nodes.begin(), Other->Nodes.end());
128     Samples += Other->Samples;
129     Size += Other->Size;
130   }
131 
132   void mergeEdges(Chain *Other);
133 
134   void clear() {
135     Nodes.clear();
136     Edges.clear();
137   }
138 
139 public:
140   size_t Id;
141   uint64_t Samples;
142   uint64_t Size;
143   // Cached score for the chain
144   double Score{0};
145   // Cached short-calls for the chain
146   double ShortCalls{0};
147   // Nodes in the chain
148   std::vector<NodeId> Nodes;
149   // Adjacent chains and corresponding edges (lists of arcs)
150   std::vector<std::pair<Chain *, Edge *>> Edges;
151 };
152 
153 // An edge in the call graph representing Arcs between two Chains.
154 // When functions are merged Into chains, the edges are combined too so that
155 // there is always at most one edge between a pair of chains
156 class Edge {
157 public:
158   Edge(const Edge &) = delete;
159   Edge(Edge &&) = default;
160   Edge &operator=(const Edge &) = delete;
161   Edge &operator=(Edge &&) = default;
162 
163   explicit Edge(Chain *SrcChain_, Chain *DstChain_, const Arc *A)
164       : SrcChain(SrcChain_), DstChain(DstChain_), Arcs(1, A) {}
165 
166   void changeEndpoint(Chain *From, Chain *To) {
167     if (From == SrcChain)
168       SrcChain = To;
169     if (From == DstChain)
170       DstChain = To;
171   }
172 
173   void moveArcs(Edge *Other) {
174     Arcs.insert(Arcs.end(), Other->Arcs.begin(), Other->Arcs.end());
175     Other->Arcs.clear();
176   }
177 
178   void setMergeGain(Chain *PredChain, double ForwardGain, double BackwardGain) {
179     // When forward and backward gains are the same, prioritize merging that
180     // preserves the original order of the functions in the binary
181     if (std::abs(ForwardGain - BackwardGain) < 1e-8) {
182       if (SrcChain->Id < DstChain->Id) {
183         IsGainForward = true;
184         CachedGain = PredChain == SrcChain ? ForwardGain : BackwardGain;
185       } else {
186         IsGainForward = false;
187         CachedGain = PredChain == SrcChain ? BackwardGain : ForwardGain;
188       }
189     } else if (ForwardGain > BackwardGain) {
190       IsGainForward = PredChain == SrcChain;
191       CachedGain = ForwardGain;
192     } else {
193       IsGainForward = PredChain != SrcChain;
194       CachedGain = BackwardGain;
195     }
196   }
197 
198   double gain() const { return CachedGain; }
199 
200   Chain *predChain() const { return IsGainForward ? SrcChain : DstChain; }
201 
202   Chain *succChain() const { return IsGainForward ? DstChain : SrcChain; }
203 
204 private:
205   Chain *SrcChain{nullptr};
206   Chain *DstChain{nullptr};
207 
208 public:
209   // Original arcs in the binary with corresponding execution counts
210   ArcList Arcs;
211   // Cached gain of merging the pair of chains
212   double CachedGain{-1.0};
213   // Since the gain of merging (Src, Dst) and (Dst, Src) might be different,
214   // we store a flag indicating which of the options results in a higher gain
215   bool IsGainForward;
216 };
217 
218 void Chain::mergeEdges(Chain *Other) {
219   // Update edges adjacent to chain other
220   for (auto EdgeIt : Other->Edges) {
221     Chain *const DstChain = EdgeIt.first;
222     Edge *const DstEdge = EdgeIt.second;
223     Chain *const TargetChain = DstChain == Other ? this : DstChain;
224 
225     // Find the corresponding edge in the current chain
226     Edge *CurEdge = getEdge(TargetChain);
227     if (CurEdge == nullptr) {
228       DstEdge->changeEndpoint(Other, this);
229       this->addEdge(TargetChain, DstEdge);
230       if (DstChain != this && DstChain != Other)
231         DstChain->addEdge(this, DstEdge);
232     } else {
233       CurEdge->moveArcs(DstEdge);
234     }
235     // Cleanup leftover edge
236     if (DstChain != Other)
237       DstChain->removeEdge(Other);
238   }
239 }
240 
241 class HFSortPlus {
242 public:
243   explicit HFSortPlus(const CallGraph &Cg) : Cg(Cg) { initialize(); }
244 
245   /// Run the algorithm and return ordered set of function clusters.
246   std::vector<Cluster> run() {
247     // Pass 1
248     runPassOne();
249 
250     // Pass 2
251     runPassTwo();
252 
253     outs() << "BOLT-INFO: hfsort+ reduced the number of chains from "
254            << Cg.numNodes() << " to " << HotChains.size() << "\n";
255 
256     // Sorting chains by density in decreasing order
257     auto DensityComparator = [](const Chain *L, const Chain *R) {
258       if (L->density() != R->density())
259         return L->density() > R->density();
260       // Making sure the comparison is deterministic
261       return L->Id < R->Id;
262     };
263     std::stable_sort(HotChains.begin(), HotChains.end(), DensityComparator);
264 
265     // Return the set of clusters that are left, which are the ones that
266     // didn't get merged (so their first func is its original func)
267     std::vector<Cluster> Clusters;
268     Clusters.reserve(HotChains.size());
269     for (Chain *Chain : HotChains)
270       Clusters.emplace_back(Cluster(Chain->Nodes, Cg));
271     return Clusters;
272   }
273 
274 private:
275   /// Initialize the set of active chains, function id to chain mapping,
276   /// total number of samples and function addresses.
277   void initialize() {
278     OutWeight.resize(Cg.numNodes(), 0);
279     InWeight.resize(Cg.numNodes(), 0);
280     AllChains.reserve(Cg.numNodes());
281     HotChains.reserve(Cg.numNodes());
282     NodeChain.resize(Cg.numNodes(), nullptr);
283     Addr.resize(Cg.numNodes(), 0);
284 
285     // Initialize chains
286     for (NodeId F = 0; F < Cg.numNodes(); ++F) {
287       AllChains.emplace_back(F, F, Cg.samples(F), Cg.size(F));
288       HotChains.push_back(&AllChains.back());
289       NodeChain[F] = &AllChains.back();
290       TotalSamples += Cg.samples(F);
291       for (NodeId Succ : Cg.successors(F)) {
292         if (F == Succ)
293           continue;
294         const Arc &Arc = *Cg.findArc(F, Succ);
295         OutWeight[F] += Arc.weight();
296         InWeight[Succ] += Arc.weight();
297       }
298     }
299 
300     AllEdges.reserve(Cg.numArcs());
301     for (NodeId F = 0; F < Cg.numNodes(); ++F) {
302       for (NodeId Succ : Cg.successors(F)) {
303         if (F == Succ)
304           continue;
305         const Arc &Arc = *Cg.findArc(F, Succ);
306         if (Arc.weight() == 0.0 ||
307             Arc.weight() / TotalSamples < opts::ArcThreshold) {
308           continue;
309         }
310 
311         Edge *CurEdge = NodeChain[F]->getEdge(NodeChain[Succ]);
312         if (CurEdge != nullptr) {
313           // This edge is already present in the graph
314           assert(NodeChain[Succ]->getEdge(NodeChain[F]) != nullptr);
315           CurEdge->Arcs.push_back(&Arc);
316         } else {
317           // This is a new edge
318           AllEdges.emplace_back(NodeChain[F], NodeChain[Succ], &Arc);
319           NodeChain[F]->addEdge(NodeChain[Succ], &AllEdges.back());
320           NodeChain[Succ]->addEdge(NodeChain[F], &AllEdges.back());
321         }
322       }
323     }
324 
325     for (Chain *&Chain : HotChains) {
326       Chain->ShortCalls = shortCalls(Chain);
327       Chain->Score = score(Chain);
328     }
329   }
330 
331   /// The probability that a page with a given density is not in the cache.
332   ///
333   /// Assume that the hot functions are called in a random order; then the
334   /// probability of an i-TLB page being accessed after a function call is
335   /// p = pageSamples / TotalSamples. The probability that the page is not
336   /// accessed is (1 - p), and the probability that it is not in the cache
337   /// (i.e. not accessed during the last kCacheEntries function calls)
338   /// is (1 - p)^kCacheEntries
339   double missProbability(double ChainDensity) const {
340     double PageSamples = ChainDensity * opts::ITLBDensity;
341 
342     if (PageSamples >= TotalSamples)
343       return 0;
344 
345     double P = PageSamples / TotalSamples;
346     return pow(1.0 - P, double(opts::ITLBEntries));
347   }
348 
349   /// The expected number of calls on different i-TLB pages for an arc of the
350   /// call graph with a specified weight
351   double expectedCalls(uint64_t SrcAddr, uint64_t DstAddr,
352                        double Weight) const {
353     uint64_t Dist = SrcAddr >= DstAddr ? SrcAddr - DstAddr : DstAddr - SrcAddr;
354     if (Dist >= opts::ITLBPageSize)
355       return 0;
356 
357     double D = double(Dist) / double(opts::ITLBPageSize);
358     // Increasing the importance of shorter calls
359     return (1.0 - D * D) * Weight;
360   }
361 
362   /// The expected number of calls within a given chain with both endpoints on
363   /// the same cache page
364   double shortCalls(Chain *Chain) const {
365     Edge *Edge = Chain->getEdge(Chain);
366     if (Edge == nullptr)
367       return 0;
368 
369     double Calls = 0;
370     for (const Arc *Arc : Edge->Arcs) {
371       uint64_t SrcAddr = Addr[Arc->src()] + uint64_t(Arc->avgCallOffset());
372       uint64_t DstAddr = Addr[Arc->dst()];
373       Calls += expectedCalls(SrcAddr, DstAddr, Arc->weight());
374     }
375     return Calls;
376   }
377 
378   /// The number of calls between the two chains with both endpoints on
379   /// the same i-TLB page, assuming that a given pair of chains gets merged
380   double shortCalls(Chain *ChainPred, Chain *ChainSucc, Edge *Edge) const {
381     double Calls = 0;
382     for (const Arc *Arc : Edge->Arcs) {
383       Chain *SrcChain = NodeChain[Arc->src()];
384       uint64_t SrcAddr;
385       uint64_t DstAddr;
386       if (SrcChain == ChainPred) {
387         SrcAddr = Addr[Arc->src()] + uint64_t(Arc->avgCallOffset());
388         DstAddr = Addr[Arc->dst()] + ChainPred->Size;
389       } else {
390         SrcAddr =
391             Addr[Arc->src()] + uint64_t(Arc->avgCallOffset()) + ChainPred->Size;
392         DstAddr = Addr[Arc->dst()];
393       }
394       Calls += expectedCalls(SrcAddr, DstAddr, Arc->weight());
395     }
396 
397     Calls += ChainPred->ShortCalls;
398     Calls += ChainSucc->ShortCalls;
399 
400     return Calls;
401   }
402 
403   double score(Chain *Chain) const {
404     double LongCalls = Chain->Samples - Chain->ShortCalls;
405     return LongCalls * missProbability(Chain->density());
406   }
407 
408   /// The gain of merging two chains.
409   ///
410   /// We assume that the final chains are sorted by their density, and hence
411   /// every chain is likely to be adjacent with chains of the same density.
412   /// Thus, the 'hotness' of every chain can be estimated by density*pageSize,
413   /// which is used to compute the probability of cache misses for long calls
414   /// of a given chain.
415   /// The result is also scaled by the size of the resulting chain in order to
416   /// increase the chance of merging short chains, which is helpful for
417   /// the i-cache performance.
418   double mergeGain(Chain *ChainPred, Chain *ChainSucc, Edge *Edge) const {
419     // Cache misses on the chains before merging
420     double CurScore = ChainPred->Score + ChainSucc->Score;
421 
422     // Cache misses on the merged chain
423     double LongCalls = ChainPred->Samples + ChainSucc->Samples -
424                        shortCalls(ChainPred, ChainSucc, Edge);
425     const double MergedSamples = ChainPred->Samples + ChainSucc->Samples;
426     const double MergedSize = ChainPred->Size + ChainSucc->Size;
427     double NewScore = LongCalls * missProbability(MergedSamples / MergedSize);
428 
429     double Gain = CurScore - NewScore;
430     // Scale the result to increase the importance of merging short chains
431     Gain /= std::min(ChainPred->Size, ChainSucc->Size);
432 
433     return Gain;
434   }
435 
436   /// Run the first optimization pass of the algorithm:
437   /// Merge chains that call each other with a high probability.
438   void runPassOne() {
439     // Find candidate pairs of chains for merging
440     std::vector<const Arc *> ArcsToMerge;
441     for (Chain *ChainPred : HotChains) {
442       NodeId F = ChainPred->Nodes.back();
443       for (NodeId Succ : Cg.successors(F)) {
444         if (F == Succ)
445           continue;
446 
447         const Arc &Arc = *Cg.findArc(F, Succ);
448         if (Arc.weight() == 0.0 ||
449             Arc.weight() / TotalSamples < opts::ArcThreshold)
450           continue;
451 
452         const double CallsFromPred = OutWeight[F];
453         const double CallsToSucc = InWeight[Succ];
454         const double CallsPredSucc = Arc.weight();
455 
456         // Probability that the first chain is calling the second one
457         const double ProbOut =
458             CallsFromPred > 0 ? CallsPredSucc / CallsFromPred : 0;
459         assert(0.0 <= ProbOut && ProbOut <= 1.0 && "incorrect out-probability");
460 
461         // Probability that the second chain is called From the first one
462         const double ProbIn = CallsToSucc > 0 ? CallsPredSucc / CallsToSucc : 0;
463         assert(0.0 <= ProbIn && ProbIn <= 1.0 && "incorrect in-probability");
464 
465         if (std::min(ProbOut, ProbIn) >= opts::MergeProbability)
466           ArcsToMerge.push_back(&Arc);
467       }
468     }
469 
470     // Sort the pairs by the weight in reverse order
471     std::sort(
472         ArcsToMerge.begin(), ArcsToMerge.end(),
473         [](const Arc *L, const Arc *R) { return L->weight() > R->weight(); });
474 
475     // Merge the pairs of chains
476     for (const Arc *Arc : ArcsToMerge) {
477       Chain *ChainPred = NodeChain[Arc->src()];
478       Chain *ChainSucc = NodeChain[Arc->dst()];
479       if (ChainPred == ChainSucc)
480         continue;
481       if (ChainPred->Nodes.back() == Arc->src() &&
482           ChainSucc->Nodes.front() == Arc->dst())
483         mergeChains(ChainPred, ChainSucc);
484     }
485   }
486 
487   /// Run the second optimization pass of the hfsort+ algorithm:
488   /// Merge pairs of chains while there is an improvement in the
489   /// expected cache miss ratio.
490   void runPassTwo() {
491     // Creating a priority queue containing all edges ordered by the merge gain
492     auto GainComparator = [](Edge *L, Edge *R) {
493       if (std::abs(L->gain() - R->gain()) > 1e-8)
494         return L->gain() > R->gain();
495 
496       // Making sure the comparison is deterministic
497       if (L->predChain()->Id != R->predChain()->Id)
498         return L->predChain()->Id < R->predChain()->Id;
499 
500       return L->succChain()->Id < R->succChain()->Id;
501     };
502     std::set<Edge *, decltype(GainComparator)> Queue(GainComparator);
503 
504     // Inserting the edges Into the queue
505     for (Chain *ChainPred : HotChains) {
506       for (auto EdgeIt : ChainPred->Edges) {
507         Chain *ChainSucc = EdgeIt.first;
508         Edge *ChainEdge = EdgeIt.second;
509         // Ignore loop edges
510         if (ChainPred == ChainSucc)
511           continue;
512         // Ignore already processed edges
513         if (ChainEdge->gain() != -1.0)
514           continue;
515 
516         // Compute the gain of merging the two chains
517         auto ForwardGain = mergeGain(ChainPred, ChainSucc, ChainEdge);
518         auto BackwardGain = mergeGain(ChainSucc, ChainPred, ChainEdge);
519         ChainEdge->setMergeGain(ChainPred, ForwardGain, BackwardGain);
520         if (ChainEdge->gain() > 0.0)
521           Queue.insert(ChainEdge);
522       }
523     }
524 
525     // Merge the chains while the gain of merging is positive
526     while (!Queue.empty()) {
527       // Extract the best (top) edge for merging
528       Edge *It = *Queue.begin();
529       Queue.erase(Queue.begin());
530       Edge *BestEdge = It;
531       Chain *BestChainPred = BestEdge->predChain();
532       Chain *BestChainSucc = BestEdge->succChain();
533       if (BestChainPred == BestChainSucc || BestEdge->gain() <= 0.0)
534         continue;
535 
536       // Remove outdated edges
537       for (std::pair<Chain *, Edge *> EdgeIt : BestChainPred->Edges)
538         Queue.erase(EdgeIt.second);
539       for (std::pair<Chain *, Edge *> EdgeIt : BestChainSucc->Edges)
540         Queue.erase(EdgeIt.second);
541 
542       // Merge the best pair of chains
543       mergeChains(BestChainPred, BestChainSucc);
544 
545       // Insert newly created edges Into the queue
546       for (auto EdgeIt : BestChainPred->Edges) {
547         Chain *ChainSucc = EdgeIt.first;
548         Edge *ChainEdge = EdgeIt.second;
549         // Ignore loop edges
550         if (BestChainPred == ChainSucc)
551           continue;
552 
553         // Compute the gain of merging the two chains
554         auto ForwardGain = mergeGain(BestChainPred, ChainSucc, ChainEdge);
555         auto BackwardGain = mergeGain(ChainSucc, BestChainPred, ChainEdge);
556         ChainEdge->setMergeGain(BestChainPred, ForwardGain, BackwardGain);
557         if (ChainEdge->gain() > 0.0)
558           Queue.insert(ChainEdge);
559       }
560     }
561   }
562 
563   /// Merge chain From into chain Into and update the list of active chains.
564   void mergeChains(Chain *Into, Chain *From) {
565     assert(Into != From && "cannot merge a chain with itself");
566     Into->merge(From);
567 
568     // Update the chains and addresses for functions merged from From
569     size_t CurAddr = 0;
570     for (NodeId F : Into->Nodes) {
571       NodeChain[F] = Into;
572       Addr[F] = CurAddr;
573       CurAddr += Cg.size(F);
574     }
575 
576     // Merge edges
577     Into->mergeEdges(From);
578     From->clear();
579 
580     // Update cached scores for the new chain
581     Into->ShortCalls = shortCalls(Into);
582     Into->Score = score(Into);
583 
584     // Remove chain From From the list of active chains
585     auto it = std::remove(HotChains.begin(), HotChains.end(), From);
586     HotChains.erase(it, HotChains.end());
587   }
588 
589 private:
590   // The call graph
591   const CallGraph &Cg;
592 
593   // All chains of functions
594   std::vector<Chain> AllChains;
595 
596   // Active chains. The vector gets updated at runtime when chains are merged
597   std::vector<Chain *> HotChains;
598 
599   // All edges between chains
600   std::vector<Edge> AllEdges;
601 
602   // Node_id => chain
603   std::vector<Chain *> NodeChain;
604 
605   // Current address of the function From the beginning of its chain
606   std::vector<uint64_t> Addr;
607 
608   // Total weight of outgoing arcs for each function
609   std::vector<double> OutWeight;
610 
611   // Total weight of incoming arcs for each function
612   std::vector<double> InWeight;
613   // The total number of samples in the graph
614   double TotalSamples{0};
615 };
616 
617 } // end anonymous namespace
618 
619 std::vector<Cluster> hfsortPlus(CallGraph &Cg) {
620   // It is required that the sum of incoming arc weights is not greater
621   // than the number of samples for every function.
622   // Ensuring the call graph obeys the property before running the algorithm.
623   Cg.adjustArcWeights();
624   return HFSortPlus(Cg).run();
625 }
626 
627 } // namespace bolt
628 } // namespace llvm
629