1b842725cSMichael J. Spencer //===- CallGraphSort.cpp --------------------------------------------------===//
2b842725cSMichael J. Spencer //
3b842725cSMichael J. Spencer //                             The LLVM Linker
4b842725cSMichael J. Spencer //
5b842725cSMichael J. Spencer // This file is distributed under the University of Illinois Open Source
6b842725cSMichael J. Spencer // License. See LICENSE.TXT for details.
7b842725cSMichael J. Spencer //
8b842725cSMichael J. Spencer //===----------------------------------------------------------------------===//
9b842725cSMichael J. Spencer ///
10b842725cSMichael J. Spencer /// Implementation of Call-Chain Clustering from: Optimizing Function Placement
11b842725cSMichael J. Spencer /// for Large-Scale Data-Center Applications
12b842725cSMichael J. Spencer /// https://research.fb.com/wp-content/uploads/2017/01/cgo2017-hfsort-final1.pdf
13b842725cSMichael J. Spencer ///
14b842725cSMichael J. Spencer /// The goal of this algorithm is to improve runtime performance of the final
15b842725cSMichael J. Spencer /// executable by arranging code sections such that page table and i-cache
16b842725cSMichael J. Spencer /// misses are minimized.
17b842725cSMichael J. Spencer ///
18b842725cSMichael J. Spencer /// Definitions:
19b842725cSMichael J. Spencer /// * Cluster
20b842725cSMichael J. Spencer ///   * An ordered list of input sections which are layed out as a unit. At the
21b842725cSMichael J. Spencer ///     beginning of the algorithm each input section has its own cluster and
22b842725cSMichael J. Spencer ///     the weight of the cluster is the sum of the weight of all incomming
23b842725cSMichael J. Spencer ///     edges.
24b842725cSMichael J. Spencer /// * Call-Chain Clustering (C³) Heuristic
25b842725cSMichael J. Spencer ///   * Defines when and how clusters are combined. Pick the highest weighted
26b842725cSMichael J. Spencer ///     input section then add it to its most likely predecessor if it wouldn't
27b842725cSMichael J. Spencer ///     penalize it too much.
28b842725cSMichael J. Spencer /// * Density
29b842725cSMichael J. Spencer ///   * The weight of the cluster divided by the size of the cluster. This is a
30b842725cSMichael J. Spencer ///     proxy for the ammount of execution time spent per byte of the cluster.
31b842725cSMichael J. Spencer ///
32b842725cSMichael J. Spencer /// It does so given a call graph profile by the following:
33b842725cSMichael J. Spencer /// * Build a weighted call graph from the call graph profile
34b842725cSMichael J. Spencer /// * Sort input sections by weight
35b842725cSMichael J. Spencer /// * For each input section starting with the highest weight
36b842725cSMichael J. Spencer ///   * Find its most likely predecessor cluster
37b842725cSMichael J. Spencer ///   * Check if the combined cluster would be too large, or would have too low
38b842725cSMichael J. Spencer ///     a density.
39b842725cSMichael J. Spencer ///   * If not, then combine the clusters.
40b842725cSMichael J. Spencer /// * Sort non-empty clusters by density
41b842725cSMichael J. Spencer ///
42b842725cSMichael J. Spencer //===----------------------------------------------------------------------===//
43b842725cSMichael J. Spencer 
44b842725cSMichael J. Spencer #include "CallGraphSort.h"
45b842725cSMichael J. Spencer #include "OutputSections.h"
46b842725cSMichael J. Spencer #include "SymbolTable.h"
47b842725cSMichael J. Spencer #include "Symbols.h"
48b842725cSMichael J. Spencer 
49b842725cSMichael J. Spencer using namespace llvm;
50b842725cSMichael J. Spencer using namespace lld;
51b842725cSMichael J. Spencer using namespace lld::elf;
52b842725cSMichael J. Spencer 
53b842725cSMichael J. Spencer namespace {
54b842725cSMichael J. Spencer struct Edge {
55b842725cSMichael J. Spencer   int From;
56b842725cSMichael J. Spencer   uint64_t Weight;
57b842725cSMichael J. Spencer };
58b842725cSMichael J. Spencer 
59b842725cSMichael J. Spencer struct Cluster {
60b842725cSMichael J. Spencer   Cluster(int Sec, size_t S) {
61b842725cSMichael J. Spencer     Sections.push_back(Sec);
62b842725cSMichael J. Spencer     Size = S;
63b842725cSMichael J. Spencer   }
64b842725cSMichael J. Spencer 
65b842725cSMichael J. Spencer   double getDensity() const {
66b842725cSMichael J. Spencer     if (Size == 0)
67b842725cSMichael J. Spencer       return 0;
68b842725cSMichael J. Spencer     return double(Weight) / double(Size);
69b842725cSMichael J. Spencer   }
70b842725cSMichael J. Spencer 
71b842725cSMichael J. Spencer   std::vector<int> Sections;
72b842725cSMichael J. Spencer   size_t Size = 0;
73b842725cSMichael J. Spencer   uint64_t Weight = 0;
74b842725cSMichael J. Spencer   uint64_t InitialWeight = 0;
75b842725cSMichael J. Spencer   std::vector<Edge> Preds;
76b842725cSMichael J. Spencer };
77b842725cSMichael J. Spencer 
78b842725cSMichael J. Spencer class CallGraphSort {
79b842725cSMichael J. Spencer public:
80b842725cSMichael J. Spencer   CallGraphSort();
81b842725cSMichael J. Spencer 
82b842725cSMichael J. Spencer   DenseMap<const InputSectionBase *, int> run();
83b842725cSMichael J. Spencer 
84b842725cSMichael J. Spencer private:
85b842725cSMichael J. Spencer   std::vector<Cluster> Clusters;
86b842725cSMichael J. Spencer   std::vector<const InputSectionBase *> Sections;
87b842725cSMichael J. Spencer 
88b842725cSMichael J. Spencer   void groupClusters();
89b842725cSMichael J. Spencer };
90b842725cSMichael J. Spencer 
91b842725cSMichael J. Spencer // Maximum ammount the combined cluster density can be worse than the original
92b842725cSMichael J. Spencer // cluster to consider merging.
93b842725cSMichael J. Spencer constexpr int MAX_DENSITY_DEGRADATION = 8;
94b842725cSMichael J. Spencer 
95b842725cSMichael J. Spencer // Maximum cluster size in bytes.
96b842725cSMichael J. Spencer constexpr uint64_t MAX_CLUSTER_SIZE = 1024 * 1024;
97b842725cSMichael J. Spencer } // end anonymous namespace
98b842725cSMichael J. Spencer 
99b842725cSMichael J. Spencer // Take the edge list in Config->CallGraphProfile, resolve symbol names to
100b842725cSMichael J. Spencer // Symbols, and generate a graph between InputSections with the provided
101b842725cSMichael J. Spencer // weights.
102b842725cSMichael J. Spencer CallGraphSort::CallGraphSort() {
103b842725cSMichael J. Spencer   llvm::MapVector<std::pair<const InputSectionBase *, const InputSectionBase *>,
104b842725cSMichael J. Spencer                   uint64_t> &Profile = Config->CallGraphProfile;
105b842725cSMichael J. Spencer   DenseMap<const InputSectionBase *, int> SecToCluster;
106b842725cSMichael J. Spencer 
107b842725cSMichael J. Spencer   auto GetOrCreateNode = [&](const InputSectionBase *IS) -> int {
108b842725cSMichael J. Spencer     auto Res = SecToCluster.insert(std::make_pair(IS, Clusters.size()));
109b842725cSMichael J. Spencer     if (Res.second) {
110b842725cSMichael J. Spencer       Sections.push_back(IS);
111b842725cSMichael J. Spencer       Clusters.emplace_back(Clusters.size(), IS->getSize());
112b842725cSMichael J. Spencer     }
113b842725cSMichael J. Spencer     return Res.first->second;
114b842725cSMichael J. Spencer   };
115b842725cSMichael J. Spencer 
116b842725cSMichael J. Spencer   // Create the graph.
117b842725cSMichael J. Spencer   for (const auto &C : Profile) {
118b842725cSMichael J. Spencer     const auto *FromSB = cast<InputSectionBase>(C.first.first->Repl);
119b842725cSMichael J. Spencer     const auto *ToSB = cast<InputSectionBase>(C.first.second->Repl);
120b842725cSMichael J. Spencer     uint64_t Weight = C.second;
121b842725cSMichael J. Spencer 
122b842725cSMichael J. Spencer     // Ignore edges between input sections belonging to different output
123b842725cSMichael J. Spencer     // sections.  This is done because otherwise we would end up with clusters
124b842725cSMichael J. Spencer     // containing input sections that can't actually be placed adjacently in the
125b842725cSMichael J. Spencer     // output.  This messes with the cluster size and density calculations.  We
126b842725cSMichael J. Spencer     // would also end up moving input sections in other output sections without
127b842725cSMichael J. Spencer     // moving them closer to what calls them.
128b842725cSMichael J. Spencer     if (FromSB->getOutputSection() != ToSB->getOutputSection())
129b842725cSMichael J. Spencer       continue;
130b842725cSMichael J. Spencer 
131b842725cSMichael J. Spencer     int From = GetOrCreateNode(FromSB);
132b842725cSMichael J. Spencer     int To = GetOrCreateNode(ToSB);
133b842725cSMichael J. Spencer 
134b842725cSMichael J. Spencer     Clusters[To].Weight += Weight;
135b842725cSMichael J. Spencer 
136b842725cSMichael J. Spencer     if (From == To)
137b842725cSMichael J. Spencer       continue;
138b842725cSMichael J. Spencer 
139b842725cSMichael J. Spencer     // Add an edge
140b842725cSMichael J. Spencer     Clusters[To].Preds.push_back({From, Weight});
141b842725cSMichael J. Spencer   }
142b842725cSMichael J. Spencer   for (Cluster &C : Clusters)
143b842725cSMichael J. Spencer     C.InitialWeight = C.Weight;
144b842725cSMichael J. Spencer }
145b842725cSMichael J. Spencer 
146b842725cSMichael J. Spencer // It's bad to merge clusters which would degrade the density too much.
147b842725cSMichael J. Spencer static bool isNewDensityBad(Cluster &A, Cluster &B) {
148b842725cSMichael J. Spencer   double NewDensity = double(A.Weight + B.Weight) / double(A.Size + B.Size);
149b842725cSMichael J. Spencer   if (NewDensity < A.getDensity() / MAX_DENSITY_DEGRADATION)
150b842725cSMichael J. Spencer     return true;
151b842725cSMichael J. Spencer   return false;
152b842725cSMichael J. Spencer }
153b842725cSMichael J. Spencer 
154b842725cSMichael J. Spencer static void mergeClusters(Cluster &Into, Cluster &From) {
155b842725cSMichael J. Spencer   Into.Sections.insert(Into.Sections.end(), From.Sections.begin(),
156b842725cSMichael J. Spencer                        From.Sections.end());
157b842725cSMichael J. Spencer   Into.Size += From.Size;
158b842725cSMichael J. Spencer   Into.Weight += From.Weight;
159b842725cSMichael J. Spencer   From.Sections.clear();
160b842725cSMichael J. Spencer   From.Size = 0;
161b842725cSMichael J. Spencer   From.Weight = 0;
162b842725cSMichael J. Spencer }
163b842725cSMichael J. Spencer 
164b842725cSMichael J. Spencer // Group InputSections into clusters using the Call-Chain Clustering heuristic
165b842725cSMichael J. Spencer // then sort the clusters by density.
166b842725cSMichael J. Spencer void CallGraphSort::groupClusters() {
167b842725cSMichael J. Spencer   std::vector<int> SortedSecs(Clusters.size());
168b842725cSMichael J. Spencer   std::vector<Cluster *> SecToCluster(Clusters.size());
169b842725cSMichael J. Spencer 
170*a5cf8da1SGeorge Rimar   for (size_t I = 0; I < Clusters.size(); ++I) {
171*a5cf8da1SGeorge Rimar     SortedSecs[I] = I;
172*a5cf8da1SGeorge Rimar     SecToCluster[I] = &Clusters[I];
173b842725cSMichael J. Spencer   }
174b842725cSMichael J. Spencer 
175b842725cSMichael J. Spencer   std::stable_sort(SortedSecs.begin(), SortedSecs.end(), [&](int A, int B) {
176b842725cSMichael J. Spencer     return Clusters[B].getDensity() < Clusters[A].getDensity();
177b842725cSMichael J. Spencer   });
178b842725cSMichael J. Spencer 
179b842725cSMichael J. Spencer   for (int SI : SortedSecs) {
180b842725cSMichael J. Spencer     // Clusters[SI] is the same as SecToClusters[SI] here because it has not
181b842725cSMichael J. Spencer     // been merged into another cluster yet.
182b842725cSMichael J. Spencer     Cluster &C = Clusters[SI];
183b842725cSMichael J. Spencer 
184b842725cSMichael J. Spencer     int BestPred = -1;
185b842725cSMichael J. Spencer     uint64_t BestWeight = 0;
186b842725cSMichael J. Spencer 
187b842725cSMichael J. Spencer     for (Edge &E : C.Preds) {
188b842725cSMichael J. Spencer       if (BestPred == -1 || E.Weight > BestWeight) {
189b842725cSMichael J. Spencer         BestPred = E.From;
190b842725cSMichael J. Spencer         BestWeight = E.Weight;
191b842725cSMichael J. Spencer       }
192b842725cSMichael J. Spencer     }
193b842725cSMichael J. Spencer 
194b842725cSMichael J. Spencer     // don't consider merging if the edge is unlikely.
195b842725cSMichael J. Spencer     if (BestWeight * 10 <= C.InitialWeight)
196b842725cSMichael J. Spencer       continue;
197b842725cSMichael J. Spencer 
198b842725cSMichael J. Spencer     Cluster *PredC = SecToCluster[BestPred];
199b842725cSMichael J. Spencer     if (PredC == &C)
200b842725cSMichael J. Spencer       continue;
201b842725cSMichael J. Spencer 
202b842725cSMichael J. Spencer     if (C.Size + PredC->Size > MAX_CLUSTER_SIZE)
203b842725cSMichael J. Spencer       continue;
204b842725cSMichael J. Spencer 
205b842725cSMichael J. Spencer     if (isNewDensityBad(*PredC, C))
206b842725cSMichael J. Spencer       continue;
207b842725cSMichael J. Spencer 
208b842725cSMichael J. Spencer     // NOTE: Consider using a disjoint-set to track section -> cluster mapping
209b842725cSMichael J. Spencer     // if this is ever slow.
210b842725cSMichael J. Spencer     for (int SI : C.Sections)
211b842725cSMichael J. Spencer       SecToCluster[SI] = PredC;
212b842725cSMichael J. Spencer 
213b842725cSMichael J. Spencer     mergeClusters(*PredC, C);
214b842725cSMichael J. Spencer   }
215b842725cSMichael J. Spencer 
216b842725cSMichael J. Spencer   // Remove empty or dead nodes. Invalidates all cluster indices.
21797df22f1SGeorge Rimar   llvm::erase_if(Clusters, [](const Cluster &C) {
218b842725cSMichael J. Spencer     return C.Size == 0 || C.Sections.empty();
21997df22f1SGeorge Rimar   });
220b842725cSMichael J. Spencer 
221b842725cSMichael J. Spencer   // Sort by density.
222de83cbf3SGeorge Rimar   std::stable_sort(Clusters.begin(), Clusters.end(),
223b842725cSMichael J. Spencer                    [](const Cluster &A, const Cluster &B) {
224b842725cSMichael J. Spencer                      return A.getDensity() > B.getDensity();
225b842725cSMichael J. Spencer                    });
226b842725cSMichael J. Spencer }
227b842725cSMichael J. Spencer 
228b842725cSMichael J. Spencer DenseMap<const InputSectionBase *, int> CallGraphSort::run() {
229b842725cSMichael J. Spencer   groupClusters();
230b842725cSMichael J. Spencer 
231b842725cSMichael J. Spencer   // Generate order.
232b842725cSMichael J. Spencer   llvm::DenseMap<const InputSectionBase *, int> OrderMap;
233b842725cSMichael J. Spencer   ssize_t CurOrder = 1;
234b842725cSMichael J. Spencer 
235b842725cSMichael J. Spencer   for (const Cluster &C : Clusters)
236b842725cSMichael J. Spencer     for (int SecIndex : C.Sections)
237b842725cSMichael J. Spencer       OrderMap[Sections[SecIndex]] = CurOrder++;
238b842725cSMichael J. Spencer 
239b842725cSMichael J. Spencer   return OrderMap;
240b842725cSMichael J. Spencer }
241b842725cSMichael J. Spencer 
242b842725cSMichael J. Spencer // Sort sections by the profile data provided by -callgraph-profile-file
243b842725cSMichael J. Spencer //
244b842725cSMichael J. Spencer // This first builds a call graph based on the profile data then merges sections
245b842725cSMichael J. Spencer // according to the C³ huristic. All clusters are then sorted by a density
246b842725cSMichael J. Spencer // metric to further improve locality.
247b842725cSMichael J. Spencer DenseMap<const InputSectionBase *, int> elf::computeCallGraphProfileOrder() {
248b842725cSMichael J. Spencer   return CallGraphSort().run();
249b842725cSMichael J. Spencer }
250