1b842725cSMichael J. Spencer //===- CallGraphSort.cpp --------------------------------------------------===//
2b842725cSMichael J. Spencer //
3b842725cSMichael J. Spencer //                             The LLVM Linker
4b842725cSMichael J. Spencer //
5b842725cSMichael J. Spencer // This file is distributed under the University of Illinois Open Source
6b842725cSMichael J. Spencer // License. See LICENSE.TXT for details.
7b842725cSMichael J. Spencer //
8b842725cSMichael J. Spencer //===----------------------------------------------------------------------===//
9b842725cSMichael J. Spencer ///
10b842725cSMichael J. Spencer /// Implementation of Call-Chain Clustering from: Optimizing Function Placement
11b842725cSMichael J. Spencer /// for Large-Scale Data-Center Applications
12b842725cSMichael J. Spencer /// https://research.fb.com/wp-content/uploads/2017/01/cgo2017-hfsort-final1.pdf
13b842725cSMichael J. Spencer ///
14b842725cSMichael J. Spencer /// The goal of this algorithm is to improve runtime performance of the final
15b842725cSMichael J. Spencer /// executable by arranging code sections such that page table and i-cache
16b842725cSMichael J. Spencer /// misses are minimized.
17b842725cSMichael J. Spencer ///
18b842725cSMichael J. Spencer /// Definitions:
19b842725cSMichael J. Spencer /// * Cluster
20b842725cSMichael J. Spencer ///   * An ordered list of input sections which are layed out as a unit. At the
21b842725cSMichael J. Spencer ///     beginning of the algorithm each input section has its own cluster and
22b842725cSMichael J. Spencer ///     the weight of the cluster is the sum of the weight of all incomming
23b842725cSMichael J. Spencer ///     edges.
24b842725cSMichael J. Spencer /// * Call-Chain Clustering (C³) Heuristic
25b842725cSMichael J. Spencer ///   * Defines when and how clusters are combined. Pick the highest weighted
26b842725cSMichael J. Spencer ///     input section then add it to its most likely predecessor if it wouldn't
27b842725cSMichael J. Spencer ///     penalize it too much.
28b842725cSMichael J. Spencer /// * Density
29b842725cSMichael J. Spencer ///   * The weight of the cluster divided by the size of the cluster. This is a
30b842725cSMichael J. Spencer ///     proxy for the ammount of execution time spent per byte of the cluster.
31b842725cSMichael J. Spencer ///
32b842725cSMichael J. Spencer /// It does so given a call graph profile by the following:
33b842725cSMichael J. Spencer /// * Build a weighted call graph from the call graph profile
34b842725cSMichael J. Spencer /// * Sort input sections by weight
35b842725cSMichael J. Spencer /// * For each input section starting with the highest weight
36b842725cSMichael J. Spencer ///   * Find its most likely predecessor cluster
37b842725cSMichael J. Spencer ///   * Check if the combined cluster would be too large, or would have too low
38b842725cSMichael J. Spencer ///     a density.
39b842725cSMichael J. Spencer ///   * If not, then combine the clusters.
40b842725cSMichael J. Spencer /// * Sort non-empty clusters by density
41b842725cSMichael J. Spencer ///
42b842725cSMichael J. Spencer //===----------------------------------------------------------------------===//
43b842725cSMichael J. Spencer 
44b842725cSMichael J. Spencer #include "CallGraphSort.h"
45b842725cSMichael J. Spencer #include "OutputSections.h"
46b842725cSMichael J. Spencer #include "SymbolTable.h"
47b842725cSMichael J. Spencer #include "Symbols.h"
48b842725cSMichael J. Spencer 
49b842725cSMichael J. Spencer using namespace llvm;
50b842725cSMichael J. Spencer using namespace lld;
51b842725cSMichael J. Spencer using namespace lld::elf;
52b842725cSMichael J. Spencer 
53b842725cSMichael J. Spencer namespace {
54b842725cSMichael J. Spencer struct Edge {
55b842725cSMichael J. Spencer   int From;
56b842725cSMichael J. Spencer   uint64_t Weight;
57b842725cSMichael J. Spencer };
58b842725cSMichael J. Spencer 
59b842725cSMichael J. Spencer struct Cluster {
60b842725cSMichael J. Spencer   Cluster(int Sec, size_t S) {
61b842725cSMichael J. Spencer     Sections.push_back(Sec);
62b842725cSMichael J. Spencer     Size = S;
63b842725cSMichael J. Spencer   }
64b842725cSMichael J. Spencer 
65b842725cSMichael J. Spencer   double getDensity() const {
66b842725cSMichael J. Spencer     if (Size == 0)
67b842725cSMichael J. Spencer       return 0;
68b842725cSMichael J. Spencer     return double(Weight) / double(Size);
69b842725cSMichael J. Spencer   }
70b842725cSMichael J. Spencer 
71b842725cSMichael J. Spencer   std::vector<int> Sections;
72b842725cSMichael J. Spencer   size_t Size = 0;
73b842725cSMichael J. Spencer   uint64_t Weight = 0;
74b842725cSMichael J. Spencer   uint64_t InitialWeight = 0;
75*f0eedbceSGeorge Rimar   Edge BestPred = {-1, 0};
76b842725cSMichael J. Spencer };
77b842725cSMichael J. Spencer 
78b842725cSMichael J. Spencer class CallGraphSort {
79b842725cSMichael J. Spencer public:
80b842725cSMichael J. Spencer   CallGraphSort();
81b842725cSMichael J. Spencer 
82b842725cSMichael J. Spencer   DenseMap<const InputSectionBase *, int> run();
83b842725cSMichael J. Spencer 
84b842725cSMichael J. Spencer private:
85b842725cSMichael J. Spencer   std::vector<Cluster> Clusters;
86b842725cSMichael J. Spencer   std::vector<const InputSectionBase *> Sections;
87b842725cSMichael J. Spencer 
88b842725cSMichael J. Spencer   void groupClusters();
89b842725cSMichael J. Spencer };
90b842725cSMichael J. Spencer 
91b842725cSMichael J. Spencer // Maximum ammount the combined cluster density can be worse than the original
92b842725cSMichael J. Spencer // cluster to consider merging.
93b842725cSMichael J. Spencer constexpr int MAX_DENSITY_DEGRADATION = 8;
94b842725cSMichael J. Spencer 
95b842725cSMichael J. Spencer // Maximum cluster size in bytes.
96b842725cSMichael J. Spencer constexpr uint64_t MAX_CLUSTER_SIZE = 1024 * 1024;
97b842725cSMichael J. Spencer } // end anonymous namespace
98b842725cSMichael J. Spencer 
99b842725cSMichael J. Spencer // Take the edge list in Config->CallGraphProfile, resolve symbol names to
100b842725cSMichael J. Spencer // Symbols, and generate a graph between InputSections with the provided
101b842725cSMichael J. Spencer // weights.
102b842725cSMichael J. Spencer CallGraphSort::CallGraphSort() {
103b842725cSMichael J. Spencer   llvm::MapVector<std::pair<const InputSectionBase *, const InputSectionBase *>,
104b842725cSMichael J. Spencer                   uint64_t> &Profile = Config->CallGraphProfile;
105b842725cSMichael J. Spencer   DenseMap<const InputSectionBase *, int> SecToCluster;
106b842725cSMichael J. Spencer 
107b842725cSMichael J. Spencer   auto GetOrCreateNode = [&](const InputSectionBase *IS) -> int {
108b842725cSMichael J. Spencer     auto Res = SecToCluster.insert(std::make_pair(IS, Clusters.size()));
109b842725cSMichael J. Spencer     if (Res.second) {
110b842725cSMichael J. Spencer       Sections.push_back(IS);
111b842725cSMichael J. Spencer       Clusters.emplace_back(Clusters.size(), IS->getSize());
112b842725cSMichael J. Spencer     }
113b842725cSMichael J. Spencer     return Res.first->second;
114b842725cSMichael J. Spencer   };
115b842725cSMichael J. Spencer 
116b842725cSMichael J. Spencer   // Create the graph.
117b842725cSMichael J. Spencer   for (const auto &C : Profile) {
118b842725cSMichael J. Spencer     const auto *FromSB = cast<InputSectionBase>(C.first.first->Repl);
119b842725cSMichael J. Spencer     const auto *ToSB = cast<InputSectionBase>(C.first.second->Repl);
120b842725cSMichael J. Spencer     uint64_t Weight = C.second;
121b842725cSMichael J. Spencer 
122b842725cSMichael J. Spencer     // Ignore edges between input sections belonging to different output
123b842725cSMichael J. Spencer     // sections.  This is done because otherwise we would end up with clusters
124b842725cSMichael J. Spencer     // containing input sections that can't actually be placed adjacently in the
125b842725cSMichael J. Spencer     // output.  This messes with the cluster size and density calculations.  We
126b842725cSMichael J. Spencer     // would also end up moving input sections in other output sections without
127b842725cSMichael J. Spencer     // moving them closer to what calls them.
128b842725cSMichael J. Spencer     if (FromSB->getOutputSection() != ToSB->getOutputSection())
129b842725cSMichael J. Spencer       continue;
130b842725cSMichael J. Spencer 
131b842725cSMichael J. Spencer     int From = GetOrCreateNode(FromSB);
132b842725cSMichael J. Spencer     int To = GetOrCreateNode(ToSB);
133b842725cSMichael J. Spencer 
134b842725cSMichael J. Spencer     Clusters[To].Weight += Weight;
135b842725cSMichael J. Spencer 
136b842725cSMichael J. Spencer     if (From == To)
137b842725cSMichael J. Spencer       continue;
138b842725cSMichael J. Spencer 
139*f0eedbceSGeorge Rimar     // Remember the best edge.
140*f0eedbceSGeorge Rimar     Cluster &ToC = Clusters[To];
141*f0eedbceSGeorge Rimar     if (ToC.BestPred.From == -1 || ToC.BestPred.Weight < Weight) {
142*f0eedbceSGeorge Rimar       ToC.BestPred.From = From;
143*f0eedbceSGeorge Rimar       ToC.BestPred.Weight = Weight;
144*f0eedbceSGeorge Rimar     }
145b842725cSMichael J. Spencer   }
146b842725cSMichael J. Spencer   for (Cluster &C : Clusters)
147b842725cSMichael J. Spencer     C.InitialWeight = C.Weight;
148b842725cSMichael J. Spencer }
149b842725cSMichael J. Spencer 
150b842725cSMichael J. Spencer // It's bad to merge clusters which would degrade the density too much.
151b842725cSMichael J. Spencer static bool isNewDensityBad(Cluster &A, Cluster &B) {
152b842725cSMichael J. Spencer   double NewDensity = double(A.Weight + B.Weight) / double(A.Size + B.Size);
153b842725cSMichael J. Spencer   if (NewDensity < A.getDensity() / MAX_DENSITY_DEGRADATION)
154b842725cSMichael J. Spencer     return true;
155b842725cSMichael J. Spencer   return false;
156b842725cSMichael J. Spencer }
157b842725cSMichael J. Spencer 
158b842725cSMichael J. Spencer static void mergeClusters(Cluster &Into, Cluster &From) {
159b842725cSMichael J. Spencer   Into.Sections.insert(Into.Sections.end(), From.Sections.begin(),
160b842725cSMichael J. Spencer                        From.Sections.end());
161b842725cSMichael J. Spencer   Into.Size += From.Size;
162b842725cSMichael J. Spencer   Into.Weight += From.Weight;
163b842725cSMichael J. Spencer   From.Sections.clear();
164b842725cSMichael J. Spencer   From.Size = 0;
165b842725cSMichael J. Spencer   From.Weight = 0;
166b842725cSMichael J. Spencer }
167b842725cSMichael J. Spencer 
168b842725cSMichael J. Spencer // Group InputSections into clusters using the Call-Chain Clustering heuristic
169b842725cSMichael J. Spencer // then sort the clusters by density.
170b842725cSMichael J. Spencer void CallGraphSort::groupClusters() {
171b842725cSMichael J. Spencer   std::vector<int> SortedSecs(Clusters.size());
172b842725cSMichael J. Spencer   std::vector<Cluster *> SecToCluster(Clusters.size());
173b842725cSMichael J. Spencer 
174a5cf8da1SGeorge Rimar   for (size_t I = 0; I < Clusters.size(); ++I) {
175a5cf8da1SGeorge Rimar     SortedSecs[I] = I;
176a5cf8da1SGeorge Rimar     SecToCluster[I] = &Clusters[I];
177b842725cSMichael J. Spencer   }
178b842725cSMichael J. Spencer 
179b842725cSMichael J. Spencer   std::stable_sort(SortedSecs.begin(), SortedSecs.end(), [&](int A, int B) {
180b842725cSMichael J. Spencer     return Clusters[B].getDensity() < Clusters[A].getDensity();
181b842725cSMichael J. Spencer   });
182b842725cSMichael J. Spencer 
183b842725cSMichael J. Spencer   for (int SI : SortedSecs) {
184b842725cSMichael J. Spencer     // Clusters[SI] is the same as SecToClusters[SI] here because it has not
185b842725cSMichael J. Spencer     // been merged into another cluster yet.
186b842725cSMichael J. Spencer     Cluster &C = Clusters[SI];
187b842725cSMichael J. Spencer 
188*f0eedbceSGeorge Rimar     // Don't consider merging if the edge is unlikely.
189*f0eedbceSGeorge Rimar     if (C.BestPred.From == -1 || C.BestPred.Weight * 10 <= C.InitialWeight)
190b842725cSMichael J. Spencer       continue;
191b842725cSMichael J. Spencer 
192*f0eedbceSGeorge Rimar     Cluster *PredC = SecToCluster[C.BestPred.From];
193b842725cSMichael J. Spencer     if (PredC == &C)
194b842725cSMichael J. Spencer       continue;
195b842725cSMichael J. Spencer 
196b842725cSMichael J. Spencer     if (C.Size + PredC->Size > MAX_CLUSTER_SIZE)
197b842725cSMichael J. Spencer       continue;
198b842725cSMichael J. Spencer 
199b842725cSMichael J. Spencer     if (isNewDensityBad(*PredC, C))
200b842725cSMichael J. Spencer       continue;
201b842725cSMichael J. Spencer 
202b842725cSMichael J. Spencer     // NOTE: Consider using a disjoint-set to track section -> cluster mapping
203b842725cSMichael J. Spencer     // if this is ever slow.
204b842725cSMichael J. Spencer     for (int SI : C.Sections)
205b842725cSMichael J. Spencer       SecToCluster[SI] = PredC;
206b842725cSMichael J. Spencer 
207b842725cSMichael J. Spencer     mergeClusters(*PredC, C);
208b842725cSMichael J. Spencer   }
209b842725cSMichael J. Spencer 
210b842725cSMichael J. Spencer   // Remove empty or dead nodes. Invalidates all cluster indices.
21197df22f1SGeorge Rimar   llvm::erase_if(Clusters, [](const Cluster &C) {
212b842725cSMichael J. Spencer     return C.Size == 0 || C.Sections.empty();
21397df22f1SGeorge Rimar   });
214b842725cSMichael J. Spencer 
215b842725cSMichael J. Spencer   // Sort by density.
216de83cbf3SGeorge Rimar   std::stable_sort(Clusters.begin(), Clusters.end(),
217b842725cSMichael J. Spencer                    [](const Cluster &A, const Cluster &B) {
218b842725cSMichael J. Spencer                      return A.getDensity() > B.getDensity();
219b842725cSMichael J. Spencer                    });
220b842725cSMichael J. Spencer }
221b842725cSMichael J. Spencer 
222b842725cSMichael J. Spencer DenseMap<const InputSectionBase *, int> CallGraphSort::run() {
223b842725cSMichael J. Spencer   groupClusters();
224b842725cSMichael J. Spencer 
225b842725cSMichael J. Spencer   // Generate order.
226b842725cSMichael J. Spencer   llvm::DenseMap<const InputSectionBase *, int> OrderMap;
227b842725cSMichael J. Spencer   ssize_t CurOrder = 1;
228b842725cSMichael J. Spencer 
229b842725cSMichael J. Spencer   for (const Cluster &C : Clusters)
230b842725cSMichael J. Spencer     for (int SecIndex : C.Sections)
231b842725cSMichael J. Spencer       OrderMap[Sections[SecIndex]] = CurOrder++;
232b842725cSMichael J. Spencer 
233b842725cSMichael J. Spencer   return OrderMap;
234b842725cSMichael J. Spencer }
235b842725cSMichael J. Spencer 
236b842725cSMichael J. Spencer // Sort sections by the profile data provided by -callgraph-profile-file
237b842725cSMichael J. Spencer //
238b842725cSMichael J. Spencer // This first builds a call graph based on the profile data then merges sections
239b842725cSMichael J. Spencer // according to the C³ huristic. All clusters are then sorted by a density
240b842725cSMichael J. Spencer // metric to further improve locality.
241b842725cSMichael J. Spencer DenseMap<const InputSectionBase *, int> elf::computeCallGraphProfileOrder() {
242b842725cSMichael J. Spencer   return CallGraphSort().run();
243b842725cSMichael J. Spencer }
244