1 //===- CallGraphSort.cpp --------------------------------------------------===//
2 //
3 //                             The LLVM Linker
4 //
5 // This file is distributed under the University of Illinois Open Source
6 // License. See LICENSE.TXT for details.
7 //
8 //===----------------------------------------------------------------------===//
9 ///
10 /// Implementation of Call-Chain Clustering from: Optimizing Function Placement
11 /// for Large-Scale Data-Center Applications
12 /// https://research.fb.com/wp-content/uploads/2017/01/cgo2017-hfsort-final1.pdf
13 ///
14 /// The goal of this algorithm is to improve runtime performance of the final
15 /// executable by arranging code sections such that page table and i-cache
16 /// misses are minimized.
17 ///
18 /// Definitions:
19 /// * Cluster
20 ///   * An ordered list of input sections which are layed out as a unit. At the
21 ///     beginning of the algorithm each input section has its own cluster and
22 ///     the weight of the cluster is the sum of the weight of all incomming
23 ///     edges.
24 /// * Call-Chain Clustering (C³) Heuristic
25 ///   * Defines when and how clusters are combined. Pick the highest weighted
26 ///     input section then add it to its most likely predecessor if it wouldn't
27 ///     penalize it too much.
28 /// * Density
29 ///   * The weight of the cluster divided by the size of the cluster. This is a
30 ///     proxy for the ammount of execution time spent per byte of the cluster.
31 ///
32 /// It does so given a call graph profile by the following:
33 /// * Build a weighted call graph from the call graph profile
34 /// * Sort input sections by weight
35 /// * For each input section starting with the highest weight
36 ///   * Find its most likely predecessor cluster
37 ///   * Check if the combined cluster would be too large, or would have too low
38 ///     a density.
39 ///   * If not, then combine the clusters.
40 /// * Sort non-empty clusters by density
41 ///
42 //===----------------------------------------------------------------------===//
43 
44 #include "CallGraphSort.h"
45 #include "OutputSections.h"
46 #include "SymbolTable.h"
47 #include "Symbols.h"
48 
49 using namespace llvm;
50 using namespace lld;
51 using namespace lld::elf;
52 
53 namespace {
54 struct Edge {
55   int From;
56   uint64_t Weight;
57 };
58 
59 struct Cluster {
60   Cluster(int Sec, size_t S) {
61     Sections.push_back(Sec);
62     Size = S;
63   }
64 
65   double getDensity() const {
66     if (Size == 0)
67       return 0;
68     return double(Weight) / double(Size);
69   }
70 
71   std::vector<int> Sections;
72   size_t Size = 0;
73   uint64_t Weight = 0;
74   uint64_t InitialWeight = 0;
75   Edge BestPred = {-1, 0};
76 };
77 
78 class CallGraphSort {
79 public:
80   CallGraphSort();
81 
82   DenseMap<const InputSectionBase *, int> run();
83 
84 private:
85   std::vector<Cluster> Clusters;
86   std::vector<const InputSectionBase *> Sections;
87 
88   void groupClusters();
89 };
90 
91 // Maximum ammount the combined cluster density can be worse than the original
92 // cluster to consider merging.
93 constexpr int MAX_DENSITY_DEGRADATION = 8;
94 
95 // Maximum cluster size in bytes.
96 constexpr uint64_t MAX_CLUSTER_SIZE = 1024 * 1024;
97 } // end anonymous namespace
98 
99 // Take the edge list in Config->CallGraphProfile, resolve symbol names to
100 // Symbols, and generate a graph between InputSections with the provided
101 // weights.
102 CallGraphSort::CallGraphSort() {
103   llvm::MapVector<std::pair<const InputSectionBase *, const InputSectionBase *>,
104                   uint64_t> &Profile = Config->CallGraphProfile;
105   DenseMap<const InputSectionBase *, int> SecToCluster;
106 
107   auto GetOrCreateNode = [&](const InputSectionBase *IS) -> int {
108     auto Res = SecToCluster.insert(std::make_pair(IS, Clusters.size()));
109     if (Res.second) {
110       Sections.push_back(IS);
111       Clusters.emplace_back(Clusters.size(), IS->getSize());
112     }
113     return Res.first->second;
114   };
115 
116   // Create the graph.
117   for (const auto &C : Profile) {
118     const auto *FromSB = cast<InputSectionBase>(C.first.first->Repl);
119     const auto *ToSB = cast<InputSectionBase>(C.first.second->Repl);
120     uint64_t Weight = C.second;
121 
122     // Ignore edges between input sections belonging to different output
123     // sections.  This is done because otherwise we would end up with clusters
124     // containing input sections that can't actually be placed adjacently in the
125     // output.  This messes with the cluster size and density calculations.  We
126     // would also end up moving input sections in other output sections without
127     // moving them closer to what calls them.
128     if (FromSB->getOutputSection() != ToSB->getOutputSection())
129       continue;
130 
131     int From = GetOrCreateNode(FromSB);
132     int To = GetOrCreateNode(ToSB);
133 
134     Clusters[To].Weight += Weight;
135 
136     if (From == To)
137       continue;
138 
139     // Remember the best edge.
140     Cluster &ToC = Clusters[To];
141     if (ToC.BestPred.From == -1 || ToC.BestPred.Weight < Weight) {
142       ToC.BestPred.From = From;
143       ToC.BestPred.Weight = Weight;
144     }
145   }
146   for (Cluster &C : Clusters)
147     C.InitialWeight = C.Weight;
148 }
149 
150 // It's bad to merge clusters which would degrade the density too much.
151 static bool isNewDensityBad(Cluster &A, Cluster &B) {
152   double NewDensity = double(A.Weight + B.Weight) / double(A.Size + B.Size);
153   if (NewDensity < A.getDensity() / MAX_DENSITY_DEGRADATION)
154     return true;
155   return false;
156 }
157 
158 static void mergeClusters(Cluster &Into, Cluster &From) {
159   Into.Sections.insert(Into.Sections.end(), From.Sections.begin(),
160                        From.Sections.end());
161   Into.Size += From.Size;
162   Into.Weight += From.Weight;
163   From.Sections.clear();
164   From.Size = 0;
165   From.Weight = 0;
166 }
167 
168 // Group InputSections into clusters using the Call-Chain Clustering heuristic
169 // then sort the clusters by density.
170 void CallGraphSort::groupClusters() {
171   std::vector<int> SortedSecs(Clusters.size());
172   std::vector<Cluster *> SecToCluster(Clusters.size());
173 
174   for (size_t I = 0; I < Clusters.size(); ++I) {
175     SortedSecs[I] = I;
176     SecToCluster[I] = &Clusters[I];
177   }
178 
179   std::stable_sort(SortedSecs.begin(), SortedSecs.end(), [&](int A, int B) {
180     return Clusters[B].getDensity() < Clusters[A].getDensity();
181   });
182 
183   for (int SI : SortedSecs) {
184     // Clusters[SI] is the same as SecToClusters[SI] here because it has not
185     // been merged into another cluster yet.
186     Cluster &C = Clusters[SI];
187 
188     // Don't consider merging if the edge is unlikely.
189     if (C.BestPred.From == -1 || C.BestPred.Weight * 10 <= C.InitialWeight)
190       continue;
191 
192     Cluster *PredC = SecToCluster[C.BestPred.From];
193     if (PredC == &C)
194       continue;
195 
196     if (C.Size + PredC->Size > MAX_CLUSTER_SIZE)
197       continue;
198 
199     if (isNewDensityBad(*PredC, C))
200       continue;
201 
202     // NOTE: Consider using a disjoint-set to track section -> cluster mapping
203     // if this is ever slow.
204     for (int SI : C.Sections)
205       SecToCluster[SI] = PredC;
206 
207     mergeClusters(*PredC, C);
208   }
209 
210   // Remove empty or dead nodes. Invalidates all cluster indices.
211   llvm::erase_if(Clusters, [](const Cluster &C) {
212     return C.Size == 0 || C.Sections.empty();
213   });
214 
215   // Sort by density.
216   std::stable_sort(Clusters.begin(), Clusters.end(),
217                    [](const Cluster &A, const Cluster &B) {
218                      return A.getDensity() > B.getDensity();
219                    });
220 }
221 
222 DenseMap<const InputSectionBase *, int> CallGraphSort::run() {
223   groupClusters();
224 
225   // Generate order.
226   llvm::DenseMap<const InputSectionBase *, int> OrderMap;
227   ssize_t CurOrder = 1;
228 
229   for (const Cluster &C : Clusters)
230     for (int SecIndex : C.Sections)
231       OrderMap[Sections[SecIndex]] = CurOrder++;
232 
233   return OrderMap;
234 }
235 
236 // Sort sections by the profile data provided by -callgraph-profile-file
237 //
238 // This first builds a call graph based on the profile data then merges sections
239 // according to the C³ huristic. All clusters are then sorted by a density
240 // metric to further improve locality.
241 DenseMap<const InputSectionBase *, int> elf::computeCallGraphProfileOrder() {
242   return CallGraphSort().run();
243 }
244