1b842725cSMichael J. Spencer //===- CallGraphSort.cpp --------------------------------------------------===//
2b842725cSMichael J. Spencer //
32946cd70SChandler Carruth // Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
42946cd70SChandler Carruth // See https://llvm.org/LICENSE.txt for license information.
52946cd70SChandler Carruth // SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
6b842725cSMichael J. Spencer //
7b842725cSMichael J. Spencer //===----------------------------------------------------------------------===//
8b842725cSMichael J. Spencer ///
9b842725cSMichael J. Spencer /// Implementation of Call-Chain Clustering from: Optimizing Function Placement
10b842725cSMichael J. Spencer /// for Large-Scale Data-Center Applications
11b842725cSMichael J. Spencer /// https://research.fb.com/wp-content/uploads/2017/01/cgo2017-hfsort-final1.pdf
12b842725cSMichael J. Spencer ///
13b842725cSMichael J. Spencer /// The goal of this algorithm is to improve runtime performance of the final
14b842725cSMichael J. Spencer /// executable by arranging code sections such that page table and i-cache
15b842725cSMichael J. Spencer /// misses are minimized.
16b842725cSMichael J. Spencer ///
17b842725cSMichael J. Spencer /// Definitions:
18b842725cSMichael J. Spencer /// * Cluster
195976a3f5SNico Weber ///   * An ordered list of input sections which are laid out as a unit. At the
20b842725cSMichael J. Spencer ///     beginning of the algorithm each input section has its own cluster and
215976a3f5SNico Weber ///     the weight of the cluster is the sum of the weight of all incoming
22b842725cSMichael J. Spencer ///     edges.
23b842725cSMichael J. Spencer /// * Call-Chain Clustering (C³) Heuristic
24b842725cSMichael J. Spencer ///   * Defines when and how clusters are combined. Pick the highest weighted
25b842725cSMichael J. Spencer ///     input section then add it to its most likely predecessor if it wouldn't
26b842725cSMichael J. Spencer ///     penalize it too much.
27b842725cSMichael J. Spencer /// * Density
28b842725cSMichael J. Spencer ///   * The weight of the cluster divided by the size of the cluster. This is a
295976a3f5SNico Weber ///     proxy for the amount of execution time spent per byte of the cluster.
30b842725cSMichael J. Spencer ///
31b842725cSMichael J. Spencer /// It does so given a call graph profile by the following:
32b842725cSMichael J. Spencer /// * Build a weighted call graph from the call graph profile
33b842725cSMichael J. Spencer /// * Sort input sections by weight
34b842725cSMichael J. Spencer /// * For each input section starting with the highest weight
35b842725cSMichael J. Spencer ///   * Find its most likely predecessor cluster
36b842725cSMichael J. Spencer ///   * Check if the combined cluster would be too large, or would have too low
37b842725cSMichael J. Spencer ///     a density.
38b842725cSMichael J. Spencer ///   * If not, then combine the clusters.
39b842725cSMichael J. Spencer /// * Sort non-empty clusters by density
40b842725cSMichael J. Spencer ///
41b842725cSMichael J. Spencer //===----------------------------------------------------------------------===//
42b842725cSMichael J. Spencer 
43b842725cSMichael J. Spencer #include "CallGraphSort.h"
44b842725cSMichael J. Spencer #include "OutputSections.h"
45b842725cSMichael J. Spencer #include "SymbolTable.h"
46b842725cSMichael J. Spencer #include "Symbols.h"
47b842725cSMichael J. Spencer 
487588cf09SFangrui Song #include <numeric>
497588cf09SFangrui Song 
50b842725cSMichael J. Spencer using namespace llvm;
5107837b8fSFangrui Song using namespace lld;
5207837b8fSFangrui Song using namespace lld::elf;
53b842725cSMichael J. Spencer 
54b842725cSMichael J. Spencer namespace {
55b842725cSMichael J. Spencer struct Edge {
563837f427SRui Ueyama   int from;
573837f427SRui Ueyama   uint64_t weight;
58b842725cSMichael J. Spencer };
59b842725cSMichael J. Spencer 
60b842725cSMichael J. Spencer struct Cluster {
617588cf09SFangrui Song   Cluster(int sec, size_t s) : next(sec), prev(sec), size(s) {}
62b842725cSMichael J. Spencer 
63b842725cSMichael J. Spencer   double getDensity() const {
643837f427SRui Ueyama     if (size == 0)
65b842725cSMichael J. Spencer       return 0;
663837f427SRui Ueyama     return double(weight) / double(size);
67b842725cSMichael J. Spencer   }
68b842725cSMichael J. Spencer 
697588cf09SFangrui Song   int next;
707588cf09SFangrui Song   int prev;
71763671f3SZequan Wu   uint64_t size;
723837f427SRui Ueyama   uint64_t weight = 0;
733837f427SRui Ueyama   uint64_t initialWeight = 0;
743837f427SRui Ueyama   Edge bestPred = {-1, 0};
75b842725cSMichael J. Spencer };
76b842725cSMichael J. Spencer 
77b842725cSMichael J. Spencer class CallGraphSort {
78b842725cSMichael J. Spencer public:
79b842725cSMichael J. Spencer   CallGraphSort();
80b842725cSMichael J. Spencer 
81b842725cSMichael J. Spencer   DenseMap<const InputSectionBase *, int> run();
82b842725cSMichael J. Spencer 
83b842725cSMichael J. Spencer private:
843837f427SRui Ueyama   std::vector<Cluster> clusters;
853837f427SRui Ueyama   std::vector<const InputSectionBase *> sections;
86b842725cSMichael J. Spencer };
87b842725cSMichael J. Spencer 
885976a3f5SNico Weber // Maximum amount the combined cluster density can be worse than the original
89b842725cSMichael J. Spencer // cluster to consider merging.
90b842725cSMichael J. Spencer constexpr int MAX_DENSITY_DEGRADATION = 8;
91b842725cSMichael J. Spencer 
92b842725cSMichael J. Spencer // Maximum cluster size in bytes.
93b842725cSMichael J. Spencer constexpr uint64_t MAX_CLUSTER_SIZE = 1024 * 1024;
94b842725cSMichael J. Spencer } // end anonymous namespace
95b842725cSMichael J. Spencer 
9668b9f45fSRui Ueyama using SectionPair =
9768b9f45fSRui Ueyama     std::pair<const InputSectionBase *, const InputSectionBase *>;
983d354081SRui Ueyama 
99b842725cSMichael J. Spencer // Take the edge list in Config->CallGraphProfile, resolve symbol names to
100b842725cSMichael J. Spencer // Symbols, and generate a graph between InputSections with the provided
101b842725cSMichael J. Spencer // weights.
102b842725cSMichael J. Spencer CallGraphSort::CallGraphSort() {
1033837f427SRui Ueyama   MapVector<SectionPair, uint64_t> &profile = config->callGraphProfile;
1043837f427SRui Ueyama   DenseMap<const InputSectionBase *, int> secToCluster;
105b842725cSMichael J. Spencer 
1063837f427SRui Ueyama   auto getOrCreateNode = [&](const InputSectionBase *isec) -> int {
1077588cf09SFangrui Song     auto res = secToCluster.try_emplace(isec, clusters.size());
1083837f427SRui Ueyama     if (res.second) {
1093837f427SRui Ueyama       sections.push_back(isec);
1103837f427SRui Ueyama       clusters.emplace_back(clusters.size(), isec->getSize());
111b842725cSMichael J. Spencer     }
1123837f427SRui Ueyama     return res.first->second;
113b842725cSMichael J. Spencer   };
114b842725cSMichael J. Spencer 
115b842725cSMichael J. Spencer   // Create the graph.
1163837f427SRui Ueyama   for (std::pair<SectionPair, uint64_t> &c : profile) {
1173837f427SRui Ueyama     const auto *fromSB = cast<InputSectionBase>(c.first.first->repl);
1183837f427SRui Ueyama     const auto *toSB = cast<InputSectionBase>(c.first.second->repl);
1193837f427SRui Ueyama     uint64_t weight = c.second;
120b842725cSMichael J. Spencer 
121b842725cSMichael J. Spencer     // Ignore edges between input sections belonging to different output
122b842725cSMichael J. Spencer     // sections.  This is done because otherwise we would end up with clusters
123b842725cSMichael J. Spencer     // containing input sections that can't actually be placed adjacently in the
124b842725cSMichael J. Spencer     // output.  This messes with the cluster size and density calculations.  We
125b842725cSMichael J. Spencer     // would also end up moving input sections in other output sections without
126b842725cSMichael J. Spencer     // moving them closer to what calls them.
1273837f427SRui Ueyama     if (fromSB->getOutputSection() != toSB->getOutputSection())
128b842725cSMichael J. Spencer       continue;
129b842725cSMichael J. Spencer 
1303837f427SRui Ueyama     int from = getOrCreateNode(fromSB);
1313837f427SRui Ueyama     int to = getOrCreateNode(toSB);
132b842725cSMichael J. Spencer 
1333837f427SRui Ueyama     clusters[to].weight += weight;
134b842725cSMichael J. Spencer 
1353837f427SRui Ueyama     if (from == to)
136b842725cSMichael J. Spencer       continue;
137b842725cSMichael J. Spencer 
138f0eedbceSGeorge Rimar     // Remember the best edge.
1393837f427SRui Ueyama     Cluster &toC = clusters[to];
1403837f427SRui Ueyama     if (toC.bestPred.from == -1 || toC.bestPred.weight < weight) {
1413837f427SRui Ueyama       toC.bestPred.from = from;
1423837f427SRui Ueyama       toC.bestPred.weight = weight;
143f0eedbceSGeorge Rimar     }
144b842725cSMichael J. Spencer   }
1453837f427SRui Ueyama   for (Cluster &c : clusters)
1463837f427SRui Ueyama     c.initialWeight = c.weight;
147b842725cSMichael J. Spencer }
148b842725cSMichael J. Spencer 
149b842725cSMichael J. Spencer // It's bad to merge clusters which would degrade the density too much.
1503837f427SRui Ueyama static bool isNewDensityBad(Cluster &a, Cluster &b) {
1513837f427SRui Ueyama   double newDensity = double(a.weight + b.weight) / double(a.size + b.size);
1523837f427SRui Ueyama   return newDensity < a.getDensity() / MAX_DENSITY_DEGRADATION;
153b842725cSMichael J. Spencer }
154b842725cSMichael J. Spencer 
1557588cf09SFangrui Song // Find the leader of V's belonged cluster (represented as an equivalence
1567588cf09SFangrui Song // class). We apply union-find path-halving technique (simple to implement) in
1577588cf09SFangrui Song // the meantime as it decreases depths and the time complexity.
1587588cf09SFangrui Song static int getLeader(std::vector<int> &leaders, int v) {
1597588cf09SFangrui Song   while (leaders[v] != v) {
1607588cf09SFangrui Song     leaders[v] = leaders[leaders[v]];
1617588cf09SFangrui Song     v = leaders[v];
1627588cf09SFangrui Song   }
1637588cf09SFangrui Song   return v;
1647588cf09SFangrui Song }
1657588cf09SFangrui Song 
1667588cf09SFangrui Song static void mergeClusters(std::vector<Cluster> &cs, Cluster &into, int intoIdx,
1677588cf09SFangrui Song                           Cluster &from, int fromIdx) {
1687588cf09SFangrui Song   int tail1 = into.prev, tail2 = from.prev;
1697588cf09SFangrui Song   into.prev = tail2;
1707588cf09SFangrui Song   cs[tail2].next = intoIdx;
1717588cf09SFangrui Song   from.prev = tail1;
1727588cf09SFangrui Song   cs[tail1].next = fromIdx;
1733837f427SRui Ueyama   into.size += from.size;
1743837f427SRui Ueyama   into.weight += from.weight;
1753837f427SRui Ueyama   from.size = 0;
1763837f427SRui Ueyama   from.weight = 0;
177b842725cSMichael J. Spencer }
178b842725cSMichael J. Spencer 
179b842725cSMichael J. Spencer // Group InputSections into clusters using the Call-Chain Clustering heuristic
180b842725cSMichael J. Spencer // then sort the clusters by density.
1817588cf09SFangrui Song DenseMap<const InputSectionBase *, int> CallGraphSort::run() {
1827588cf09SFangrui Song   std::vector<int> sorted(clusters.size());
1837588cf09SFangrui Song   std::vector<int> leaders(clusters.size());
184b842725cSMichael J. Spencer 
1857588cf09SFangrui Song   std::iota(leaders.begin(), leaders.end(), 0);
1867588cf09SFangrui Song   std::iota(sorted.begin(), sorted.end(), 0);
1877588cf09SFangrui Song   llvm::stable_sort(sorted, [&](int a, int b) {
1883837f427SRui Ueyama     return clusters[a].getDensity() > clusters[b].getDensity();
189b842725cSMichael J. Spencer   });
190b842725cSMichael J. Spencer 
1917588cf09SFangrui Song   for (int l : sorted) {
1927588cf09SFangrui Song     // The cluster index is the same as the index of its leader here because
1937588cf09SFangrui Song     // clusters[L] has not been merged into another cluster yet.
1947588cf09SFangrui Song     Cluster &c = clusters[l];
195b842725cSMichael J. Spencer 
196f0eedbceSGeorge Rimar     // Don't consider merging if the edge is unlikely.
1973837f427SRui Ueyama     if (c.bestPred.from == -1 || c.bestPred.weight * 10 <= c.initialWeight)
198b842725cSMichael J. Spencer       continue;
199b842725cSMichael J. Spencer 
2007588cf09SFangrui Song     int predL = getLeader(leaders, c.bestPred.from);
2017588cf09SFangrui Song     if (l == predL)
202b842725cSMichael J. Spencer       continue;
203b842725cSMichael J. Spencer 
2047588cf09SFangrui Song     Cluster *predC = &clusters[predL];
2053837f427SRui Ueyama     if (c.size + predC->size > MAX_CLUSTER_SIZE)
206b842725cSMichael J. Spencer       continue;
207b842725cSMichael J. Spencer 
2083837f427SRui Ueyama     if (isNewDensityBad(*predC, c))
209b842725cSMichael J. Spencer       continue;
210b842725cSMichael J. Spencer 
2117588cf09SFangrui Song     leaders[l] = predL;
2127588cf09SFangrui Song     mergeClusters(clusters, *predC, predL, c, l);
213b842725cSMichael J. Spencer   }
214b842725cSMichael J. Spencer 
2157588cf09SFangrui Song   // Sort remaining non-empty clusters by density.
2167588cf09SFangrui Song   sorted.clear();
2177588cf09SFangrui Song   for (int i = 0, e = (int)clusters.size(); i != e; ++i)
2187588cf09SFangrui Song     if (clusters[i].size > 0)
2197588cf09SFangrui Song       sorted.push_back(i);
2207588cf09SFangrui Song   llvm::stable_sort(sorted, [&](int a, int b) {
2217588cf09SFangrui Song     return clusters[a].getDensity() > clusters[b].getDensity();
22297df22f1SGeorge Rimar   });
223b842725cSMichael J. Spencer 
2243837f427SRui Ueyama   DenseMap<const InputSectionBase *, int> orderMap;
2257588cf09SFangrui Song   int curOrder = 1;
226763671f3SZequan Wu   for (int leader : sorted) {
2277588cf09SFangrui Song     for (int i = leader;;) {
2287588cf09SFangrui Song       orderMap[sections[i]] = curOrder++;
2297588cf09SFangrui Song       i = clusters[i].next;
2307588cf09SFangrui Song       if (i == leader)
2317588cf09SFangrui Song         break;
2327588cf09SFangrui Song     }
233763671f3SZequan Wu   }
2343837f427SRui Ueyama   if (!config->printSymbolOrder.empty()) {
2353837f427SRui Ueyama     std::error_code ec;
236d9b948b6SFangrui Song     raw_fd_ostream os(config->printSymbolOrder, ec, sys::fs::OF_None);
2373837f427SRui Ueyama     if (ec) {
2383837f427SRui Ueyama       error("cannot open " + config->printSymbolOrder + ": " + ec.message());
2393837f427SRui Ueyama       return orderMap;
240432030e8SRui Ueyama     }
241432030e8SRui Ueyama 
24247cfe8f3SFangrui Song     // Print the symbols ordered by C3, in the order of increasing curOrder
24347cfe8f3SFangrui Song     // Instead of sorting all the orderMap, just repeat the loops above.
2447588cf09SFangrui Song     for (int leader : sorted)
2457588cf09SFangrui Song       for (int i = leader;;) {
246432030e8SRui Ueyama         // Search all the symbols in the file of the section
247432030e8SRui Ueyama         // and find out a Defined symbol with name that is within the section.
2487588cf09SFangrui Song         for (Symbol *sym : sections[i]->file->getSymbols())
2493837f427SRui Ueyama           if (!sym->isSection()) // Filter out section-type symbols here.
2503837f427SRui Ueyama             if (auto *d = dyn_cast<Defined>(sym))
2517588cf09SFangrui Song               if (sections[i] == d->section)
2523837f427SRui Ueyama                 os << sym->getName() << "\n";
2537588cf09SFangrui Song         i = clusters[i].next;
2547588cf09SFangrui Song         if (i == leader)
2557588cf09SFangrui Song           break;
2567588cf09SFangrui Song       }
257432030e8SRui Ueyama   }
258432030e8SRui Ueyama 
2593837f427SRui Ueyama   return orderMap;
260b842725cSMichael J. Spencer }
261b842725cSMichael J. Spencer 
262*bf6e259bSFangrui Song // Sort sections by the profile data provided by --callgraph-profile-file.
263b842725cSMichael J. Spencer //
264b842725cSMichael J. Spencer // This first builds a call graph based on the profile data then merges sections
2657c5fcb35SKazuaki Ishizaki // according to the C³ heuristic. All clusters are then sorted by a density
266b842725cSMichael J. Spencer // metric to further improve locality.
26707837b8fSFangrui Song DenseMap<const InputSectionBase *, int> elf::computeCallGraphProfileOrder() {
268b842725cSMichael J. Spencer   return CallGraphSort().run();
269b842725cSMichael J. Spencer }
270