1b842725cSMichael J. Spencer //===- CallGraphSort.cpp --------------------------------------------------===//
2b842725cSMichael J. Spencer //
32946cd70SChandler Carruth // Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
42946cd70SChandler Carruth // See https://llvm.org/LICENSE.txt for license information.
52946cd70SChandler Carruth // SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
6b842725cSMichael J. Spencer //
7b842725cSMichael J. Spencer //===----------------------------------------------------------------------===//
8b842725cSMichael J. Spencer ///
9b842725cSMichael J. Spencer /// Implementation of Call-Chain Clustering from: Optimizing Function Placement
10b842725cSMichael J. Spencer /// for Large-Scale Data-Center Applications
11b842725cSMichael J. Spencer /// https://research.fb.com/wp-content/uploads/2017/01/cgo2017-hfsort-final1.pdf
12b842725cSMichael J. Spencer ///
13b842725cSMichael J. Spencer /// The goal of this algorithm is to improve runtime performance of the final
14b842725cSMichael J. Spencer /// executable by arranging code sections such that page table and i-cache
15b842725cSMichael J. Spencer /// misses are minimized.
16b842725cSMichael J. Spencer ///
17b842725cSMichael J. Spencer /// Definitions:
18b842725cSMichael J. Spencer /// * Cluster
195976a3f5SNico Weber /// * An ordered list of input sections which are laid out as a unit. At the
20b842725cSMichael J. Spencer /// beginning of the algorithm each input section has its own cluster and
215976a3f5SNico Weber /// the weight of the cluster is the sum of the weight of all incoming
22b842725cSMichael J. Spencer /// edges.
23b842725cSMichael J. Spencer /// * Call-Chain Clustering (C³) Heuristic
24b842725cSMichael J. Spencer /// * Defines when and how clusters are combined. Pick the highest weighted
25b842725cSMichael J. Spencer /// input section then add it to its most likely predecessor if it wouldn't
26b842725cSMichael J. Spencer /// penalize it too much.
27b842725cSMichael J. Spencer /// * Density
28b842725cSMichael J. Spencer /// * The weight of the cluster divided by the size of the cluster. This is a
295976a3f5SNico Weber /// proxy for the amount of execution time spent per byte of the cluster.
30b842725cSMichael J. Spencer ///
31b842725cSMichael J. Spencer /// It does so given a call graph profile by the following:
32b842725cSMichael J. Spencer /// * Build a weighted call graph from the call graph profile
33b842725cSMichael J. Spencer /// * Sort input sections by weight
34b842725cSMichael J. Spencer /// * For each input section starting with the highest weight
35b842725cSMichael J. Spencer /// * Find its most likely predecessor cluster
36b842725cSMichael J. Spencer /// * Check if the combined cluster would be too large, or would have too low
37b842725cSMichael J. Spencer /// a density.
38b842725cSMichael J. Spencer /// * If not, then combine the clusters.
39b842725cSMichael J. Spencer /// * Sort non-empty clusters by density
40b842725cSMichael J. Spencer ///
41b842725cSMichael J. Spencer //===----------------------------------------------------------------------===//
42b842725cSMichael J. Spencer
43b842725cSMichael J. Spencer #include "CallGraphSort.h"
44*38fbedabSFangrui Song #include "InputFiles.h"
4527bb7990SFangrui Song #include "InputSection.h"
46b842725cSMichael J. Spencer #include "Symbols.h"
4727bb7990SFangrui Song #include "llvm/Support/FileSystem.h"
48b842725cSMichael J. Spencer
497588cf09SFangrui Song #include <numeric>
507588cf09SFangrui Song
51b842725cSMichael J. Spencer using namespace llvm;
5207837b8fSFangrui Song using namespace lld;
5307837b8fSFangrui Song using namespace lld::elf;
54b842725cSMichael J. Spencer
55b842725cSMichael J. Spencer namespace {
56b842725cSMichael J. Spencer struct Edge {
573837f427SRui Ueyama int from;
583837f427SRui Ueyama uint64_t weight;
59b842725cSMichael J. Spencer };
60b842725cSMichael J. Spencer
61b842725cSMichael J. Spencer struct Cluster {
Cluster__anon5f5d43770111::Cluster627588cf09SFangrui Song Cluster(int sec, size_t s) : next(sec), prev(sec), size(s) {}
63b842725cSMichael J. Spencer
getDensity__anon5f5d43770111::Cluster64b842725cSMichael J. Spencer double getDensity() const {
653837f427SRui Ueyama if (size == 0)
66b842725cSMichael J. Spencer return 0;
673837f427SRui Ueyama return double(weight) / double(size);
68b842725cSMichael J. Spencer }
69b842725cSMichael J. Spencer
707588cf09SFangrui Song int next;
717588cf09SFangrui Song int prev;
72763671f3SZequan Wu uint64_t size;
733837f427SRui Ueyama uint64_t weight = 0;
743837f427SRui Ueyama uint64_t initialWeight = 0;
753837f427SRui Ueyama Edge bestPred = {-1, 0};
76b842725cSMichael J. Spencer };
77b842725cSMichael J. Spencer
78b842725cSMichael J. Spencer class CallGraphSort {
79b842725cSMichael J. Spencer public:
80b842725cSMichael J. Spencer CallGraphSort();
81b842725cSMichael J. Spencer
82b842725cSMichael J. Spencer DenseMap<const InputSectionBase *, int> run();
83b842725cSMichael J. Spencer
84b842725cSMichael J. Spencer private:
853837f427SRui Ueyama std::vector<Cluster> clusters;
863837f427SRui Ueyama std::vector<const InputSectionBase *> sections;
87b842725cSMichael J. Spencer };
88b842725cSMichael J. Spencer
895976a3f5SNico Weber // Maximum amount the combined cluster density can be worse than the original
90b842725cSMichael J. Spencer // cluster to consider merging.
91b842725cSMichael J. Spencer constexpr int MAX_DENSITY_DEGRADATION = 8;
92b842725cSMichael J. Spencer
93b842725cSMichael J. Spencer // Maximum cluster size in bytes.
94b842725cSMichael J. Spencer constexpr uint64_t MAX_CLUSTER_SIZE = 1024 * 1024;
95b842725cSMichael J. Spencer } // end anonymous namespace
96b842725cSMichael J. Spencer
9768b9f45fSRui Ueyama using SectionPair =
9868b9f45fSRui Ueyama std::pair<const InputSectionBase *, const InputSectionBase *>;
993d354081SRui Ueyama
100b842725cSMichael J. Spencer // Take the edge list in Config->CallGraphProfile, resolve symbol names to
101b842725cSMichael J. Spencer // Symbols, and generate a graph between InputSections with the provided
102b842725cSMichael J. Spencer // weights.
CallGraphSort()103b842725cSMichael J. Spencer CallGraphSort::CallGraphSort() {
1043837f427SRui Ueyama MapVector<SectionPair, uint64_t> &profile = config->callGraphProfile;
1053837f427SRui Ueyama DenseMap<const InputSectionBase *, int> secToCluster;
106b842725cSMichael J. Spencer
1073837f427SRui Ueyama auto getOrCreateNode = [&](const InputSectionBase *isec) -> int {
1087588cf09SFangrui Song auto res = secToCluster.try_emplace(isec, clusters.size());
1093837f427SRui Ueyama if (res.second) {
1103837f427SRui Ueyama sections.push_back(isec);
1113837f427SRui Ueyama clusters.emplace_back(clusters.size(), isec->getSize());
112b842725cSMichael J. Spencer }
1133837f427SRui Ueyama return res.first->second;
114b842725cSMichael J. Spencer };
115b842725cSMichael J. Spencer
116b842725cSMichael J. Spencer // Create the graph.
1173837f427SRui Ueyama for (std::pair<SectionPair, uint64_t> &c : profile) {
118e1b6b5beSFangrui Song const auto *fromSB = cast<InputSectionBase>(c.first.first);
119e1b6b5beSFangrui Song const auto *toSB = cast<InputSectionBase>(c.first.second);
1203837f427SRui Ueyama uint64_t weight = c.second;
121b842725cSMichael J. Spencer
122b842725cSMichael J. Spencer // Ignore edges between input sections belonging to different output
123b842725cSMichael J. Spencer // sections. This is done because otherwise we would end up with clusters
124b842725cSMichael J. Spencer // containing input sections that can't actually be placed adjacently in the
125b842725cSMichael J. Spencer // output. This messes with the cluster size and density calculations. We
126b842725cSMichael J. Spencer // would also end up moving input sections in other output sections without
127b842725cSMichael J. Spencer // moving them closer to what calls them.
1283837f427SRui Ueyama if (fromSB->getOutputSection() != toSB->getOutputSection())
129b842725cSMichael J. Spencer continue;
130b842725cSMichael J. Spencer
1313837f427SRui Ueyama int from = getOrCreateNode(fromSB);
1323837f427SRui Ueyama int to = getOrCreateNode(toSB);
133b842725cSMichael J. Spencer
1343837f427SRui Ueyama clusters[to].weight += weight;
135b842725cSMichael J. Spencer
1363837f427SRui Ueyama if (from == to)
137b842725cSMichael J. Spencer continue;
138b842725cSMichael J. Spencer
139f0eedbceSGeorge Rimar // Remember the best edge.
1403837f427SRui Ueyama Cluster &toC = clusters[to];
1413837f427SRui Ueyama if (toC.bestPred.from == -1 || toC.bestPred.weight < weight) {
1423837f427SRui Ueyama toC.bestPred.from = from;
1433837f427SRui Ueyama toC.bestPred.weight = weight;
144f0eedbceSGeorge Rimar }
145b842725cSMichael J. Spencer }
1463837f427SRui Ueyama for (Cluster &c : clusters)
1473837f427SRui Ueyama c.initialWeight = c.weight;
148b842725cSMichael J. Spencer }
149b842725cSMichael J. Spencer
150b842725cSMichael J. Spencer // It's bad to merge clusters which would degrade the density too much.
isNewDensityBad(Cluster & a,Cluster & b)1513837f427SRui Ueyama static bool isNewDensityBad(Cluster &a, Cluster &b) {
1523837f427SRui Ueyama double newDensity = double(a.weight + b.weight) / double(a.size + b.size);
1533837f427SRui Ueyama return newDensity < a.getDensity() / MAX_DENSITY_DEGRADATION;
154b842725cSMichael J. Spencer }
155b842725cSMichael J. Spencer
1567588cf09SFangrui Song // Find the leader of V's belonged cluster (represented as an equivalence
1577588cf09SFangrui Song // class). We apply union-find path-halving technique (simple to implement) in
1587588cf09SFangrui Song // the meantime as it decreases depths and the time complexity.
getLeader(std::vector<int> & leaders,int v)1597588cf09SFangrui Song static int getLeader(std::vector<int> &leaders, int v) {
1607588cf09SFangrui Song while (leaders[v] != v) {
1617588cf09SFangrui Song leaders[v] = leaders[leaders[v]];
1627588cf09SFangrui Song v = leaders[v];
1637588cf09SFangrui Song }
1647588cf09SFangrui Song return v;
1657588cf09SFangrui Song }
1667588cf09SFangrui Song
mergeClusters(std::vector<Cluster> & cs,Cluster & into,int intoIdx,Cluster & from,int fromIdx)1677588cf09SFangrui Song static void mergeClusters(std::vector<Cluster> &cs, Cluster &into, int intoIdx,
1687588cf09SFangrui Song Cluster &from, int fromIdx) {
1697588cf09SFangrui Song int tail1 = into.prev, tail2 = from.prev;
1707588cf09SFangrui Song into.prev = tail2;
1717588cf09SFangrui Song cs[tail2].next = intoIdx;
1727588cf09SFangrui Song from.prev = tail1;
1737588cf09SFangrui Song cs[tail1].next = fromIdx;
1743837f427SRui Ueyama into.size += from.size;
1753837f427SRui Ueyama into.weight += from.weight;
1763837f427SRui Ueyama from.size = 0;
1773837f427SRui Ueyama from.weight = 0;
178b842725cSMichael J. Spencer }
179b842725cSMichael J. Spencer
180b842725cSMichael J. Spencer // Group InputSections into clusters using the Call-Chain Clustering heuristic
181b842725cSMichael J. Spencer // then sort the clusters by density.
run()1827588cf09SFangrui Song DenseMap<const InputSectionBase *, int> CallGraphSort::run() {
1837588cf09SFangrui Song std::vector<int> sorted(clusters.size());
1847588cf09SFangrui Song std::vector<int> leaders(clusters.size());
185b842725cSMichael J. Spencer
1867588cf09SFangrui Song std::iota(leaders.begin(), leaders.end(), 0);
1877588cf09SFangrui Song std::iota(sorted.begin(), sorted.end(), 0);
1887588cf09SFangrui Song llvm::stable_sort(sorted, [&](int a, int b) {
1893837f427SRui Ueyama return clusters[a].getDensity() > clusters[b].getDensity();
190b842725cSMichael J. Spencer });
191b842725cSMichael J. Spencer
1927588cf09SFangrui Song for (int l : sorted) {
1937588cf09SFangrui Song // The cluster index is the same as the index of its leader here because
1947588cf09SFangrui Song // clusters[L] has not been merged into another cluster yet.
1957588cf09SFangrui Song Cluster &c = clusters[l];
196b842725cSMichael J. Spencer
197f0eedbceSGeorge Rimar // Don't consider merging if the edge is unlikely.
1983837f427SRui Ueyama if (c.bestPred.from == -1 || c.bestPred.weight * 10 <= c.initialWeight)
199b842725cSMichael J. Spencer continue;
200b842725cSMichael J. Spencer
2017588cf09SFangrui Song int predL = getLeader(leaders, c.bestPred.from);
2027588cf09SFangrui Song if (l == predL)
203b842725cSMichael J. Spencer continue;
204b842725cSMichael J. Spencer
2057588cf09SFangrui Song Cluster *predC = &clusters[predL];
2063837f427SRui Ueyama if (c.size + predC->size > MAX_CLUSTER_SIZE)
207b842725cSMichael J. Spencer continue;
208b842725cSMichael J. Spencer
2093837f427SRui Ueyama if (isNewDensityBad(*predC, c))
210b842725cSMichael J. Spencer continue;
211b842725cSMichael J. Spencer
2127588cf09SFangrui Song leaders[l] = predL;
2137588cf09SFangrui Song mergeClusters(clusters, *predC, predL, c, l);
214b842725cSMichael J. Spencer }
215b842725cSMichael J. Spencer
2167588cf09SFangrui Song // Sort remaining non-empty clusters by density.
2177588cf09SFangrui Song sorted.clear();
2187588cf09SFangrui Song for (int i = 0, e = (int)clusters.size(); i != e; ++i)
2197588cf09SFangrui Song if (clusters[i].size > 0)
2207588cf09SFangrui Song sorted.push_back(i);
2217588cf09SFangrui Song llvm::stable_sort(sorted, [&](int a, int b) {
2227588cf09SFangrui Song return clusters[a].getDensity() > clusters[b].getDensity();
22397df22f1SGeorge Rimar });
224b842725cSMichael J. Spencer
2253837f427SRui Ueyama DenseMap<const InputSectionBase *, int> orderMap;
2267588cf09SFangrui Song int curOrder = 1;
227763671f3SZequan Wu for (int leader : sorted) {
2287588cf09SFangrui Song for (int i = leader;;) {
2297588cf09SFangrui Song orderMap[sections[i]] = curOrder++;
2307588cf09SFangrui Song i = clusters[i].next;
2317588cf09SFangrui Song if (i == leader)
2327588cf09SFangrui Song break;
2337588cf09SFangrui Song }
234763671f3SZequan Wu }
2353837f427SRui Ueyama if (!config->printSymbolOrder.empty()) {
2363837f427SRui Ueyama std::error_code ec;
237d9b948b6SFangrui Song raw_fd_ostream os(config->printSymbolOrder, ec, sys::fs::OF_None);
2383837f427SRui Ueyama if (ec) {
2393837f427SRui Ueyama error("cannot open " + config->printSymbolOrder + ": " + ec.message());
2403837f427SRui Ueyama return orderMap;
241432030e8SRui Ueyama }
242432030e8SRui Ueyama
24347cfe8f3SFangrui Song // Print the symbols ordered by C3, in the order of increasing curOrder
24447cfe8f3SFangrui Song // Instead of sorting all the orderMap, just repeat the loops above.
2457588cf09SFangrui Song for (int leader : sorted)
2467588cf09SFangrui Song for (int i = leader;;) {
247432030e8SRui Ueyama // Search all the symbols in the file of the section
248432030e8SRui Ueyama // and find out a Defined symbol with name that is within the section.
2497588cf09SFangrui Song for (Symbol *sym : sections[i]->file->getSymbols())
2503837f427SRui Ueyama if (!sym->isSection()) // Filter out section-type symbols here.
2513837f427SRui Ueyama if (auto *d = dyn_cast<Defined>(sym))
2527588cf09SFangrui Song if (sections[i] == d->section)
2533837f427SRui Ueyama os << sym->getName() << "\n";
2547588cf09SFangrui Song i = clusters[i].next;
2557588cf09SFangrui Song if (i == leader)
2567588cf09SFangrui Song break;
2577588cf09SFangrui Song }
258432030e8SRui Ueyama }
259432030e8SRui Ueyama
2603837f427SRui Ueyama return orderMap;
261b842725cSMichael J. Spencer }
262b842725cSMichael J. Spencer
263bf6e259bSFangrui Song // Sort sections by the profile data provided by --callgraph-profile-file.
264b842725cSMichael J. Spencer //
265b842725cSMichael J. Spencer // This first builds a call graph based on the profile data then merges sections
2667c5fcb35SKazuaki Ishizaki // according to the C³ heuristic. All clusters are then sorted by a density
267b842725cSMichael J. Spencer // metric to further improve locality.
computeCallGraphProfileOrder()26807837b8fSFangrui Song DenseMap<const InputSectionBase *, int> elf::computeCallGraphProfileOrder() {
269b842725cSMichael J. Spencer return CallGraphSort().run();
270b842725cSMichael J. Spencer }
271