10b57cec5SDimitry Andric //===- CallGraphSort.cpp --------------------------------------------------===//
20b57cec5SDimitry Andric //
30b57cec5SDimitry Andric // Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
40b57cec5SDimitry Andric // See https://llvm.org/LICENSE.txt for license information.
50b57cec5SDimitry Andric // SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
60b57cec5SDimitry Andric //
70b57cec5SDimitry Andric //===----------------------------------------------------------------------===//
80b57cec5SDimitry Andric ///
9*c9157d92SDimitry Andric /// The file is responsible for sorting sections using LLVM call graph profile
10*c9157d92SDimitry Andric /// data by placing frequently executed code sections together. The goal of the
11*c9157d92SDimitry Andric /// placement is to improve the runtime performance of the final executable by
12*c9157d92SDimitry Andric /// arranging code sections so that i-TLB misses and i-cache misses are reduced.
13*c9157d92SDimitry Andric ///
14*c9157d92SDimitry Andric /// The algorithm first builds a call graph based on the profile data and then
15*c9157d92SDimitry Andric /// iteratively merges "chains" (ordered lists) of input sections which will be
16*c9157d92SDimitry Andric /// laid out as a unit. There are two implementations for deciding how to
17*c9157d92SDimitry Andric /// merge a pair of chains:
18*c9157d92SDimitry Andric /// - a simpler one, referred to as Call-Chain Clustering (C^3), that follows
19*c9157d92SDimitry Andric /// "Optimizing Function Placement for Large-Scale Data-Center Applications"
200b57cec5SDimitry Andric /// https://research.fb.com/wp-content/uploads/2017/01/cgo2017-hfsort-final1.pdf
21*c9157d92SDimitry Andric /// - a more advanced one, referred to as Cache-Directed-Sort (CDSort), which
22*c9157d92SDimitry Andric /// typically produces layouts with higher locality, and hence, yields fewer
23*c9157d92SDimitry Andric /// instruction cache misses on large binaries.
240b57cec5SDimitry Andric //===----------------------------------------------------------------------===//
250b57cec5SDimitry Andric
260b57cec5SDimitry Andric #include "CallGraphSort.h"
2781ad6265SDimitry Andric #include "InputFiles.h"
2881ad6265SDimitry Andric #include "InputSection.h"
290b57cec5SDimitry Andric #include "Symbols.h"
3081ad6265SDimitry Andric #include "llvm/Support/FileSystem.h"
31*c9157d92SDimitry Andric #include "llvm/Transforms/Utils/CodeLayout.h"
320b57cec5SDimitry Andric
3385868e8aSDimitry Andric #include <numeric>
3485868e8aSDimitry Andric
350b57cec5SDimitry Andric using namespace llvm;
365ffd83dbSDimitry Andric using namespace lld;
375ffd83dbSDimitry Andric using namespace lld::elf;
380b57cec5SDimitry Andric
390b57cec5SDimitry Andric namespace {
400b57cec5SDimitry Andric struct Edge {
410b57cec5SDimitry Andric int from;
420b57cec5SDimitry Andric uint64_t weight;
430b57cec5SDimitry Andric };
440b57cec5SDimitry Andric
450b57cec5SDimitry Andric struct Cluster {
Cluster__anon4a4283fd0111::Cluster4685868e8aSDimitry Andric Cluster(int sec, size_t s) : next(sec), prev(sec), size(s) {}
470b57cec5SDimitry Andric
getDensity__anon4a4283fd0111::Cluster480b57cec5SDimitry Andric double getDensity() const {
490b57cec5SDimitry Andric if (size == 0)
500b57cec5SDimitry Andric return 0;
510b57cec5SDimitry Andric return double(weight) / double(size);
520b57cec5SDimitry Andric }
530b57cec5SDimitry Andric
5485868e8aSDimitry Andric int next;
5585868e8aSDimitry Andric int prev;
56e8d8bef9SDimitry Andric uint64_t size;
570b57cec5SDimitry Andric uint64_t weight = 0;
580b57cec5SDimitry Andric uint64_t initialWeight = 0;
590b57cec5SDimitry Andric Edge bestPred = {-1, 0};
600b57cec5SDimitry Andric };
610b57cec5SDimitry Andric
62*c9157d92SDimitry Andric /// Implementation of the Call-Chain Clustering (C^3). The goal of this
63*c9157d92SDimitry Andric /// algorithm is to improve runtime performance of the executable by arranging
64*c9157d92SDimitry Andric /// code sections such that page table and i-cache misses are minimized.
65*c9157d92SDimitry Andric ///
66*c9157d92SDimitry Andric /// Definitions:
67*c9157d92SDimitry Andric /// * Cluster
68*c9157d92SDimitry Andric /// * An ordered list of input sections which are laid out as a unit. At the
69*c9157d92SDimitry Andric /// beginning of the algorithm each input section has its own cluster and
70*c9157d92SDimitry Andric /// the weight of the cluster is the sum of the weight of all incoming
71*c9157d92SDimitry Andric /// edges.
72*c9157d92SDimitry Andric /// * Call-Chain Clustering (C³) Heuristic
73*c9157d92SDimitry Andric /// * Defines when and how clusters are combined. Pick the highest weighted
74*c9157d92SDimitry Andric /// input section then add it to its most likely predecessor if it wouldn't
75*c9157d92SDimitry Andric /// penalize it too much.
76*c9157d92SDimitry Andric /// * Density
77*c9157d92SDimitry Andric /// * The weight of the cluster divided by the size of the cluster. This is a
78*c9157d92SDimitry Andric /// proxy for the amount of execution time spent per byte of the cluster.
79*c9157d92SDimitry Andric ///
80*c9157d92SDimitry Andric /// It does so given a call graph profile by the following:
81*c9157d92SDimitry Andric /// * Build a weighted call graph from the call graph profile
82*c9157d92SDimitry Andric /// * Sort input sections by weight
83*c9157d92SDimitry Andric /// * For each input section starting with the highest weight
84*c9157d92SDimitry Andric /// * Find its most likely predecessor cluster
85*c9157d92SDimitry Andric /// * Check if the combined cluster would be too large, or would have too low
86*c9157d92SDimitry Andric /// a density.
87*c9157d92SDimitry Andric /// * If not, then combine the clusters.
88*c9157d92SDimitry Andric /// * Sort non-empty clusters by density
890b57cec5SDimitry Andric class CallGraphSort {
900b57cec5SDimitry Andric public:
910b57cec5SDimitry Andric CallGraphSort();
920b57cec5SDimitry Andric
930b57cec5SDimitry Andric DenseMap<const InputSectionBase *, int> run();
940b57cec5SDimitry Andric
950b57cec5SDimitry Andric private:
960b57cec5SDimitry Andric std::vector<Cluster> clusters;
970b57cec5SDimitry Andric std::vector<const InputSectionBase *> sections;
980b57cec5SDimitry Andric };
990b57cec5SDimitry Andric
100480093f4SDimitry Andric // Maximum amount the combined cluster density can be worse than the original
1010b57cec5SDimitry Andric // cluster to consider merging.
1020b57cec5SDimitry Andric constexpr int MAX_DENSITY_DEGRADATION = 8;
1030b57cec5SDimitry Andric
1040b57cec5SDimitry Andric // Maximum cluster size in bytes.
1050b57cec5SDimitry Andric constexpr uint64_t MAX_CLUSTER_SIZE = 1024 * 1024;
1060b57cec5SDimitry Andric } // end anonymous namespace
1070b57cec5SDimitry Andric
1080b57cec5SDimitry Andric using SectionPair =
1090b57cec5SDimitry Andric std::pair<const InputSectionBase *, const InputSectionBase *>;
1100b57cec5SDimitry Andric
1110b57cec5SDimitry Andric // Take the edge list in Config->CallGraphProfile, resolve symbol names to
1120b57cec5SDimitry Andric // Symbols, and generate a graph between InputSections with the provided
1130b57cec5SDimitry Andric // weights.
CallGraphSort()1140b57cec5SDimitry Andric CallGraphSort::CallGraphSort() {
1150b57cec5SDimitry Andric MapVector<SectionPair, uint64_t> &profile = config->callGraphProfile;
1160b57cec5SDimitry Andric DenseMap<const InputSectionBase *, int> secToCluster;
1170b57cec5SDimitry Andric
1180b57cec5SDimitry Andric auto getOrCreateNode = [&](const InputSectionBase *isec) -> int {
11985868e8aSDimitry Andric auto res = secToCluster.try_emplace(isec, clusters.size());
1200b57cec5SDimitry Andric if (res.second) {
1210b57cec5SDimitry Andric sections.push_back(isec);
1220b57cec5SDimitry Andric clusters.emplace_back(clusters.size(), isec->getSize());
1230b57cec5SDimitry Andric }
1240b57cec5SDimitry Andric return res.first->second;
1250b57cec5SDimitry Andric };
1260b57cec5SDimitry Andric
1270b57cec5SDimitry Andric // Create the graph.
1280b57cec5SDimitry Andric for (std::pair<SectionPair, uint64_t> &c : profile) {
1290eae32dcSDimitry Andric const auto *fromSB = cast<InputSectionBase>(c.first.first);
1300eae32dcSDimitry Andric const auto *toSB = cast<InputSectionBase>(c.first.second);
1310b57cec5SDimitry Andric uint64_t weight = c.second;
1320b57cec5SDimitry Andric
1330b57cec5SDimitry Andric // Ignore edges between input sections belonging to different output
1340b57cec5SDimitry Andric // sections. This is done because otherwise we would end up with clusters
1350b57cec5SDimitry Andric // containing input sections that can't actually be placed adjacently in the
1360b57cec5SDimitry Andric // output. This messes with the cluster size and density calculations. We
1370b57cec5SDimitry Andric // would also end up moving input sections in other output sections without
1380b57cec5SDimitry Andric // moving them closer to what calls them.
1390b57cec5SDimitry Andric if (fromSB->getOutputSection() != toSB->getOutputSection())
1400b57cec5SDimitry Andric continue;
1410b57cec5SDimitry Andric
1420b57cec5SDimitry Andric int from = getOrCreateNode(fromSB);
1430b57cec5SDimitry Andric int to = getOrCreateNode(toSB);
1440b57cec5SDimitry Andric
1450b57cec5SDimitry Andric clusters[to].weight += weight;
1460b57cec5SDimitry Andric
1470b57cec5SDimitry Andric if (from == to)
1480b57cec5SDimitry Andric continue;
1490b57cec5SDimitry Andric
1500b57cec5SDimitry Andric // Remember the best edge.
1510b57cec5SDimitry Andric Cluster &toC = clusters[to];
1520b57cec5SDimitry Andric if (toC.bestPred.from == -1 || toC.bestPred.weight < weight) {
1530b57cec5SDimitry Andric toC.bestPred.from = from;
1540b57cec5SDimitry Andric toC.bestPred.weight = weight;
1550b57cec5SDimitry Andric }
1560b57cec5SDimitry Andric }
1570b57cec5SDimitry Andric for (Cluster &c : clusters)
1580b57cec5SDimitry Andric c.initialWeight = c.weight;
1590b57cec5SDimitry Andric }
1600b57cec5SDimitry Andric
1610b57cec5SDimitry Andric // It's bad to merge clusters which would degrade the density too much.
isNewDensityBad(Cluster & a,Cluster & b)1620b57cec5SDimitry Andric static bool isNewDensityBad(Cluster &a, Cluster &b) {
1630b57cec5SDimitry Andric double newDensity = double(a.weight + b.weight) / double(a.size + b.size);
1640b57cec5SDimitry Andric return newDensity < a.getDensity() / MAX_DENSITY_DEGRADATION;
1650b57cec5SDimitry Andric }
1660b57cec5SDimitry Andric
16785868e8aSDimitry Andric // Find the leader of V's belonged cluster (represented as an equivalence
16885868e8aSDimitry Andric // class). We apply union-find path-halving technique (simple to implement) in
16985868e8aSDimitry Andric // the meantime as it decreases depths and the time complexity.
getLeader(int * leaders,int v)170bdd1243dSDimitry Andric static int getLeader(int *leaders, int v) {
17185868e8aSDimitry Andric while (leaders[v] != v) {
17285868e8aSDimitry Andric leaders[v] = leaders[leaders[v]];
17385868e8aSDimitry Andric v = leaders[v];
17485868e8aSDimitry Andric }
17585868e8aSDimitry Andric return v;
17685868e8aSDimitry Andric }
17785868e8aSDimitry Andric
mergeClusters(std::vector<Cluster> & cs,Cluster & into,int intoIdx,Cluster & from,int fromIdx)17885868e8aSDimitry Andric static void mergeClusters(std::vector<Cluster> &cs, Cluster &into, int intoIdx,
17985868e8aSDimitry Andric Cluster &from, int fromIdx) {
18085868e8aSDimitry Andric int tail1 = into.prev, tail2 = from.prev;
18185868e8aSDimitry Andric into.prev = tail2;
18285868e8aSDimitry Andric cs[tail2].next = intoIdx;
18385868e8aSDimitry Andric from.prev = tail1;
18485868e8aSDimitry Andric cs[tail1].next = fromIdx;
1850b57cec5SDimitry Andric into.size += from.size;
1860b57cec5SDimitry Andric into.weight += from.weight;
1870b57cec5SDimitry Andric from.size = 0;
1880b57cec5SDimitry Andric from.weight = 0;
1890b57cec5SDimitry Andric }
1900b57cec5SDimitry Andric
1910b57cec5SDimitry Andric // Group InputSections into clusters using the Call-Chain Clustering heuristic
1920b57cec5SDimitry Andric // then sort the clusters by density.
run()19385868e8aSDimitry Andric DenseMap<const InputSectionBase *, int> CallGraphSort::run() {
19485868e8aSDimitry Andric std::vector<int> sorted(clusters.size());
195bdd1243dSDimitry Andric std::unique_ptr<int[]> leaders(new int[clusters.size()]);
1960b57cec5SDimitry Andric
197bdd1243dSDimitry Andric std::iota(leaders.get(), leaders.get() + clusters.size(), 0);
19885868e8aSDimitry Andric std::iota(sorted.begin(), sorted.end(), 0);
19985868e8aSDimitry Andric llvm::stable_sort(sorted, [&](int a, int b) {
2000b57cec5SDimitry Andric return clusters[a].getDensity() > clusters[b].getDensity();
2010b57cec5SDimitry Andric });
2020b57cec5SDimitry Andric
20385868e8aSDimitry Andric for (int l : sorted) {
20485868e8aSDimitry Andric // The cluster index is the same as the index of its leader here because
20585868e8aSDimitry Andric // clusters[L] has not been merged into another cluster yet.
20685868e8aSDimitry Andric Cluster &c = clusters[l];
2070b57cec5SDimitry Andric
2080b57cec5SDimitry Andric // Don't consider merging if the edge is unlikely.
2090b57cec5SDimitry Andric if (c.bestPred.from == -1 || c.bestPred.weight * 10 <= c.initialWeight)
2100b57cec5SDimitry Andric continue;
2110b57cec5SDimitry Andric
212bdd1243dSDimitry Andric int predL = getLeader(leaders.get(), c.bestPred.from);
21385868e8aSDimitry Andric if (l == predL)
2140b57cec5SDimitry Andric continue;
2150b57cec5SDimitry Andric
21685868e8aSDimitry Andric Cluster *predC = &clusters[predL];
2170b57cec5SDimitry Andric if (c.size + predC->size > MAX_CLUSTER_SIZE)
2180b57cec5SDimitry Andric continue;
2190b57cec5SDimitry Andric
2200b57cec5SDimitry Andric if (isNewDensityBad(*predC, c))
2210b57cec5SDimitry Andric continue;
2220b57cec5SDimitry Andric
22385868e8aSDimitry Andric leaders[l] = predL;
22485868e8aSDimitry Andric mergeClusters(clusters, *predC, predL, c, l);
2250b57cec5SDimitry Andric }
2260b57cec5SDimitry Andric
22785868e8aSDimitry Andric // Sort remaining non-empty clusters by density.
22885868e8aSDimitry Andric sorted.clear();
22985868e8aSDimitry Andric for (int i = 0, e = (int)clusters.size(); i != e; ++i)
23085868e8aSDimitry Andric if (clusters[i].size > 0)
23185868e8aSDimitry Andric sorted.push_back(i);
23285868e8aSDimitry Andric llvm::stable_sort(sorted, [&](int a, int b) {
23385868e8aSDimitry Andric return clusters[a].getDensity() > clusters[b].getDensity();
2340b57cec5SDimitry Andric });
2350b57cec5SDimitry Andric
2360b57cec5SDimitry Andric DenseMap<const InputSectionBase *, int> orderMap;
23785868e8aSDimitry Andric int curOrder = 1;
238e8d8bef9SDimitry Andric for (int leader : sorted) {
23985868e8aSDimitry Andric for (int i = leader;;) {
24085868e8aSDimitry Andric orderMap[sections[i]] = curOrder++;
24185868e8aSDimitry Andric i = clusters[i].next;
24285868e8aSDimitry Andric if (i == leader)
24385868e8aSDimitry Andric break;
24485868e8aSDimitry Andric }
245e8d8bef9SDimitry Andric }
2460b57cec5SDimitry Andric if (!config->printSymbolOrder.empty()) {
2470b57cec5SDimitry Andric std::error_code ec;
24885868e8aSDimitry Andric raw_fd_ostream os(config->printSymbolOrder, ec, sys::fs::OF_None);
2490b57cec5SDimitry Andric if (ec) {
2500b57cec5SDimitry Andric error("cannot open " + config->printSymbolOrder + ": " + ec.message());
2510b57cec5SDimitry Andric return orderMap;
2520b57cec5SDimitry Andric }
2530b57cec5SDimitry Andric
2540b57cec5SDimitry Andric // Print the symbols ordered by C3, in the order of increasing curOrder
2550b57cec5SDimitry Andric // Instead of sorting all the orderMap, just repeat the loops above.
25685868e8aSDimitry Andric for (int leader : sorted)
25785868e8aSDimitry Andric for (int i = leader;;) {
2580b57cec5SDimitry Andric // Search all the symbols in the file of the section
2590b57cec5SDimitry Andric // and find out a Defined symbol with name that is within the section.
26085868e8aSDimitry Andric for (Symbol *sym : sections[i]->file->getSymbols())
2610b57cec5SDimitry Andric if (!sym->isSection()) // Filter out section-type symbols here.
2620b57cec5SDimitry Andric if (auto *d = dyn_cast<Defined>(sym))
26385868e8aSDimitry Andric if (sections[i] == d->section)
2640b57cec5SDimitry Andric os << sym->getName() << "\n";
26585868e8aSDimitry Andric i = clusters[i].next;
26685868e8aSDimitry Andric if (i == leader)
26785868e8aSDimitry Andric break;
26885868e8aSDimitry Andric }
2690b57cec5SDimitry Andric }
2700b57cec5SDimitry Andric
2710b57cec5SDimitry Andric return orderMap;
2720b57cec5SDimitry Andric }
2730b57cec5SDimitry Andric
274*c9157d92SDimitry Andric // Sort sections by the profile data using the Cache-Directed Sort algorithm.
275*c9157d92SDimitry Andric // The placement is done by optimizing the locality by co-locating frequently
276*c9157d92SDimitry Andric // executed code sections together.
computeCacheDirectedSortOrder()277*c9157d92SDimitry Andric DenseMap<const InputSectionBase *, int> elf::computeCacheDirectedSortOrder() {
278*c9157d92SDimitry Andric SmallVector<uint64_t, 0> funcSizes;
279*c9157d92SDimitry Andric SmallVector<uint64_t, 0> funcCounts;
280*c9157d92SDimitry Andric SmallVector<codelayout::EdgeCount, 0> callCounts;
281*c9157d92SDimitry Andric SmallVector<uint64_t, 0> callOffsets;
282*c9157d92SDimitry Andric SmallVector<const InputSectionBase *, 0> sections;
283*c9157d92SDimitry Andric DenseMap<const InputSectionBase *, size_t> secToTargetId;
284*c9157d92SDimitry Andric
285*c9157d92SDimitry Andric auto getOrCreateNode = [&](const InputSectionBase *inSec) -> size_t {
286*c9157d92SDimitry Andric auto res = secToTargetId.try_emplace(inSec, sections.size());
287*c9157d92SDimitry Andric if (res.second) {
288*c9157d92SDimitry Andric // inSec does not appear before in the graph.
289*c9157d92SDimitry Andric sections.push_back(inSec);
290*c9157d92SDimitry Andric funcSizes.push_back(inSec->getSize());
291*c9157d92SDimitry Andric funcCounts.push_back(0);
292*c9157d92SDimitry Andric }
293*c9157d92SDimitry Andric return res.first->second;
294*c9157d92SDimitry Andric };
295*c9157d92SDimitry Andric
296*c9157d92SDimitry Andric // Create the graph.
297*c9157d92SDimitry Andric for (std::pair<SectionPair, uint64_t> &c : config->callGraphProfile) {
298*c9157d92SDimitry Andric const InputSectionBase *fromSB = cast<InputSectionBase>(c.first.first);
299*c9157d92SDimitry Andric const InputSectionBase *toSB = cast<InputSectionBase>(c.first.second);
300*c9157d92SDimitry Andric // Ignore edges between input sections belonging to different sections.
301*c9157d92SDimitry Andric if (fromSB->getOutputSection() != toSB->getOutputSection())
302*c9157d92SDimitry Andric continue;
303*c9157d92SDimitry Andric
304*c9157d92SDimitry Andric uint64_t weight = c.second;
305*c9157d92SDimitry Andric // Ignore edges with zero weight.
306*c9157d92SDimitry Andric if (weight == 0)
307*c9157d92SDimitry Andric continue;
308*c9157d92SDimitry Andric
309*c9157d92SDimitry Andric size_t from = getOrCreateNode(fromSB);
310*c9157d92SDimitry Andric size_t to = getOrCreateNode(toSB);
311*c9157d92SDimitry Andric // Ignore self-edges (recursive calls).
312*c9157d92SDimitry Andric if (from == to)
313*c9157d92SDimitry Andric continue;
314*c9157d92SDimitry Andric
315*c9157d92SDimitry Andric callCounts.push_back({from, to, weight});
316*c9157d92SDimitry Andric // Assume that the jump is at the middle of the input section. The profile
317*c9157d92SDimitry Andric // data does not contain jump offsets.
318*c9157d92SDimitry Andric callOffsets.push_back((funcSizes[from] + 1) / 2);
319*c9157d92SDimitry Andric funcCounts[to] += weight;
320*c9157d92SDimitry Andric }
321*c9157d92SDimitry Andric
322*c9157d92SDimitry Andric // Run the layout algorithm.
323*c9157d92SDimitry Andric std::vector<uint64_t> sortedSections = codelayout::computeCacheDirectedLayout(
324*c9157d92SDimitry Andric funcSizes, funcCounts, callCounts, callOffsets);
325*c9157d92SDimitry Andric
326*c9157d92SDimitry Andric // Create the final order.
327*c9157d92SDimitry Andric DenseMap<const InputSectionBase *, int> orderMap;
328*c9157d92SDimitry Andric int curOrder = 1;
329*c9157d92SDimitry Andric for (uint64_t secIdx : sortedSections)
330*c9157d92SDimitry Andric orderMap[sections[secIdx]] = curOrder++;
331*c9157d92SDimitry Andric
332*c9157d92SDimitry Andric return orderMap;
333*c9157d92SDimitry Andric }
334*c9157d92SDimitry Andric
335349cc55cSDimitry Andric // Sort sections by the profile data provided by --callgraph-profile-file.
3360b57cec5SDimitry Andric //
3370b57cec5SDimitry Andric // This first builds a call graph based on the profile data then merges sections
338*c9157d92SDimitry Andric // according either to the C³ or Cache-Directed-Sort ordering algorithm.
computeCallGraphProfileOrder()3395ffd83dbSDimitry Andric DenseMap<const InputSectionBase *, int> elf::computeCallGraphProfileOrder() {
340*c9157d92SDimitry Andric if (config->callGraphProfileSort == CGProfileSortKind::Cdsort)
341*c9157d92SDimitry Andric return computeCacheDirectedSortOrder();
3420b57cec5SDimitry Andric return CallGraphSort().run();
3430b57cec5SDimitry Andric }
344