10b57cec5SDimitry Andric //===- CallGraphSort.cpp --------------------------------------------------===//
20b57cec5SDimitry Andric //
30b57cec5SDimitry Andric // Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
40b57cec5SDimitry Andric // See https://llvm.org/LICENSE.txt for license information.
50b57cec5SDimitry Andric // SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
60b57cec5SDimitry Andric //
70b57cec5SDimitry Andric //===----------------------------------------------------------------------===//
80b57cec5SDimitry Andric ///
90b57cec5SDimitry Andric /// Implementation of Call-Chain Clustering from: Optimizing Function Placement
100b57cec5SDimitry Andric /// for Large-Scale Data-Center Applications
110b57cec5SDimitry Andric /// https://research.fb.com/wp-content/uploads/2017/01/cgo2017-hfsort-final1.pdf
120b57cec5SDimitry Andric ///
130b57cec5SDimitry Andric /// The goal of this algorithm is to improve runtime performance of the final
140b57cec5SDimitry Andric /// executable by arranging code sections such that page table and i-cache
150b57cec5SDimitry Andric /// misses are minimized.
160b57cec5SDimitry Andric ///
170b57cec5SDimitry Andric /// Definitions:
180b57cec5SDimitry Andric /// * Cluster
19480093f4SDimitry Andric ///   * An ordered list of input sections which are laid out as a unit. At the
200b57cec5SDimitry Andric ///     beginning of the algorithm each input section has its own cluster and
21480093f4SDimitry Andric ///     the weight of the cluster is the sum of the weight of all incoming
220b57cec5SDimitry Andric ///     edges.
230b57cec5SDimitry Andric /// * Call-Chain Clustering (C³) Heuristic
240b57cec5SDimitry Andric ///   * Defines when and how clusters are combined. Pick the highest weighted
250b57cec5SDimitry Andric ///     input section then add it to its most likely predecessor if it wouldn't
260b57cec5SDimitry Andric ///     penalize it too much.
270b57cec5SDimitry Andric /// * Density
280b57cec5SDimitry Andric ///   * The weight of the cluster divided by the size of the cluster. This is a
29480093f4SDimitry Andric ///     proxy for the amount of execution time spent per byte of the cluster.
300b57cec5SDimitry Andric ///
310b57cec5SDimitry Andric /// It does so given a call graph profile by the following:
320b57cec5SDimitry Andric /// * Build a weighted call graph from the call graph profile
330b57cec5SDimitry Andric /// * Sort input sections by weight
340b57cec5SDimitry Andric /// * For each input section starting with the highest weight
350b57cec5SDimitry Andric ///   * Find its most likely predecessor cluster
360b57cec5SDimitry Andric ///   * Check if the combined cluster would be too large, or would have too low
370b57cec5SDimitry Andric ///     a density.
380b57cec5SDimitry Andric ///   * If not, then combine the clusters.
390b57cec5SDimitry Andric /// * Sort non-empty clusters by density
400b57cec5SDimitry Andric ///
410b57cec5SDimitry Andric //===----------------------------------------------------------------------===//
420b57cec5SDimitry Andric 
430b57cec5SDimitry Andric #include "CallGraphSort.h"
440b57cec5SDimitry Andric #include "OutputSections.h"
450b57cec5SDimitry Andric #include "SymbolTable.h"
460b57cec5SDimitry Andric #include "Symbols.h"
470b57cec5SDimitry Andric 
4885868e8aSDimitry Andric #include <numeric>
4985868e8aSDimitry Andric 
500b57cec5SDimitry Andric using namespace llvm;
515ffd83dbSDimitry Andric using namespace lld;
525ffd83dbSDimitry Andric using namespace lld::elf;
530b57cec5SDimitry Andric 
540b57cec5SDimitry Andric namespace {
550b57cec5SDimitry Andric struct Edge {
560b57cec5SDimitry Andric   int from;
570b57cec5SDimitry Andric   uint64_t weight;
580b57cec5SDimitry Andric };
590b57cec5SDimitry Andric 
600b57cec5SDimitry Andric struct Cluster {
Cluster__anon1267b3fb0111::Cluster6185868e8aSDimitry Andric   Cluster(int sec, size_t s) : next(sec), prev(sec), size(s) {}
620b57cec5SDimitry Andric 
getDensity__anon1267b3fb0111::Cluster630b57cec5SDimitry Andric   double getDensity() const {
640b57cec5SDimitry Andric     if (size == 0)
650b57cec5SDimitry Andric       return 0;
660b57cec5SDimitry Andric     return double(weight) / double(size);
670b57cec5SDimitry Andric   }
680b57cec5SDimitry Andric 
6985868e8aSDimitry Andric   int next;
7085868e8aSDimitry Andric   int prev;
71*af732203SDimitry Andric   uint64_t size;
720b57cec5SDimitry Andric   uint64_t weight = 0;
730b57cec5SDimitry Andric   uint64_t initialWeight = 0;
740b57cec5SDimitry Andric   Edge bestPred = {-1, 0};
750b57cec5SDimitry Andric };
760b57cec5SDimitry Andric 
770b57cec5SDimitry Andric class CallGraphSort {
780b57cec5SDimitry Andric public:
790b57cec5SDimitry Andric   CallGraphSort();
800b57cec5SDimitry Andric 
810b57cec5SDimitry Andric   DenseMap<const InputSectionBase *, int> run();
820b57cec5SDimitry Andric 
830b57cec5SDimitry Andric private:
840b57cec5SDimitry Andric   std::vector<Cluster> clusters;
850b57cec5SDimitry Andric   std::vector<const InputSectionBase *> sections;
860b57cec5SDimitry Andric };
870b57cec5SDimitry Andric 
88480093f4SDimitry Andric // Maximum amount the combined cluster density can be worse than the original
890b57cec5SDimitry Andric // cluster to consider merging.
900b57cec5SDimitry Andric constexpr int MAX_DENSITY_DEGRADATION = 8;
910b57cec5SDimitry Andric 
920b57cec5SDimitry Andric // Maximum cluster size in bytes.
930b57cec5SDimitry Andric constexpr uint64_t MAX_CLUSTER_SIZE = 1024 * 1024;
940b57cec5SDimitry Andric } // end anonymous namespace
950b57cec5SDimitry Andric 
960b57cec5SDimitry Andric using SectionPair =
970b57cec5SDimitry Andric     std::pair<const InputSectionBase *, const InputSectionBase *>;
980b57cec5SDimitry Andric 
990b57cec5SDimitry Andric // Take the edge list in Config->CallGraphProfile, resolve symbol names to
1000b57cec5SDimitry Andric // Symbols, and generate a graph between InputSections with the provided
1010b57cec5SDimitry Andric // weights.
CallGraphSort()1020b57cec5SDimitry Andric CallGraphSort::CallGraphSort() {
1030b57cec5SDimitry Andric   MapVector<SectionPair, uint64_t> &profile = config->callGraphProfile;
1040b57cec5SDimitry Andric   DenseMap<const InputSectionBase *, int> secToCluster;
1050b57cec5SDimitry Andric 
1060b57cec5SDimitry Andric   auto getOrCreateNode = [&](const InputSectionBase *isec) -> int {
10785868e8aSDimitry Andric     auto res = secToCluster.try_emplace(isec, clusters.size());
1080b57cec5SDimitry Andric     if (res.second) {
1090b57cec5SDimitry Andric       sections.push_back(isec);
1100b57cec5SDimitry Andric       clusters.emplace_back(clusters.size(), isec->getSize());
1110b57cec5SDimitry Andric     }
1120b57cec5SDimitry Andric     return res.first->second;
1130b57cec5SDimitry Andric   };
1140b57cec5SDimitry Andric 
1150b57cec5SDimitry Andric   // Create the graph.
1160b57cec5SDimitry Andric   for (std::pair<SectionPair, uint64_t> &c : profile) {
1170b57cec5SDimitry Andric     const auto *fromSB = cast<InputSectionBase>(c.first.first->repl);
1180b57cec5SDimitry Andric     const auto *toSB = cast<InputSectionBase>(c.first.second->repl);
1190b57cec5SDimitry Andric     uint64_t weight = c.second;
1200b57cec5SDimitry Andric 
1210b57cec5SDimitry Andric     // Ignore edges between input sections belonging to different output
1220b57cec5SDimitry Andric     // sections.  This is done because otherwise we would end up with clusters
1230b57cec5SDimitry Andric     // containing input sections that can't actually be placed adjacently in the
1240b57cec5SDimitry Andric     // output.  This messes with the cluster size and density calculations.  We
1250b57cec5SDimitry Andric     // would also end up moving input sections in other output sections without
1260b57cec5SDimitry Andric     // moving them closer to what calls them.
1270b57cec5SDimitry Andric     if (fromSB->getOutputSection() != toSB->getOutputSection())
1280b57cec5SDimitry Andric       continue;
1290b57cec5SDimitry Andric 
1300b57cec5SDimitry Andric     int from = getOrCreateNode(fromSB);
1310b57cec5SDimitry Andric     int to = getOrCreateNode(toSB);
1320b57cec5SDimitry Andric 
1330b57cec5SDimitry Andric     clusters[to].weight += weight;
1340b57cec5SDimitry Andric 
1350b57cec5SDimitry Andric     if (from == to)
1360b57cec5SDimitry Andric       continue;
1370b57cec5SDimitry Andric 
1380b57cec5SDimitry Andric     // Remember the best edge.
1390b57cec5SDimitry Andric     Cluster &toC = clusters[to];
1400b57cec5SDimitry Andric     if (toC.bestPred.from == -1 || toC.bestPred.weight < weight) {
1410b57cec5SDimitry Andric       toC.bestPred.from = from;
1420b57cec5SDimitry Andric       toC.bestPred.weight = weight;
1430b57cec5SDimitry Andric     }
1440b57cec5SDimitry Andric   }
1450b57cec5SDimitry Andric   for (Cluster &c : clusters)
1460b57cec5SDimitry Andric     c.initialWeight = c.weight;
1470b57cec5SDimitry Andric }
1480b57cec5SDimitry Andric 
1490b57cec5SDimitry Andric // It's bad to merge clusters which would degrade the density too much.
isNewDensityBad(Cluster & a,Cluster & b)1500b57cec5SDimitry Andric static bool isNewDensityBad(Cluster &a, Cluster &b) {
1510b57cec5SDimitry Andric   double newDensity = double(a.weight + b.weight) / double(a.size + b.size);
1520b57cec5SDimitry Andric   return newDensity < a.getDensity() / MAX_DENSITY_DEGRADATION;
1530b57cec5SDimitry Andric }
1540b57cec5SDimitry Andric 
15585868e8aSDimitry Andric // Find the leader of V's belonged cluster (represented as an equivalence
15685868e8aSDimitry Andric // class). We apply union-find path-halving technique (simple to implement) in
15785868e8aSDimitry Andric // the meantime as it decreases depths and the time complexity.
getLeader(std::vector<int> & leaders,int v)15885868e8aSDimitry Andric static int getLeader(std::vector<int> &leaders, int v) {
15985868e8aSDimitry Andric   while (leaders[v] != v) {
16085868e8aSDimitry Andric     leaders[v] = leaders[leaders[v]];
16185868e8aSDimitry Andric     v = leaders[v];
16285868e8aSDimitry Andric   }
16385868e8aSDimitry Andric   return v;
16485868e8aSDimitry Andric }
16585868e8aSDimitry Andric 
mergeClusters(std::vector<Cluster> & cs,Cluster & into,int intoIdx,Cluster & from,int fromIdx)16685868e8aSDimitry Andric static void mergeClusters(std::vector<Cluster> &cs, Cluster &into, int intoIdx,
16785868e8aSDimitry Andric                           Cluster &from, int fromIdx) {
16885868e8aSDimitry Andric   int tail1 = into.prev, tail2 = from.prev;
16985868e8aSDimitry Andric   into.prev = tail2;
17085868e8aSDimitry Andric   cs[tail2].next = intoIdx;
17185868e8aSDimitry Andric   from.prev = tail1;
17285868e8aSDimitry Andric   cs[tail1].next = fromIdx;
1730b57cec5SDimitry Andric   into.size += from.size;
1740b57cec5SDimitry Andric   into.weight += from.weight;
1750b57cec5SDimitry Andric   from.size = 0;
1760b57cec5SDimitry Andric   from.weight = 0;
1770b57cec5SDimitry Andric }
1780b57cec5SDimitry Andric 
1790b57cec5SDimitry Andric // Group InputSections into clusters using the Call-Chain Clustering heuristic
1800b57cec5SDimitry Andric // then sort the clusters by density.
run()18185868e8aSDimitry Andric DenseMap<const InputSectionBase *, int> CallGraphSort::run() {
18285868e8aSDimitry Andric   std::vector<int> sorted(clusters.size());
18385868e8aSDimitry Andric   std::vector<int> leaders(clusters.size());
1840b57cec5SDimitry Andric 
18585868e8aSDimitry Andric   std::iota(leaders.begin(), leaders.end(), 0);
18685868e8aSDimitry Andric   std::iota(sorted.begin(), sorted.end(), 0);
18785868e8aSDimitry Andric   llvm::stable_sort(sorted, [&](int a, int b) {
1880b57cec5SDimitry Andric     return clusters[a].getDensity() > clusters[b].getDensity();
1890b57cec5SDimitry Andric   });
1900b57cec5SDimitry Andric 
19185868e8aSDimitry Andric   for (int l : sorted) {
19285868e8aSDimitry Andric     // The cluster index is the same as the index of its leader here because
19385868e8aSDimitry Andric     // clusters[L] has not been merged into another cluster yet.
19485868e8aSDimitry Andric     Cluster &c = clusters[l];
1950b57cec5SDimitry Andric 
1960b57cec5SDimitry Andric     // Don't consider merging if the edge is unlikely.
1970b57cec5SDimitry Andric     if (c.bestPred.from == -1 || c.bestPred.weight * 10 <= c.initialWeight)
1980b57cec5SDimitry Andric       continue;
1990b57cec5SDimitry Andric 
20085868e8aSDimitry Andric     int predL = getLeader(leaders, c.bestPred.from);
20185868e8aSDimitry Andric     if (l == predL)
2020b57cec5SDimitry Andric       continue;
2030b57cec5SDimitry Andric 
20485868e8aSDimitry Andric     Cluster *predC = &clusters[predL];
2050b57cec5SDimitry Andric     if (c.size + predC->size > MAX_CLUSTER_SIZE)
2060b57cec5SDimitry Andric       continue;
2070b57cec5SDimitry Andric 
2080b57cec5SDimitry Andric     if (isNewDensityBad(*predC, c))
2090b57cec5SDimitry Andric       continue;
2100b57cec5SDimitry Andric 
21185868e8aSDimitry Andric     leaders[l] = predL;
21285868e8aSDimitry Andric     mergeClusters(clusters, *predC, predL, c, l);
2130b57cec5SDimitry Andric   }
2140b57cec5SDimitry Andric 
21585868e8aSDimitry Andric   // Sort remaining non-empty clusters by density.
21685868e8aSDimitry Andric   sorted.clear();
21785868e8aSDimitry Andric   for (int i = 0, e = (int)clusters.size(); i != e; ++i)
21885868e8aSDimitry Andric     if (clusters[i].size > 0)
21985868e8aSDimitry Andric       sorted.push_back(i);
22085868e8aSDimitry Andric   llvm::stable_sort(sorted, [&](int a, int b) {
22185868e8aSDimitry Andric     return clusters[a].getDensity() > clusters[b].getDensity();
2220b57cec5SDimitry Andric   });
2230b57cec5SDimitry Andric 
2240b57cec5SDimitry Andric   DenseMap<const InputSectionBase *, int> orderMap;
22585868e8aSDimitry Andric   int curOrder = 1;
226*af732203SDimitry Andric   for (int leader : sorted) {
22785868e8aSDimitry Andric     for (int i = leader;;) {
22885868e8aSDimitry Andric       orderMap[sections[i]] = curOrder++;
22985868e8aSDimitry Andric       i = clusters[i].next;
23085868e8aSDimitry Andric       if (i == leader)
23185868e8aSDimitry Andric         break;
23285868e8aSDimitry Andric     }
233*af732203SDimitry Andric   }
2340b57cec5SDimitry Andric   if (!config->printSymbolOrder.empty()) {
2350b57cec5SDimitry Andric     std::error_code ec;
23685868e8aSDimitry Andric     raw_fd_ostream os(config->printSymbolOrder, ec, sys::fs::OF_None);
2370b57cec5SDimitry Andric     if (ec) {
2380b57cec5SDimitry Andric       error("cannot open " + config->printSymbolOrder + ": " + ec.message());
2390b57cec5SDimitry Andric       return orderMap;
2400b57cec5SDimitry Andric     }
2410b57cec5SDimitry Andric 
2420b57cec5SDimitry Andric     // Print the symbols ordered by C3, in the order of increasing curOrder
2430b57cec5SDimitry Andric     // Instead of sorting all the orderMap, just repeat the loops above.
24485868e8aSDimitry Andric     for (int leader : sorted)
24585868e8aSDimitry Andric       for (int i = leader;;) {
2460b57cec5SDimitry Andric         // Search all the symbols in the file of the section
2470b57cec5SDimitry Andric         // and find out a Defined symbol with name that is within the section.
24885868e8aSDimitry Andric         for (Symbol *sym : sections[i]->file->getSymbols())
2490b57cec5SDimitry Andric           if (!sym->isSection()) // Filter out section-type symbols here.
2500b57cec5SDimitry Andric             if (auto *d = dyn_cast<Defined>(sym))
25185868e8aSDimitry Andric               if (sections[i] == d->section)
2520b57cec5SDimitry Andric                 os << sym->getName() << "\n";
25385868e8aSDimitry Andric         i = clusters[i].next;
25485868e8aSDimitry Andric         if (i == leader)
25585868e8aSDimitry Andric           break;
25685868e8aSDimitry Andric       }
2570b57cec5SDimitry Andric   }
2580b57cec5SDimitry Andric 
2590b57cec5SDimitry Andric   return orderMap;
2600b57cec5SDimitry Andric }
2610b57cec5SDimitry Andric 
2620b57cec5SDimitry Andric // Sort sections by the profile data provided by -callgraph-profile-file
2630b57cec5SDimitry Andric //
2640b57cec5SDimitry Andric // This first builds a call graph based on the profile data then merges sections
2655ffd83dbSDimitry Andric // according to the C³ heuristic. All clusters are then sorted by a density
2660b57cec5SDimitry Andric // metric to further improve locality.
computeCallGraphProfileOrder()2675ffd83dbSDimitry Andric DenseMap<const InputSectionBase *, int> elf::computeCallGraphProfileOrder() {
2680b57cec5SDimitry Andric   return CallGraphSort().run();
2690b57cec5SDimitry Andric }
270