14ba319b5SDimitry Andric //===- CallGraphSort.cpp --------------------------------------------------===//
24ba319b5SDimitry Andric //
34ba319b5SDimitry Andric // The LLVM Linker
44ba319b5SDimitry Andric //
54ba319b5SDimitry Andric // This file is distributed under the University of Illinois Open Source
64ba319b5SDimitry Andric // License. See LICENSE.TXT for details.
74ba319b5SDimitry Andric //
84ba319b5SDimitry Andric //===----------------------------------------------------------------------===//
94ba319b5SDimitry Andric ///
104ba319b5SDimitry Andric /// Implementation of Call-Chain Clustering from: Optimizing Function Placement
114ba319b5SDimitry Andric /// for Large-Scale Data-Center Applications
124ba319b5SDimitry Andric /// https://research.fb.com/wp-content/uploads/2017/01/cgo2017-hfsort-final1.pdf
134ba319b5SDimitry Andric ///
144ba319b5SDimitry Andric /// The goal of this algorithm is to improve runtime performance of the final
154ba319b5SDimitry Andric /// executable by arranging code sections such that page table and i-cache
164ba319b5SDimitry Andric /// misses are minimized.
174ba319b5SDimitry Andric ///
184ba319b5SDimitry Andric /// Definitions:
194ba319b5SDimitry Andric /// * Cluster
204ba319b5SDimitry Andric /// * An ordered list of input sections which are layed out as a unit. At the
214ba319b5SDimitry Andric /// beginning of the algorithm each input section has its own cluster and
224ba319b5SDimitry Andric /// the weight of the cluster is the sum of the weight of all incomming
234ba319b5SDimitry Andric /// edges.
244ba319b5SDimitry Andric /// * Call-Chain Clustering (C³) Heuristic
254ba319b5SDimitry Andric /// * Defines when and how clusters are combined. Pick the highest weighted
264ba319b5SDimitry Andric /// input section then add it to its most likely predecessor if it wouldn't
274ba319b5SDimitry Andric /// penalize it too much.
284ba319b5SDimitry Andric /// * Density
294ba319b5SDimitry Andric /// * The weight of the cluster divided by the size of the cluster. This is a
304ba319b5SDimitry Andric /// proxy for the ammount of execution time spent per byte of the cluster.
314ba319b5SDimitry Andric ///
324ba319b5SDimitry Andric /// It does so given a call graph profile by the following:
334ba319b5SDimitry Andric /// * Build a weighted call graph from the call graph profile
344ba319b5SDimitry Andric /// * Sort input sections by weight
354ba319b5SDimitry Andric /// * For each input section starting with the highest weight
364ba319b5SDimitry Andric /// * Find its most likely predecessor cluster
374ba319b5SDimitry Andric /// * Check if the combined cluster would be too large, or would have too low
384ba319b5SDimitry Andric /// a density.
394ba319b5SDimitry Andric /// * If not, then combine the clusters.
404ba319b5SDimitry Andric /// * Sort non-empty clusters by density
414ba319b5SDimitry Andric ///
424ba319b5SDimitry Andric //===----------------------------------------------------------------------===//
434ba319b5SDimitry Andric
444ba319b5SDimitry Andric #include "CallGraphSort.h"
454ba319b5SDimitry Andric #include "OutputSections.h"
464ba319b5SDimitry Andric #include "SymbolTable.h"
474ba319b5SDimitry Andric #include "Symbols.h"
484ba319b5SDimitry Andric
494ba319b5SDimitry Andric using namespace llvm;
504ba319b5SDimitry Andric using namespace lld;
514ba319b5SDimitry Andric using namespace lld::elf;
524ba319b5SDimitry Andric
534ba319b5SDimitry Andric namespace {
544ba319b5SDimitry Andric struct Edge {
554ba319b5SDimitry Andric int From;
564ba319b5SDimitry Andric uint64_t Weight;
574ba319b5SDimitry Andric };
584ba319b5SDimitry Andric
594ba319b5SDimitry Andric struct Cluster {
Cluster__anon3cb5ab160111::Cluster60*b5893f02SDimitry Andric Cluster(int Sec, size_t S) : Sections{Sec}, Size(S) {}
614ba319b5SDimitry Andric
getDensity__anon3cb5ab160111::Cluster624ba319b5SDimitry Andric double getDensity() const {
634ba319b5SDimitry Andric if (Size == 0)
644ba319b5SDimitry Andric return 0;
654ba319b5SDimitry Andric return double(Weight) / double(Size);
664ba319b5SDimitry Andric }
674ba319b5SDimitry Andric
684ba319b5SDimitry Andric std::vector<int> Sections;
694ba319b5SDimitry Andric size_t Size = 0;
704ba319b5SDimitry Andric uint64_t Weight = 0;
714ba319b5SDimitry Andric uint64_t InitialWeight = 0;
72*b5893f02SDimitry Andric Edge BestPred = {-1, 0};
734ba319b5SDimitry Andric };
744ba319b5SDimitry Andric
754ba319b5SDimitry Andric class CallGraphSort {
764ba319b5SDimitry Andric public:
774ba319b5SDimitry Andric CallGraphSort();
784ba319b5SDimitry Andric
794ba319b5SDimitry Andric DenseMap<const InputSectionBase *, int> run();
804ba319b5SDimitry Andric
814ba319b5SDimitry Andric private:
824ba319b5SDimitry Andric std::vector<Cluster> Clusters;
834ba319b5SDimitry Andric std::vector<const InputSectionBase *> Sections;
844ba319b5SDimitry Andric
854ba319b5SDimitry Andric void groupClusters();
864ba319b5SDimitry Andric };
874ba319b5SDimitry Andric
884ba319b5SDimitry Andric // Maximum ammount the combined cluster density can be worse than the original
894ba319b5SDimitry Andric // cluster to consider merging.
904ba319b5SDimitry Andric constexpr int MAX_DENSITY_DEGRADATION = 8;
914ba319b5SDimitry Andric
924ba319b5SDimitry Andric // Maximum cluster size in bytes.
934ba319b5SDimitry Andric constexpr uint64_t MAX_CLUSTER_SIZE = 1024 * 1024;
944ba319b5SDimitry Andric } // end anonymous namespace
954ba319b5SDimitry Andric
96*b5893f02SDimitry Andric typedef std::pair<const InputSectionBase *, const InputSectionBase *>
97*b5893f02SDimitry Andric SectionPair;
98*b5893f02SDimitry Andric
994ba319b5SDimitry Andric // Take the edge list in Config->CallGraphProfile, resolve symbol names to
1004ba319b5SDimitry Andric // Symbols, and generate a graph between InputSections with the provided
1014ba319b5SDimitry Andric // weights.
CallGraphSort()1024ba319b5SDimitry Andric CallGraphSort::CallGraphSort() {
103*b5893f02SDimitry Andric MapVector<SectionPair, uint64_t> &Profile = Config->CallGraphProfile;
1044ba319b5SDimitry Andric DenseMap<const InputSectionBase *, int> SecToCluster;
1054ba319b5SDimitry Andric
1064ba319b5SDimitry Andric auto GetOrCreateNode = [&](const InputSectionBase *IS) -> int {
1074ba319b5SDimitry Andric auto Res = SecToCluster.insert(std::make_pair(IS, Clusters.size()));
1084ba319b5SDimitry Andric if (Res.second) {
1094ba319b5SDimitry Andric Sections.push_back(IS);
1104ba319b5SDimitry Andric Clusters.emplace_back(Clusters.size(), IS->getSize());
1114ba319b5SDimitry Andric }
1124ba319b5SDimitry Andric return Res.first->second;
1134ba319b5SDimitry Andric };
1144ba319b5SDimitry Andric
1154ba319b5SDimitry Andric // Create the graph.
116*b5893f02SDimitry Andric for (std::pair<SectionPair, uint64_t> &C : Profile) {
1174ba319b5SDimitry Andric const auto *FromSB = cast<InputSectionBase>(C.first.first->Repl);
1184ba319b5SDimitry Andric const auto *ToSB = cast<InputSectionBase>(C.first.second->Repl);
1194ba319b5SDimitry Andric uint64_t Weight = C.second;
1204ba319b5SDimitry Andric
1214ba319b5SDimitry Andric // Ignore edges between input sections belonging to different output
1224ba319b5SDimitry Andric // sections. This is done because otherwise we would end up with clusters
1234ba319b5SDimitry Andric // containing input sections that can't actually be placed adjacently in the
1244ba319b5SDimitry Andric // output. This messes with the cluster size and density calculations. We
1254ba319b5SDimitry Andric // would also end up moving input sections in other output sections without
1264ba319b5SDimitry Andric // moving them closer to what calls them.
1274ba319b5SDimitry Andric if (FromSB->getOutputSection() != ToSB->getOutputSection())
1284ba319b5SDimitry Andric continue;
1294ba319b5SDimitry Andric
1304ba319b5SDimitry Andric int From = GetOrCreateNode(FromSB);
1314ba319b5SDimitry Andric int To = GetOrCreateNode(ToSB);
1324ba319b5SDimitry Andric
1334ba319b5SDimitry Andric Clusters[To].Weight += Weight;
1344ba319b5SDimitry Andric
1354ba319b5SDimitry Andric if (From == To)
1364ba319b5SDimitry Andric continue;
1374ba319b5SDimitry Andric
138*b5893f02SDimitry Andric // Remember the best edge.
139*b5893f02SDimitry Andric Cluster &ToC = Clusters[To];
140*b5893f02SDimitry Andric if (ToC.BestPred.From == -1 || ToC.BestPred.Weight < Weight) {
141*b5893f02SDimitry Andric ToC.BestPred.From = From;
142*b5893f02SDimitry Andric ToC.BestPred.Weight = Weight;
143*b5893f02SDimitry Andric }
1444ba319b5SDimitry Andric }
1454ba319b5SDimitry Andric for (Cluster &C : Clusters)
1464ba319b5SDimitry Andric C.InitialWeight = C.Weight;
1474ba319b5SDimitry Andric }
1484ba319b5SDimitry Andric
1494ba319b5SDimitry Andric // It's bad to merge clusters which would degrade the density too much.
isNewDensityBad(Cluster & A,Cluster & B)1504ba319b5SDimitry Andric static bool isNewDensityBad(Cluster &A, Cluster &B) {
1514ba319b5SDimitry Andric double NewDensity = double(A.Weight + B.Weight) / double(A.Size + B.Size);
152*b5893f02SDimitry Andric return NewDensity < A.getDensity() / MAX_DENSITY_DEGRADATION;
1534ba319b5SDimitry Andric }
1544ba319b5SDimitry Andric
mergeClusters(Cluster & Into,Cluster & From)1554ba319b5SDimitry Andric static void mergeClusters(Cluster &Into, Cluster &From) {
1564ba319b5SDimitry Andric Into.Sections.insert(Into.Sections.end(), From.Sections.begin(),
1574ba319b5SDimitry Andric From.Sections.end());
1584ba319b5SDimitry Andric Into.Size += From.Size;
1594ba319b5SDimitry Andric Into.Weight += From.Weight;
1604ba319b5SDimitry Andric From.Sections.clear();
1614ba319b5SDimitry Andric From.Size = 0;
1624ba319b5SDimitry Andric From.Weight = 0;
1634ba319b5SDimitry Andric }
1644ba319b5SDimitry Andric
1654ba319b5SDimitry Andric // Group InputSections into clusters using the Call-Chain Clustering heuristic
1664ba319b5SDimitry Andric // then sort the clusters by density.
groupClusters()1674ba319b5SDimitry Andric void CallGraphSort::groupClusters() {
1684ba319b5SDimitry Andric std::vector<int> SortedSecs(Clusters.size());
1694ba319b5SDimitry Andric std::vector<Cluster *> SecToCluster(Clusters.size());
1704ba319b5SDimitry Andric
171*b5893f02SDimitry Andric for (size_t I = 0; I < Clusters.size(); ++I) {
172*b5893f02SDimitry Andric SortedSecs[I] = I;
173*b5893f02SDimitry Andric SecToCluster[I] = &Clusters[I];
1744ba319b5SDimitry Andric }
1754ba319b5SDimitry Andric
1764ba319b5SDimitry Andric std::stable_sort(SortedSecs.begin(), SortedSecs.end(), [&](int A, int B) {
1774ba319b5SDimitry Andric return Clusters[B].getDensity() < Clusters[A].getDensity();
1784ba319b5SDimitry Andric });
1794ba319b5SDimitry Andric
1804ba319b5SDimitry Andric for (int SI : SortedSecs) {
1814ba319b5SDimitry Andric // Clusters[SI] is the same as SecToClusters[SI] here because it has not
1824ba319b5SDimitry Andric // been merged into another cluster yet.
1834ba319b5SDimitry Andric Cluster &C = Clusters[SI];
1844ba319b5SDimitry Andric
185*b5893f02SDimitry Andric // Don't consider merging if the edge is unlikely.
186*b5893f02SDimitry Andric if (C.BestPred.From == -1 || C.BestPred.Weight * 10 <= C.InitialWeight)
1874ba319b5SDimitry Andric continue;
1884ba319b5SDimitry Andric
189*b5893f02SDimitry Andric Cluster *PredC = SecToCluster[C.BestPred.From];
1904ba319b5SDimitry Andric if (PredC == &C)
1914ba319b5SDimitry Andric continue;
1924ba319b5SDimitry Andric
1934ba319b5SDimitry Andric if (C.Size + PredC->Size > MAX_CLUSTER_SIZE)
1944ba319b5SDimitry Andric continue;
1954ba319b5SDimitry Andric
1964ba319b5SDimitry Andric if (isNewDensityBad(*PredC, C))
1974ba319b5SDimitry Andric continue;
1984ba319b5SDimitry Andric
1994ba319b5SDimitry Andric // NOTE: Consider using a disjoint-set to track section -> cluster mapping
2004ba319b5SDimitry Andric // if this is ever slow.
2014ba319b5SDimitry Andric for (int SI : C.Sections)
2024ba319b5SDimitry Andric SecToCluster[SI] = PredC;
2034ba319b5SDimitry Andric
2044ba319b5SDimitry Andric mergeClusters(*PredC, C);
2054ba319b5SDimitry Andric }
2064ba319b5SDimitry Andric
2074ba319b5SDimitry Andric // Remove empty or dead nodes. Invalidates all cluster indices.
2084ba319b5SDimitry Andric llvm::erase_if(Clusters, [](const Cluster &C) {
2094ba319b5SDimitry Andric return C.Size == 0 || C.Sections.empty();
2104ba319b5SDimitry Andric });
2114ba319b5SDimitry Andric
2124ba319b5SDimitry Andric // Sort by density.
2134ba319b5SDimitry Andric std::stable_sort(Clusters.begin(), Clusters.end(),
2144ba319b5SDimitry Andric [](const Cluster &A, const Cluster &B) {
2154ba319b5SDimitry Andric return A.getDensity() > B.getDensity();
2164ba319b5SDimitry Andric });
2174ba319b5SDimitry Andric }
2184ba319b5SDimitry Andric
run()2194ba319b5SDimitry Andric DenseMap<const InputSectionBase *, int> CallGraphSort::run() {
2204ba319b5SDimitry Andric groupClusters();
2214ba319b5SDimitry Andric
2224ba319b5SDimitry Andric // Generate order.
223*b5893f02SDimitry Andric DenseMap<const InputSectionBase *, int> OrderMap;
2244ba319b5SDimitry Andric ssize_t CurOrder = 1;
2254ba319b5SDimitry Andric
2264ba319b5SDimitry Andric for (const Cluster &C : Clusters)
2274ba319b5SDimitry Andric for (int SecIndex : C.Sections)
2284ba319b5SDimitry Andric OrderMap[Sections[SecIndex]] = CurOrder++;
2294ba319b5SDimitry Andric
2304ba319b5SDimitry Andric return OrderMap;
2314ba319b5SDimitry Andric }
2324ba319b5SDimitry Andric
2334ba319b5SDimitry Andric // Sort sections by the profile data provided by -callgraph-profile-file
2344ba319b5SDimitry Andric //
2354ba319b5SDimitry Andric // This first builds a call graph based on the profile data then merges sections
2364ba319b5SDimitry Andric // according to the C³ huristic. All clusters are then sorted by a density
2374ba319b5SDimitry Andric // metric to further improve locality.
computeCallGraphProfileOrder()2384ba319b5SDimitry Andric DenseMap<const InputSectionBase *, int> elf::computeCallGraphProfileOrder() {
2394ba319b5SDimitry Andric return CallGraphSort().run();
2404ba319b5SDimitry Andric }
241