14ba319b5SDimitry Andric //===- CallGraphSort.cpp --------------------------------------------------===//
24ba319b5SDimitry Andric //
34ba319b5SDimitry Andric //                             The LLVM Linker
44ba319b5SDimitry Andric //
54ba319b5SDimitry Andric // This file is distributed under the University of Illinois Open Source
64ba319b5SDimitry Andric // License. See LICENSE.TXT for details.
74ba319b5SDimitry Andric //
84ba319b5SDimitry Andric //===----------------------------------------------------------------------===//
94ba319b5SDimitry Andric ///
104ba319b5SDimitry Andric /// Implementation of Call-Chain Clustering from: Optimizing Function Placement
114ba319b5SDimitry Andric /// for Large-Scale Data-Center Applications
124ba319b5SDimitry Andric /// https://research.fb.com/wp-content/uploads/2017/01/cgo2017-hfsort-final1.pdf
134ba319b5SDimitry Andric ///
144ba319b5SDimitry Andric /// The goal of this algorithm is to improve runtime performance of the final
154ba319b5SDimitry Andric /// executable by arranging code sections such that page table and i-cache
164ba319b5SDimitry Andric /// misses are minimized.
174ba319b5SDimitry Andric ///
184ba319b5SDimitry Andric /// Definitions:
194ba319b5SDimitry Andric /// * Cluster
204ba319b5SDimitry Andric ///   * An ordered list of input sections which are layed out as a unit. At the
214ba319b5SDimitry Andric ///     beginning of the algorithm each input section has its own cluster and
224ba319b5SDimitry Andric ///     the weight of the cluster is the sum of the weight of all incomming
234ba319b5SDimitry Andric ///     edges.
244ba319b5SDimitry Andric /// * Call-Chain Clustering (C³) Heuristic
254ba319b5SDimitry Andric ///   * Defines when and how clusters are combined. Pick the highest weighted
264ba319b5SDimitry Andric ///     input section then add it to its most likely predecessor if it wouldn't
274ba319b5SDimitry Andric ///     penalize it too much.
284ba319b5SDimitry Andric /// * Density
294ba319b5SDimitry Andric ///   * The weight of the cluster divided by the size of the cluster. This is a
304ba319b5SDimitry Andric ///     proxy for the ammount of execution time spent per byte of the cluster.
314ba319b5SDimitry Andric ///
324ba319b5SDimitry Andric /// It does so given a call graph profile by the following:
334ba319b5SDimitry Andric /// * Build a weighted call graph from the call graph profile
344ba319b5SDimitry Andric /// * Sort input sections by weight
354ba319b5SDimitry Andric /// * For each input section starting with the highest weight
364ba319b5SDimitry Andric ///   * Find its most likely predecessor cluster
374ba319b5SDimitry Andric ///   * Check if the combined cluster would be too large, or would have too low
384ba319b5SDimitry Andric ///     a density.
394ba319b5SDimitry Andric ///   * If not, then combine the clusters.
404ba319b5SDimitry Andric /// * Sort non-empty clusters by density
414ba319b5SDimitry Andric ///
424ba319b5SDimitry Andric //===----------------------------------------------------------------------===//
434ba319b5SDimitry Andric 
444ba319b5SDimitry Andric #include "CallGraphSort.h"
454ba319b5SDimitry Andric #include "OutputSections.h"
464ba319b5SDimitry Andric #include "SymbolTable.h"
474ba319b5SDimitry Andric #include "Symbols.h"
484ba319b5SDimitry Andric 
494ba319b5SDimitry Andric using namespace llvm;
504ba319b5SDimitry Andric using namespace lld;
514ba319b5SDimitry Andric using namespace lld::elf;
524ba319b5SDimitry Andric 
534ba319b5SDimitry Andric namespace {
544ba319b5SDimitry Andric struct Edge {
554ba319b5SDimitry Andric   int From;
564ba319b5SDimitry Andric   uint64_t Weight;
574ba319b5SDimitry Andric };
584ba319b5SDimitry Andric 
594ba319b5SDimitry Andric struct Cluster {
Cluster__anon3cb5ab160111::Cluster60*b5893f02SDimitry Andric   Cluster(int Sec, size_t S) : Sections{Sec}, Size(S) {}
614ba319b5SDimitry Andric 
getDensity__anon3cb5ab160111::Cluster624ba319b5SDimitry Andric   double getDensity() const {
634ba319b5SDimitry Andric     if (Size == 0)
644ba319b5SDimitry Andric       return 0;
654ba319b5SDimitry Andric     return double(Weight) / double(Size);
664ba319b5SDimitry Andric   }
674ba319b5SDimitry Andric 
684ba319b5SDimitry Andric   std::vector<int> Sections;
694ba319b5SDimitry Andric   size_t Size = 0;
704ba319b5SDimitry Andric   uint64_t Weight = 0;
714ba319b5SDimitry Andric   uint64_t InitialWeight = 0;
72*b5893f02SDimitry Andric   Edge BestPred = {-1, 0};
734ba319b5SDimitry Andric };
744ba319b5SDimitry Andric 
754ba319b5SDimitry Andric class CallGraphSort {
764ba319b5SDimitry Andric public:
774ba319b5SDimitry Andric   CallGraphSort();
784ba319b5SDimitry Andric 
794ba319b5SDimitry Andric   DenseMap<const InputSectionBase *, int> run();
804ba319b5SDimitry Andric 
814ba319b5SDimitry Andric private:
824ba319b5SDimitry Andric   std::vector<Cluster> Clusters;
834ba319b5SDimitry Andric   std::vector<const InputSectionBase *> Sections;
844ba319b5SDimitry Andric 
854ba319b5SDimitry Andric   void groupClusters();
864ba319b5SDimitry Andric };
874ba319b5SDimitry Andric 
884ba319b5SDimitry Andric // Maximum ammount the combined cluster density can be worse than the original
894ba319b5SDimitry Andric // cluster to consider merging.
904ba319b5SDimitry Andric constexpr int MAX_DENSITY_DEGRADATION = 8;
914ba319b5SDimitry Andric 
924ba319b5SDimitry Andric // Maximum cluster size in bytes.
934ba319b5SDimitry Andric constexpr uint64_t MAX_CLUSTER_SIZE = 1024 * 1024;
944ba319b5SDimitry Andric } // end anonymous namespace
954ba319b5SDimitry Andric 
96*b5893f02SDimitry Andric typedef std::pair<const InputSectionBase *, const InputSectionBase *>
97*b5893f02SDimitry Andric     SectionPair;
98*b5893f02SDimitry Andric 
994ba319b5SDimitry Andric // Take the edge list in Config->CallGraphProfile, resolve symbol names to
1004ba319b5SDimitry Andric // Symbols, and generate a graph between InputSections with the provided
1014ba319b5SDimitry Andric // weights.
CallGraphSort()1024ba319b5SDimitry Andric CallGraphSort::CallGraphSort() {
103*b5893f02SDimitry Andric   MapVector<SectionPair, uint64_t> &Profile = Config->CallGraphProfile;
1044ba319b5SDimitry Andric   DenseMap<const InputSectionBase *, int> SecToCluster;
1054ba319b5SDimitry Andric 
1064ba319b5SDimitry Andric   auto GetOrCreateNode = [&](const InputSectionBase *IS) -> int {
1074ba319b5SDimitry Andric     auto Res = SecToCluster.insert(std::make_pair(IS, Clusters.size()));
1084ba319b5SDimitry Andric     if (Res.second) {
1094ba319b5SDimitry Andric       Sections.push_back(IS);
1104ba319b5SDimitry Andric       Clusters.emplace_back(Clusters.size(), IS->getSize());
1114ba319b5SDimitry Andric     }
1124ba319b5SDimitry Andric     return Res.first->second;
1134ba319b5SDimitry Andric   };
1144ba319b5SDimitry Andric 
1154ba319b5SDimitry Andric   // Create the graph.
116*b5893f02SDimitry Andric   for (std::pair<SectionPair, uint64_t> &C : Profile) {
1174ba319b5SDimitry Andric     const auto *FromSB = cast<InputSectionBase>(C.first.first->Repl);
1184ba319b5SDimitry Andric     const auto *ToSB = cast<InputSectionBase>(C.first.second->Repl);
1194ba319b5SDimitry Andric     uint64_t Weight = C.second;
1204ba319b5SDimitry Andric 
1214ba319b5SDimitry Andric     // Ignore edges between input sections belonging to different output
1224ba319b5SDimitry Andric     // sections.  This is done because otherwise we would end up with clusters
1234ba319b5SDimitry Andric     // containing input sections that can't actually be placed adjacently in the
1244ba319b5SDimitry Andric     // output.  This messes with the cluster size and density calculations.  We
1254ba319b5SDimitry Andric     // would also end up moving input sections in other output sections without
1264ba319b5SDimitry Andric     // moving them closer to what calls them.
1274ba319b5SDimitry Andric     if (FromSB->getOutputSection() != ToSB->getOutputSection())
1284ba319b5SDimitry Andric       continue;
1294ba319b5SDimitry Andric 
1304ba319b5SDimitry Andric     int From = GetOrCreateNode(FromSB);
1314ba319b5SDimitry Andric     int To = GetOrCreateNode(ToSB);
1324ba319b5SDimitry Andric 
1334ba319b5SDimitry Andric     Clusters[To].Weight += Weight;
1344ba319b5SDimitry Andric 
1354ba319b5SDimitry Andric     if (From == To)
1364ba319b5SDimitry Andric       continue;
1374ba319b5SDimitry Andric 
138*b5893f02SDimitry Andric     // Remember the best edge.
139*b5893f02SDimitry Andric     Cluster &ToC = Clusters[To];
140*b5893f02SDimitry Andric     if (ToC.BestPred.From == -1 || ToC.BestPred.Weight < Weight) {
141*b5893f02SDimitry Andric       ToC.BestPred.From = From;
142*b5893f02SDimitry Andric       ToC.BestPred.Weight = Weight;
143*b5893f02SDimitry Andric     }
1444ba319b5SDimitry Andric   }
1454ba319b5SDimitry Andric   for (Cluster &C : Clusters)
1464ba319b5SDimitry Andric     C.InitialWeight = C.Weight;
1474ba319b5SDimitry Andric }
1484ba319b5SDimitry Andric 
1494ba319b5SDimitry Andric // It's bad to merge clusters which would degrade the density too much.
isNewDensityBad(Cluster & A,Cluster & B)1504ba319b5SDimitry Andric static bool isNewDensityBad(Cluster &A, Cluster &B) {
1514ba319b5SDimitry Andric   double NewDensity = double(A.Weight + B.Weight) / double(A.Size + B.Size);
152*b5893f02SDimitry Andric   return NewDensity < A.getDensity() / MAX_DENSITY_DEGRADATION;
1534ba319b5SDimitry Andric }
1544ba319b5SDimitry Andric 
mergeClusters(Cluster & Into,Cluster & From)1554ba319b5SDimitry Andric static void mergeClusters(Cluster &Into, Cluster &From) {
1564ba319b5SDimitry Andric   Into.Sections.insert(Into.Sections.end(), From.Sections.begin(),
1574ba319b5SDimitry Andric                        From.Sections.end());
1584ba319b5SDimitry Andric   Into.Size += From.Size;
1594ba319b5SDimitry Andric   Into.Weight += From.Weight;
1604ba319b5SDimitry Andric   From.Sections.clear();
1614ba319b5SDimitry Andric   From.Size = 0;
1624ba319b5SDimitry Andric   From.Weight = 0;
1634ba319b5SDimitry Andric }
1644ba319b5SDimitry Andric 
1654ba319b5SDimitry Andric // Group InputSections into clusters using the Call-Chain Clustering heuristic
1664ba319b5SDimitry Andric // then sort the clusters by density.
groupClusters()1674ba319b5SDimitry Andric void CallGraphSort::groupClusters() {
1684ba319b5SDimitry Andric   std::vector<int> SortedSecs(Clusters.size());
1694ba319b5SDimitry Andric   std::vector<Cluster *> SecToCluster(Clusters.size());
1704ba319b5SDimitry Andric 
171*b5893f02SDimitry Andric   for (size_t I = 0; I < Clusters.size(); ++I) {
172*b5893f02SDimitry Andric     SortedSecs[I] = I;
173*b5893f02SDimitry Andric     SecToCluster[I] = &Clusters[I];
1744ba319b5SDimitry Andric   }
1754ba319b5SDimitry Andric 
1764ba319b5SDimitry Andric   std::stable_sort(SortedSecs.begin(), SortedSecs.end(), [&](int A, int B) {
1774ba319b5SDimitry Andric     return Clusters[B].getDensity() < Clusters[A].getDensity();
1784ba319b5SDimitry Andric   });
1794ba319b5SDimitry Andric 
1804ba319b5SDimitry Andric   for (int SI : SortedSecs) {
1814ba319b5SDimitry Andric     // Clusters[SI] is the same as SecToClusters[SI] here because it has not
1824ba319b5SDimitry Andric     // been merged into another cluster yet.
1834ba319b5SDimitry Andric     Cluster &C = Clusters[SI];
1844ba319b5SDimitry Andric 
185*b5893f02SDimitry Andric     // Don't consider merging if the edge is unlikely.
186*b5893f02SDimitry Andric     if (C.BestPred.From == -1 || C.BestPred.Weight * 10 <= C.InitialWeight)
1874ba319b5SDimitry Andric       continue;
1884ba319b5SDimitry Andric 
189*b5893f02SDimitry Andric     Cluster *PredC = SecToCluster[C.BestPred.From];
1904ba319b5SDimitry Andric     if (PredC == &C)
1914ba319b5SDimitry Andric       continue;
1924ba319b5SDimitry Andric 
1934ba319b5SDimitry Andric     if (C.Size + PredC->Size > MAX_CLUSTER_SIZE)
1944ba319b5SDimitry Andric       continue;
1954ba319b5SDimitry Andric 
1964ba319b5SDimitry Andric     if (isNewDensityBad(*PredC, C))
1974ba319b5SDimitry Andric       continue;
1984ba319b5SDimitry Andric 
1994ba319b5SDimitry Andric     // NOTE: Consider using a disjoint-set to track section -> cluster mapping
2004ba319b5SDimitry Andric     // if this is ever slow.
2014ba319b5SDimitry Andric     for (int SI : C.Sections)
2024ba319b5SDimitry Andric       SecToCluster[SI] = PredC;
2034ba319b5SDimitry Andric 
2044ba319b5SDimitry Andric     mergeClusters(*PredC, C);
2054ba319b5SDimitry Andric   }
2064ba319b5SDimitry Andric 
2074ba319b5SDimitry Andric   // Remove empty or dead nodes. Invalidates all cluster indices.
2084ba319b5SDimitry Andric   llvm::erase_if(Clusters, [](const Cluster &C) {
2094ba319b5SDimitry Andric     return C.Size == 0 || C.Sections.empty();
2104ba319b5SDimitry Andric   });
2114ba319b5SDimitry Andric 
2124ba319b5SDimitry Andric   // Sort by density.
2134ba319b5SDimitry Andric   std::stable_sort(Clusters.begin(), Clusters.end(),
2144ba319b5SDimitry Andric                    [](const Cluster &A, const Cluster &B) {
2154ba319b5SDimitry Andric                      return A.getDensity() > B.getDensity();
2164ba319b5SDimitry Andric                    });
2174ba319b5SDimitry Andric }
2184ba319b5SDimitry Andric 
run()2194ba319b5SDimitry Andric DenseMap<const InputSectionBase *, int> CallGraphSort::run() {
2204ba319b5SDimitry Andric   groupClusters();
2214ba319b5SDimitry Andric 
2224ba319b5SDimitry Andric   // Generate order.
223*b5893f02SDimitry Andric   DenseMap<const InputSectionBase *, int> OrderMap;
2244ba319b5SDimitry Andric   ssize_t CurOrder = 1;
2254ba319b5SDimitry Andric 
2264ba319b5SDimitry Andric   for (const Cluster &C : Clusters)
2274ba319b5SDimitry Andric     for (int SecIndex : C.Sections)
2284ba319b5SDimitry Andric       OrderMap[Sections[SecIndex]] = CurOrder++;
2294ba319b5SDimitry Andric 
2304ba319b5SDimitry Andric   return OrderMap;
2314ba319b5SDimitry Andric }
2324ba319b5SDimitry Andric 
2334ba319b5SDimitry Andric // Sort sections by the profile data provided by -callgraph-profile-file
2344ba319b5SDimitry Andric //
2354ba319b5SDimitry Andric // This first builds a call graph based on the profile data then merges sections
2364ba319b5SDimitry Andric // according to the C³ huristic. All clusters are then sorted by a density
2374ba319b5SDimitry Andric // metric to further improve locality.
computeCallGraphProfileOrder()2384ba319b5SDimitry Andric DenseMap<const InputSectionBase *, int> elf::computeCallGraphProfileOrder() {
2394ba319b5SDimitry Andric   return CallGraphSort().run();
2404ba319b5SDimitry Andric }
241