10eae32dcSDimitry Andric //===- CodeLayout.cpp - Implementation of code layout algorithms ----------===//
20eae32dcSDimitry Andric //
30eae32dcSDimitry Andric // Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
40eae32dcSDimitry Andric // See https://llvm.org/LICENSE.txt for license information.
50eae32dcSDimitry Andric // SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
60eae32dcSDimitry Andric //
70eae32dcSDimitry Andric //===----------------------------------------------------------------------===//
80eae32dcSDimitry Andric //
9fe013be4SDimitry Andric // The file implements "cache-aware" layout algorithms of basic blocks and
10fe013be4SDimitry Andric // functions in a binary.
110eae32dcSDimitry Andric //
120eae32dcSDimitry Andric // The algorithm tries to find a layout of nodes (basic blocks) of a given CFG
130eae32dcSDimitry Andric // optimizing jump locality and thus processor I-cache utilization. This is
140eae32dcSDimitry Andric // achieved via increasing the number of fall-through jumps and co-locating
150eae32dcSDimitry Andric // frequently executed nodes together. The name follows the underlying
160eae32dcSDimitry Andric // optimization problem, Extended-TSP, which is a generalization of classical
170eae32dcSDimitry Andric // (maximum) Traveling Salesmen Problem.
180eae32dcSDimitry Andric //
190eae32dcSDimitry Andric // The algorithm is a greedy heuristic that works with chains (ordered lists)
200eae32dcSDimitry Andric // of basic blocks. Initially all chains are isolated basic blocks. On every
210eae32dcSDimitry Andric // iteration, we pick a pair of chains whose merging yields the biggest increase
220eae32dcSDimitry Andric // in the ExtTSP score, which models how i-cache "friendly" a specific chain is.
230eae32dcSDimitry Andric // A pair of chains giving the maximum gain is merged into a new chain. The
240eae32dcSDimitry Andric // procedure stops when there is only one chain left, or when merging does not
250eae32dcSDimitry Andric // increase ExtTSP. In the latter case, the remaining chains are sorted by
260eae32dcSDimitry Andric // density in the decreasing order.
270eae32dcSDimitry Andric //
280eae32dcSDimitry Andric // An important aspect is the way two chains are merged. Unlike earlier
290eae32dcSDimitry Andric // algorithms (e.g., based on the approach of Pettis-Hansen), two
300eae32dcSDimitry Andric // chains, X and Y, are first split into three, X1, X2, and Y. Then we
310eae32dcSDimitry Andric // consider all possible ways of gluing the three chains (e.g., X1YX2, X1X2Y,
320eae32dcSDimitry Andric // X2X1Y, X2YX1, YX1X2, YX2X1) and choose the one producing the largest score.
330eae32dcSDimitry Andric // This improves the quality of the final result (the search space is larger)
340eae32dcSDimitry Andric // while keeping the implementation sufficiently fast.
350eae32dcSDimitry Andric //
360eae32dcSDimitry Andric // Reference:
370eae32dcSDimitry Andric //   * A. Newell and S. Pupyrev, Improved Basic Block Reordering,
380eae32dcSDimitry Andric //     IEEE Transactions on Computers, 2020
39bdd1243dSDimitry Andric //     https://arxiv.org/abs/1809.04676
400eae32dcSDimitry Andric //
410eae32dcSDimitry Andric //===----------------------------------------------------------------------===//
420eae32dcSDimitry Andric 
430eae32dcSDimitry Andric #include "llvm/Transforms/Utils/CodeLayout.h"
440eae32dcSDimitry Andric #include "llvm/Support/CommandLine.h"
45fe013be4SDimitry Andric #include "llvm/Support/Debug.h"
460eae32dcSDimitry Andric 
47bdd1243dSDimitry Andric #include <cmath>
48*c9157d92SDimitry Andric #include <set>
49bdd1243dSDimitry Andric 
500eae32dcSDimitry Andric using namespace llvm;
51*c9157d92SDimitry Andric using namespace llvm::codelayout;
52*c9157d92SDimitry Andric 
530eae32dcSDimitry Andric #define DEBUG_TYPE "code-layout"
540eae32dcSDimitry Andric 
55fe013be4SDimitry Andric namespace llvm {
5681ad6265SDimitry Andric cl::opt<bool> EnableExtTspBlockPlacement(
5781ad6265SDimitry Andric     "enable-ext-tsp-block-placement", cl::Hidden, cl::init(false),
5881ad6265SDimitry Andric     cl::desc("Enable machine block placement based on the ext-tsp model, "
5981ad6265SDimitry Andric              "optimizing I-cache utilization."));
6081ad6265SDimitry Andric 
6181ad6265SDimitry Andric cl::opt<bool> ApplyExtTspWithoutProfile(
6281ad6265SDimitry Andric     "ext-tsp-apply-without-profile",
6381ad6265SDimitry Andric     cl::desc("Whether to apply ext-tsp placement for instances w/o profile"),
6481ad6265SDimitry Andric     cl::init(true), cl::Hidden);
65fe013be4SDimitry Andric } // namespace llvm
6681ad6265SDimitry Andric 
67*c9157d92SDimitry Andric // Algorithm-specific params for Ext-TSP. The values are tuned for the best
68*c9157d92SDimitry Andric // performance of large-scale front-end bound binaries.
69bdd1243dSDimitry Andric static cl::opt<double> ForwardWeightCond(
70bdd1243dSDimitry Andric     "ext-tsp-forward-weight-cond", cl::ReallyHidden, cl::init(0.1),
71bdd1243dSDimitry Andric     cl::desc("The weight of conditional forward jumps for ExtTSP value"));
720eae32dcSDimitry Andric 
73bdd1243dSDimitry Andric static cl::opt<double> ForwardWeightUncond(
74bdd1243dSDimitry Andric     "ext-tsp-forward-weight-uncond", cl::ReallyHidden, cl::init(0.1),
75bdd1243dSDimitry Andric     cl::desc("The weight of unconditional forward jumps for ExtTSP value"));
76bdd1243dSDimitry Andric 
77bdd1243dSDimitry Andric static cl::opt<double> BackwardWeightCond(
78bdd1243dSDimitry Andric     "ext-tsp-backward-weight-cond", cl::ReallyHidden, cl::init(0.1),
79fe013be4SDimitry Andric     cl::desc("The weight of conditional backward jumps for ExtTSP value"));
80bdd1243dSDimitry Andric 
81bdd1243dSDimitry Andric static cl::opt<double> BackwardWeightUncond(
82bdd1243dSDimitry Andric     "ext-tsp-backward-weight-uncond", cl::ReallyHidden, cl::init(0.1),
83fe013be4SDimitry Andric     cl::desc("The weight of unconditional backward jumps for ExtTSP value"));
84bdd1243dSDimitry Andric 
85bdd1243dSDimitry Andric static cl::opt<double> FallthroughWeightCond(
86bdd1243dSDimitry Andric     "ext-tsp-fallthrough-weight-cond", cl::ReallyHidden, cl::init(1.0),
87bdd1243dSDimitry Andric     cl::desc("The weight of conditional fallthrough jumps for ExtTSP value"));
88bdd1243dSDimitry Andric 
89bdd1243dSDimitry Andric static cl::opt<double> FallthroughWeightUncond(
90bdd1243dSDimitry Andric     "ext-tsp-fallthrough-weight-uncond", cl::ReallyHidden, cl::init(1.05),
91bdd1243dSDimitry Andric     cl::desc("The weight of unconditional fallthrough jumps for ExtTSP value"));
920eae32dcSDimitry Andric 
930eae32dcSDimitry Andric static cl::opt<unsigned> ForwardDistance(
94bdd1243dSDimitry Andric     "ext-tsp-forward-distance", cl::ReallyHidden, cl::init(1024),
950eae32dcSDimitry Andric     cl::desc("The maximum distance (in bytes) of a forward jump for ExtTSP"));
960eae32dcSDimitry Andric 
970eae32dcSDimitry Andric static cl::opt<unsigned> BackwardDistance(
98bdd1243dSDimitry Andric     "ext-tsp-backward-distance", cl::ReallyHidden, cl::init(640),
990eae32dcSDimitry Andric     cl::desc("The maximum distance (in bytes) of a backward jump for ExtTSP"));
1000eae32dcSDimitry Andric 
10181ad6265SDimitry Andric // The maximum size of a chain created by the algorithm. The size is bounded
102*c9157d92SDimitry Andric // so that the algorithm can efficiently process extremely large instances.
10381ad6265SDimitry Andric static cl::opt<unsigned>
104*c9157d92SDimitry Andric     MaxChainSize("ext-tsp-max-chain-size", cl::ReallyHidden, cl::init(512),
105*c9157d92SDimitry Andric                  cl::desc("The maximum size of a chain to create"));
10681ad6265SDimitry Andric 
1070eae32dcSDimitry Andric // The maximum size of a chain for splitting. Larger values of the threshold
1080eae32dcSDimitry Andric // may yield better quality at the cost of worsen run-time.
1090eae32dcSDimitry Andric static cl::opt<unsigned> ChainSplitThreshold(
110bdd1243dSDimitry Andric     "ext-tsp-chain-split-threshold", cl::ReallyHidden, cl::init(128),
1110eae32dcSDimitry Andric     cl::desc("The maximum size of a chain to apply splitting"));
1120eae32dcSDimitry Andric 
113*c9157d92SDimitry Andric // The maximum ratio between densities of two chains for merging.
114*c9157d92SDimitry Andric static cl::opt<double> MaxMergeDensityRatio(
115*c9157d92SDimitry Andric     "ext-tsp-max-merge-density-ratio", cl::ReallyHidden, cl::init(100),
116*c9157d92SDimitry Andric     cl::desc("The maximum ratio between densities of two chains for merging"));
117*c9157d92SDimitry Andric 
118*c9157d92SDimitry Andric // Algorithm-specific options for CDSort.
119*c9157d92SDimitry Andric static cl::opt<unsigned> CacheEntries("cdsort-cache-entries", cl::ReallyHidden,
120*c9157d92SDimitry Andric                                       cl::desc("The size of the cache"));
121*c9157d92SDimitry Andric 
122*c9157d92SDimitry Andric static cl::opt<unsigned> CacheSize("cdsort-cache-size", cl::ReallyHidden,
123*c9157d92SDimitry Andric                                    cl::desc("The size of a line in the cache"));
124*c9157d92SDimitry Andric 
125*c9157d92SDimitry Andric static cl::opt<unsigned>
126*c9157d92SDimitry Andric     CDMaxChainSize("cdsort-max-chain-size", cl::ReallyHidden,
127*c9157d92SDimitry Andric                    cl::desc("The maximum size of a chain to create"));
128*c9157d92SDimitry Andric 
129*c9157d92SDimitry Andric static cl::opt<double> DistancePower(
130*c9157d92SDimitry Andric     "cdsort-distance-power", cl::ReallyHidden,
131*c9157d92SDimitry Andric     cl::desc("The power exponent for the distance-based locality"));
132*c9157d92SDimitry Andric 
133*c9157d92SDimitry Andric static cl::opt<double> FrequencyScale(
134*c9157d92SDimitry Andric     "cdsort-frequency-scale", cl::ReallyHidden,
135*c9157d92SDimitry Andric     cl::desc("The scale factor for the frequency-based locality"));
1360eae32dcSDimitry Andric 
1370eae32dcSDimitry Andric namespace {
1380eae32dcSDimitry Andric 
1390eae32dcSDimitry Andric // Epsilon for comparison of doubles.
1400eae32dcSDimitry Andric constexpr double EPS = 1e-8;
1410eae32dcSDimitry Andric 
142bdd1243dSDimitry Andric // Compute the Ext-TSP score for a given jump.
jumpExtTSPScore(uint64_t JumpDist,uint64_t JumpMaxDist,uint64_t Count,double Weight)143bdd1243dSDimitry Andric double jumpExtTSPScore(uint64_t JumpDist, uint64_t JumpMaxDist, uint64_t Count,
144bdd1243dSDimitry Andric                        double Weight) {
145bdd1243dSDimitry Andric   if (JumpDist > JumpMaxDist)
146bdd1243dSDimitry Andric     return 0;
147bdd1243dSDimitry Andric   double Prob = 1.0 - static_cast<double>(JumpDist) / JumpMaxDist;
148bdd1243dSDimitry Andric   return Weight * Prob * Count;
149bdd1243dSDimitry Andric }
150bdd1243dSDimitry Andric 
1510eae32dcSDimitry Andric // Compute the Ext-TSP score for a jump between a given pair of blocks,
1520eae32dcSDimitry Andric // using their sizes, (estimated) addresses and the jump execution count.
extTSPScore(uint64_t SrcAddr,uint64_t SrcSize,uint64_t DstAddr,uint64_t Count,bool IsConditional)1530eae32dcSDimitry Andric double extTSPScore(uint64_t SrcAddr, uint64_t SrcSize, uint64_t DstAddr,
154bdd1243dSDimitry Andric                    uint64_t Count, bool IsConditional) {
1550eae32dcSDimitry Andric   // Fallthrough
1560eae32dcSDimitry Andric   if (SrcAddr + SrcSize == DstAddr) {
157bdd1243dSDimitry Andric     return jumpExtTSPScore(0, 1, Count,
158bdd1243dSDimitry Andric                            IsConditional ? FallthroughWeightCond
159bdd1243dSDimitry Andric                                          : FallthroughWeightUncond);
1600eae32dcSDimitry Andric   }
1610eae32dcSDimitry Andric   // Forward
1620eae32dcSDimitry Andric   if (SrcAddr + SrcSize < DstAddr) {
163bdd1243dSDimitry Andric     const uint64_t Dist = DstAddr - (SrcAddr + SrcSize);
164bdd1243dSDimitry Andric     return jumpExtTSPScore(Dist, ForwardDistance, Count,
165bdd1243dSDimitry Andric                            IsConditional ? ForwardWeightCond
166bdd1243dSDimitry Andric                                          : ForwardWeightUncond);
1670eae32dcSDimitry Andric   }
1680eae32dcSDimitry Andric   // Backward
169bdd1243dSDimitry Andric   const uint64_t Dist = SrcAddr + SrcSize - DstAddr;
170bdd1243dSDimitry Andric   return jumpExtTSPScore(Dist, BackwardDistance, Count,
171bdd1243dSDimitry Andric                          IsConditional ? BackwardWeightCond
172bdd1243dSDimitry Andric                                        : BackwardWeightUncond);
1730eae32dcSDimitry Andric }
1740eae32dcSDimitry Andric 
1750eae32dcSDimitry Andric /// A type of merging two chains, X and Y. The former chain is split into
1760eae32dcSDimitry Andric /// X1 and X2 and then concatenated with Y in the order specified by the type.
177fe013be4SDimitry Andric enum class MergeTypeT : int { X_Y, Y_X, X1_Y_X2, Y_X2_X1, X2_X1_Y };
1780eae32dcSDimitry Andric 
1790eae32dcSDimitry Andric /// The gain of merging two chains, that is, the Ext-TSP score of the merge
180fe013be4SDimitry Andric /// together with the corresponding merge 'type' and 'offset'.
181fe013be4SDimitry Andric struct MergeGainT {
182fe013be4SDimitry Andric   explicit MergeGainT() = default;
MergeGainT__anond1605d3d0111::MergeGainT183fe013be4SDimitry Andric   explicit MergeGainT(double Score, size_t MergeOffset, MergeTypeT MergeType)
1840eae32dcSDimitry Andric       : Score(Score), MergeOffset(MergeOffset), MergeType(MergeType) {}
1850eae32dcSDimitry Andric 
score__anond1605d3d0111::MergeGainT1860eae32dcSDimitry Andric   double score() const { return Score; }
1870eae32dcSDimitry Andric 
mergeOffset__anond1605d3d0111::MergeGainT1880eae32dcSDimitry Andric   size_t mergeOffset() const { return MergeOffset; }
1890eae32dcSDimitry Andric 
mergeType__anond1605d3d0111::MergeGainT190fe013be4SDimitry Andric   MergeTypeT mergeType() const { return MergeType; }
191fe013be4SDimitry Andric 
setMergeType__anond1605d3d0111::MergeGainT192fe013be4SDimitry Andric   void setMergeType(MergeTypeT Ty) { MergeType = Ty; }
1930eae32dcSDimitry Andric 
1940eae32dcSDimitry Andric   // Returns 'true' iff Other is preferred over this.
operator <__anond1605d3d0111::MergeGainT195fe013be4SDimitry Andric   bool operator<(const MergeGainT &Other) const {
1960eae32dcSDimitry Andric     return (Other.Score > EPS && Other.Score > Score + EPS);
1970eae32dcSDimitry Andric   }
1980eae32dcSDimitry Andric 
1990eae32dcSDimitry Andric   // Update the current gain if Other is preferred over this.
updateIfLessThan__anond1605d3d0111::MergeGainT200fe013be4SDimitry Andric   void updateIfLessThan(const MergeGainT &Other) {
2010eae32dcSDimitry Andric     if (*this < Other)
2020eae32dcSDimitry Andric       *this = Other;
2030eae32dcSDimitry Andric   }
2040eae32dcSDimitry Andric 
2050eae32dcSDimitry Andric private:
2060eae32dcSDimitry Andric   double Score{-1.0};
2070eae32dcSDimitry Andric   size_t MergeOffset{0};
208fe013be4SDimitry Andric   MergeTypeT MergeType{MergeTypeT::X_Y};
2090eae32dcSDimitry Andric };
2100eae32dcSDimitry Andric 
211fe013be4SDimitry Andric struct JumpT;
212fe013be4SDimitry Andric struct ChainT;
213fe013be4SDimitry Andric struct ChainEdge;
2140eae32dcSDimitry Andric 
215fe013be4SDimitry Andric /// A node in the graph, typically corresponding to a basic block in the CFG or
216fe013be4SDimitry Andric /// a function in the call graph.
217fe013be4SDimitry Andric struct NodeT {
218fe013be4SDimitry Andric   NodeT(const NodeT &) = delete;
219fe013be4SDimitry Andric   NodeT(NodeT &&) = default;
220fe013be4SDimitry Andric   NodeT &operator=(const NodeT &) = delete;
221fe013be4SDimitry Andric   NodeT &operator=(NodeT &&) = default;
2220eae32dcSDimitry Andric 
NodeT__anond1605d3d0111::NodeT223*c9157d92SDimitry Andric   explicit NodeT(size_t Index, uint64_t Size, uint64_t Count)
224*c9157d92SDimitry Andric       : Index(Index), Size(Size), ExecutionCount(Count) {}
225fe013be4SDimitry Andric 
isEntry__anond1605d3d0111::NodeT2260eae32dcSDimitry Andric   bool isEntry() const { return Index == 0; }
227fe013be4SDimitry Andric 
228*c9157d92SDimitry Andric   // Check if Other is a successor of the node.
229*c9157d92SDimitry Andric   bool isSuccessor(const NodeT *Other) const;
230*c9157d92SDimitry Andric 
231fe013be4SDimitry Andric   // The total execution count of outgoing jumps.
232fe013be4SDimitry Andric   uint64_t outCount() const;
233fe013be4SDimitry Andric 
234fe013be4SDimitry Andric   // The total execution count of incoming jumps.
235fe013be4SDimitry Andric   uint64_t inCount() const;
236fe013be4SDimitry Andric 
237fe013be4SDimitry Andric   // The original index of the node in graph.
238fe013be4SDimitry Andric   size_t Index{0};
239fe013be4SDimitry Andric   // The index of the node in the current chain.
240fe013be4SDimitry Andric   size_t CurIndex{0};
241fe013be4SDimitry Andric   // The size of the node in the binary.
242fe013be4SDimitry Andric   uint64_t Size{0};
243fe013be4SDimitry Andric   // The execution count of the node in the profile data.
244fe013be4SDimitry Andric   uint64_t ExecutionCount{0};
245fe013be4SDimitry Andric   // The current chain of the node.
246fe013be4SDimitry Andric   ChainT *CurChain{nullptr};
247fe013be4SDimitry Andric   // The offset of the node in the current chain.
248fe013be4SDimitry Andric   mutable uint64_t EstimatedAddr{0};
249fe013be4SDimitry Andric   // Forced successor of the node in the graph.
250fe013be4SDimitry Andric   NodeT *ForcedSucc{nullptr};
251fe013be4SDimitry Andric   // Forced predecessor of the node in the graph.
252fe013be4SDimitry Andric   NodeT *ForcedPred{nullptr};
253fe013be4SDimitry Andric   // Outgoing jumps from the node.
254fe013be4SDimitry Andric   std::vector<JumpT *> OutJumps;
255fe013be4SDimitry Andric   // Incoming jumps to the node.
256fe013be4SDimitry Andric   std::vector<JumpT *> InJumps;
2570eae32dcSDimitry Andric };
2580eae32dcSDimitry Andric 
259fe013be4SDimitry Andric /// An arc in the graph, typically corresponding to a jump between two nodes.
260fe013be4SDimitry Andric struct JumpT {
261fe013be4SDimitry Andric   JumpT(const JumpT &) = delete;
262fe013be4SDimitry Andric   JumpT(JumpT &&) = default;
263fe013be4SDimitry Andric   JumpT &operator=(const JumpT &) = delete;
264fe013be4SDimitry Andric   JumpT &operator=(JumpT &&) = default;
2650eae32dcSDimitry Andric 
JumpT__anond1605d3d0111::JumpT266fe013be4SDimitry Andric   explicit JumpT(NodeT *Source, NodeT *Target, uint64_t ExecutionCount)
267fe013be4SDimitry Andric       : Source(Source), Target(Target), ExecutionCount(ExecutionCount) {}
268fe013be4SDimitry Andric 
269fe013be4SDimitry Andric   // Source node of the jump.
270fe013be4SDimitry Andric   NodeT *Source;
271fe013be4SDimitry Andric   // Target node of the jump.
272fe013be4SDimitry Andric   NodeT *Target;
2730eae32dcSDimitry Andric   // Execution count of the arc in the profile data.
2740eae32dcSDimitry Andric   uint64_t ExecutionCount{0};
275bdd1243dSDimitry Andric   // Whether the jump corresponds to a conditional branch.
276bdd1243dSDimitry Andric   bool IsConditional{false};
277fe013be4SDimitry Andric   // The offset of the jump from the source node.
278fe013be4SDimitry Andric   uint64_t Offset{0};
2790eae32dcSDimitry Andric };
2800eae32dcSDimitry Andric 
281fe013be4SDimitry Andric /// A chain (ordered sequence) of nodes in the graph.
282fe013be4SDimitry Andric struct ChainT {
283fe013be4SDimitry Andric   ChainT(const ChainT &) = delete;
284fe013be4SDimitry Andric   ChainT(ChainT &&) = default;
285fe013be4SDimitry Andric   ChainT &operator=(const ChainT &) = delete;
286fe013be4SDimitry Andric   ChainT &operator=(ChainT &&) = default;
2870eae32dcSDimitry Andric 
ChainT__anond1605d3d0111::ChainT288fe013be4SDimitry Andric   explicit ChainT(uint64_t Id, NodeT *Node)
289fe013be4SDimitry Andric       : Id(Id), ExecutionCount(Node->ExecutionCount), Size(Node->Size),
290fe013be4SDimitry Andric         Nodes(1, Node) {}
2910eae32dcSDimitry Andric 
numBlocks__anond1605d3d0111::ChainT292fe013be4SDimitry Andric   size_t numBlocks() const { return Nodes.size(); }
2930eae32dcSDimitry Andric 
density__anond1605d3d0111::ChainT294*c9157d92SDimitry Andric   double density() const { return ExecutionCount / Size; }
295fe013be4SDimitry Andric 
isEntry__anond1605d3d0111::ChainT296fe013be4SDimitry Andric   bool isEntry() const { return Nodes[0]->Index == 0; }
2970eae32dcSDimitry Andric 
isCold__anond1605d3d0111::ChainT298bdd1243dSDimitry Andric   bool isCold() const {
299fe013be4SDimitry Andric     for (NodeT *Node : Nodes) {
300fe013be4SDimitry Andric       if (Node->ExecutionCount > 0)
301bdd1243dSDimitry Andric         return false;
302bdd1243dSDimitry Andric     }
303bdd1243dSDimitry Andric     return true;
304bdd1243dSDimitry Andric   }
305bdd1243dSDimitry Andric 
getEdge__anond1605d3d0111::ChainT306fe013be4SDimitry Andric   ChainEdge *getEdge(ChainT *Other) const {
307*c9157d92SDimitry Andric     for (const auto &[Chain, ChainEdge] : Edges) {
308*c9157d92SDimitry Andric       if (Chain == Other)
309*c9157d92SDimitry Andric         return ChainEdge;
3100eae32dcSDimitry Andric     }
3110eae32dcSDimitry Andric     return nullptr;
3120eae32dcSDimitry Andric   }
3130eae32dcSDimitry Andric 
removeEdge__anond1605d3d0111::ChainT314fe013be4SDimitry Andric   void removeEdge(ChainT *Other) {
3150eae32dcSDimitry Andric     auto It = Edges.begin();
3160eae32dcSDimitry Andric     while (It != Edges.end()) {
3170eae32dcSDimitry Andric       if (It->first == Other) {
3180eae32dcSDimitry Andric         Edges.erase(It);
3190eae32dcSDimitry Andric         return;
3200eae32dcSDimitry Andric       }
3210eae32dcSDimitry Andric       It++;
3220eae32dcSDimitry Andric     }
3230eae32dcSDimitry Andric   }
3240eae32dcSDimitry Andric 
addEdge__anond1605d3d0111::ChainT325fe013be4SDimitry Andric   void addEdge(ChainT *Other, ChainEdge *Edge) {
3260eae32dcSDimitry Andric     Edges.push_back(std::make_pair(Other, Edge));
3270eae32dcSDimitry Andric   }
3280eae32dcSDimitry Andric 
merge__anond1605d3d0111::ChainT329*c9157d92SDimitry Andric   void merge(ChainT *Other, std::vector<NodeT *> MergedBlocks) {
330*c9157d92SDimitry Andric     Nodes = std::move(MergedBlocks);
331*c9157d92SDimitry Andric     // Update the chain's data.
332fe013be4SDimitry Andric     ExecutionCount += Other->ExecutionCount;
333fe013be4SDimitry Andric     Size += Other->Size;
334fe013be4SDimitry Andric     Id = Nodes[0]->Index;
335*c9157d92SDimitry Andric     // Update the node's data.
336fe013be4SDimitry Andric     for (size_t Idx = 0; Idx < Nodes.size(); Idx++) {
337fe013be4SDimitry Andric       Nodes[Idx]->CurChain = this;
338fe013be4SDimitry Andric       Nodes[Idx]->CurIndex = Idx;
3390eae32dcSDimitry Andric     }
3400eae32dcSDimitry Andric   }
3410eae32dcSDimitry Andric 
342fe013be4SDimitry Andric   void mergeEdges(ChainT *Other);
3430eae32dcSDimitry Andric 
clear__anond1605d3d0111::ChainT3440eae32dcSDimitry Andric   void clear() {
345fe013be4SDimitry Andric     Nodes.clear();
346fe013be4SDimitry Andric     Nodes.shrink_to_fit();
3470eae32dcSDimitry Andric     Edges.clear();
3480eae32dcSDimitry Andric     Edges.shrink_to_fit();
3490eae32dcSDimitry Andric   }
3500eae32dcSDimitry Andric 
3510eae32dcSDimitry Andric   // Unique chain identifier.
3520eae32dcSDimitry Andric   uint64_t Id;
3530eae32dcSDimitry Andric   // Cached ext-tsp score for the chain.
354fe013be4SDimitry Andric   double Score{0};
355*c9157d92SDimitry Andric   // The total execution count of the chain. Since the execution count of
356*c9157d92SDimitry Andric   // a basic block is uint64_t, using doubles here to avoid overflow.
357*c9157d92SDimitry Andric   double ExecutionCount{0};
358fe013be4SDimitry Andric   // The total size of the chain.
359fe013be4SDimitry Andric   uint64_t Size{0};
360fe013be4SDimitry Andric   // Nodes of the chain.
361fe013be4SDimitry Andric   std::vector<NodeT *> Nodes;
3620eae32dcSDimitry Andric   // Adjacent chains and corresponding edges (lists of jumps).
363fe013be4SDimitry Andric   std::vector<std::pair<ChainT *, ChainEdge *>> Edges;
3640eae32dcSDimitry Andric };
3650eae32dcSDimitry Andric 
366fe013be4SDimitry Andric /// An edge in the graph representing jumps between two chains.
367fe013be4SDimitry Andric /// When nodes are merged into chains, the edges are combined too so that
368*c9157d92SDimitry Andric /// there is always at most one edge between a pair of chains.
369fe013be4SDimitry Andric struct ChainEdge {
3700eae32dcSDimitry Andric   ChainEdge(const ChainEdge &) = delete;
3710eae32dcSDimitry Andric   ChainEdge(ChainEdge &&) = default;
3720eae32dcSDimitry Andric   ChainEdge &operator=(const ChainEdge &) = delete;
373fe013be4SDimitry Andric   ChainEdge &operator=(ChainEdge &&) = delete;
3740eae32dcSDimitry Andric 
ChainEdge__anond1605d3d0111::ChainEdge375fe013be4SDimitry Andric   explicit ChainEdge(JumpT *Jump)
3760eae32dcSDimitry Andric       : SrcChain(Jump->Source->CurChain), DstChain(Jump->Target->CurChain),
3770eae32dcSDimitry Andric         Jumps(1, Jump) {}
3780eae32dcSDimitry Andric 
srcChain__anond1605d3d0111::ChainEdge379fe013be4SDimitry Andric   ChainT *srcChain() const { return SrcChain; }
3800eae32dcSDimitry Andric 
dstChain__anond1605d3d0111::ChainEdge381fe013be4SDimitry Andric   ChainT *dstChain() const { return DstChain; }
3820eae32dcSDimitry Andric 
isSelfEdge__anond1605d3d0111::ChainEdge383fe013be4SDimitry Andric   bool isSelfEdge() const { return SrcChain == DstChain; }
384fe013be4SDimitry Andric 
jumps__anond1605d3d0111::ChainEdge385fe013be4SDimitry Andric   const std::vector<JumpT *> &jumps() const { return Jumps; }
386fe013be4SDimitry Andric 
appendJump__anond1605d3d0111::ChainEdge387fe013be4SDimitry Andric   void appendJump(JumpT *Jump) { Jumps.push_back(Jump); }
3880eae32dcSDimitry Andric 
moveJumps__anond1605d3d0111::ChainEdge3890eae32dcSDimitry Andric   void moveJumps(ChainEdge *Other) {
3900eae32dcSDimitry Andric     Jumps.insert(Jumps.end(), Other->Jumps.begin(), Other->Jumps.end());
3910eae32dcSDimitry Andric     Other->Jumps.clear();
3920eae32dcSDimitry Andric     Other->Jumps.shrink_to_fit();
3930eae32dcSDimitry Andric   }
3940eae32dcSDimitry Andric 
changeEndpoint__anond1605d3d0111::ChainEdge395fe013be4SDimitry Andric   void changeEndpoint(ChainT *From, ChainT *To) {
396fe013be4SDimitry Andric     if (From == SrcChain)
397fe013be4SDimitry Andric       SrcChain = To;
398fe013be4SDimitry Andric     if (From == DstChain)
399fe013be4SDimitry Andric       DstChain = To;
400fe013be4SDimitry Andric   }
401fe013be4SDimitry Andric 
hasCachedMergeGain__anond1605d3d0111::ChainEdge402fe013be4SDimitry Andric   bool hasCachedMergeGain(ChainT *Src, ChainT *Dst) const {
4030eae32dcSDimitry Andric     return Src == SrcChain ? CacheValidForward : CacheValidBackward;
4040eae32dcSDimitry Andric   }
4050eae32dcSDimitry Andric 
getCachedMergeGain__anond1605d3d0111::ChainEdge406fe013be4SDimitry Andric   MergeGainT getCachedMergeGain(ChainT *Src, ChainT *Dst) const {
4070eae32dcSDimitry Andric     return Src == SrcChain ? CachedGainForward : CachedGainBackward;
4080eae32dcSDimitry Andric   }
4090eae32dcSDimitry Andric 
setCachedMergeGain__anond1605d3d0111::ChainEdge410fe013be4SDimitry Andric   void setCachedMergeGain(ChainT *Src, ChainT *Dst, MergeGainT MergeGain) {
4110eae32dcSDimitry Andric     if (Src == SrcChain) {
4120eae32dcSDimitry Andric       CachedGainForward = MergeGain;
4130eae32dcSDimitry Andric       CacheValidForward = true;
4140eae32dcSDimitry Andric     } else {
4150eae32dcSDimitry Andric       CachedGainBackward = MergeGain;
4160eae32dcSDimitry Andric       CacheValidBackward = true;
4170eae32dcSDimitry Andric     }
4180eae32dcSDimitry Andric   }
4190eae32dcSDimitry Andric 
invalidateCache__anond1605d3d0111::ChainEdge4200eae32dcSDimitry Andric   void invalidateCache() {
4210eae32dcSDimitry Andric     CacheValidForward = false;
4220eae32dcSDimitry Andric     CacheValidBackward = false;
4230eae32dcSDimitry Andric   }
4240eae32dcSDimitry Andric 
setMergeGain__anond1605d3d0111::ChainEdge425fe013be4SDimitry Andric   void setMergeGain(MergeGainT Gain) { CachedGain = Gain; }
426fe013be4SDimitry Andric 
getMergeGain__anond1605d3d0111::ChainEdge427fe013be4SDimitry Andric   MergeGainT getMergeGain() const { return CachedGain; }
428fe013be4SDimitry Andric 
gain__anond1605d3d0111::ChainEdge429fe013be4SDimitry Andric   double gain() const { return CachedGain.score(); }
430fe013be4SDimitry Andric 
4310eae32dcSDimitry Andric private:
4320eae32dcSDimitry Andric   // Source chain.
433fe013be4SDimitry Andric   ChainT *SrcChain{nullptr};
4340eae32dcSDimitry Andric   // Destination chain.
435fe013be4SDimitry Andric   ChainT *DstChain{nullptr};
436fe013be4SDimitry Andric   // Original jumps in the binary with corresponding execution counts.
437fe013be4SDimitry Andric   std::vector<JumpT *> Jumps;
438fe013be4SDimitry Andric   // Cached gain value for merging the pair of chains.
439fe013be4SDimitry Andric   MergeGainT CachedGain;
440fe013be4SDimitry Andric 
441fe013be4SDimitry Andric   // Cached gain values for merging the pair of chains. Since the gain of
442fe013be4SDimitry Andric   // merging (Src, Dst) and (Dst, Src) might be different, we store both values
443fe013be4SDimitry Andric   // here and a flag indicating which of the options results in a higher gain.
444fe013be4SDimitry Andric   // Cached gain values.
445fe013be4SDimitry Andric   MergeGainT CachedGainForward;
446fe013be4SDimitry Andric   MergeGainT CachedGainBackward;
4470eae32dcSDimitry Andric   // Whether the cached value must be recomputed.
4480eae32dcSDimitry Andric   bool CacheValidForward{false};
4490eae32dcSDimitry Andric   bool CacheValidBackward{false};
4500eae32dcSDimitry Andric };
4510eae32dcSDimitry Andric 
isSuccessor(const NodeT * Other) const452*c9157d92SDimitry Andric bool NodeT::isSuccessor(const NodeT *Other) const {
453*c9157d92SDimitry Andric   for (JumpT *Jump : OutJumps)
454*c9157d92SDimitry Andric     if (Jump->Target == Other)
455*c9157d92SDimitry Andric       return true;
456*c9157d92SDimitry Andric   return false;
457*c9157d92SDimitry Andric }
458*c9157d92SDimitry Andric 
outCount() const459fe013be4SDimitry Andric uint64_t NodeT::outCount() const {
460fe013be4SDimitry Andric   uint64_t Count = 0;
461*c9157d92SDimitry Andric   for (JumpT *Jump : OutJumps)
462fe013be4SDimitry Andric     Count += Jump->ExecutionCount;
463fe013be4SDimitry Andric   return Count;
464fe013be4SDimitry Andric }
4650eae32dcSDimitry Andric 
inCount() const466fe013be4SDimitry Andric uint64_t NodeT::inCount() const {
467fe013be4SDimitry Andric   uint64_t Count = 0;
468*c9157d92SDimitry Andric   for (JumpT *Jump : InJumps)
469fe013be4SDimitry Andric     Count += Jump->ExecutionCount;
470fe013be4SDimitry Andric   return Count;
471fe013be4SDimitry Andric }
472fe013be4SDimitry Andric 
mergeEdges(ChainT * Other)473fe013be4SDimitry Andric void ChainT::mergeEdges(ChainT *Other) {
474*c9157d92SDimitry Andric   // Update edges adjacent to chain Other.
475*c9157d92SDimitry Andric   for (const auto &[DstChain, DstEdge] : Other->Edges) {
476fe013be4SDimitry Andric     ChainT *TargetChain = DstChain == Other ? this : DstChain;
477bdd1243dSDimitry Andric     ChainEdge *CurEdge = getEdge(TargetChain);
4780eae32dcSDimitry Andric     if (CurEdge == nullptr) {
4790eae32dcSDimitry Andric       DstEdge->changeEndpoint(Other, this);
4800eae32dcSDimitry Andric       this->addEdge(TargetChain, DstEdge);
481*c9157d92SDimitry Andric       if (DstChain != this && DstChain != Other)
4820eae32dcSDimitry Andric         DstChain->addEdge(this, DstEdge);
4830eae32dcSDimitry Andric     } else {
4840eae32dcSDimitry Andric       CurEdge->moveJumps(DstEdge);
4850eae32dcSDimitry Andric     }
486*c9157d92SDimitry Andric     // Cleanup leftover edge.
487*c9157d92SDimitry Andric     if (DstChain != Other)
4880eae32dcSDimitry Andric       DstChain->removeEdge(Other);
4890eae32dcSDimitry Andric   }
4900eae32dcSDimitry Andric }
4910eae32dcSDimitry Andric 
492fe013be4SDimitry Andric using NodeIter = std::vector<NodeT *>::const_iterator;
493*c9157d92SDimitry Andric static std::vector<NodeT *> EmptyList;
4940eae32dcSDimitry Andric 
495*c9157d92SDimitry Andric /// A wrapper around three concatenated vectors (chains) of nodes; it is used
496*c9157d92SDimitry Andric /// to avoid extra instantiation of the vectors.
497*c9157d92SDimitry Andric struct MergedNodesT {
MergedNodesT__anond1605d3d0111::MergedNodesT498*c9157d92SDimitry Andric   MergedNodesT(NodeIter Begin1, NodeIter End1,
499*c9157d92SDimitry Andric                NodeIter Begin2 = EmptyList.begin(),
500*c9157d92SDimitry Andric                NodeIter End2 = EmptyList.end(),
501*c9157d92SDimitry Andric                NodeIter Begin3 = EmptyList.begin(),
502*c9157d92SDimitry Andric                NodeIter End3 = EmptyList.end())
5030eae32dcSDimitry Andric       : Begin1(Begin1), End1(End1), Begin2(Begin2), End2(End2), Begin3(Begin3),
5040eae32dcSDimitry Andric         End3(End3) {}
5050eae32dcSDimitry Andric 
forEach__anond1605d3d0111::MergedNodesT5060eae32dcSDimitry Andric   template <typename F> void forEach(const F &Func) const {
5070eae32dcSDimitry Andric     for (auto It = Begin1; It != End1; It++)
5080eae32dcSDimitry Andric       Func(*It);
5090eae32dcSDimitry Andric     for (auto It = Begin2; It != End2; It++)
5100eae32dcSDimitry Andric       Func(*It);
5110eae32dcSDimitry Andric     for (auto It = Begin3; It != End3; It++)
5120eae32dcSDimitry Andric       Func(*It);
5130eae32dcSDimitry Andric   }
5140eae32dcSDimitry Andric 
getNodes__anond1605d3d0111::MergedNodesT515fe013be4SDimitry Andric   std::vector<NodeT *> getNodes() const {
516fe013be4SDimitry Andric     std::vector<NodeT *> Result;
5170eae32dcSDimitry Andric     Result.reserve(std::distance(Begin1, End1) + std::distance(Begin2, End2) +
5180eae32dcSDimitry Andric                    std::distance(Begin3, End3));
5190eae32dcSDimitry Andric     Result.insert(Result.end(), Begin1, End1);
5200eae32dcSDimitry Andric     Result.insert(Result.end(), Begin2, End2);
5210eae32dcSDimitry Andric     Result.insert(Result.end(), Begin3, End3);
5220eae32dcSDimitry Andric     return Result;
5230eae32dcSDimitry Andric   }
5240eae32dcSDimitry Andric 
getFirstNode__anond1605d3d0111::MergedNodesT525fe013be4SDimitry Andric   const NodeT *getFirstNode() const { return *Begin1; }
5260eae32dcSDimitry Andric 
5270eae32dcSDimitry Andric private:
528fe013be4SDimitry Andric   NodeIter Begin1;
529fe013be4SDimitry Andric   NodeIter End1;
530fe013be4SDimitry Andric   NodeIter Begin2;
531fe013be4SDimitry Andric   NodeIter End2;
532fe013be4SDimitry Andric   NodeIter Begin3;
533fe013be4SDimitry Andric   NodeIter End3;
5340eae32dcSDimitry Andric };
5350eae32dcSDimitry Andric 
536*c9157d92SDimitry Andric /// A wrapper around two concatenated vectors (chains) of jumps.
537*c9157d92SDimitry Andric struct MergedJumpsT {
MergedJumpsT__anond1605d3d0111::MergedJumpsT538*c9157d92SDimitry Andric   MergedJumpsT(const std::vector<JumpT *> *Jumps1,
539*c9157d92SDimitry Andric                const std::vector<JumpT *> *Jumps2 = nullptr) {
540*c9157d92SDimitry Andric     assert(!Jumps1->empty() && "cannot merge empty jump list");
541*c9157d92SDimitry Andric     JumpArray[0] = Jumps1;
542*c9157d92SDimitry Andric     JumpArray[1] = Jumps2;
543*c9157d92SDimitry Andric   }
544*c9157d92SDimitry Andric 
forEach__anond1605d3d0111::MergedJumpsT545*c9157d92SDimitry Andric   template <typename F> void forEach(const F &Func) const {
546*c9157d92SDimitry Andric     for (auto Jumps : JumpArray)
547*c9157d92SDimitry Andric       if (Jumps != nullptr)
548*c9157d92SDimitry Andric         for (JumpT *Jump : *Jumps)
549*c9157d92SDimitry Andric           Func(Jump);
550*c9157d92SDimitry Andric   }
551*c9157d92SDimitry Andric 
552*c9157d92SDimitry Andric private:
553*c9157d92SDimitry Andric   std::array<const std::vector<JumpT *> *, 2> JumpArray{nullptr, nullptr};
554*c9157d92SDimitry Andric };
555*c9157d92SDimitry Andric 
556fe013be4SDimitry Andric /// Merge two chains of nodes respecting a given 'type' and 'offset'.
557fe013be4SDimitry Andric ///
558fe013be4SDimitry Andric /// If MergeType == 0, then the result is a concatenation of two chains.
559fe013be4SDimitry Andric /// Otherwise, the first chain is cut into two sub-chains at the offset,
560fe013be4SDimitry Andric /// and merged using all possible ways of concatenating three chains.
mergeNodes(const std::vector<NodeT * > & X,const std::vector<NodeT * > & Y,size_t MergeOffset,MergeTypeT MergeType)561*c9157d92SDimitry Andric MergedNodesT mergeNodes(const std::vector<NodeT *> &X,
562fe013be4SDimitry Andric                         const std::vector<NodeT *> &Y, size_t MergeOffset,
563fe013be4SDimitry Andric                         MergeTypeT MergeType) {
564*c9157d92SDimitry Andric   // Split the first chain, X, into X1 and X2.
565fe013be4SDimitry Andric   NodeIter BeginX1 = X.begin();
566fe013be4SDimitry Andric   NodeIter EndX1 = X.begin() + MergeOffset;
567fe013be4SDimitry Andric   NodeIter BeginX2 = X.begin() + MergeOffset;
568fe013be4SDimitry Andric   NodeIter EndX2 = X.end();
569fe013be4SDimitry Andric   NodeIter BeginY = Y.begin();
570fe013be4SDimitry Andric   NodeIter EndY = Y.end();
571fe013be4SDimitry Andric 
572*c9157d92SDimitry Andric   // Construct a new chain from the three existing ones.
573fe013be4SDimitry Andric   switch (MergeType) {
574fe013be4SDimitry Andric   case MergeTypeT::X_Y:
575*c9157d92SDimitry Andric     return MergedNodesT(BeginX1, EndX2, BeginY, EndY);
576fe013be4SDimitry Andric   case MergeTypeT::Y_X:
577*c9157d92SDimitry Andric     return MergedNodesT(BeginY, EndY, BeginX1, EndX2);
578fe013be4SDimitry Andric   case MergeTypeT::X1_Y_X2:
579*c9157d92SDimitry Andric     return MergedNodesT(BeginX1, EndX1, BeginY, EndY, BeginX2, EndX2);
580fe013be4SDimitry Andric   case MergeTypeT::Y_X2_X1:
581*c9157d92SDimitry Andric     return MergedNodesT(BeginY, EndY, BeginX2, EndX2, BeginX1, EndX1);
582fe013be4SDimitry Andric   case MergeTypeT::X2_X1_Y:
583*c9157d92SDimitry Andric     return MergedNodesT(BeginX2, EndX2, BeginX1, EndX1, BeginY, EndY);
584fe013be4SDimitry Andric   }
585fe013be4SDimitry Andric   llvm_unreachable("unexpected chain merge type");
586fe013be4SDimitry Andric }
587fe013be4SDimitry Andric 
5880eae32dcSDimitry Andric /// The implementation of the ExtTSP algorithm.
5890eae32dcSDimitry Andric class ExtTSPImpl {
5900eae32dcSDimitry Andric public:
ExtTSPImpl(ArrayRef<uint64_t> NodeSizes,ArrayRef<uint64_t> NodeCounts,ArrayRef<EdgeCount> EdgeCounts)591*c9157d92SDimitry Andric   ExtTSPImpl(ArrayRef<uint64_t> NodeSizes, ArrayRef<uint64_t> NodeCounts,
592*c9157d92SDimitry Andric              ArrayRef<EdgeCount> EdgeCounts)
593fe013be4SDimitry Andric       : NumNodes(NodeSizes.size()) {
5940eae32dcSDimitry Andric     initialize(NodeSizes, NodeCounts, EdgeCounts);
5950eae32dcSDimitry Andric   }
5960eae32dcSDimitry Andric 
597fe013be4SDimitry Andric   /// Run the algorithm and return an optimized ordering of nodes.
run()598*c9157d92SDimitry Andric   std::vector<uint64_t> run() {
599fe013be4SDimitry Andric     // Pass 1: Merge nodes with their mutually forced successors
6000eae32dcSDimitry Andric     mergeForcedPairs();
6010eae32dcSDimitry Andric 
6020eae32dcSDimitry Andric     // Pass 2: Merge pairs of chains while improving the ExtTSP objective
6030eae32dcSDimitry Andric     mergeChainPairs();
6040eae32dcSDimitry Andric 
605fe013be4SDimitry Andric     // Pass 3: Merge cold nodes to reduce code size
6060eae32dcSDimitry Andric     mergeColdChains();
6070eae32dcSDimitry Andric 
608fe013be4SDimitry Andric     // Collect nodes from all chains
609*c9157d92SDimitry Andric     return concatChains();
6100eae32dcSDimitry Andric   }
6110eae32dcSDimitry Andric 
6120eae32dcSDimitry Andric private:
6130eae32dcSDimitry Andric   /// Initialize the algorithm's data structures.
initialize(const ArrayRef<uint64_t> & NodeSizes,const ArrayRef<uint64_t> & NodeCounts,const ArrayRef<EdgeCount> & EdgeCounts)614*c9157d92SDimitry Andric   void initialize(const ArrayRef<uint64_t> &NodeSizes,
615*c9157d92SDimitry Andric                   const ArrayRef<uint64_t> &NodeCounts,
616*c9157d92SDimitry Andric                   const ArrayRef<EdgeCount> &EdgeCounts) {
617*c9157d92SDimitry Andric     // Initialize nodes.
618fe013be4SDimitry Andric     AllNodes.reserve(NumNodes);
619fe013be4SDimitry Andric     for (uint64_t Idx = 0; Idx < NumNodes; Idx++) {
620fe013be4SDimitry Andric       uint64_t Size = std::max<uint64_t>(NodeSizes[Idx], 1ULL);
621fe013be4SDimitry Andric       uint64_t ExecutionCount = NodeCounts[Idx];
622*c9157d92SDimitry Andric       // The execution count of the entry node is set to at least one.
623fe013be4SDimitry Andric       if (Idx == 0 && ExecutionCount == 0)
6240eae32dcSDimitry Andric         ExecutionCount = 1;
625fe013be4SDimitry Andric       AllNodes.emplace_back(Idx, Size, ExecutionCount);
6260eae32dcSDimitry Andric     }
6270eae32dcSDimitry Andric 
628*c9157d92SDimitry Andric     // Initialize jumps between the nodes.
629bdd1243dSDimitry Andric     SuccNodes.resize(NumNodes);
630bdd1243dSDimitry Andric     PredNodes.resize(NumNodes);
631bdd1243dSDimitry Andric     std::vector<uint64_t> OutDegree(NumNodes, 0);
6320eae32dcSDimitry Andric     AllJumps.reserve(EdgeCounts.size());
633*c9157d92SDimitry Andric     for (auto Edge : EdgeCounts) {
634*c9157d92SDimitry Andric       ++OutDegree[Edge.src];
635*c9157d92SDimitry Andric       // Ignore self-edges.
636*c9157d92SDimitry Andric       if (Edge.src == Edge.dst)
6370eae32dcSDimitry Andric         continue;
6380eae32dcSDimitry Andric 
639*c9157d92SDimitry Andric       SuccNodes[Edge.src].push_back(Edge.dst);
640*c9157d92SDimitry Andric       PredNodes[Edge.dst].push_back(Edge.src);
641*c9157d92SDimitry Andric       if (Edge.count > 0) {
642*c9157d92SDimitry Andric         NodeT &PredNode = AllNodes[Edge.src];
643*c9157d92SDimitry Andric         NodeT &SuccNode = AllNodes[Edge.dst];
644*c9157d92SDimitry Andric         AllJumps.emplace_back(&PredNode, &SuccNode, Edge.count);
645fe013be4SDimitry Andric         SuccNode.InJumps.push_back(&AllJumps.back());
646fe013be4SDimitry Andric         PredNode.OutJumps.push_back(&AllJumps.back());
647*c9157d92SDimitry Andric         // Adjust execution counts.
648*c9157d92SDimitry Andric         PredNode.ExecutionCount = std::max(PredNode.ExecutionCount, Edge.count);
649*c9157d92SDimitry Andric         SuccNode.ExecutionCount = std::max(SuccNode.ExecutionCount, Edge.count);
6500eae32dcSDimitry Andric       }
6510eae32dcSDimitry Andric     }
652fe013be4SDimitry Andric     for (JumpT &Jump : AllJumps) {
653*c9157d92SDimitry Andric       assert(OutDegree[Jump.Source->Index] > 0 &&
654*c9157d92SDimitry Andric              "incorrectly computed out-degree of the block");
655bdd1243dSDimitry Andric       Jump.IsConditional = OutDegree[Jump.Source->Index] > 1;
656bdd1243dSDimitry Andric     }
6570eae32dcSDimitry Andric 
658*c9157d92SDimitry Andric     // Initialize chains.
6590eae32dcSDimitry Andric     AllChains.reserve(NumNodes);
6600eae32dcSDimitry Andric     HotChains.reserve(NumNodes);
661fe013be4SDimitry Andric     for (NodeT &Node : AllNodes) {
662*c9157d92SDimitry Andric       // Create a chain.
663fe013be4SDimitry Andric       AllChains.emplace_back(Node.Index, &Node);
664fe013be4SDimitry Andric       Node.CurChain = &AllChains.back();
665*c9157d92SDimitry Andric       if (Node.ExecutionCount > 0)
6660eae32dcSDimitry Andric         HotChains.push_back(&AllChains.back());
6670eae32dcSDimitry Andric     }
6680eae32dcSDimitry Andric 
669*c9157d92SDimitry Andric     // Initialize chain edges.
6700eae32dcSDimitry Andric     AllEdges.reserve(AllJumps.size());
671fe013be4SDimitry Andric     for (NodeT &PredNode : AllNodes) {
672fe013be4SDimitry Andric       for (JumpT *Jump : PredNode.OutJumps) {
673*c9157d92SDimitry Andric         assert(Jump->ExecutionCount > 0 && "incorrectly initialized jump");
674fe013be4SDimitry Andric         NodeT *SuccNode = Jump->Target;
675fe013be4SDimitry Andric         ChainEdge *CurEdge = PredNode.CurChain->getEdge(SuccNode->CurChain);
676*c9157d92SDimitry Andric         // This edge is already present in the graph.
6770eae32dcSDimitry Andric         if (CurEdge != nullptr) {
678fe013be4SDimitry Andric           assert(SuccNode->CurChain->getEdge(PredNode.CurChain) != nullptr);
6790eae32dcSDimitry Andric           CurEdge->appendJump(Jump);
6800eae32dcSDimitry Andric           continue;
6810eae32dcSDimitry Andric         }
682*c9157d92SDimitry Andric         // This is a new edge.
6830eae32dcSDimitry Andric         AllEdges.emplace_back(Jump);
684fe013be4SDimitry Andric         PredNode.CurChain->addEdge(SuccNode->CurChain, &AllEdges.back());
685fe013be4SDimitry Andric         SuccNode->CurChain->addEdge(PredNode.CurChain, &AllEdges.back());
6860eae32dcSDimitry Andric       }
6870eae32dcSDimitry Andric     }
6880eae32dcSDimitry Andric   }
6890eae32dcSDimitry Andric 
690fe013be4SDimitry Andric   /// For a pair of nodes, A and B, node B is the forced successor of A,
6910eae32dcSDimitry Andric   /// if (i) all jumps (based on profile) from A goes to B and (ii) all jumps
692fe013be4SDimitry Andric   /// to B are from A. Such nodes should be adjacent in the optimal ordering;
693fe013be4SDimitry Andric   /// the method finds and merges such pairs of nodes.
mergeForcedPairs()6940eae32dcSDimitry Andric   void mergeForcedPairs() {
695*c9157d92SDimitry Andric     // Find forced pairs of blocks.
696fe013be4SDimitry Andric     for (NodeT &Node : AllNodes) {
697fe013be4SDimitry Andric       if (SuccNodes[Node.Index].size() == 1 &&
698fe013be4SDimitry Andric           PredNodes[SuccNodes[Node.Index][0]].size() == 1 &&
699fe013be4SDimitry Andric           SuccNodes[Node.Index][0] != 0) {
700fe013be4SDimitry Andric         size_t SuccIndex = SuccNodes[Node.Index][0];
701fe013be4SDimitry Andric         Node.ForcedSucc = &AllNodes[SuccIndex];
702fe013be4SDimitry Andric         AllNodes[SuccIndex].ForcedPred = &Node;
7030eae32dcSDimitry Andric       }
7040eae32dcSDimitry Andric     }
7050eae32dcSDimitry Andric 
7060eae32dcSDimitry Andric     // There might be 'cycles' in the forced dependencies, since profile
7070eae32dcSDimitry Andric     // data isn't 100% accurate. Typically this is observed in loops, when the
7080eae32dcSDimitry Andric     // loop edges are the hottest successors for the basic blocks of the loop.
709fe013be4SDimitry Andric     // Break the cycles by choosing the node with the smallest index as the
7100eae32dcSDimitry Andric     // head. This helps to keep the original order of the loops, which likely
7110eae32dcSDimitry Andric     // have already been rotated in the optimized manner.
712fe013be4SDimitry Andric     for (NodeT &Node : AllNodes) {
713fe013be4SDimitry Andric       if (Node.ForcedSucc == nullptr || Node.ForcedPred == nullptr)
7140eae32dcSDimitry Andric         continue;
7150eae32dcSDimitry Andric 
716fe013be4SDimitry Andric       NodeT *SuccNode = Node.ForcedSucc;
717fe013be4SDimitry Andric       while (SuccNode != nullptr && SuccNode != &Node) {
718fe013be4SDimitry Andric         SuccNode = SuccNode->ForcedSucc;
7190eae32dcSDimitry Andric       }
720fe013be4SDimitry Andric       if (SuccNode == nullptr)
7210eae32dcSDimitry Andric         continue;
722*c9157d92SDimitry Andric       // Break the cycle.
723fe013be4SDimitry Andric       AllNodes[Node.ForcedPred->Index].ForcedSucc = nullptr;
724fe013be4SDimitry Andric       Node.ForcedPred = nullptr;
7250eae32dcSDimitry Andric     }
7260eae32dcSDimitry Andric 
727*c9157d92SDimitry Andric     // Merge nodes with their fallthrough successors.
728fe013be4SDimitry Andric     for (NodeT &Node : AllNodes) {
729fe013be4SDimitry Andric       if (Node.ForcedPred == nullptr && Node.ForcedSucc != nullptr) {
730fe013be4SDimitry Andric         const NodeT *CurBlock = &Node;
7310eae32dcSDimitry Andric         while (CurBlock->ForcedSucc != nullptr) {
732fe013be4SDimitry Andric           const NodeT *NextBlock = CurBlock->ForcedSucc;
733fe013be4SDimitry Andric           mergeChains(Node.CurChain, NextBlock->CurChain, 0, MergeTypeT::X_Y);
7340eae32dcSDimitry Andric           CurBlock = NextBlock;
7350eae32dcSDimitry Andric         }
7360eae32dcSDimitry Andric       }
7370eae32dcSDimitry Andric     }
7380eae32dcSDimitry Andric   }
7390eae32dcSDimitry Andric 
7400eae32dcSDimitry Andric   /// Merge pairs of chains while improving the ExtTSP objective.
mergeChainPairs()7410eae32dcSDimitry Andric   void mergeChainPairs() {
742*c9157d92SDimitry Andric     /// Deterministically compare pairs of chains.
743fe013be4SDimitry Andric     auto compareChainPairs = [](const ChainT *A1, const ChainT *B1,
744fe013be4SDimitry Andric                                 const ChainT *A2, const ChainT *B2) {
745*c9157d92SDimitry Andric       return std::make_tuple(A1->Id, B1->Id) < std::make_tuple(A2->Id, B2->Id);
7460eae32dcSDimitry Andric     };
7470eae32dcSDimitry Andric 
7480eae32dcSDimitry Andric     while (HotChains.size() > 1) {
749fe013be4SDimitry Andric       ChainT *BestChainPred = nullptr;
750fe013be4SDimitry Andric       ChainT *BestChainSucc = nullptr;
751fe013be4SDimitry Andric       MergeGainT BestGain;
752*c9157d92SDimitry Andric       // Iterate over all pairs of chains.
753fe013be4SDimitry Andric       for (ChainT *ChainPred : HotChains) {
754*c9157d92SDimitry Andric         // Get candidates for merging with the current chain.
755*c9157d92SDimitry Andric         for (const auto &[ChainSucc, Edge] : ChainPred->Edges) {
756*c9157d92SDimitry Andric           // Ignore loop edges.
757*c9157d92SDimitry Andric           if (Edge->isSelfEdge())
7580eae32dcSDimitry Andric             continue;
759*c9157d92SDimitry Andric           // Skip the merge if the combined chain violates the maximum specified
760*c9157d92SDimitry Andric           // size.
76181ad6265SDimitry Andric           if (ChainPred->numBlocks() + ChainSucc->numBlocks() >= MaxChainSize)
76281ad6265SDimitry Andric             continue;
763*c9157d92SDimitry Andric           // Don't merge the chains if they have vastly different densities.
764*c9157d92SDimitry Andric           // Skip the merge if the ratio between the densities exceeds
765*c9157d92SDimitry Andric           // MaxMergeDensityRatio. Smaller values of the option result in fewer
766*c9157d92SDimitry Andric           // merges, and hence, more chains.
767*c9157d92SDimitry Andric           const double ChainPredDensity = ChainPred->density();
768*c9157d92SDimitry Andric           const double ChainSuccDensity = ChainSucc->density();
769*c9157d92SDimitry Andric           assert(ChainPredDensity > 0.0 && ChainSuccDensity > 0.0 &&
770*c9157d92SDimitry Andric                  "incorrectly computed chain densities");
771*c9157d92SDimitry Andric           auto [MinDensity, MaxDensity] =
772*c9157d92SDimitry Andric               std::minmax(ChainPredDensity, ChainSuccDensity);
773*c9157d92SDimitry Andric           const double Ratio = MaxDensity / MinDensity;
774*c9157d92SDimitry Andric           if (Ratio > MaxMergeDensityRatio)
775*c9157d92SDimitry Andric             continue;
77681ad6265SDimitry Andric 
777*c9157d92SDimitry Andric           // Compute the gain of merging the two chains.
778fe013be4SDimitry Andric           MergeGainT CurGain = getBestMergeGain(ChainPred, ChainSucc, Edge);
7790eae32dcSDimitry Andric           if (CurGain.score() <= EPS)
7800eae32dcSDimitry Andric             continue;
7810eae32dcSDimitry Andric 
7820eae32dcSDimitry Andric           if (BestGain < CurGain ||
7830eae32dcSDimitry Andric               (std::abs(CurGain.score() - BestGain.score()) < EPS &&
7840eae32dcSDimitry Andric                compareChainPairs(ChainPred, ChainSucc, BestChainPred,
7850eae32dcSDimitry Andric                                  BestChainSucc))) {
7860eae32dcSDimitry Andric             BestGain = CurGain;
7870eae32dcSDimitry Andric             BestChainPred = ChainPred;
7880eae32dcSDimitry Andric             BestChainSucc = ChainSucc;
7890eae32dcSDimitry Andric           }
7900eae32dcSDimitry Andric         }
7910eae32dcSDimitry Andric       }
7920eae32dcSDimitry Andric 
793*c9157d92SDimitry Andric       // Stop merging when there is no improvement.
7940eae32dcSDimitry Andric       if (BestGain.score() <= EPS)
7950eae32dcSDimitry Andric         break;
7960eae32dcSDimitry Andric 
797*c9157d92SDimitry Andric       // Merge the best pair of chains.
7980eae32dcSDimitry Andric       mergeChains(BestChainPred, BestChainSucc, BestGain.mergeOffset(),
7990eae32dcSDimitry Andric                   BestGain.mergeType());
8000eae32dcSDimitry Andric     }
8010eae32dcSDimitry Andric   }
8020eae32dcSDimitry Andric 
803fe013be4SDimitry Andric   /// Merge remaining nodes into chains w/o taking jump counts into
804fe013be4SDimitry Andric   /// consideration. This allows to maintain the original node order in the
805*c9157d92SDimitry Andric   /// absence of profile data.
mergeColdChains()8060eae32dcSDimitry Andric   void mergeColdChains() {
8070eae32dcSDimitry Andric     for (size_t SrcBB = 0; SrcBB < NumNodes; SrcBB++) {
808bdd1243dSDimitry Andric       // Iterating in reverse order to make sure original fallthrough jumps are
809bdd1243dSDimitry Andric       // merged first; this might be beneficial for code size.
8100eae32dcSDimitry Andric       size_t NumSuccs = SuccNodes[SrcBB].size();
8110eae32dcSDimitry Andric       for (size_t Idx = 0; Idx < NumSuccs; Idx++) {
812fe013be4SDimitry Andric         size_t DstBB = SuccNodes[SrcBB][NumSuccs - Idx - 1];
813fe013be4SDimitry Andric         ChainT *SrcChain = AllNodes[SrcBB].CurChain;
814fe013be4SDimitry Andric         ChainT *DstChain = AllNodes[DstBB].CurChain;
8150eae32dcSDimitry Andric         if (SrcChain != DstChain && !DstChain->isEntry() &&
816fe013be4SDimitry Andric             SrcChain->Nodes.back()->Index == SrcBB &&
817fe013be4SDimitry Andric             DstChain->Nodes.front()->Index == DstBB &&
818bdd1243dSDimitry Andric             SrcChain->isCold() == DstChain->isCold()) {
819fe013be4SDimitry Andric           mergeChains(SrcChain, DstChain, 0, MergeTypeT::X_Y);
8200eae32dcSDimitry Andric         }
8210eae32dcSDimitry Andric       }
8220eae32dcSDimitry Andric     }
8230eae32dcSDimitry Andric   }
8240eae32dcSDimitry Andric 
825fe013be4SDimitry Andric   /// Compute the Ext-TSP score for a given node order and a list of jumps.
extTSPScore(const MergedNodesT & Nodes,const MergedJumpsT & Jumps) const826*c9157d92SDimitry Andric   double extTSPScore(const MergedNodesT &Nodes,
827*c9157d92SDimitry Andric                      const MergedJumpsT &Jumps) const {
8280eae32dcSDimitry Andric     uint64_t CurAddr = 0;
829*c9157d92SDimitry Andric     Nodes.forEach([&](const NodeT *Node) {
830fe013be4SDimitry Andric       Node->EstimatedAddr = CurAddr;
831fe013be4SDimitry Andric       CurAddr += Node->Size;
8320eae32dcSDimitry Andric     });
8330eae32dcSDimitry Andric 
8340eae32dcSDimitry Andric     double Score = 0;
835*c9157d92SDimitry Andric     Jumps.forEach([&](const JumpT *Jump) {
836fe013be4SDimitry Andric       const NodeT *SrcBlock = Jump->Source;
837fe013be4SDimitry Andric       const NodeT *DstBlock = Jump->Target;
8380eae32dcSDimitry Andric       Score += ::extTSPScore(SrcBlock->EstimatedAddr, SrcBlock->Size,
839bdd1243dSDimitry Andric                              DstBlock->EstimatedAddr, Jump->ExecutionCount,
840bdd1243dSDimitry Andric                              Jump->IsConditional);
841*c9157d92SDimitry Andric     });
8420eae32dcSDimitry Andric     return Score;
8430eae32dcSDimitry Andric   }
8440eae32dcSDimitry Andric 
8450eae32dcSDimitry Andric   /// Compute the gain of merging two chains.
8460eae32dcSDimitry Andric   ///
8470eae32dcSDimitry Andric   /// The function considers all possible ways of merging two chains and
8480eae32dcSDimitry Andric   /// computes the one having the largest increase in ExtTSP objective. The
8490eae32dcSDimitry Andric   /// result is a pair with the first element being the gain and the second
8500eae32dcSDimitry Andric   /// element being the corresponding merging type.
getBestMergeGain(ChainT * ChainPred,ChainT * ChainSucc,ChainEdge * Edge) const851fe013be4SDimitry Andric   MergeGainT getBestMergeGain(ChainT *ChainPred, ChainT *ChainSucc,
8520eae32dcSDimitry Andric                               ChainEdge *Edge) const {
853*c9157d92SDimitry Andric     if (Edge->hasCachedMergeGain(ChainPred, ChainSucc))
8540eae32dcSDimitry Andric       return Edge->getCachedMergeGain(ChainPred, ChainSucc);
8550eae32dcSDimitry Andric 
856*c9157d92SDimitry Andric     assert(!Edge->jumps().empty() && "trying to merge chains w/o jumps");
857*c9157d92SDimitry Andric     // Precompute jumps between ChainPred and ChainSucc.
858bdd1243dSDimitry Andric     ChainEdge *EdgePP = ChainPred->getEdge(ChainPred);
859*c9157d92SDimitry Andric     MergedJumpsT Jumps(&Edge->jumps(), EdgePP ? &EdgePP->jumps() : nullptr);
8600eae32dcSDimitry Andric 
861*c9157d92SDimitry Andric     // This object holds the best chosen gain of merging two chains.
862fe013be4SDimitry Andric     MergeGainT Gain = MergeGainT();
8630eae32dcSDimitry Andric 
8640eae32dcSDimitry Andric     /// Given a merge offset and a list of merge types, try to merge two chains
865*c9157d92SDimitry Andric     /// and update Gain with a better alternative.
8660eae32dcSDimitry Andric     auto tryChainMerging = [&](size_t Offset,
867fe013be4SDimitry Andric                                const std::vector<MergeTypeT> &MergeTypes) {
868*c9157d92SDimitry Andric       // Skip merging corresponding to concatenation w/o splitting.
869fe013be4SDimitry Andric       if (Offset == 0 || Offset == ChainPred->Nodes.size())
8700eae32dcSDimitry Andric         return;
871*c9157d92SDimitry Andric       // Skip merging if it breaks Forced successors.
872fe013be4SDimitry Andric       NodeT *Node = ChainPred->Nodes[Offset - 1];
873fe013be4SDimitry Andric       if (Node->ForcedSucc != nullptr)
8740eae32dcSDimitry Andric         return;
8750eae32dcSDimitry Andric       // Apply the merge, compute the corresponding gain, and update the best
876*c9157d92SDimitry Andric       // value, if the merge is beneficial.
877fe013be4SDimitry Andric       for (const MergeTypeT &MergeType : MergeTypes) {
8780eae32dcSDimitry Andric         Gain.updateIfLessThan(
8790eae32dcSDimitry Andric             computeMergeGain(ChainPred, ChainSucc, Jumps, Offset, MergeType));
8800eae32dcSDimitry Andric       }
8810eae32dcSDimitry Andric     };
8820eae32dcSDimitry Andric 
883*c9157d92SDimitry Andric     // Try to concatenate two chains w/o splitting.
8840eae32dcSDimitry Andric     Gain.updateIfLessThan(
885fe013be4SDimitry Andric         computeMergeGain(ChainPred, ChainSucc, Jumps, 0, MergeTypeT::X_Y));
8860eae32dcSDimitry Andric 
887*c9157d92SDimitry Andric     // Attach (a part of) ChainPred before the first node of ChainSucc.
888fe013be4SDimitry Andric     for (JumpT *Jump : ChainSucc->Nodes.front()->InJumps) {
889fe013be4SDimitry Andric       const NodeT *SrcBlock = Jump->Source;
8900eae32dcSDimitry Andric       if (SrcBlock->CurChain != ChainPred)
8910eae32dcSDimitry Andric         continue;
8920eae32dcSDimitry Andric       size_t Offset = SrcBlock->CurIndex + 1;
893fe013be4SDimitry Andric       tryChainMerging(Offset, {MergeTypeT::X1_Y_X2, MergeTypeT::X2_X1_Y});
8940eae32dcSDimitry Andric     }
8950eae32dcSDimitry Andric 
896*c9157d92SDimitry Andric     // Attach (a part of) ChainPred after the last node of ChainSucc.
897fe013be4SDimitry Andric     for (JumpT *Jump : ChainSucc->Nodes.back()->OutJumps) {
898*c9157d92SDimitry Andric       const NodeT *DstBlock = Jump->Target;
8990eae32dcSDimitry Andric       if (DstBlock->CurChain != ChainPred)
9000eae32dcSDimitry Andric         continue;
9010eae32dcSDimitry Andric       size_t Offset = DstBlock->CurIndex;
902fe013be4SDimitry Andric       tryChainMerging(Offset, {MergeTypeT::X1_Y_X2, MergeTypeT::Y_X2_X1});
9030eae32dcSDimitry Andric     }
9040eae32dcSDimitry Andric 
905*c9157d92SDimitry Andric     // Try to break ChainPred in various ways and concatenate with ChainSucc.
906fe013be4SDimitry Andric     if (ChainPred->Nodes.size() <= ChainSplitThreshold) {
907fe013be4SDimitry Andric       for (size_t Offset = 1; Offset < ChainPred->Nodes.size(); Offset++) {
908*c9157d92SDimitry Andric         // Do not split the chain along a fall-through jump. One of the two
909*c9157d92SDimitry Andric         // loops above may still "break" such a jump whenever it results in a
910*c9157d92SDimitry Andric         // new fall-through.
911*c9157d92SDimitry Andric         const NodeT *BB = ChainPred->Nodes[Offset - 1];
912*c9157d92SDimitry Andric         const NodeT *BB2 = ChainPred->Nodes[Offset];
913*c9157d92SDimitry Andric         if (BB->isSuccessor(BB2))
914*c9157d92SDimitry Andric           continue;
915*c9157d92SDimitry Andric 
916*c9157d92SDimitry Andric         // In practice, applying X2_Y_X1 merging almost never provides benefits;
917*c9157d92SDimitry Andric         // thus, we exclude it from consideration to reduce the search space.
918fe013be4SDimitry Andric         tryChainMerging(Offset, {MergeTypeT::X1_Y_X2, MergeTypeT::Y_X2_X1,
919fe013be4SDimitry Andric                                  MergeTypeT::X2_X1_Y});
9200eae32dcSDimitry Andric       }
9210eae32dcSDimitry Andric     }
922*c9157d92SDimitry Andric 
9230eae32dcSDimitry Andric     Edge->setCachedMergeGain(ChainPred, ChainSucc, Gain);
9240eae32dcSDimitry Andric     return Gain;
9250eae32dcSDimitry Andric   }
9260eae32dcSDimitry Andric 
9270eae32dcSDimitry Andric   /// Compute the score gain of merging two chains, respecting a given
9280eae32dcSDimitry Andric   /// merge 'type' and 'offset'.
9290eae32dcSDimitry Andric   ///
9300eae32dcSDimitry Andric   /// The two chains are not modified in the method.
computeMergeGain(const ChainT * ChainPred,const ChainT * ChainSucc,const MergedJumpsT & Jumps,size_t MergeOffset,MergeTypeT MergeType) const931fe013be4SDimitry Andric   MergeGainT computeMergeGain(const ChainT *ChainPred, const ChainT *ChainSucc,
932*c9157d92SDimitry Andric                               const MergedJumpsT &Jumps, size_t MergeOffset,
933*c9157d92SDimitry Andric                               MergeTypeT MergeType) const {
934*c9157d92SDimitry Andric     MergedNodesT MergedNodes =
935fe013be4SDimitry Andric         mergeNodes(ChainPred->Nodes, ChainSucc->Nodes, MergeOffset, MergeType);
9360eae32dcSDimitry Andric 
937*c9157d92SDimitry Andric     // Do not allow a merge that does not preserve the original entry point.
9380eae32dcSDimitry Andric     if ((ChainPred->isEntry() || ChainSucc->isEntry()) &&
939*c9157d92SDimitry Andric         !MergedNodes.getFirstNode()->isEntry())
940fe013be4SDimitry Andric       return MergeGainT();
9410eae32dcSDimitry Andric 
942*c9157d92SDimitry Andric     // The gain for the new chain.
943*c9157d92SDimitry Andric     double NewScore = extTSPScore(MergedNodes, Jumps);
944*c9157d92SDimitry Andric     double CurScore = ChainPred->Score;
945*c9157d92SDimitry Andric     return MergeGainT(NewScore - CurScore, MergeOffset, MergeType);
9460eae32dcSDimitry Andric   }
9470eae32dcSDimitry Andric 
9480eae32dcSDimitry Andric   /// Merge chain From into chain Into, update the list of active chains,
9490eae32dcSDimitry Andric   /// adjacency information, and the corresponding cached values.
mergeChains(ChainT * Into,ChainT * From,size_t MergeOffset,MergeTypeT MergeType)950fe013be4SDimitry Andric   void mergeChains(ChainT *Into, ChainT *From, size_t MergeOffset,
951fe013be4SDimitry Andric                    MergeTypeT MergeType) {
9520eae32dcSDimitry Andric     assert(Into != From && "a chain cannot be merged with itself");
9530eae32dcSDimitry Andric 
954*c9157d92SDimitry Andric     // Merge the nodes.
955*c9157d92SDimitry Andric     MergedNodesT MergedNodes =
956fe013be4SDimitry Andric         mergeNodes(Into->Nodes, From->Nodes, MergeOffset, MergeType);
957fe013be4SDimitry Andric     Into->merge(From, MergedNodes.getNodes());
958fe013be4SDimitry Andric 
959*c9157d92SDimitry Andric     // Merge the edges.
9600eae32dcSDimitry Andric     Into->mergeEdges(From);
9610eae32dcSDimitry Andric     From->clear();
9620eae32dcSDimitry Andric 
963*c9157d92SDimitry Andric     // Update cached ext-tsp score for the new chain.
964bdd1243dSDimitry Andric     ChainEdge *SelfEdge = Into->getEdge(Into);
9650eae32dcSDimitry Andric     if (SelfEdge != nullptr) {
966*c9157d92SDimitry Andric       MergedNodes = MergedNodesT(Into->Nodes.begin(), Into->Nodes.end());
967*c9157d92SDimitry Andric       MergedJumpsT MergedJumps(&SelfEdge->jumps());
968*c9157d92SDimitry Andric       Into->Score = extTSPScore(MergedNodes, MergedJumps);
9690eae32dcSDimitry Andric     }
9700eae32dcSDimitry Andric 
971*c9157d92SDimitry Andric     // Remove the chain from the list of active chains.
972*c9157d92SDimitry Andric     llvm::erase(HotChains, From);
9730eae32dcSDimitry Andric 
974*c9157d92SDimitry Andric     // Invalidate caches.
975fe013be4SDimitry Andric     for (auto EdgeIt : Into->Edges)
976fe013be4SDimitry Andric       EdgeIt.second->invalidateCache();
9770eae32dcSDimitry Andric   }
9780eae32dcSDimitry Andric 
979fe013be4SDimitry Andric   /// Concatenate all chains into the final order.
concatChains()980*c9157d92SDimitry Andric   std::vector<uint64_t> concatChains() {
981*c9157d92SDimitry Andric     // Collect non-empty chains.
982fe013be4SDimitry Andric     std::vector<const ChainT *> SortedChains;
983fe013be4SDimitry Andric     for (ChainT &Chain : AllChains) {
984*c9157d92SDimitry Andric       if (!Chain.Nodes.empty())
9850eae32dcSDimitry Andric         SortedChains.push_back(&Chain);
9860eae32dcSDimitry Andric     }
9870eae32dcSDimitry Andric 
988*c9157d92SDimitry Andric     // Sorting chains by density in the decreasing order.
989*c9157d92SDimitry Andric     std::sort(SortedChains.begin(), SortedChains.end(),
990fe013be4SDimitry Andric               [&](const ChainT *L, const ChainT *R) {
991*c9157d92SDimitry Andric                 // Place the entry point at the beginning of the order.
992fe013be4SDimitry Andric                 if (L->isEntry() != R->isEntry())
993fe013be4SDimitry Andric                   return L->isEntry();
9940eae32dcSDimitry Andric 
995*c9157d92SDimitry Andric                 // Compare by density and break ties by chain identifiers.
996*c9157d92SDimitry Andric                 return std::make_tuple(-L->density(), L->Id) <
997*c9157d92SDimitry Andric                        std::make_tuple(-R->density(), R->Id);
9980eae32dcSDimitry Andric               });
9990eae32dcSDimitry Andric 
1000*c9157d92SDimitry Andric     // Collect the nodes in the order specified by their chains.
1001*c9157d92SDimitry Andric     std::vector<uint64_t> Order;
10020eae32dcSDimitry Andric     Order.reserve(NumNodes);
1003*c9157d92SDimitry Andric     for (const ChainT *Chain : SortedChains)
1004*c9157d92SDimitry Andric       for (NodeT *Node : Chain->Nodes)
1005fe013be4SDimitry Andric         Order.push_back(Node->Index);
1006*c9157d92SDimitry Andric     return Order;
10070eae32dcSDimitry Andric   }
10080eae32dcSDimitry Andric 
10090eae32dcSDimitry Andric private:
10100eae32dcSDimitry Andric   /// The number of nodes in the graph.
10110eae32dcSDimitry Andric   const size_t NumNodes;
10120eae32dcSDimitry Andric 
10130eae32dcSDimitry Andric   /// Successors of each node.
10140eae32dcSDimitry Andric   std::vector<std::vector<uint64_t>> SuccNodes;
10150eae32dcSDimitry Andric 
10160eae32dcSDimitry Andric   /// Predecessors of each node.
10170eae32dcSDimitry Andric   std::vector<std::vector<uint64_t>> PredNodes;
10180eae32dcSDimitry Andric 
1019fe013be4SDimitry Andric   /// All nodes (basic blocks) in the graph.
1020fe013be4SDimitry Andric   std::vector<NodeT> AllNodes;
10210eae32dcSDimitry Andric 
1022fe013be4SDimitry Andric   /// All jumps between the nodes.
1023fe013be4SDimitry Andric   std::vector<JumpT> AllJumps;
10240eae32dcSDimitry Andric 
1025fe013be4SDimitry Andric   /// All chains of nodes.
1026fe013be4SDimitry Andric   std::vector<ChainT> AllChains;
10270eae32dcSDimitry Andric 
1028fe013be4SDimitry Andric   /// All edges between the chains.
10290eae32dcSDimitry Andric   std::vector<ChainEdge> AllEdges;
10300eae32dcSDimitry Andric 
10310eae32dcSDimitry Andric   /// Active chains. The vector gets updated at runtime when chains are merged.
1032fe013be4SDimitry Andric   std::vector<ChainT *> HotChains;
10330eae32dcSDimitry Andric };
10340eae32dcSDimitry Andric 
1035*c9157d92SDimitry Andric /// The implementation of the Cache-Directed Sort (CDSort) algorithm for
1036*c9157d92SDimitry Andric /// ordering functions represented by a call graph.
1037*c9157d92SDimitry Andric class CDSortImpl {
1038*c9157d92SDimitry Andric public:
CDSortImpl(const CDSortConfig & Config,ArrayRef<uint64_t> NodeSizes,ArrayRef<uint64_t> NodeCounts,ArrayRef<EdgeCount> EdgeCounts,ArrayRef<uint64_t> EdgeOffsets)1039*c9157d92SDimitry Andric   CDSortImpl(const CDSortConfig &Config, ArrayRef<uint64_t> NodeSizes,
1040*c9157d92SDimitry Andric              ArrayRef<uint64_t> NodeCounts, ArrayRef<EdgeCount> EdgeCounts,
1041*c9157d92SDimitry Andric              ArrayRef<uint64_t> EdgeOffsets)
1042*c9157d92SDimitry Andric       : Config(Config), NumNodes(NodeSizes.size()) {
1043*c9157d92SDimitry Andric     initialize(NodeSizes, NodeCounts, EdgeCounts, EdgeOffsets);
1044*c9157d92SDimitry Andric   }
1045*c9157d92SDimitry Andric 
1046*c9157d92SDimitry Andric   /// Run the algorithm and return an ordered set of function clusters.
run()1047*c9157d92SDimitry Andric   std::vector<uint64_t> run() {
1048*c9157d92SDimitry Andric     // Merge pairs of chains while improving the objective.
1049*c9157d92SDimitry Andric     mergeChainPairs();
1050*c9157d92SDimitry Andric 
1051*c9157d92SDimitry Andric     // Collect nodes from all the chains.
1052*c9157d92SDimitry Andric     return concatChains();
1053*c9157d92SDimitry Andric   }
1054*c9157d92SDimitry Andric 
1055*c9157d92SDimitry Andric private:
1056*c9157d92SDimitry Andric   /// Initialize the algorithm's data structures.
initialize(const ArrayRef<uint64_t> & NodeSizes,const ArrayRef<uint64_t> & NodeCounts,const ArrayRef<EdgeCount> & EdgeCounts,const ArrayRef<uint64_t> & EdgeOffsets)1057*c9157d92SDimitry Andric   void initialize(const ArrayRef<uint64_t> &NodeSizes,
1058*c9157d92SDimitry Andric                   const ArrayRef<uint64_t> &NodeCounts,
1059*c9157d92SDimitry Andric                   const ArrayRef<EdgeCount> &EdgeCounts,
1060*c9157d92SDimitry Andric                   const ArrayRef<uint64_t> &EdgeOffsets) {
1061*c9157d92SDimitry Andric     // Initialize nodes.
1062*c9157d92SDimitry Andric     AllNodes.reserve(NumNodes);
1063*c9157d92SDimitry Andric     for (uint64_t Node = 0; Node < NumNodes; Node++) {
1064*c9157d92SDimitry Andric       uint64_t Size = std::max<uint64_t>(NodeSizes[Node], 1ULL);
1065*c9157d92SDimitry Andric       uint64_t ExecutionCount = NodeCounts[Node];
1066*c9157d92SDimitry Andric       AllNodes.emplace_back(Node, Size, ExecutionCount);
1067*c9157d92SDimitry Andric       TotalSamples += ExecutionCount;
1068*c9157d92SDimitry Andric       if (ExecutionCount > 0)
1069*c9157d92SDimitry Andric         TotalSize += Size;
1070*c9157d92SDimitry Andric     }
1071*c9157d92SDimitry Andric 
1072*c9157d92SDimitry Andric     // Initialize jumps between the nodes.
1073*c9157d92SDimitry Andric     SuccNodes.resize(NumNodes);
1074*c9157d92SDimitry Andric     PredNodes.resize(NumNodes);
1075*c9157d92SDimitry Andric     AllJumps.reserve(EdgeCounts.size());
1076*c9157d92SDimitry Andric     for (size_t I = 0; I < EdgeCounts.size(); I++) {
1077*c9157d92SDimitry Andric       auto [Pred, Succ, Count] = EdgeCounts[I];
1078*c9157d92SDimitry Andric       // Ignore recursive calls.
1079*c9157d92SDimitry Andric       if (Pred == Succ)
1080*c9157d92SDimitry Andric         continue;
1081*c9157d92SDimitry Andric 
1082*c9157d92SDimitry Andric       SuccNodes[Pred].push_back(Succ);
1083*c9157d92SDimitry Andric       PredNodes[Succ].push_back(Pred);
1084*c9157d92SDimitry Andric       if (Count > 0) {
1085*c9157d92SDimitry Andric         NodeT &PredNode = AllNodes[Pred];
1086*c9157d92SDimitry Andric         NodeT &SuccNode = AllNodes[Succ];
1087*c9157d92SDimitry Andric         AllJumps.emplace_back(&PredNode, &SuccNode, Count);
1088*c9157d92SDimitry Andric         AllJumps.back().Offset = EdgeOffsets[I];
1089*c9157d92SDimitry Andric         SuccNode.InJumps.push_back(&AllJumps.back());
1090*c9157d92SDimitry Andric         PredNode.OutJumps.push_back(&AllJumps.back());
1091*c9157d92SDimitry Andric         // Adjust execution counts.
1092*c9157d92SDimitry Andric         PredNode.ExecutionCount = std::max(PredNode.ExecutionCount, Count);
1093*c9157d92SDimitry Andric         SuccNode.ExecutionCount = std::max(SuccNode.ExecutionCount, Count);
1094*c9157d92SDimitry Andric       }
1095*c9157d92SDimitry Andric     }
1096*c9157d92SDimitry Andric 
1097*c9157d92SDimitry Andric     // Initialize chains.
1098*c9157d92SDimitry Andric     AllChains.reserve(NumNodes);
1099*c9157d92SDimitry Andric     for (NodeT &Node : AllNodes) {
1100*c9157d92SDimitry Andric       // Adjust execution counts.
1101*c9157d92SDimitry Andric       Node.ExecutionCount = std::max(Node.ExecutionCount, Node.inCount());
1102*c9157d92SDimitry Andric       Node.ExecutionCount = std::max(Node.ExecutionCount, Node.outCount());
1103*c9157d92SDimitry Andric       // Create chain.
1104*c9157d92SDimitry Andric       AllChains.emplace_back(Node.Index, &Node);
1105*c9157d92SDimitry Andric       Node.CurChain = &AllChains.back();
1106*c9157d92SDimitry Andric     }
1107*c9157d92SDimitry Andric 
1108*c9157d92SDimitry Andric     // Initialize chain edges.
1109*c9157d92SDimitry Andric     AllEdges.reserve(AllJumps.size());
1110*c9157d92SDimitry Andric     for (NodeT &PredNode : AllNodes) {
1111*c9157d92SDimitry Andric       for (JumpT *Jump : PredNode.OutJumps) {
1112*c9157d92SDimitry Andric         NodeT *SuccNode = Jump->Target;
1113*c9157d92SDimitry Andric         ChainEdge *CurEdge = PredNode.CurChain->getEdge(SuccNode->CurChain);
1114*c9157d92SDimitry Andric         // This edge is already present in the graph.
1115*c9157d92SDimitry Andric         if (CurEdge != nullptr) {
1116*c9157d92SDimitry Andric           assert(SuccNode->CurChain->getEdge(PredNode.CurChain) != nullptr);
1117*c9157d92SDimitry Andric           CurEdge->appendJump(Jump);
1118*c9157d92SDimitry Andric           continue;
1119*c9157d92SDimitry Andric         }
1120*c9157d92SDimitry Andric         // This is a new edge.
1121*c9157d92SDimitry Andric         AllEdges.emplace_back(Jump);
1122*c9157d92SDimitry Andric         PredNode.CurChain->addEdge(SuccNode->CurChain, &AllEdges.back());
1123*c9157d92SDimitry Andric         SuccNode->CurChain->addEdge(PredNode.CurChain, &AllEdges.back());
1124*c9157d92SDimitry Andric       }
1125*c9157d92SDimitry Andric     }
1126*c9157d92SDimitry Andric   }
1127*c9157d92SDimitry Andric 
1128*c9157d92SDimitry Andric   /// Merge pairs of chains while there is an improvement in the objective.
mergeChainPairs()1129*c9157d92SDimitry Andric   void mergeChainPairs() {
1130*c9157d92SDimitry Andric     // Create a priority queue containing all edges ordered by the merge gain.
1131*c9157d92SDimitry Andric     auto GainComparator = [](ChainEdge *L, ChainEdge *R) {
1132*c9157d92SDimitry Andric       return std::make_tuple(-L->gain(), L->srcChain()->Id, L->dstChain()->Id) <
1133*c9157d92SDimitry Andric              std::make_tuple(-R->gain(), R->srcChain()->Id, R->dstChain()->Id);
1134*c9157d92SDimitry Andric     };
1135*c9157d92SDimitry Andric     std::set<ChainEdge *, decltype(GainComparator)> Queue(GainComparator);
1136*c9157d92SDimitry Andric 
1137*c9157d92SDimitry Andric     // Insert the edges into the queue.
1138*c9157d92SDimitry Andric     [[maybe_unused]] size_t NumActiveChains = 0;
1139*c9157d92SDimitry Andric     for (NodeT &Node : AllNodes) {
1140*c9157d92SDimitry Andric       if (Node.ExecutionCount == 0)
1141*c9157d92SDimitry Andric         continue;
1142*c9157d92SDimitry Andric       ++NumActiveChains;
1143*c9157d92SDimitry Andric       for (const auto &[_, Edge] : Node.CurChain->Edges) {
1144*c9157d92SDimitry Andric         // Ignore self-edges.
1145*c9157d92SDimitry Andric         if (Edge->isSelfEdge())
1146*c9157d92SDimitry Andric           continue;
1147*c9157d92SDimitry Andric         // Ignore already processed edges.
1148*c9157d92SDimitry Andric         if (Edge->gain() != -1.0)
1149*c9157d92SDimitry Andric           continue;
1150*c9157d92SDimitry Andric 
1151*c9157d92SDimitry Andric         // Compute the gain of merging the two chains.
1152*c9157d92SDimitry Andric         MergeGainT Gain = getBestMergeGain(Edge);
1153*c9157d92SDimitry Andric         Edge->setMergeGain(Gain);
1154*c9157d92SDimitry Andric 
1155*c9157d92SDimitry Andric         if (Edge->gain() > EPS)
1156*c9157d92SDimitry Andric           Queue.insert(Edge);
1157*c9157d92SDimitry Andric       }
1158*c9157d92SDimitry Andric     }
1159*c9157d92SDimitry Andric 
1160*c9157d92SDimitry Andric     // Merge the chains while the gain of merging is positive.
1161*c9157d92SDimitry Andric     while (!Queue.empty()) {
1162*c9157d92SDimitry Andric       // Extract the best (top) edge for merging.
1163*c9157d92SDimitry Andric       ChainEdge *BestEdge = *Queue.begin();
1164*c9157d92SDimitry Andric       Queue.erase(Queue.begin());
1165*c9157d92SDimitry Andric       ChainT *BestSrcChain = BestEdge->srcChain();
1166*c9157d92SDimitry Andric       ChainT *BestDstChain = BestEdge->dstChain();
1167*c9157d92SDimitry Andric 
1168*c9157d92SDimitry Andric       // Remove outdated edges from the queue.
1169*c9157d92SDimitry Andric       for (const auto &[_, ChainEdge] : BestSrcChain->Edges)
1170*c9157d92SDimitry Andric         Queue.erase(ChainEdge);
1171*c9157d92SDimitry Andric       for (const auto &[_, ChainEdge] : BestDstChain->Edges)
1172*c9157d92SDimitry Andric         Queue.erase(ChainEdge);
1173*c9157d92SDimitry Andric 
1174*c9157d92SDimitry Andric       // Merge the best pair of chains.
1175*c9157d92SDimitry Andric       MergeGainT BestGain = BestEdge->getMergeGain();
1176*c9157d92SDimitry Andric       mergeChains(BestSrcChain, BestDstChain, BestGain.mergeOffset(),
1177*c9157d92SDimitry Andric                   BestGain.mergeType());
1178*c9157d92SDimitry Andric       --NumActiveChains;
1179*c9157d92SDimitry Andric 
1180*c9157d92SDimitry Andric       // Insert newly created edges into the queue.
1181*c9157d92SDimitry Andric       for (const auto &[_, Edge] : BestSrcChain->Edges) {
1182*c9157d92SDimitry Andric         // Ignore loop edges.
1183*c9157d92SDimitry Andric         if (Edge->isSelfEdge())
1184*c9157d92SDimitry Andric           continue;
1185*c9157d92SDimitry Andric         if (Edge->srcChain()->numBlocks() + Edge->dstChain()->numBlocks() >
1186*c9157d92SDimitry Andric             Config.MaxChainSize)
1187*c9157d92SDimitry Andric           continue;
1188*c9157d92SDimitry Andric 
1189*c9157d92SDimitry Andric         // Compute the gain of merging the two chains.
1190*c9157d92SDimitry Andric         MergeGainT Gain = getBestMergeGain(Edge);
1191*c9157d92SDimitry Andric         Edge->setMergeGain(Gain);
1192*c9157d92SDimitry Andric 
1193*c9157d92SDimitry Andric         if (Edge->gain() > EPS)
1194*c9157d92SDimitry Andric           Queue.insert(Edge);
1195*c9157d92SDimitry Andric       }
1196*c9157d92SDimitry Andric     }
1197*c9157d92SDimitry Andric 
1198*c9157d92SDimitry Andric     LLVM_DEBUG(dbgs() << "Cache-directed function sorting reduced the number"
1199*c9157d92SDimitry Andric                       << " of chains from " << NumNodes << " to "
1200*c9157d92SDimitry Andric                       << NumActiveChains << "\n");
1201*c9157d92SDimitry Andric   }
1202*c9157d92SDimitry Andric 
1203*c9157d92SDimitry Andric   /// Compute the gain of merging two chains.
1204*c9157d92SDimitry Andric   ///
1205*c9157d92SDimitry Andric   /// The function considers all possible ways of merging two chains and
1206*c9157d92SDimitry Andric   /// computes the one having the largest increase in ExtTSP objective. The
1207*c9157d92SDimitry Andric   /// result is a pair with the first element being the gain and the second
1208*c9157d92SDimitry Andric   /// element being the corresponding merging type.
getBestMergeGain(ChainEdge * Edge) const1209*c9157d92SDimitry Andric   MergeGainT getBestMergeGain(ChainEdge *Edge) const {
1210*c9157d92SDimitry Andric     assert(!Edge->jumps().empty() && "trying to merge chains w/o jumps");
1211*c9157d92SDimitry Andric     // Precompute jumps between ChainPred and ChainSucc.
1212*c9157d92SDimitry Andric     MergedJumpsT Jumps(&Edge->jumps());
1213*c9157d92SDimitry Andric     ChainT *SrcChain = Edge->srcChain();
1214*c9157d92SDimitry Andric     ChainT *DstChain = Edge->dstChain();
1215*c9157d92SDimitry Andric 
1216*c9157d92SDimitry Andric     // This object holds the best currently chosen gain of merging two chains.
1217*c9157d92SDimitry Andric     MergeGainT Gain = MergeGainT();
1218*c9157d92SDimitry Andric 
1219*c9157d92SDimitry Andric     /// Given a list of merge types, try to merge two chains and update Gain
1220*c9157d92SDimitry Andric     /// with a better alternative.
1221*c9157d92SDimitry Andric     auto tryChainMerging = [&](const std::vector<MergeTypeT> &MergeTypes) {
1222*c9157d92SDimitry Andric       // Apply the merge, compute the corresponding gain, and update the best
1223*c9157d92SDimitry Andric       // value, if the merge is beneficial.
1224*c9157d92SDimitry Andric       for (const MergeTypeT &MergeType : MergeTypes) {
1225*c9157d92SDimitry Andric         MergeGainT NewGain =
1226*c9157d92SDimitry Andric             computeMergeGain(SrcChain, DstChain, Jumps, MergeType);
1227*c9157d92SDimitry Andric 
1228*c9157d92SDimitry Andric         // When forward and backward gains are the same, prioritize merging that
1229*c9157d92SDimitry Andric         // preserves the original order of the functions in the binary.
1230*c9157d92SDimitry Andric         if (std::abs(Gain.score() - NewGain.score()) < EPS) {
1231*c9157d92SDimitry Andric           if ((MergeType == MergeTypeT::X_Y && SrcChain->Id < DstChain->Id) ||
1232*c9157d92SDimitry Andric               (MergeType == MergeTypeT::Y_X && SrcChain->Id > DstChain->Id)) {
1233*c9157d92SDimitry Andric             Gain = NewGain;
1234*c9157d92SDimitry Andric           }
1235*c9157d92SDimitry Andric         } else if (NewGain.score() > Gain.score() + EPS) {
1236*c9157d92SDimitry Andric           Gain = NewGain;
1237*c9157d92SDimitry Andric         }
1238*c9157d92SDimitry Andric       }
1239*c9157d92SDimitry Andric     };
1240*c9157d92SDimitry Andric 
1241*c9157d92SDimitry Andric     // Try to concatenate two chains w/o splitting.
1242*c9157d92SDimitry Andric     tryChainMerging({MergeTypeT::X_Y, MergeTypeT::Y_X});
1243*c9157d92SDimitry Andric 
1244*c9157d92SDimitry Andric     return Gain;
1245*c9157d92SDimitry Andric   }
1246*c9157d92SDimitry Andric 
1247*c9157d92SDimitry Andric   /// Compute the score gain of merging two chains, respecting a given type.
1248*c9157d92SDimitry Andric   ///
1249*c9157d92SDimitry Andric   /// The two chains are not modified in the method.
computeMergeGain(ChainT * ChainPred,ChainT * ChainSucc,const MergedJumpsT & Jumps,MergeTypeT MergeType) const1250*c9157d92SDimitry Andric   MergeGainT computeMergeGain(ChainT *ChainPred, ChainT *ChainSucc,
1251*c9157d92SDimitry Andric                               const MergedJumpsT &Jumps,
1252*c9157d92SDimitry Andric                               MergeTypeT MergeType) const {
1253*c9157d92SDimitry Andric     // This doesn't depend on the ordering of the nodes
1254*c9157d92SDimitry Andric     double FreqGain = freqBasedLocalityGain(ChainPred, ChainSucc);
1255*c9157d92SDimitry Andric 
1256*c9157d92SDimitry Andric     // Merge offset is always 0, as the chains are not split.
1257*c9157d92SDimitry Andric     size_t MergeOffset = 0;
1258*c9157d92SDimitry Andric     auto MergedBlocks =
1259*c9157d92SDimitry Andric         mergeNodes(ChainPred->Nodes, ChainSucc->Nodes, MergeOffset, MergeType);
1260*c9157d92SDimitry Andric     double DistGain = distBasedLocalityGain(MergedBlocks, Jumps);
1261*c9157d92SDimitry Andric 
1262*c9157d92SDimitry Andric     double GainScore = DistGain + Config.FrequencyScale * FreqGain;
1263*c9157d92SDimitry Andric     // Scale the result to increase the importance of merging short chains.
1264*c9157d92SDimitry Andric     if (GainScore >= 0.0)
1265*c9157d92SDimitry Andric       GainScore /= std::min(ChainPred->Size, ChainSucc->Size);
1266*c9157d92SDimitry Andric 
1267*c9157d92SDimitry Andric     return MergeGainT(GainScore, MergeOffset, MergeType);
1268*c9157d92SDimitry Andric   }
1269*c9157d92SDimitry Andric 
1270*c9157d92SDimitry Andric   /// Compute the change of the frequency locality after merging the chains.
freqBasedLocalityGain(ChainT * ChainPred,ChainT * ChainSucc) const1271*c9157d92SDimitry Andric   double freqBasedLocalityGain(ChainT *ChainPred, ChainT *ChainSucc) const {
1272*c9157d92SDimitry Andric     auto missProbability = [&](double ChainDensity) {
1273*c9157d92SDimitry Andric       double PageSamples = ChainDensity * Config.CacheSize;
1274*c9157d92SDimitry Andric       if (PageSamples >= TotalSamples)
1275*c9157d92SDimitry Andric         return 0.0;
1276*c9157d92SDimitry Andric       double P = PageSamples / TotalSamples;
1277*c9157d92SDimitry Andric       return pow(1.0 - P, static_cast<double>(Config.CacheEntries));
1278*c9157d92SDimitry Andric     };
1279*c9157d92SDimitry Andric 
1280*c9157d92SDimitry Andric     // Cache misses on the chains before merging.
1281*c9157d92SDimitry Andric     double CurScore =
1282*c9157d92SDimitry Andric         ChainPred->ExecutionCount * missProbability(ChainPred->density()) +
1283*c9157d92SDimitry Andric         ChainSucc->ExecutionCount * missProbability(ChainSucc->density());
1284*c9157d92SDimitry Andric 
1285*c9157d92SDimitry Andric     // Cache misses on the merged chain
1286*c9157d92SDimitry Andric     double MergedCounts = ChainPred->ExecutionCount + ChainSucc->ExecutionCount;
1287*c9157d92SDimitry Andric     double MergedSize = ChainPred->Size + ChainSucc->Size;
1288*c9157d92SDimitry Andric     double MergedDensity = static_cast<double>(MergedCounts) / MergedSize;
1289*c9157d92SDimitry Andric     double NewScore = MergedCounts * missProbability(MergedDensity);
1290*c9157d92SDimitry Andric 
1291*c9157d92SDimitry Andric     return CurScore - NewScore;
1292*c9157d92SDimitry Andric   }
1293*c9157d92SDimitry Andric 
1294*c9157d92SDimitry Andric   /// Compute the distance locality for a jump / call.
distScore(uint64_t SrcAddr,uint64_t DstAddr,uint64_t Count) const1295*c9157d92SDimitry Andric   double distScore(uint64_t SrcAddr, uint64_t DstAddr, uint64_t Count) const {
1296*c9157d92SDimitry Andric     uint64_t Dist = SrcAddr <= DstAddr ? DstAddr - SrcAddr : SrcAddr - DstAddr;
1297*c9157d92SDimitry Andric     double D = Dist == 0 ? 0.1 : static_cast<double>(Dist);
1298*c9157d92SDimitry Andric     return static_cast<double>(Count) * std::pow(D, -Config.DistancePower);
1299*c9157d92SDimitry Andric   }
1300*c9157d92SDimitry Andric 
1301*c9157d92SDimitry Andric   /// Compute the change of the distance locality after merging the chains.
distBasedLocalityGain(const MergedNodesT & Nodes,const MergedJumpsT & Jumps) const1302*c9157d92SDimitry Andric   double distBasedLocalityGain(const MergedNodesT &Nodes,
1303*c9157d92SDimitry Andric                                const MergedJumpsT &Jumps) const {
1304*c9157d92SDimitry Andric     uint64_t CurAddr = 0;
1305*c9157d92SDimitry Andric     Nodes.forEach([&](const NodeT *Node) {
1306*c9157d92SDimitry Andric       Node->EstimatedAddr = CurAddr;
1307*c9157d92SDimitry Andric       CurAddr += Node->Size;
1308*c9157d92SDimitry Andric     });
1309*c9157d92SDimitry Andric 
1310*c9157d92SDimitry Andric     double CurScore = 0;
1311*c9157d92SDimitry Andric     double NewScore = 0;
1312*c9157d92SDimitry Andric     Jumps.forEach([&](const JumpT *Jump) {
1313*c9157d92SDimitry Andric       uint64_t SrcAddr = Jump->Source->EstimatedAddr + Jump->Offset;
1314*c9157d92SDimitry Andric       uint64_t DstAddr = Jump->Target->EstimatedAddr;
1315*c9157d92SDimitry Andric       NewScore += distScore(SrcAddr, DstAddr, Jump->ExecutionCount);
1316*c9157d92SDimitry Andric       CurScore += distScore(0, TotalSize, Jump->ExecutionCount);
1317*c9157d92SDimitry Andric     });
1318*c9157d92SDimitry Andric     return NewScore - CurScore;
1319*c9157d92SDimitry Andric   }
1320*c9157d92SDimitry Andric 
1321*c9157d92SDimitry Andric   /// Merge chain From into chain Into, update the list of active chains,
1322*c9157d92SDimitry Andric   /// adjacency information, and the corresponding cached values.
mergeChains(ChainT * Into,ChainT * From,size_t MergeOffset,MergeTypeT MergeType)1323*c9157d92SDimitry Andric   void mergeChains(ChainT *Into, ChainT *From, size_t MergeOffset,
1324*c9157d92SDimitry Andric                    MergeTypeT MergeType) {
1325*c9157d92SDimitry Andric     assert(Into != From && "a chain cannot be merged with itself");
1326*c9157d92SDimitry Andric 
1327*c9157d92SDimitry Andric     // Merge the nodes.
1328*c9157d92SDimitry Andric     MergedNodesT MergedNodes =
1329*c9157d92SDimitry Andric         mergeNodes(Into->Nodes, From->Nodes, MergeOffset, MergeType);
1330*c9157d92SDimitry Andric     Into->merge(From, MergedNodes.getNodes());
1331*c9157d92SDimitry Andric 
1332*c9157d92SDimitry Andric     // Merge the edges.
1333*c9157d92SDimitry Andric     Into->mergeEdges(From);
1334*c9157d92SDimitry Andric     From->clear();
1335*c9157d92SDimitry Andric   }
1336*c9157d92SDimitry Andric 
1337*c9157d92SDimitry Andric   /// Concatenate all chains into the final order.
concatChains()1338*c9157d92SDimitry Andric   std::vector<uint64_t> concatChains() {
1339*c9157d92SDimitry Andric     // Collect chains and calculate density stats for their sorting.
1340*c9157d92SDimitry Andric     std::vector<const ChainT *> SortedChains;
1341*c9157d92SDimitry Andric     DenseMap<const ChainT *, double> ChainDensity;
1342*c9157d92SDimitry Andric     for (ChainT &Chain : AllChains) {
1343*c9157d92SDimitry Andric       if (!Chain.Nodes.empty()) {
1344*c9157d92SDimitry Andric         SortedChains.push_back(&Chain);
1345*c9157d92SDimitry Andric         // Using doubles to avoid overflow of ExecutionCounts.
1346*c9157d92SDimitry Andric         double Size = 0;
1347*c9157d92SDimitry Andric         double ExecutionCount = 0;
1348*c9157d92SDimitry Andric         for (NodeT *Node : Chain.Nodes) {
1349*c9157d92SDimitry Andric           Size += static_cast<double>(Node->Size);
1350*c9157d92SDimitry Andric           ExecutionCount += static_cast<double>(Node->ExecutionCount);
1351*c9157d92SDimitry Andric         }
1352*c9157d92SDimitry Andric         assert(Size > 0 && "a chain of zero size");
1353*c9157d92SDimitry Andric         ChainDensity[&Chain] = ExecutionCount / Size;
1354*c9157d92SDimitry Andric       }
1355*c9157d92SDimitry Andric     }
1356*c9157d92SDimitry Andric 
1357*c9157d92SDimitry Andric     // Sort chains by density in the decreasing order.
1358*c9157d92SDimitry Andric     std::sort(SortedChains.begin(), SortedChains.end(),
1359*c9157d92SDimitry Andric               [&](const ChainT *L, const ChainT *R) {
1360*c9157d92SDimitry Andric                 const double DL = ChainDensity[L];
1361*c9157d92SDimitry Andric                 const double DR = ChainDensity[R];
1362*c9157d92SDimitry Andric                 // Compare by density and break ties by chain identifiers.
1363*c9157d92SDimitry Andric                 return std::make_tuple(-DL, L->Id) <
1364*c9157d92SDimitry Andric                        std::make_tuple(-DR, R->Id);
1365*c9157d92SDimitry Andric               });
1366*c9157d92SDimitry Andric 
1367*c9157d92SDimitry Andric     // Collect the nodes in the order specified by their chains.
1368*c9157d92SDimitry Andric     std::vector<uint64_t> Order;
1369*c9157d92SDimitry Andric     Order.reserve(NumNodes);
1370*c9157d92SDimitry Andric     for (const ChainT *Chain : SortedChains)
1371*c9157d92SDimitry Andric       for (NodeT *Node : Chain->Nodes)
1372*c9157d92SDimitry Andric         Order.push_back(Node->Index);
1373*c9157d92SDimitry Andric     return Order;
1374*c9157d92SDimitry Andric   }
1375*c9157d92SDimitry Andric 
1376*c9157d92SDimitry Andric private:
1377*c9157d92SDimitry Andric   /// Config for the algorithm.
1378*c9157d92SDimitry Andric   const CDSortConfig Config;
1379*c9157d92SDimitry Andric 
1380*c9157d92SDimitry Andric   /// The number of nodes in the graph.
1381*c9157d92SDimitry Andric   const size_t NumNodes;
1382*c9157d92SDimitry Andric 
1383*c9157d92SDimitry Andric   /// Successors of each node.
1384*c9157d92SDimitry Andric   std::vector<std::vector<uint64_t>> SuccNodes;
1385*c9157d92SDimitry Andric 
1386*c9157d92SDimitry Andric   /// Predecessors of each node.
1387*c9157d92SDimitry Andric   std::vector<std::vector<uint64_t>> PredNodes;
1388*c9157d92SDimitry Andric 
1389*c9157d92SDimitry Andric   /// All nodes (functions) in the graph.
1390*c9157d92SDimitry Andric   std::vector<NodeT> AllNodes;
1391*c9157d92SDimitry Andric 
1392*c9157d92SDimitry Andric   /// All jumps (function calls) between the nodes.
1393*c9157d92SDimitry Andric   std::vector<JumpT> AllJumps;
1394*c9157d92SDimitry Andric 
1395*c9157d92SDimitry Andric   /// All chains of nodes.
1396*c9157d92SDimitry Andric   std::vector<ChainT> AllChains;
1397*c9157d92SDimitry Andric 
1398*c9157d92SDimitry Andric   /// All edges between the chains.
1399*c9157d92SDimitry Andric   std::vector<ChainEdge> AllEdges;
1400*c9157d92SDimitry Andric 
1401*c9157d92SDimitry Andric   /// The total number of samples in the graph.
1402*c9157d92SDimitry Andric   uint64_t TotalSamples{0};
1403*c9157d92SDimitry Andric 
1404*c9157d92SDimitry Andric   /// The total size of the nodes in the graph.
1405*c9157d92SDimitry Andric   uint64_t TotalSize{0};
1406*c9157d92SDimitry Andric };
1407*c9157d92SDimitry Andric 
14080eae32dcSDimitry Andric } // end of anonymous namespace
14090eae32dcSDimitry Andric 
1410fe013be4SDimitry Andric std::vector<uint64_t>
computeExtTspLayout(ArrayRef<uint64_t> NodeSizes,ArrayRef<uint64_t> NodeCounts,ArrayRef<EdgeCount> EdgeCounts)1411*c9157d92SDimitry Andric codelayout::computeExtTspLayout(ArrayRef<uint64_t> NodeSizes,
1412*c9157d92SDimitry Andric                                 ArrayRef<uint64_t> NodeCounts,
1413*c9157d92SDimitry Andric                                 ArrayRef<EdgeCount> EdgeCounts) {
1414*c9157d92SDimitry Andric   // Verify correctness of the input data.
14150eae32dcSDimitry Andric   assert(NodeCounts.size() == NodeSizes.size() && "Incorrect input");
1416fe013be4SDimitry Andric   assert(NodeSizes.size() > 2 && "Incorrect input");
14170eae32dcSDimitry Andric 
1418*c9157d92SDimitry Andric   // Apply the reordering algorithm.
1419fe013be4SDimitry Andric   ExtTSPImpl Alg(NodeSizes, NodeCounts, EdgeCounts);
1420*c9157d92SDimitry Andric   std::vector<uint64_t> Result = Alg.run();
14210eae32dcSDimitry Andric 
1422*c9157d92SDimitry Andric   // Verify correctness of the output.
14230eae32dcSDimitry Andric   assert(Result.front() == 0 && "Original entry point is not preserved");
1424fe013be4SDimitry Andric   assert(Result.size() == NodeSizes.size() && "Incorrect size of layout");
14250eae32dcSDimitry Andric   return Result;
14260eae32dcSDimitry Andric }
14270eae32dcSDimitry Andric 
calcExtTspScore(ArrayRef<uint64_t> Order,ArrayRef<uint64_t> NodeSizes,ArrayRef<uint64_t> NodeCounts,ArrayRef<EdgeCount> EdgeCounts)1428*c9157d92SDimitry Andric double codelayout::calcExtTspScore(ArrayRef<uint64_t> Order,
1429*c9157d92SDimitry Andric                                    ArrayRef<uint64_t> NodeSizes,
1430*c9157d92SDimitry Andric                                    ArrayRef<uint64_t> NodeCounts,
1431*c9157d92SDimitry Andric                                    ArrayRef<EdgeCount> EdgeCounts) {
1432*c9157d92SDimitry Andric   // Estimate addresses of the blocks in memory.
1433bdd1243dSDimitry Andric   std::vector<uint64_t> Addr(NodeSizes.size(), 0);
14340eae32dcSDimitry Andric   for (size_t Idx = 1; Idx < Order.size(); Idx++) {
14350eae32dcSDimitry Andric     Addr[Order[Idx]] = Addr[Order[Idx - 1]] + NodeSizes[Order[Idx - 1]];
14360eae32dcSDimitry Andric   }
1437bdd1243dSDimitry Andric   std::vector<uint64_t> OutDegree(NodeSizes.size(), 0);
1438*c9157d92SDimitry Andric   for (auto Edge : EdgeCounts)
1439*c9157d92SDimitry Andric     ++OutDegree[Edge.src];
14400eae32dcSDimitry Andric 
1441*c9157d92SDimitry Andric   // Increase the score for each jump.
14420eae32dcSDimitry Andric   double Score = 0;
1443*c9157d92SDimitry Andric   for (auto Edge : EdgeCounts) {
1444*c9157d92SDimitry Andric     bool IsConditional = OutDegree[Edge.src] > 1;
1445*c9157d92SDimitry Andric     Score += ::extTSPScore(Addr[Edge.src], NodeSizes[Edge.src], Addr[Edge.dst],
1446*c9157d92SDimitry Andric                            Edge.count, IsConditional);
14470eae32dcSDimitry Andric   }
14480eae32dcSDimitry Andric   return Score;
14490eae32dcSDimitry Andric }
14500eae32dcSDimitry Andric 
calcExtTspScore(ArrayRef<uint64_t> NodeSizes,ArrayRef<uint64_t> NodeCounts,ArrayRef<EdgeCount> EdgeCounts)1451*c9157d92SDimitry Andric double codelayout::calcExtTspScore(ArrayRef<uint64_t> NodeSizes,
1452*c9157d92SDimitry Andric                                    ArrayRef<uint64_t> NodeCounts,
1453*c9157d92SDimitry Andric                                    ArrayRef<EdgeCount> EdgeCounts) {
1454bdd1243dSDimitry Andric   std::vector<uint64_t> Order(NodeSizes.size());
14550eae32dcSDimitry Andric   for (size_t Idx = 0; Idx < NodeSizes.size(); Idx++) {
14560eae32dcSDimitry Andric     Order[Idx] = Idx;
14570eae32dcSDimitry Andric   }
14580eae32dcSDimitry Andric   return calcExtTspScore(Order, NodeSizes, NodeCounts, EdgeCounts);
14590eae32dcSDimitry Andric }
1460*c9157d92SDimitry Andric 
computeCacheDirectedLayout(const CDSortConfig & Config,ArrayRef<uint64_t> FuncSizes,ArrayRef<uint64_t> FuncCounts,ArrayRef<EdgeCount> CallCounts,ArrayRef<uint64_t> CallOffsets)1461*c9157d92SDimitry Andric std::vector<uint64_t> codelayout::computeCacheDirectedLayout(
1462*c9157d92SDimitry Andric     const CDSortConfig &Config, ArrayRef<uint64_t> FuncSizes,
1463*c9157d92SDimitry Andric     ArrayRef<uint64_t> FuncCounts, ArrayRef<EdgeCount> CallCounts,
1464*c9157d92SDimitry Andric     ArrayRef<uint64_t> CallOffsets) {
1465*c9157d92SDimitry Andric   // Verify correctness of the input data.
1466*c9157d92SDimitry Andric   assert(FuncCounts.size() == FuncSizes.size() && "Incorrect input");
1467*c9157d92SDimitry Andric 
1468*c9157d92SDimitry Andric   // Apply the reordering algorithm.
1469*c9157d92SDimitry Andric   CDSortImpl Alg(Config, FuncSizes, FuncCounts, CallCounts, CallOffsets);
1470*c9157d92SDimitry Andric   std::vector<uint64_t> Result = Alg.run();
1471*c9157d92SDimitry Andric   assert(Result.size() == FuncSizes.size() && "Incorrect size of layout");
1472*c9157d92SDimitry Andric   return Result;
1473*c9157d92SDimitry Andric }
1474*c9157d92SDimitry Andric 
computeCacheDirectedLayout(ArrayRef<uint64_t> FuncSizes,ArrayRef<uint64_t> FuncCounts,ArrayRef<EdgeCount> CallCounts,ArrayRef<uint64_t> CallOffsets)1475*c9157d92SDimitry Andric std::vector<uint64_t> codelayout::computeCacheDirectedLayout(
1476*c9157d92SDimitry Andric     ArrayRef<uint64_t> FuncSizes, ArrayRef<uint64_t> FuncCounts,
1477*c9157d92SDimitry Andric     ArrayRef<EdgeCount> CallCounts, ArrayRef<uint64_t> CallOffsets) {
1478*c9157d92SDimitry Andric   CDSortConfig Config;
1479*c9157d92SDimitry Andric   // Populate the config from the command-line options.
1480*c9157d92SDimitry Andric   if (CacheEntries.getNumOccurrences() > 0)
1481*c9157d92SDimitry Andric     Config.CacheEntries = CacheEntries;
1482*c9157d92SDimitry Andric   if (CacheSize.getNumOccurrences() > 0)
1483*c9157d92SDimitry Andric     Config.CacheSize = CacheSize;
1484*c9157d92SDimitry Andric   if (CDMaxChainSize.getNumOccurrences() > 0)
1485*c9157d92SDimitry Andric     Config.MaxChainSize = CDMaxChainSize;
1486*c9157d92SDimitry Andric   if (DistancePower.getNumOccurrences() > 0)
1487*c9157d92SDimitry Andric     Config.DistancePower = DistancePower;
1488*c9157d92SDimitry Andric   if (FrequencyScale.getNumOccurrences() > 0)
1489*c9157d92SDimitry Andric     Config.FrequencyScale = FrequencyScale;
1490*c9157d92SDimitry Andric   return computeCacheDirectedLayout(Config, FuncSizes, FuncCounts, CallCounts,
1491*c9157d92SDimitry Andric                                     CallOffsets);
1492*c9157d92SDimitry Andric }
1493