10eae32dcSDimitry Andric //===- CodeLayout.cpp - Implementation of code layout algorithms ----------===//
20eae32dcSDimitry Andric //
30eae32dcSDimitry Andric // Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
40eae32dcSDimitry Andric // See https://llvm.org/LICENSE.txt for license information.
50eae32dcSDimitry Andric // SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
60eae32dcSDimitry Andric //
70eae32dcSDimitry Andric //===----------------------------------------------------------------------===//
80eae32dcSDimitry Andric //
9fe013be4SDimitry Andric // The file implements "cache-aware" layout algorithms of basic blocks and
10fe013be4SDimitry Andric // functions in a binary.
110eae32dcSDimitry Andric //
120eae32dcSDimitry Andric // The algorithm tries to find a layout of nodes (basic blocks) of a given CFG
130eae32dcSDimitry Andric // optimizing jump locality and thus processor I-cache utilization. This is
140eae32dcSDimitry Andric // achieved via increasing the number of fall-through jumps and co-locating
150eae32dcSDimitry Andric // frequently executed nodes together. The name follows the underlying
160eae32dcSDimitry Andric // optimization problem, Extended-TSP, which is a generalization of classical
170eae32dcSDimitry Andric // (maximum) Traveling Salesmen Problem.
180eae32dcSDimitry Andric //
190eae32dcSDimitry Andric // The algorithm is a greedy heuristic that works with chains (ordered lists)
200eae32dcSDimitry Andric // of basic blocks. Initially all chains are isolated basic blocks. On every
210eae32dcSDimitry Andric // iteration, we pick a pair of chains whose merging yields the biggest increase
220eae32dcSDimitry Andric // in the ExtTSP score, which models how i-cache "friendly" a specific chain is.
230eae32dcSDimitry Andric // A pair of chains giving the maximum gain is merged into a new chain. The
240eae32dcSDimitry Andric // procedure stops when there is only one chain left, or when merging does not
250eae32dcSDimitry Andric // increase ExtTSP. In the latter case, the remaining chains are sorted by
260eae32dcSDimitry Andric // density in the decreasing order.
270eae32dcSDimitry Andric //
280eae32dcSDimitry Andric // An important aspect is the way two chains are merged. Unlike earlier
290eae32dcSDimitry Andric // algorithms (e.g., based on the approach of Pettis-Hansen), two
300eae32dcSDimitry Andric // chains, X and Y, are first split into three, X1, X2, and Y. Then we
310eae32dcSDimitry Andric // consider all possible ways of gluing the three chains (e.g., X1YX2, X1X2Y,
320eae32dcSDimitry Andric // X2X1Y, X2YX1, YX1X2, YX2X1) and choose the one producing the largest score.
330eae32dcSDimitry Andric // This improves the quality of the final result (the search space is larger)
340eae32dcSDimitry Andric // while keeping the implementation sufficiently fast.
350eae32dcSDimitry Andric //
360eae32dcSDimitry Andric // Reference:
370eae32dcSDimitry Andric // * A. Newell and S. Pupyrev, Improved Basic Block Reordering,
380eae32dcSDimitry Andric // IEEE Transactions on Computers, 2020
39bdd1243dSDimitry Andric // https://arxiv.org/abs/1809.04676
400eae32dcSDimitry Andric //
410eae32dcSDimitry Andric //===----------------------------------------------------------------------===//
420eae32dcSDimitry Andric
430eae32dcSDimitry Andric #include "llvm/Transforms/Utils/CodeLayout.h"
440eae32dcSDimitry Andric #include "llvm/Support/CommandLine.h"
45fe013be4SDimitry Andric #include "llvm/Support/Debug.h"
460eae32dcSDimitry Andric
47bdd1243dSDimitry Andric #include <cmath>
48*c9157d92SDimitry Andric #include <set>
49bdd1243dSDimitry Andric
500eae32dcSDimitry Andric using namespace llvm;
51*c9157d92SDimitry Andric using namespace llvm::codelayout;
52*c9157d92SDimitry Andric
530eae32dcSDimitry Andric #define DEBUG_TYPE "code-layout"
540eae32dcSDimitry Andric
55fe013be4SDimitry Andric namespace llvm {
5681ad6265SDimitry Andric cl::opt<bool> EnableExtTspBlockPlacement(
5781ad6265SDimitry Andric "enable-ext-tsp-block-placement", cl::Hidden, cl::init(false),
5881ad6265SDimitry Andric cl::desc("Enable machine block placement based on the ext-tsp model, "
5981ad6265SDimitry Andric "optimizing I-cache utilization."));
6081ad6265SDimitry Andric
6181ad6265SDimitry Andric cl::opt<bool> ApplyExtTspWithoutProfile(
6281ad6265SDimitry Andric "ext-tsp-apply-without-profile",
6381ad6265SDimitry Andric cl::desc("Whether to apply ext-tsp placement for instances w/o profile"),
6481ad6265SDimitry Andric cl::init(true), cl::Hidden);
65fe013be4SDimitry Andric } // namespace llvm
6681ad6265SDimitry Andric
67*c9157d92SDimitry Andric // Algorithm-specific params for Ext-TSP. The values are tuned for the best
68*c9157d92SDimitry Andric // performance of large-scale front-end bound binaries.
69bdd1243dSDimitry Andric static cl::opt<double> ForwardWeightCond(
70bdd1243dSDimitry Andric "ext-tsp-forward-weight-cond", cl::ReallyHidden, cl::init(0.1),
71bdd1243dSDimitry Andric cl::desc("The weight of conditional forward jumps for ExtTSP value"));
720eae32dcSDimitry Andric
73bdd1243dSDimitry Andric static cl::opt<double> ForwardWeightUncond(
74bdd1243dSDimitry Andric "ext-tsp-forward-weight-uncond", cl::ReallyHidden, cl::init(0.1),
75bdd1243dSDimitry Andric cl::desc("The weight of unconditional forward jumps for ExtTSP value"));
76bdd1243dSDimitry Andric
77bdd1243dSDimitry Andric static cl::opt<double> BackwardWeightCond(
78bdd1243dSDimitry Andric "ext-tsp-backward-weight-cond", cl::ReallyHidden, cl::init(0.1),
79fe013be4SDimitry Andric cl::desc("The weight of conditional backward jumps for ExtTSP value"));
80bdd1243dSDimitry Andric
81bdd1243dSDimitry Andric static cl::opt<double> BackwardWeightUncond(
82bdd1243dSDimitry Andric "ext-tsp-backward-weight-uncond", cl::ReallyHidden, cl::init(0.1),
83fe013be4SDimitry Andric cl::desc("The weight of unconditional backward jumps for ExtTSP value"));
84bdd1243dSDimitry Andric
85bdd1243dSDimitry Andric static cl::opt<double> FallthroughWeightCond(
86bdd1243dSDimitry Andric "ext-tsp-fallthrough-weight-cond", cl::ReallyHidden, cl::init(1.0),
87bdd1243dSDimitry Andric cl::desc("The weight of conditional fallthrough jumps for ExtTSP value"));
88bdd1243dSDimitry Andric
89bdd1243dSDimitry Andric static cl::opt<double> FallthroughWeightUncond(
90bdd1243dSDimitry Andric "ext-tsp-fallthrough-weight-uncond", cl::ReallyHidden, cl::init(1.05),
91bdd1243dSDimitry Andric cl::desc("The weight of unconditional fallthrough jumps for ExtTSP value"));
920eae32dcSDimitry Andric
930eae32dcSDimitry Andric static cl::opt<unsigned> ForwardDistance(
94bdd1243dSDimitry Andric "ext-tsp-forward-distance", cl::ReallyHidden, cl::init(1024),
950eae32dcSDimitry Andric cl::desc("The maximum distance (in bytes) of a forward jump for ExtTSP"));
960eae32dcSDimitry Andric
970eae32dcSDimitry Andric static cl::opt<unsigned> BackwardDistance(
98bdd1243dSDimitry Andric "ext-tsp-backward-distance", cl::ReallyHidden, cl::init(640),
990eae32dcSDimitry Andric cl::desc("The maximum distance (in bytes) of a backward jump for ExtTSP"));
1000eae32dcSDimitry Andric
10181ad6265SDimitry Andric // The maximum size of a chain created by the algorithm. The size is bounded
102*c9157d92SDimitry Andric // so that the algorithm can efficiently process extremely large instances.
10381ad6265SDimitry Andric static cl::opt<unsigned>
104*c9157d92SDimitry Andric MaxChainSize("ext-tsp-max-chain-size", cl::ReallyHidden, cl::init(512),
105*c9157d92SDimitry Andric cl::desc("The maximum size of a chain to create"));
10681ad6265SDimitry Andric
1070eae32dcSDimitry Andric // The maximum size of a chain for splitting. Larger values of the threshold
1080eae32dcSDimitry Andric // may yield better quality at the cost of worsen run-time.
1090eae32dcSDimitry Andric static cl::opt<unsigned> ChainSplitThreshold(
110bdd1243dSDimitry Andric "ext-tsp-chain-split-threshold", cl::ReallyHidden, cl::init(128),
1110eae32dcSDimitry Andric cl::desc("The maximum size of a chain to apply splitting"));
1120eae32dcSDimitry Andric
113*c9157d92SDimitry Andric // The maximum ratio between densities of two chains for merging.
114*c9157d92SDimitry Andric static cl::opt<double> MaxMergeDensityRatio(
115*c9157d92SDimitry Andric "ext-tsp-max-merge-density-ratio", cl::ReallyHidden, cl::init(100),
116*c9157d92SDimitry Andric cl::desc("The maximum ratio between densities of two chains for merging"));
117*c9157d92SDimitry Andric
118*c9157d92SDimitry Andric // Algorithm-specific options for CDSort.
119*c9157d92SDimitry Andric static cl::opt<unsigned> CacheEntries("cdsort-cache-entries", cl::ReallyHidden,
120*c9157d92SDimitry Andric cl::desc("The size of the cache"));
121*c9157d92SDimitry Andric
122*c9157d92SDimitry Andric static cl::opt<unsigned> CacheSize("cdsort-cache-size", cl::ReallyHidden,
123*c9157d92SDimitry Andric cl::desc("The size of a line in the cache"));
124*c9157d92SDimitry Andric
125*c9157d92SDimitry Andric static cl::opt<unsigned>
126*c9157d92SDimitry Andric CDMaxChainSize("cdsort-max-chain-size", cl::ReallyHidden,
127*c9157d92SDimitry Andric cl::desc("The maximum size of a chain to create"));
128*c9157d92SDimitry Andric
129*c9157d92SDimitry Andric static cl::opt<double> DistancePower(
130*c9157d92SDimitry Andric "cdsort-distance-power", cl::ReallyHidden,
131*c9157d92SDimitry Andric cl::desc("The power exponent for the distance-based locality"));
132*c9157d92SDimitry Andric
133*c9157d92SDimitry Andric static cl::opt<double> FrequencyScale(
134*c9157d92SDimitry Andric "cdsort-frequency-scale", cl::ReallyHidden,
135*c9157d92SDimitry Andric cl::desc("The scale factor for the frequency-based locality"));
1360eae32dcSDimitry Andric
1370eae32dcSDimitry Andric namespace {
1380eae32dcSDimitry Andric
1390eae32dcSDimitry Andric // Epsilon for comparison of doubles.
1400eae32dcSDimitry Andric constexpr double EPS = 1e-8;
1410eae32dcSDimitry Andric
142bdd1243dSDimitry Andric // Compute the Ext-TSP score for a given jump.
jumpExtTSPScore(uint64_t JumpDist,uint64_t JumpMaxDist,uint64_t Count,double Weight)143bdd1243dSDimitry Andric double jumpExtTSPScore(uint64_t JumpDist, uint64_t JumpMaxDist, uint64_t Count,
144bdd1243dSDimitry Andric double Weight) {
145bdd1243dSDimitry Andric if (JumpDist > JumpMaxDist)
146bdd1243dSDimitry Andric return 0;
147bdd1243dSDimitry Andric double Prob = 1.0 - static_cast<double>(JumpDist) / JumpMaxDist;
148bdd1243dSDimitry Andric return Weight * Prob * Count;
149bdd1243dSDimitry Andric }
150bdd1243dSDimitry Andric
1510eae32dcSDimitry Andric // Compute the Ext-TSP score for a jump between a given pair of blocks,
1520eae32dcSDimitry Andric // using their sizes, (estimated) addresses and the jump execution count.
extTSPScore(uint64_t SrcAddr,uint64_t SrcSize,uint64_t DstAddr,uint64_t Count,bool IsConditional)1530eae32dcSDimitry Andric double extTSPScore(uint64_t SrcAddr, uint64_t SrcSize, uint64_t DstAddr,
154bdd1243dSDimitry Andric uint64_t Count, bool IsConditional) {
1550eae32dcSDimitry Andric // Fallthrough
1560eae32dcSDimitry Andric if (SrcAddr + SrcSize == DstAddr) {
157bdd1243dSDimitry Andric return jumpExtTSPScore(0, 1, Count,
158bdd1243dSDimitry Andric IsConditional ? FallthroughWeightCond
159bdd1243dSDimitry Andric : FallthroughWeightUncond);
1600eae32dcSDimitry Andric }
1610eae32dcSDimitry Andric // Forward
1620eae32dcSDimitry Andric if (SrcAddr + SrcSize < DstAddr) {
163bdd1243dSDimitry Andric const uint64_t Dist = DstAddr - (SrcAddr + SrcSize);
164bdd1243dSDimitry Andric return jumpExtTSPScore(Dist, ForwardDistance, Count,
165bdd1243dSDimitry Andric IsConditional ? ForwardWeightCond
166bdd1243dSDimitry Andric : ForwardWeightUncond);
1670eae32dcSDimitry Andric }
1680eae32dcSDimitry Andric // Backward
169bdd1243dSDimitry Andric const uint64_t Dist = SrcAddr + SrcSize - DstAddr;
170bdd1243dSDimitry Andric return jumpExtTSPScore(Dist, BackwardDistance, Count,
171bdd1243dSDimitry Andric IsConditional ? BackwardWeightCond
172bdd1243dSDimitry Andric : BackwardWeightUncond);
1730eae32dcSDimitry Andric }
1740eae32dcSDimitry Andric
1750eae32dcSDimitry Andric /// A type of merging two chains, X and Y. The former chain is split into
1760eae32dcSDimitry Andric /// X1 and X2 and then concatenated with Y in the order specified by the type.
177fe013be4SDimitry Andric enum class MergeTypeT : int { X_Y, Y_X, X1_Y_X2, Y_X2_X1, X2_X1_Y };
1780eae32dcSDimitry Andric
1790eae32dcSDimitry Andric /// The gain of merging two chains, that is, the Ext-TSP score of the merge
180fe013be4SDimitry Andric /// together with the corresponding merge 'type' and 'offset'.
181fe013be4SDimitry Andric struct MergeGainT {
182fe013be4SDimitry Andric explicit MergeGainT() = default;
MergeGainT__anond1605d3d0111::MergeGainT183fe013be4SDimitry Andric explicit MergeGainT(double Score, size_t MergeOffset, MergeTypeT MergeType)
1840eae32dcSDimitry Andric : Score(Score), MergeOffset(MergeOffset), MergeType(MergeType) {}
1850eae32dcSDimitry Andric
score__anond1605d3d0111::MergeGainT1860eae32dcSDimitry Andric double score() const { return Score; }
1870eae32dcSDimitry Andric
mergeOffset__anond1605d3d0111::MergeGainT1880eae32dcSDimitry Andric size_t mergeOffset() const { return MergeOffset; }
1890eae32dcSDimitry Andric
mergeType__anond1605d3d0111::MergeGainT190fe013be4SDimitry Andric MergeTypeT mergeType() const { return MergeType; }
191fe013be4SDimitry Andric
setMergeType__anond1605d3d0111::MergeGainT192fe013be4SDimitry Andric void setMergeType(MergeTypeT Ty) { MergeType = Ty; }
1930eae32dcSDimitry Andric
1940eae32dcSDimitry Andric // Returns 'true' iff Other is preferred over this.
operator <__anond1605d3d0111::MergeGainT195fe013be4SDimitry Andric bool operator<(const MergeGainT &Other) const {
1960eae32dcSDimitry Andric return (Other.Score > EPS && Other.Score > Score + EPS);
1970eae32dcSDimitry Andric }
1980eae32dcSDimitry Andric
1990eae32dcSDimitry Andric // Update the current gain if Other is preferred over this.
updateIfLessThan__anond1605d3d0111::MergeGainT200fe013be4SDimitry Andric void updateIfLessThan(const MergeGainT &Other) {
2010eae32dcSDimitry Andric if (*this < Other)
2020eae32dcSDimitry Andric *this = Other;
2030eae32dcSDimitry Andric }
2040eae32dcSDimitry Andric
2050eae32dcSDimitry Andric private:
2060eae32dcSDimitry Andric double Score{-1.0};
2070eae32dcSDimitry Andric size_t MergeOffset{0};
208fe013be4SDimitry Andric MergeTypeT MergeType{MergeTypeT::X_Y};
2090eae32dcSDimitry Andric };
2100eae32dcSDimitry Andric
211fe013be4SDimitry Andric struct JumpT;
212fe013be4SDimitry Andric struct ChainT;
213fe013be4SDimitry Andric struct ChainEdge;
2140eae32dcSDimitry Andric
215fe013be4SDimitry Andric /// A node in the graph, typically corresponding to a basic block in the CFG or
216fe013be4SDimitry Andric /// a function in the call graph.
217fe013be4SDimitry Andric struct NodeT {
218fe013be4SDimitry Andric NodeT(const NodeT &) = delete;
219fe013be4SDimitry Andric NodeT(NodeT &&) = default;
220fe013be4SDimitry Andric NodeT &operator=(const NodeT &) = delete;
221fe013be4SDimitry Andric NodeT &operator=(NodeT &&) = default;
2220eae32dcSDimitry Andric
NodeT__anond1605d3d0111::NodeT223*c9157d92SDimitry Andric explicit NodeT(size_t Index, uint64_t Size, uint64_t Count)
224*c9157d92SDimitry Andric : Index(Index), Size(Size), ExecutionCount(Count) {}
225fe013be4SDimitry Andric
isEntry__anond1605d3d0111::NodeT2260eae32dcSDimitry Andric bool isEntry() const { return Index == 0; }
227fe013be4SDimitry Andric
228*c9157d92SDimitry Andric // Check if Other is a successor of the node.
229*c9157d92SDimitry Andric bool isSuccessor(const NodeT *Other) const;
230*c9157d92SDimitry Andric
231fe013be4SDimitry Andric // The total execution count of outgoing jumps.
232fe013be4SDimitry Andric uint64_t outCount() const;
233fe013be4SDimitry Andric
234fe013be4SDimitry Andric // The total execution count of incoming jumps.
235fe013be4SDimitry Andric uint64_t inCount() const;
236fe013be4SDimitry Andric
237fe013be4SDimitry Andric // The original index of the node in graph.
238fe013be4SDimitry Andric size_t Index{0};
239fe013be4SDimitry Andric // The index of the node in the current chain.
240fe013be4SDimitry Andric size_t CurIndex{0};
241fe013be4SDimitry Andric // The size of the node in the binary.
242fe013be4SDimitry Andric uint64_t Size{0};
243fe013be4SDimitry Andric // The execution count of the node in the profile data.
244fe013be4SDimitry Andric uint64_t ExecutionCount{0};
245fe013be4SDimitry Andric // The current chain of the node.
246fe013be4SDimitry Andric ChainT *CurChain{nullptr};
247fe013be4SDimitry Andric // The offset of the node in the current chain.
248fe013be4SDimitry Andric mutable uint64_t EstimatedAddr{0};
249fe013be4SDimitry Andric // Forced successor of the node in the graph.
250fe013be4SDimitry Andric NodeT *ForcedSucc{nullptr};
251fe013be4SDimitry Andric // Forced predecessor of the node in the graph.
252fe013be4SDimitry Andric NodeT *ForcedPred{nullptr};
253fe013be4SDimitry Andric // Outgoing jumps from the node.
254fe013be4SDimitry Andric std::vector<JumpT *> OutJumps;
255fe013be4SDimitry Andric // Incoming jumps to the node.
256fe013be4SDimitry Andric std::vector<JumpT *> InJumps;
2570eae32dcSDimitry Andric };
2580eae32dcSDimitry Andric
259fe013be4SDimitry Andric /// An arc in the graph, typically corresponding to a jump between two nodes.
260fe013be4SDimitry Andric struct JumpT {
261fe013be4SDimitry Andric JumpT(const JumpT &) = delete;
262fe013be4SDimitry Andric JumpT(JumpT &&) = default;
263fe013be4SDimitry Andric JumpT &operator=(const JumpT &) = delete;
264fe013be4SDimitry Andric JumpT &operator=(JumpT &&) = default;
2650eae32dcSDimitry Andric
JumpT__anond1605d3d0111::JumpT266fe013be4SDimitry Andric explicit JumpT(NodeT *Source, NodeT *Target, uint64_t ExecutionCount)
267fe013be4SDimitry Andric : Source(Source), Target(Target), ExecutionCount(ExecutionCount) {}
268fe013be4SDimitry Andric
269fe013be4SDimitry Andric // Source node of the jump.
270fe013be4SDimitry Andric NodeT *Source;
271fe013be4SDimitry Andric // Target node of the jump.
272fe013be4SDimitry Andric NodeT *Target;
2730eae32dcSDimitry Andric // Execution count of the arc in the profile data.
2740eae32dcSDimitry Andric uint64_t ExecutionCount{0};
275bdd1243dSDimitry Andric // Whether the jump corresponds to a conditional branch.
276bdd1243dSDimitry Andric bool IsConditional{false};
277fe013be4SDimitry Andric // The offset of the jump from the source node.
278fe013be4SDimitry Andric uint64_t Offset{0};
2790eae32dcSDimitry Andric };
2800eae32dcSDimitry Andric
281fe013be4SDimitry Andric /// A chain (ordered sequence) of nodes in the graph.
282fe013be4SDimitry Andric struct ChainT {
283fe013be4SDimitry Andric ChainT(const ChainT &) = delete;
284fe013be4SDimitry Andric ChainT(ChainT &&) = default;
285fe013be4SDimitry Andric ChainT &operator=(const ChainT &) = delete;
286fe013be4SDimitry Andric ChainT &operator=(ChainT &&) = default;
2870eae32dcSDimitry Andric
ChainT__anond1605d3d0111::ChainT288fe013be4SDimitry Andric explicit ChainT(uint64_t Id, NodeT *Node)
289fe013be4SDimitry Andric : Id(Id), ExecutionCount(Node->ExecutionCount), Size(Node->Size),
290fe013be4SDimitry Andric Nodes(1, Node) {}
2910eae32dcSDimitry Andric
numBlocks__anond1605d3d0111::ChainT292fe013be4SDimitry Andric size_t numBlocks() const { return Nodes.size(); }
2930eae32dcSDimitry Andric
density__anond1605d3d0111::ChainT294*c9157d92SDimitry Andric double density() const { return ExecutionCount / Size; }
295fe013be4SDimitry Andric
isEntry__anond1605d3d0111::ChainT296fe013be4SDimitry Andric bool isEntry() const { return Nodes[0]->Index == 0; }
2970eae32dcSDimitry Andric
isCold__anond1605d3d0111::ChainT298bdd1243dSDimitry Andric bool isCold() const {
299fe013be4SDimitry Andric for (NodeT *Node : Nodes) {
300fe013be4SDimitry Andric if (Node->ExecutionCount > 0)
301bdd1243dSDimitry Andric return false;
302bdd1243dSDimitry Andric }
303bdd1243dSDimitry Andric return true;
304bdd1243dSDimitry Andric }
305bdd1243dSDimitry Andric
getEdge__anond1605d3d0111::ChainT306fe013be4SDimitry Andric ChainEdge *getEdge(ChainT *Other) const {
307*c9157d92SDimitry Andric for (const auto &[Chain, ChainEdge] : Edges) {
308*c9157d92SDimitry Andric if (Chain == Other)
309*c9157d92SDimitry Andric return ChainEdge;
3100eae32dcSDimitry Andric }
3110eae32dcSDimitry Andric return nullptr;
3120eae32dcSDimitry Andric }
3130eae32dcSDimitry Andric
removeEdge__anond1605d3d0111::ChainT314fe013be4SDimitry Andric void removeEdge(ChainT *Other) {
3150eae32dcSDimitry Andric auto It = Edges.begin();
3160eae32dcSDimitry Andric while (It != Edges.end()) {
3170eae32dcSDimitry Andric if (It->first == Other) {
3180eae32dcSDimitry Andric Edges.erase(It);
3190eae32dcSDimitry Andric return;
3200eae32dcSDimitry Andric }
3210eae32dcSDimitry Andric It++;
3220eae32dcSDimitry Andric }
3230eae32dcSDimitry Andric }
3240eae32dcSDimitry Andric
addEdge__anond1605d3d0111::ChainT325fe013be4SDimitry Andric void addEdge(ChainT *Other, ChainEdge *Edge) {
3260eae32dcSDimitry Andric Edges.push_back(std::make_pair(Other, Edge));
3270eae32dcSDimitry Andric }
3280eae32dcSDimitry Andric
merge__anond1605d3d0111::ChainT329*c9157d92SDimitry Andric void merge(ChainT *Other, std::vector<NodeT *> MergedBlocks) {
330*c9157d92SDimitry Andric Nodes = std::move(MergedBlocks);
331*c9157d92SDimitry Andric // Update the chain's data.
332fe013be4SDimitry Andric ExecutionCount += Other->ExecutionCount;
333fe013be4SDimitry Andric Size += Other->Size;
334fe013be4SDimitry Andric Id = Nodes[0]->Index;
335*c9157d92SDimitry Andric // Update the node's data.
336fe013be4SDimitry Andric for (size_t Idx = 0; Idx < Nodes.size(); Idx++) {
337fe013be4SDimitry Andric Nodes[Idx]->CurChain = this;
338fe013be4SDimitry Andric Nodes[Idx]->CurIndex = Idx;
3390eae32dcSDimitry Andric }
3400eae32dcSDimitry Andric }
3410eae32dcSDimitry Andric
342fe013be4SDimitry Andric void mergeEdges(ChainT *Other);
3430eae32dcSDimitry Andric
clear__anond1605d3d0111::ChainT3440eae32dcSDimitry Andric void clear() {
345fe013be4SDimitry Andric Nodes.clear();
346fe013be4SDimitry Andric Nodes.shrink_to_fit();
3470eae32dcSDimitry Andric Edges.clear();
3480eae32dcSDimitry Andric Edges.shrink_to_fit();
3490eae32dcSDimitry Andric }
3500eae32dcSDimitry Andric
3510eae32dcSDimitry Andric // Unique chain identifier.
3520eae32dcSDimitry Andric uint64_t Id;
3530eae32dcSDimitry Andric // Cached ext-tsp score for the chain.
354fe013be4SDimitry Andric double Score{0};
355*c9157d92SDimitry Andric // The total execution count of the chain. Since the execution count of
356*c9157d92SDimitry Andric // a basic block is uint64_t, using doubles here to avoid overflow.
357*c9157d92SDimitry Andric double ExecutionCount{0};
358fe013be4SDimitry Andric // The total size of the chain.
359fe013be4SDimitry Andric uint64_t Size{0};
360fe013be4SDimitry Andric // Nodes of the chain.
361fe013be4SDimitry Andric std::vector<NodeT *> Nodes;
3620eae32dcSDimitry Andric // Adjacent chains and corresponding edges (lists of jumps).
363fe013be4SDimitry Andric std::vector<std::pair<ChainT *, ChainEdge *>> Edges;
3640eae32dcSDimitry Andric };
3650eae32dcSDimitry Andric
366fe013be4SDimitry Andric /// An edge in the graph representing jumps between two chains.
367fe013be4SDimitry Andric /// When nodes are merged into chains, the edges are combined too so that
368*c9157d92SDimitry Andric /// there is always at most one edge between a pair of chains.
369fe013be4SDimitry Andric struct ChainEdge {
3700eae32dcSDimitry Andric ChainEdge(const ChainEdge &) = delete;
3710eae32dcSDimitry Andric ChainEdge(ChainEdge &&) = default;
3720eae32dcSDimitry Andric ChainEdge &operator=(const ChainEdge &) = delete;
373fe013be4SDimitry Andric ChainEdge &operator=(ChainEdge &&) = delete;
3740eae32dcSDimitry Andric
ChainEdge__anond1605d3d0111::ChainEdge375fe013be4SDimitry Andric explicit ChainEdge(JumpT *Jump)
3760eae32dcSDimitry Andric : SrcChain(Jump->Source->CurChain), DstChain(Jump->Target->CurChain),
3770eae32dcSDimitry Andric Jumps(1, Jump) {}
3780eae32dcSDimitry Andric
srcChain__anond1605d3d0111::ChainEdge379fe013be4SDimitry Andric ChainT *srcChain() const { return SrcChain; }
3800eae32dcSDimitry Andric
dstChain__anond1605d3d0111::ChainEdge381fe013be4SDimitry Andric ChainT *dstChain() const { return DstChain; }
3820eae32dcSDimitry Andric
isSelfEdge__anond1605d3d0111::ChainEdge383fe013be4SDimitry Andric bool isSelfEdge() const { return SrcChain == DstChain; }
384fe013be4SDimitry Andric
jumps__anond1605d3d0111::ChainEdge385fe013be4SDimitry Andric const std::vector<JumpT *> &jumps() const { return Jumps; }
386fe013be4SDimitry Andric
appendJump__anond1605d3d0111::ChainEdge387fe013be4SDimitry Andric void appendJump(JumpT *Jump) { Jumps.push_back(Jump); }
3880eae32dcSDimitry Andric
moveJumps__anond1605d3d0111::ChainEdge3890eae32dcSDimitry Andric void moveJumps(ChainEdge *Other) {
3900eae32dcSDimitry Andric Jumps.insert(Jumps.end(), Other->Jumps.begin(), Other->Jumps.end());
3910eae32dcSDimitry Andric Other->Jumps.clear();
3920eae32dcSDimitry Andric Other->Jumps.shrink_to_fit();
3930eae32dcSDimitry Andric }
3940eae32dcSDimitry Andric
changeEndpoint__anond1605d3d0111::ChainEdge395fe013be4SDimitry Andric void changeEndpoint(ChainT *From, ChainT *To) {
396fe013be4SDimitry Andric if (From == SrcChain)
397fe013be4SDimitry Andric SrcChain = To;
398fe013be4SDimitry Andric if (From == DstChain)
399fe013be4SDimitry Andric DstChain = To;
400fe013be4SDimitry Andric }
401fe013be4SDimitry Andric
hasCachedMergeGain__anond1605d3d0111::ChainEdge402fe013be4SDimitry Andric bool hasCachedMergeGain(ChainT *Src, ChainT *Dst) const {
4030eae32dcSDimitry Andric return Src == SrcChain ? CacheValidForward : CacheValidBackward;
4040eae32dcSDimitry Andric }
4050eae32dcSDimitry Andric
getCachedMergeGain__anond1605d3d0111::ChainEdge406fe013be4SDimitry Andric MergeGainT getCachedMergeGain(ChainT *Src, ChainT *Dst) const {
4070eae32dcSDimitry Andric return Src == SrcChain ? CachedGainForward : CachedGainBackward;
4080eae32dcSDimitry Andric }
4090eae32dcSDimitry Andric
setCachedMergeGain__anond1605d3d0111::ChainEdge410fe013be4SDimitry Andric void setCachedMergeGain(ChainT *Src, ChainT *Dst, MergeGainT MergeGain) {
4110eae32dcSDimitry Andric if (Src == SrcChain) {
4120eae32dcSDimitry Andric CachedGainForward = MergeGain;
4130eae32dcSDimitry Andric CacheValidForward = true;
4140eae32dcSDimitry Andric } else {
4150eae32dcSDimitry Andric CachedGainBackward = MergeGain;
4160eae32dcSDimitry Andric CacheValidBackward = true;
4170eae32dcSDimitry Andric }
4180eae32dcSDimitry Andric }
4190eae32dcSDimitry Andric
invalidateCache__anond1605d3d0111::ChainEdge4200eae32dcSDimitry Andric void invalidateCache() {
4210eae32dcSDimitry Andric CacheValidForward = false;
4220eae32dcSDimitry Andric CacheValidBackward = false;
4230eae32dcSDimitry Andric }
4240eae32dcSDimitry Andric
setMergeGain__anond1605d3d0111::ChainEdge425fe013be4SDimitry Andric void setMergeGain(MergeGainT Gain) { CachedGain = Gain; }
426fe013be4SDimitry Andric
getMergeGain__anond1605d3d0111::ChainEdge427fe013be4SDimitry Andric MergeGainT getMergeGain() const { return CachedGain; }
428fe013be4SDimitry Andric
gain__anond1605d3d0111::ChainEdge429fe013be4SDimitry Andric double gain() const { return CachedGain.score(); }
430fe013be4SDimitry Andric
4310eae32dcSDimitry Andric private:
4320eae32dcSDimitry Andric // Source chain.
433fe013be4SDimitry Andric ChainT *SrcChain{nullptr};
4340eae32dcSDimitry Andric // Destination chain.
435fe013be4SDimitry Andric ChainT *DstChain{nullptr};
436fe013be4SDimitry Andric // Original jumps in the binary with corresponding execution counts.
437fe013be4SDimitry Andric std::vector<JumpT *> Jumps;
438fe013be4SDimitry Andric // Cached gain value for merging the pair of chains.
439fe013be4SDimitry Andric MergeGainT CachedGain;
440fe013be4SDimitry Andric
441fe013be4SDimitry Andric // Cached gain values for merging the pair of chains. Since the gain of
442fe013be4SDimitry Andric // merging (Src, Dst) and (Dst, Src) might be different, we store both values
443fe013be4SDimitry Andric // here and a flag indicating which of the options results in a higher gain.
444fe013be4SDimitry Andric // Cached gain values.
445fe013be4SDimitry Andric MergeGainT CachedGainForward;
446fe013be4SDimitry Andric MergeGainT CachedGainBackward;
4470eae32dcSDimitry Andric // Whether the cached value must be recomputed.
4480eae32dcSDimitry Andric bool CacheValidForward{false};
4490eae32dcSDimitry Andric bool CacheValidBackward{false};
4500eae32dcSDimitry Andric };
4510eae32dcSDimitry Andric
isSuccessor(const NodeT * Other) const452*c9157d92SDimitry Andric bool NodeT::isSuccessor(const NodeT *Other) const {
453*c9157d92SDimitry Andric for (JumpT *Jump : OutJumps)
454*c9157d92SDimitry Andric if (Jump->Target == Other)
455*c9157d92SDimitry Andric return true;
456*c9157d92SDimitry Andric return false;
457*c9157d92SDimitry Andric }
458*c9157d92SDimitry Andric
outCount() const459fe013be4SDimitry Andric uint64_t NodeT::outCount() const {
460fe013be4SDimitry Andric uint64_t Count = 0;
461*c9157d92SDimitry Andric for (JumpT *Jump : OutJumps)
462fe013be4SDimitry Andric Count += Jump->ExecutionCount;
463fe013be4SDimitry Andric return Count;
464fe013be4SDimitry Andric }
4650eae32dcSDimitry Andric
inCount() const466fe013be4SDimitry Andric uint64_t NodeT::inCount() const {
467fe013be4SDimitry Andric uint64_t Count = 0;
468*c9157d92SDimitry Andric for (JumpT *Jump : InJumps)
469fe013be4SDimitry Andric Count += Jump->ExecutionCount;
470fe013be4SDimitry Andric return Count;
471fe013be4SDimitry Andric }
472fe013be4SDimitry Andric
mergeEdges(ChainT * Other)473fe013be4SDimitry Andric void ChainT::mergeEdges(ChainT *Other) {
474*c9157d92SDimitry Andric // Update edges adjacent to chain Other.
475*c9157d92SDimitry Andric for (const auto &[DstChain, DstEdge] : Other->Edges) {
476fe013be4SDimitry Andric ChainT *TargetChain = DstChain == Other ? this : DstChain;
477bdd1243dSDimitry Andric ChainEdge *CurEdge = getEdge(TargetChain);
4780eae32dcSDimitry Andric if (CurEdge == nullptr) {
4790eae32dcSDimitry Andric DstEdge->changeEndpoint(Other, this);
4800eae32dcSDimitry Andric this->addEdge(TargetChain, DstEdge);
481*c9157d92SDimitry Andric if (DstChain != this && DstChain != Other)
4820eae32dcSDimitry Andric DstChain->addEdge(this, DstEdge);
4830eae32dcSDimitry Andric } else {
4840eae32dcSDimitry Andric CurEdge->moveJumps(DstEdge);
4850eae32dcSDimitry Andric }
486*c9157d92SDimitry Andric // Cleanup leftover edge.
487*c9157d92SDimitry Andric if (DstChain != Other)
4880eae32dcSDimitry Andric DstChain->removeEdge(Other);
4890eae32dcSDimitry Andric }
4900eae32dcSDimitry Andric }
4910eae32dcSDimitry Andric
492fe013be4SDimitry Andric using NodeIter = std::vector<NodeT *>::const_iterator;
493*c9157d92SDimitry Andric static std::vector<NodeT *> EmptyList;
4940eae32dcSDimitry Andric
495*c9157d92SDimitry Andric /// A wrapper around three concatenated vectors (chains) of nodes; it is used
496*c9157d92SDimitry Andric /// to avoid extra instantiation of the vectors.
497*c9157d92SDimitry Andric struct MergedNodesT {
MergedNodesT__anond1605d3d0111::MergedNodesT498*c9157d92SDimitry Andric MergedNodesT(NodeIter Begin1, NodeIter End1,
499*c9157d92SDimitry Andric NodeIter Begin2 = EmptyList.begin(),
500*c9157d92SDimitry Andric NodeIter End2 = EmptyList.end(),
501*c9157d92SDimitry Andric NodeIter Begin3 = EmptyList.begin(),
502*c9157d92SDimitry Andric NodeIter End3 = EmptyList.end())
5030eae32dcSDimitry Andric : Begin1(Begin1), End1(End1), Begin2(Begin2), End2(End2), Begin3(Begin3),
5040eae32dcSDimitry Andric End3(End3) {}
5050eae32dcSDimitry Andric
forEach__anond1605d3d0111::MergedNodesT5060eae32dcSDimitry Andric template <typename F> void forEach(const F &Func) const {
5070eae32dcSDimitry Andric for (auto It = Begin1; It != End1; It++)
5080eae32dcSDimitry Andric Func(*It);
5090eae32dcSDimitry Andric for (auto It = Begin2; It != End2; It++)
5100eae32dcSDimitry Andric Func(*It);
5110eae32dcSDimitry Andric for (auto It = Begin3; It != End3; It++)
5120eae32dcSDimitry Andric Func(*It);
5130eae32dcSDimitry Andric }
5140eae32dcSDimitry Andric
getNodes__anond1605d3d0111::MergedNodesT515fe013be4SDimitry Andric std::vector<NodeT *> getNodes() const {
516fe013be4SDimitry Andric std::vector<NodeT *> Result;
5170eae32dcSDimitry Andric Result.reserve(std::distance(Begin1, End1) + std::distance(Begin2, End2) +
5180eae32dcSDimitry Andric std::distance(Begin3, End3));
5190eae32dcSDimitry Andric Result.insert(Result.end(), Begin1, End1);
5200eae32dcSDimitry Andric Result.insert(Result.end(), Begin2, End2);
5210eae32dcSDimitry Andric Result.insert(Result.end(), Begin3, End3);
5220eae32dcSDimitry Andric return Result;
5230eae32dcSDimitry Andric }
5240eae32dcSDimitry Andric
getFirstNode__anond1605d3d0111::MergedNodesT525fe013be4SDimitry Andric const NodeT *getFirstNode() const { return *Begin1; }
5260eae32dcSDimitry Andric
5270eae32dcSDimitry Andric private:
528fe013be4SDimitry Andric NodeIter Begin1;
529fe013be4SDimitry Andric NodeIter End1;
530fe013be4SDimitry Andric NodeIter Begin2;
531fe013be4SDimitry Andric NodeIter End2;
532fe013be4SDimitry Andric NodeIter Begin3;
533fe013be4SDimitry Andric NodeIter End3;
5340eae32dcSDimitry Andric };
5350eae32dcSDimitry Andric
536*c9157d92SDimitry Andric /// A wrapper around two concatenated vectors (chains) of jumps.
537*c9157d92SDimitry Andric struct MergedJumpsT {
MergedJumpsT__anond1605d3d0111::MergedJumpsT538*c9157d92SDimitry Andric MergedJumpsT(const std::vector<JumpT *> *Jumps1,
539*c9157d92SDimitry Andric const std::vector<JumpT *> *Jumps2 = nullptr) {
540*c9157d92SDimitry Andric assert(!Jumps1->empty() && "cannot merge empty jump list");
541*c9157d92SDimitry Andric JumpArray[0] = Jumps1;
542*c9157d92SDimitry Andric JumpArray[1] = Jumps2;
543*c9157d92SDimitry Andric }
544*c9157d92SDimitry Andric
forEach__anond1605d3d0111::MergedJumpsT545*c9157d92SDimitry Andric template <typename F> void forEach(const F &Func) const {
546*c9157d92SDimitry Andric for (auto Jumps : JumpArray)
547*c9157d92SDimitry Andric if (Jumps != nullptr)
548*c9157d92SDimitry Andric for (JumpT *Jump : *Jumps)
549*c9157d92SDimitry Andric Func(Jump);
550*c9157d92SDimitry Andric }
551*c9157d92SDimitry Andric
552*c9157d92SDimitry Andric private:
553*c9157d92SDimitry Andric std::array<const std::vector<JumpT *> *, 2> JumpArray{nullptr, nullptr};
554*c9157d92SDimitry Andric };
555*c9157d92SDimitry Andric
556fe013be4SDimitry Andric /// Merge two chains of nodes respecting a given 'type' and 'offset'.
557fe013be4SDimitry Andric ///
558fe013be4SDimitry Andric /// If MergeType == 0, then the result is a concatenation of two chains.
559fe013be4SDimitry Andric /// Otherwise, the first chain is cut into two sub-chains at the offset,
560fe013be4SDimitry Andric /// and merged using all possible ways of concatenating three chains.
mergeNodes(const std::vector<NodeT * > & X,const std::vector<NodeT * > & Y,size_t MergeOffset,MergeTypeT MergeType)561*c9157d92SDimitry Andric MergedNodesT mergeNodes(const std::vector<NodeT *> &X,
562fe013be4SDimitry Andric const std::vector<NodeT *> &Y, size_t MergeOffset,
563fe013be4SDimitry Andric MergeTypeT MergeType) {
564*c9157d92SDimitry Andric // Split the first chain, X, into X1 and X2.
565fe013be4SDimitry Andric NodeIter BeginX1 = X.begin();
566fe013be4SDimitry Andric NodeIter EndX1 = X.begin() + MergeOffset;
567fe013be4SDimitry Andric NodeIter BeginX2 = X.begin() + MergeOffset;
568fe013be4SDimitry Andric NodeIter EndX2 = X.end();
569fe013be4SDimitry Andric NodeIter BeginY = Y.begin();
570fe013be4SDimitry Andric NodeIter EndY = Y.end();
571fe013be4SDimitry Andric
572*c9157d92SDimitry Andric // Construct a new chain from the three existing ones.
573fe013be4SDimitry Andric switch (MergeType) {
574fe013be4SDimitry Andric case MergeTypeT::X_Y:
575*c9157d92SDimitry Andric return MergedNodesT(BeginX1, EndX2, BeginY, EndY);
576fe013be4SDimitry Andric case MergeTypeT::Y_X:
577*c9157d92SDimitry Andric return MergedNodesT(BeginY, EndY, BeginX1, EndX2);
578fe013be4SDimitry Andric case MergeTypeT::X1_Y_X2:
579*c9157d92SDimitry Andric return MergedNodesT(BeginX1, EndX1, BeginY, EndY, BeginX2, EndX2);
580fe013be4SDimitry Andric case MergeTypeT::Y_X2_X1:
581*c9157d92SDimitry Andric return MergedNodesT(BeginY, EndY, BeginX2, EndX2, BeginX1, EndX1);
582fe013be4SDimitry Andric case MergeTypeT::X2_X1_Y:
583*c9157d92SDimitry Andric return MergedNodesT(BeginX2, EndX2, BeginX1, EndX1, BeginY, EndY);
584fe013be4SDimitry Andric }
585fe013be4SDimitry Andric llvm_unreachable("unexpected chain merge type");
586fe013be4SDimitry Andric }
587fe013be4SDimitry Andric
5880eae32dcSDimitry Andric /// The implementation of the ExtTSP algorithm.
5890eae32dcSDimitry Andric class ExtTSPImpl {
5900eae32dcSDimitry Andric public:
ExtTSPImpl(ArrayRef<uint64_t> NodeSizes,ArrayRef<uint64_t> NodeCounts,ArrayRef<EdgeCount> EdgeCounts)591*c9157d92SDimitry Andric ExtTSPImpl(ArrayRef<uint64_t> NodeSizes, ArrayRef<uint64_t> NodeCounts,
592*c9157d92SDimitry Andric ArrayRef<EdgeCount> EdgeCounts)
593fe013be4SDimitry Andric : NumNodes(NodeSizes.size()) {
5940eae32dcSDimitry Andric initialize(NodeSizes, NodeCounts, EdgeCounts);
5950eae32dcSDimitry Andric }
5960eae32dcSDimitry Andric
597fe013be4SDimitry Andric /// Run the algorithm and return an optimized ordering of nodes.
run()598*c9157d92SDimitry Andric std::vector<uint64_t> run() {
599fe013be4SDimitry Andric // Pass 1: Merge nodes with their mutually forced successors
6000eae32dcSDimitry Andric mergeForcedPairs();
6010eae32dcSDimitry Andric
6020eae32dcSDimitry Andric // Pass 2: Merge pairs of chains while improving the ExtTSP objective
6030eae32dcSDimitry Andric mergeChainPairs();
6040eae32dcSDimitry Andric
605fe013be4SDimitry Andric // Pass 3: Merge cold nodes to reduce code size
6060eae32dcSDimitry Andric mergeColdChains();
6070eae32dcSDimitry Andric
608fe013be4SDimitry Andric // Collect nodes from all chains
609*c9157d92SDimitry Andric return concatChains();
6100eae32dcSDimitry Andric }
6110eae32dcSDimitry Andric
6120eae32dcSDimitry Andric private:
6130eae32dcSDimitry Andric /// Initialize the algorithm's data structures.
initialize(const ArrayRef<uint64_t> & NodeSizes,const ArrayRef<uint64_t> & NodeCounts,const ArrayRef<EdgeCount> & EdgeCounts)614*c9157d92SDimitry Andric void initialize(const ArrayRef<uint64_t> &NodeSizes,
615*c9157d92SDimitry Andric const ArrayRef<uint64_t> &NodeCounts,
616*c9157d92SDimitry Andric const ArrayRef<EdgeCount> &EdgeCounts) {
617*c9157d92SDimitry Andric // Initialize nodes.
618fe013be4SDimitry Andric AllNodes.reserve(NumNodes);
619fe013be4SDimitry Andric for (uint64_t Idx = 0; Idx < NumNodes; Idx++) {
620fe013be4SDimitry Andric uint64_t Size = std::max<uint64_t>(NodeSizes[Idx], 1ULL);
621fe013be4SDimitry Andric uint64_t ExecutionCount = NodeCounts[Idx];
622*c9157d92SDimitry Andric // The execution count of the entry node is set to at least one.
623fe013be4SDimitry Andric if (Idx == 0 && ExecutionCount == 0)
6240eae32dcSDimitry Andric ExecutionCount = 1;
625fe013be4SDimitry Andric AllNodes.emplace_back(Idx, Size, ExecutionCount);
6260eae32dcSDimitry Andric }
6270eae32dcSDimitry Andric
628*c9157d92SDimitry Andric // Initialize jumps between the nodes.
629bdd1243dSDimitry Andric SuccNodes.resize(NumNodes);
630bdd1243dSDimitry Andric PredNodes.resize(NumNodes);
631bdd1243dSDimitry Andric std::vector<uint64_t> OutDegree(NumNodes, 0);
6320eae32dcSDimitry Andric AllJumps.reserve(EdgeCounts.size());
633*c9157d92SDimitry Andric for (auto Edge : EdgeCounts) {
634*c9157d92SDimitry Andric ++OutDegree[Edge.src];
635*c9157d92SDimitry Andric // Ignore self-edges.
636*c9157d92SDimitry Andric if (Edge.src == Edge.dst)
6370eae32dcSDimitry Andric continue;
6380eae32dcSDimitry Andric
639*c9157d92SDimitry Andric SuccNodes[Edge.src].push_back(Edge.dst);
640*c9157d92SDimitry Andric PredNodes[Edge.dst].push_back(Edge.src);
641*c9157d92SDimitry Andric if (Edge.count > 0) {
642*c9157d92SDimitry Andric NodeT &PredNode = AllNodes[Edge.src];
643*c9157d92SDimitry Andric NodeT &SuccNode = AllNodes[Edge.dst];
644*c9157d92SDimitry Andric AllJumps.emplace_back(&PredNode, &SuccNode, Edge.count);
645fe013be4SDimitry Andric SuccNode.InJumps.push_back(&AllJumps.back());
646fe013be4SDimitry Andric PredNode.OutJumps.push_back(&AllJumps.back());
647*c9157d92SDimitry Andric // Adjust execution counts.
648*c9157d92SDimitry Andric PredNode.ExecutionCount = std::max(PredNode.ExecutionCount, Edge.count);
649*c9157d92SDimitry Andric SuccNode.ExecutionCount = std::max(SuccNode.ExecutionCount, Edge.count);
6500eae32dcSDimitry Andric }
6510eae32dcSDimitry Andric }
652fe013be4SDimitry Andric for (JumpT &Jump : AllJumps) {
653*c9157d92SDimitry Andric assert(OutDegree[Jump.Source->Index] > 0 &&
654*c9157d92SDimitry Andric "incorrectly computed out-degree of the block");
655bdd1243dSDimitry Andric Jump.IsConditional = OutDegree[Jump.Source->Index] > 1;
656bdd1243dSDimitry Andric }
6570eae32dcSDimitry Andric
658*c9157d92SDimitry Andric // Initialize chains.
6590eae32dcSDimitry Andric AllChains.reserve(NumNodes);
6600eae32dcSDimitry Andric HotChains.reserve(NumNodes);
661fe013be4SDimitry Andric for (NodeT &Node : AllNodes) {
662*c9157d92SDimitry Andric // Create a chain.
663fe013be4SDimitry Andric AllChains.emplace_back(Node.Index, &Node);
664fe013be4SDimitry Andric Node.CurChain = &AllChains.back();
665*c9157d92SDimitry Andric if (Node.ExecutionCount > 0)
6660eae32dcSDimitry Andric HotChains.push_back(&AllChains.back());
6670eae32dcSDimitry Andric }
6680eae32dcSDimitry Andric
669*c9157d92SDimitry Andric // Initialize chain edges.
6700eae32dcSDimitry Andric AllEdges.reserve(AllJumps.size());
671fe013be4SDimitry Andric for (NodeT &PredNode : AllNodes) {
672fe013be4SDimitry Andric for (JumpT *Jump : PredNode.OutJumps) {
673*c9157d92SDimitry Andric assert(Jump->ExecutionCount > 0 && "incorrectly initialized jump");
674fe013be4SDimitry Andric NodeT *SuccNode = Jump->Target;
675fe013be4SDimitry Andric ChainEdge *CurEdge = PredNode.CurChain->getEdge(SuccNode->CurChain);
676*c9157d92SDimitry Andric // This edge is already present in the graph.
6770eae32dcSDimitry Andric if (CurEdge != nullptr) {
678fe013be4SDimitry Andric assert(SuccNode->CurChain->getEdge(PredNode.CurChain) != nullptr);
6790eae32dcSDimitry Andric CurEdge->appendJump(Jump);
6800eae32dcSDimitry Andric continue;
6810eae32dcSDimitry Andric }
682*c9157d92SDimitry Andric // This is a new edge.
6830eae32dcSDimitry Andric AllEdges.emplace_back(Jump);
684fe013be4SDimitry Andric PredNode.CurChain->addEdge(SuccNode->CurChain, &AllEdges.back());
685fe013be4SDimitry Andric SuccNode->CurChain->addEdge(PredNode.CurChain, &AllEdges.back());
6860eae32dcSDimitry Andric }
6870eae32dcSDimitry Andric }
6880eae32dcSDimitry Andric }
6890eae32dcSDimitry Andric
690fe013be4SDimitry Andric /// For a pair of nodes, A and B, node B is the forced successor of A,
6910eae32dcSDimitry Andric /// if (i) all jumps (based on profile) from A goes to B and (ii) all jumps
692fe013be4SDimitry Andric /// to B are from A. Such nodes should be adjacent in the optimal ordering;
693fe013be4SDimitry Andric /// the method finds and merges such pairs of nodes.
mergeForcedPairs()6940eae32dcSDimitry Andric void mergeForcedPairs() {
695*c9157d92SDimitry Andric // Find forced pairs of blocks.
696fe013be4SDimitry Andric for (NodeT &Node : AllNodes) {
697fe013be4SDimitry Andric if (SuccNodes[Node.Index].size() == 1 &&
698fe013be4SDimitry Andric PredNodes[SuccNodes[Node.Index][0]].size() == 1 &&
699fe013be4SDimitry Andric SuccNodes[Node.Index][0] != 0) {
700fe013be4SDimitry Andric size_t SuccIndex = SuccNodes[Node.Index][0];
701fe013be4SDimitry Andric Node.ForcedSucc = &AllNodes[SuccIndex];
702fe013be4SDimitry Andric AllNodes[SuccIndex].ForcedPred = &Node;
7030eae32dcSDimitry Andric }
7040eae32dcSDimitry Andric }
7050eae32dcSDimitry Andric
7060eae32dcSDimitry Andric // There might be 'cycles' in the forced dependencies, since profile
7070eae32dcSDimitry Andric // data isn't 100% accurate. Typically this is observed in loops, when the
7080eae32dcSDimitry Andric // loop edges are the hottest successors for the basic blocks of the loop.
709fe013be4SDimitry Andric // Break the cycles by choosing the node with the smallest index as the
7100eae32dcSDimitry Andric // head. This helps to keep the original order of the loops, which likely
7110eae32dcSDimitry Andric // have already been rotated in the optimized manner.
712fe013be4SDimitry Andric for (NodeT &Node : AllNodes) {
713fe013be4SDimitry Andric if (Node.ForcedSucc == nullptr || Node.ForcedPred == nullptr)
7140eae32dcSDimitry Andric continue;
7150eae32dcSDimitry Andric
716fe013be4SDimitry Andric NodeT *SuccNode = Node.ForcedSucc;
717fe013be4SDimitry Andric while (SuccNode != nullptr && SuccNode != &Node) {
718fe013be4SDimitry Andric SuccNode = SuccNode->ForcedSucc;
7190eae32dcSDimitry Andric }
720fe013be4SDimitry Andric if (SuccNode == nullptr)
7210eae32dcSDimitry Andric continue;
722*c9157d92SDimitry Andric // Break the cycle.
723fe013be4SDimitry Andric AllNodes[Node.ForcedPred->Index].ForcedSucc = nullptr;
724fe013be4SDimitry Andric Node.ForcedPred = nullptr;
7250eae32dcSDimitry Andric }
7260eae32dcSDimitry Andric
727*c9157d92SDimitry Andric // Merge nodes with their fallthrough successors.
728fe013be4SDimitry Andric for (NodeT &Node : AllNodes) {
729fe013be4SDimitry Andric if (Node.ForcedPred == nullptr && Node.ForcedSucc != nullptr) {
730fe013be4SDimitry Andric const NodeT *CurBlock = &Node;
7310eae32dcSDimitry Andric while (CurBlock->ForcedSucc != nullptr) {
732fe013be4SDimitry Andric const NodeT *NextBlock = CurBlock->ForcedSucc;
733fe013be4SDimitry Andric mergeChains(Node.CurChain, NextBlock->CurChain, 0, MergeTypeT::X_Y);
7340eae32dcSDimitry Andric CurBlock = NextBlock;
7350eae32dcSDimitry Andric }
7360eae32dcSDimitry Andric }
7370eae32dcSDimitry Andric }
7380eae32dcSDimitry Andric }
7390eae32dcSDimitry Andric
7400eae32dcSDimitry Andric /// Merge pairs of chains while improving the ExtTSP objective.
mergeChainPairs()7410eae32dcSDimitry Andric void mergeChainPairs() {
742*c9157d92SDimitry Andric /// Deterministically compare pairs of chains.
743fe013be4SDimitry Andric auto compareChainPairs = [](const ChainT *A1, const ChainT *B1,
744fe013be4SDimitry Andric const ChainT *A2, const ChainT *B2) {
745*c9157d92SDimitry Andric return std::make_tuple(A1->Id, B1->Id) < std::make_tuple(A2->Id, B2->Id);
7460eae32dcSDimitry Andric };
7470eae32dcSDimitry Andric
7480eae32dcSDimitry Andric while (HotChains.size() > 1) {
749fe013be4SDimitry Andric ChainT *BestChainPred = nullptr;
750fe013be4SDimitry Andric ChainT *BestChainSucc = nullptr;
751fe013be4SDimitry Andric MergeGainT BestGain;
752*c9157d92SDimitry Andric // Iterate over all pairs of chains.
753fe013be4SDimitry Andric for (ChainT *ChainPred : HotChains) {
754*c9157d92SDimitry Andric // Get candidates for merging with the current chain.
755*c9157d92SDimitry Andric for (const auto &[ChainSucc, Edge] : ChainPred->Edges) {
756*c9157d92SDimitry Andric // Ignore loop edges.
757*c9157d92SDimitry Andric if (Edge->isSelfEdge())
7580eae32dcSDimitry Andric continue;
759*c9157d92SDimitry Andric // Skip the merge if the combined chain violates the maximum specified
760*c9157d92SDimitry Andric // size.
76181ad6265SDimitry Andric if (ChainPred->numBlocks() + ChainSucc->numBlocks() >= MaxChainSize)
76281ad6265SDimitry Andric continue;
763*c9157d92SDimitry Andric // Don't merge the chains if they have vastly different densities.
764*c9157d92SDimitry Andric // Skip the merge if the ratio between the densities exceeds
765*c9157d92SDimitry Andric // MaxMergeDensityRatio. Smaller values of the option result in fewer
766*c9157d92SDimitry Andric // merges, and hence, more chains.
767*c9157d92SDimitry Andric const double ChainPredDensity = ChainPred->density();
768*c9157d92SDimitry Andric const double ChainSuccDensity = ChainSucc->density();
769*c9157d92SDimitry Andric assert(ChainPredDensity > 0.0 && ChainSuccDensity > 0.0 &&
770*c9157d92SDimitry Andric "incorrectly computed chain densities");
771*c9157d92SDimitry Andric auto [MinDensity, MaxDensity] =
772*c9157d92SDimitry Andric std::minmax(ChainPredDensity, ChainSuccDensity);
773*c9157d92SDimitry Andric const double Ratio = MaxDensity / MinDensity;
774*c9157d92SDimitry Andric if (Ratio > MaxMergeDensityRatio)
775*c9157d92SDimitry Andric continue;
77681ad6265SDimitry Andric
777*c9157d92SDimitry Andric // Compute the gain of merging the two chains.
778fe013be4SDimitry Andric MergeGainT CurGain = getBestMergeGain(ChainPred, ChainSucc, Edge);
7790eae32dcSDimitry Andric if (CurGain.score() <= EPS)
7800eae32dcSDimitry Andric continue;
7810eae32dcSDimitry Andric
7820eae32dcSDimitry Andric if (BestGain < CurGain ||
7830eae32dcSDimitry Andric (std::abs(CurGain.score() - BestGain.score()) < EPS &&
7840eae32dcSDimitry Andric compareChainPairs(ChainPred, ChainSucc, BestChainPred,
7850eae32dcSDimitry Andric BestChainSucc))) {
7860eae32dcSDimitry Andric BestGain = CurGain;
7870eae32dcSDimitry Andric BestChainPred = ChainPred;
7880eae32dcSDimitry Andric BestChainSucc = ChainSucc;
7890eae32dcSDimitry Andric }
7900eae32dcSDimitry Andric }
7910eae32dcSDimitry Andric }
7920eae32dcSDimitry Andric
793*c9157d92SDimitry Andric // Stop merging when there is no improvement.
7940eae32dcSDimitry Andric if (BestGain.score() <= EPS)
7950eae32dcSDimitry Andric break;
7960eae32dcSDimitry Andric
797*c9157d92SDimitry Andric // Merge the best pair of chains.
7980eae32dcSDimitry Andric mergeChains(BestChainPred, BestChainSucc, BestGain.mergeOffset(),
7990eae32dcSDimitry Andric BestGain.mergeType());
8000eae32dcSDimitry Andric }
8010eae32dcSDimitry Andric }
8020eae32dcSDimitry Andric
803fe013be4SDimitry Andric /// Merge remaining nodes into chains w/o taking jump counts into
804fe013be4SDimitry Andric /// consideration. This allows to maintain the original node order in the
805*c9157d92SDimitry Andric /// absence of profile data.
mergeColdChains()8060eae32dcSDimitry Andric void mergeColdChains() {
8070eae32dcSDimitry Andric for (size_t SrcBB = 0; SrcBB < NumNodes; SrcBB++) {
808bdd1243dSDimitry Andric // Iterating in reverse order to make sure original fallthrough jumps are
809bdd1243dSDimitry Andric // merged first; this might be beneficial for code size.
8100eae32dcSDimitry Andric size_t NumSuccs = SuccNodes[SrcBB].size();
8110eae32dcSDimitry Andric for (size_t Idx = 0; Idx < NumSuccs; Idx++) {
812fe013be4SDimitry Andric size_t DstBB = SuccNodes[SrcBB][NumSuccs - Idx - 1];
813fe013be4SDimitry Andric ChainT *SrcChain = AllNodes[SrcBB].CurChain;
814fe013be4SDimitry Andric ChainT *DstChain = AllNodes[DstBB].CurChain;
8150eae32dcSDimitry Andric if (SrcChain != DstChain && !DstChain->isEntry() &&
816fe013be4SDimitry Andric SrcChain->Nodes.back()->Index == SrcBB &&
817fe013be4SDimitry Andric DstChain->Nodes.front()->Index == DstBB &&
818bdd1243dSDimitry Andric SrcChain->isCold() == DstChain->isCold()) {
819fe013be4SDimitry Andric mergeChains(SrcChain, DstChain, 0, MergeTypeT::X_Y);
8200eae32dcSDimitry Andric }
8210eae32dcSDimitry Andric }
8220eae32dcSDimitry Andric }
8230eae32dcSDimitry Andric }
8240eae32dcSDimitry Andric
825fe013be4SDimitry Andric /// Compute the Ext-TSP score for a given node order and a list of jumps.
extTSPScore(const MergedNodesT & Nodes,const MergedJumpsT & Jumps) const826*c9157d92SDimitry Andric double extTSPScore(const MergedNodesT &Nodes,
827*c9157d92SDimitry Andric const MergedJumpsT &Jumps) const {
8280eae32dcSDimitry Andric uint64_t CurAddr = 0;
829*c9157d92SDimitry Andric Nodes.forEach([&](const NodeT *Node) {
830fe013be4SDimitry Andric Node->EstimatedAddr = CurAddr;
831fe013be4SDimitry Andric CurAddr += Node->Size;
8320eae32dcSDimitry Andric });
8330eae32dcSDimitry Andric
8340eae32dcSDimitry Andric double Score = 0;
835*c9157d92SDimitry Andric Jumps.forEach([&](const JumpT *Jump) {
836fe013be4SDimitry Andric const NodeT *SrcBlock = Jump->Source;
837fe013be4SDimitry Andric const NodeT *DstBlock = Jump->Target;
8380eae32dcSDimitry Andric Score += ::extTSPScore(SrcBlock->EstimatedAddr, SrcBlock->Size,
839bdd1243dSDimitry Andric DstBlock->EstimatedAddr, Jump->ExecutionCount,
840bdd1243dSDimitry Andric Jump->IsConditional);
841*c9157d92SDimitry Andric });
8420eae32dcSDimitry Andric return Score;
8430eae32dcSDimitry Andric }
8440eae32dcSDimitry Andric
8450eae32dcSDimitry Andric /// Compute the gain of merging two chains.
8460eae32dcSDimitry Andric ///
8470eae32dcSDimitry Andric /// The function considers all possible ways of merging two chains and
8480eae32dcSDimitry Andric /// computes the one having the largest increase in ExtTSP objective. The
8490eae32dcSDimitry Andric /// result is a pair with the first element being the gain and the second
8500eae32dcSDimitry Andric /// element being the corresponding merging type.
getBestMergeGain(ChainT * ChainPred,ChainT * ChainSucc,ChainEdge * Edge) const851fe013be4SDimitry Andric MergeGainT getBestMergeGain(ChainT *ChainPred, ChainT *ChainSucc,
8520eae32dcSDimitry Andric ChainEdge *Edge) const {
853*c9157d92SDimitry Andric if (Edge->hasCachedMergeGain(ChainPred, ChainSucc))
8540eae32dcSDimitry Andric return Edge->getCachedMergeGain(ChainPred, ChainSucc);
8550eae32dcSDimitry Andric
856*c9157d92SDimitry Andric assert(!Edge->jumps().empty() && "trying to merge chains w/o jumps");
857*c9157d92SDimitry Andric // Precompute jumps between ChainPred and ChainSucc.
858bdd1243dSDimitry Andric ChainEdge *EdgePP = ChainPred->getEdge(ChainPred);
859*c9157d92SDimitry Andric MergedJumpsT Jumps(&Edge->jumps(), EdgePP ? &EdgePP->jumps() : nullptr);
8600eae32dcSDimitry Andric
861*c9157d92SDimitry Andric // This object holds the best chosen gain of merging two chains.
862fe013be4SDimitry Andric MergeGainT Gain = MergeGainT();
8630eae32dcSDimitry Andric
8640eae32dcSDimitry Andric /// Given a merge offset and a list of merge types, try to merge two chains
865*c9157d92SDimitry Andric /// and update Gain with a better alternative.
8660eae32dcSDimitry Andric auto tryChainMerging = [&](size_t Offset,
867fe013be4SDimitry Andric const std::vector<MergeTypeT> &MergeTypes) {
868*c9157d92SDimitry Andric // Skip merging corresponding to concatenation w/o splitting.
869fe013be4SDimitry Andric if (Offset == 0 || Offset == ChainPred->Nodes.size())
8700eae32dcSDimitry Andric return;
871*c9157d92SDimitry Andric // Skip merging if it breaks Forced successors.
872fe013be4SDimitry Andric NodeT *Node = ChainPred->Nodes[Offset - 1];
873fe013be4SDimitry Andric if (Node->ForcedSucc != nullptr)
8740eae32dcSDimitry Andric return;
8750eae32dcSDimitry Andric // Apply the merge, compute the corresponding gain, and update the best
876*c9157d92SDimitry Andric // value, if the merge is beneficial.
877fe013be4SDimitry Andric for (const MergeTypeT &MergeType : MergeTypes) {
8780eae32dcSDimitry Andric Gain.updateIfLessThan(
8790eae32dcSDimitry Andric computeMergeGain(ChainPred, ChainSucc, Jumps, Offset, MergeType));
8800eae32dcSDimitry Andric }
8810eae32dcSDimitry Andric };
8820eae32dcSDimitry Andric
883*c9157d92SDimitry Andric // Try to concatenate two chains w/o splitting.
8840eae32dcSDimitry Andric Gain.updateIfLessThan(
885fe013be4SDimitry Andric computeMergeGain(ChainPred, ChainSucc, Jumps, 0, MergeTypeT::X_Y));
8860eae32dcSDimitry Andric
887*c9157d92SDimitry Andric // Attach (a part of) ChainPred before the first node of ChainSucc.
888fe013be4SDimitry Andric for (JumpT *Jump : ChainSucc->Nodes.front()->InJumps) {
889fe013be4SDimitry Andric const NodeT *SrcBlock = Jump->Source;
8900eae32dcSDimitry Andric if (SrcBlock->CurChain != ChainPred)
8910eae32dcSDimitry Andric continue;
8920eae32dcSDimitry Andric size_t Offset = SrcBlock->CurIndex + 1;
893fe013be4SDimitry Andric tryChainMerging(Offset, {MergeTypeT::X1_Y_X2, MergeTypeT::X2_X1_Y});
8940eae32dcSDimitry Andric }
8950eae32dcSDimitry Andric
896*c9157d92SDimitry Andric // Attach (a part of) ChainPred after the last node of ChainSucc.
897fe013be4SDimitry Andric for (JumpT *Jump : ChainSucc->Nodes.back()->OutJumps) {
898*c9157d92SDimitry Andric const NodeT *DstBlock = Jump->Target;
8990eae32dcSDimitry Andric if (DstBlock->CurChain != ChainPred)
9000eae32dcSDimitry Andric continue;
9010eae32dcSDimitry Andric size_t Offset = DstBlock->CurIndex;
902fe013be4SDimitry Andric tryChainMerging(Offset, {MergeTypeT::X1_Y_X2, MergeTypeT::Y_X2_X1});
9030eae32dcSDimitry Andric }
9040eae32dcSDimitry Andric
905*c9157d92SDimitry Andric // Try to break ChainPred in various ways and concatenate with ChainSucc.
906fe013be4SDimitry Andric if (ChainPred->Nodes.size() <= ChainSplitThreshold) {
907fe013be4SDimitry Andric for (size_t Offset = 1; Offset < ChainPred->Nodes.size(); Offset++) {
908*c9157d92SDimitry Andric // Do not split the chain along a fall-through jump. One of the two
909*c9157d92SDimitry Andric // loops above may still "break" such a jump whenever it results in a
910*c9157d92SDimitry Andric // new fall-through.
911*c9157d92SDimitry Andric const NodeT *BB = ChainPred->Nodes[Offset - 1];
912*c9157d92SDimitry Andric const NodeT *BB2 = ChainPred->Nodes[Offset];
913*c9157d92SDimitry Andric if (BB->isSuccessor(BB2))
914*c9157d92SDimitry Andric continue;
915*c9157d92SDimitry Andric
916*c9157d92SDimitry Andric // In practice, applying X2_Y_X1 merging almost never provides benefits;
917*c9157d92SDimitry Andric // thus, we exclude it from consideration to reduce the search space.
918fe013be4SDimitry Andric tryChainMerging(Offset, {MergeTypeT::X1_Y_X2, MergeTypeT::Y_X2_X1,
919fe013be4SDimitry Andric MergeTypeT::X2_X1_Y});
9200eae32dcSDimitry Andric }
9210eae32dcSDimitry Andric }
922*c9157d92SDimitry Andric
9230eae32dcSDimitry Andric Edge->setCachedMergeGain(ChainPred, ChainSucc, Gain);
9240eae32dcSDimitry Andric return Gain;
9250eae32dcSDimitry Andric }
9260eae32dcSDimitry Andric
9270eae32dcSDimitry Andric /// Compute the score gain of merging two chains, respecting a given
9280eae32dcSDimitry Andric /// merge 'type' and 'offset'.
9290eae32dcSDimitry Andric ///
9300eae32dcSDimitry Andric /// The two chains are not modified in the method.
computeMergeGain(const ChainT * ChainPred,const ChainT * ChainSucc,const MergedJumpsT & Jumps,size_t MergeOffset,MergeTypeT MergeType) const931fe013be4SDimitry Andric MergeGainT computeMergeGain(const ChainT *ChainPred, const ChainT *ChainSucc,
932*c9157d92SDimitry Andric const MergedJumpsT &Jumps, size_t MergeOffset,
933*c9157d92SDimitry Andric MergeTypeT MergeType) const {
934*c9157d92SDimitry Andric MergedNodesT MergedNodes =
935fe013be4SDimitry Andric mergeNodes(ChainPred->Nodes, ChainSucc->Nodes, MergeOffset, MergeType);
9360eae32dcSDimitry Andric
937*c9157d92SDimitry Andric // Do not allow a merge that does not preserve the original entry point.
9380eae32dcSDimitry Andric if ((ChainPred->isEntry() || ChainSucc->isEntry()) &&
939*c9157d92SDimitry Andric !MergedNodes.getFirstNode()->isEntry())
940fe013be4SDimitry Andric return MergeGainT();
9410eae32dcSDimitry Andric
942*c9157d92SDimitry Andric // The gain for the new chain.
943*c9157d92SDimitry Andric double NewScore = extTSPScore(MergedNodes, Jumps);
944*c9157d92SDimitry Andric double CurScore = ChainPred->Score;
945*c9157d92SDimitry Andric return MergeGainT(NewScore - CurScore, MergeOffset, MergeType);
9460eae32dcSDimitry Andric }
9470eae32dcSDimitry Andric
9480eae32dcSDimitry Andric /// Merge chain From into chain Into, update the list of active chains,
9490eae32dcSDimitry Andric /// adjacency information, and the corresponding cached values.
mergeChains(ChainT * Into,ChainT * From,size_t MergeOffset,MergeTypeT MergeType)950fe013be4SDimitry Andric void mergeChains(ChainT *Into, ChainT *From, size_t MergeOffset,
951fe013be4SDimitry Andric MergeTypeT MergeType) {
9520eae32dcSDimitry Andric assert(Into != From && "a chain cannot be merged with itself");
9530eae32dcSDimitry Andric
954*c9157d92SDimitry Andric // Merge the nodes.
955*c9157d92SDimitry Andric MergedNodesT MergedNodes =
956fe013be4SDimitry Andric mergeNodes(Into->Nodes, From->Nodes, MergeOffset, MergeType);
957fe013be4SDimitry Andric Into->merge(From, MergedNodes.getNodes());
958fe013be4SDimitry Andric
959*c9157d92SDimitry Andric // Merge the edges.
9600eae32dcSDimitry Andric Into->mergeEdges(From);
9610eae32dcSDimitry Andric From->clear();
9620eae32dcSDimitry Andric
963*c9157d92SDimitry Andric // Update cached ext-tsp score for the new chain.
964bdd1243dSDimitry Andric ChainEdge *SelfEdge = Into->getEdge(Into);
9650eae32dcSDimitry Andric if (SelfEdge != nullptr) {
966*c9157d92SDimitry Andric MergedNodes = MergedNodesT(Into->Nodes.begin(), Into->Nodes.end());
967*c9157d92SDimitry Andric MergedJumpsT MergedJumps(&SelfEdge->jumps());
968*c9157d92SDimitry Andric Into->Score = extTSPScore(MergedNodes, MergedJumps);
9690eae32dcSDimitry Andric }
9700eae32dcSDimitry Andric
971*c9157d92SDimitry Andric // Remove the chain from the list of active chains.
972*c9157d92SDimitry Andric llvm::erase(HotChains, From);
9730eae32dcSDimitry Andric
974*c9157d92SDimitry Andric // Invalidate caches.
975fe013be4SDimitry Andric for (auto EdgeIt : Into->Edges)
976fe013be4SDimitry Andric EdgeIt.second->invalidateCache();
9770eae32dcSDimitry Andric }
9780eae32dcSDimitry Andric
979fe013be4SDimitry Andric /// Concatenate all chains into the final order.
concatChains()980*c9157d92SDimitry Andric std::vector<uint64_t> concatChains() {
981*c9157d92SDimitry Andric // Collect non-empty chains.
982fe013be4SDimitry Andric std::vector<const ChainT *> SortedChains;
983fe013be4SDimitry Andric for (ChainT &Chain : AllChains) {
984*c9157d92SDimitry Andric if (!Chain.Nodes.empty())
9850eae32dcSDimitry Andric SortedChains.push_back(&Chain);
9860eae32dcSDimitry Andric }
9870eae32dcSDimitry Andric
988*c9157d92SDimitry Andric // Sorting chains by density in the decreasing order.
989*c9157d92SDimitry Andric std::sort(SortedChains.begin(), SortedChains.end(),
990fe013be4SDimitry Andric [&](const ChainT *L, const ChainT *R) {
991*c9157d92SDimitry Andric // Place the entry point at the beginning of the order.
992fe013be4SDimitry Andric if (L->isEntry() != R->isEntry())
993fe013be4SDimitry Andric return L->isEntry();
9940eae32dcSDimitry Andric
995*c9157d92SDimitry Andric // Compare by density and break ties by chain identifiers.
996*c9157d92SDimitry Andric return std::make_tuple(-L->density(), L->Id) <
997*c9157d92SDimitry Andric std::make_tuple(-R->density(), R->Id);
9980eae32dcSDimitry Andric });
9990eae32dcSDimitry Andric
1000*c9157d92SDimitry Andric // Collect the nodes in the order specified by their chains.
1001*c9157d92SDimitry Andric std::vector<uint64_t> Order;
10020eae32dcSDimitry Andric Order.reserve(NumNodes);
1003*c9157d92SDimitry Andric for (const ChainT *Chain : SortedChains)
1004*c9157d92SDimitry Andric for (NodeT *Node : Chain->Nodes)
1005fe013be4SDimitry Andric Order.push_back(Node->Index);
1006*c9157d92SDimitry Andric return Order;
10070eae32dcSDimitry Andric }
10080eae32dcSDimitry Andric
10090eae32dcSDimitry Andric private:
10100eae32dcSDimitry Andric /// The number of nodes in the graph.
10110eae32dcSDimitry Andric const size_t NumNodes;
10120eae32dcSDimitry Andric
10130eae32dcSDimitry Andric /// Successors of each node.
10140eae32dcSDimitry Andric std::vector<std::vector<uint64_t>> SuccNodes;
10150eae32dcSDimitry Andric
10160eae32dcSDimitry Andric /// Predecessors of each node.
10170eae32dcSDimitry Andric std::vector<std::vector<uint64_t>> PredNodes;
10180eae32dcSDimitry Andric
1019fe013be4SDimitry Andric /// All nodes (basic blocks) in the graph.
1020fe013be4SDimitry Andric std::vector<NodeT> AllNodes;
10210eae32dcSDimitry Andric
1022fe013be4SDimitry Andric /// All jumps between the nodes.
1023fe013be4SDimitry Andric std::vector<JumpT> AllJumps;
10240eae32dcSDimitry Andric
1025fe013be4SDimitry Andric /// All chains of nodes.
1026fe013be4SDimitry Andric std::vector<ChainT> AllChains;
10270eae32dcSDimitry Andric
1028fe013be4SDimitry Andric /// All edges between the chains.
10290eae32dcSDimitry Andric std::vector<ChainEdge> AllEdges;
10300eae32dcSDimitry Andric
10310eae32dcSDimitry Andric /// Active chains. The vector gets updated at runtime when chains are merged.
1032fe013be4SDimitry Andric std::vector<ChainT *> HotChains;
10330eae32dcSDimitry Andric };
10340eae32dcSDimitry Andric
1035*c9157d92SDimitry Andric /// The implementation of the Cache-Directed Sort (CDSort) algorithm for
1036*c9157d92SDimitry Andric /// ordering functions represented by a call graph.
1037*c9157d92SDimitry Andric class CDSortImpl {
1038*c9157d92SDimitry Andric public:
CDSortImpl(const CDSortConfig & Config,ArrayRef<uint64_t> NodeSizes,ArrayRef<uint64_t> NodeCounts,ArrayRef<EdgeCount> EdgeCounts,ArrayRef<uint64_t> EdgeOffsets)1039*c9157d92SDimitry Andric CDSortImpl(const CDSortConfig &Config, ArrayRef<uint64_t> NodeSizes,
1040*c9157d92SDimitry Andric ArrayRef<uint64_t> NodeCounts, ArrayRef<EdgeCount> EdgeCounts,
1041*c9157d92SDimitry Andric ArrayRef<uint64_t> EdgeOffsets)
1042*c9157d92SDimitry Andric : Config(Config), NumNodes(NodeSizes.size()) {
1043*c9157d92SDimitry Andric initialize(NodeSizes, NodeCounts, EdgeCounts, EdgeOffsets);
1044*c9157d92SDimitry Andric }
1045*c9157d92SDimitry Andric
1046*c9157d92SDimitry Andric /// Run the algorithm and return an ordered set of function clusters.
run()1047*c9157d92SDimitry Andric std::vector<uint64_t> run() {
1048*c9157d92SDimitry Andric // Merge pairs of chains while improving the objective.
1049*c9157d92SDimitry Andric mergeChainPairs();
1050*c9157d92SDimitry Andric
1051*c9157d92SDimitry Andric // Collect nodes from all the chains.
1052*c9157d92SDimitry Andric return concatChains();
1053*c9157d92SDimitry Andric }
1054*c9157d92SDimitry Andric
1055*c9157d92SDimitry Andric private:
1056*c9157d92SDimitry Andric /// Initialize the algorithm's data structures.
initialize(const ArrayRef<uint64_t> & NodeSizes,const ArrayRef<uint64_t> & NodeCounts,const ArrayRef<EdgeCount> & EdgeCounts,const ArrayRef<uint64_t> & EdgeOffsets)1057*c9157d92SDimitry Andric void initialize(const ArrayRef<uint64_t> &NodeSizes,
1058*c9157d92SDimitry Andric const ArrayRef<uint64_t> &NodeCounts,
1059*c9157d92SDimitry Andric const ArrayRef<EdgeCount> &EdgeCounts,
1060*c9157d92SDimitry Andric const ArrayRef<uint64_t> &EdgeOffsets) {
1061*c9157d92SDimitry Andric // Initialize nodes.
1062*c9157d92SDimitry Andric AllNodes.reserve(NumNodes);
1063*c9157d92SDimitry Andric for (uint64_t Node = 0; Node < NumNodes; Node++) {
1064*c9157d92SDimitry Andric uint64_t Size = std::max<uint64_t>(NodeSizes[Node], 1ULL);
1065*c9157d92SDimitry Andric uint64_t ExecutionCount = NodeCounts[Node];
1066*c9157d92SDimitry Andric AllNodes.emplace_back(Node, Size, ExecutionCount);
1067*c9157d92SDimitry Andric TotalSamples += ExecutionCount;
1068*c9157d92SDimitry Andric if (ExecutionCount > 0)
1069*c9157d92SDimitry Andric TotalSize += Size;
1070*c9157d92SDimitry Andric }
1071*c9157d92SDimitry Andric
1072*c9157d92SDimitry Andric // Initialize jumps between the nodes.
1073*c9157d92SDimitry Andric SuccNodes.resize(NumNodes);
1074*c9157d92SDimitry Andric PredNodes.resize(NumNodes);
1075*c9157d92SDimitry Andric AllJumps.reserve(EdgeCounts.size());
1076*c9157d92SDimitry Andric for (size_t I = 0; I < EdgeCounts.size(); I++) {
1077*c9157d92SDimitry Andric auto [Pred, Succ, Count] = EdgeCounts[I];
1078*c9157d92SDimitry Andric // Ignore recursive calls.
1079*c9157d92SDimitry Andric if (Pred == Succ)
1080*c9157d92SDimitry Andric continue;
1081*c9157d92SDimitry Andric
1082*c9157d92SDimitry Andric SuccNodes[Pred].push_back(Succ);
1083*c9157d92SDimitry Andric PredNodes[Succ].push_back(Pred);
1084*c9157d92SDimitry Andric if (Count > 0) {
1085*c9157d92SDimitry Andric NodeT &PredNode = AllNodes[Pred];
1086*c9157d92SDimitry Andric NodeT &SuccNode = AllNodes[Succ];
1087*c9157d92SDimitry Andric AllJumps.emplace_back(&PredNode, &SuccNode, Count);
1088*c9157d92SDimitry Andric AllJumps.back().Offset = EdgeOffsets[I];
1089*c9157d92SDimitry Andric SuccNode.InJumps.push_back(&AllJumps.back());
1090*c9157d92SDimitry Andric PredNode.OutJumps.push_back(&AllJumps.back());
1091*c9157d92SDimitry Andric // Adjust execution counts.
1092*c9157d92SDimitry Andric PredNode.ExecutionCount = std::max(PredNode.ExecutionCount, Count);
1093*c9157d92SDimitry Andric SuccNode.ExecutionCount = std::max(SuccNode.ExecutionCount, Count);
1094*c9157d92SDimitry Andric }
1095*c9157d92SDimitry Andric }
1096*c9157d92SDimitry Andric
1097*c9157d92SDimitry Andric // Initialize chains.
1098*c9157d92SDimitry Andric AllChains.reserve(NumNodes);
1099*c9157d92SDimitry Andric for (NodeT &Node : AllNodes) {
1100*c9157d92SDimitry Andric // Adjust execution counts.
1101*c9157d92SDimitry Andric Node.ExecutionCount = std::max(Node.ExecutionCount, Node.inCount());
1102*c9157d92SDimitry Andric Node.ExecutionCount = std::max(Node.ExecutionCount, Node.outCount());
1103*c9157d92SDimitry Andric // Create chain.
1104*c9157d92SDimitry Andric AllChains.emplace_back(Node.Index, &Node);
1105*c9157d92SDimitry Andric Node.CurChain = &AllChains.back();
1106*c9157d92SDimitry Andric }
1107*c9157d92SDimitry Andric
1108*c9157d92SDimitry Andric // Initialize chain edges.
1109*c9157d92SDimitry Andric AllEdges.reserve(AllJumps.size());
1110*c9157d92SDimitry Andric for (NodeT &PredNode : AllNodes) {
1111*c9157d92SDimitry Andric for (JumpT *Jump : PredNode.OutJumps) {
1112*c9157d92SDimitry Andric NodeT *SuccNode = Jump->Target;
1113*c9157d92SDimitry Andric ChainEdge *CurEdge = PredNode.CurChain->getEdge(SuccNode->CurChain);
1114*c9157d92SDimitry Andric // This edge is already present in the graph.
1115*c9157d92SDimitry Andric if (CurEdge != nullptr) {
1116*c9157d92SDimitry Andric assert(SuccNode->CurChain->getEdge(PredNode.CurChain) != nullptr);
1117*c9157d92SDimitry Andric CurEdge->appendJump(Jump);
1118*c9157d92SDimitry Andric continue;
1119*c9157d92SDimitry Andric }
1120*c9157d92SDimitry Andric // This is a new edge.
1121*c9157d92SDimitry Andric AllEdges.emplace_back(Jump);
1122*c9157d92SDimitry Andric PredNode.CurChain->addEdge(SuccNode->CurChain, &AllEdges.back());
1123*c9157d92SDimitry Andric SuccNode->CurChain->addEdge(PredNode.CurChain, &AllEdges.back());
1124*c9157d92SDimitry Andric }
1125*c9157d92SDimitry Andric }
1126*c9157d92SDimitry Andric }
1127*c9157d92SDimitry Andric
1128*c9157d92SDimitry Andric /// Merge pairs of chains while there is an improvement in the objective.
mergeChainPairs()1129*c9157d92SDimitry Andric void mergeChainPairs() {
1130*c9157d92SDimitry Andric // Create a priority queue containing all edges ordered by the merge gain.
1131*c9157d92SDimitry Andric auto GainComparator = [](ChainEdge *L, ChainEdge *R) {
1132*c9157d92SDimitry Andric return std::make_tuple(-L->gain(), L->srcChain()->Id, L->dstChain()->Id) <
1133*c9157d92SDimitry Andric std::make_tuple(-R->gain(), R->srcChain()->Id, R->dstChain()->Id);
1134*c9157d92SDimitry Andric };
1135*c9157d92SDimitry Andric std::set<ChainEdge *, decltype(GainComparator)> Queue(GainComparator);
1136*c9157d92SDimitry Andric
1137*c9157d92SDimitry Andric // Insert the edges into the queue.
1138*c9157d92SDimitry Andric [[maybe_unused]] size_t NumActiveChains = 0;
1139*c9157d92SDimitry Andric for (NodeT &Node : AllNodes) {
1140*c9157d92SDimitry Andric if (Node.ExecutionCount == 0)
1141*c9157d92SDimitry Andric continue;
1142*c9157d92SDimitry Andric ++NumActiveChains;
1143*c9157d92SDimitry Andric for (const auto &[_, Edge] : Node.CurChain->Edges) {
1144*c9157d92SDimitry Andric // Ignore self-edges.
1145*c9157d92SDimitry Andric if (Edge->isSelfEdge())
1146*c9157d92SDimitry Andric continue;
1147*c9157d92SDimitry Andric // Ignore already processed edges.
1148*c9157d92SDimitry Andric if (Edge->gain() != -1.0)
1149*c9157d92SDimitry Andric continue;
1150*c9157d92SDimitry Andric
1151*c9157d92SDimitry Andric // Compute the gain of merging the two chains.
1152*c9157d92SDimitry Andric MergeGainT Gain = getBestMergeGain(Edge);
1153*c9157d92SDimitry Andric Edge->setMergeGain(Gain);
1154*c9157d92SDimitry Andric
1155*c9157d92SDimitry Andric if (Edge->gain() > EPS)
1156*c9157d92SDimitry Andric Queue.insert(Edge);
1157*c9157d92SDimitry Andric }
1158*c9157d92SDimitry Andric }
1159*c9157d92SDimitry Andric
1160*c9157d92SDimitry Andric // Merge the chains while the gain of merging is positive.
1161*c9157d92SDimitry Andric while (!Queue.empty()) {
1162*c9157d92SDimitry Andric // Extract the best (top) edge for merging.
1163*c9157d92SDimitry Andric ChainEdge *BestEdge = *Queue.begin();
1164*c9157d92SDimitry Andric Queue.erase(Queue.begin());
1165*c9157d92SDimitry Andric ChainT *BestSrcChain = BestEdge->srcChain();
1166*c9157d92SDimitry Andric ChainT *BestDstChain = BestEdge->dstChain();
1167*c9157d92SDimitry Andric
1168*c9157d92SDimitry Andric // Remove outdated edges from the queue.
1169*c9157d92SDimitry Andric for (const auto &[_, ChainEdge] : BestSrcChain->Edges)
1170*c9157d92SDimitry Andric Queue.erase(ChainEdge);
1171*c9157d92SDimitry Andric for (const auto &[_, ChainEdge] : BestDstChain->Edges)
1172*c9157d92SDimitry Andric Queue.erase(ChainEdge);
1173*c9157d92SDimitry Andric
1174*c9157d92SDimitry Andric // Merge the best pair of chains.
1175*c9157d92SDimitry Andric MergeGainT BestGain = BestEdge->getMergeGain();
1176*c9157d92SDimitry Andric mergeChains(BestSrcChain, BestDstChain, BestGain.mergeOffset(),
1177*c9157d92SDimitry Andric BestGain.mergeType());
1178*c9157d92SDimitry Andric --NumActiveChains;
1179*c9157d92SDimitry Andric
1180*c9157d92SDimitry Andric // Insert newly created edges into the queue.
1181*c9157d92SDimitry Andric for (const auto &[_, Edge] : BestSrcChain->Edges) {
1182*c9157d92SDimitry Andric // Ignore loop edges.
1183*c9157d92SDimitry Andric if (Edge->isSelfEdge())
1184*c9157d92SDimitry Andric continue;
1185*c9157d92SDimitry Andric if (Edge->srcChain()->numBlocks() + Edge->dstChain()->numBlocks() >
1186*c9157d92SDimitry Andric Config.MaxChainSize)
1187*c9157d92SDimitry Andric continue;
1188*c9157d92SDimitry Andric
1189*c9157d92SDimitry Andric // Compute the gain of merging the two chains.
1190*c9157d92SDimitry Andric MergeGainT Gain = getBestMergeGain(Edge);
1191*c9157d92SDimitry Andric Edge->setMergeGain(Gain);
1192*c9157d92SDimitry Andric
1193*c9157d92SDimitry Andric if (Edge->gain() > EPS)
1194*c9157d92SDimitry Andric Queue.insert(Edge);
1195*c9157d92SDimitry Andric }
1196*c9157d92SDimitry Andric }
1197*c9157d92SDimitry Andric
1198*c9157d92SDimitry Andric LLVM_DEBUG(dbgs() << "Cache-directed function sorting reduced the number"
1199*c9157d92SDimitry Andric << " of chains from " << NumNodes << " to "
1200*c9157d92SDimitry Andric << NumActiveChains << "\n");
1201*c9157d92SDimitry Andric }
1202*c9157d92SDimitry Andric
1203*c9157d92SDimitry Andric /// Compute the gain of merging two chains.
1204*c9157d92SDimitry Andric ///
1205*c9157d92SDimitry Andric /// The function considers all possible ways of merging two chains and
1206*c9157d92SDimitry Andric /// computes the one having the largest increase in ExtTSP objective. The
1207*c9157d92SDimitry Andric /// result is a pair with the first element being the gain and the second
1208*c9157d92SDimitry Andric /// element being the corresponding merging type.
getBestMergeGain(ChainEdge * Edge) const1209*c9157d92SDimitry Andric MergeGainT getBestMergeGain(ChainEdge *Edge) const {
1210*c9157d92SDimitry Andric assert(!Edge->jumps().empty() && "trying to merge chains w/o jumps");
1211*c9157d92SDimitry Andric // Precompute jumps between ChainPred and ChainSucc.
1212*c9157d92SDimitry Andric MergedJumpsT Jumps(&Edge->jumps());
1213*c9157d92SDimitry Andric ChainT *SrcChain = Edge->srcChain();
1214*c9157d92SDimitry Andric ChainT *DstChain = Edge->dstChain();
1215*c9157d92SDimitry Andric
1216*c9157d92SDimitry Andric // This object holds the best currently chosen gain of merging two chains.
1217*c9157d92SDimitry Andric MergeGainT Gain = MergeGainT();
1218*c9157d92SDimitry Andric
1219*c9157d92SDimitry Andric /// Given a list of merge types, try to merge two chains and update Gain
1220*c9157d92SDimitry Andric /// with a better alternative.
1221*c9157d92SDimitry Andric auto tryChainMerging = [&](const std::vector<MergeTypeT> &MergeTypes) {
1222*c9157d92SDimitry Andric // Apply the merge, compute the corresponding gain, and update the best
1223*c9157d92SDimitry Andric // value, if the merge is beneficial.
1224*c9157d92SDimitry Andric for (const MergeTypeT &MergeType : MergeTypes) {
1225*c9157d92SDimitry Andric MergeGainT NewGain =
1226*c9157d92SDimitry Andric computeMergeGain(SrcChain, DstChain, Jumps, MergeType);
1227*c9157d92SDimitry Andric
1228*c9157d92SDimitry Andric // When forward and backward gains are the same, prioritize merging that
1229*c9157d92SDimitry Andric // preserves the original order of the functions in the binary.
1230*c9157d92SDimitry Andric if (std::abs(Gain.score() - NewGain.score()) < EPS) {
1231*c9157d92SDimitry Andric if ((MergeType == MergeTypeT::X_Y && SrcChain->Id < DstChain->Id) ||
1232*c9157d92SDimitry Andric (MergeType == MergeTypeT::Y_X && SrcChain->Id > DstChain->Id)) {
1233*c9157d92SDimitry Andric Gain = NewGain;
1234*c9157d92SDimitry Andric }
1235*c9157d92SDimitry Andric } else if (NewGain.score() > Gain.score() + EPS) {
1236*c9157d92SDimitry Andric Gain = NewGain;
1237*c9157d92SDimitry Andric }
1238*c9157d92SDimitry Andric }
1239*c9157d92SDimitry Andric };
1240*c9157d92SDimitry Andric
1241*c9157d92SDimitry Andric // Try to concatenate two chains w/o splitting.
1242*c9157d92SDimitry Andric tryChainMerging({MergeTypeT::X_Y, MergeTypeT::Y_X});
1243*c9157d92SDimitry Andric
1244*c9157d92SDimitry Andric return Gain;
1245*c9157d92SDimitry Andric }
1246*c9157d92SDimitry Andric
1247*c9157d92SDimitry Andric /// Compute the score gain of merging two chains, respecting a given type.
1248*c9157d92SDimitry Andric ///
1249*c9157d92SDimitry Andric /// The two chains are not modified in the method.
computeMergeGain(ChainT * ChainPred,ChainT * ChainSucc,const MergedJumpsT & Jumps,MergeTypeT MergeType) const1250*c9157d92SDimitry Andric MergeGainT computeMergeGain(ChainT *ChainPred, ChainT *ChainSucc,
1251*c9157d92SDimitry Andric const MergedJumpsT &Jumps,
1252*c9157d92SDimitry Andric MergeTypeT MergeType) const {
1253*c9157d92SDimitry Andric // This doesn't depend on the ordering of the nodes
1254*c9157d92SDimitry Andric double FreqGain = freqBasedLocalityGain(ChainPred, ChainSucc);
1255*c9157d92SDimitry Andric
1256*c9157d92SDimitry Andric // Merge offset is always 0, as the chains are not split.
1257*c9157d92SDimitry Andric size_t MergeOffset = 0;
1258*c9157d92SDimitry Andric auto MergedBlocks =
1259*c9157d92SDimitry Andric mergeNodes(ChainPred->Nodes, ChainSucc->Nodes, MergeOffset, MergeType);
1260*c9157d92SDimitry Andric double DistGain = distBasedLocalityGain(MergedBlocks, Jumps);
1261*c9157d92SDimitry Andric
1262*c9157d92SDimitry Andric double GainScore = DistGain + Config.FrequencyScale * FreqGain;
1263*c9157d92SDimitry Andric // Scale the result to increase the importance of merging short chains.
1264*c9157d92SDimitry Andric if (GainScore >= 0.0)
1265*c9157d92SDimitry Andric GainScore /= std::min(ChainPred->Size, ChainSucc->Size);
1266*c9157d92SDimitry Andric
1267*c9157d92SDimitry Andric return MergeGainT(GainScore, MergeOffset, MergeType);
1268*c9157d92SDimitry Andric }
1269*c9157d92SDimitry Andric
1270*c9157d92SDimitry Andric /// Compute the change of the frequency locality after merging the chains.
freqBasedLocalityGain(ChainT * ChainPred,ChainT * ChainSucc) const1271*c9157d92SDimitry Andric double freqBasedLocalityGain(ChainT *ChainPred, ChainT *ChainSucc) const {
1272*c9157d92SDimitry Andric auto missProbability = [&](double ChainDensity) {
1273*c9157d92SDimitry Andric double PageSamples = ChainDensity * Config.CacheSize;
1274*c9157d92SDimitry Andric if (PageSamples >= TotalSamples)
1275*c9157d92SDimitry Andric return 0.0;
1276*c9157d92SDimitry Andric double P = PageSamples / TotalSamples;
1277*c9157d92SDimitry Andric return pow(1.0 - P, static_cast<double>(Config.CacheEntries));
1278*c9157d92SDimitry Andric };
1279*c9157d92SDimitry Andric
1280*c9157d92SDimitry Andric // Cache misses on the chains before merging.
1281*c9157d92SDimitry Andric double CurScore =
1282*c9157d92SDimitry Andric ChainPred->ExecutionCount * missProbability(ChainPred->density()) +
1283*c9157d92SDimitry Andric ChainSucc->ExecutionCount * missProbability(ChainSucc->density());
1284*c9157d92SDimitry Andric
1285*c9157d92SDimitry Andric // Cache misses on the merged chain
1286*c9157d92SDimitry Andric double MergedCounts = ChainPred->ExecutionCount + ChainSucc->ExecutionCount;
1287*c9157d92SDimitry Andric double MergedSize = ChainPred->Size + ChainSucc->Size;
1288*c9157d92SDimitry Andric double MergedDensity = static_cast<double>(MergedCounts) / MergedSize;
1289*c9157d92SDimitry Andric double NewScore = MergedCounts * missProbability(MergedDensity);
1290*c9157d92SDimitry Andric
1291*c9157d92SDimitry Andric return CurScore - NewScore;
1292*c9157d92SDimitry Andric }
1293*c9157d92SDimitry Andric
1294*c9157d92SDimitry Andric /// Compute the distance locality for a jump / call.
distScore(uint64_t SrcAddr,uint64_t DstAddr,uint64_t Count) const1295*c9157d92SDimitry Andric double distScore(uint64_t SrcAddr, uint64_t DstAddr, uint64_t Count) const {
1296*c9157d92SDimitry Andric uint64_t Dist = SrcAddr <= DstAddr ? DstAddr - SrcAddr : SrcAddr - DstAddr;
1297*c9157d92SDimitry Andric double D = Dist == 0 ? 0.1 : static_cast<double>(Dist);
1298*c9157d92SDimitry Andric return static_cast<double>(Count) * std::pow(D, -Config.DistancePower);
1299*c9157d92SDimitry Andric }
1300*c9157d92SDimitry Andric
1301*c9157d92SDimitry Andric /// Compute the change of the distance locality after merging the chains.
distBasedLocalityGain(const MergedNodesT & Nodes,const MergedJumpsT & Jumps) const1302*c9157d92SDimitry Andric double distBasedLocalityGain(const MergedNodesT &Nodes,
1303*c9157d92SDimitry Andric const MergedJumpsT &Jumps) const {
1304*c9157d92SDimitry Andric uint64_t CurAddr = 0;
1305*c9157d92SDimitry Andric Nodes.forEach([&](const NodeT *Node) {
1306*c9157d92SDimitry Andric Node->EstimatedAddr = CurAddr;
1307*c9157d92SDimitry Andric CurAddr += Node->Size;
1308*c9157d92SDimitry Andric });
1309*c9157d92SDimitry Andric
1310*c9157d92SDimitry Andric double CurScore = 0;
1311*c9157d92SDimitry Andric double NewScore = 0;
1312*c9157d92SDimitry Andric Jumps.forEach([&](const JumpT *Jump) {
1313*c9157d92SDimitry Andric uint64_t SrcAddr = Jump->Source->EstimatedAddr + Jump->Offset;
1314*c9157d92SDimitry Andric uint64_t DstAddr = Jump->Target->EstimatedAddr;
1315*c9157d92SDimitry Andric NewScore += distScore(SrcAddr, DstAddr, Jump->ExecutionCount);
1316*c9157d92SDimitry Andric CurScore += distScore(0, TotalSize, Jump->ExecutionCount);
1317*c9157d92SDimitry Andric });
1318*c9157d92SDimitry Andric return NewScore - CurScore;
1319*c9157d92SDimitry Andric }
1320*c9157d92SDimitry Andric
1321*c9157d92SDimitry Andric /// Merge chain From into chain Into, update the list of active chains,
1322*c9157d92SDimitry Andric /// adjacency information, and the corresponding cached values.
mergeChains(ChainT * Into,ChainT * From,size_t MergeOffset,MergeTypeT MergeType)1323*c9157d92SDimitry Andric void mergeChains(ChainT *Into, ChainT *From, size_t MergeOffset,
1324*c9157d92SDimitry Andric MergeTypeT MergeType) {
1325*c9157d92SDimitry Andric assert(Into != From && "a chain cannot be merged with itself");
1326*c9157d92SDimitry Andric
1327*c9157d92SDimitry Andric // Merge the nodes.
1328*c9157d92SDimitry Andric MergedNodesT MergedNodes =
1329*c9157d92SDimitry Andric mergeNodes(Into->Nodes, From->Nodes, MergeOffset, MergeType);
1330*c9157d92SDimitry Andric Into->merge(From, MergedNodes.getNodes());
1331*c9157d92SDimitry Andric
1332*c9157d92SDimitry Andric // Merge the edges.
1333*c9157d92SDimitry Andric Into->mergeEdges(From);
1334*c9157d92SDimitry Andric From->clear();
1335*c9157d92SDimitry Andric }
1336*c9157d92SDimitry Andric
1337*c9157d92SDimitry Andric /// Concatenate all chains into the final order.
concatChains()1338*c9157d92SDimitry Andric std::vector<uint64_t> concatChains() {
1339*c9157d92SDimitry Andric // Collect chains and calculate density stats for their sorting.
1340*c9157d92SDimitry Andric std::vector<const ChainT *> SortedChains;
1341*c9157d92SDimitry Andric DenseMap<const ChainT *, double> ChainDensity;
1342*c9157d92SDimitry Andric for (ChainT &Chain : AllChains) {
1343*c9157d92SDimitry Andric if (!Chain.Nodes.empty()) {
1344*c9157d92SDimitry Andric SortedChains.push_back(&Chain);
1345*c9157d92SDimitry Andric // Using doubles to avoid overflow of ExecutionCounts.
1346*c9157d92SDimitry Andric double Size = 0;
1347*c9157d92SDimitry Andric double ExecutionCount = 0;
1348*c9157d92SDimitry Andric for (NodeT *Node : Chain.Nodes) {
1349*c9157d92SDimitry Andric Size += static_cast<double>(Node->Size);
1350*c9157d92SDimitry Andric ExecutionCount += static_cast<double>(Node->ExecutionCount);
1351*c9157d92SDimitry Andric }
1352*c9157d92SDimitry Andric assert(Size > 0 && "a chain of zero size");
1353*c9157d92SDimitry Andric ChainDensity[&Chain] = ExecutionCount / Size;
1354*c9157d92SDimitry Andric }
1355*c9157d92SDimitry Andric }
1356*c9157d92SDimitry Andric
1357*c9157d92SDimitry Andric // Sort chains by density in the decreasing order.
1358*c9157d92SDimitry Andric std::sort(SortedChains.begin(), SortedChains.end(),
1359*c9157d92SDimitry Andric [&](const ChainT *L, const ChainT *R) {
1360*c9157d92SDimitry Andric const double DL = ChainDensity[L];
1361*c9157d92SDimitry Andric const double DR = ChainDensity[R];
1362*c9157d92SDimitry Andric // Compare by density and break ties by chain identifiers.
1363*c9157d92SDimitry Andric return std::make_tuple(-DL, L->Id) <
1364*c9157d92SDimitry Andric std::make_tuple(-DR, R->Id);
1365*c9157d92SDimitry Andric });
1366*c9157d92SDimitry Andric
1367*c9157d92SDimitry Andric // Collect the nodes in the order specified by their chains.
1368*c9157d92SDimitry Andric std::vector<uint64_t> Order;
1369*c9157d92SDimitry Andric Order.reserve(NumNodes);
1370*c9157d92SDimitry Andric for (const ChainT *Chain : SortedChains)
1371*c9157d92SDimitry Andric for (NodeT *Node : Chain->Nodes)
1372*c9157d92SDimitry Andric Order.push_back(Node->Index);
1373*c9157d92SDimitry Andric return Order;
1374*c9157d92SDimitry Andric }
1375*c9157d92SDimitry Andric
1376*c9157d92SDimitry Andric private:
1377*c9157d92SDimitry Andric /// Config for the algorithm.
1378*c9157d92SDimitry Andric const CDSortConfig Config;
1379*c9157d92SDimitry Andric
1380*c9157d92SDimitry Andric /// The number of nodes in the graph.
1381*c9157d92SDimitry Andric const size_t NumNodes;
1382*c9157d92SDimitry Andric
1383*c9157d92SDimitry Andric /// Successors of each node.
1384*c9157d92SDimitry Andric std::vector<std::vector<uint64_t>> SuccNodes;
1385*c9157d92SDimitry Andric
1386*c9157d92SDimitry Andric /// Predecessors of each node.
1387*c9157d92SDimitry Andric std::vector<std::vector<uint64_t>> PredNodes;
1388*c9157d92SDimitry Andric
1389*c9157d92SDimitry Andric /// All nodes (functions) in the graph.
1390*c9157d92SDimitry Andric std::vector<NodeT> AllNodes;
1391*c9157d92SDimitry Andric
1392*c9157d92SDimitry Andric /// All jumps (function calls) between the nodes.
1393*c9157d92SDimitry Andric std::vector<JumpT> AllJumps;
1394*c9157d92SDimitry Andric
1395*c9157d92SDimitry Andric /// All chains of nodes.
1396*c9157d92SDimitry Andric std::vector<ChainT> AllChains;
1397*c9157d92SDimitry Andric
1398*c9157d92SDimitry Andric /// All edges between the chains.
1399*c9157d92SDimitry Andric std::vector<ChainEdge> AllEdges;
1400*c9157d92SDimitry Andric
1401*c9157d92SDimitry Andric /// The total number of samples in the graph.
1402*c9157d92SDimitry Andric uint64_t TotalSamples{0};
1403*c9157d92SDimitry Andric
1404*c9157d92SDimitry Andric /// The total size of the nodes in the graph.
1405*c9157d92SDimitry Andric uint64_t TotalSize{0};
1406*c9157d92SDimitry Andric };
1407*c9157d92SDimitry Andric
14080eae32dcSDimitry Andric } // end of anonymous namespace
14090eae32dcSDimitry Andric
1410fe013be4SDimitry Andric std::vector<uint64_t>
computeExtTspLayout(ArrayRef<uint64_t> NodeSizes,ArrayRef<uint64_t> NodeCounts,ArrayRef<EdgeCount> EdgeCounts)1411*c9157d92SDimitry Andric codelayout::computeExtTspLayout(ArrayRef<uint64_t> NodeSizes,
1412*c9157d92SDimitry Andric ArrayRef<uint64_t> NodeCounts,
1413*c9157d92SDimitry Andric ArrayRef<EdgeCount> EdgeCounts) {
1414*c9157d92SDimitry Andric // Verify correctness of the input data.
14150eae32dcSDimitry Andric assert(NodeCounts.size() == NodeSizes.size() && "Incorrect input");
1416fe013be4SDimitry Andric assert(NodeSizes.size() > 2 && "Incorrect input");
14170eae32dcSDimitry Andric
1418*c9157d92SDimitry Andric // Apply the reordering algorithm.
1419fe013be4SDimitry Andric ExtTSPImpl Alg(NodeSizes, NodeCounts, EdgeCounts);
1420*c9157d92SDimitry Andric std::vector<uint64_t> Result = Alg.run();
14210eae32dcSDimitry Andric
1422*c9157d92SDimitry Andric // Verify correctness of the output.
14230eae32dcSDimitry Andric assert(Result.front() == 0 && "Original entry point is not preserved");
1424fe013be4SDimitry Andric assert(Result.size() == NodeSizes.size() && "Incorrect size of layout");
14250eae32dcSDimitry Andric return Result;
14260eae32dcSDimitry Andric }
14270eae32dcSDimitry Andric
calcExtTspScore(ArrayRef<uint64_t> Order,ArrayRef<uint64_t> NodeSizes,ArrayRef<uint64_t> NodeCounts,ArrayRef<EdgeCount> EdgeCounts)1428*c9157d92SDimitry Andric double codelayout::calcExtTspScore(ArrayRef<uint64_t> Order,
1429*c9157d92SDimitry Andric ArrayRef<uint64_t> NodeSizes,
1430*c9157d92SDimitry Andric ArrayRef<uint64_t> NodeCounts,
1431*c9157d92SDimitry Andric ArrayRef<EdgeCount> EdgeCounts) {
1432*c9157d92SDimitry Andric // Estimate addresses of the blocks in memory.
1433bdd1243dSDimitry Andric std::vector<uint64_t> Addr(NodeSizes.size(), 0);
14340eae32dcSDimitry Andric for (size_t Idx = 1; Idx < Order.size(); Idx++) {
14350eae32dcSDimitry Andric Addr[Order[Idx]] = Addr[Order[Idx - 1]] + NodeSizes[Order[Idx - 1]];
14360eae32dcSDimitry Andric }
1437bdd1243dSDimitry Andric std::vector<uint64_t> OutDegree(NodeSizes.size(), 0);
1438*c9157d92SDimitry Andric for (auto Edge : EdgeCounts)
1439*c9157d92SDimitry Andric ++OutDegree[Edge.src];
14400eae32dcSDimitry Andric
1441*c9157d92SDimitry Andric // Increase the score for each jump.
14420eae32dcSDimitry Andric double Score = 0;
1443*c9157d92SDimitry Andric for (auto Edge : EdgeCounts) {
1444*c9157d92SDimitry Andric bool IsConditional = OutDegree[Edge.src] > 1;
1445*c9157d92SDimitry Andric Score += ::extTSPScore(Addr[Edge.src], NodeSizes[Edge.src], Addr[Edge.dst],
1446*c9157d92SDimitry Andric Edge.count, IsConditional);
14470eae32dcSDimitry Andric }
14480eae32dcSDimitry Andric return Score;
14490eae32dcSDimitry Andric }
14500eae32dcSDimitry Andric
calcExtTspScore(ArrayRef<uint64_t> NodeSizes,ArrayRef<uint64_t> NodeCounts,ArrayRef<EdgeCount> EdgeCounts)1451*c9157d92SDimitry Andric double codelayout::calcExtTspScore(ArrayRef<uint64_t> NodeSizes,
1452*c9157d92SDimitry Andric ArrayRef<uint64_t> NodeCounts,
1453*c9157d92SDimitry Andric ArrayRef<EdgeCount> EdgeCounts) {
1454bdd1243dSDimitry Andric std::vector<uint64_t> Order(NodeSizes.size());
14550eae32dcSDimitry Andric for (size_t Idx = 0; Idx < NodeSizes.size(); Idx++) {
14560eae32dcSDimitry Andric Order[Idx] = Idx;
14570eae32dcSDimitry Andric }
14580eae32dcSDimitry Andric return calcExtTspScore(Order, NodeSizes, NodeCounts, EdgeCounts);
14590eae32dcSDimitry Andric }
1460*c9157d92SDimitry Andric
computeCacheDirectedLayout(const CDSortConfig & Config,ArrayRef<uint64_t> FuncSizes,ArrayRef<uint64_t> FuncCounts,ArrayRef<EdgeCount> CallCounts,ArrayRef<uint64_t> CallOffsets)1461*c9157d92SDimitry Andric std::vector<uint64_t> codelayout::computeCacheDirectedLayout(
1462*c9157d92SDimitry Andric const CDSortConfig &Config, ArrayRef<uint64_t> FuncSizes,
1463*c9157d92SDimitry Andric ArrayRef<uint64_t> FuncCounts, ArrayRef<EdgeCount> CallCounts,
1464*c9157d92SDimitry Andric ArrayRef<uint64_t> CallOffsets) {
1465*c9157d92SDimitry Andric // Verify correctness of the input data.
1466*c9157d92SDimitry Andric assert(FuncCounts.size() == FuncSizes.size() && "Incorrect input");
1467*c9157d92SDimitry Andric
1468*c9157d92SDimitry Andric // Apply the reordering algorithm.
1469*c9157d92SDimitry Andric CDSortImpl Alg(Config, FuncSizes, FuncCounts, CallCounts, CallOffsets);
1470*c9157d92SDimitry Andric std::vector<uint64_t> Result = Alg.run();
1471*c9157d92SDimitry Andric assert(Result.size() == FuncSizes.size() && "Incorrect size of layout");
1472*c9157d92SDimitry Andric return Result;
1473*c9157d92SDimitry Andric }
1474*c9157d92SDimitry Andric
computeCacheDirectedLayout(ArrayRef<uint64_t> FuncSizes,ArrayRef<uint64_t> FuncCounts,ArrayRef<EdgeCount> CallCounts,ArrayRef<uint64_t> CallOffsets)1475*c9157d92SDimitry Andric std::vector<uint64_t> codelayout::computeCacheDirectedLayout(
1476*c9157d92SDimitry Andric ArrayRef<uint64_t> FuncSizes, ArrayRef<uint64_t> FuncCounts,
1477*c9157d92SDimitry Andric ArrayRef<EdgeCount> CallCounts, ArrayRef<uint64_t> CallOffsets) {
1478*c9157d92SDimitry Andric CDSortConfig Config;
1479*c9157d92SDimitry Andric // Populate the config from the command-line options.
1480*c9157d92SDimitry Andric if (CacheEntries.getNumOccurrences() > 0)
1481*c9157d92SDimitry Andric Config.CacheEntries = CacheEntries;
1482*c9157d92SDimitry Andric if (CacheSize.getNumOccurrences() > 0)
1483*c9157d92SDimitry Andric Config.CacheSize = CacheSize;
1484*c9157d92SDimitry Andric if (CDMaxChainSize.getNumOccurrences() > 0)
1485*c9157d92SDimitry Andric Config.MaxChainSize = CDMaxChainSize;
1486*c9157d92SDimitry Andric if (DistancePower.getNumOccurrences() > 0)
1487*c9157d92SDimitry Andric Config.DistancePower = DistancePower;
1488*c9157d92SDimitry Andric if (FrequencyScale.getNumOccurrences() > 0)
1489*c9157d92SDimitry Andric Config.FrequencyScale = FrequencyScale;
1490*c9157d92SDimitry Andric return computeCacheDirectedLayout(Config, FuncSizes, FuncCounts, CallCounts,
1491*c9157d92SDimitry Andric CallOffsets);
1492*c9157d92SDimitry Andric }
1493