1*a34c753fSRafael Auler //===--- ExtTSPReorderAlgorithm.cpp - Order basic blocks ---------------===// 2*a34c753fSRafael Auler // 3*a34c753fSRafael Auler // Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. 4*a34c753fSRafael Auler // See https://llvm.org/LICENSE.txt for license information. 5*a34c753fSRafael Auler // SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception 6*a34c753fSRafael Auler // 7*a34c753fSRafael Auler //===----------------------------------------------------------------------===// 8*a34c753fSRafael Auler // ExtTSP - layout of basic blocks with i-cache optimization. 9*a34c753fSRafael Auler // 10*a34c753fSRafael Auler // The algorithm is a greedy heuristic that works with chains (ordered lists) 11*a34c753fSRafael Auler // of basic blocks. Initially all chains are isolated basic blocks. On every 12*a34c753fSRafael Auler // iteration, we pick a pair of chains whose merging yields the biggest increase 13*a34c753fSRafael Auler // in the ExtTSP value, which models how i-cache "friendly" a specific chain is. 14*a34c753fSRafael Auler // A pair of chains giving the maximum gain is merged into a new chain. The 15*a34c753fSRafael Auler // procedure stops when there is only one chain left, or when merging does not 16*a34c753fSRafael Auler // increase ExtTSP. In the latter case, the remaining chains are sorted by 17*a34c753fSRafael Auler // density in decreasing order. 18*a34c753fSRafael Auler // 19*a34c753fSRafael Auler // An important aspect is the way two chains are merged. Unlike earlier 20*a34c753fSRafael Auler // algorithms (e.g., OptimizeCacheReorderAlgorithm or Pettis-Hansen), two 21*a34c753fSRafael Auler // chains, X and Y, are first split into three, X1, X2, and Y. Then we 22*a34c753fSRafael Auler // consider all possible ways of gluing the three chains (e.g., X1YX2, X1X2Y, 23*a34c753fSRafael Auler // X2X1Y, X2YX1, YX1X2, YX2X1) and choose the one producing the largest score. 24*a34c753fSRafael Auler // This improves the quality of the final result (the search space is larger) 25*a34c753fSRafael Auler // while keeping the implementation sufficiently fast. 26*a34c753fSRafael Auler // 27*a34c753fSRafael Auler // Reference: 28*a34c753fSRafael Auler // * A. Newell and S. Pupyrev, Improved Basic Block Reordering, 29*a34c753fSRafael Auler // IEEE Transactions on Computers, 2020 30*a34c753fSRafael Auler // https://arxiv.org/abs/1809.04676 31*a34c753fSRafael Auler //===----------------------------------------------------------------------===// 32*a34c753fSRafael Auler #include "bolt/Core/BinaryBasicBlock.h" 33*a34c753fSRafael Auler #include "bolt/Core/BinaryFunction.h" 34*a34c753fSRafael Auler #include "bolt/Passes/ReorderAlgorithm.h" 35*a34c753fSRafael Auler #include "llvm/Support/CommandLine.h" 36*a34c753fSRafael Auler 37*a34c753fSRafael Auler using namespace llvm; 38*a34c753fSRafael Auler using namespace bolt; 39*a34c753fSRafael Auler namespace opts { 40*a34c753fSRafael Auler 41*a34c753fSRafael Auler extern cl::OptionCategory BoltOptCategory; 42*a34c753fSRafael Auler extern cl::opt<bool> NoThreads; 43*a34c753fSRafael Auler 44*a34c753fSRafael Auler cl::opt<unsigned> 45*a34c753fSRafael Auler ChainSplitThreshold("chain-split-threshold", 46*a34c753fSRafael Auler cl::desc("The maximum size of a chain to apply splitting"), 47*a34c753fSRafael Auler cl::init(128), 48*a34c753fSRafael Auler cl::ReallyHidden, 49*a34c753fSRafael Auler cl::ZeroOrMore, 50*a34c753fSRafael Auler cl::cat(BoltOptCategory)); 51*a34c753fSRafael Auler 52*a34c753fSRafael Auler cl::opt<double> 53*a34c753fSRafael Auler ForwardWeight("forward-weight", 54*a34c753fSRafael Auler cl::desc("The weight of forward jumps for ExtTSP value"), 55*a34c753fSRafael Auler cl::init(0.1), 56*a34c753fSRafael Auler cl::ReallyHidden, 57*a34c753fSRafael Auler cl::ZeroOrMore, 58*a34c753fSRafael Auler cl::cat(BoltOptCategory)); 59*a34c753fSRafael Auler 60*a34c753fSRafael Auler cl::opt<double> 61*a34c753fSRafael Auler BackwardWeight("backward-weight", 62*a34c753fSRafael Auler cl::desc("The weight of backward jumps for ExtTSP value"), 63*a34c753fSRafael Auler cl::init(0.1), 64*a34c753fSRafael Auler cl::ReallyHidden, 65*a34c753fSRafael Auler cl::ZeroOrMore, 66*a34c753fSRafael Auler cl::cat(BoltOptCategory)); 67*a34c753fSRafael Auler 68*a34c753fSRafael Auler cl::opt<unsigned> 69*a34c753fSRafael Auler ForwardDistance("forward-distance", 70*a34c753fSRafael Auler cl::desc("The maximum distance (in bytes) of forward jumps for ExtTSP value"), 71*a34c753fSRafael Auler cl::init(1024), 72*a34c753fSRafael Auler cl::ReallyHidden, 73*a34c753fSRafael Auler cl::ZeroOrMore, 74*a34c753fSRafael Auler cl::cat(BoltOptCategory)); 75*a34c753fSRafael Auler 76*a34c753fSRafael Auler cl::opt<unsigned> 77*a34c753fSRafael Auler BackwardDistance("backward-distance", 78*a34c753fSRafael Auler cl::desc("The maximum distance (in bytes) of backward jumps for ExtTSP value"), 79*a34c753fSRafael Auler cl::init(640), 80*a34c753fSRafael Auler cl::ReallyHidden, 81*a34c753fSRafael Auler cl::ZeroOrMore, 82*a34c753fSRafael Auler cl::cat(BoltOptCategory)); 83*a34c753fSRafael Auler 84*a34c753fSRafael Auler } 85*a34c753fSRafael Auler 86*a34c753fSRafael Auler namespace llvm { 87*a34c753fSRafael Auler namespace bolt { 88*a34c753fSRafael Auler 89*a34c753fSRafael Auler // Epsilon for comparison of doubles 90*a34c753fSRafael Auler constexpr double EPS = 1e-8; 91*a34c753fSRafael Auler 92*a34c753fSRafael Auler class Block; 93*a34c753fSRafael Auler class Chain; 94*a34c753fSRafael Auler class Edge; 95*a34c753fSRafael Auler 96*a34c753fSRafael Auler // Calculate Ext-TSP value, which quantifies the expected number of i-cache 97*a34c753fSRafael Auler // misses for a given ordering of basic blocks 98*a34c753fSRafael Auler double extTSPScore(uint64_t SrcAddr, 99*a34c753fSRafael Auler uint64_t SrcSize, 100*a34c753fSRafael Auler uint64_t DstAddr, 101*a34c753fSRafael Auler uint64_t Count) { 102*a34c753fSRafael Auler assert(Count != BinaryBasicBlock::COUNT_NO_PROFILE); 103*a34c753fSRafael Auler 104*a34c753fSRafael Auler // Fallthrough 105*a34c753fSRafael Auler if (SrcAddr + SrcSize == DstAddr) { 106*a34c753fSRafael Auler // Assume that FallthroughWeight = 1.0 after normalization 107*a34c753fSRafael Auler return static_cast<double>(Count); 108*a34c753fSRafael Auler } 109*a34c753fSRafael Auler // Forward 110*a34c753fSRafael Auler if (SrcAddr + SrcSize < DstAddr) { 111*a34c753fSRafael Auler const uint64_t Dist = DstAddr - (SrcAddr + SrcSize); 112*a34c753fSRafael Auler if (Dist <= opts::ForwardDistance) { 113*a34c753fSRafael Auler double Prob = 1.0 - static_cast<double>(Dist) / opts::ForwardDistance; 114*a34c753fSRafael Auler return opts::ForwardWeight * Prob * Count; 115*a34c753fSRafael Auler } 116*a34c753fSRafael Auler return 0; 117*a34c753fSRafael Auler } 118*a34c753fSRafael Auler // Backward 119*a34c753fSRafael Auler const uint64_t Dist = SrcAddr + SrcSize - DstAddr; 120*a34c753fSRafael Auler if (Dist <= opts::BackwardDistance) { 121*a34c753fSRafael Auler double Prob = 1.0 - static_cast<double>(Dist) / opts::BackwardDistance; 122*a34c753fSRafael Auler return opts::BackwardWeight * Prob * Count; 123*a34c753fSRafael Auler } 124*a34c753fSRafael Auler return 0; 125*a34c753fSRafael Auler } 126*a34c753fSRafael Auler 127*a34c753fSRafael Auler using BlockPair = std::pair<Block *, Block *>; 128*a34c753fSRafael Auler using JumpList = std::vector<std::pair<BlockPair, uint64_t>>; 129*a34c753fSRafael Auler using BlockIter = std::vector<Block *>::const_iterator; 130*a34c753fSRafael Auler 131*a34c753fSRafael Auler enum MergeTypeTy { 132*a34c753fSRafael Auler X_Y = 0, 133*a34c753fSRafael Auler X1_Y_X2 = 1, 134*a34c753fSRafael Auler Y_X2_X1 = 2, 135*a34c753fSRafael Auler X2_X1_Y = 3, 136*a34c753fSRafael Auler }; 137*a34c753fSRafael Auler 138*a34c753fSRafael Auler class MergeGainTy { 139*a34c753fSRafael Auler public: 140*a34c753fSRafael Auler explicit MergeGainTy() {} 141*a34c753fSRafael Auler explicit MergeGainTy(double Score, size_t MergeOffset, MergeTypeTy MergeType) 142*a34c753fSRafael Auler : Score(Score), 143*a34c753fSRafael Auler MergeOffset(MergeOffset), 144*a34c753fSRafael Auler MergeType(MergeType) {} 145*a34c753fSRafael Auler 146*a34c753fSRafael Auler double score() const { 147*a34c753fSRafael Auler return Score; 148*a34c753fSRafael Auler } 149*a34c753fSRafael Auler 150*a34c753fSRafael Auler size_t mergeOffset() const { 151*a34c753fSRafael Auler return MergeOffset; 152*a34c753fSRafael Auler } 153*a34c753fSRafael Auler 154*a34c753fSRafael Auler MergeTypeTy mergeType() const { 155*a34c753fSRafael Auler return MergeType; 156*a34c753fSRafael Auler } 157*a34c753fSRafael Auler 158*a34c753fSRafael Auler // returns 'true' iff Other is preferred over this 159*a34c753fSRafael Auler bool operator < (const MergeGainTy& Other) const { 160*a34c753fSRafael Auler return (Other.Score > EPS && Other.Score > Score + EPS); 161*a34c753fSRafael Auler } 162*a34c753fSRafael Auler 163*a34c753fSRafael Auler private: 164*a34c753fSRafael Auler double Score{-1.0}; 165*a34c753fSRafael Auler size_t MergeOffset{0}; 166*a34c753fSRafael Auler MergeTypeTy MergeType{MergeTypeTy::X_Y}; 167*a34c753fSRafael Auler }; 168*a34c753fSRafael Auler 169*a34c753fSRafael Auler // A node in CFG corresponding to a BinaryBasicBlock. 170*a34c753fSRafael Auler // The class wraps several mutable fields utilized in the ExtTSP algorithm 171*a34c753fSRafael Auler class Block { 172*a34c753fSRafael Auler public: 173*a34c753fSRafael Auler Block(const Block&) = delete; 174*a34c753fSRafael Auler Block(Block&&) = default; 175*a34c753fSRafael Auler Block& operator=(const Block&) = delete; 176*a34c753fSRafael Auler Block& operator=(Block&&) = default; 177*a34c753fSRafael Auler 178*a34c753fSRafael Auler // Corresponding basic block 179*a34c753fSRafael Auler BinaryBasicBlock *BB{nullptr}; 180*a34c753fSRafael Auler // Current chain of the basic block 181*a34c753fSRafael Auler Chain *CurChain{nullptr}; 182*a34c753fSRafael Auler // (Estimated) size of the block in the binary 183*a34c753fSRafael Auler uint64_t Size{0}; 184*a34c753fSRafael Auler // Execution count of the block in the binary 185*a34c753fSRafael Auler uint64_t ExecutionCount{0}; 186*a34c753fSRafael Auler // An original index of the node in CFG 187*a34c753fSRafael Auler size_t Index{0}; 188*a34c753fSRafael Auler // The index of the block in the current chain 189*a34c753fSRafael Auler size_t CurIndex{0}; 190*a34c753fSRafael Auler // An offset of the block in the current chain 191*a34c753fSRafael Auler mutable uint64_t EstimatedAddr{0}; 192*a34c753fSRafael Auler // Fallthrough successor of the node in CFG 193*a34c753fSRafael Auler Block *FallthroughSucc{nullptr}; 194*a34c753fSRafael Auler // Fallthrough predecessor of the node in CFG 195*a34c753fSRafael Auler Block *FallthroughPred{nullptr}; 196*a34c753fSRafael Auler // Outgoing jumps from the block 197*a34c753fSRafael Auler std::vector<std::pair<Block *, uint64_t>> OutJumps; 198*a34c753fSRafael Auler // Incoming jumps to the block 199*a34c753fSRafael Auler std::vector<std::pair<Block *, uint64_t>> InJumps; 200*a34c753fSRafael Auler // Total execution count of incoming jumps 201*a34c753fSRafael Auler uint64_t InWeight{0}; 202*a34c753fSRafael Auler // Total execution count of outgoing jumps 203*a34c753fSRafael Auler uint64_t OutWeight{0}; 204*a34c753fSRafael Auler 205*a34c753fSRafael Auler public: 206*a34c753fSRafael Auler explicit Block(BinaryBasicBlock *BB_, uint64_t Size_) 207*a34c753fSRafael Auler : BB(BB_), 208*a34c753fSRafael Auler Size(Size_), 209*a34c753fSRafael Auler ExecutionCount(BB_->getKnownExecutionCount()), 210*a34c753fSRafael Auler Index(BB->getLayoutIndex()) {} 211*a34c753fSRafael Auler 212*a34c753fSRafael Auler bool adjacent(const Block *Other) const { 213*a34c753fSRafael Auler return hasOutJump(Other) || hasInJump(Other); 214*a34c753fSRafael Auler } 215*a34c753fSRafael Auler 216*a34c753fSRafael Auler bool hasOutJump(const Block *Other) const { 217*a34c753fSRafael Auler for (std::pair<Block *, uint64_t> Jump : OutJumps) { 218*a34c753fSRafael Auler if (Jump.first == Other) 219*a34c753fSRafael Auler return true; 220*a34c753fSRafael Auler } 221*a34c753fSRafael Auler return false; 222*a34c753fSRafael Auler } 223*a34c753fSRafael Auler 224*a34c753fSRafael Auler bool hasInJump(const Block *Other) const { 225*a34c753fSRafael Auler for (std::pair<Block *, uint64_t> Jump : InJumps) { 226*a34c753fSRafael Auler if (Jump.first == Other) 227*a34c753fSRafael Auler return true; 228*a34c753fSRafael Auler } 229*a34c753fSRafael Auler return false; 230*a34c753fSRafael Auler } 231*a34c753fSRafael Auler }; 232*a34c753fSRafael Auler 233*a34c753fSRafael Auler // A chain (ordered sequence) of CFG nodes (basic blocks) 234*a34c753fSRafael Auler class Chain { 235*a34c753fSRafael Auler public: 236*a34c753fSRafael Auler Chain(const Chain&) = delete; 237*a34c753fSRafael Auler Chain(Chain&&) = default; 238*a34c753fSRafael Auler Chain& operator=(const Chain&) = delete; 239*a34c753fSRafael Auler Chain& operator=(Chain&&) = default; 240*a34c753fSRafael Auler 241*a34c753fSRafael Auler explicit Chain(size_t Id, Block *Block) 242*a34c753fSRafael Auler : Id(Id), 243*a34c753fSRafael Auler IsEntry(Block->Index == 0), 244*a34c753fSRafael Auler ExecutionCount(Block->ExecutionCount), 245*a34c753fSRafael Auler Size(Block->Size), 246*a34c753fSRafael Auler Score(0), 247*a34c753fSRafael Auler Blocks(1, Block) {} 248*a34c753fSRafael Auler 249*a34c753fSRafael Auler size_t id() const { 250*a34c753fSRafael Auler return Id; 251*a34c753fSRafael Auler } 252*a34c753fSRafael Auler 253*a34c753fSRafael Auler uint64_t size() const { 254*a34c753fSRafael Auler return Size; 255*a34c753fSRafael Auler } 256*a34c753fSRafael Auler 257*a34c753fSRafael Auler double density() const { 258*a34c753fSRafael Auler return static_cast<double>(ExecutionCount) / Size; 259*a34c753fSRafael Auler } 260*a34c753fSRafael Auler 261*a34c753fSRafael Auler uint64_t executionCount() const { 262*a34c753fSRafael Auler return ExecutionCount; 263*a34c753fSRafael Auler } 264*a34c753fSRafael Auler 265*a34c753fSRafael Auler bool isEntryPoint() const { 266*a34c753fSRafael Auler return IsEntry; 267*a34c753fSRafael Auler } 268*a34c753fSRafael Auler 269*a34c753fSRafael Auler double score() const { 270*a34c753fSRafael Auler return Score; 271*a34c753fSRafael Auler } 272*a34c753fSRafael Auler 273*a34c753fSRafael Auler void setScore(double NewScore) { 274*a34c753fSRafael Auler Score = NewScore; 275*a34c753fSRafael Auler } 276*a34c753fSRafael Auler 277*a34c753fSRafael Auler const std::vector<Block *> &blocks() const { 278*a34c753fSRafael Auler return Blocks; 279*a34c753fSRafael Auler } 280*a34c753fSRafael Auler 281*a34c753fSRafael Auler const std::vector<std::pair<Chain *, Edge *>> &edges() const { 282*a34c753fSRafael Auler return Edges; 283*a34c753fSRafael Auler } 284*a34c753fSRafael Auler 285*a34c753fSRafael Auler Edge *getEdge(Chain *Other) const { 286*a34c753fSRafael Auler for (std::pair<Chain *, Edge *> It : Edges) { 287*a34c753fSRafael Auler if (It.first == Other) 288*a34c753fSRafael Auler return It.second; 289*a34c753fSRafael Auler } 290*a34c753fSRafael Auler return nullptr; 291*a34c753fSRafael Auler } 292*a34c753fSRafael Auler 293*a34c753fSRafael Auler void removeEdge(Chain *Other) { 294*a34c753fSRafael Auler auto It = Edges.begin(); 295*a34c753fSRafael Auler while (It != Edges.end()) { 296*a34c753fSRafael Auler if (It->first == Other) { 297*a34c753fSRafael Auler Edges.erase(It); 298*a34c753fSRafael Auler return; 299*a34c753fSRafael Auler } 300*a34c753fSRafael Auler It++; 301*a34c753fSRafael Auler } 302*a34c753fSRafael Auler } 303*a34c753fSRafael Auler 304*a34c753fSRafael Auler void addEdge(Chain *Other, Edge *Edge) { Edges.emplace_back(Other, Edge); } 305*a34c753fSRafael Auler 306*a34c753fSRafael Auler void merge(Chain *Other, const std::vector<Block *> &MergedBlocks) { 307*a34c753fSRafael Auler Blocks = MergedBlocks; 308*a34c753fSRafael Auler IsEntry |= Other->IsEntry; 309*a34c753fSRafael Auler ExecutionCount += Other->ExecutionCount; 310*a34c753fSRafael Auler Size += Other->Size; 311*a34c753fSRafael Auler // Update block's chains 312*a34c753fSRafael Auler for (size_t Idx = 0; Idx < Blocks.size(); Idx++) { 313*a34c753fSRafael Auler Blocks[Idx]->CurChain = this; 314*a34c753fSRafael Auler Blocks[Idx]->CurIndex = Idx; 315*a34c753fSRafael Auler } 316*a34c753fSRafael Auler } 317*a34c753fSRafael Auler 318*a34c753fSRafael Auler void mergeEdges(Chain *Other); 319*a34c753fSRafael Auler 320*a34c753fSRafael Auler void clear() { 321*a34c753fSRafael Auler Blocks.clear(); 322*a34c753fSRafael Auler Edges.clear(); 323*a34c753fSRafael Auler } 324*a34c753fSRafael Auler 325*a34c753fSRafael Auler private: 326*a34c753fSRafael Auler size_t Id; 327*a34c753fSRafael Auler bool IsEntry; 328*a34c753fSRafael Auler uint64_t ExecutionCount; 329*a34c753fSRafael Auler uint64_t Size; 330*a34c753fSRafael Auler // Cached ext-tsp score for the chain 331*a34c753fSRafael Auler double Score; 332*a34c753fSRafael Auler // Blocks of the chain 333*a34c753fSRafael Auler std::vector<Block *> Blocks; 334*a34c753fSRafael Auler // Adjacent chains and corresponding edges (lists of jumps) 335*a34c753fSRafael Auler std::vector<std::pair<Chain *, Edge *>> Edges; 336*a34c753fSRafael Auler }; 337*a34c753fSRafael Auler 338*a34c753fSRafael Auler // An edge in CFG reprsenting jumps between chains of BinaryBasicBlocks. 339*a34c753fSRafael Auler // When blocks are merged into chains, the edges are combined too so that 340*a34c753fSRafael Auler // there is always at most one edge between a pair of chains 341*a34c753fSRafael Auler class Edge { 342*a34c753fSRafael Auler public: 343*a34c753fSRafael Auler Edge(const Edge&) = delete; 344*a34c753fSRafael Auler Edge(Edge&&) = default; 345*a34c753fSRafael Auler Edge& operator=(const Edge&) = delete; 346*a34c753fSRafael Auler Edge& operator=(Edge&&) = default; 347*a34c753fSRafael Auler 348*a34c753fSRafael Auler explicit Edge(Block *SrcBlock, Block *DstBlock, uint64_t EC) 349*a34c753fSRafael Auler : SrcChain(SrcBlock->CurChain), 350*a34c753fSRafael Auler DstChain(DstBlock->CurChain), 351*a34c753fSRafael Auler Jumps(1, std::make_pair(std::make_pair(SrcBlock, DstBlock), EC)) {} 352*a34c753fSRafael Auler 353*a34c753fSRafael Auler const JumpList &jumps() const { 354*a34c753fSRafael Auler return Jumps; 355*a34c753fSRafael Auler } 356*a34c753fSRafael Auler 357*a34c753fSRafael Auler void changeEndpoint(Chain *From, Chain *To) { 358*a34c753fSRafael Auler if (From == SrcChain) 359*a34c753fSRafael Auler SrcChain = To; 360*a34c753fSRafael Auler if (From == DstChain) 361*a34c753fSRafael Auler DstChain = To; 362*a34c753fSRafael Auler } 363*a34c753fSRafael Auler 364*a34c753fSRafael Auler void appendJump(Block *SrcBlock, Block *DstBlock, uint64_t EC) { 365*a34c753fSRafael Auler Jumps.emplace_back(std::make_pair(SrcBlock, DstBlock), EC); 366*a34c753fSRafael Auler } 367*a34c753fSRafael Auler 368*a34c753fSRafael Auler void moveJumps(Edge *Other) { 369*a34c753fSRafael Auler Jumps.insert(Jumps.end(), Other->Jumps.begin(), Other->Jumps.end()); 370*a34c753fSRafael Auler Other->Jumps.clear(); 371*a34c753fSRafael Auler } 372*a34c753fSRafael Auler 373*a34c753fSRafael Auler bool hasCachedMergeGain(Chain *Src, Chain *Dst) const { 374*a34c753fSRafael Auler return Src == SrcChain ? CacheValidForward : CacheValidBackward; 375*a34c753fSRafael Auler } 376*a34c753fSRafael Auler 377*a34c753fSRafael Auler MergeGainTy getCachedMergeGain(Chain *Src, Chain *Dst) const { 378*a34c753fSRafael Auler return Src == SrcChain ? CachedGainForward : CachedGainBackward; 379*a34c753fSRafael Auler } 380*a34c753fSRafael Auler 381*a34c753fSRafael Auler void setCachedMergeGain(Chain *Src, Chain *Dst, MergeGainTy MergeGain) { 382*a34c753fSRafael Auler if (Src == SrcChain) { 383*a34c753fSRafael Auler CachedGainForward = MergeGain; 384*a34c753fSRafael Auler CacheValidForward = true; 385*a34c753fSRafael Auler } else { 386*a34c753fSRafael Auler CachedGainBackward = MergeGain; 387*a34c753fSRafael Auler CacheValidBackward = true; 388*a34c753fSRafael Auler } 389*a34c753fSRafael Auler } 390*a34c753fSRafael Auler 391*a34c753fSRafael Auler void invalidateCache() { 392*a34c753fSRafael Auler CacheValidForward = false; 393*a34c753fSRafael Auler CacheValidBackward = false; 394*a34c753fSRafael Auler } 395*a34c753fSRafael Auler 396*a34c753fSRafael Auler private: 397*a34c753fSRafael Auler Chain *SrcChain{nullptr}; 398*a34c753fSRafael Auler Chain *DstChain{nullptr}; 399*a34c753fSRafael Auler // Original jumps in the binary with correspinding execution counts 400*a34c753fSRafael Auler JumpList Jumps; 401*a34c753fSRafael Auler // Cached ext-tsp value for merging the pair of chains 402*a34c753fSRafael Auler // Since the gain of merging (Src, Dst) and (Dst, Src) might be different, 403*a34c753fSRafael Auler // we store both values here 404*a34c753fSRafael Auler MergeGainTy CachedGainForward; 405*a34c753fSRafael Auler MergeGainTy CachedGainBackward; 406*a34c753fSRafael Auler // Whether the cached value must be recomputed 407*a34c753fSRafael Auler bool CacheValidForward{false}; 408*a34c753fSRafael Auler bool CacheValidBackward{false}; 409*a34c753fSRafael Auler }; 410*a34c753fSRafael Auler 411*a34c753fSRafael Auler void Chain::mergeEdges(Chain *Other) { 412*a34c753fSRafael Auler assert(this != Other && "cannot merge a chain with itself"); 413*a34c753fSRafael Auler 414*a34c753fSRafael Auler // Update edges adjacent to chain Other 415*a34c753fSRafael Auler for (auto EdgeIt : Other->Edges) { 416*a34c753fSRafael Auler Chain *const DstChain = EdgeIt.first; 417*a34c753fSRafael Auler Edge *const DstEdge = EdgeIt.second; 418*a34c753fSRafael Auler Chain *const TargetChain = DstChain == Other ? this : DstChain; 419*a34c753fSRafael Auler 420*a34c753fSRafael Auler // Find the corresponding edge in the current chain 421*a34c753fSRafael Auler Edge *curEdge = getEdge(TargetChain); 422*a34c753fSRafael Auler if (curEdge == nullptr) { 423*a34c753fSRafael Auler DstEdge->changeEndpoint(Other, this); 424*a34c753fSRafael Auler this->addEdge(TargetChain, DstEdge); 425*a34c753fSRafael Auler if (DstChain != this && DstChain != Other) { 426*a34c753fSRafael Auler DstChain->addEdge(this, DstEdge); 427*a34c753fSRafael Auler } 428*a34c753fSRafael Auler } else { 429*a34c753fSRafael Auler curEdge->moveJumps(DstEdge); 430*a34c753fSRafael Auler } 431*a34c753fSRafael Auler // Cleanup leftover edge 432*a34c753fSRafael Auler if (DstChain != Other) { 433*a34c753fSRafael Auler DstChain->removeEdge(Other); 434*a34c753fSRafael Auler } 435*a34c753fSRafael Auler } 436*a34c753fSRafael Auler } 437*a34c753fSRafael Auler 438*a34c753fSRafael Auler // A wrapper around three chains of basic blocks; it is used to avoid extra 439*a34c753fSRafael Auler // instantiation of the vectors. 440*a34c753fSRafael Auler class MergedChain { 441*a34c753fSRafael Auler public: 442*a34c753fSRafael Auler MergedChain(BlockIter Begin1, 443*a34c753fSRafael Auler BlockIter End1, 444*a34c753fSRafael Auler BlockIter Begin2 = BlockIter(), 445*a34c753fSRafael Auler BlockIter End2 = BlockIter(), 446*a34c753fSRafael Auler BlockIter Begin3 = BlockIter(), 447*a34c753fSRafael Auler BlockIter End3 = BlockIter()) 448*a34c753fSRafael Auler : Begin1(Begin1), 449*a34c753fSRafael Auler End1(End1), 450*a34c753fSRafael Auler Begin2(Begin2), 451*a34c753fSRafael Auler End2(End2), 452*a34c753fSRafael Auler Begin3(Begin3), 453*a34c753fSRafael Auler End3(End3) {} 454*a34c753fSRafael Auler 455*a34c753fSRafael Auler template<typename F> 456*a34c753fSRafael Auler void forEach(const F &Func) const { 457*a34c753fSRafael Auler for (auto It = Begin1; It != End1; It++) 458*a34c753fSRafael Auler Func(*It); 459*a34c753fSRafael Auler for (auto It = Begin2; It != End2; It++) 460*a34c753fSRafael Auler Func(*It); 461*a34c753fSRafael Auler for (auto It = Begin3; It != End3; It++) 462*a34c753fSRafael Auler Func(*It); 463*a34c753fSRafael Auler } 464*a34c753fSRafael Auler 465*a34c753fSRafael Auler std::vector<Block *> getBlocks() const { 466*a34c753fSRafael Auler std::vector<Block *> Result; 467*a34c753fSRafael Auler Result.reserve(std::distance(Begin1, End1) + 468*a34c753fSRafael Auler std::distance(Begin2, End2) + 469*a34c753fSRafael Auler std::distance(Begin3, End3)); 470*a34c753fSRafael Auler Result.insert(Result.end(), Begin1, End1); 471*a34c753fSRafael Auler Result.insert(Result.end(), Begin2, End2); 472*a34c753fSRafael Auler Result.insert(Result.end(), Begin3, End3); 473*a34c753fSRafael Auler return Result; 474*a34c753fSRafael Auler } 475*a34c753fSRafael Auler 476*a34c753fSRafael Auler const Block *getFirstBlock() const { 477*a34c753fSRafael Auler return *Begin1; 478*a34c753fSRafael Auler } 479*a34c753fSRafael Auler 480*a34c753fSRafael Auler private: 481*a34c753fSRafael Auler BlockIter Begin1; 482*a34c753fSRafael Auler BlockIter End1; 483*a34c753fSRafael Auler BlockIter Begin2; 484*a34c753fSRafael Auler BlockIter End2; 485*a34c753fSRafael Auler BlockIter Begin3; 486*a34c753fSRafael Auler BlockIter End3; 487*a34c753fSRafael Auler }; 488*a34c753fSRafael Auler 489*a34c753fSRafael Auler /// Deterministically compare pairs of chains 490*a34c753fSRafael Auler bool compareChainPairs(const Chain *A1, const Chain *B1, 491*a34c753fSRafael Auler const Chain *A2, const Chain *B2) { 492*a34c753fSRafael Auler const uint64_t Samples1 = A1->executionCount() + B1->executionCount(); 493*a34c753fSRafael Auler const uint64_t Samples2 = A2->executionCount() + B2->executionCount(); 494*a34c753fSRafael Auler if (Samples1 != Samples2) 495*a34c753fSRafael Auler return Samples1 < Samples2; 496*a34c753fSRafael Auler 497*a34c753fSRafael Auler // Making the order deterministic 498*a34c753fSRafael Auler if (A1 != A2) 499*a34c753fSRafael Auler return A1->id() < A2->id(); 500*a34c753fSRafael Auler return B1->id() < B2->id(); 501*a34c753fSRafael Auler } 502*a34c753fSRafael Auler class ExtTSP { 503*a34c753fSRafael Auler public: 504*a34c753fSRafael Auler ExtTSP(const BinaryFunction &BF) : BF(BF) { 505*a34c753fSRafael Auler initialize(); 506*a34c753fSRafael Auler } 507*a34c753fSRafael Auler 508*a34c753fSRafael Auler /// Run the algorithm and return an ordering of basic block 509*a34c753fSRafael Auler void run(std::vector<BinaryBasicBlock *> &Order) { 510*a34c753fSRafael Auler // Pass 1: Merge blocks with their fallthrough successors 511*a34c753fSRafael Auler mergeFallthroughs(); 512*a34c753fSRafael Auler 513*a34c753fSRafael Auler // Pass 2: Merge pairs of chains while improving the ExtTSP objective 514*a34c753fSRafael Auler mergeChainPairs(); 515*a34c753fSRafael Auler 516*a34c753fSRafael Auler // Pass 3: Merge cold blocks to reduce code size 517*a34c753fSRafael Auler mergeColdChains(); 518*a34c753fSRafael Auler 519*a34c753fSRafael Auler // Collect blocks from all chains 520*a34c753fSRafael Auler concatChains(Order); 521*a34c753fSRafael Auler } 522*a34c753fSRafael Auler 523*a34c753fSRafael Auler private: 524*a34c753fSRafael Auler /// Initialize algorithm's data structures 525*a34c753fSRafael Auler void initialize() { 526*a34c753fSRafael Auler // Create a separate MCCodeEmitter to allow lock-free execution 527*a34c753fSRafael Auler BinaryContext::IndependentCodeEmitter Emitter; 528*a34c753fSRafael Auler if (!opts::NoThreads) { 529*a34c753fSRafael Auler Emitter = BF.getBinaryContext().createIndependentMCCodeEmitter(); 530*a34c753fSRafael Auler } 531*a34c753fSRafael Auler 532*a34c753fSRafael Auler // Initialize CFG nodes 533*a34c753fSRafael Auler AllBlocks.reserve(BF.layout_size()); 534*a34c753fSRafael Auler size_t LayoutIndex = 0; 535*a34c753fSRafael Auler for (BinaryBasicBlock *BB : BF.layout()) { 536*a34c753fSRafael Auler BB->setLayoutIndex(LayoutIndex++); 537*a34c753fSRafael Auler uint64_t Size = 538*a34c753fSRafael Auler std::max<uint64_t>(BB->estimateSize(Emitter.MCE.get()), 1); 539*a34c753fSRafael Auler AllBlocks.emplace_back(BB, Size); 540*a34c753fSRafael Auler } 541*a34c753fSRafael Auler 542*a34c753fSRafael Auler // Initialize edges for the blocks and compute their total in/out weights 543*a34c753fSRafael Auler size_t NumEdges = 0; 544*a34c753fSRafael Auler for (Block &Block : AllBlocks) { 545*a34c753fSRafael Auler auto BI = Block.BB->branch_info_begin(); 546*a34c753fSRafael Auler for (BinaryBasicBlock *SuccBB : Block.BB->successors()) { 547*a34c753fSRafael Auler assert(BI->Count != BinaryBasicBlock::COUNT_NO_PROFILE && 548*a34c753fSRafael Auler "missing profile for a jump"); 549*a34c753fSRafael Auler if (SuccBB != Block.BB && BI->Count > 0) { 550*a34c753fSRafael Auler class Block &SuccBlock = AllBlocks[SuccBB->getLayoutIndex()]; 551*a34c753fSRafael Auler uint64_t Count = BI->Count; 552*a34c753fSRafael Auler SuccBlock.InWeight += Count; 553*a34c753fSRafael Auler SuccBlock.InJumps.emplace_back(&Block, Count); 554*a34c753fSRafael Auler Block.OutWeight += Count; 555*a34c753fSRafael Auler Block.OutJumps.emplace_back(&SuccBlock, Count); 556*a34c753fSRafael Auler NumEdges++; 557*a34c753fSRafael Auler } 558*a34c753fSRafael Auler ++BI; 559*a34c753fSRafael Auler } 560*a34c753fSRafael Auler } 561*a34c753fSRafael Auler 562*a34c753fSRafael Auler // Initialize execution count for every basic block, which is the 563*a34c753fSRafael Auler // maximum over the sums of all in and out edge weights. 564*a34c753fSRafael Auler // Also execution count of the entry point is set to at least 1 565*a34c753fSRafael Auler for (Block &Block : AllBlocks) { 566*a34c753fSRafael Auler size_t Index = Block.Index; 567*a34c753fSRafael Auler Block.ExecutionCount = std::max(Block.ExecutionCount, Block.InWeight); 568*a34c753fSRafael Auler Block.ExecutionCount = std::max(Block.ExecutionCount, Block.OutWeight); 569*a34c753fSRafael Auler if (Index == 0 && Block.ExecutionCount == 0) 570*a34c753fSRafael Auler Block.ExecutionCount = 1; 571*a34c753fSRafael Auler } 572*a34c753fSRafael Auler 573*a34c753fSRafael Auler // Initialize chains 574*a34c753fSRafael Auler AllChains.reserve(BF.layout_size()); 575*a34c753fSRafael Auler HotChains.reserve(BF.layout_size()); 576*a34c753fSRafael Auler for (Block &Block : AllBlocks) { 577*a34c753fSRafael Auler AllChains.emplace_back(Block.Index, &Block); 578*a34c753fSRafael Auler Block.CurChain = &AllChains.back(); 579*a34c753fSRafael Auler if (Block.ExecutionCount > 0) { 580*a34c753fSRafael Auler HotChains.push_back(&AllChains.back()); 581*a34c753fSRafael Auler } 582*a34c753fSRafael Auler } 583*a34c753fSRafael Auler 584*a34c753fSRafael Auler // Initialize edges 585*a34c753fSRafael Auler AllEdges.reserve(NumEdges); 586*a34c753fSRafael Auler for (Block &Block : AllBlocks) { 587*a34c753fSRafael Auler for (std::pair<class Block *, uint64_t> &Jump : Block.OutJumps) { 588*a34c753fSRafael Auler class Block *const SuccBlock = Jump.first; 589*a34c753fSRafael Auler Edge *CurEdge = Block.CurChain->getEdge(SuccBlock->CurChain); 590*a34c753fSRafael Auler // this edge is already present in the graph 591*a34c753fSRafael Auler if (CurEdge != nullptr) { 592*a34c753fSRafael Auler assert(SuccBlock->CurChain->getEdge(Block.CurChain) != nullptr); 593*a34c753fSRafael Auler CurEdge->appendJump(&Block, SuccBlock, Jump.second); 594*a34c753fSRafael Auler continue; 595*a34c753fSRafael Auler } 596*a34c753fSRafael Auler // this is a new edge 597*a34c753fSRafael Auler AllEdges.emplace_back(&Block, SuccBlock, Jump.second); 598*a34c753fSRafael Auler Block.CurChain->addEdge(SuccBlock->CurChain, &AllEdges.back()); 599*a34c753fSRafael Auler SuccBlock->CurChain->addEdge(Block.CurChain, &AllEdges.back()); 600*a34c753fSRafael Auler } 601*a34c753fSRafael Auler } 602*a34c753fSRafael Auler assert(AllEdges.size() <= NumEdges && "Incorrect number of created edges"); 603*a34c753fSRafael Auler } 604*a34c753fSRafael Auler 605*a34c753fSRafael Auler /// For a pair of blocks, A and B, block B is the fallthrough successor of A, 606*a34c753fSRafael Auler /// if (i) all jumps (based on profile) from A goes to B and (ii) all jumps 607*a34c753fSRafael Auler /// to B are from A. Such blocks should be adjacent in an optimal ordering; 608*a34c753fSRafael Auler /// the method finds and merges such pairs of blocks 609*a34c753fSRafael Auler void mergeFallthroughs() { 610*a34c753fSRafael Auler // Find fallthroughs based on edge weights 611*a34c753fSRafael Auler for (Block &Block : AllBlocks) { 612*a34c753fSRafael Auler if (Block.BB->succ_size() == 1 && 613*a34c753fSRafael Auler Block.BB->getSuccessor()->pred_size() == 1 && 614*a34c753fSRafael Auler Block.BB->getSuccessor()->getLayoutIndex() != 0) { 615*a34c753fSRafael Auler size_t SuccIndex = Block.BB->getSuccessor()->getLayoutIndex(); 616*a34c753fSRafael Auler Block.FallthroughSucc = &AllBlocks[SuccIndex]; 617*a34c753fSRafael Auler AllBlocks[SuccIndex].FallthroughPred = &Block; 618*a34c753fSRafael Auler continue; 619*a34c753fSRafael Auler } 620*a34c753fSRafael Auler 621*a34c753fSRafael Auler if (Block.OutWeight == 0) 622*a34c753fSRafael Auler continue; 623*a34c753fSRafael Auler for (std::pair<class Block *, uint64_t> &Edge : Block.OutJumps) { 624*a34c753fSRafael Auler class Block *const SuccBlock = Edge.first; 625*a34c753fSRafael Auler // Successor cannot be the first BB, which is pinned 626*a34c753fSRafael Auler if (Block.OutWeight == Edge.second && 627*a34c753fSRafael Auler SuccBlock->InWeight == Edge.second && 628*a34c753fSRafael Auler SuccBlock->Index != 0) { 629*a34c753fSRafael Auler Block.FallthroughSucc = SuccBlock; 630*a34c753fSRafael Auler SuccBlock->FallthroughPred = &Block; 631*a34c753fSRafael Auler break; 632*a34c753fSRafael Auler } 633*a34c753fSRafael Auler } 634*a34c753fSRafael Auler } 635*a34c753fSRafael Auler 636*a34c753fSRafael Auler // There might be 'cycles' in the fallthrough dependencies (since profile 637*a34c753fSRafael Auler // data isn't 100% accurate). 638*a34c753fSRafael Auler // Break the cycles by choosing the block with smallest index as the tail 639*a34c753fSRafael Auler for (Block &Block : AllBlocks) { 640*a34c753fSRafael Auler if (Block.FallthroughSucc == nullptr || Block.FallthroughPred == nullptr) 641*a34c753fSRafael Auler continue; 642*a34c753fSRafael Auler 643*a34c753fSRafael Auler class Block *SuccBlock = Block.FallthroughSucc; 644*a34c753fSRafael Auler while (SuccBlock != nullptr && SuccBlock != &Block) { 645*a34c753fSRafael Auler SuccBlock = SuccBlock->FallthroughSucc; 646*a34c753fSRafael Auler } 647*a34c753fSRafael Auler if (SuccBlock == nullptr) 648*a34c753fSRafael Auler continue; 649*a34c753fSRafael Auler // break the cycle 650*a34c753fSRafael Auler AllBlocks[Block.FallthroughPred->Index].FallthroughSucc = nullptr; 651*a34c753fSRafael Auler Block.FallthroughPred = nullptr; 652*a34c753fSRafael Auler } 653*a34c753fSRafael Auler 654*a34c753fSRafael Auler // Merge blocks with their fallthrough successors 655*a34c753fSRafael Auler for (Block &Block : AllBlocks) { 656*a34c753fSRafael Auler if (Block.FallthroughPred == nullptr && 657*a34c753fSRafael Auler Block.FallthroughSucc != nullptr) { 658*a34c753fSRafael Auler class Block *CurBlock = &Block; 659*a34c753fSRafael Auler while (CurBlock->FallthroughSucc != nullptr) { 660*a34c753fSRafael Auler class Block *const NextBlock = CurBlock->FallthroughSucc; 661*a34c753fSRafael Auler mergeChains(Block.CurChain, NextBlock->CurChain, 0, MergeTypeTy::X_Y); 662*a34c753fSRafael Auler CurBlock = NextBlock; 663*a34c753fSRafael Auler } 664*a34c753fSRafael Auler } 665*a34c753fSRafael Auler } 666*a34c753fSRafael Auler } 667*a34c753fSRafael Auler 668*a34c753fSRafael Auler /// Merge pairs of chains while improving the ExtTSP objective 669*a34c753fSRafael Auler void mergeChainPairs() { 670*a34c753fSRafael Auler while (HotChains.size() > 1) { 671*a34c753fSRafael Auler Chain *BestChainPred = nullptr; 672*a34c753fSRafael Auler Chain *BestChainSucc = nullptr; 673*a34c753fSRafael Auler auto BestGain = MergeGainTy(); 674*a34c753fSRafael Auler // Iterate over all pairs of chains 675*a34c753fSRafael Auler for (Chain *ChainPred : HotChains) { 676*a34c753fSRafael Auler // Get candidates for merging with the current chain 677*a34c753fSRafael Auler for (auto EdgeIter : ChainPred->edges()) { 678*a34c753fSRafael Auler Chain *ChainSucc = EdgeIter.first; 679*a34c753fSRafael Auler Edge *ChainEdge = EdgeIter.second; 680*a34c753fSRafael Auler // Ignore loop edges 681*a34c753fSRafael Auler if (ChainPred == ChainSucc) 682*a34c753fSRafael Auler continue; 683*a34c753fSRafael Auler 684*a34c753fSRafael Auler // Compute the gain of merging the two chains 685*a34c753fSRafael Auler MergeGainTy CurGain = mergeGain(ChainPred, ChainSucc, ChainEdge); 686*a34c753fSRafael Auler if (CurGain.score() <= EPS) 687*a34c753fSRafael Auler continue; 688*a34c753fSRafael Auler 689*a34c753fSRafael Auler if (BestGain < CurGain || 690*a34c753fSRafael Auler (std::abs(CurGain.score() - BestGain.score()) < EPS && 691*a34c753fSRafael Auler compareChainPairs(ChainPred, 692*a34c753fSRafael Auler ChainSucc, 693*a34c753fSRafael Auler BestChainPred, 694*a34c753fSRafael Auler BestChainSucc))) { 695*a34c753fSRafael Auler BestGain = CurGain; 696*a34c753fSRafael Auler BestChainPred = ChainPred; 697*a34c753fSRafael Auler BestChainSucc = ChainSucc; 698*a34c753fSRafael Auler } 699*a34c753fSRafael Auler } 700*a34c753fSRafael Auler } 701*a34c753fSRafael Auler 702*a34c753fSRafael Auler // Stop merging when there is no improvement 703*a34c753fSRafael Auler if (BestGain.score() <= EPS) 704*a34c753fSRafael Auler break; 705*a34c753fSRafael Auler 706*a34c753fSRafael Auler // Merge the best pair of chains 707*a34c753fSRafael Auler mergeChains(BestChainPred, 708*a34c753fSRafael Auler BestChainSucc, 709*a34c753fSRafael Auler BestGain.mergeOffset(), 710*a34c753fSRafael Auler BestGain.mergeType()); 711*a34c753fSRafael Auler } 712*a34c753fSRafael Auler } 713*a34c753fSRafael Auler 714*a34c753fSRafael Auler /// Merge cold blocks to reduce code size 715*a34c753fSRafael Auler void mergeColdChains() { 716*a34c753fSRafael Auler for (BinaryBasicBlock *SrcBB : BF.layout()) { 717*a34c753fSRafael Auler // Iterating in reverse order to make sure original fallthrough jumps are 718*a34c753fSRafael Auler // merged first 719*a34c753fSRafael Auler for (auto Itr = SrcBB->succ_rbegin(); Itr != SrcBB->succ_rend(); ++Itr) { 720*a34c753fSRafael Auler BinaryBasicBlock *DstBB = *Itr; 721*a34c753fSRafael Auler size_t SrcIndex = SrcBB->getLayoutIndex(); 722*a34c753fSRafael Auler size_t DstIndex = DstBB->getLayoutIndex(); 723*a34c753fSRafael Auler Chain *SrcChain = AllBlocks[SrcIndex].CurChain; 724*a34c753fSRafael Auler Chain *DstChain = AllBlocks[DstIndex].CurChain; 725*a34c753fSRafael Auler if (SrcChain != DstChain && !DstChain->isEntryPoint() && 726*a34c753fSRafael Auler SrcChain->blocks().back()->Index == SrcIndex && 727*a34c753fSRafael Auler DstChain->blocks().front()->Index == DstIndex) { 728*a34c753fSRafael Auler mergeChains(SrcChain, DstChain, 0, MergeTypeTy::X_Y); 729*a34c753fSRafael Auler } 730*a34c753fSRafael Auler } 731*a34c753fSRafael Auler } 732*a34c753fSRafael Auler } 733*a34c753fSRafael Auler 734*a34c753fSRafael Auler /// Compute ExtTSP score for a given order of basic blocks 735*a34c753fSRafael Auler double score(const MergedChain &MergedBlocks, const JumpList &Jumps) const { 736*a34c753fSRafael Auler if (Jumps.empty()) 737*a34c753fSRafael Auler return 0.0; 738*a34c753fSRafael Auler uint64_t CurAddr = 0; 739*a34c753fSRafael Auler MergedBlocks.forEach( 740*a34c753fSRafael Auler [&](const Block *BB) { 741*a34c753fSRafael Auler BB->EstimatedAddr = CurAddr; 742*a34c753fSRafael Auler CurAddr += BB->Size; 743*a34c753fSRafael Auler } 744*a34c753fSRafael Auler ); 745*a34c753fSRafael Auler 746*a34c753fSRafael Auler double Score = 0; 747*a34c753fSRafael Auler for (const std::pair<std::pair<Block *, Block *>, uint64_t> &Jump : Jumps) { 748*a34c753fSRafael Auler const Block *SrcBlock = Jump.first.first; 749*a34c753fSRafael Auler const Block *DstBlock = Jump.first.second; 750*a34c753fSRafael Auler Score += extTSPScore(SrcBlock->EstimatedAddr, 751*a34c753fSRafael Auler SrcBlock->Size, 752*a34c753fSRafael Auler DstBlock->EstimatedAddr, 753*a34c753fSRafael Auler Jump.second); 754*a34c753fSRafael Auler } 755*a34c753fSRafael Auler return Score; 756*a34c753fSRafael Auler } 757*a34c753fSRafael Auler 758*a34c753fSRafael Auler /// Compute the gain of merging two chains 759*a34c753fSRafael Auler /// 760*a34c753fSRafael Auler /// The function considers all possible ways of merging two chains and 761*a34c753fSRafael Auler /// computes the one having the largest increase in ExtTSP objective. The 762*a34c753fSRafael Auler /// result is a pair with the first element being the gain and the second 763*a34c753fSRafael Auler /// element being the corresponding merging type. 764*a34c753fSRafael Auler MergeGainTy mergeGain(Chain *ChainPred, Chain *ChainSucc, Edge *Edge) const { 765*a34c753fSRafael Auler if (Edge->hasCachedMergeGain(ChainPred, ChainSucc)) { 766*a34c753fSRafael Auler return Edge->getCachedMergeGain(ChainPred, ChainSucc); 767*a34c753fSRafael Auler } 768*a34c753fSRafael Auler 769*a34c753fSRafael Auler // Precompute jumps between ChainPred and ChainSucc 770*a34c753fSRafael Auler JumpList Jumps = Edge->jumps(); 771*a34c753fSRafael Auler class Edge *EdgePP = ChainPred->getEdge(ChainPred); 772*a34c753fSRafael Auler if (EdgePP != nullptr) 773*a34c753fSRafael Auler Jumps.insert(Jumps.end(), EdgePP->jumps().begin(), EdgePP->jumps().end()); 774*a34c753fSRafael Auler assert(Jumps.size() > 0 && "trying to merge chains w/o jumps"); 775*a34c753fSRafael Auler 776*a34c753fSRafael Auler MergeGainTy Gain = MergeGainTy(); 777*a34c753fSRafael Auler // Try to concatenate two chains w/o splitting 778*a34c753fSRafael Auler Gain = computeMergeGain( 779*a34c753fSRafael Auler Gain, ChainPred, ChainSucc, Jumps, 0, MergeTypeTy::X_Y); 780*a34c753fSRafael Auler 781*a34c753fSRafael Auler // Try to break ChainPred in various ways and concatenate with ChainSucc 782*a34c753fSRafael Auler if (ChainPred->blocks().size() <= opts::ChainSplitThreshold) { 783*a34c753fSRafael Auler for (size_t Offset = 1; Offset < ChainPred->blocks().size(); Offset++) { 784*a34c753fSRafael Auler Block *BB1 = ChainPred->blocks()[Offset - 1]; 785*a34c753fSRafael Auler Block *BB2 = ChainPred->blocks()[Offset]; 786*a34c753fSRafael Auler // Does the splitting break FT successors? 787*a34c753fSRafael Auler if (BB1->FallthroughSucc != nullptr) { 788*a34c753fSRafael Auler (void)BB2; 789*a34c753fSRafael Auler assert(BB1->FallthroughSucc == BB2 && "Fallthrough not preserved"); 790*a34c753fSRafael Auler continue; 791*a34c753fSRafael Auler } 792*a34c753fSRafael Auler 793*a34c753fSRafael Auler Gain = computeMergeGain( 794*a34c753fSRafael Auler Gain, ChainPred, ChainSucc, Jumps, Offset, MergeTypeTy::X1_Y_X2); 795*a34c753fSRafael Auler Gain = computeMergeGain( 796*a34c753fSRafael Auler Gain, ChainPred, ChainSucc, Jumps, Offset, MergeTypeTy::Y_X2_X1); 797*a34c753fSRafael Auler Gain = computeMergeGain( 798*a34c753fSRafael Auler Gain, ChainPred, ChainSucc, Jumps, Offset, MergeTypeTy::X2_X1_Y); 799*a34c753fSRafael Auler } 800*a34c753fSRafael Auler } 801*a34c753fSRafael Auler 802*a34c753fSRafael Auler Edge->setCachedMergeGain(ChainPred, ChainSucc, Gain); 803*a34c753fSRafael Auler return Gain; 804*a34c753fSRafael Auler } 805*a34c753fSRafael Auler 806*a34c753fSRafael Auler /// Merge two chains and update the best Gain 807*a34c753fSRafael Auler MergeGainTy computeMergeGain(const MergeGainTy &CurGain, 808*a34c753fSRafael Auler const Chain *ChainPred, 809*a34c753fSRafael Auler const Chain *ChainSucc, 810*a34c753fSRafael Auler const JumpList &Jumps, 811*a34c753fSRafael Auler size_t MergeOffset, 812*a34c753fSRafael Auler MergeTypeTy MergeType) const { 813*a34c753fSRafael Auler MergedChain MergedBlocks = mergeBlocks( 814*a34c753fSRafael Auler ChainPred->blocks(), ChainSucc->blocks(), MergeOffset, MergeType); 815*a34c753fSRafael Auler 816*a34c753fSRafael Auler // Do not allow a merge that does not preserve the original entry block 817*a34c753fSRafael Auler if ((ChainPred->isEntryPoint() || ChainSucc->isEntryPoint()) && 818*a34c753fSRafael Auler MergedBlocks.getFirstBlock()->Index != 0) 819*a34c753fSRafael Auler return CurGain; 820*a34c753fSRafael Auler 821*a34c753fSRafael Auler // The gain for the new chain 822*a34c753fSRafael Auler const double NewScore = score(MergedBlocks, Jumps) - ChainPred->score(); 823*a34c753fSRafael Auler auto NewGain = MergeGainTy(NewScore, MergeOffset, MergeType); 824*a34c753fSRafael Auler return CurGain < NewGain ? NewGain : CurGain; 825*a34c753fSRafael Auler } 826*a34c753fSRafael Auler 827*a34c753fSRafael Auler /// Merge two chains of blocks respecting a given merge 'type' and 'offset' 828*a34c753fSRafael Auler /// 829*a34c753fSRafael Auler /// If MergeType == 0, then the result is a concatentation of two chains. 830*a34c753fSRafael Auler /// Otherwise, the first chain is cut into two sub-chains at the offset, 831*a34c753fSRafael Auler /// and merged using all possible ways of concatenating three chains. 832*a34c753fSRafael Auler MergedChain mergeBlocks(const std::vector<Block *> &X, 833*a34c753fSRafael Auler const std::vector<Block *> &Y, 834*a34c753fSRafael Auler size_t MergeOffset, 835*a34c753fSRafael Auler MergeTypeTy MergeType) const { 836*a34c753fSRafael Auler // Split the first chain, X, into X1 and X2 837*a34c753fSRafael Auler BlockIter BeginX1 = X.begin(); 838*a34c753fSRafael Auler BlockIter EndX1 = X.begin() + MergeOffset; 839*a34c753fSRafael Auler BlockIter BeginX2 = X.begin() + MergeOffset; 840*a34c753fSRafael Auler BlockIter EndX2 = X.end(); 841*a34c753fSRafael Auler BlockIter BeginY = Y.begin(); 842*a34c753fSRafael Auler BlockIter EndY = Y.end(); 843*a34c753fSRafael Auler 844*a34c753fSRafael Auler // Construct a new chain from the three existing ones 845*a34c753fSRafael Auler switch(MergeType) { 846*a34c753fSRafael Auler case MergeTypeTy::X_Y: 847*a34c753fSRafael Auler return MergedChain(BeginX1, EndX2, BeginY, EndY); 848*a34c753fSRafael Auler case MergeTypeTy::X1_Y_X2: 849*a34c753fSRafael Auler return MergedChain(BeginX1, EndX1, BeginY, EndY, BeginX2, EndX2); 850*a34c753fSRafael Auler case MergeTypeTy::Y_X2_X1: 851*a34c753fSRafael Auler return MergedChain(BeginY, EndY, BeginX2, EndX2, BeginX1, EndX1); 852*a34c753fSRafael Auler case MergeTypeTy::X2_X1_Y: 853*a34c753fSRafael Auler return MergedChain(BeginX2, EndX2, BeginX1, EndX1, BeginY, EndY); 854*a34c753fSRafael Auler } 855*a34c753fSRafael Auler 856*a34c753fSRafael Auler llvm_unreachable("unexpected merge type"); 857*a34c753fSRafael Auler } 858*a34c753fSRafael Auler 859*a34c753fSRafael Auler /// Merge chain From into chain Into, update the list of active chains, 860*a34c753fSRafael Auler /// adjacency information, and the corresponding cached values 861*a34c753fSRafael Auler void mergeChains(Chain *Into, 862*a34c753fSRafael Auler Chain *From, 863*a34c753fSRafael Auler size_t MergeOffset, 864*a34c753fSRafael Auler MergeTypeTy MergeType) { 865*a34c753fSRafael Auler assert(Into != From && "a chain cannot be merged with itself"); 866*a34c753fSRafael Auler 867*a34c753fSRafael Auler // Merge the blocks 868*a34c753fSRafael Auler MergedChain MergedBlocks = 869*a34c753fSRafael Auler mergeBlocks(Into->blocks(), From->blocks(), MergeOffset, MergeType); 870*a34c753fSRafael Auler Into->merge(From, MergedBlocks.getBlocks()); 871*a34c753fSRafael Auler Into->mergeEdges(From); 872*a34c753fSRafael Auler From->clear(); 873*a34c753fSRafael Auler 874*a34c753fSRafael Auler // Update cached ext-tsp score for the new chain 875*a34c753fSRafael Auler Edge *SelfEdge = Into->getEdge(Into); 876*a34c753fSRafael Auler if (SelfEdge != nullptr) { 877*a34c753fSRafael Auler MergedBlocks = MergedChain(Into->blocks().begin(), Into->blocks().end()); 878*a34c753fSRafael Auler Into->setScore(score(MergedBlocks, SelfEdge->jumps())); 879*a34c753fSRafael Auler } 880*a34c753fSRafael Auler 881*a34c753fSRafael Auler // Remove chain From from the list of active chains 882*a34c753fSRafael Auler auto Iter = std::remove(HotChains.begin(), HotChains.end(), From); 883*a34c753fSRafael Auler HotChains.erase(Iter, HotChains.end()); 884*a34c753fSRafael Auler 885*a34c753fSRafael Auler // Invalidate caches 886*a34c753fSRafael Auler for (std::pair<Chain *, Edge *> EdgeIter : Into->edges()) { 887*a34c753fSRafael Auler EdgeIter.second->invalidateCache(); 888*a34c753fSRafael Auler } 889*a34c753fSRafael Auler } 890*a34c753fSRafael Auler 891*a34c753fSRafael Auler /// Concatenate all chains into a final order 892*a34c753fSRafael Auler void concatChains(std::vector<BinaryBasicBlock *> &Order) { 893*a34c753fSRafael Auler // Collect chains 894*a34c753fSRafael Auler std::vector<Chain *> SortedChains; 895*a34c753fSRafael Auler for (Chain &Chain : AllChains) { 896*a34c753fSRafael Auler if (Chain.blocks().size() > 0) { 897*a34c753fSRafael Auler SortedChains.push_back(&Chain); 898*a34c753fSRafael Auler } 899*a34c753fSRafael Auler } 900*a34c753fSRafael Auler 901*a34c753fSRafael Auler // Sorting chains by density in decreasing order 902*a34c753fSRafael Auler std::stable_sort( 903*a34c753fSRafael Auler SortedChains.begin(), SortedChains.end(), 904*a34c753fSRafael Auler [](const Chain *C1, const Chain *C2) { 905*a34c753fSRafael Auler // Original entry point to the front 906*a34c753fSRafael Auler if (C1->isEntryPoint() != C2->isEntryPoint()) { 907*a34c753fSRafael Auler if (C1->isEntryPoint()) 908*a34c753fSRafael Auler return true; 909*a34c753fSRafael Auler if (C2->isEntryPoint()) 910*a34c753fSRafael Auler return false; 911*a34c753fSRafael Auler } 912*a34c753fSRafael Auler 913*a34c753fSRafael Auler const double D1 = C1->density(); 914*a34c753fSRafael Auler const double D2 = C2->density(); 915*a34c753fSRafael Auler if (D1 != D2) 916*a34c753fSRafael Auler return D1 > D2; 917*a34c753fSRafael Auler 918*a34c753fSRafael Auler // Making the order deterministic 919*a34c753fSRafael Auler return C1->id() < C2->id(); 920*a34c753fSRafael Auler } 921*a34c753fSRafael Auler ); 922*a34c753fSRafael Auler 923*a34c753fSRafael Auler // Collect the basic blocks in the order specified by their chains 924*a34c753fSRafael Auler Order.reserve(BF.layout_size()); 925*a34c753fSRafael Auler for (Chain *Chain : SortedChains) { 926*a34c753fSRafael Auler for (Block *Block : Chain->blocks()) { 927*a34c753fSRafael Auler Order.push_back(Block->BB); 928*a34c753fSRafael Auler } 929*a34c753fSRafael Auler } 930*a34c753fSRafael Auler } 931*a34c753fSRafael Auler 932*a34c753fSRafael Auler private: 933*a34c753fSRafael Auler // The binary function 934*a34c753fSRafael Auler const BinaryFunction &BF; 935*a34c753fSRafael Auler 936*a34c753fSRafael Auler // All CFG nodes (basic blocks) 937*a34c753fSRafael Auler std::vector<Block> AllBlocks; 938*a34c753fSRafael Auler 939*a34c753fSRafael Auler // All chains of blocks 940*a34c753fSRafael Auler std::vector<Chain> AllChains; 941*a34c753fSRafael Auler 942*a34c753fSRafael Auler // Active chains. The vector gets updated at runtime when chains are merged 943*a34c753fSRafael Auler std::vector<Chain *> HotChains; 944*a34c753fSRafael Auler 945*a34c753fSRafael Auler // All edges between chains 946*a34c753fSRafael Auler std::vector<Edge> AllEdges; 947*a34c753fSRafael Auler }; 948*a34c753fSRafael Auler 949*a34c753fSRafael Auler void ExtTSPReorderAlgorithm::reorderBasicBlocks( 950*a34c753fSRafael Auler const BinaryFunction &BF, BasicBlockOrder &Order) const { 951*a34c753fSRafael Auler if (BF.layout_empty()) 952*a34c753fSRafael Auler return; 953*a34c753fSRafael Auler 954*a34c753fSRafael Auler // Do not change layout of functions w/o profile information 955*a34c753fSRafael Auler if (!BF.hasValidProfile() || BF.layout_size() <= 2) { 956*a34c753fSRafael Auler for (BinaryBasicBlock *BB : BF.layout()) { 957*a34c753fSRafael Auler Order.push_back(BB); 958*a34c753fSRafael Auler } 959*a34c753fSRafael Auler return; 960*a34c753fSRafael Auler } 961*a34c753fSRafael Auler 962*a34c753fSRafael Auler // Apply the algorithm 963*a34c753fSRafael Auler ExtTSP(BF).run(Order); 964*a34c753fSRafael Auler 965*a34c753fSRafael Auler // Verify correctness 966*a34c753fSRafael Auler assert(Order[0]->isEntryPoint() && "Original entry point is not preserved"); 967*a34c753fSRafael Auler assert(Order.size() == BF.layout_size() && "Wrong size of reordered layout"); 968*a34c753fSRafael Auler } 969*a34c753fSRafael Auler 970*a34c753fSRafael Auler } // namespace bolt 971*a34c753fSRafael Auler } // namespace llvm 972