1 //===--- Passes/ReorderAlgorithm.cpp - Basic block reorderng algorithms ---===// 2 // 3 // Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. 4 // See https://llvm.org/LICENSE.txt for license information. 5 // SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception 6 // 7 //===----------------------------------------------------------------------===// 8 // 9 // Implements different basic block reordering algorithms. 10 // 11 //===----------------------------------------------------------------------===// 12 13 #include "bolt/Passes/ReorderAlgorithm.h" 14 #include "bolt/Core/BinaryBasicBlock.h" 15 #include "bolt/Core/BinaryFunction.h" 16 #include "llvm/Support/CommandLine.h" 17 #include <queue> 18 #include <random> 19 #include <stack> 20 21 #undef DEBUG_TYPE 22 #define DEBUG_TYPE "bolt" 23 24 using namespace llvm; 25 using namespace bolt; 26 27 namespace opts { 28 29 extern cl::OptionCategory BoltOptCategory; 30 extern cl::opt<bool> NoThreads; 31 32 static cl::opt<unsigned> ColdThreshold( 33 "cold-threshold", 34 cl::desc("tenths of percents of main entry frequency to use as a " 35 "threshold when evaluating whether a basic block is cold " 36 "(0 means it is only considered cold if the block has zero " 37 "samples). Default: 0 "), 38 cl::init(0), cl::ZeroOrMore, cl::Hidden, cl::cat(BoltOptCategory)); 39 40 static cl::opt<bool> 41 PrintClusters("print-clusters", 42 cl::desc("print clusters"), 43 cl::ZeroOrMore, 44 cl::Hidden, 45 cl::cat(BoltOptCategory)); 46 47 cl::opt<uint32_t> 48 RandomSeed("bolt-seed", 49 cl::desc("seed for randomization"), 50 cl::init(42), 51 cl::ZeroOrMore, 52 cl::Hidden, 53 cl::cat(BoltOptCategory)); 54 55 } // namespace opts 56 57 namespace { 58 59 template <class T> inline void hashCombine(size_t &Seed, const T &Val) { 60 std::hash<T> Hasher; 61 Seed ^= Hasher(Val) + 0x9e3779b9 + (Seed << 6) + (Seed >> 2); 62 } 63 64 template <typename A, typename B> struct HashPair { 65 size_t operator()(const std::pair<A, B> &Val) const { 66 std::hash<A> Hasher; 67 size_t Seed = Hasher(Val.first); 68 hashCombine(Seed, Val.second); 69 return Seed; 70 } 71 }; 72 73 } // namespace 74 75 void ClusterAlgorithm::computeClusterAverageFrequency(const BinaryContext &BC) { 76 // Create a separate MCCodeEmitter to allow lock-free execution 77 BinaryContext::IndependentCodeEmitter Emitter; 78 if (!opts::NoThreads) { 79 Emitter = BC.createIndependentMCCodeEmitter(); 80 } 81 82 AvgFreq.resize(Clusters.size(), 0.0); 83 for (uint32_t I = 0, E = Clusters.size(); I < E; ++I) { 84 double Freq = 0.0; 85 uint64_t ClusterSize = 0; 86 for (BinaryBasicBlock *BB : Clusters[I]) { 87 if (BB->getNumNonPseudos() > 0) { 88 Freq += BB->getExecutionCount(); 89 // Estimate the size of a block in bytes at run time 90 // NOTE: This might be inaccurate 91 ClusterSize += BB->estimateSize(Emitter.MCE.get()); 92 } 93 } 94 AvgFreq[I] = ClusterSize == 0 ? 0 : Freq / ClusterSize; 95 } 96 } 97 98 void ClusterAlgorithm::printClusters() const { 99 for (uint32_t I = 0, E = Clusters.size(); I < E; ++I) { 100 errs() << "Cluster number " << I; 101 if (AvgFreq.size() == Clusters.size()) 102 errs() << " (frequency: " << AvgFreq[I] << ")"; 103 errs() << " : "; 104 const char *Sep = ""; 105 for (BinaryBasicBlock *BB : Clusters[I]) { 106 errs() << Sep << BB->getName(); 107 Sep = ", "; 108 } 109 errs() << "\n"; 110 } 111 } 112 113 void ClusterAlgorithm::reset() { 114 Clusters.clear(); 115 ClusterEdges.clear(); 116 AvgFreq.clear(); 117 } 118 119 void GreedyClusterAlgorithm::EdgeTy::print(raw_ostream &OS) const { 120 OS << Src->getName() << " -> " << Dst->getName() << ", count: " << Count; 121 } 122 123 size_t GreedyClusterAlgorithm::EdgeHash::operator()(const EdgeTy &E) const { 124 HashPair<const BinaryBasicBlock *, const BinaryBasicBlock *> Hasher; 125 return Hasher(std::make_pair(E.Src, E.Dst)); 126 } 127 128 bool GreedyClusterAlgorithm::EdgeEqual::operator()(const EdgeTy &A, 129 const EdgeTy &B) const { 130 return A.Src == B.Src && A.Dst == B.Dst; 131 } 132 133 void GreedyClusterAlgorithm::clusterBasicBlocks(const BinaryFunction &BF, 134 bool ComputeEdges) { 135 reset(); 136 137 // Greedy heuristic implementation for the TSP, applied to BB layout. Try to 138 // maximize weight during a path traversing all BBs. In this way, we will 139 // convert the hottest branches into fall-throughs. 140 141 // This is the queue of edges from which we will pop edges and use them to 142 // cluster basic blocks in a greedy fashion. 143 std::vector<EdgeTy> Queue; 144 145 // Initialize inter-cluster weights. 146 if (ComputeEdges) 147 ClusterEdges.resize(BF.layout_size()); 148 149 // Initialize clusters and edge queue. 150 for (BinaryBasicBlock *BB : BF.layout()) { 151 // Create a cluster for this BB. 152 uint32_t I = Clusters.size(); 153 Clusters.emplace_back(); 154 std::vector<BinaryBasicBlock *> &Cluster = Clusters.back(); 155 Cluster.push_back(BB); 156 BBToClusterMap[BB] = I; 157 // Populate priority queue with edges. 158 auto BI = BB->branch_info_begin(); 159 for (BinaryBasicBlock *&I : BB->successors()) { 160 assert(BI->Count != BinaryBasicBlock::COUNT_NO_PROFILE && 161 "attempted reordering blocks of function with no profile data"); 162 Queue.emplace_back(EdgeTy(BB, I, BI->Count)); 163 ++BI; 164 } 165 } 166 // Sort and adjust the edge queue. 167 initQueue(Queue, BF); 168 169 // Grow clusters in a greedy fashion. 170 while (!Queue.empty()) { 171 EdgeTy E = Queue.back(); 172 Queue.pop_back(); 173 174 const BinaryBasicBlock *SrcBB = E.Src; 175 const BinaryBasicBlock *DstBB = E.Dst; 176 177 LLVM_DEBUG(dbgs() << "Popped edge "; E.print(dbgs()); dbgs() << "\n"); 178 179 // Case 1: BBSrc and BBDst are the same. Ignore this edge 180 if (SrcBB == DstBB || DstBB == *BF.layout_begin()) { 181 LLVM_DEBUG(dbgs() << "\tIgnored (same src, dst)\n"); 182 continue; 183 } 184 185 int I = BBToClusterMap[SrcBB]; 186 int J = BBToClusterMap[DstBB]; 187 188 // Case 2: If they are already allocated at the same cluster, just increase 189 // the weight of this cluster 190 if (I == J) { 191 if (ComputeEdges) 192 ClusterEdges[I][I] += E.Count; 193 LLVM_DEBUG(dbgs() << "\tIgnored (src, dst belong to the same cluster)\n"); 194 continue; 195 } 196 197 std::vector<BinaryBasicBlock *> &ClusterA = Clusters[I]; 198 std::vector<BinaryBasicBlock *> &ClusterB = Clusters[J]; 199 if (areClustersCompatible(ClusterA, ClusterB, E)) { 200 // Case 3: SrcBB is at the end of a cluster and DstBB is at the start, 201 // allowing us to merge two clusters. 202 for (BinaryBasicBlock *BB : ClusterB) 203 BBToClusterMap[BB] = I; 204 ClusterA.insert(ClusterA.end(), ClusterB.begin(), ClusterB.end()); 205 ClusterB.clear(); 206 if (ComputeEdges) { 207 // Increase the intra-cluster edge count of cluster A with the count of 208 // this edge as well as with the total count of previously visited edges 209 // from cluster B cluster A. 210 ClusterEdges[I][I] += E.Count; 211 ClusterEdges[I][I] += ClusterEdges[J][I]; 212 // Iterate through all inter-cluster edges and transfer edges targeting 213 // cluster B to cluster A. 214 for (uint32_t K = 0, E = ClusterEdges.size(); K != E; ++K) 215 ClusterEdges[K][I] += ClusterEdges[K][J]; 216 } 217 // Adjust the weights of the remaining edges and re-sort the queue. 218 adjustQueue(Queue, BF); 219 LLVM_DEBUG(dbgs() << "\tMerged clusters of src, dst\n"); 220 } else { 221 // Case 4: Both SrcBB and DstBB are allocated in positions we cannot 222 // merge them. Add the count of this edge to the inter-cluster edge count 223 // between clusters A and B to help us decide ordering between these 224 // clusters. 225 if (ComputeEdges) 226 ClusterEdges[I][J] += E.Count; 227 LLVM_DEBUG( 228 dbgs() << "\tIgnored (src, dst belong to incompatible clusters)\n"); 229 } 230 } 231 } 232 233 void GreedyClusterAlgorithm::reset() { 234 ClusterAlgorithm::reset(); 235 BBToClusterMap.clear(); 236 } 237 238 void PHGreedyClusterAlgorithm::initQueue(std::vector<EdgeTy> &Queue, 239 const BinaryFunction &BF) { 240 // Define a comparison function to establish SWO between edges. 241 auto Comp = [&BF](const EdgeTy &A, const EdgeTy &B) { 242 // With equal weights, prioritize branches with lower index 243 // source/destination. This helps to keep original block order for blocks 244 // when optimal order cannot be deducted from a profile. 245 if (A.Count == B.Count) { 246 const signed SrcOrder = BF.getOriginalLayoutRelativeOrder(A.Src, B.Src); 247 return (SrcOrder != 0) 248 ? SrcOrder > 0 249 : BF.getOriginalLayoutRelativeOrder(A.Dst, B.Dst) > 0; 250 } 251 return A.Count < B.Count; 252 }; 253 254 // Sort edges in increasing profile count order. 255 std::sort(Queue.begin(), Queue.end(), Comp); 256 } 257 258 void PHGreedyClusterAlgorithm::adjustQueue(std::vector<EdgeTy> &Queue, 259 const BinaryFunction &BF) { 260 // Nothing to do. 261 return; 262 } 263 264 bool PHGreedyClusterAlgorithm::areClustersCompatible(const ClusterTy &Front, 265 const ClusterTy &Back, 266 const EdgeTy &E) const { 267 return Front.back() == E.Src && Back.front() == E.Dst; 268 } 269 270 int64_t MinBranchGreedyClusterAlgorithm::calculateWeight( 271 const EdgeTy &E, const BinaryFunction &BF) const { 272 const BinaryBasicBlock *SrcBB = E.Src; 273 const BinaryBasicBlock *DstBB = E.Dst; 274 275 // Initial weight value. 276 int64_t W = (int64_t)E.Count; 277 278 // Adjust the weight by taking into account other edges with the same source. 279 auto BI = SrcBB->branch_info_begin(); 280 for (const BinaryBasicBlock *SuccBB : SrcBB->successors()) { 281 assert(BI->Count != BinaryBasicBlock::COUNT_NO_PROFILE && 282 "attempted reordering blocks of function with no profile data"); 283 assert(BI->Count <= std::numeric_limits<int64_t>::max() && 284 "overflow detected"); 285 // Ignore edges with same source and destination, edges that target the 286 // entry block as well as the edge E itself. 287 if (SuccBB != SrcBB && SuccBB != *BF.layout_begin() && SuccBB != DstBB) 288 W -= (int64_t)BI->Count; 289 ++BI; 290 } 291 292 // Adjust the weight by taking into account other edges with the same 293 // destination. 294 for (const BinaryBasicBlock *PredBB : DstBB->predecessors()) { 295 // Ignore edges with same source and destination as well as the edge E 296 // itself. 297 if (PredBB == DstBB || PredBB == SrcBB) 298 continue; 299 auto BI = PredBB->branch_info_begin(); 300 for (const BinaryBasicBlock *SuccBB : PredBB->successors()) { 301 if (SuccBB == DstBB) 302 break; 303 ++BI; 304 } 305 assert(BI != PredBB->branch_info_end() && "invalid control flow graph"); 306 assert(BI->Count != BinaryBasicBlock::COUNT_NO_PROFILE && 307 "attempted reordering blocks of function with no profile data"); 308 assert(BI->Count <= std::numeric_limits<int64_t>::max() && 309 "overflow detected"); 310 W -= (int64_t)BI->Count; 311 } 312 313 return W; 314 } 315 316 void MinBranchGreedyClusterAlgorithm::initQueue(std::vector<EdgeTy> &Queue, 317 const BinaryFunction &BF) { 318 // Initialize edge weights. 319 for (const EdgeTy &E : Queue) 320 Weight.emplace(std::make_pair(E, calculateWeight(E, BF))); 321 322 // Sort edges in increasing weight order. 323 adjustQueue(Queue, BF); 324 } 325 326 void MinBranchGreedyClusterAlgorithm::adjustQueue(std::vector<EdgeTy> &Queue, 327 const BinaryFunction &BF) { 328 // Define a comparison function to establish SWO between edges. 329 auto Comp = [&](const EdgeTy &A, const EdgeTy &B) { 330 // With equal weights, prioritize branches with lower index 331 // source/destination. This helps to keep original block order for blocks 332 // when optimal order cannot be deduced from a profile. 333 if (Weight[A] == Weight[B]) { 334 const signed SrcOrder = BF.getOriginalLayoutRelativeOrder(A.Src, B.Src); 335 return (SrcOrder != 0) 336 ? SrcOrder > 0 337 : BF.getOriginalLayoutRelativeOrder(A.Dst, B.Dst) > 0; 338 } 339 return Weight[A] < Weight[B]; 340 }; 341 342 // Iterate through all remaining edges to find edges that have their 343 // source and destination in the same cluster. 344 std::vector<EdgeTy> NewQueue; 345 for (const EdgeTy &E : Queue) { 346 const BinaryBasicBlock *SrcBB = E.Src; 347 const BinaryBasicBlock *DstBB = E.Dst; 348 349 // Case 1: SrcBB and DstBB are the same or DstBB is the entry block. Ignore 350 // this edge. 351 if (SrcBB == DstBB || DstBB == *BF.layout_begin()) { 352 LLVM_DEBUG(dbgs() << "\tAdjustment: Ignored edge "; E.print(dbgs()); 353 dbgs() << " (same src, dst)\n"); 354 continue; 355 } 356 357 int I = BBToClusterMap[SrcBB]; 358 int J = BBToClusterMap[DstBB]; 359 std::vector<BinaryBasicBlock *> &ClusterA = Clusters[I]; 360 std::vector<BinaryBasicBlock *> &ClusterB = Clusters[J]; 361 362 // Case 2: They are already allocated at the same cluster or incompatible 363 // clusters. Adjust the weights of edges with the same source or 364 // destination, so that this edge has no effect on them any more, and ignore 365 // this edge. Also increase the intra- (or inter-) cluster edge count. 366 if (I == J || !areClustersCompatible(ClusterA, ClusterB, E)) { 367 if (!ClusterEdges.empty()) 368 ClusterEdges[I][J] += E.Count; 369 LLVM_DEBUG(dbgs() << "\tAdjustment: Ignored edge "; E.print(dbgs()); 370 dbgs() << " (src, dst belong to same cluster or incompatible " 371 "clusters)\n"); 372 for (const BinaryBasicBlock *SuccBB : SrcBB->successors()) { 373 if (SuccBB == DstBB) 374 continue; 375 auto WI = Weight.find(EdgeTy(SrcBB, SuccBB, 0)); 376 assert(WI != Weight.end() && "CFG edge not found in Weight map"); 377 WI->second += (int64_t)E.Count; 378 } 379 for (const BinaryBasicBlock *PredBB : DstBB->predecessors()) { 380 if (PredBB == SrcBB) 381 continue; 382 auto WI = Weight.find(EdgeTy(PredBB, DstBB, 0)); 383 assert(WI != Weight.end() && "CFG edge not found in Weight map"); 384 WI->second += (int64_t)E.Count; 385 } 386 continue; 387 } 388 389 // Case 3: None of the previous cases is true, so just keep this edge in 390 // the queue. 391 NewQueue.emplace_back(E); 392 } 393 394 // Sort remaining edges in increasing weight order. 395 Queue.swap(NewQueue); 396 std::sort(Queue.begin(), Queue.end(), Comp); 397 } 398 399 bool MinBranchGreedyClusterAlgorithm::areClustersCompatible( 400 const ClusterTy &Front, const ClusterTy &Back, const EdgeTy &E) const { 401 return Front.back() == E.Src && Back.front() == E.Dst; 402 } 403 404 void MinBranchGreedyClusterAlgorithm::reset() { 405 GreedyClusterAlgorithm::reset(); 406 Weight.clear(); 407 } 408 409 void TSPReorderAlgorithm::reorderBasicBlocks(const BinaryFunction &BF, 410 BasicBlockOrder &Order) const { 411 std::vector<std::vector<uint64_t>> Weight; 412 std::vector<BinaryBasicBlock *> IndexToBB; 413 414 const size_t N = BF.layout_size(); 415 assert(N <= std::numeric_limits<uint64_t>::digits && 416 "cannot use TSP solution for sizes larger than bits in uint64_t"); 417 418 // Populating weight map and index map 419 for (BinaryBasicBlock *BB : BF.layout()) { 420 BB->setLayoutIndex(IndexToBB.size()); 421 IndexToBB.push_back(BB); 422 } 423 Weight.resize(N); 424 for (BinaryBasicBlock *BB : BF.layout()) { 425 auto BI = BB->branch_info_begin(); 426 Weight[BB->getLayoutIndex()].resize(N); 427 for (BinaryBasicBlock *SuccBB : BB->successors()) { 428 if (BI->Count != BinaryBasicBlock::COUNT_NO_PROFILE) 429 Weight[BB->getLayoutIndex()][SuccBB->getLayoutIndex()] = BI->Count; 430 ++BI; 431 } 432 } 433 434 std::vector<std::vector<int64_t>> DP; 435 DP.resize(1 << N); 436 for (std::vector<int64_t> &Elmt : DP) { 437 Elmt.resize(N, -1); 438 } 439 // Start with the entry basic block being allocated with cost zero 440 DP[1][0] = 0; 441 // Walk through TSP solutions using a bitmask to represent state (current set 442 // of BBs in the layout) 443 uint64_t BestSet = 1; 444 uint64_t BestLast = 0; 445 int64_t BestWeight = 0; 446 for (uint64_t Set = 1; Set < (1ULL << N); ++Set) { 447 // Traverse each possibility of Last BB visited in this layout 448 for (uint64_t Last = 0; Last < N; ++Last) { 449 // Case 1: There is no possible layout with this BB as Last 450 if (DP[Set][Last] == -1) 451 continue; 452 453 // Case 2: There is a layout with this Set and this Last, and we try 454 // to expand this set with New 455 for (uint64_t New = 1; New < N; ++New) { 456 // Case 2a: BB "New" is already in this Set 457 if ((Set & (1ULL << New)) != 0) 458 continue; 459 460 // Case 2b: BB "New" is not in this set and we add it to this Set and 461 // record total weight of this layout with "New" as the last BB. 462 uint64_t NewSet = (Set | (1ULL << New)); 463 if (DP[NewSet][New] == -1) 464 DP[NewSet][New] = DP[Set][Last] + (int64_t)Weight[Last][New]; 465 DP[NewSet][New] = std::max(DP[NewSet][New], 466 DP[Set][Last] + (int64_t)Weight[Last][New]); 467 468 if (DP[NewSet][New] > BestWeight) { 469 BestWeight = DP[NewSet][New]; 470 BestSet = NewSet; 471 BestLast = New; 472 } 473 } 474 } 475 } 476 477 // Define final function layout based on layout that maximizes weight 478 uint64_t Last = BestLast; 479 uint64_t Set = BestSet; 480 std::vector<bool> Visited; 481 Visited.resize(N); 482 Visited[Last] = true; 483 Order.push_back(IndexToBB[Last]); 484 Set = Set & ~(1ULL << Last); 485 while (Set != 0) { 486 int64_t Best = -1; 487 uint64_t NewLast; 488 for (uint64_t I = 0; I < N; ++I) { 489 if (DP[Set][I] == -1) 490 continue; 491 int64_t AdjWeight = Weight[I][Last] > 0 ? Weight[I][Last] : 0; 492 if (DP[Set][I] + AdjWeight > Best) { 493 NewLast = I; 494 Best = DP[Set][I] + AdjWeight; 495 } 496 } 497 Last = NewLast; 498 Visited[Last] = true; 499 Order.push_back(IndexToBB[Last]); 500 Set = Set & ~(1ULL << Last); 501 } 502 std::reverse(Order.begin(), Order.end()); 503 504 // Finalize layout with BBs that weren't assigned to the layout using the 505 // input layout. 506 for (BinaryBasicBlock *BB : BF.layout()) { 507 if (Visited[BB->getLayoutIndex()] == false) 508 Order.push_back(BB); 509 } 510 } 511 512 void OptimizeReorderAlgorithm::reorderBasicBlocks( 513 const BinaryFunction &BF, BasicBlockOrder &Order) const { 514 if (BF.layout_empty()) 515 return; 516 517 // Cluster basic blocks. 518 CAlgo->clusterBasicBlocks(BF); 519 520 if (opts::PrintClusters) 521 CAlgo->printClusters(); 522 523 // Arrange basic blocks according to clusters. 524 for (ClusterAlgorithm::ClusterTy &Cluster : CAlgo->Clusters) 525 Order.insert(Order.end(), Cluster.begin(), Cluster.end()); 526 } 527 528 void OptimizeBranchReorderAlgorithm::reorderBasicBlocks( 529 const BinaryFunction &BF, BasicBlockOrder &Order) const { 530 if (BF.layout_empty()) 531 return; 532 533 // Cluster basic blocks. 534 CAlgo->clusterBasicBlocks(BF, /* ComputeEdges = */ true); 535 std::vector<ClusterAlgorithm::ClusterTy> &Clusters = CAlgo->Clusters; 536 std::vector<std::unordered_map<uint32_t, uint64_t>> &ClusterEdges = 537 CAlgo->ClusterEdges; 538 539 // Compute clusters' average frequencies. 540 CAlgo->computeClusterAverageFrequency(BF.getBinaryContext()); 541 std::vector<double> &AvgFreq = CAlgo->AvgFreq; 542 543 if (opts::PrintClusters) 544 CAlgo->printClusters(); 545 546 // Cluster layout order 547 std::vector<uint32_t> ClusterOrder; 548 549 // Do a topological sort for clusters, prioritizing frequently-executed BBs 550 // during the traversal. 551 std::stack<uint32_t> Stack; 552 std::vector<uint32_t> Status; 553 std::vector<uint32_t> Parent; 554 Status.resize(Clusters.size(), 0); 555 Parent.resize(Clusters.size(), 0); 556 constexpr uint32_t STACKED = 1; 557 constexpr uint32_t VISITED = 2; 558 Status[0] = STACKED; 559 Stack.push(0); 560 while (!Stack.empty()) { 561 uint32_t I = Stack.top(); 562 if (!(Status[I] & VISITED)) { 563 Status[I] |= VISITED; 564 // Order successors by weight 565 auto ClusterComp = [&ClusterEdges, I](uint32_t A, uint32_t B) { 566 return ClusterEdges[I][A] > ClusterEdges[I][B]; 567 }; 568 std::priority_queue<uint32_t, std::vector<uint32_t>, 569 decltype(ClusterComp)> 570 SuccQueue(ClusterComp); 571 for (std::pair<const uint32_t, uint64_t> &Target : ClusterEdges[I]) { 572 if (Target.second > 0 && !(Status[Target.first] & STACKED) && 573 !Clusters[Target.first].empty()) { 574 Parent[Target.first] = I; 575 Status[Target.first] = STACKED; 576 SuccQueue.push(Target.first); 577 } 578 } 579 while (!SuccQueue.empty()) { 580 Stack.push(SuccQueue.top()); 581 SuccQueue.pop(); 582 } 583 continue; 584 } 585 // Already visited this node 586 Stack.pop(); 587 ClusterOrder.push_back(I); 588 } 589 std::reverse(ClusterOrder.begin(), ClusterOrder.end()); 590 // Put unreachable clusters at the end 591 for (uint32_t I = 0, E = Clusters.size(); I < E; ++I) 592 if (!(Status[I] & VISITED) && !Clusters[I].empty()) 593 ClusterOrder.push_back(I); 594 595 // Sort nodes with equal precedence 596 auto Beg = ClusterOrder.begin(); 597 // Don't reorder the first cluster, which contains the function entry point 598 ++Beg; 599 std::stable_sort(Beg, ClusterOrder.end(), 600 [&AvgFreq, &Parent](uint32_t A, uint32_t B) { 601 uint32_t P = Parent[A]; 602 while (Parent[P] != 0) { 603 if (Parent[P] == B) 604 return false; 605 P = Parent[P]; 606 } 607 P = Parent[B]; 608 while (Parent[P] != 0) { 609 if (Parent[P] == A) 610 return true; 611 P = Parent[P]; 612 } 613 return AvgFreq[A] > AvgFreq[B]; 614 }); 615 616 if (opts::PrintClusters) { 617 errs() << "New cluster order: "; 618 const char *Sep = ""; 619 for (uint32_t O : ClusterOrder) { 620 errs() << Sep << O; 621 Sep = ", "; 622 } 623 errs() << '\n'; 624 } 625 626 // Arrange basic blocks according to cluster order. 627 for (uint32_t ClusterIndex : ClusterOrder) { 628 ClusterAlgorithm::ClusterTy &Cluster = Clusters[ClusterIndex]; 629 Order.insert(Order.end(), Cluster.begin(), Cluster.end()); 630 } 631 } 632 633 void OptimizeCacheReorderAlgorithm::reorderBasicBlocks( 634 const BinaryFunction &BF, BasicBlockOrder &Order) const { 635 if (BF.layout_empty()) 636 return; 637 638 const uint64_t ColdThreshold = 639 opts::ColdThreshold * (*BF.layout_begin())->getExecutionCount() / 1000; 640 641 // Cluster basic blocks. 642 CAlgo->clusterBasicBlocks(BF); 643 std::vector<ClusterAlgorithm::ClusterTy> &Clusters = CAlgo->Clusters; 644 645 // Compute clusters' average frequencies. 646 CAlgo->computeClusterAverageFrequency(BF.getBinaryContext()); 647 std::vector<double> &AvgFreq = CAlgo->AvgFreq; 648 649 if (opts::PrintClusters) 650 CAlgo->printClusters(); 651 652 // Cluster layout order 653 std::vector<uint32_t> ClusterOrder; 654 655 // Order clusters based on average instruction execution frequency 656 for (uint32_t I = 0, E = Clusters.size(); I < E; ++I) 657 if (!Clusters[I].empty()) 658 ClusterOrder.push_back(I); 659 // Don't reorder the first cluster, which contains the function entry point 660 std::stable_sort( 661 std::next(ClusterOrder.begin()), ClusterOrder.end(), 662 [&AvgFreq](uint32_t A, uint32_t B) { return AvgFreq[A] > AvgFreq[B]; }); 663 664 if (opts::PrintClusters) { 665 errs() << "New cluster order: "; 666 const char *Sep = ""; 667 for (uint32_t O : ClusterOrder) { 668 errs() << Sep << O; 669 Sep = ", "; 670 } 671 errs() << '\n'; 672 } 673 674 // Arrange basic blocks according to cluster order. 675 for (uint32_t ClusterIndex : ClusterOrder) { 676 ClusterAlgorithm::ClusterTy &Cluster = Clusters[ClusterIndex]; 677 Order.insert(Order.end(), Cluster.begin(), Cluster.end()); 678 // Force zero execution count on clusters that do not meet the cut off 679 // specified by --cold-threshold. 680 if (AvgFreq[ClusterIndex] < static_cast<double>(ColdThreshold)) { 681 for (BinaryBasicBlock *BBPtr : Cluster) { 682 BBPtr->setExecutionCount(0); 683 } 684 } 685 } 686 } 687 688 void ReverseReorderAlgorithm::reorderBasicBlocks(const BinaryFunction &BF, 689 BasicBlockOrder &Order) const { 690 if (BF.layout_empty()) 691 return; 692 693 BinaryBasicBlock *FirstBB = *BF.layout_begin(); 694 Order.push_back(FirstBB); 695 for (auto RLI = BF.layout_rbegin(); *RLI != FirstBB; ++RLI) 696 Order.push_back(*RLI); 697 } 698 699 void RandomClusterReorderAlgorithm::reorderBasicBlocks( 700 const BinaryFunction &BF, BasicBlockOrder &Order) const { 701 if (BF.layout_empty()) 702 return; 703 704 // Cluster basic blocks. 705 CAlgo->clusterBasicBlocks(BF); 706 std::vector<ClusterAlgorithm::ClusterTy> &Clusters = CAlgo->Clusters; 707 708 if (opts::PrintClusters) 709 CAlgo->printClusters(); 710 711 // Cluster layout order 712 std::vector<uint32_t> ClusterOrder; 713 714 // Order clusters based on average instruction execution frequency 715 for (uint32_t I = 0, E = Clusters.size(); I < E; ++I) 716 if (!Clusters[I].empty()) 717 ClusterOrder.push_back(I); 718 719 std::shuffle(std::next(ClusterOrder.begin()), ClusterOrder.end(), 720 std::default_random_engine(opts::RandomSeed.getValue())); 721 722 if (opts::PrintClusters) { 723 errs() << "New cluster order: "; 724 const char *Sep = ""; 725 for (uint32_t O : ClusterOrder) { 726 errs() << Sep << O; 727 Sep = ", "; 728 } 729 errs() << '\n'; 730 } 731 732 // Arrange basic blocks according to cluster order. 733 for (uint32_t ClusterIndex : ClusterOrder) { 734 ClusterAlgorithm::ClusterTy &Cluster = Clusters[ClusterIndex]; 735 Order.insert(Order.end(), Cluster.begin(), Cluster.end()); 736 } 737 } 738