//===--- Passes/ReorderAlgorithm.cpp - Basic block reordering algorithms --===//
2 //
3 // Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
4 // See https://llvm.org/LICENSE.txt for license information.
5 // SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
6 //
7 //===----------------------------------------------------------------------===//
8 //
9 // Implements different basic block reordering algorithms.
10 //
11 //===----------------------------------------------------------------------===//
12 
13 #include "bolt/Passes/ReorderAlgorithm.h"
14 #include "bolt/Core/BinaryBasicBlock.h"
15 #include "bolt/Core/BinaryFunction.h"
16 #include "llvm/Support/CommandLine.h"
17 #include <queue>
18 #include <random>
19 #include <stack>
20 
21 #undef  DEBUG_TYPE
22 #define DEBUG_TYPE "bolt"
23 
24 using namespace llvm;
25 using namespace bolt;
26 
27 namespace opts {
28 
29 extern cl::OptionCategory BoltOptCategory;
30 extern cl::opt<bool> NoThreads;
31 
static cl::opt<unsigned> ColdThreshold(
    "cold-threshold",
    cl::desc("tenths of a percent of the main entry frequency to use as a "
             "threshold when evaluating whether a basic block is cold "
             "(0 means a block is considered cold only if it has zero "
             "samples). Default: 0"),
    cl::init(0), cl::ZeroOrMore, cl::Hidden, cl::cat(BoltOptCategory));
39 
40 static cl::opt<bool>
41 PrintClusters("print-clusters",
42   cl::desc("print clusters"),
43   cl::ZeroOrMore,
44   cl::Hidden,
45   cl::cat(BoltOptCategory));
46 
47 cl::opt<uint32_t>
48 RandomSeed("bolt-seed",
49   cl::desc("seed for randomization"),
50   cl::init(42),
51   cl::ZeroOrMore,
52   cl::Hidden,
53   cl::cat(BoltOptCategory));
54 
55 } // namespace opts
56 
57 namespace {
58 
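// Mix a value into an existing hash seed, following the boost::hash_combine
// recipe (0x9e3779b9 is the 32-bit golden-ratio constant).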
59 template <class T> inline void hashCombine(size_t &Seed, const T &Val) {
60   std::hash<T> Hasher;
61   Seed ^= Hasher(Val) + 0x9e3779b9 + (Seed << 6) + (Seed >> 2);
62 }
63 
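// Hash functor for std::pair that combines the hashes of both elements.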
64 template <typename A, typename B> struct HashPair {
65   size_t operator()(const std::pair<A, B> &Val) const {
66     std::hash<A> Hasher;
67     size_t Seed = Hasher(Val.first);
68     hashCombine(Seed, Val.second);
69     return Seed;
70   }
71 };
72 
73 } // namespace
74 
75 void ClusterAlgorithm::computeClusterAverageFrequency(const BinaryContext &BC) {
76   // Create a separate MCCodeEmitter to allow lock-free execution
77   BinaryContext::IndependentCodeEmitter Emitter;
78   if (!opts::NoThreads) {
79     Emitter = BC.createIndependentMCCodeEmitter();
80   }
81 
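  // The average frequency of a cluster is the sum of the execution counts of
  // its blocks containing real (non-pseudo) instructions divided by the
  // cluster's estimated code size in bytes, i.e. "heat per byte".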
82   AvgFreq.resize(Clusters.size(), 0.0);
83   for (uint32_t I = 0, E = Clusters.size(); I < E; ++I) {
84     double Freq = 0.0;
85     uint64_t ClusterSize = 0;
86     for (BinaryBasicBlock *BB : Clusters[I]) {
87       if (BB->getNumNonPseudos() > 0) {
88         Freq += BB->getExecutionCount();
89         // Estimate the size of a block in bytes at run time
90         // NOTE: This might be inaccurate
91         ClusterSize += BB->estimateSize(Emitter.MCE.get());
92       }
93     }
94     AvgFreq[I] = ClusterSize == 0 ? 0 : Freq / ClusterSize;
95   }
96 }
97 
98 void ClusterAlgorithm::printClusters() const {
99   for (uint32_t I = 0, E = Clusters.size(); I < E; ++I) {
100     errs() << "Cluster number " << I;
101     if (AvgFreq.size() == Clusters.size())
102       errs() << " (frequency: " << AvgFreq[I] << ")";
103     errs() << " : ";
104     const char *Sep = "";
105     for (BinaryBasicBlock *BB : Clusters[I]) {
106       errs() << Sep << BB->getName();
107       Sep = ", ";
108     }
109     errs() << "\n";
110   }
111 }
112 
113 void ClusterAlgorithm::reset() {
114   Clusters.clear();
115   ClusterEdges.clear();
116   AvgFreq.clear();
117 }
118 
119 void GreedyClusterAlgorithm::EdgeTy::print(raw_ostream &OS) const {
120   OS << Src->getName() << " -> " << Dst->getName() << ", count: " << Count;
121 }
122 
123 size_t GreedyClusterAlgorithm::EdgeHash::operator()(const EdgeTy &E) const {
124   HashPair<const BinaryBasicBlock *, const BinaryBasicBlock *> Hasher;
125   return Hasher(std::make_pair(E.Src, E.Dst));
126 }
127 
128 bool GreedyClusterAlgorithm::EdgeEqual::operator()(const EdgeTy &A,
129                                                    const EdgeTy &B) const {
130   return A.Src == B.Src && A.Dst == B.Dst;
131 }
132 
133 void GreedyClusterAlgorithm::clusterBasicBlocks(const BinaryFunction &BF,
134                                                 bool ComputeEdges) {
135   reset();
136 
  // Greedy heuristic implementation for the TSP, applied to BB layout. Try to
  // maximize the total edge weight along a path that traverses all BBs. This
  // way, the hottest branches are converted into fall-throughs.
140 
141   // This is the queue of edges from which we will pop edges and use them to
142   // cluster basic blocks in a greedy fashion.
143   std::vector<EdgeTy> Queue;
144 
145   // Initialize inter-cluster weights.
146   if (ComputeEdges)
147     ClusterEdges.resize(BF.layout_size());
148 
149   // Initialize clusters and edge queue.
150   for (BinaryBasicBlock *BB : BF.layout()) {
151     // Create a cluster for this BB.
152     uint32_t I = Clusters.size();
153     Clusters.emplace_back();
154     std::vector<BinaryBasicBlock *> &Cluster = Clusters.back();
155     Cluster.push_back(BB);
156     BBToClusterMap[BB] = I;
157     // Populate priority queue with edges.
158     auto BI = BB->branch_info_begin();
    for (BinaryBasicBlock *SuccBB : BB->successors()) {
      assert(BI->Count != BinaryBasicBlock::COUNT_NO_PROFILE &&
             "attempted reordering blocks of function with no profile data");
      Queue.emplace_back(EdgeTy(BB, SuccBB, BI->Count));
      ++BI;
    }
165   }
166   // Sort and adjust the edge queue.
167   initQueue(Queue, BF);
168 
169   // Grow clusters in a greedy fashion.
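  // The queue is kept sorted in increasing weight order, so the edge at the
  // back is always the most profitable one remaining.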
170   while (!Queue.empty()) {
171     EdgeTy E = Queue.back();
172     Queue.pop_back();
173 
174     const BinaryBasicBlock *SrcBB = E.Src;
175     const BinaryBasicBlock *DstBB = E.Dst;
176 
177     LLVM_DEBUG(dbgs() << "Popped edge "; E.print(dbgs()); dbgs() << "\n");
178 
    // Case 1: SrcBB and DstBB are the same, or DstBB is the entry block.
    // Ignore this edge.
    if (SrcBB == DstBB || DstBB == *BF.layout_begin()) {
      LLVM_DEBUG(dbgs() << "\tIgnored (same src, dst or dst is entry)\n");
182       continue;
183     }
184 
185     int I = BBToClusterMap[SrcBB];
186     int J = BBToClusterMap[DstBB];
187 
    // Case 2: SrcBB and DstBB already belong to the same cluster. Just
    // increase the intra-cluster edge count.
190     if (I == J) {
191       if (ComputeEdges)
192         ClusterEdges[I][I] += E.Count;
193       LLVM_DEBUG(dbgs() << "\tIgnored (src, dst belong to the same cluster)\n");
194       continue;
195     }
196 
197     std::vector<BinaryBasicBlock *> &ClusterA = Clusters[I];
198     std::vector<BinaryBasicBlock *> &ClusterB = Clusters[J];
199     if (areClustersCompatible(ClusterA, ClusterB, E)) {
200       // Case 3: SrcBB is at the end of a cluster and DstBB is at the start,
201       // allowing us to merge two clusters.
202       for (BinaryBasicBlock *BB : ClusterB)
203         BBToClusterMap[BB] = I;
204       ClusterA.insert(ClusterA.end(), ClusterB.begin(), ClusterB.end());
205       ClusterB.clear();
206       if (ComputeEdges) {
        // Increase the intra-cluster edge count of cluster A with the count of
        // this edge as well as with the total count of previously visited
        // edges from cluster B to cluster A.
210         ClusterEdges[I][I] += E.Count;
211         ClusterEdges[I][I] += ClusterEdges[J][I];
212         // Iterate through all inter-cluster edges and transfer edges targeting
213         // cluster B to cluster A.
214         for (uint32_t K = 0, E = ClusterEdges.size(); K != E; ++K)
215           ClusterEdges[K][I] += ClusterEdges[K][J];
216       }
217       // Adjust the weights of the remaining edges and re-sort the queue.
218       adjustQueue(Queue, BF);
219       LLVM_DEBUG(dbgs() << "\tMerged clusters of src, dst\n");
220     } else {
      // Case 4: SrcBB and DstBB are in positions that do not allow their
      // clusters to be merged. Add the count of this edge to the inter-cluster
      // edge count between clusters A and B to help decide the ordering
      // between these clusters later.
225       if (ComputeEdges)
226         ClusterEdges[I][J] += E.Count;
227       LLVM_DEBUG(
228           dbgs() << "\tIgnored (src, dst belong to incompatible clusters)\n");
229     }
230   }
231 }
232 
233 void GreedyClusterAlgorithm::reset() {
234   ClusterAlgorithm::reset();
235   BBToClusterMap.clear();
236 }
237 
238 void PHGreedyClusterAlgorithm::initQueue(std::vector<EdgeTy> &Queue,
239                                          const BinaryFunction &BF) {
  // Define a comparison function establishing a strict weak ordering (SWO)
  // between edges.
  auto Comp = [&BF](const EdgeTy &A, const EdgeTy &B) {
    // With equal counts, prioritize branches with lower index
    // source/destination. This helps to keep the original block order when
    // the optimal order cannot be deduced from a profile.
245     if (A.Count == B.Count) {
246       const signed SrcOrder = BF.getOriginalLayoutRelativeOrder(A.Src, B.Src);
247       return (SrcOrder != 0)
248                  ? SrcOrder > 0
249                  : BF.getOriginalLayoutRelativeOrder(A.Dst, B.Dst) > 0;
250     }
251     return A.Count < B.Count;
252   };
253 
254   // Sort edges in increasing profile count order.
255   std::sort(Queue.begin(), Queue.end(), Comp);
256 }
257 
258 void PHGreedyClusterAlgorithm::adjustQueue(std::vector<EdgeTy> &Queue,
259                                            const BinaryFunction &BF) {
260   // Nothing to do.
261   return;
262 }
263 
264 bool PHGreedyClusterAlgorithm::areClustersCompatible(const ClusterTy &Front,
265                                                      const ClusterTy &Back,
266                                                      const EdgeTy &E) const {
267   return Front.back() == E.Src && Back.front() == E.Dst;
268 }
269 
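// The weight of an edge is its own count minus the counts of the competing
// edges that share its source or its destination. Edges that dominate both of
// their endpoints thus get the highest weight and are merged first, which
// tends to minimize the number of taken branches.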
270 int64_t MinBranchGreedyClusterAlgorithm::calculateWeight(
271     const EdgeTy &E, const BinaryFunction &BF) const {
272   const BinaryBasicBlock *SrcBB = E.Src;
273   const BinaryBasicBlock *DstBB = E.Dst;
274 
275   // Initial weight value.
276   int64_t W = (int64_t)E.Count;
277 
278   // Adjust the weight by taking into account other edges with the same source.
279   auto BI = SrcBB->branch_info_begin();
280   for (const BinaryBasicBlock *SuccBB : SrcBB->successors()) {
281     assert(BI->Count != BinaryBasicBlock::COUNT_NO_PROFILE &&
282            "attempted reordering blocks of function with no profile data");
283     assert(BI->Count <= std::numeric_limits<int64_t>::max() &&
284            "overflow detected");
285     // Ignore edges with same source and destination, edges that target the
286     // entry block as well as the edge E itself.
287     if (SuccBB != SrcBB && SuccBB != *BF.layout_begin() && SuccBB != DstBB)
288       W -= (int64_t)BI->Count;
289     ++BI;
290   }
291 
292   // Adjust the weight by taking into account other edges with the same
293   // destination.
294   for (const BinaryBasicBlock *PredBB : DstBB->predecessors()) {
295     // Ignore edges with same source and destination as well as the edge E
296     // itself.
297     if (PredBB == DstBB || PredBB == SrcBB)
298       continue;
299     auto BI = PredBB->branch_info_begin();
300     for (const BinaryBasicBlock *SuccBB : PredBB->successors()) {
301       if (SuccBB == DstBB)
302         break;
303       ++BI;
304     }
305     assert(BI != PredBB->branch_info_end() && "invalid control flow graph");
306     assert(BI->Count != BinaryBasicBlock::COUNT_NO_PROFILE &&
307            "attempted reordering blocks of function with no profile data");
308     assert(BI->Count <= std::numeric_limits<int64_t>::max() &&
309            "overflow detected");
310     W -= (int64_t)BI->Count;
311   }
312 
313   return W;
314 }
315 
316 void MinBranchGreedyClusterAlgorithm::initQueue(std::vector<EdgeTy> &Queue,
317                                                 const BinaryFunction &BF) {
318   // Initialize edge weights.
319   for (const EdgeTy &E : Queue)
320     Weight.emplace(std::make_pair(E, calculateWeight(E, BF)));
321 
322   // Sort edges in increasing weight order.
323   adjustQueue(Queue, BF);
324 }
325 
326 void MinBranchGreedyClusterAlgorithm::adjustQueue(std::vector<EdgeTy> &Queue,
327                                                   const BinaryFunction &BF) {
  // Define a comparison function establishing a strict weak ordering (SWO)
  // between edges.
329   auto Comp = [&](const EdgeTy &A, const EdgeTy &B) {
330     // With equal weights, prioritize branches with lower index
331     // source/destination. This helps to keep original block order for blocks
332     // when optimal order cannot be deduced from a profile.
333     if (Weight[A] == Weight[B]) {
334       const signed SrcOrder = BF.getOriginalLayoutRelativeOrder(A.Src, B.Src);
335       return (SrcOrder != 0)
336                  ? SrcOrder > 0
337                  : BF.getOriginalLayoutRelativeOrder(A.Dst, B.Dst) > 0;
338     }
339     return Weight[A] < Weight[B];
340   };
341 
342   // Iterate through all remaining edges to find edges that have their
343   // source and destination in the same cluster.
344   std::vector<EdgeTy> NewQueue;
345   for (const EdgeTy &E : Queue) {
346     const BinaryBasicBlock *SrcBB = E.Src;
347     const BinaryBasicBlock *DstBB = E.Dst;
348 
349     // Case 1: SrcBB and DstBB are the same or DstBB is the entry block. Ignore
350     // this edge.
351     if (SrcBB == DstBB || DstBB == *BF.layout_begin()) {
352       LLVM_DEBUG(dbgs() << "\tAdjustment: Ignored edge "; E.print(dbgs());
353                  dbgs() << " (same src, dst)\n");
354       continue;
355     }
356 
357     int I = BBToClusterMap[SrcBB];
358     int J = BBToClusterMap[DstBB];
359     std::vector<BinaryBasicBlock *> &ClusterA = Clusters[I];
360     std::vector<BinaryBasicBlock *> &ClusterB = Clusters[J];
361 
    // Case 2: SrcBB and DstBB already belong to the same cluster, or to
    // incompatible clusters. Adjust the weights of edges with the same source
    // or destination so that this edge no longer affects them, then ignore
    // this edge. Also increase the intra- (or inter-) cluster edge count.
366     if (I == J || !areClustersCompatible(ClusterA, ClusterB, E)) {
367       if (!ClusterEdges.empty())
368         ClusterEdges[I][J] += E.Count;
369       LLVM_DEBUG(dbgs() << "\tAdjustment: Ignored edge "; E.print(dbgs());
370                  dbgs() << " (src, dst belong to same cluster or incompatible "
371                            "clusters)\n");
372       for (const BinaryBasicBlock *SuccBB : SrcBB->successors()) {
373         if (SuccBB == DstBB)
374           continue;
375         auto WI = Weight.find(EdgeTy(SrcBB, SuccBB, 0));
376         assert(WI != Weight.end() && "CFG edge not found in Weight map");
377         WI->second += (int64_t)E.Count;
378       }
379       for (const BinaryBasicBlock *PredBB : DstBB->predecessors()) {
380         if (PredBB == SrcBB)
381           continue;
382         auto WI = Weight.find(EdgeTy(PredBB, DstBB, 0));
383         assert(WI != Weight.end() && "CFG edge not found in Weight map");
384         WI->second += (int64_t)E.Count;
385       }
386       continue;
387     }
388 
389     // Case 3: None of the previous cases is true, so just keep this edge in
390     // the queue.
391     NewQueue.emplace_back(E);
392   }
393 
394   // Sort remaining edges in increasing weight order.
395   Queue.swap(NewQueue);
396   std::sort(Queue.begin(), Queue.end(), Comp);
397 }
398 
399 bool MinBranchGreedyClusterAlgorithm::areClustersCompatible(
400     const ClusterTy &Front, const ClusterTy &Back, const EdgeTy &E) const {
401   return Front.back() == E.Src && Back.front() == E.Dst;
402 }
403 
404 void MinBranchGreedyClusterAlgorithm::reset() {
405   GreedyClusterAlgorithm::reset();
406   Weight.clear();
407 }
408 
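// Solve the underlying maximum-weight path problem exactly with a bitmask
// dynamic program. The state space is exponential in the number of blocks, so
// this algorithm is only practical for functions with few basic blocks.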
409 void TSPReorderAlgorithm::reorderBasicBlocks(const BinaryFunction &BF,
410                                              BasicBlockOrder &Order) const {
411   std::vector<std::vector<uint64_t>> Weight;
412   std::vector<BinaryBasicBlock *> IndexToBB;
413 
414   const size_t N = BF.layout_size();
415   assert(N <= std::numeric_limits<uint64_t>::digits &&
416          "cannot use TSP solution for sizes larger than bits in uint64_t");
417 
  // Populate the weight matrix and the index-to-block map.
419   for (BinaryBasicBlock *BB : BF.layout()) {
420     BB->setLayoutIndex(IndexToBB.size());
421     IndexToBB.push_back(BB);
422   }
423   Weight.resize(N);
424   for (BinaryBasicBlock *BB : BF.layout()) {
425     auto BI = BB->branch_info_begin();
426     Weight[BB->getLayoutIndex()].resize(N);
427     for (BinaryBasicBlock *SuccBB : BB->successors()) {
428       if (BI->Count != BinaryBasicBlock::COUNT_NO_PROFILE)
429         Weight[BB->getLayoutIndex()][SuccBB->getLayoutIndex()] = BI->Count;
430       ++BI;
431     }
432   }
433 
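  // Held-Karp style dynamic programming over subsets of blocks: DP[Set][Last]
  // is the maximum accumulated edge weight of a path that visits exactly the
  // blocks in bitmask Set and ends at block Last, or -1 if no such path
  // exists.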
434   std::vector<std::vector<int64_t>> DP;
  DP.resize(1ULL << N);
436   for (std::vector<int64_t> &Elmt : DP) {
437     Elmt.resize(N, -1);
438   }
439   // Start with the entry basic block being allocated with cost zero
440   DP[1][0] = 0;
441   // Walk through TSP solutions using a bitmask to represent state (current set
442   // of BBs in the layout)
443   uint64_t BestSet = 1;
444   uint64_t BestLast = 0;
445   int64_t BestWeight = 0;
446   for (uint64_t Set = 1; Set < (1ULL << N); ++Set) {
447     // Traverse each possibility of Last BB visited in this layout
448     for (uint64_t Last = 0; Last < N; ++Last) {
449       // Case 1: There is no possible layout with this BB as Last
450       if (DP[Set][Last] == -1)
451         continue;
452 
453       // Case 2: There is a layout with this Set and this Last, and we try
454       // to expand this set with New
455       for (uint64_t New = 1; New < N; ++New) {
456         // Case 2a: BB "New" is already in this Set
457         if ((Set & (1ULL << New)) != 0)
458           continue;
459 
        // Case 2b: BB "New" is not in this Set; add it to the Set and record
        // the maximum total weight of a layout with "New" as the last BB.
        uint64_t NewSet = (Set | (1ULL << New));
        DP[NewSet][New] = std::max(DP[NewSet][New],
                                   DP[Set][Last] + (int64_t)Weight[Last][New]);
467 
468         if (DP[NewSet][New] > BestWeight) {
469           BestWeight = DP[NewSet][New];
470           BestSet = NewSet;
471           BestLast = New;
472         }
473       }
474     }
475   }
476 
477   // Define final function layout based on layout that maximizes weight
478   uint64_t Last = BestLast;
479   uint64_t Set = BestSet;
480   std::vector<bool> Visited;
481   Visited.resize(N);
482   Visited[Last] = true;
483   Order.push_back(IndexToBB[Last]);
484   Set = Set & ~(1ULL << Last);
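  // Reconstruct the rest of the path backwards: repeatedly pick, among the
  // blocks still in Set, the one that maximizes its DP value plus the weight
  // of its edge into the previously chosen block, then remove it from Set.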
485   while (Set != 0) {
    int64_t Best = -1;
    uint64_t NewLast = 0;
488     for (uint64_t I = 0; I < N; ++I) {
489       if (DP[Set][I] == -1)
490         continue;
491       int64_t AdjWeight = Weight[I][Last] > 0 ? Weight[I][Last] : 0;
492       if (DP[Set][I] + AdjWeight > Best) {
493         NewLast = I;
494         Best = DP[Set][I] + AdjWeight;
495       }
496     }
497     Last = NewLast;
498     Visited[Last] = true;
499     Order.push_back(IndexToBB[Last]);
500     Set = Set & ~(1ULL << Last);
501   }
502   std::reverse(Order.begin(), Order.end());
503 
  // Finalize the layout by appending, in the input layout order, any blocks
  // that were not assigned a position above.
  for (BinaryBasicBlock *BB : BF.layout()) {
    if (!Visited[BB->getLayoutIndex()])
508       Order.push_back(BB);
509   }
510 }
511 
512 void OptimizeReorderAlgorithm::reorderBasicBlocks(
513     const BinaryFunction &BF, BasicBlockOrder &Order) const {
514   if (BF.layout_empty())
515     return;
516 
517   // Cluster basic blocks.
518   CAlgo->clusterBasicBlocks(BF);
519 
520   if (opts::PrintClusters)
521     CAlgo->printClusters();
522 
523   // Arrange basic blocks according to clusters.
524   for (ClusterAlgorithm::ClusterTy &Cluster : CAlgo->Clusters)
525     Order.insert(Order.end(), Cluster.begin(), Cluster.end());
526 }
527 
528 void OptimizeBranchReorderAlgorithm::reorderBasicBlocks(
529     const BinaryFunction &BF, BasicBlockOrder &Order) const {
530   if (BF.layout_empty())
531     return;
532 
533   // Cluster basic blocks.
534   CAlgo->clusterBasicBlocks(BF, /* ComputeEdges = */ true);
535   std::vector<ClusterAlgorithm::ClusterTy> &Clusters = CAlgo->Clusters;
536   std::vector<std::unordered_map<uint32_t, uint64_t>> &ClusterEdges =
537       CAlgo->ClusterEdges;
538 
539   // Compute clusters' average frequencies.
540   CAlgo->computeClusterAverageFrequency(BF.getBinaryContext());
541   std::vector<double> &AvgFreq = CAlgo->AvgFreq;
542 
543   if (opts::PrintClusters)
544     CAlgo->printClusters();
545 
546   // Cluster layout order
547   std::vector<uint32_t> ClusterOrder;
548 
549   // Do a topological sort for clusters, prioritizing frequently-executed BBs
550   // during the traversal.
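  // This is an iterative DFS: a cluster is marked STACKED when discovered and
  // VISITED when expanded; it is appended to ClusterOrder only after all of
  // its successors have been processed, so reversing the result below yields
  // a topological-like order.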
551   std::stack<uint32_t> Stack;
552   std::vector<uint32_t> Status;
553   std::vector<uint32_t> Parent;
554   Status.resize(Clusters.size(), 0);
555   Parent.resize(Clusters.size(), 0);
556   constexpr uint32_t STACKED = 1;
557   constexpr uint32_t VISITED = 2;
558   Status[0] = STACKED;
559   Stack.push(0);
560   while (!Stack.empty()) {
561     uint32_t I = Stack.top();
562     if (!(Status[I] & VISITED)) {
563       Status[I] |= VISITED;
564       // Order successors by weight
565       auto ClusterComp = [&ClusterEdges, I](uint32_t A, uint32_t B) {
566         return ClusterEdges[I][A] > ClusterEdges[I][B];
567       };
568       std::priority_queue<uint32_t, std::vector<uint32_t>,
569                           decltype(ClusterComp)>
570           SuccQueue(ClusterComp);
571       for (std::pair<const uint32_t, uint64_t> &Target : ClusterEdges[I]) {
572         if (Target.second > 0 && !(Status[Target.first] & STACKED) &&
573             !Clusters[Target.first].empty()) {
574           Parent[Target.first] = I;
575           Status[Target.first] = STACKED;
576           SuccQueue.push(Target.first);
577         }
578       }
579       while (!SuccQueue.empty()) {
580         Stack.push(SuccQueue.top());
581         SuccQueue.pop();
582       }
583       continue;
584     }
585     // Already visited this node
586     Stack.pop();
587     ClusterOrder.push_back(I);
588   }
589   std::reverse(ClusterOrder.begin(), ClusterOrder.end());
590   // Put unreachable clusters at the end
591   for (uint32_t I = 0, E = Clusters.size(); I < E; ++I)
592     if (!(Status[I] & VISITED) && !Clusters[I].empty())
593       ClusterOrder.push_back(I);
594 
  // Sort clusters of equal precedence: a cluster is kept ahead of clusters
  // below it in the DFS parent tree; otherwise clusters are ordered by
  // decreasing average frequency.
596   auto Beg = ClusterOrder.begin();
597   // Don't reorder the first cluster, which contains the function entry point
598   ++Beg;
599   std::stable_sort(Beg, ClusterOrder.end(),
600                    [&AvgFreq, &Parent](uint32_t A, uint32_t B) {
601                      uint32_t P = Parent[A];
602                      while (Parent[P] != 0) {
603                        if (Parent[P] == B)
604                          return false;
605                        P = Parent[P];
606                      }
607                      P = Parent[B];
608                      while (Parent[P] != 0) {
609                        if (Parent[P] == A)
610                          return true;
611                        P = Parent[P];
612                      }
613                      return AvgFreq[A] > AvgFreq[B];
614                    });
615 
616   if (opts::PrintClusters) {
617     errs() << "New cluster order: ";
618     const char *Sep = "";
619     for (uint32_t O : ClusterOrder) {
620       errs() << Sep << O;
621       Sep = ", ";
622     }
623     errs() << '\n';
624   }
625 
626   // Arrange basic blocks according to cluster order.
627   for (uint32_t ClusterIndex : ClusterOrder) {
628     ClusterAlgorithm::ClusterTy &Cluster = Clusters[ClusterIndex];
629     Order.insert(Order.end(), Cluster.begin(), Cluster.end());
630   }
631 }
632 
633 void OptimizeCacheReorderAlgorithm::reorderBasicBlocks(
634     const BinaryFunction &BF, BasicBlockOrder &Order) const {
635   if (BF.layout_empty())
636     return;
637 
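  // opts::ColdThreshold is expressed in tenths of a percent of the entry
  // block's execution count, hence the division by 1000.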
638   const uint64_t ColdThreshold =
639       opts::ColdThreshold * (*BF.layout_begin())->getExecutionCount() / 1000;
640 
641   // Cluster basic blocks.
642   CAlgo->clusterBasicBlocks(BF);
643   std::vector<ClusterAlgorithm::ClusterTy> &Clusters = CAlgo->Clusters;
644 
645   // Compute clusters' average frequencies.
646   CAlgo->computeClusterAverageFrequency(BF.getBinaryContext());
647   std::vector<double> &AvgFreq = CAlgo->AvgFreq;
648 
649   if (opts::PrintClusters)
650     CAlgo->printClusters();
651 
652   // Cluster layout order
653   std::vector<uint32_t> ClusterOrder;
654 
655   // Order clusters based on average instruction execution frequency
656   for (uint32_t I = 0, E = Clusters.size(); I < E; ++I)
657     if (!Clusters[I].empty())
658       ClusterOrder.push_back(I);
659   // Don't reorder the first cluster, which contains the function entry point
660   std::stable_sort(
661       std::next(ClusterOrder.begin()), ClusterOrder.end(),
662       [&AvgFreq](uint32_t A, uint32_t B) { return AvgFreq[A] > AvgFreq[B]; });
663 
664   if (opts::PrintClusters) {
665     errs() << "New cluster order: ";
666     const char *Sep = "";
667     for (uint32_t O : ClusterOrder) {
668       errs() << Sep << O;
669       Sep = ", ";
670     }
671     errs() << '\n';
672   }
673 
674   // Arrange basic blocks according to cluster order.
675   for (uint32_t ClusterIndex : ClusterOrder) {
676     ClusterAlgorithm::ClusterTy &Cluster = Clusters[ClusterIndex];
677     Order.insert(Order.end(), Cluster.begin(), Cluster.end());
    // Force zero execution count on clusters that do not meet the cutoff
    // specified by --cold-threshold.
680     if (AvgFreq[ClusterIndex] < static_cast<double>(ColdThreshold)) {
681       for (BinaryBasicBlock *BBPtr : Cluster) {
682         BBPtr->setExecutionCount(0);
683       }
684     }
685   }
686 }
687 
688 void ReverseReorderAlgorithm::reorderBasicBlocks(const BinaryFunction &BF,
689                                                  BasicBlockOrder &Order) const {
690   if (BF.layout_empty())
691     return;
692 
693   BinaryBasicBlock *FirstBB = *BF.layout_begin();
694   Order.push_back(FirstBB);
695   for (auto RLI = BF.layout_rbegin(); *RLI != FirstBB; ++RLI)
696     Order.push_back(*RLI);
697 }
698 
699 void RandomClusterReorderAlgorithm::reorderBasicBlocks(
700     const BinaryFunction &BF, BasicBlockOrder &Order) const {
701   if (BF.layout_empty())
702     return;
703 
704   // Cluster basic blocks.
705   CAlgo->clusterBasicBlocks(BF);
706   std::vector<ClusterAlgorithm::ClusterTy> &Clusters = CAlgo->Clusters;
707 
708   if (opts::PrintClusters)
709     CAlgo->printClusters();
710 
711   // Cluster layout order
712   std::vector<uint32_t> ClusterOrder;
713 
  // Collect non-empty clusters; the entry cluster keeps the first position
  // while the rest are shuffled below.
715   for (uint32_t I = 0, E = Clusters.size(); I < E; ++I)
716     if (!Clusters[I].empty())
717       ClusterOrder.push_back(I);
718 
719   std::shuffle(std::next(ClusterOrder.begin()), ClusterOrder.end(),
720                std::default_random_engine(opts::RandomSeed.getValue()));
721 
722   if (opts::PrintClusters) {
723     errs() << "New cluster order: ";
724     const char *Sep = "";
725     for (uint32_t O : ClusterOrder) {
726       errs() << Sep << O;
727       Sep = ", ";
728     }
729     errs() << '\n';
730   }
731 
732   // Arrange basic blocks according to cluster order.
733   for (uint32_t ClusterIndex : ClusterOrder) {
734     ClusterAlgorithm::ClusterTy &Cluster = Clusters[ClusterIndex];
735     Order.insert(Order.end(), Cluster.begin(), Cluster.end());
736   }
737 }
738