1 //===- bolt/Passes/MCF.cpp ------------------------------------------------===// 2 // 3 // Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. 4 // See https://llvm.org/LICENSE.txt for license information. 5 // SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception 6 // 7 //===----------------------------------------------------------------------===// 8 // 9 // This file implements functions for solving minimum-cost flow problem. 10 // 11 //===----------------------------------------------------------------------===// 12 13 #include "bolt/Passes/MCF.h" 14 #include "bolt/Core/BinaryFunction.h" 15 #include "bolt/Passes/DataflowInfoManager.h" 16 #include "llvm/ADT/DenseMap.h" 17 #include "llvm/Support/CommandLine.h" 18 #include <algorithm> 19 #include <vector> 20 21 #undef DEBUG_TYPE 22 #define DEBUG_TYPE "mcf" 23 24 using namespace llvm; 25 using namespace bolt; 26 27 namespace opts { 28 29 extern cl::OptionCategory BoltOptCategory; 30 31 extern cl::opt<bool> TimeOpts; 32 33 static cl::opt<bool> IterativeGuess( 34 "iterative-guess", 35 cl::desc("in non-LBR mode, guess edge counts using iterative technique"), 36 37 cl::Hidden, cl::cat(BoltOptCategory)); 38 39 static cl::opt<bool> 40 EqualizeBBCounts("equalize-bb-counts", 41 cl::desc("in non-LBR mode, use same count for BBs " 42 "that should have equivalent count"), 43 cl::Hidden, cl::cat(BoltOptCategory)); 44 45 static cl::opt<bool> UseRArcs( 46 "mcf-use-rarcs", 47 cl::desc("in MCF, consider the possibility of cancelling flow to balance " 48 "edges"), 49 cl::Hidden, cl::cat(BoltOptCategory)); 50 51 } // namespace opts 52 53 namespace llvm { 54 namespace bolt { 55 56 namespace { 57 58 // Edge Weight Inference Heuristic 59 // 60 // We start by maintaining the invariant used in LBR mode where the sum of 61 // pred edges count is equal to the block execution count. This loop will set 62 // pred edges count by balancing its own execution count in different pred 63 // edges. The weight of each edge is guessed by looking at how hot each pred 64 // block is (in terms of samples). 65 // There are two caveats in this approach. One is for critical edges and the 66 // other is for self-referencing blocks (loops of 1 BB). For critical edges, 67 // we can't infer the hotness of them based solely on pred BBs execution 68 // count. For each critical edge we look at the pred BB, then look at its 69 // succs to adjust its weight. 70 // 71 // [ 60 ] [ 25 ] 72 // | \ | 73 // [ 10 ] [ 75 ] 74 // 75 // The illustration above shows a critical edge \. We wish to adjust bb count 76 // 60 to 50 to properly determine the weight of the critical edge to be 77 // 50 / 75. 78 // For self-referencing edges, we attribute its weight by subtracting the 79 // current BB execution count by the sum of predecessors count if this result 80 // is non-negative. 81 using EdgeWeightMap = 82 DenseMap<std::pair<const BinaryBasicBlock *, const BinaryBasicBlock *>, 83 double>; 84 85 template <class NodeT> 86 void updateEdgeWeight(EdgeWeightMap &EdgeWeights, const BinaryBasicBlock *A, 87 const BinaryBasicBlock *B, double Weight); 88 89 template <> 90 void updateEdgeWeight<BinaryBasicBlock *>(EdgeWeightMap &EdgeWeights, 91 const BinaryBasicBlock *A, 92 const BinaryBasicBlock *B, 93 double Weight) { 94 EdgeWeights[std::make_pair(A, B)] = Weight; 95 return; 96 } 97 98 template <> 99 void updateEdgeWeight<Inverse<BinaryBasicBlock *>>(EdgeWeightMap &EdgeWeights, 100 const BinaryBasicBlock *A, 101 const BinaryBasicBlock *B, 102 double Weight) { 103 EdgeWeights[std::make_pair(B, A)] = Weight; 104 return; 105 } 106 107 template <class NodeT> 108 void computeEdgeWeights(BinaryBasicBlock *BB, EdgeWeightMap &EdgeWeights) { 109 typedef GraphTraits<NodeT> GraphT; 110 typedef GraphTraits<Inverse<NodeT>> InvTraits; 111 112 double TotalChildrenCount = 0.0; 113 SmallVector<double, 4> ChildrenExecCount; 114 // First pass computes total children execution count that directly 115 // contribute to this BB. 116 for (typename GraphT::ChildIteratorType CI = GraphT::child_begin(BB), 117 E = GraphT::child_end(BB); 118 CI != E; ++CI) { 119 typename GraphT::NodeRef Child = *CI; 120 double ChildExecCount = Child->getExecutionCount(); 121 // Is self-reference? 122 if (Child == BB) { 123 ChildExecCount = 0.0; // will fill this in second pass 124 } else if (GraphT::child_end(BB) - GraphT::child_begin(BB) > 1 && 125 InvTraits::child_end(Child) - InvTraits::child_begin(Child) > 126 1) { 127 // Handle critical edges. This will cause a skew towards crit edges, but 128 // it is a quick solution. 129 double CritWeight = 0.0; 130 uint64_t Denominator = 0; 131 for (typename InvTraits::ChildIteratorType 132 II = InvTraits::child_begin(Child), 133 IE = InvTraits::child_end(Child); 134 II != IE; ++II) { 135 typename GraphT::NodeRef N = *II; 136 Denominator += N->getExecutionCount(); 137 if (N != BB) 138 continue; 139 CritWeight = N->getExecutionCount(); 140 } 141 if (Denominator) 142 CritWeight /= static_cast<double>(Denominator); 143 ChildExecCount *= CritWeight; 144 } 145 ChildrenExecCount.push_back(ChildExecCount); 146 TotalChildrenCount += ChildExecCount; 147 } 148 // Second pass fixes the weight of a possible self-reference edge 149 uint32_t ChildIndex = 0; 150 for (typename GraphT::ChildIteratorType CI = GraphT::child_begin(BB), 151 E = GraphT::child_end(BB); 152 CI != E; ++CI) { 153 typename GraphT::NodeRef Child = *CI; 154 if (Child != BB) { 155 ++ChildIndex; 156 continue; 157 } 158 if (static_cast<double>(BB->getExecutionCount()) > TotalChildrenCount) { 159 ChildrenExecCount[ChildIndex] = 160 BB->getExecutionCount() - TotalChildrenCount; 161 TotalChildrenCount += ChildrenExecCount[ChildIndex]; 162 } 163 break; 164 } 165 // Third pass finally assigns weights to edges 166 ChildIndex = 0; 167 for (typename GraphT::ChildIteratorType CI = GraphT::child_begin(BB), 168 E = GraphT::child_end(BB); 169 CI != E; ++CI) { 170 typename GraphT::NodeRef Child = *CI; 171 double Weight = 1 / (GraphT::child_end(BB) - GraphT::child_begin(BB)); 172 if (TotalChildrenCount != 0.0) 173 Weight = ChildrenExecCount[ChildIndex] / TotalChildrenCount; 174 updateEdgeWeight<NodeT>(EdgeWeights, BB, Child, Weight); 175 ++ChildIndex; 176 } 177 } 178 179 template <class NodeT> 180 void computeEdgeWeights(BinaryFunction &BF, EdgeWeightMap &EdgeWeights) { 181 for (BinaryBasicBlock &BB : BF) 182 computeEdgeWeights<NodeT>(&BB, EdgeWeights); 183 } 184 185 /// Make BB count match the sum of all incoming edges. If AllEdges is true, 186 /// make it match max(SumPredEdges, SumSuccEdges). 187 void recalculateBBCounts(BinaryFunction &BF, bool AllEdges) { 188 for (BinaryBasicBlock &BB : BF) { 189 uint64_t TotalPredsEWeight = 0; 190 for (BinaryBasicBlock *Pred : BB.predecessors()) 191 TotalPredsEWeight += Pred->getBranchInfo(BB).Count; 192 193 if (TotalPredsEWeight > BB.getExecutionCount()) 194 BB.setExecutionCount(TotalPredsEWeight); 195 196 if (!AllEdges) 197 continue; 198 199 uint64_t TotalSuccsEWeight = 0; 200 for (BinaryBasicBlock::BinaryBranchInfo &BI : BB.branch_info()) 201 TotalSuccsEWeight += BI.Count; 202 203 if (TotalSuccsEWeight > BB.getExecutionCount()) 204 BB.setExecutionCount(TotalSuccsEWeight); 205 } 206 } 207 208 // This is our main edge count guessing heuristic. Look at predecessors and 209 // assign a proportionally higher count to pred edges coming from blocks with 210 // a higher execution count in comparison with the other predecessor blocks, 211 // making SumPredEdges match the current BB count. 212 // If "UseSucc" is true, apply the same logic to successor edges as well. Since 213 // some successor edges may already have assigned a count, only update it if the 214 // new count is higher. 215 void guessEdgeByRelHotness(BinaryFunction &BF, bool UseSucc, 216 EdgeWeightMap &PredEdgeWeights, 217 EdgeWeightMap &SuccEdgeWeights) { 218 for (BinaryBasicBlock &BB : BF) { 219 for (BinaryBasicBlock *Pred : BB.predecessors()) { 220 double RelativeExec = PredEdgeWeights[std::make_pair(Pred, &BB)]; 221 RelativeExec *= BB.getExecutionCount(); 222 BinaryBasicBlock::BinaryBranchInfo &BI = Pred->getBranchInfo(BB); 223 if (static_cast<uint64_t>(RelativeExec) > BI.Count) 224 BI.Count = static_cast<uint64_t>(RelativeExec); 225 } 226 227 if (!UseSucc) 228 continue; 229 230 auto BI = BB.branch_info_begin(); 231 for (BinaryBasicBlock *Succ : BB.successors()) { 232 double RelativeExec = SuccEdgeWeights[std::make_pair(&BB, Succ)]; 233 RelativeExec *= BB.getExecutionCount(); 234 if (static_cast<uint64_t>(RelativeExec) > BI->Count) 235 BI->Count = static_cast<uint64_t>(RelativeExec); 236 ++BI; 237 } 238 } 239 } 240 241 using ArcSet = 242 DenseSet<std::pair<const BinaryBasicBlock *, const BinaryBasicBlock *>>; 243 244 /// Predecessor edges version of guessEdgeByIterativeApproach. GuessedArcs has 245 /// all edges we already established their count. Try to guess the count of 246 /// the remaining edge, if there is only one to guess, and return true if we 247 /// were able to guess. 248 bool guessPredEdgeCounts(BinaryBasicBlock *BB, ArcSet &GuessedArcs) { 249 if (BB->pred_size() == 0) 250 return false; 251 252 uint64_t TotalPredCount = 0; 253 unsigned NumGuessedEdges = 0; 254 for (BinaryBasicBlock *Pred : BB->predecessors()) { 255 if (GuessedArcs.count(std::make_pair(Pred, BB))) 256 ++NumGuessedEdges; 257 TotalPredCount += Pred->getBranchInfo(*BB).Count; 258 } 259 260 if (NumGuessedEdges != BB->pred_size() - 1) 261 return false; 262 263 int64_t Guessed = 264 static_cast<int64_t>(BB->getExecutionCount()) - TotalPredCount; 265 if (Guessed < 0) 266 Guessed = 0; 267 268 for (BinaryBasicBlock *Pred : BB->predecessors()) { 269 if (GuessedArcs.count(std::make_pair(Pred, BB))) 270 continue; 271 272 Pred->getBranchInfo(*BB).Count = Guessed; 273 return true; 274 } 275 llvm_unreachable("Expected unguessed arc"); 276 } 277 278 /// Successor edges version of guessEdgeByIterativeApproach. GuessedArcs has 279 /// all edges we already established their count. Try to guess the count of 280 /// the remaining edge, if there is only one to guess, and return true if we 281 /// were able to guess. 282 bool guessSuccEdgeCounts(BinaryBasicBlock *BB, ArcSet &GuessedArcs) { 283 if (BB->succ_size() == 0) 284 return false; 285 286 uint64_t TotalSuccCount = 0; 287 unsigned NumGuessedEdges = 0; 288 auto BI = BB->branch_info_begin(); 289 for (BinaryBasicBlock *Succ : BB->successors()) { 290 if (GuessedArcs.count(std::make_pair(BB, Succ))) 291 ++NumGuessedEdges; 292 TotalSuccCount += BI->Count; 293 ++BI; 294 } 295 296 if (NumGuessedEdges != BB->succ_size() - 1) 297 return false; 298 299 int64_t Guessed = 300 static_cast<int64_t>(BB->getExecutionCount()) - TotalSuccCount; 301 if (Guessed < 0) 302 Guessed = 0; 303 304 BI = BB->branch_info_begin(); 305 for (BinaryBasicBlock *Succ : BB->successors()) { 306 if (GuessedArcs.count(std::make_pair(BB, Succ))) { 307 ++BI; 308 continue; 309 } 310 311 BI->Count = Guessed; 312 GuessedArcs.insert(std::make_pair(BB, Succ)); 313 return true; 314 } 315 llvm_unreachable("Expected unguessed arc"); 316 } 317 318 /// Guess edge count whenever we have only one edge (pred or succ) left 319 /// to guess. Then make its count equal to BB count minus all other edge 320 /// counts we already know their count. Repeat this until there is no 321 /// change. 322 void guessEdgeByIterativeApproach(BinaryFunction &BF) { 323 ArcSet KnownArcs; 324 bool Changed = false; 325 326 do { 327 Changed = false; 328 for (BinaryBasicBlock &BB : BF) { 329 if (guessPredEdgeCounts(&BB, KnownArcs)) 330 Changed = true; 331 if (guessSuccEdgeCounts(&BB, KnownArcs)) 332 Changed = true; 333 } 334 } while (Changed); 335 336 // Guess count for non-inferred edges 337 for (BinaryBasicBlock &BB : BF) { 338 for (BinaryBasicBlock *Pred : BB.predecessors()) { 339 if (KnownArcs.count(std::make_pair(Pred, &BB))) 340 continue; 341 BinaryBasicBlock::BinaryBranchInfo &BI = Pred->getBranchInfo(BB); 342 BI.Count = 343 std::min(Pred->getExecutionCount(), BB.getExecutionCount()) / 2; 344 KnownArcs.insert(std::make_pair(Pred, &BB)); 345 } 346 auto BI = BB.branch_info_begin(); 347 for (BinaryBasicBlock *Succ : BB.successors()) { 348 if (KnownArcs.count(std::make_pair(&BB, Succ))) { 349 ++BI; 350 continue; 351 } 352 BI->Count = 353 std::min(BB.getExecutionCount(), Succ->getExecutionCount()) / 2; 354 KnownArcs.insert(std::make_pair(&BB, Succ)); 355 break; 356 } 357 } 358 } 359 360 /// Associate each basic block with the BinaryLoop object corresponding to the 361 /// innermost loop containing this block. 362 DenseMap<const BinaryBasicBlock *, const BinaryLoop *> 363 createLoopNestLevelMap(BinaryFunction &BF) { 364 DenseMap<const BinaryBasicBlock *, const BinaryLoop *> LoopNestLevel; 365 const BinaryLoopInfo &BLI = BF.getLoopInfo(); 366 367 for (BinaryBasicBlock &BB : BF) 368 LoopNestLevel[&BB] = BLI[&BB]; 369 370 return LoopNestLevel; 371 } 372 373 /// Implement the idea in "SamplePGO - The Power of Profile Guided Optimizations 374 /// without the Usability Burden" by Diego Novillo to make basic block counts 375 /// equal if we show that A dominates B, B post-dominates A and they are in the 376 /// same loop and same loop nesting level. 377 void equalizeBBCounts(BinaryFunction &BF) { 378 auto Info = DataflowInfoManager(BF, nullptr, nullptr); 379 DominatorAnalysis<false> &DA = Info.getDominatorAnalysis(); 380 DominatorAnalysis<true> &PDA = Info.getPostDominatorAnalysis(); 381 auto &InsnToBB = Info.getInsnToBBMap(); 382 // These analyses work at the instruction granularity, but we really only need 383 // basic block granularity here. So we'll use a set of visited edges to avoid 384 // revisiting the same BBs again and again. 385 DenseMap<const BinaryBasicBlock *, std::set<const BinaryBasicBlock *>> 386 Visited; 387 // Equivalence classes mapping. Each equivalence class is defined by the set 388 // of BBs that obeys the aforementioned properties. 389 DenseMap<const BinaryBasicBlock *, signed> BBsToEC; 390 std::vector<std::vector<BinaryBasicBlock *>> Classes; 391 392 BF.calculateLoopInfo(); 393 DenseMap<const BinaryBasicBlock *, const BinaryLoop *> LoopNestLevel = 394 createLoopNestLevelMap(BF); 395 396 for (BinaryBasicBlock &BB : BF) 397 BBsToEC[&BB] = -1; 398 399 for (BinaryBasicBlock &BB : BF) { 400 auto I = BB.begin(); 401 if (I == BB.end()) 402 continue; 403 404 DA.doForAllDominators(*I, [&](const MCInst &DomInst) { 405 BinaryBasicBlock *DomBB = InsnToBB[&DomInst]; 406 if (Visited[DomBB].count(&BB)) 407 return; 408 Visited[DomBB].insert(&BB); 409 if (!PDA.doesADominateB(*I, DomInst)) 410 return; 411 if (LoopNestLevel[&BB] != LoopNestLevel[DomBB]) 412 return; 413 if (BBsToEC[DomBB] == -1 && BBsToEC[&BB] == -1) { 414 BBsToEC[DomBB] = Classes.size(); 415 BBsToEC[&BB] = Classes.size(); 416 Classes.emplace_back(); 417 Classes.back().push_back(DomBB); 418 Classes.back().push_back(&BB); 419 return; 420 } 421 if (BBsToEC[DomBB] == -1) { 422 BBsToEC[DomBB] = BBsToEC[&BB]; 423 Classes[BBsToEC[&BB]].push_back(DomBB); 424 return; 425 } 426 if (BBsToEC[&BB] == -1) { 427 BBsToEC[&BB] = BBsToEC[DomBB]; 428 Classes[BBsToEC[DomBB]].push_back(&BB); 429 return; 430 } 431 signed BBECNum = BBsToEC[&BB]; 432 std::vector<BinaryBasicBlock *> DomEC = Classes[BBsToEC[DomBB]]; 433 std::vector<BinaryBasicBlock *> BBEC = Classes[BBECNum]; 434 for (BinaryBasicBlock *Block : DomEC) { 435 BBsToEC[Block] = BBECNum; 436 BBEC.push_back(Block); 437 } 438 DomEC.clear(); 439 }); 440 } 441 442 for (std::vector<BinaryBasicBlock *> &Class : Classes) { 443 uint64_t Max = 0ULL; 444 for (BinaryBasicBlock *BB : Class) 445 Max = std::max(Max, BB->getExecutionCount()); 446 for (BinaryBasicBlock *BB : Class) 447 BB->setExecutionCount(Max); 448 } 449 } 450 451 } // end anonymous namespace 452 453 void estimateEdgeCounts(BinaryFunction &BF) { 454 EdgeWeightMap PredEdgeWeights; 455 EdgeWeightMap SuccEdgeWeights; 456 if (!opts::IterativeGuess) { 457 computeEdgeWeights<Inverse<BinaryBasicBlock *>>(BF, PredEdgeWeights); 458 computeEdgeWeights<BinaryBasicBlock *>(BF, SuccEdgeWeights); 459 } 460 if (opts::EqualizeBBCounts) { 461 LLVM_DEBUG(BF.print(dbgs(), "before equalize BB counts", true)); 462 equalizeBBCounts(BF); 463 LLVM_DEBUG(BF.print(dbgs(), "after equalize BB counts", true)); 464 } 465 if (opts::IterativeGuess) 466 guessEdgeByIterativeApproach(BF); 467 else 468 guessEdgeByRelHotness(BF, /*UseSuccs=*/false, PredEdgeWeights, 469 SuccEdgeWeights); 470 recalculateBBCounts(BF, /*AllEdges=*/false); 471 } 472 473 void solveMCF(BinaryFunction &BF, MCFCostFunction CostFunction) { 474 llvm_unreachable("not implemented"); 475 } 476 477 } // namespace bolt 478 } // namespace llvm 479