1 //===- bolt/Passes/MCF.cpp ------------------------------------------------===// 2 // 3 // Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. 4 // See https://llvm.org/LICENSE.txt for license information. 5 // SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception 6 // 7 //===----------------------------------------------------------------------===// 8 // 9 // This file implements functions for solving minimum-cost flow problem. 10 // 11 //===----------------------------------------------------------------------===// 12 13 #include "bolt/Passes/MCF.h" 14 #include "bolt/Core/BinaryFunction.h" 15 #include "bolt/Passes/DataflowInfoManager.h" 16 #include "llvm/ADT/DenseMap.h" 17 #include "llvm/Support/CommandLine.h" 18 #include <algorithm> 19 #include <vector> 20 21 #undef DEBUG_TYPE 22 #define DEBUG_TYPE "mcf" 23 24 using namespace llvm; 25 using namespace bolt; 26 27 namespace opts { 28 29 extern cl::OptionCategory BoltOptCategory; 30 31 extern cl::opt<bool> TimeOpts; 32 33 static cl::opt<bool> 34 IterativeGuess("iterative-guess", 35 cl::desc("in non-LBR mode, guess edge counts using iterative technique"), 36 cl::ZeroOrMore, 37 cl::init(false), 38 cl::Hidden, 39 cl::cat(BoltOptCategory)); 40 41 static cl::opt<bool> 42 EqualizeBBCounts("equalize-bb-counts", 43 cl::desc("in non-LBR mode, use same count for BBs " 44 "that should have equivalent count"), 45 cl::ZeroOrMore, 46 cl::init(false), 47 cl::Hidden, 48 cl::cat(BoltOptCategory)); 49 50 static cl::opt<bool> 51 UseRArcs("mcf-use-rarcs", 52 cl::desc("in MCF, consider the possibility of cancelling flow to balance " 53 "edges"), 54 cl::ZeroOrMore, 55 cl::init(false), 56 cl::Hidden, 57 cl::cat(BoltOptCategory)); 58 59 } // namespace opts 60 61 namespace llvm { 62 namespace bolt { 63 64 namespace { 65 66 // Edge Weight Inference Heuristic 67 // 68 // We start by maintaining the invariant used in LBR mode where the sum of 69 // pred edges count is equal to the block execution count. This loop will set 70 // pred edges count by balancing its own execution count in different pred 71 // edges. The weight of each edge is guessed by looking at how hot each pred 72 // block is (in terms of samples). 73 // There are two caveats in this approach. One is for critical edges and the 74 // other is for self-referencing blocks (loops of 1 BB). For critical edges, 75 // we can't infer the hotness of them based solely on pred BBs execution 76 // count. For each critical edge we look at the pred BB, then look at its 77 // succs to adjust its weight. 78 // 79 // [ 60 ] [ 25 ] 80 // | \ | 81 // [ 10 ] [ 75 ] 82 // 83 // The illustration above shows a critical edge \. We wish to adjust bb count 84 // 60 to 50 to properly determine the weight of the critical edge to be 85 // 50 / 75. 86 // For self-referencing edges, we attribute its weight by subtracting the 87 // current BB execution count by the sum of predecessors count if this result 88 // is non-negative. 89 using EdgeWeightMap = 90 DenseMap<std::pair<const BinaryBasicBlock *, const BinaryBasicBlock *>, 91 double>; 92 93 template <class NodeT> 94 void updateEdgeWeight(EdgeWeightMap &EdgeWeights, const BinaryBasicBlock *A, 95 const BinaryBasicBlock *B, double Weight); 96 97 template <> 98 void updateEdgeWeight<BinaryBasicBlock *>(EdgeWeightMap &EdgeWeights, 99 const BinaryBasicBlock *A, 100 const BinaryBasicBlock *B, 101 double Weight) { 102 EdgeWeights[std::make_pair(A, B)] = Weight; 103 return; 104 } 105 106 template <> 107 void updateEdgeWeight<Inverse<BinaryBasicBlock *>>(EdgeWeightMap &EdgeWeights, 108 const BinaryBasicBlock *A, 109 const BinaryBasicBlock *B, 110 double Weight) { 111 EdgeWeights[std::make_pair(B, A)] = Weight; 112 return; 113 } 114 115 template <class NodeT> 116 void computeEdgeWeights(BinaryBasicBlock *BB, EdgeWeightMap &EdgeWeights) { 117 typedef GraphTraits<NodeT> GraphT; 118 typedef GraphTraits<Inverse<NodeT>> InvTraits; 119 120 double TotalChildrenCount = 0.0; 121 SmallVector<double, 4> ChildrenExecCount; 122 // First pass computes total children execution count that directly 123 // contribute to this BB. 124 for (typename GraphT::ChildIteratorType CI = GraphT::child_begin(BB), 125 E = GraphT::child_end(BB); 126 CI != E; ++CI) { 127 typename GraphT::NodeRef Child = *CI; 128 double ChildExecCount = Child->getExecutionCount(); 129 // Is self-reference? 130 if (Child == BB) { 131 ChildExecCount = 0.0; // will fill this in second pass 132 } else if (GraphT::child_end(BB) - GraphT::child_begin(BB) > 1 && 133 InvTraits::child_end(Child) - InvTraits::child_begin(Child) > 134 1) { 135 // Handle critical edges. This will cause a skew towards crit edges, but 136 // it is a quick solution. 137 double CritWeight = 0.0; 138 uint64_t Denominator = 0; 139 for (typename InvTraits::ChildIteratorType 140 II = InvTraits::child_begin(Child), 141 IE = InvTraits::child_end(Child); 142 II != IE; ++II) { 143 typename GraphT::NodeRef N = *II; 144 Denominator += N->getExecutionCount(); 145 if (N != BB) 146 continue; 147 CritWeight = N->getExecutionCount(); 148 } 149 if (Denominator) 150 CritWeight /= static_cast<double>(Denominator); 151 ChildExecCount *= CritWeight; 152 } 153 ChildrenExecCount.push_back(ChildExecCount); 154 TotalChildrenCount += ChildExecCount; 155 } 156 // Second pass fixes the weight of a possible self-reference edge 157 uint32_t ChildIndex = 0; 158 for (typename GraphT::ChildIteratorType CI = GraphT::child_begin(BB), 159 E = GraphT::child_end(BB); 160 CI != E; ++CI) { 161 typename GraphT::NodeRef Child = *CI; 162 if (Child != BB) { 163 ++ChildIndex; 164 continue; 165 } 166 if (static_cast<double>(BB->getExecutionCount()) > TotalChildrenCount) { 167 ChildrenExecCount[ChildIndex] = 168 BB->getExecutionCount() - TotalChildrenCount; 169 TotalChildrenCount += ChildrenExecCount[ChildIndex]; 170 } 171 break; 172 } 173 // Third pass finally assigns weights to edges 174 ChildIndex = 0; 175 for (typename GraphT::ChildIteratorType CI = GraphT::child_begin(BB), 176 E = GraphT::child_end(BB); 177 CI != E; ++CI) { 178 typename GraphT::NodeRef Child = *CI; 179 double Weight = 1 / (GraphT::child_end(BB) - GraphT::child_begin(BB)); 180 if (TotalChildrenCount != 0.0) 181 Weight = ChildrenExecCount[ChildIndex] / TotalChildrenCount; 182 updateEdgeWeight<NodeT>(EdgeWeights, BB, Child, Weight); 183 ++ChildIndex; 184 } 185 } 186 187 template <class NodeT> 188 void computeEdgeWeights(BinaryFunction &BF, EdgeWeightMap &EdgeWeights) { 189 for (BinaryBasicBlock &BB : BF) 190 computeEdgeWeights<NodeT>(&BB, EdgeWeights); 191 } 192 193 /// Make BB count match the sum of all incoming edges. If AllEdges is true, 194 /// make it match max(SumPredEdges, SumSuccEdges). 195 void recalculateBBCounts(BinaryFunction &BF, bool AllEdges) { 196 for (BinaryBasicBlock &BB : BF) { 197 uint64_t TotalPredsEWeight = 0; 198 for (BinaryBasicBlock *Pred : BB.predecessors()) 199 TotalPredsEWeight += Pred->getBranchInfo(BB).Count; 200 201 if (TotalPredsEWeight > BB.getExecutionCount()) 202 BB.setExecutionCount(TotalPredsEWeight); 203 204 if (!AllEdges) 205 continue; 206 207 uint64_t TotalSuccsEWeight = 0; 208 for (BinaryBasicBlock::BinaryBranchInfo &BI : BB.branch_info()) 209 TotalSuccsEWeight += BI.Count; 210 211 if (TotalSuccsEWeight > BB.getExecutionCount()) 212 BB.setExecutionCount(TotalSuccsEWeight); 213 } 214 } 215 216 // This is our main edge count guessing heuristic. Look at predecessors and 217 // assign a proportionally higher count to pred edges coming from blocks with 218 // a higher execution count in comparison with the other predecessor blocks, 219 // making SumPredEdges match the current BB count. 220 // If "UseSucc" is true, apply the same logic to successor edges as well. Since 221 // some successor edges may already have assigned a count, only update it if the 222 // new count is higher. 223 void guessEdgeByRelHotness(BinaryFunction &BF, bool UseSucc, 224 EdgeWeightMap &PredEdgeWeights, 225 EdgeWeightMap &SuccEdgeWeights) { 226 for (BinaryBasicBlock &BB : BF) { 227 for (BinaryBasicBlock *Pred : BB.predecessors()) { 228 double RelativeExec = PredEdgeWeights[std::make_pair(Pred, &BB)]; 229 RelativeExec *= BB.getExecutionCount(); 230 BinaryBasicBlock::BinaryBranchInfo &BI = Pred->getBranchInfo(BB); 231 if (static_cast<uint64_t>(RelativeExec) > BI.Count) 232 BI.Count = static_cast<uint64_t>(RelativeExec); 233 } 234 235 if (!UseSucc) 236 continue; 237 238 auto BI = BB.branch_info_begin(); 239 for (BinaryBasicBlock *Succ : BB.successors()) { 240 double RelativeExec = SuccEdgeWeights[std::make_pair(&BB, Succ)]; 241 RelativeExec *= BB.getExecutionCount(); 242 if (static_cast<uint64_t>(RelativeExec) > BI->Count) 243 BI->Count = static_cast<uint64_t>(RelativeExec); 244 ++BI; 245 } 246 } 247 } 248 249 using ArcSet = 250 DenseSet<std::pair<const BinaryBasicBlock *, const BinaryBasicBlock *>>; 251 252 /// Predecessor edges version of guessEdgeByIterativeApproach. GuessedArcs has 253 /// all edges we already established their count. Try to guess the count of 254 /// the remaining edge, if there is only one to guess, and return true if we 255 /// were able to guess. 256 bool guessPredEdgeCounts(BinaryBasicBlock *BB, ArcSet &GuessedArcs) { 257 if (BB->pred_size() == 0) 258 return false; 259 260 uint64_t TotalPredCount = 0; 261 unsigned NumGuessedEdges = 0; 262 for (BinaryBasicBlock *Pred : BB->predecessors()) { 263 if (GuessedArcs.count(std::make_pair(Pred, BB))) 264 ++NumGuessedEdges; 265 TotalPredCount += Pred->getBranchInfo(*BB).Count; 266 } 267 268 if (NumGuessedEdges != BB->pred_size() - 1) 269 return false; 270 271 int64_t Guessed = 272 static_cast<int64_t>(BB->getExecutionCount()) - TotalPredCount; 273 if (Guessed < 0) 274 Guessed = 0; 275 276 for (BinaryBasicBlock *Pred : BB->predecessors()) { 277 if (GuessedArcs.count(std::make_pair(Pred, BB))) 278 continue; 279 280 Pred->getBranchInfo(*BB).Count = Guessed; 281 return true; 282 } 283 llvm_unreachable("Expected unguessed arc"); 284 } 285 286 /// Successor edges version of guessEdgeByIterativeApproach. GuessedArcs has 287 /// all edges we already established their count. Try to guess the count of 288 /// the remaining edge, if there is only one to guess, and return true if we 289 /// were able to guess. 290 bool guessSuccEdgeCounts(BinaryBasicBlock *BB, ArcSet &GuessedArcs) { 291 if (BB->succ_size() == 0) 292 return false; 293 294 uint64_t TotalSuccCount = 0; 295 unsigned NumGuessedEdges = 0; 296 auto BI = BB->branch_info_begin(); 297 for (BinaryBasicBlock *Succ : BB->successors()) { 298 if (GuessedArcs.count(std::make_pair(BB, Succ))) 299 ++NumGuessedEdges; 300 TotalSuccCount += BI->Count; 301 ++BI; 302 } 303 304 if (NumGuessedEdges != BB->succ_size() - 1) 305 return false; 306 307 int64_t Guessed = 308 static_cast<int64_t>(BB->getExecutionCount()) - TotalSuccCount; 309 if (Guessed < 0) 310 Guessed = 0; 311 312 BI = BB->branch_info_begin(); 313 for (BinaryBasicBlock *Succ : BB->successors()) { 314 if (GuessedArcs.count(std::make_pair(BB, Succ))) { 315 ++BI; 316 continue; 317 } 318 319 BI->Count = Guessed; 320 GuessedArcs.insert(std::make_pair(BB, Succ)); 321 return true; 322 } 323 llvm_unreachable("Expected unguessed arc"); 324 } 325 326 /// Guess edge count whenever we have only one edge (pred or succ) left 327 /// to guess. Then make its count equal to BB count minus all other edge 328 /// counts we already know their count. Repeat this until there is no 329 /// change. 330 void guessEdgeByIterativeApproach(BinaryFunction &BF) { 331 ArcSet KnownArcs; 332 bool Changed = false; 333 334 do { 335 Changed = false; 336 for (BinaryBasicBlock &BB : BF) { 337 if (guessPredEdgeCounts(&BB, KnownArcs)) 338 Changed = true; 339 if (guessSuccEdgeCounts(&BB, KnownArcs)) 340 Changed = true; 341 } 342 } while (Changed); 343 344 // Guess count for non-inferred edges 345 for (BinaryBasicBlock &BB : BF) { 346 for (BinaryBasicBlock *Pred : BB.predecessors()) { 347 if (KnownArcs.count(std::make_pair(Pred, &BB))) 348 continue; 349 BinaryBasicBlock::BinaryBranchInfo &BI = Pred->getBranchInfo(BB); 350 BI.Count = 351 std::min(Pred->getExecutionCount(), BB.getExecutionCount()) / 2; 352 KnownArcs.insert(std::make_pair(Pred, &BB)); 353 } 354 auto BI = BB.branch_info_begin(); 355 for (BinaryBasicBlock *Succ : BB.successors()) { 356 if (KnownArcs.count(std::make_pair(&BB, Succ))) { 357 ++BI; 358 continue; 359 } 360 BI->Count = 361 std::min(BB.getExecutionCount(), Succ->getExecutionCount()) / 2; 362 KnownArcs.insert(std::make_pair(&BB, Succ)); 363 break; 364 } 365 } 366 } 367 368 /// Associate each basic block with the BinaryLoop object corresponding to the 369 /// innermost loop containing this block. 370 DenseMap<const BinaryBasicBlock *, const BinaryLoop *> 371 createLoopNestLevelMap(BinaryFunction &BF) { 372 DenseMap<const BinaryBasicBlock *, const BinaryLoop *> LoopNestLevel; 373 const BinaryLoopInfo &BLI = BF.getLoopInfo(); 374 375 for (BinaryBasicBlock &BB : BF) 376 LoopNestLevel[&BB] = BLI[&BB]; 377 378 return LoopNestLevel; 379 } 380 381 /// Implement the idea in "SamplePGO - The Power of Profile Guided Optimizations 382 /// without the Usability Burden" by Diego Novillo to make basic block counts 383 /// equal if we show that A dominates B, B post-dominates A and they are in the 384 /// same loop and same loop nesting level. 385 void equalizeBBCounts(BinaryFunction &BF) { 386 auto Info = DataflowInfoManager(BF, nullptr, nullptr); 387 DominatorAnalysis<false> &DA = Info.getDominatorAnalysis(); 388 DominatorAnalysis<true> &PDA = Info.getPostDominatorAnalysis(); 389 auto &InsnToBB = Info.getInsnToBBMap(); 390 // These analyses work at the instruction granularity, but we really only need 391 // basic block granularity here. So we'll use a set of visited edges to avoid 392 // revisiting the same BBs again and again. 393 DenseMap<const BinaryBasicBlock *, std::set<const BinaryBasicBlock *>> 394 Visited; 395 // Equivalence classes mapping. Each equivalence class is defined by the set 396 // of BBs that obeys the aforementioned properties. 397 DenseMap<const BinaryBasicBlock *, signed> BBsToEC; 398 std::vector<std::vector<BinaryBasicBlock *>> Classes; 399 400 BF.calculateLoopInfo(); 401 DenseMap<const BinaryBasicBlock *, const BinaryLoop *> LoopNestLevel = 402 createLoopNestLevelMap(BF); 403 404 for (BinaryBasicBlock &BB : BF) 405 BBsToEC[&BB] = -1; 406 407 for (BinaryBasicBlock &BB : BF) { 408 auto I = BB.begin(); 409 if (I == BB.end()) 410 continue; 411 412 DA.doForAllDominators(*I, [&](const MCInst &DomInst) { 413 BinaryBasicBlock *DomBB = InsnToBB[&DomInst]; 414 if (Visited[DomBB].count(&BB)) 415 return; 416 Visited[DomBB].insert(&BB); 417 if (!PDA.doesADominateB(*I, DomInst)) 418 return; 419 if (LoopNestLevel[&BB] != LoopNestLevel[DomBB]) 420 return; 421 if (BBsToEC[DomBB] == -1 && BBsToEC[&BB] == -1) { 422 BBsToEC[DomBB] = Classes.size(); 423 BBsToEC[&BB] = Classes.size(); 424 Classes.emplace_back(); 425 Classes.back().push_back(DomBB); 426 Classes.back().push_back(&BB); 427 return; 428 } 429 if (BBsToEC[DomBB] == -1) { 430 BBsToEC[DomBB] = BBsToEC[&BB]; 431 Classes[BBsToEC[&BB]].push_back(DomBB); 432 return; 433 } 434 if (BBsToEC[&BB] == -1) { 435 BBsToEC[&BB] = BBsToEC[DomBB]; 436 Classes[BBsToEC[DomBB]].push_back(&BB); 437 return; 438 } 439 signed BBECNum = BBsToEC[&BB]; 440 std::vector<BinaryBasicBlock *> DomEC = Classes[BBsToEC[DomBB]]; 441 std::vector<BinaryBasicBlock *> BBEC = Classes[BBECNum]; 442 for (BinaryBasicBlock *Block : DomEC) { 443 BBsToEC[Block] = BBECNum; 444 BBEC.push_back(Block); 445 } 446 DomEC.clear(); 447 }); 448 } 449 450 for (std::vector<BinaryBasicBlock *> &Class : Classes) { 451 uint64_t Max = 0ULL; 452 for (BinaryBasicBlock *BB : Class) 453 Max = std::max(Max, BB->getExecutionCount()); 454 for (BinaryBasicBlock *BB : Class) 455 BB->setExecutionCount(Max); 456 } 457 } 458 459 } // end anonymous namespace 460 461 void estimateEdgeCounts(BinaryFunction &BF) { 462 EdgeWeightMap PredEdgeWeights; 463 EdgeWeightMap SuccEdgeWeights; 464 if (!opts::IterativeGuess) { 465 computeEdgeWeights<Inverse<BinaryBasicBlock *>>(BF, PredEdgeWeights); 466 computeEdgeWeights<BinaryBasicBlock *>(BF, SuccEdgeWeights); 467 } 468 if (opts::EqualizeBBCounts) { 469 LLVM_DEBUG(BF.print(dbgs(), "before equalize BB counts", true)); 470 equalizeBBCounts(BF); 471 LLVM_DEBUG(BF.print(dbgs(), "after equalize BB counts", true)); 472 } 473 if (opts::IterativeGuess) 474 guessEdgeByIterativeApproach(BF); 475 else 476 guessEdgeByRelHotness(BF, /*UseSuccs=*/false, PredEdgeWeights, 477 SuccEdgeWeights); 478 recalculateBBCounts(BF, /*AllEdges=*/false); 479 } 480 481 void solveMCF(BinaryFunction &BF, MCFCostFunction CostFunction) { 482 llvm_unreachable("not implemented"); 483 } 484 485 } // namespace bolt 486 } // namespace llvm 487