//===- bolt/Passes/BinaryPasses.cpp - Binary-level passes -----------------===//
//
// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
// See https://llvm.org/LICENSE.txt for license information.
// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
//
//===----------------------------------------------------------------------===//
//
// This file implements multiple passes for binary optimization and analysis.
//
//===----------------------------------------------------------------------===//

#include "bolt/Passes/BinaryPasses.h"
#include "bolt/Core/ParallelUtilities.h"
#include "bolt/Passes/ReorderAlgorithm.h"
#include "bolt/Passes/ReorderFunctions.h"
#include "llvm/Support/CommandLine.h"

#include <numeric>
#include <vector>

#define DEBUG_TYPE "bolt-opts"

using namespace llvm;
using namespace bolt;

namespace {

/// Returns the name to use for the command-line value of dyno-stats category
/// \p C: "none" for the first sentinel category, "all" for the last, and
/// otherwise the category's description with spaces replaced by dashes.
/// The result points into a function-local static buffer so it stays valid
/// for the lifetime of the program (required by the cl::values() machinery
/// that stores the returned pointer).
const char *dynoStatsOptName(const bolt::DynoStats::Category C) {
  if (C == bolt::DynoStats::FIRST_DYNO_STAT)
    return "none";
  else if (C == bolt::DynoStats::LAST_DYNO_STAT)
    return "all";

  static std::string OptNames[bolt::DynoStats::LAST_DYNO_STAT + 1];

  OptNames[C] = bolt::DynoStats::Description(C);

  std::replace(OptNames[C].begin(), OptNames[C].end(), ' ', '-');

  return OptNames[C].c_str();
}

/// Returns the help-text description for dyno-stats category \p C; the two
/// sentinel categories get special descriptions.
const char *dynoStatsOptDesc(const bolt::DynoStats::Category C) {
  if (C == bolt::DynoStats::FIRST_DYNO_STAT)
    return "unsorted";
  else if (C == bolt::DynoStats::LAST_DYNO_STAT)
    return "sorted by all stats";

  return bolt::DynoStats::Description(C);
}

} // anonymous namespace

namespace opts {

// Option categories and options defined elsewhere in BOLT.
extern cl::OptionCategory BoltCategory;
extern cl::OptionCategory BoltOptCategory;

extern cl::opt<bolt::MacroFusionType> AlignMacroOpFusion;
extern cl::opt<unsigned> Verbosity;
extern cl::opt<bool> EnableBAT;
extern cl::opt<unsigned> ExecutionCountThreshold;
extern cl::opt<bool>
UpdateDebugSections;
extern cl::opt<bolt::ReorderFunctions::ReorderType> ReorderFunctions;

// Direction used when printing functions sorted by dyno stats.
enum DynoStatsSortOrder : char {
  Ascending,
  Descending
};

static cl::opt<DynoStatsSortOrder>
DynoStatsSortOrderOpt("print-sorted-by-order",
  cl::desc("use ascending or descending order when printing functions "
           "ordered by dyno stats"),
  cl::ZeroOrMore,
  cl::init(DynoStatsSortOrder::Descending),
  cl::cat(BoltOptCategory));

cl::list<std::string>
HotTextMoveSections("hot-text-move-sections",
  cl::desc("list of sections containing functions used for hugifying hot text. "
           "BOLT makes sure these functions are not placed on the same page as "
           "the hot text. (default=\'.stub,.mover\')."),
  cl::value_desc("sec1,sec2,sec3,..."),
  cl::CommaSeparated,
  cl::ZeroOrMore,
  cl::cat(BoltCategory));

/// Returns true if \p Function originates from one of the sections listed in
/// the -hot-text-move-sections option (matched by exact section name).
bool isHotTextMover(const BinaryFunction &Function) {
  for (std::string &SectionName : opts::HotTextMoveSections) {
    if (Function.getOriginSectionName() &&
        *Function.getOriginSectionName() == SectionName)
      return true;
  }

  return false;
}

static cl::opt<bool>
MinBranchClusters("min-branch-clusters",
  cl::desc("use a modified clustering algorithm geared towards minimizing "
           "branches"),
  cl::ZeroOrMore,
  cl::Hidden,
  cl::cat(BoltOptCategory));

// Bit flags selecting individual peephole optimizations; combinable via the
// comma-separated -peepholes option below.
enum PeepholeOpts : char {
  PEEP_NONE = 0x0,
  PEEP_DOUBLE_JUMPS = 0x2,
  PEEP_TAILCALL_TRAPS = 0x4,
  PEEP_USELESS_BRANCHES = 0x8,
  PEEP_ALL = 0xf
};

static cl::list<PeepholeOpts>
Peepholes("peepholes",
  cl::CommaSeparated,
  cl::desc("enable peephole optimizations"),
  cl::value_desc("opt1,opt2,opt3,..."),
  cl::values(
    clEnumValN(PEEP_NONE, "none", "disable peepholes"),
    clEnumValN(PEEP_DOUBLE_JUMPS, "double-jumps",
               "remove double jumps when able"),
    clEnumValN(PEEP_TAILCALL_TRAPS, "tailcall-traps", "insert tail call traps"),
    clEnumValN(PEEP_USELESS_BRANCHES, "useless-branches",
               "remove useless conditional branches"),
    clEnumValN(PEEP_ALL, "all", "enable all peephole optimizations")),
  cl::ZeroOrMore,
  cl::cat(BoltOptCategory));

static cl::opt<unsigned>
PrintFuncStat("print-function-statistics",
  cl::desc("print statistics about basic block ordering"),
  cl::init(0),
  cl::ZeroOrMore,
  cl::cat(BoltOptCategory));

// One enumerator per dyno-stats category, generated by expanding the
// DYNO_STATS X-macro; names/descriptions come from the helpers above.
static cl::list<bolt::DynoStats::Category>
PrintSortedBy("print-sorted-by",
  cl::CommaSeparated,
  cl::desc("print functions sorted by order of dyno stats"),
  cl::value_desc("key1,key2,key3,..."),
  cl::values(
#define D(name, ...)                                    \
  clEnumValN(bolt::DynoStats::name,                     \
             dynoStatsOptName(bolt::DynoStats::name),   \
             dynoStatsOptDesc(bolt::DynoStats::name)),
  DYNO_STATS
#undef D
  clEnumValN(0xffff, ".", ".")
  ),
  cl::ZeroOrMore,
  cl::cat(BoltOptCategory));

static cl::opt<bool>
PrintUnknown("print-unknown",
  cl::desc("print names of functions with unknown control flow"),
  cl::init(false),
  cl::ZeroOrMore,
  cl::cat(BoltCategory),
  cl::Hidden);

static cl::opt<bool>
PrintUnknownCFG("print-unknown-cfg",
  cl::desc("dump CFG of functions with unknown control flow"),
  cl::init(false),
  cl::ZeroOrMore,
  cl::cat(BoltCategory),
  cl::ReallyHidden);

// Note: "cache+" and "ext-tsp" are intentionally aliases for the same layout
// algorithm (LT_OPTIMIZE_EXT_TSP).
cl::opt<bolt::ReorderBasicBlocks::LayoutType> ReorderBlocks(
    "reorder-blocks", cl::desc("change layout of basic blocks in a function"),
    cl::init(bolt::ReorderBasicBlocks::LT_NONE),
    cl::values(
        clEnumValN(bolt::ReorderBasicBlocks::LT_NONE, "none",
                   "do not reorder basic blocks"),
        clEnumValN(bolt::ReorderBasicBlocks::LT_REVERSE, "reverse",
                   "layout blocks in reverse order"),
        clEnumValN(bolt::ReorderBasicBlocks::LT_OPTIMIZE, "normal",
                   "perform optimal layout based on profile"),
        clEnumValN(bolt::ReorderBasicBlocks::LT_OPTIMIZE_BRANCH,
                   "branch-predictor",
                   "perform optimal layout prioritizing branch "
                   "predictions"),
        clEnumValN(bolt::ReorderBasicBlocks::LT_OPTIMIZE_CACHE, "cache",
                   "perform optimal layout prioritizing I-cache "
                   "behavior"),
        clEnumValN(bolt::ReorderBasicBlocks::LT_OPTIMIZE_EXT_TSP, "cache+",
                   "perform layout optimizing I-cache behavior"),
        clEnumValN(bolt::ReorderBasicBlocks::LT_OPTIMIZE_EXT_TSP, "ext-tsp",
                   "perform layout optimizing I-cache behavior"),
        clEnumValN(bolt::ReorderBasicBlocks::LT_OPTIMIZE_SHUFFLE,
                   "cluster-shuffle", "perform random layout of clusters")),
    cl::ZeroOrMore, cl::cat(BoltOptCategory));

static cl::opt<unsigned>
ReportBadLayout("report-bad-layout",
  cl::desc("print top <uint> functions with suboptimal code layout on input"),
  cl::init(0),
  cl::ZeroOrMore,
  cl::Hidden,
  cl::cat(BoltOptCategory));

static cl::opt<bool>
ReportStaleFuncs("report-stale",
  cl::desc("print the list of functions with stale profile"),
  cl::init(false),
  cl::ZeroOrMore,
  cl::Hidden,
  cl::cat(BoltOptCategory));

// Policy for SimplifyConditionalTailCalls; see shouldRewriteBranch() below.
enum SctcModes : char {
  SctcAlways,
  SctcPreserveDirection,
  SctcHeuristic
};

static cl::opt<SctcModes>
SctcMode("sctc-mode",
  cl::desc("mode for simplify conditional tail calls"),
  cl::init(SctcAlways),
  cl::values(clEnumValN(SctcAlways, "always", "always perform sctc"),
             clEnumValN(SctcPreserveDirection,
                        "preserve",
                        "only perform sctc when branch direction is "
                        "preserved"),
             clEnumValN(SctcHeuristic,
                        "heuristic",
                        "use branch prediction data to control sctc")),
  cl::ZeroOrMore,
  cl::cat(BoltOptCategory));

static cl::opt<unsigned>
StaleThreshold("stale-threshold",
  cl::desc(
      "maximum percentage of stale functions to tolerate (default: 100)"),
  cl::init(100),
  cl::Hidden,
  cl::cat(BoltOptCategory));

static cl::opt<unsigned>
TSPThreshold("tsp-threshold",
  cl::desc("maximum number of hot basic blocks in a function for which to use "
           "a precise TSP solution while re-ordering basic blocks"),
  cl::init(10),
  cl::ZeroOrMore,
  cl::Hidden,
  cl::cat(BoltOptCategory));

static cl::opt<unsigned>
TopCalledLimit("top-called-limit",
  cl::desc("maximum number of functions to print in top called "
           "functions section"),
  cl::init(100),
  cl::ZeroOrMore,
  cl::Hidden,
  cl::cat(BoltCategory));

} // namespace opts

namespace llvm {
namespace bolt {

// Default pass policy: only optimize simple functions that have a CFG built
// and are not ignored.
bool BinaryFunctionPass::shouldOptimize(const BinaryFunction &BF) const {
  return BF.isSimple() && BF.getState() == BinaryFunction::State::CFG &&
         !BF.isIgnored();
}

bool BinaryFunctionPass::shouldPrint(const BinaryFunction &BF) const {
  return BF.isSimple() && !BF.isIgnored();
}

// Removes empty blocks (redirecting their predecessors to the unique
// successor) and counts duplicate conditional edges, then lets fixBranches()
// repair the branch instructions.
void NormalizeCFG::runOnFunction(BinaryFunction &BF) {
  uint64_t NumRemoved = 0;
  uint64_t NumDuplicateEdges = 0;
  uint64_t NeedsFixBranches = 0;
  for (BinaryBasicBlock &BB : BF) {
    if (!BB.empty())
      continue;

    // Entry points and landing pads are referenced externally; keep them
    // even when empty.
    if (BB.isEntryPoint() || BB.isLandingPad())
      continue;

    // Handle a dangling empty block.
    if (BB.succ_size() == 0) {
      // If an empty dangling basic block has a predecessor, it could be a
      // result of codegen for __builtin_unreachable. In such case, do not
      // remove the block.
      if (BB.pred_size() == 0) {
        BB.markValid(false);
        ++NumRemoved;
      }
      continue;
    }

    // The block should have just one successor.
    BinaryBasicBlock *Successor = BB.getSuccessor();
    assert(Successor && "invalid CFG encountered");

    // Redirect all predecessors to the successor block.
    while (!BB.pred_empty()) {
      BinaryBasicBlock *Predecessor = *BB.pred_begin();
      // Cannot safely retarget a jump-table edge; leave this predecessor.
      if (Predecessor->hasJumpTable())
        break;

      if (Predecessor == Successor)
        break;

      BinaryBasicBlock::BinaryBranchInfo &BI = Predecessor->getBranchInfo(BB);
      Predecessor->replaceSuccessor(&BB, Successor, BI.Count,
                                    BI.MispredictedCount);
      // We need to fix branches even if we failed to replace all successors
      // and remove the block.
      NeedsFixBranches = true;
    }

    // Only drop the block once no predecessor still refers to it.
    if (BB.pred_empty()) {
      BB.removeAllSuccessors();
      BB.markValid(false);
      ++NumRemoved;
    }
  }

  if (NumRemoved)
    BF.eraseInvalidBBs();

  // Check for duplicate successors. Do it after the empty block elimination as
  // we can get more duplicate successors.
  for (BinaryBasicBlock &BB : BF)
    if (!BB.hasJumpTable() && BB.succ_size() == 2 &&
        BB.getConditionalSuccessor(false) == BB.getConditionalSuccessor(true))
      ++NumDuplicateEdges;

  // fixBranches() will get rid of duplicate edges and update jump instructions.
  if (NumDuplicateEdges || NeedsFixBranches)
    BF.fixBranches();

  NumDuplicateEdgesMerged += NumDuplicateEdges;
  NumBlocksRemoved += NumRemoved;
}

// Runs the per-function normalization in parallel and reports totals.
void NormalizeCFG::runOnFunctions(BinaryContext &BC) {
  ParallelUtilities::runOnEachFunction(
      BC, ParallelUtilities::SchedulingPolicy::SP_BB_LINEAR,
      [&](BinaryFunction &BF) { runOnFunction(BF); },
      [&](const BinaryFunction &BF) { return !shouldOptimize(BF); },
      "NormalizeCFG");
  if (NumBlocksRemoved)
    outs() << "BOLT-INFO: removed " << NumBlocksRemoved << " empty block"
           << (NumBlocksRemoved == 1 ? "" : "s") << '\n';
  if (NumDuplicateEdgesMerged)
    outs() << "BOLT-INFO: merged " << NumDuplicateEdgesMerged
           << " duplicate CFG edge" << (NumDuplicateEdgesMerged == 1 ?
                                       "" : "s")
           << '\n';
}

// Unreachable-code elimination for one function: marks unreachable blocks
// and erases them, accumulating per-pass statistics.
void EliminateUnreachableBlocks::runOnFunction(BinaryFunction &Function) {
  if (Function.layout_size() > 0) {
    unsigned Count;
    uint64_t Bytes;
    Function.markUnreachableBlocks();
    LLVM_DEBUG({
      for (BinaryBasicBlock *BB : Function.layout()) {
        if (!BB->isValid()) {
          dbgs() << "BOLT-INFO: UCE found unreachable block " << BB->getName()
                 << " in function " << Function << "\n";
          Function.dump();
        }
      }
    });
    std::tie(Count, Bytes) = Function.eraseInvalidBBs();
    DeletedBlocks += Count;
    DeletedBytes += Bytes;
    if (Count) {
      Modified.insert(&Function);
      if (opts::Verbosity > 0)
        outs() << "BOLT-INFO: Removed " << Count
               << " dead basic block(s) accounting for " << Bytes
               << " bytes in function " << Function << '\n';
    }
  }
}

void EliminateUnreachableBlocks::runOnFunctions(BinaryContext &BC) {
  for (auto &It : BC.getBinaryFunctions()) {
    BinaryFunction &Function = It.second;
    if (shouldOptimize(Function))
      runOnFunction(Function);
  }

  outs() << "BOLT-INFO: UCE removed " << DeletedBlocks << " blocks and "
         << DeletedBytes << " bytes of code.\n";
}

bool ReorderBasicBlocks::shouldPrint(const BinaryFunction &BF) const {
  return (BinaryFunctionPass::shouldPrint(BF) &&
          opts::ReorderBlocks != ReorderBasicBlocks::LT_NONE);
}

bool ReorderBasicBlocks::shouldOptimize(const BinaryFunction &BF) const {
  // Apply execution count threshold
  if (BF.getKnownExecutionCount() < opts::ExecutionCountThreshold)
    return false;

  return BinaryFunctionPass::shouldOptimize(BF);
}

// Reorders basic blocks of every eligible function (in parallel) according
// to the -reorder-blocks layout type, then optionally prints per-function
// statistics for the top -print-function-statistics functions.
void ReorderBasicBlocks::runOnFunctions(BinaryContext &BC) {
  if (opts::ReorderBlocks == ReorderBasicBlocks::LT_NONE)
    return;

  std::atomic<uint64_t> ModifiedFuncCount{0};

  ParallelUtilities::WorkFuncTy WorkFun = [&](BinaryFunction &BF) {
    modifyFunctionLayout(BF, opts::ReorderBlocks,
                         opts::MinBranchClusters);
    if (BF.hasLayoutChanged())
      ++ModifiedFuncCount;
  };

  ParallelUtilities::PredicateTy SkipFunc = [&](const BinaryFunction &BF) {
    return !shouldOptimize(BF);
  };

  ParallelUtilities::runOnEachFunction(
      BC, ParallelUtilities::SchedulingPolicy::SP_BB_LINEAR, WorkFun, SkipFunc,
      "ReorderBasicBlocks");

  outs() << "BOLT-INFO: basic block reordering modified layout of "
         << format("%zu (%.2lf%%) functions\n", ModifiedFuncCount.load(),
                   100.0 * ModifiedFuncCount.load() /
                       BC.getBinaryFunctions().size());

  if (opts::PrintFuncStat > 0) {
    raw_ostream &OS = outs();
    // Copy all the values into vector in order to sort them
    std::map<uint64_t, BinaryFunction &> ScoreMap;
    auto &BFs = BC.getBinaryFunctions();
    for (auto It = BFs.begin(); It != BFs.end(); ++It)
      ScoreMap.insert(std::pair<uint64_t, BinaryFunction &>(
          It->second.getFunctionScore(), It->second));

    OS << "\nBOLT-INFO: Printing Function Statistics:\n\n";
    OS << " There are " << BFs.size() << " functions in total. \n";
    OS << " Number of functions being modified: " << ModifiedFuncCount.load()
       << "\n";
    OS << " User asks for detailed information on top " << opts::PrintFuncStat
       << " functions. (Ranked by function score)"
       << "\n\n";
    uint64_t I = 0;
    // Walk the score map in reverse (highest score first), limited to the
    // top -print-function-statistics entries.
    for (std::map<uint64_t, BinaryFunction &>::reverse_iterator Rit =
             ScoreMap.rbegin();
         Rit != ScoreMap.rend() && I < opts::PrintFuncStat; ++Rit, ++I) {
      BinaryFunction &Function = Rit->second;

      OS << " Information for function of top: " << (I + 1) << ": \n";
      OS << " Function Score is: " << Function.getFunctionScore()
         << "\n";
      OS << " There are " << Function.size()
         << " number of blocks in this function.\n";
      OS << " There are " << Function.getInstructionCount()
         << " number of instructions in this function.\n";
      OS << " The edit distance for this function is: "
         << Function.getEditDistance() << "\n\n";
    }
  }
}

// Computes and applies a new basic-block layout for \p BF using the reorder
// algorithm selected by \p Type; \p MinBranchClusters selects the clustering
// heuristic geared towards minimizing branches.
void ReorderBasicBlocks::modifyFunctionLayout(BinaryFunction &BF,
                                              LayoutType Type,
                                              bool MinBranchClusters) const {
  if (BF.size() == 0 || Type == LT_NONE)
    return;

  BinaryFunction::BasicBlockOrderType NewLayout;
  std::unique_ptr<ReorderAlgorithm> Algo;

  // Cannot do optimal layout without profile.
  if (Type != LT_REVERSE && !BF.hasValidProfile())
    return;

  if (Type == LT_REVERSE) {
    Algo.reset(new ReverseReorderAlgorithm());
  } else if (BF.size() <= opts::TSPThreshold && Type != LT_OPTIMIZE_SHUFFLE) {
    // Work on optimal solution if problem is small enough
    LLVM_DEBUG(dbgs() << "finding optimal block layout for " << BF << "\n");
    Algo.reset(new TSPReorderAlgorithm());
  } else {
    LLVM_DEBUG(dbgs() << "running block layout heuristics on " << BF << "\n");

    std::unique_ptr<ClusterAlgorithm> CAlgo;
    if (MinBranchClusters)
      CAlgo.reset(new MinBranchGreedyClusterAlgorithm());
    else
      CAlgo.reset(new PHGreedyClusterAlgorithm());

    switch (Type) {
    case LT_OPTIMIZE:
      Algo.reset(new OptimizeReorderAlgorithm(std::move(CAlgo)));
      break;

    case LT_OPTIMIZE_BRANCH:
      Algo.reset(new OptimizeBranchReorderAlgorithm(std::move(CAlgo)));
      break;

    case LT_OPTIMIZE_CACHE:
      Algo.reset(new OptimizeCacheReorderAlgorithm(std::move(CAlgo)));
      break;

    case LT_OPTIMIZE_EXT_TSP:
      // ext-tsp does its own clustering; CAlgo is unused here.
      Algo.reset(new ExtTSPReorderAlgorithm());
      break;

    case LT_OPTIMIZE_SHUFFLE:
      Algo.reset(new RandomClusterReorderAlgorithm(std::move(CAlgo)));
      break;

    default:
      llvm_unreachable("unexpected layout type");
    }
  }

  Algo->reorderBasicBlocks(BF, NewLayout);

  BF.updateBasicBlockLayout(NewLayout);
}

// Re-synchronizes branch instructions with the CFG for every function that
// will be emitted.
void FixupBranches::runOnFunctions(BinaryContext &BC) {
  for (auto &It : BC.getBinaryFunctions()) {
    BinaryFunction &Function = It.second;
    if (!BC.shouldEmit(Function) || !Function.isSimple())
      continue;

    Function.fixBranches();
  }
}

// Finalizes CFI state and exception ranges for every function that will be
// emitted. In relocation mode a CFI failure is fatal; otherwise the function
// is demoted to non-simple and left as-is in the output.
void FinalizeFunctions::runOnFunctions(BinaryContext &BC) {
  ParallelUtilities::WorkFuncTy WorkFun = [&](BinaryFunction &BF) {
    if (!BF.finalizeCFIState()) {
      if (BC.HasRelocations) {
        errs() << "BOLT-ERROR: unable to fix CFI state for function " << BF
               << ". Exiting.\n";
        exit(1);
      }
      BF.setSimple(false);
      return;
    }

    BF.setFinalized();

    // Update exception handling information.
    BF.updateEHRanges();
  };

  ParallelUtilities::PredicateTy SkipPredicate = [&](const BinaryFunction &BF) {
    return !BC.shouldEmit(BF);
  };

  ParallelUtilities::runOnEachFunction(
      BC, ParallelUtilities::SchedulingPolicy::SP_CONSTANT, WorkFun,
      SkipPredicate, "FinalizeFunctions");
}

// In non-relocation mode with debug-info updates enabled, pre-computes the
// emitted size of each function and demotes those that would overflow their
// original slot, so debug info is not emitted for truncated code.
void CheckLargeFunctions::runOnFunctions(BinaryContext &BC) {
  if (BC.HasRelocations)
    return;

  if (!opts::UpdateDebugSections)
    return;

  // If the function wouldn't fit, mark it as non-simple. Otherwise, we may emit
  // incorrect debug info.
  ParallelUtilities::WorkFuncTy WorkFun = [&](BinaryFunction &BF) {
    uint64_t HotSize, ColdSize;
    std::tie(HotSize, ColdSize) =
        BC.calculateEmittedSize(BF, /*FixBranches=*/false);
    if (HotSize > BF.getMaxSize())
      BF.setSimple(false);
  };

  ParallelUtilities::PredicateTy SkipFunc = [&](const BinaryFunction &BF) {
    return !shouldOptimize(BF);
  };

  ParallelUtilities::runOnEachFunction(
      BC, ParallelUtilities::SchedulingPolicy::SP_INST_LINEAR, WorkFun,
      SkipFunc, "CheckLargeFunctions");
}

bool CheckLargeFunctions::shouldOptimize(const BinaryFunction &BF) const {
  // Unlike other passes, allow functions in non-CFG state.
  return BF.isSimple() && !BF.isIgnored();
}

// Strips all MCInst annotations before emission, first converting
// GnuArgsSize annotations into CFI instructions and saving "Offset"
// annotations that address translation still needs.
void LowerAnnotations::runOnFunctions(BinaryContext &BC) {
  std::vector<std::pair<MCInst *, uint32_t>> PreservedOffsetAnnotations;

  for (auto &It : BC.getBinaryFunctions()) {
    BinaryFunction &BF = It.second;
    int64_t CurrentGnuArgsSize = 0;

    // Have we crossed hot/cold border for split functions?
    bool SeenCold = false;

    for (BinaryBasicBlock *BB : BF.layout()) {
      // The cold fragment starts with a fresh CFI state, so reset the
      // tracked GNU_args_size at the hot/cold boundary.
      if (BB->isCold() && !SeenCold) {
        SeenCold = true;
        CurrentGnuArgsSize = 0;
      }

      // First convert GnuArgsSize annotations into CFIs. This may change instr
      // pointers, so do it before recording ptrs for preserved annotations
      if (BF.usesGnuArgsSize()) {
        for (auto II = BB->begin(); II != BB->end(); ++II) {
          if (!BC.MIB->isInvoke(*II))
            continue;
          const int64_t NewGnuArgsSize = BC.MIB->getGnuArgsSize(*II);
          assert(NewGnuArgsSize >= 0 && "expected non-negative GNU_args_size");
          // Only emit a CFI when the value actually changes.
          if (NewGnuArgsSize != CurrentGnuArgsSize) {
            auto InsertII = BF.addCFIInstruction(
                BB, II,
                MCCFIInstruction::createGnuArgsSize(nullptr, NewGnuArgsSize));
            CurrentGnuArgsSize = NewGnuArgsSize;
            II = std::next(InsertII);
          }
        }
      }

      // Now record preserved annotations separately and then strip annotations.
      for (auto II = BB->begin(); II != BB->end(); ++II) {
        if (BF.requiresAddressTranslation() &&
            BC.MIB->hasAnnotation(*II, "Offset"))
          PreservedOffsetAnnotations.emplace_back(
              &(*II), BC.MIB->getAnnotationAs<uint32_t>(*II, "Offset"));
        BC.MIB->stripAnnotations(*II);
      }
    }
  }
  // Injected functions carry annotations too; strip them as well.
  for (BinaryFunction *BF : BC.getInjectedBinaryFunctions())
    for (BinaryBasicBlock &BB : *BF)
      for (MCInst &Instruction : BB)
        BC.MIB->stripAnnotations(Instruction);

  // Release all memory taken by annotations
  BC.MIB->freeAnnotations();

  // Reinsert preserved annotations we need during code emission.
  for (const std::pair<MCInst *, uint32_t> &Item : PreservedOffsetAnnotations)
    BC.MIB->addAnnotation<uint32_t>(*Item.first, "Offset", Item.second);
}

namespace {

// This peephole fixes jump instructions that jump to another basic
// block with a single jump instruction, e.g.
//
// B0: ...
//   jmp B1 (or jcc B1)
//
// B1: jmp B2
//
// ->
//
// B0: ...
//   jmp B2 (or jcc B2)
//
// Returns the number of double jumps removed. When \p MarkInvalid is set,
// blocks left without predecessors are marked invalid so a later
// eraseInvalidBBs() can delete them.
uint64_t fixDoubleJumps(BinaryFunction &Function, bool MarkInvalid) {
  uint64_t NumDoubleJumps = 0;

  MCContext *Ctx = Function.getBinaryContext().Ctx.get();
  MCPlusBuilder *MIB = Function.getBinaryContext().MIB.get();
  for (BinaryBasicBlock &BB : Function) {
    // Attempts to retarget \p Pred's edge into BB directly to \p Succ (or,
    // for the tail-call case, to the symbol \p SuccSym). Returns true on
    // success.
    auto checkAndPatch = [&](BinaryBasicBlock *Pred, BinaryBasicBlock *Succ,
                             const MCSymbol *SuccSym) {
      // Ignore infinite loop jumps or fallthrough tail jumps.
      if (Pred == Succ || Succ == &BB)
        return false;

      if (Succ) {
        const MCSymbol *TBB = nullptr;
        const MCSymbol *FBB = nullptr;
        MCInst *CondBranch = nullptr;
        MCInst *UncondBranch = nullptr;
        bool Res = Pred->analyzeBranch(TBB, FBB, CondBranch, UncondBranch);
        if (!Res) {
          LLVM_DEBUG(dbgs() << "analyzeBranch failed in peepholes in block:\n";
                     Pred->dump());
          return false;
        }
        Pred->replaceSuccessor(&BB, Succ);

        // We must patch up any existing branch instructions to match up
        // with the new successor.
        assert((CondBranch || (!CondBranch && Pred->succ_size() == 1)) &&
               "Predecessor block has inconsistent number of successors");
        if (CondBranch && MIB->getTargetSymbol(*CondBranch) == BB.getLabel()) {
          MIB->replaceBranchTarget(*CondBranch, Succ->getLabel(), Ctx);
        } else if (UncondBranch &&
                   MIB->getTargetSymbol(*UncondBranch) == BB.getLabel()) {
          MIB->replaceBranchTarget(*UncondBranch, Succ->getLabel(), Ctx);
        } else if (!UncondBranch) {
          assert(Function.getBasicBlockAfter(Pred, false) != Succ &&
                 "Don't add an explicit jump to a fallthrough block.");
          Pred->addBranchInstruction(Succ);
        }
      } else {
        // Succ will be null in the tail call case. In this case we
        // need to explicitly add a tail call instruction.
        MCInst *Branch = Pred->getLastNonPseudoInstr();
        if (Branch && MIB->isUnconditionalBranch(*Branch)) {
          assert(MIB->getTargetSymbol(*Branch) == BB.getLabel());
          Pred->removeSuccessor(&BB);
          Pred->eraseInstruction(Pred->findInstruction(Branch));
          Pred->addTailCallInstruction(SuccSym);
        } else {
          return false;
        }
      }

      ++NumDoubleJumps;
      LLVM_DEBUG(dbgs() << "Removed double jump in " << Function << " from "
                        << Pred->getName() << " -> " << BB.getName() << " to "
                        << Pred->getName() << " -> " << SuccSym->getName()
                        << (!Succ ? " (tail)\n" : "\n"));

      return true;
    };

    // Candidate middle block: exactly one real instruction, not a landing
    // pad.
    if (BB.getNumNonPseudos() != 1 || BB.isLandingPad())
      continue;

    MCInst *Inst = BB.getFirstNonPseudoInstr();
    const bool IsTailCall = MIB->isTailCall(*Inst);

    if (!MIB->isUnconditionalBranch(*Inst) && !IsTailCall)
      continue;

    // If we operate after SCTC make sure it's not a conditional tail call.
    if (IsTailCall && MIB->isConditionalBranch(*Inst))
      continue;

    const MCSymbol *SuccSym = MIB->getTargetSymbol(*Inst);
    BinaryBasicBlock *Succ = BB.getSuccessor();

    if (((!Succ || &BB == Succ) && !IsTailCall) || (IsTailCall && !SuccSym))
      continue;

    // Snapshot the predecessor list: checkAndPatch mutates BB's predecessors
    // while we iterate.
    std::vector<BinaryBasicBlock *> Preds = {BB.pred_begin(), BB.pred_end()};

    for (BinaryBasicBlock *Pred : Preds) {
      if (Pred->isLandingPad())
        continue;

      if (Pred->getSuccessor() == &BB ||
          (Pred->getConditionalSuccessor(true) == &BB && !IsTailCall) ||
          Pred->getConditionalSuccessor(false) == &BB)
        if (checkAndPatch(Pred, Succ, SuccSym) && MarkInvalid)
          BB.markValid(BB.pred_size() != 0 || BB.isLandingPad() ||
                       BB.isEntryPoint());
    }
  }

  return NumDoubleJumps;
}
} // namespace

// Decides whether the conditional branch in \p PredBB targeting \p BB may be
// rewritten into a conditional tail call, according to -sctc-mode and
// \p DirectionFlag (the direction the rewritten branch would take).
bool SimplifyConditionalTailCalls::shouldRewriteBranch(
    const BinaryBasicBlock *PredBB, const MCInst &CondBranch,
    const BinaryBasicBlock *BB, const bool DirectionFlag) {
  if
      (BeenOptimized.count(PredBB))
    return false;

  const bool IsForward = BinaryFunction::isForwardBranch(PredBB, BB);

  if (IsForward)
    ++NumOrigForwardBranches;
  else
    ++NumOrigBackwardBranches;

  if (opts::SctcMode == opts::SctcAlways)
    return true;

  if (opts::SctcMode == opts::SctcPreserveDirection)
    return IsForward == DirectionFlag;

  // Heuristic mode: consult the profile for this edge.
  const ErrorOr<std::pair<double, double>> Frequency =
      PredBB->getBranchStats(BB);

  // It's ok to rewrite the conditional branch if the new target will be
  // a backward branch.

  // If no data available for these branches, then it should be ok to
  // do the optimization since it will reduce code size.
  if (Frequency.getError())
    return true;

  // TODO: should this use misprediction frequency instead?
  const bool Result = (IsForward && Frequency.get().first >= 0.5) ||
                      (!IsForward && Frequency.get().first <= 0.5);

  return Result == DirectionFlag;
}

// Converts "jcc over an unconditional tail call" patterns into conditional
// tail calls within \p BF. Returns the number of conversions performed.
uint64_t SimplifyConditionalTailCalls::fixTailCalls(BinaryFunction &BF) {
  // Need updated indices to correctly detect branch direction.
  BF.updateLayoutIndices();
  BF.markUnreachableBlocks();

  MCPlusBuilder *MIB = BF.getBinaryContext().MIB.get();
  MCContext *Ctx = BF.getBinaryContext().Ctx.get();
  uint64_t NumLocalCTCCandidates = 0;
  uint64_t NumLocalCTCs = 0;
  uint64_t LocalCTCTakenCount = 0;
  uint64_t LocalCTCExecCount = 0;
  // Predecessors whose condition was reversed and therefore still need an
  // unconditional branch to the old fallthrough target (added after the
  // main loop).
  std::vector<std::pair<BinaryBasicBlock *, const BinaryBasicBlock *>>
      NeedsUncondBranch;

  // Will block be deleted by UCE?
  auto isValid = [](const BinaryBasicBlock *BB) {
    return (BB->pred_size() != 0 || BB->isLandingPad() || BB->isEntryPoint());
  };

  for (BinaryBasicBlock *BB : BF.layout()) {
    // Locate BB with a single direct tail-call instruction.
    if (BB->getNumNonPseudos() != 1)
      continue;

    MCInst *Instr = BB->getFirstNonPseudoInstr();
    if (!MIB->isTailCall(*Instr) || MIB->isConditionalBranch(*Instr))
      continue;

    const MCSymbol *CalleeSymbol = MIB->getTargetSymbol(*Instr);
    if (!CalleeSymbol)
      continue;

    // Detect direction of the possible conditional tail call.
    const bool IsForwardCTC = BF.isForwardCall(CalleeSymbol);

    // Iterate through all predecessors.
    for (BinaryBasicBlock *PredBB : BB->predecessors()) {
      BinaryBasicBlock *CondSucc = PredBB->getConditionalSuccessor(true);
      if (!CondSucc)
        continue;

      ++NumLocalCTCCandidates;

      const MCSymbol *TBB = nullptr;
      const MCSymbol *FBB = nullptr;
      MCInst *CondBranch = nullptr;
      MCInst *UncondBranch = nullptr;
      bool Result = PredBB->analyzeBranch(TBB, FBB, CondBranch, UncondBranch);

      // analyzeBranch() can fail due to unusual branch instructions, e.g. jrcxz
      if (!Result) {
        LLVM_DEBUG(dbgs() << "analyzeBranch failed in SCTC in block:\n";
                   PredBB->dump());
        continue;
      }

      // NOTE(review): this assert is unreachable — the `if (!Result)` above
      // already continued; consider removing it.
      assert(Result && "internal error analyzing conditional branch");
      assert(CondBranch && "conditional branch expected");

      // It's possible that PredBB is also a successor to BB that may have
      // been processed by a previous iteration of the SCTC loop, in which
      // case it may have been marked invalid. We should skip rewriting in
      // this case.
      if (!PredBB->isValid()) {
        assert(PredBB->isSuccessor(BB) &&
               "PredBB should be valid if it is not a successor to BB");
        continue;
      }

      // We don't want to reverse direction of the branch in new order
      // without further profile analysis.
      const bool DirectionFlag = CondSucc == BB ? IsForwardCTC : !IsForwardCTC;
      if (!shouldRewriteBranch(PredBB, *CondBranch, BB, DirectionFlag))
        continue;

      // Record this block so that we don't try to optimize it twice.
      BeenOptimized.insert(PredBB);

      uint64_t Count = 0;
      if (CondSucc != BB) {
        // Patch the new target address into the conditional branch.
        MIB->reverseBranchCondition(*CondBranch, CalleeSymbol, Ctx);
        // Since we reversed the condition on the branch we need to change
        // the target for the unconditional branch or add a unconditional
        // branch to the old target. This has to be done manually since
        // fixupBranches is not called after SCTC.
        NeedsUncondBranch.emplace_back(PredBB, CondSucc);
        Count = PredBB->getFallthroughBranchInfo().Count;
      } else {
        // Change destination of the conditional branch.
        MIB->replaceBranchTarget(*CondBranch, CalleeSymbol, Ctx);
        Count = PredBB->getTakenBranchInfo().Count;
      }
      const uint64_t CTCTakenFreq =
          Count == BinaryBasicBlock::COUNT_NO_PROFILE ? 0 : Count;

      // Annotate it, so "isCall" returns true for this jcc
      MIB->setConditionalTailCall(*CondBranch);
      // Add info about the conditional tail call frequency, otherwise this
      // info will be lost when we delete the associated BranchInfo entry
      auto &CTCAnnotation =
          MIB->getOrCreateAnnotationAs<uint64_t>(*CondBranch, "CTCTakenCount");
      CTCAnnotation = CTCTakenFreq;

      // Remove the unused successor which may be eliminated later
      // if there are no other users.
      PredBB->removeSuccessor(BB);
      // Update BB execution count: subtract the flow now diverted through
      // the conditional tail call, clamping at zero.
      if (CTCTakenFreq && CTCTakenFreq <= BB->getKnownExecutionCount())
        BB->setExecutionCount(BB->getExecutionCount() - CTCTakenFreq);
      else if (CTCTakenFreq > BB->getKnownExecutionCount())
        BB->setExecutionCount(0);

      ++NumLocalCTCs;
      LocalCTCTakenCount += CTCTakenFreq;
      LocalCTCExecCount += PredBB->getKnownExecutionCount();
    }

    // Remove the block from CFG if all predecessors were removed.
    BB->markValid(isValid(BB));
  }

  // Add unconditional branches at the end of BBs to new successors
  // as long as the successor is not a fallthrough.
  for (auto &Entry : NeedsUncondBranch) {
    BinaryBasicBlock *PredBB = Entry.first;
    const BinaryBasicBlock *CondSucc = Entry.second;

    const MCSymbol *TBB = nullptr;
    const MCSymbol *FBB = nullptr;
    MCInst *CondBranch = nullptr;
    MCInst *UncondBranch = nullptr;
    PredBB->analyzeBranch(TBB, FBB, CondBranch, UncondBranch);

    // Find the next valid block. Invalid blocks will be deleted
    // so they shouldn't be considered fallthrough targets.
    const BinaryBasicBlock *NextBlock = BF.getBasicBlockAfter(PredBB, false);
    while (NextBlock && !isValid(NextBlock))
      NextBlock = BF.getBasicBlockAfter(NextBlock, false);

    // Get the unconditional successor to this block.
    const BinaryBasicBlock *PredSucc = PredBB->getSuccessor();
    assert(PredSucc && "The other branch should be a tail call");

    const bool HasFallthrough = (NextBlock && PredSucc == NextBlock);

    if (UncondBranch) {
      if (HasFallthrough)
        PredBB->eraseInstruction(PredBB->findInstruction(UncondBranch));
      else
        MIB->replaceBranchTarget(*UncondBranch, CondSucc->getLabel(), Ctx);
    } else if (!HasFallthrough) {
      MCInst Branch;
      MIB->createUncondBranch(Branch, CondSucc->getLabel(), Ctx);
      PredBB->addInstruction(Branch);
    }
  }

  if (NumLocalCTCs > 0) {
    // Retargeting may have created double jumps; fix them and mark newly
    // unreachable blocks.
    NumDoubleJumps += fixDoubleJumps(BF, true);
    // Clean-up unreachable tail-call blocks.
    const std::pair<unsigned, uint64_t> Stats = BF.eraseInvalidBBs();
    DeletedBlocks += Stats.first;
    DeletedBytes += Stats.second;

    assert(BF.validateCFG());
  }

  LLVM_DEBUG(dbgs() << "BOLT: created " << NumLocalCTCs
                    << " conditional tail calls from a total of "
                    << NumLocalCTCCandidates << " candidates in function " << BF
                    << ". CTCs execution count for this function is "
                    << LocalCTCExecCount << " and CTC taken count is "
                    << LocalCTCTakenCount << "\n";);

  NumTailCallsPatched += NumLocalCTCs;
  NumCandidateTailCalls += NumLocalCTCCandidates;
  CTCExecCount += LocalCTCExecCount;
  CTCTakenCount += LocalCTCTakenCount;

  return NumLocalCTCs > 0;
}

// Runs SCTC over all eligible functions. The transformation is only
// implemented for X86; rewritten functions lose their canonical CFG since
// fixBranches() is not re-run afterwards.
void SimplifyConditionalTailCalls::runOnFunctions(BinaryContext &BC) {
  if (!BC.isX86())
    return;

  for (auto &It : BC.getBinaryFunctions()) {
    BinaryFunction &Function = It.second;

    if (!shouldOptimize(Function))
      continue;

    if (fixTailCalls(Function)) {
      Modified.insert(&Function);
      Function.setHasCanonicalCFG(false);
    }
  }

  outs() << "BOLT-INFO: SCTC: patched " << NumTailCallsPatched
         << " tail calls (" << NumOrigForwardBranches << " forward)"
         << " tail calls (" << NumOrigBackwardBranches << " backward)"
         << " from a total of " << NumCandidateTailCalls << " while removing "
         << NumDoubleJumps << " double jumps"
         << " and removing " << DeletedBlocks << " basic blocks"
         << " totalling " << DeletedBytes
         << " bytes of code. CTCs total execution count is " << CTCExecCount
         << " and the number of times CTCs are taken is " << CTCTakenCount
         << ".\n";
}

// Rewrites instructions in \p Function into shorter equivalent encodings
// where MCPlusBuilder supports it; returns the number of instructions
// shortened.
uint64_t ShortenInstructions::shortenInstructions(BinaryFunction &Function) {
  uint64_t Count = 0;
  const BinaryContext &BC = Function.getBinaryContext();
  for (BinaryBasicBlock &BB : Function) {
    for (MCInst &Inst : BB) {
      // Keep a copy only when we may need to print the before/after pair.
      MCInst OriginalInst;
      if (opts::Verbosity > 2)
        OriginalInst = Inst;

      if (!BC.MIB->shortenInstruction(Inst))
        continue;

      if (opts::Verbosity > 2) {
        outs() << "BOLT-INFO: shortening:\nBOLT-INFO: ";
        BC.printInstruction(outs(), OriginalInst, 0, &Function);
        outs() << "BOLT-INFO: to:";
        BC.printInstruction(outs(), Inst, 0, &Function);
      }

      ++Count;
    }
  }

  return Count;
}

// Parallel driver for shortenInstructions(); X86-only.
void ShortenInstructions::runOnFunctions(BinaryContext &BC) {
  std::atomic<uint64_t> NumShortened{0};
  if (!BC.isX86())
    return;

  ParallelUtilities::runOnEachFunction(
      BC, ParallelUtilities::SchedulingPolicy::SP_INST_LINEAR,
      [&](BinaryFunction &BF) { NumShortened += shortenInstructions(BF); },
      nullptr, "ShortenInstructions");

  outs() << "BOLT-INFO: " << NumShortened << " instructions were shortened\n";
}

// Appends a trap instruction after every indirect tail call so that a
// mis-taken branch faults instead of executing arbitrary bytes.
void Peepholes::addTailcallTraps(BinaryFunction &Function) {
  MCPlusBuilder *MIB = Function.getBinaryContext().MIB.get();
  for (BinaryBasicBlock &BB : Function) {
    MCInst *Inst = BB.getLastNonPseudoInstr();
    if (Inst && MIB->isTailCall(*Inst) && MIB->isIndirectBranch(*Inst)) {
      MCInst Trap;
      if (MIB->createTrap(Trap)) {
        BB.addInstruction(Trap);
        ++TailCallTraps;
      }
    }
  }
}

void Peepholes::removeUselessCondBranches(BinaryFunction &Function) {
  for (BinaryBasicBlock &BB : Function) {
    if (BB.succ_size() != 2)
      continue;

    BinaryBasicBlock *CondBB = BB.getConditionalSuccessor(true);
BinaryBasicBlock *UncondBB = BB.getConditionalSuccessor(false); 1076 if (CondBB != UncondBB) 1077 continue; 1078 1079 const MCSymbol *TBB = nullptr; 1080 const MCSymbol *FBB = nullptr; 1081 MCInst *CondBranch = nullptr; 1082 MCInst *UncondBranch = nullptr; 1083 bool Result = BB.analyzeBranch(TBB, FBB, CondBranch, UncondBranch); 1084 1085 // analyzeBranch() can fail due to unusual branch instructions, 1086 // e.g. jrcxz, or jump tables (indirect jump). 1087 if (!Result || !CondBranch) 1088 continue; 1089 1090 BB.removeDuplicateConditionalSuccessor(CondBranch); 1091 ++NumUselessCondBranches; 1092 } 1093 } 1094 1095 void Peepholes::runOnFunctions(BinaryContext &BC) { 1096 const char Opts = std::accumulate( 1097 opts::Peepholes.begin(), opts::Peepholes.end(), 0, 1098 [](const char A, const opts::PeepholeOpts B) { return A | B; }); 1099 if (Opts == opts::PEEP_NONE || !BC.isX86()) 1100 return; 1101 1102 for (auto &It : BC.getBinaryFunctions()) { 1103 BinaryFunction &Function = It.second; 1104 if (shouldOptimize(Function)) { 1105 if (Opts & opts::PEEP_DOUBLE_JUMPS) 1106 NumDoubleJumps += fixDoubleJumps(Function, false); 1107 if (Opts & opts::PEEP_TAILCALL_TRAPS) 1108 addTailcallTraps(Function); 1109 if (Opts & opts::PEEP_USELESS_BRANCHES) 1110 removeUselessCondBranches(Function); 1111 assert(Function.validateCFG()); 1112 } 1113 } 1114 outs() << "BOLT-INFO: Peephole: " << NumDoubleJumps 1115 << " double jumps patched.\n" 1116 << "BOLT-INFO: Peephole: " << TailCallTraps 1117 << " tail call traps inserted.\n" 1118 << "BOLT-INFO: Peephole: " << NumUselessCondBranches 1119 << " useless conditional branches removed.\n"; 1120 } 1121 1122 bool SimplifyRODataLoads::simplifyRODataLoads(BinaryFunction &BF) { 1123 BinaryContext &BC = BF.getBinaryContext(); 1124 MCPlusBuilder *MIB = BC.MIB.get(); 1125 1126 uint64_t NumLocalLoadsSimplified = 0; 1127 uint64_t NumDynamicLocalLoadsSimplified = 0; 1128 uint64_t NumLocalLoadsFound = 0; 1129 uint64_t NumDynamicLocalLoadsFound = 0; 1130 1131 
for (BinaryBasicBlock *BB : BF.layout()) { 1132 for (MCInst &Inst : *BB) { 1133 unsigned Opcode = Inst.getOpcode(); 1134 const MCInstrDesc &Desc = BC.MII->get(Opcode); 1135 1136 // Skip instructions that do not load from memory. 1137 if (!Desc.mayLoad()) 1138 continue; 1139 1140 // Try to statically evaluate the target memory address; 1141 uint64_t TargetAddress; 1142 1143 if (MIB->hasPCRelOperand(Inst)) { 1144 // Try to find the symbol that corresponds to the PC-relative operand. 1145 MCOperand *DispOpI = MIB->getMemOperandDisp(Inst); 1146 assert(DispOpI != Inst.end() && "expected PC-relative displacement"); 1147 assert(DispOpI->isExpr() && 1148 "found PC-relative with non-symbolic displacement"); 1149 1150 // Get displacement symbol. 1151 const MCSymbol *DisplSymbol; 1152 uint64_t DisplOffset; 1153 1154 std::tie(DisplSymbol, DisplOffset) = 1155 MIB->getTargetSymbolInfo(DispOpI->getExpr()); 1156 1157 if (!DisplSymbol) 1158 continue; 1159 1160 // Look up the symbol address in the global symbols map of the binary 1161 // context object. 1162 BinaryData *BD = BC.getBinaryDataByName(DisplSymbol->getName()); 1163 if (!BD) 1164 continue; 1165 TargetAddress = BD->getAddress() + DisplOffset; 1166 } else if (!MIB->evaluateMemOperandTarget(Inst, TargetAddress)) { 1167 continue; 1168 } 1169 1170 // Get the contents of the section containing the target address of the 1171 // memory operand. We are only interested in read-only sections. 
1172 ErrorOr<BinarySection &> DataSection = 1173 BC.getSectionForAddress(TargetAddress); 1174 if (!DataSection || !DataSection->isReadOnly()) 1175 continue; 1176 1177 if (BC.getRelocationAt(TargetAddress) || 1178 BC.getDynamicRelocationAt(TargetAddress)) 1179 continue; 1180 1181 uint32_t Offset = TargetAddress - DataSection->getAddress(); 1182 StringRef ConstantData = DataSection->getContents(); 1183 1184 ++NumLocalLoadsFound; 1185 if (BB->hasProfile()) 1186 NumDynamicLocalLoadsFound += BB->getExecutionCount(); 1187 1188 if (MIB->replaceMemOperandWithImm(Inst, ConstantData, Offset)) { 1189 ++NumLocalLoadsSimplified; 1190 if (BB->hasProfile()) 1191 NumDynamicLocalLoadsSimplified += BB->getExecutionCount(); 1192 } 1193 } 1194 } 1195 1196 NumLoadsFound += NumLocalLoadsFound; 1197 NumDynamicLoadsFound += NumDynamicLocalLoadsFound; 1198 NumLoadsSimplified += NumLocalLoadsSimplified; 1199 NumDynamicLoadsSimplified += NumDynamicLocalLoadsSimplified; 1200 1201 return NumLocalLoadsSimplified > 0; 1202 } 1203 1204 void SimplifyRODataLoads::runOnFunctions(BinaryContext &BC) { 1205 for (auto &It : BC.getBinaryFunctions()) { 1206 BinaryFunction &Function = It.second; 1207 if (shouldOptimize(Function) && simplifyRODataLoads(Function)) 1208 Modified.insert(&Function); 1209 } 1210 1211 outs() << "BOLT-INFO: simplified " << NumLoadsSimplified << " out of " 1212 << NumLoadsFound << " loads from a statically computed address.\n" 1213 << "BOLT-INFO: dynamic loads simplified: " << NumDynamicLoadsSimplified 1214 << "\n" 1215 << "BOLT-INFO: dynamic loads found: " << NumDynamicLoadsFound << "\n"; 1216 } 1217 1218 void AssignSections::runOnFunctions(BinaryContext &BC) { 1219 for (BinaryFunction *Function : BC.getInjectedBinaryFunctions()) { 1220 Function->setCodeSectionName(BC.getInjectedCodeSectionName()); 1221 Function->setColdCodeSectionName(BC.getInjectedColdCodeSectionName()); 1222 } 1223 1224 // In non-relocation mode functions have pre-assigned section names. 
1225 if (!BC.HasRelocations) 1226 return; 1227 1228 const bool UseColdSection = 1229 BC.NumProfiledFuncs > 0 || 1230 opts::ReorderFunctions == ReorderFunctions::RT_USER; 1231 for (auto &BFI : BC.getBinaryFunctions()) { 1232 BinaryFunction &Function = BFI.second; 1233 if (opts::isHotTextMover(Function)) { 1234 Function.setCodeSectionName(BC.getHotTextMoverSectionName()); 1235 Function.setColdCodeSectionName(BC.getHotTextMoverSectionName()); 1236 continue; 1237 } 1238 1239 if (!UseColdSection || Function.hasValidIndex() || 1240 Function.hasValidProfile()) 1241 Function.setCodeSectionName(BC.getMainCodeSectionName()); 1242 else 1243 Function.setCodeSectionName(BC.getColdCodeSectionName()); 1244 1245 if (Function.isSplit()) 1246 Function.setColdCodeSectionName(BC.getColdCodeSectionName()); 1247 } 1248 } 1249 1250 void PrintProfileStats::runOnFunctions(BinaryContext &BC) { 1251 double FlowImbalanceMean = 0.0; 1252 size_t NumBlocksConsidered = 0; 1253 double WorstBias = 0.0; 1254 const BinaryFunction *WorstBiasFunc = nullptr; 1255 1256 // For each function CFG, we fill an IncomingMap with the sum of the frequency 1257 // of incoming edges for each BB. Likewise for each OutgoingMap and the sum 1258 // of the frequency of outgoing edges. 
1259 using FlowMapTy = std::unordered_map<const BinaryBasicBlock *, uint64_t>; 1260 std::unordered_map<const BinaryFunction *, FlowMapTy> TotalIncomingMaps; 1261 std::unordered_map<const BinaryFunction *, FlowMapTy> TotalOutgoingMaps; 1262 1263 // Compute mean 1264 for (const auto &BFI : BC.getBinaryFunctions()) { 1265 const BinaryFunction &Function = BFI.second; 1266 if (Function.empty() || !Function.isSimple()) 1267 continue; 1268 FlowMapTy &IncomingMap = TotalIncomingMaps[&Function]; 1269 FlowMapTy &OutgoingMap = TotalOutgoingMaps[&Function]; 1270 for (const BinaryBasicBlock &BB : Function) { 1271 uint64_t TotalOutgoing = 0ULL; 1272 auto SuccBIIter = BB.branch_info_begin(); 1273 for (BinaryBasicBlock *Succ : BB.successors()) { 1274 uint64_t Count = SuccBIIter->Count; 1275 if (Count == BinaryBasicBlock::COUNT_NO_PROFILE || Count == 0) { 1276 ++SuccBIIter; 1277 continue; 1278 } 1279 TotalOutgoing += Count; 1280 IncomingMap[Succ] += Count; 1281 ++SuccBIIter; 1282 } 1283 OutgoingMap[&BB] = TotalOutgoing; 1284 } 1285 1286 size_t NumBlocks = 0; 1287 double Mean = 0.0; 1288 for (const BinaryBasicBlock &BB : Function) { 1289 // Do not compute score for low frequency blocks, entry or exit blocks 1290 if (IncomingMap[&BB] < 100 || OutgoingMap[&BB] == 0 || BB.isEntryPoint()) 1291 continue; 1292 ++NumBlocks; 1293 const double Difference = (double)OutgoingMap[&BB] - IncomingMap[&BB]; 1294 Mean += fabs(Difference / IncomingMap[&BB]); 1295 } 1296 1297 FlowImbalanceMean += Mean; 1298 NumBlocksConsidered += NumBlocks; 1299 if (!NumBlocks) 1300 continue; 1301 double FuncMean = Mean / NumBlocks; 1302 if (FuncMean > WorstBias) { 1303 WorstBias = FuncMean; 1304 WorstBiasFunc = &Function; 1305 } 1306 } 1307 if (NumBlocksConsidered > 0) 1308 FlowImbalanceMean /= NumBlocksConsidered; 1309 1310 // Compute standard deviation 1311 NumBlocksConsidered = 0; 1312 double FlowImbalanceVar = 0.0; 1313 for (const auto &BFI : BC.getBinaryFunctions()) { 1314 const BinaryFunction &Function = 
BFI.second; 1315 if (Function.empty() || !Function.isSimple()) 1316 continue; 1317 FlowMapTy &IncomingMap = TotalIncomingMaps[&Function]; 1318 FlowMapTy &OutgoingMap = TotalOutgoingMaps[&Function]; 1319 for (const BinaryBasicBlock &BB : Function) { 1320 if (IncomingMap[&BB] < 100 || OutgoingMap[&BB] == 0) 1321 continue; 1322 ++NumBlocksConsidered; 1323 const double Difference = (double)OutgoingMap[&BB] - IncomingMap[&BB]; 1324 FlowImbalanceVar += 1325 pow(fabs(Difference / IncomingMap[&BB]) - FlowImbalanceMean, 2); 1326 } 1327 } 1328 if (NumBlocksConsidered) { 1329 FlowImbalanceVar /= NumBlocksConsidered; 1330 FlowImbalanceVar = sqrt(FlowImbalanceVar); 1331 } 1332 1333 // Report to user 1334 outs() << format("BOLT-INFO: Profile bias score: %.4lf%% StDev: %.4lf%%\n", 1335 (100.0 * FlowImbalanceMean), (100.0 * FlowImbalanceVar)); 1336 if (WorstBiasFunc && opts::Verbosity >= 1) { 1337 outs() << "Worst average bias observed in " << WorstBiasFunc->getPrintName() 1338 << "\n"; 1339 LLVM_DEBUG(WorstBiasFunc->dump()); 1340 } 1341 } 1342 1343 void PrintProgramStats::runOnFunctions(BinaryContext &BC) { 1344 uint64_t NumRegularFunctions = 0; 1345 uint64_t NumStaleProfileFunctions = 0; 1346 uint64_t NumNonSimpleProfiledFunctions = 0; 1347 uint64_t NumUnknownControlFlowFunctions = 0; 1348 uint64_t TotalSampleCount = 0; 1349 uint64_t StaleSampleCount = 0; 1350 std::vector<const BinaryFunction *> ProfiledFunctions; 1351 const char *StaleFuncsHeader = "BOLT-INFO: Functions with stale profile:\n"; 1352 for (auto &BFI : BC.getBinaryFunctions()) { 1353 const BinaryFunction &Function = BFI.second; 1354 1355 // Ignore PLT functions for stats. 
1356 if (Function.isPLTFunction()) 1357 continue; 1358 1359 ++NumRegularFunctions; 1360 1361 if (!Function.isSimple()) { 1362 if (Function.hasProfile()) 1363 ++NumNonSimpleProfiledFunctions; 1364 continue; 1365 } 1366 1367 if (Function.hasUnknownControlFlow()) { 1368 if (opts::PrintUnknownCFG) 1369 Function.dump(); 1370 else if (opts::PrintUnknown) 1371 errs() << "function with unknown control flow: " << Function << '\n'; 1372 1373 ++NumUnknownControlFlowFunctions; 1374 } 1375 1376 if (!Function.hasProfile()) 1377 continue; 1378 1379 uint64_t SampleCount = Function.getRawBranchCount(); 1380 TotalSampleCount += SampleCount; 1381 1382 if (Function.hasValidProfile()) { 1383 ProfiledFunctions.push_back(&Function); 1384 } else { 1385 if (opts::ReportStaleFuncs) { 1386 outs() << StaleFuncsHeader; 1387 StaleFuncsHeader = ""; 1388 outs() << " " << Function << '\n'; 1389 } 1390 ++NumStaleProfileFunctions; 1391 StaleSampleCount += SampleCount; 1392 } 1393 } 1394 BC.NumProfiledFuncs = ProfiledFunctions.size(); 1395 1396 const size_t NumAllProfiledFunctions = 1397 ProfiledFunctions.size() + NumStaleProfileFunctions; 1398 outs() << "BOLT-INFO: " << NumAllProfiledFunctions << " out of " 1399 << NumRegularFunctions << " functions in the binary (" 1400 << format("%.1f", NumAllProfiledFunctions / 1401 (float)NumRegularFunctions * 100.0f) 1402 << "%) have non-empty execution profile\n"; 1403 if (NumNonSimpleProfiledFunctions) { 1404 outs() << "BOLT-INFO: " << NumNonSimpleProfiledFunctions << " function" 1405 << (NumNonSimpleProfiledFunctions == 1 ? 
"" : "s") 1406 << " with profile could not be optimized\n"; 1407 } 1408 if (NumStaleProfileFunctions) { 1409 const float PctStale = 1410 NumStaleProfileFunctions / (float)NumAllProfiledFunctions * 100.0f; 1411 auto printErrorOrWarning = [&]() { 1412 if (PctStale > opts::StaleThreshold) 1413 errs() << "BOLT-ERROR: "; 1414 else 1415 errs() << "BOLT-WARNING: "; 1416 }; 1417 printErrorOrWarning(); 1418 errs() << NumStaleProfileFunctions 1419 << format(" (%.1f%% of all profiled)", PctStale) << " function" 1420 << (NumStaleProfileFunctions == 1 ? "" : "s") 1421 << " have invalid (possibly stale) profile." 1422 " Use -report-stale to see the list.\n"; 1423 if (TotalSampleCount > 0) { 1424 printErrorOrWarning(); 1425 errs() << StaleSampleCount << " out of " << TotalSampleCount 1426 << " samples in the binary (" 1427 << format("%.1f", ((100.0f * StaleSampleCount) / TotalSampleCount)) 1428 << "%) belong to functions with invalid" 1429 " (possibly stale) profile.\n"; 1430 } 1431 if (PctStale > opts::StaleThreshold) { 1432 errs() << "BOLT-ERROR: stale functions exceed specified threshold of " 1433 << opts::StaleThreshold << "%. 
Exiting.\n"; 1434 exit(1); 1435 } 1436 } 1437 1438 if (const uint64_t NumUnusedObjects = BC.getNumUnusedProfiledObjects()) { 1439 outs() << "BOLT-INFO: profile for " << NumUnusedObjects 1440 << " objects was ignored\n"; 1441 } 1442 1443 if (ProfiledFunctions.size() > 10) { 1444 if (opts::Verbosity >= 1) { 1445 outs() << "BOLT-INFO: top called functions are:\n"; 1446 std::sort(ProfiledFunctions.begin(), ProfiledFunctions.end(), 1447 [](const BinaryFunction *A, const BinaryFunction *B) { 1448 return B->getExecutionCount() < A->getExecutionCount(); 1449 }); 1450 auto SFI = ProfiledFunctions.begin(); 1451 auto SFIend = ProfiledFunctions.end(); 1452 for (unsigned I = 0u; I < opts::TopCalledLimit && SFI != SFIend; 1453 ++SFI, ++I) 1454 outs() << " " << **SFI << " : " << (*SFI)->getExecutionCount() << '\n'; 1455 } 1456 } 1457 1458 if (!opts::PrintSortedBy.empty() && 1459 std::find(opts::PrintSortedBy.begin(), opts::PrintSortedBy.end(), 1460 DynoStats::FIRST_DYNO_STAT) == opts::PrintSortedBy.end()) { 1461 1462 std::vector<const BinaryFunction *> Functions; 1463 std::map<const BinaryFunction *, DynoStats> Stats; 1464 1465 for (const auto &BFI : BC.getBinaryFunctions()) { 1466 const BinaryFunction &BF = BFI.second; 1467 if (shouldOptimize(BF) && BF.hasValidProfile()) { 1468 Functions.push_back(&BF); 1469 Stats.emplace(&BF, getDynoStats(BF)); 1470 } 1471 } 1472 1473 const bool SortAll = 1474 std::find(opts::PrintSortedBy.begin(), opts::PrintSortedBy.end(), 1475 DynoStats::LAST_DYNO_STAT) != opts::PrintSortedBy.end(); 1476 1477 const bool Ascending = 1478 opts::DynoStatsSortOrderOpt == opts::DynoStatsSortOrder::Ascending; 1479 1480 if (SortAll) { 1481 std::stable_sort(Functions.begin(), Functions.end(), 1482 [Ascending, &Stats](const BinaryFunction *A, 1483 const BinaryFunction *B) { 1484 return Ascending ? 
Stats.at(A) < Stats.at(B) 1485 : Stats.at(B) < Stats.at(A); 1486 }); 1487 } else { 1488 std::stable_sort( 1489 Functions.begin(), Functions.end(), 1490 [Ascending, &Stats](const BinaryFunction *A, 1491 const BinaryFunction *B) { 1492 const DynoStats &StatsA = Stats.at(A); 1493 const DynoStats &StatsB = Stats.at(B); 1494 return Ascending ? StatsA.lessThan(StatsB, opts::PrintSortedBy) 1495 : StatsB.lessThan(StatsA, opts::PrintSortedBy); 1496 }); 1497 } 1498 1499 outs() << "BOLT-INFO: top functions sorted by "; 1500 if (SortAll) { 1501 outs() << "dyno stats"; 1502 } else { 1503 outs() << "("; 1504 bool PrintComma = false; 1505 for (const DynoStats::Category Category : opts::PrintSortedBy) { 1506 if (PrintComma) 1507 outs() << ", "; 1508 outs() << DynoStats::Description(Category); 1509 PrintComma = true; 1510 } 1511 outs() << ")"; 1512 } 1513 1514 outs() << " are:\n"; 1515 auto SFI = Functions.begin(); 1516 for (unsigned I = 0; I < 100 && SFI != Functions.end(); ++SFI, ++I) { 1517 const DynoStats Stats = getDynoStats(**SFI); 1518 outs() << " " << **SFI; 1519 if (!SortAll) { 1520 outs() << " ("; 1521 bool PrintComma = false; 1522 for (const DynoStats::Category Category : opts::PrintSortedBy) { 1523 if (PrintComma) 1524 outs() << ", "; 1525 outs() << dynoStatsOptName(Category) << "=" << Stats[Category]; 1526 PrintComma = true; 1527 } 1528 outs() << ")"; 1529 } 1530 outs() << "\n"; 1531 } 1532 } 1533 1534 if (!BC.TrappedFunctions.empty()) { 1535 errs() << "BOLT-WARNING: " << BC.TrappedFunctions.size() << " function" 1536 << (BC.TrappedFunctions.size() > 1 ? "s" : "") 1537 << " will trap on entry. 
Use -trap-avx512=0 to disable" 1538 " traps."; 1539 if (opts::Verbosity >= 1 || BC.TrappedFunctions.size() <= 5) { 1540 errs() << '\n'; 1541 for (const BinaryFunction *Function : BC.TrappedFunctions) 1542 errs() << " " << *Function << '\n'; 1543 } else { 1544 errs() << " Use -v=1 to see the list.\n"; 1545 } 1546 } 1547 1548 // Print information on missed macro-fusion opportunities seen on input. 1549 if (BC.MissedMacroFusionPairs) { 1550 outs() << "BOLT-INFO: the input contains " << BC.MissedMacroFusionPairs 1551 << " (dynamic count : " << BC.MissedMacroFusionExecCount 1552 << ") opportunities for macro-fusion optimization"; 1553 switch (opts::AlignMacroOpFusion) { 1554 case MFT_NONE: 1555 outs() << ". Use -align-macro-fusion to fix.\n"; 1556 break; 1557 case MFT_HOT: 1558 outs() << ". Will fix instances on a hot path.\n"; 1559 break; 1560 case MFT_ALL: 1561 outs() << " that are going to be fixed\n"; 1562 break; 1563 } 1564 } 1565 1566 // Collect and print information about suboptimal code layout on input. 
1567 if (opts::ReportBadLayout) { 1568 std::vector<const BinaryFunction *> SuboptimalFuncs; 1569 for (auto &BFI : BC.getBinaryFunctions()) { 1570 const BinaryFunction &BF = BFI.second; 1571 if (!BF.hasValidProfile()) 1572 continue; 1573 1574 const uint64_t HotThreshold = 1575 std::max<uint64_t>(BF.getKnownExecutionCount(), 1); 1576 bool HotSeen = false; 1577 for (const BinaryBasicBlock *BB : BF.rlayout()) { 1578 if (!HotSeen && BB->getKnownExecutionCount() > HotThreshold) { 1579 HotSeen = true; 1580 continue; 1581 } 1582 if (HotSeen && BB->getKnownExecutionCount() == 0) { 1583 SuboptimalFuncs.push_back(&BF); 1584 break; 1585 } 1586 } 1587 } 1588 1589 if (!SuboptimalFuncs.empty()) { 1590 std::sort(SuboptimalFuncs.begin(), SuboptimalFuncs.end(), 1591 [](const BinaryFunction *A, const BinaryFunction *B) { 1592 return A->getKnownExecutionCount() / A->getSize() > 1593 B->getKnownExecutionCount() / B->getSize(); 1594 }); 1595 1596 outs() << "BOLT-INFO: " << SuboptimalFuncs.size() 1597 << " functions have " 1598 "cold code in the middle of hot code. Top functions are:\n"; 1599 for (unsigned I = 0; 1600 I < std::min(static_cast<size_t>(opts::ReportBadLayout), 1601 SuboptimalFuncs.size()); 1602 ++I) 1603 SuboptimalFuncs[I]->print(outs()); 1604 } 1605 } 1606 1607 if (NumUnknownControlFlowFunctions) { 1608 outs() << "BOLT-INFO: " << NumUnknownControlFlowFunctions 1609 << " functions have instructions with unknown control flow"; 1610 if (!opts::PrintUnknown) 1611 outs() << ". 
Use -print-unknown to see the list."; 1612 outs() << '\n'; 1613 } 1614 } 1615 1616 void InstructionLowering::runOnFunctions(BinaryContext &BC) { 1617 for (auto &BFI : BC.getBinaryFunctions()) 1618 for (BinaryBasicBlock &BB : BFI.second) 1619 for (MCInst &Instruction : BB) 1620 BC.MIB->lowerTailCall(Instruction); 1621 } 1622 1623 void StripRepRet::runOnFunctions(BinaryContext &BC) { 1624 uint64_t NumPrefixesRemoved = 0; 1625 uint64_t NumBytesSaved = 0; 1626 for (auto &BFI : BC.getBinaryFunctions()) { 1627 for (BinaryBasicBlock &BB : BFI.second) { 1628 auto LastInstRIter = BB.getLastNonPseudo(); 1629 if (LastInstRIter == BB.rend() || !BC.MIB->isReturn(*LastInstRIter) || 1630 !BC.MIB->deleteREPPrefix(*LastInstRIter)) 1631 continue; 1632 1633 NumPrefixesRemoved += BB.getKnownExecutionCount(); 1634 ++NumBytesSaved; 1635 } 1636 } 1637 1638 if (NumBytesSaved) 1639 outs() << "BOLT-INFO: removed " << NumBytesSaved 1640 << " 'repz' prefixes" 1641 " with estimated execution count of " 1642 << NumPrefixesRemoved << " times.\n"; 1643 } 1644 1645 void InlineMemcpy::runOnFunctions(BinaryContext &BC) { 1646 if (!BC.isX86()) 1647 return; 1648 1649 uint64_t NumInlined = 0; 1650 uint64_t NumInlinedDyno = 0; 1651 for (auto &BFI : BC.getBinaryFunctions()) { 1652 for (BinaryBasicBlock &BB : BFI.second) { 1653 for (auto II = BB.begin(); II != BB.end(); ++II) { 1654 MCInst &Inst = *II; 1655 1656 if (!BC.MIB->isCall(Inst) || MCPlus::getNumPrimeOperands(Inst) != 1 || 1657 !Inst.getOperand(0).isExpr()) 1658 continue; 1659 1660 const MCSymbol *CalleeSymbol = BC.MIB->getTargetSymbol(Inst); 1661 if (CalleeSymbol->getName() != "memcpy" && 1662 CalleeSymbol->getName() != "memcpy@PLT" && 1663 CalleeSymbol->getName() != "_memcpy8") 1664 continue; 1665 1666 const bool IsMemcpy8 = (CalleeSymbol->getName() == "_memcpy8"); 1667 const bool IsTailCall = BC.MIB->isTailCall(Inst); 1668 1669 const InstructionListType NewCode = 1670 BC.MIB->createInlineMemcpy(IsMemcpy8); 1671 II = BB.replaceInstruction(II, 
NewCode); 1672 std::advance(II, NewCode.size() - 1); 1673 if (IsTailCall) { 1674 MCInst Return; 1675 BC.MIB->createReturn(Return); 1676 II = BB.insertInstruction(std::next(II), std::move(Return)); 1677 } 1678 1679 ++NumInlined; 1680 NumInlinedDyno += BB.getKnownExecutionCount(); 1681 } 1682 } 1683 } 1684 1685 if (NumInlined) { 1686 outs() << "BOLT-INFO: inlined " << NumInlined << " memcpy() calls"; 1687 if (NumInlinedDyno) 1688 outs() << ". The calls were executed " << NumInlinedDyno 1689 << " times based on profile."; 1690 outs() << '\n'; 1691 } 1692 } 1693 1694 bool SpecializeMemcpy1::shouldOptimize(const BinaryFunction &Function) const { 1695 if (!BinaryFunctionPass::shouldOptimize(Function)) 1696 return false; 1697 1698 for (const std::string &FunctionSpec : Spec) { 1699 StringRef FunctionName = StringRef(FunctionSpec).split(':').first; 1700 if (Function.hasNameRegex(FunctionName)) 1701 return true; 1702 } 1703 1704 return false; 1705 } 1706 1707 std::set<size_t> SpecializeMemcpy1::getCallSitesToOptimize( 1708 const BinaryFunction &Function) const { 1709 StringRef SitesString; 1710 for (const std::string &FunctionSpec : Spec) { 1711 StringRef FunctionName; 1712 std::tie(FunctionName, SitesString) = StringRef(FunctionSpec).split(':'); 1713 if (Function.hasNameRegex(FunctionName)) 1714 break; 1715 SitesString = ""; 1716 } 1717 1718 std::set<size_t> Sites; 1719 SmallVector<StringRef, 4> SitesVec; 1720 SitesString.split(SitesVec, ':'); 1721 for (StringRef SiteString : SitesVec) { 1722 if (SiteString.empty()) 1723 continue; 1724 size_t Result; 1725 if (!SiteString.getAsInteger(10, Result)) 1726 Sites.emplace(Result); 1727 } 1728 1729 return Sites; 1730 } 1731 1732 void SpecializeMemcpy1::runOnFunctions(BinaryContext &BC) { 1733 if (!BC.isX86()) 1734 return; 1735 1736 uint64_t NumSpecialized = 0; 1737 uint64_t NumSpecializedDyno = 0; 1738 for (auto &BFI : BC.getBinaryFunctions()) { 1739 BinaryFunction &Function = BFI.second; 1740 if (!shouldOptimize(Function)) 1741 
continue; 1742 1743 std::set<size_t> CallsToOptimize = getCallSitesToOptimize(Function); 1744 auto shouldOptimize = [&](size_t N) { 1745 return CallsToOptimize.empty() || CallsToOptimize.count(N); 1746 }; 1747 1748 std::vector<BinaryBasicBlock *> Blocks(Function.pbegin(), Function.pend()); 1749 size_t CallSiteID = 0; 1750 for (BinaryBasicBlock *CurBB : Blocks) { 1751 for (auto II = CurBB->begin(); II != CurBB->end(); ++II) { 1752 MCInst &Inst = *II; 1753 1754 if (!BC.MIB->isCall(Inst) || MCPlus::getNumPrimeOperands(Inst) != 1 || 1755 !Inst.getOperand(0).isExpr()) 1756 continue; 1757 1758 const MCSymbol *CalleeSymbol = BC.MIB->getTargetSymbol(Inst); 1759 if (CalleeSymbol->getName() != "memcpy" && 1760 CalleeSymbol->getName() != "memcpy@PLT") 1761 continue; 1762 1763 if (BC.MIB->isTailCall(Inst)) 1764 continue; 1765 1766 ++CallSiteID; 1767 1768 if (!shouldOptimize(CallSiteID)) 1769 continue; 1770 1771 // Create a copy of a call to memcpy(dest, src, size). 1772 MCInst MemcpyInstr = Inst; 1773 1774 BinaryBasicBlock *OneByteMemcpyBB = CurBB->splitAt(II); 1775 1776 BinaryBasicBlock *NextBB = nullptr; 1777 if (OneByteMemcpyBB->getNumNonPseudos() > 1) { 1778 NextBB = OneByteMemcpyBB->splitAt(OneByteMemcpyBB->begin()); 1779 NextBB->eraseInstruction(NextBB->begin()); 1780 } else { 1781 NextBB = OneByteMemcpyBB->getSuccessor(); 1782 OneByteMemcpyBB->eraseInstruction(OneByteMemcpyBB->begin()); 1783 assert(NextBB && "unexpected call to memcpy() with no return"); 1784 } 1785 1786 BinaryBasicBlock *MemcpyBB = 1787 Function.addBasicBlock(CurBB->getInputOffset()); 1788 InstructionListType CmpJCC = 1789 BC.MIB->createCmpJE(BC.MIB->getIntArgRegister(2), 1, 1790 OneByteMemcpyBB->getLabel(), BC.Ctx.get()); 1791 CurBB->addInstructions(CmpJCC); 1792 CurBB->addSuccessor(MemcpyBB); 1793 1794 MemcpyBB->addInstruction(std::move(MemcpyInstr)); 1795 MemcpyBB->addSuccessor(NextBB); 1796 MemcpyBB->setCFIState(NextBB->getCFIState()); 1797 MemcpyBB->setExecutionCount(0); 1798 1799 // To prevent 
the actual call from being moved to cold, we set its 1800 // execution count to 1. 1801 if (CurBB->getKnownExecutionCount() > 0) 1802 MemcpyBB->setExecutionCount(1); 1803 1804 InstructionListType OneByteMemcpy = BC.MIB->createOneByteMemcpy(); 1805 OneByteMemcpyBB->addInstructions(OneByteMemcpy); 1806 1807 ++NumSpecialized; 1808 NumSpecializedDyno += CurBB->getKnownExecutionCount(); 1809 1810 CurBB = NextBB; 1811 1812 // Note: we don't expect the next instruction to be a call to memcpy. 1813 II = CurBB->begin(); 1814 } 1815 } 1816 } 1817 1818 if (NumSpecialized) { 1819 outs() << "BOLT-INFO: specialized " << NumSpecialized 1820 << " memcpy() call sites for size 1"; 1821 if (NumSpecializedDyno) 1822 outs() << ". The calls were executed " << NumSpecializedDyno 1823 << " times based on profile."; 1824 outs() << '\n'; 1825 } 1826 } 1827 1828 void RemoveNops::runOnFunction(BinaryFunction &BF) { 1829 const BinaryContext &BC = BF.getBinaryContext(); 1830 for (BinaryBasicBlock &BB : BF) { 1831 for (int64_t I = BB.size() - 1; I >= 0; --I) { 1832 MCInst &Inst = BB.getInstructionAtIndex(I); 1833 if (BC.MIB->isNoop(Inst) && BC.MIB->hasAnnotation(Inst, "NOP")) 1834 BB.eraseInstructionAtIndex(I); 1835 } 1836 } 1837 } 1838 1839 void RemoveNops::runOnFunctions(BinaryContext &BC) { 1840 ParallelUtilities::WorkFuncTy WorkFun = [&](BinaryFunction &BF) { 1841 runOnFunction(BF); 1842 }; 1843 1844 ParallelUtilities::PredicateTy SkipFunc = [&](const BinaryFunction &BF) { 1845 return BF.shouldPreserveNops(); 1846 }; 1847 1848 ParallelUtilities::runOnEachFunction( 1849 BC, ParallelUtilities::SchedulingPolicy::SP_INST_LINEAR, WorkFun, 1850 SkipFunc, "RemoveNops"); 1851 } 1852 1853 } // namespace bolt 1854 } // namespace llvm 1855