1 //===- bolt/Passes/BinaryPasses.cpp - Binary-level passes -----------------===// 2 // 3 // Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. 4 // See https://llvm.org/LICENSE.txt for license information. 5 // SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception 6 // 7 //===----------------------------------------------------------------------===// 8 // 9 // This file implements multiple passes for binary optimization and analysis. 10 // 11 //===----------------------------------------------------------------------===// 12 13 #include "bolt/Passes/BinaryPasses.h" 14 #include "bolt/Core/ParallelUtilities.h" 15 #include "bolt/Passes/ReorderAlgorithm.h" 16 #include "bolt/Passes/ReorderFunctions.h" 17 #include "llvm/Support/CommandLine.h" 18 19 #include <numeric> 20 #include <vector> 21 22 #define DEBUG_TYPE "bolt-opts" 23 24 using namespace llvm; 25 using namespace bolt; 26 27 namespace { 28 29 const char *dynoStatsOptName(const bolt::DynoStats::Category C) { 30 if (C == bolt::DynoStats::FIRST_DYNO_STAT) 31 return "none"; 32 else if (C == bolt::DynoStats::LAST_DYNO_STAT) 33 return "all"; 34 35 static std::string OptNames[bolt::DynoStats::LAST_DYNO_STAT + 1]; 36 37 OptNames[C] = bolt::DynoStats::Description(C); 38 39 std::replace(OptNames[C].begin(), OptNames[C].end(), ' ', '-'); 40 41 return OptNames[C].c_str(); 42 } 43 44 const char *dynoStatsOptDesc(const bolt::DynoStats::Category C) { 45 if (C == bolt::DynoStats::FIRST_DYNO_STAT) 46 return "unsorted"; 47 else if (C == bolt::DynoStats::LAST_DYNO_STAT) 48 return "sorted by all stats"; 49 50 return bolt::DynoStats::Description(C); 51 } 52 53 } 54 55 namespace opts { 56 57 extern cl::OptionCategory BoltCategory; 58 extern cl::OptionCategory BoltOptCategory; 59 60 extern cl::opt<bolt::MacroFusionType> AlignMacroOpFusion; 61 extern cl::opt<unsigned> Verbosity; 62 extern cl::opt<bool> EnableBAT; 63 extern cl::opt<unsigned> ExecutionCountThreshold; 64 extern cl::opt<bool> UpdateDebugSections; 65 extern cl::opt<bolt::ReorderFunctions::ReorderType> ReorderFunctions; 66 67 enum DynoStatsSortOrder : char { 68 Ascending, 69 Descending 70 }; 71 72 static cl::opt<DynoStatsSortOrder> 73 DynoStatsSortOrderOpt("print-sorted-by-order", 74 cl::desc("use ascending or descending order when printing functions " 75 "ordered by dyno stats"), 76 cl::ZeroOrMore, 77 cl::init(DynoStatsSortOrder::Descending), 78 cl::cat(BoltOptCategory)); 79 80 cl::list<std::string> 81 HotTextMoveSections("hot-text-move-sections", 82 cl::desc("list of sections containing functions used for hugifying hot text. " 83 "BOLT makes sure these functions are not placed on the same page as " 84 "the hot text. (default=\'.stub,.mover\')."), 85 cl::value_desc("sec1,sec2,sec3,..."), 86 cl::CommaSeparated, 87 cl::ZeroOrMore, 88 cl::cat(BoltCategory)); 89 90 bool isHotTextMover(const BinaryFunction &Function) { 91 for (std::string &SectionName : opts::HotTextMoveSections) { 92 if (Function.getOriginSectionName() && 93 *Function.getOriginSectionName() == SectionName) 94 return true; 95 } 96 97 return false; 98 } 99 100 static cl::opt<bool> 101 MinBranchClusters("min-branch-clusters", 102 cl::desc("use a modified clustering algorithm geared towards minimizing " 103 "branches"), 104 cl::ZeroOrMore, 105 cl::Hidden, 106 cl::cat(BoltOptCategory)); 107 108 static cl::list<Peepholes::PeepholeOpts> Peepholes( 109 "peepholes", cl::CommaSeparated, cl::desc("enable peephole optimizations"), 110 cl::value_desc("opt1,opt2,opt3,..."), 111 cl::values(clEnumValN(Peepholes::PEEP_NONE, "none", "disable peepholes"), 112 clEnumValN(Peepholes::PEEP_DOUBLE_JUMPS, "double-jumps", 113 "remove double jumps when able"), 114 clEnumValN(Peepholes::PEEP_TAILCALL_TRAPS, "tailcall-traps", 115 "insert tail call traps"), 116 clEnumValN(Peepholes::PEEP_USELESS_BRANCHES, "useless-branches", 117 "remove useless conditional branches"), 118 clEnumValN(Peepholes::PEEP_ALL, "all", 119 "enable all peephole optimizations")), 120 cl::ZeroOrMore, cl::cat(BoltOptCategory)); 121 122 static cl::opt<unsigned> 123 PrintFuncStat("print-function-statistics", 124 cl::desc("print statistics about basic block ordering"), 125 cl::init(0), 126 cl::ZeroOrMore, 127 cl::cat(BoltOptCategory)); 128 129 static cl::list<bolt::DynoStats::Category> 130 PrintSortedBy("print-sorted-by", 131 cl::CommaSeparated, 132 cl::desc("print functions sorted by order of dyno stats"), 133 cl::value_desc("key1,key2,key3,..."), 134 cl::values( 135 #define D(name, ...) \ 136 clEnumValN(bolt::DynoStats::name, \ 137 dynoStatsOptName(bolt::DynoStats::name), \ 138 dynoStatsOptDesc(bolt::DynoStats::name)), 139 DYNO_STATS 140 #undef D 141 clEnumValN(0xffff, ".", ".") 142 ), 143 cl::ZeroOrMore, 144 cl::cat(BoltOptCategory)); 145 146 static cl::opt<bool> 147 PrintUnknown("print-unknown", 148 cl::desc("print names of functions with unknown control flow"), 149 cl::init(false), 150 cl::ZeroOrMore, 151 cl::cat(BoltCategory), 152 cl::Hidden); 153 154 static cl::opt<bool> 155 PrintUnknownCFG("print-unknown-cfg", 156 cl::desc("dump CFG of functions with unknown control flow"), 157 cl::init(false), 158 cl::ZeroOrMore, 159 cl::cat(BoltCategory), 160 cl::ReallyHidden); 161 162 cl::opt<bolt::ReorderBasicBlocks::LayoutType> ReorderBlocks( 163 "reorder-blocks", cl::desc("change layout of basic blocks in a function"), 164 cl::init(bolt::ReorderBasicBlocks::LT_NONE), 165 cl::values( 166 clEnumValN(bolt::ReorderBasicBlocks::LT_NONE, "none", 167 "do not reorder basic blocks"), 168 clEnumValN(bolt::ReorderBasicBlocks::LT_REVERSE, "reverse", 169 "layout blocks in reverse order"), 170 clEnumValN(bolt::ReorderBasicBlocks::LT_OPTIMIZE, "normal", 171 "perform optimal layout based on profile"), 172 clEnumValN(bolt::ReorderBasicBlocks::LT_OPTIMIZE_BRANCH, 173 "branch-predictor", 174 "perform optimal layout prioritizing branch " 175 "predictions"), 176 clEnumValN(bolt::ReorderBasicBlocks::LT_OPTIMIZE_CACHE, "cache", 177 "perform optimal layout prioritizing I-cache " 178 "behavior"), 179 clEnumValN(bolt::ReorderBasicBlocks::LT_OPTIMIZE_EXT_TSP, "cache+", 180 "perform layout optimizing I-cache behavior"), 181 clEnumValN(bolt::ReorderBasicBlocks::LT_OPTIMIZE_EXT_TSP, "ext-tsp", 182 "perform layout optimizing I-cache behavior"), 183 clEnumValN(bolt::ReorderBasicBlocks::LT_OPTIMIZE_SHUFFLE, 184 "cluster-shuffle", "perform random layout of clusters")), 185 cl::ZeroOrMore, cl::cat(BoltOptCategory)); 186 187 static cl::opt<unsigned> 188 ReportBadLayout("report-bad-layout", 189 cl::desc("print top <uint> functions with suboptimal code layout on input"), 190 cl::init(0), 191 cl::ZeroOrMore, 192 cl::Hidden, 193 cl::cat(BoltOptCategory)); 194 195 static cl::opt<bool> 196 ReportStaleFuncs("report-stale", 197 cl::desc("print the list of functions with stale profile"), 198 cl::init(false), 199 cl::ZeroOrMore, 200 cl::Hidden, 201 cl::cat(BoltOptCategory)); 202 203 enum SctcModes : char { 204 SctcAlways, 205 SctcPreserveDirection, 206 SctcHeuristic 207 }; 208 209 static cl::opt<SctcModes> 210 SctcMode("sctc-mode", 211 cl::desc("mode for simplify conditional tail calls"), 212 cl::init(SctcAlways), 213 cl::values(clEnumValN(SctcAlways, "always", "always perform sctc"), 214 clEnumValN(SctcPreserveDirection, 215 "preserve", 216 "only perform sctc when branch direction is " 217 "preserved"), 218 clEnumValN(SctcHeuristic, 219 "heuristic", 220 "use branch prediction data to control sctc")), 221 cl::ZeroOrMore, 222 cl::cat(BoltOptCategory)); 223 224 static cl::opt<unsigned> 225 StaleThreshold("stale-threshold", 226 cl::desc( 227 "maximum percentage of stale functions to tolerate (default: 100)"), 228 cl::init(100), 229 cl::Hidden, 230 cl::cat(BoltOptCategory)); 231 232 static cl::opt<unsigned> 233 TSPThreshold("tsp-threshold", 234 cl::desc("maximum number of hot basic blocks in a function for which to use " 235 "a precise TSP solution while re-ordering basic blocks"), 236 cl::init(10), 237 cl::ZeroOrMore, 238 cl::Hidden, 239 cl::cat(BoltOptCategory)); 240 241 static cl::opt<unsigned> 242 TopCalledLimit("top-called-limit", 243 cl::desc("maximum number of functions to print in top called " 244 "functions section"), 245 cl::init(100), 246 cl::ZeroOrMore, 247 cl::Hidden, 248 cl::cat(BoltCategory)); 249 250 } // namespace opts 251 252 namespace llvm { 253 namespace bolt { 254 255 bool BinaryFunctionPass::shouldOptimize(const BinaryFunction &BF) const { 256 return BF.isSimple() && BF.getState() == BinaryFunction::State::CFG && 257 !BF.isIgnored(); 258 } 259 260 bool BinaryFunctionPass::shouldPrint(const BinaryFunction &BF) const { 261 return BF.isSimple() && !BF.isIgnored(); 262 } 263 264 void NormalizeCFG::runOnFunction(BinaryFunction &BF) { 265 uint64_t NumRemoved = 0; 266 uint64_t NumDuplicateEdges = 0; 267 uint64_t NeedsFixBranches = 0; 268 for (BinaryBasicBlock &BB : BF) { 269 if (!BB.empty()) 270 continue; 271 272 if (BB.isEntryPoint() || BB.isLandingPad()) 273 continue; 274 275 // Handle a dangling empty block. 276 if (BB.succ_size() == 0) { 277 // If an empty dangling basic block has a predecessor, it could be a 278 // result of codegen for __builtin_unreachable. In such case, do not 279 // remove the block. 280 if (BB.pred_size() == 0) { 281 BB.markValid(false); 282 ++NumRemoved; 283 } 284 continue; 285 } 286 287 // The block should have just one successor. 288 BinaryBasicBlock *Successor = BB.getSuccessor(); 289 assert(Successor && "invalid CFG encountered"); 290 291 // Redirect all predecessors to the successor block. 292 while (!BB.pred_empty()) { 293 BinaryBasicBlock *Predecessor = *BB.pred_begin(); 294 if (Predecessor->hasJumpTable()) 295 break; 296 297 if (Predecessor == Successor) 298 break; 299 300 BinaryBasicBlock::BinaryBranchInfo &BI = Predecessor->getBranchInfo(BB); 301 Predecessor->replaceSuccessor(&BB, Successor, BI.Count, 302 BI.MispredictedCount); 303 // We need to fix branches even if we failed to replace all successors 304 // and remove the block. 305 NeedsFixBranches = true; 306 } 307 308 if (BB.pred_empty()) { 309 BB.removeAllSuccessors(); 310 BB.markValid(false); 311 ++NumRemoved; 312 } 313 } 314 315 if (NumRemoved) 316 BF.eraseInvalidBBs(); 317 318 // Check for duplicate successors. Do it after the empty block elimination as 319 // we can get more duplicate successors. 320 for (BinaryBasicBlock &BB : BF) 321 if (!BB.hasJumpTable() && BB.succ_size() == 2 && 322 BB.getConditionalSuccessor(false) == BB.getConditionalSuccessor(true)) 323 ++NumDuplicateEdges; 324 325 // fixBranches() will get rid of duplicate edges and update jump instructions. 326 if (NumDuplicateEdges || NeedsFixBranches) 327 BF.fixBranches(); 328 329 NumDuplicateEdgesMerged += NumDuplicateEdges; 330 NumBlocksRemoved += NumRemoved; 331 } 332 333 void NormalizeCFG::runOnFunctions(BinaryContext &BC) { 334 ParallelUtilities::runOnEachFunction( 335 BC, ParallelUtilities::SchedulingPolicy::SP_BB_LINEAR, 336 [&](BinaryFunction &BF) { runOnFunction(BF); }, 337 [&](const BinaryFunction &BF) { return !shouldOptimize(BF); }, 338 "NormalizeCFG"); 339 if (NumBlocksRemoved) 340 outs() << "BOLT-INFO: removed " << NumBlocksRemoved << " empty block" 341 << (NumBlocksRemoved == 1 ? "" : "s") << '\n'; 342 if (NumDuplicateEdgesMerged) 343 outs() << "BOLT-INFO: merged " << NumDuplicateEdgesMerged 344 << " duplicate CFG edge" << (NumDuplicateEdgesMerged == 1 ? "" : "s") 345 << '\n'; 346 } 347 348 void EliminateUnreachableBlocks::runOnFunction(BinaryFunction &Function) { 349 if (Function.layout_size() > 0) { 350 unsigned Count; 351 uint64_t Bytes; 352 Function.markUnreachableBlocks(); 353 LLVM_DEBUG({ 354 for (BinaryBasicBlock *BB : Function.layout()) { 355 if (!BB->isValid()) { 356 dbgs() << "BOLT-INFO: UCE found unreachable block " << BB->getName() 357 << " in function " << Function << "\n"; 358 Function.dump(); 359 } 360 } 361 }); 362 std::tie(Count, Bytes) = Function.eraseInvalidBBs(); 363 DeletedBlocks += Count; 364 DeletedBytes += Bytes; 365 if (Count) { 366 Modified.insert(&Function); 367 if (opts::Verbosity > 0) 368 outs() << "BOLT-INFO: Removed " << Count 369 << " dead basic block(s) accounting for " << Bytes 370 << " bytes in function " << Function << '\n'; 371 } 372 } 373 } 374 375 void EliminateUnreachableBlocks::runOnFunctions(BinaryContext &BC) { 376 for (auto &It : BC.getBinaryFunctions()) { 377 BinaryFunction &Function = It.second; 378 if (shouldOptimize(Function)) 379 runOnFunction(Function); 380 } 381 382 outs() << "BOLT-INFO: UCE removed " << DeletedBlocks << " blocks and " 383 << DeletedBytes << " bytes of code.\n"; 384 } 385 386 bool ReorderBasicBlocks::shouldPrint(const BinaryFunction &BF) const { 387 return (BinaryFunctionPass::shouldPrint(BF) && 388 opts::ReorderBlocks != ReorderBasicBlocks::LT_NONE); 389 } 390 391 bool ReorderBasicBlocks::shouldOptimize(const BinaryFunction &BF) const { 392 // Apply execution count threshold 393 if (BF.getKnownExecutionCount() < opts::ExecutionCountThreshold) 394 return false; 395 396 return BinaryFunctionPass::shouldOptimize(BF); 397 } 398 399 void ReorderBasicBlocks::runOnFunctions(BinaryContext &BC) { 400 if (opts::ReorderBlocks == ReorderBasicBlocks::LT_NONE) 401 return; 402 403 std::atomic<uint64_t> ModifiedFuncCount{0}; 404 405 ParallelUtilities::WorkFuncTy WorkFun = [&](BinaryFunction &BF) { 406 modifyFunctionLayout(BF, opts::ReorderBlocks, opts::MinBranchClusters); 407 if (BF.hasLayoutChanged()) 408 ++ModifiedFuncCount; 409 }; 410 411 ParallelUtilities::PredicateTy SkipFunc = [&](const BinaryFunction &BF) { 412 return !shouldOptimize(BF); 413 }; 414 415 ParallelUtilities::runOnEachFunction( 416 BC, ParallelUtilities::SchedulingPolicy::SP_BB_LINEAR, WorkFun, SkipFunc, 417 "ReorderBasicBlocks"); 418 419 outs() << "BOLT-INFO: basic block reordering modified layout of " 420 << format("%zu (%.2lf%%) functions\n", ModifiedFuncCount.load(), 421 100.0 * ModifiedFuncCount.load() / 422 BC.getBinaryFunctions().size()); 423 424 if (opts::PrintFuncStat > 0) { 425 raw_ostream &OS = outs(); 426 // Copy all the values into vector in order to sort them 427 std::map<uint64_t, BinaryFunction &> ScoreMap; 428 auto &BFs = BC.getBinaryFunctions(); 429 for (auto It = BFs.begin(); It != BFs.end(); ++It) 430 ScoreMap.insert(std::pair<uint64_t, BinaryFunction &>( 431 It->second.getFunctionScore(), It->second)); 432 433 OS << "\nBOLT-INFO: Printing Function Statistics:\n\n"; 434 OS << " There are " << BFs.size() << " functions in total. \n"; 435 OS << " Number of functions being modified: " 436 << ModifiedFuncCount.load() << "\n"; 437 OS << " User asks for detailed information on top " 438 << opts::PrintFuncStat << " functions. (Ranked by function score)" 439 << "\n\n"; 440 uint64_t I = 0; 441 for (std::map<uint64_t, BinaryFunction &>::reverse_iterator Rit = 442 ScoreMap.rbegin(); 443 Rit != ScoreMap.rend() && I < opts::PrintFuncStat; ++Rit, ++I) { 444 BinaryFunction &Function = Rit->second; 445 446 OS << " Information for function of top: " << (I + 1) << ": \n"; 447 OS << " Function Score is: " << Function.getFunctionScore() 448 << "\n"; 449 OS << " There are " << Function.size() 450 << " number of blocks in this function.\n"; 451 OS << " There are " << Function.getInstructionCount() 452 << " number of instructions in this function.\n"; 453 OS << " The edit distance for this function is: " 454 << Function.getEditDistance() << "\n\n"; 455 } 456 } 457 } 458 459 void ReorderBasicBlocks::modifyFunctionLayout(BinaryFunction &BF, 460 LayoutType Type, 461 bool MinBranchClusters) const { 462 if (BF.size() == 0 || Type == LT_NONE) 463 return; 464 465 BinaryFunction::BasicBlockOrderType NewLayout; 466 std::unique_ptr<ReorderAlgorithm> Algo; 467 468 // Cannot do optimal layout without profile. 469 if (Type != LT_REVERSE && !BF.hasValidProfile()) 470 return; 471 472 if (Type == LT_REVERSE) { 473 Algo.reset(new ReverseReorderAlgorithm()); 474 } else if (BF.size() <= opts::TSPThreshold && Type != LT_OPTIMIZE_SHUFFLE) { 475 // Work on optimal solution if problem is small enough 476 LLVM_DEBUG(dbgs() << "finding optimal block layout for " << BF << "\n"); 477 Algo.reset(new TSPReorderAlgorithm()); 478 } else { 479 LLVM_DEBUG(dbgs() << "running block layout heuristics on " << BF << "\n"); 480 481 std::unique_ptr<ClusterAlgorithm> CAlgo; 482 if (MinBranchClusters) 483 CAlgo.reset(new MinBranchGreedyClusterAlgorithm()); 484 else 485 CAlgo.reset(new PHGreedyClusterAlgorithm()); 486 487 switch (Type) { 488 case LT_OPTIMIZE: 489 Algo.reset(new OptimizeReorderAlgorithm(std::move(CAlgo))); 490 break; 491 492 case LT_OPTIMIZE_BRANCH: 493 Algo.reset(new OptimizeBranchReorderAlgorithm(std::move(CAlgo))); 494 break; 495 496 case LT_OPTIMIZE_CACHE: 497 Algo.reset(new OptimizeCacheReorderAlgorithm(std::move(CAlgo))); 498 break; 499 500 case LT_OPTIMIZE_EXT_TSP: 501 Algo.reset(new ExtTSPReorderAlgorithm()); 502 break; 503 504 case LT_OPTIMIZE_SHUFFLE: 505 Algo.reset(new RandomClusterReorderAlgorithm(std::move(CAlgo))); 506 break; 507 508 default: 509 llvm_unreachable("unexpected layout type"); 510 } 511 } 512 513 Algo->reorderBasicBlocks(BF, NewLayout); 514 515 BF.updateBasicBlockLayout(NewLayout); 516 } 517 518 void FixupBranches::runOnFunctions(BinaryContext &BC) { 519 for (auto &It : BC.getBinaryFunctions()) { 520 BinaryFunction &Function = It.second; 521 if (!BC.shouldEmit(Function) || !Function.isSimple()) 522 continue; 523 524 Function.fixBranches(); 525 } 526 } 527 528 void FinalizeFunctions::runOnFunctions(BinaryContext &BC) { 529 ParallelUtilities::WorkFuncTy WorkFun = [&](BinaryFunction &BF) { 530 if (!BF.finalizeCFIState()) { 531 if (BC.HasRelocations) { 532 errs() << "BOLT-ERROR: unable to fix CFI state for function " << BF 533 << ". Exiting.\n"; 534 exit(1); 535 } 536 BF.setSimple(false); 537 return; 538 } 539 540 BF.setFinalized(); 541 542 // Update exception handling information. 543 BF.updateEHRanges(); 544 }; 545 546 ParallelUtilities::PredicateTy SkipPredicate = [&](const BinaryFunction &BF) { 547 return !BC.shouldEmit(BF); 548 }; 549 550 ParallelUtilities::runOnEachFunction( 551 BC, ParallelUtilities::SchedulingPolicy::SP_CONSTANT, WorkFun, 552 SkipPredicate, "FinalizeFunctions"); 553 } 554 555 void CheckLargeFunctions::runOnFunctions(BinaryContext &BC) { 556 if (BC.HasRelocations) 557 return; 558 559 if (!opts::UpdateDebugSections) 560 return; 561 562 // If the function wouldn't fit, mark it as non-simple. Otherwise, we may emit 563 // incorrect debug info. 564 ParallelUtilities::WorkFuncTy WorkFun = [&](BinaryFunction &BF) { 565 uint64_t HotSize, ColdSize; 566 std::tie(HotSize, ColdSize) = 567 BC.calculateEmittedSize(BF, /*FixBranches=*/false); 568 if (HotSize > BF.getMaxSize()) 569 BF.setSimple(false); 570 }; 571 572 ParallelUtilities::PredicateTy SkipFunc = [&](const BinaryFunction &BF) { 573 return !shouldOptimize(BF); 574 }; 575 576 ParallelUtilities::runOnEachFunction( 577 BC, ParallelUtilities::SchedulingPolicy::SP_INST_LINEAR, WorkFun, 578 SkipFunc, "CheckLargeFunctions"); 579 } 580 581 bool CheckLargeFunctions::shouldOptimize(const BinaryFunction &BF) const { 582 // Unlike other passes, allow functions in non-CFG state. 583 return BF.isSimple() && !BF.isIgnored(); 584 } 585 586 void LowerAnnotations::runOnFunctions(BinaryContext &BC) { 587 std::vector<std::pair<MCInst *, uint32_t>> PreservedOffsetAnnotations; 588 589 for (auto &It : BC.getBinaryFunctions()) { 590 BinaryFunction &BF = It.second; 591 int64_t CurrentGnuArgsSize = 0; 592 593 // Have we crossed hot/cold border for split functions? 594 bool SeenCold = false; 595 596 for (BinaryBasicBlock *BB : BF.layout()) { 597 if (BB->isCold() && !SeenCold) { 598 SeenCold = true; 599 CurrentGnuArgsSize = 0; 600 } 601 602 // First convert GnuArgsSize annotations into CFIs. This may change instr 603 // pointers, so do it before recording ptrs for preserved annotations 604 if (BF.usesGnuArgsSize()) { 605 for (auto II = BB->begin(); II != BB->end(); ++II) { 606 if (!BC.MIB->isInvoke(*II)) 607 continue; 608 const int64_t NewGnuArgsSize = BC.MIB->getGnuArgsSize(*II); 609 assert(NewGnuArgsSize >= 0 && "expected non-negative GNU_args_size"); 610 if (NewGnuArgsSize != CurrentGnuArgsSize) { 611 auto InsertII = BF.addCFIInstruction( 612 BB, II, 613 MCCFIInstruction::createGnuArgsSize(nullptr, NewGnuArgsSize)); 614 CurrentGnuArgsSize = NewGnuArgsSize; 615 II = std::next(InsertII); 616 } 617 } 618 } 619 620 // Now record preserved annotations separately and then strip annotations. 621 for (auto II = BB->begin(); II != BB->end(); ++II) { 622 if (BF.requiresAddressTranslation() && BC.MIB->getOffset(*II)) 623 PreservedOffsetAnnotations.emplace_back(&(*II), 624 *BC.MIB->getOffset(*II)); 625 BC.MIB->stripAnnotations(*II); 626 } 627 } 628 } 629 for (BinaryFunction *BF : BC.getInjectedBinaryFunctions()) 630 for (BinaryBasicBlock &BB : *BF) 631 for (MCInst &Instruction : BB) 632 BC.MIB->stripAnnotations(Instruction); 633 634 // Release all memory taken by annotations 635 BC.MIB->freeAnnotations(); 636 637 // Reinsert preserved annotations we need during code emission. 638 for (const std::pair<MCInst *, uint32_t> &Item : PreservedOffsetAnnotations) 639 BC.MIB->setOffset(*Item.first, Item.second); 640 } 641 642 namespace { 643 644 // This peephole fixes jump instructions that jump to another basic 645 // block with a single jump instruction, e.g. 646 // 647 // B0: ... 648 // jmp B1 (or jcc B1) 649 // 650 // B1: jmp B2 651 // 652 // -> 653 // 654 // B0: ... 655 // jmp B2 (or jcc B2) 656 // 657 uint64_t fixDoubleJumps(BinaryFunction &Function, bool MarkInvalid) { 658 uint64_t NumDoubleJumps = 0; 659 660 MCContext *Ctx = Function.getBinaryContext().Ctx.get(); 661 MCPlusBuilder *MIB = Function.getBinaryContext().MIB.get(); 662 for (BinaryBasicBlock &BB : Function) { 663 auto checkAndPatch = [&](BinaryBasicBlock *Pred, BinaryBasicBlock *Succ, 664 const MCSymbol *SuccSym) { 665 // Ignore infinite loop jumps or fallthrough tail jumps. 666 if (Pred == Succ || Succ == &BB) 667 return false; 668 669 if (Succ) { 670 const MCSymbol *TBB = nullptr; 671 const MCSymbol *FBB = nullptr; 672 MCInst *CondBranch = nullptr; 673 MCInst *UncondBranch = nullptr; 674 bool Res = Pred->analyzeBranch(TBB, FBB, CondBranch, UncondBranch); 675 if (!Res) { 676 LLVM_DEBUG(dbgs() << "analyzeBranch failed in peepholes in block:\n"; 677 Pred->dump()); 678 return false; 679 } 680 Pred->replaceSuccessor(&BB, Succ); 681 682 // We must patch up any existing branch instructions to match up 683 // with the new successor. 684 assert((CondBranch || (!CondBranch && Pred->succ_size() == 1)) && 685 "Predecessor block has inconsistent number of successors"); 686 if (CondBranch && MIB->getTargetSymbol(*CondBranch) == BB.getLabel()) { 687 MIB->replaceBranchTarget(*CondBranch, Succ->getLabel(), Ctx); 688 } else if (UncondBranch && 689 MIB->getTargetSymbol(*UncondBranch) == BB.getLabel()) { 690 MIB->replaceBranchTarget(*UncondBranch, Succ->getLabel(), Ctx); 691 } else if (!UncondBranch) { 692 assert(Function.getBasicBlockAfter(Pred, false) != Succ && 693 "Don't add an explicit jump to a fallthrough block."); 694 Pred->addBranchInstruction(Succ); 695 } 696 } else { 697 // Succ will be null in the tail call case. In this case we 698 // need to explicitly add a tail call instruction. 699 MCInst *Branch = Pred->getLastNonPseudoInstr(); 700 if (Branch && MIB->isUnconditionalBranch(*Branch)) { 701 assert(MIB->getTargetSymbol(*Branch) == BB.getLabel()); 702 Pred->removeSuccessor(&BB); 703 Pred->eraseInstruction(Pred->findInstruction(Branch)); 704 Pred->addTailCallInstruction(SuccSym); 705 } else { 706 return false; 707 } 708 } 709 710 ++NumDoubleJumps; 711 LLVM_DEBUG(dbgs() << "Removed double jump in " << Function << " from " 712 << Pred->getName() << " -> " << BB.getName() << " to " 713 << Pred->getName() << " -> " << SuccSym->getName() 714 << (!Succ ? " (tail)\n" : "\n")); 715 716 return true; 717 }; 718 719 if (BB.getNumNonPseudos() != 1 || BB.isLandingPad()) 720 continue; 721 722 MCInst *Inst = BB.getFirstNonPseudoInstr(); 723 const bool IsTailCall = MIB->isTailCall(*Inst); 724 725 if (!MIB->isUnconditionalBranch(*Inst) && !IsTailCall) 726 continue; 727 728 // If we operate after SCTC make sure it's not a conditional tail call. 729 if (IsTailCall && MIB->isConditionalBranch(*Inst)) 730 continue; 731 732 const MCSymbol *SuccSym = MIB->getTargetSymbol(*Inst); 733 BinaryBasicBlock *Succ = BB.getSuccessor(); 734 735 if (((!Succ || &BB == Succ) && !IsTailCall) || (IsTailCall && !SuccSym)) 736 continue; 737 738 std::vector<BinaryBasicBlock *> Preds = {BB.pred_begin(), BB.pred_end()}; 739 740 for (BinaryBasicBlock *Pred : Preds) { 741 if (Pred->isLandingPad()) 742 continue; 743 744 if (Pred->getSuccessor() == &BB || 745 (Pred->getConditionalSuccessor(true) == &BB && !IsTailCall) || 746 Pred->getConditionalSuccessor(false) == &BB) 747 if (checkAndPatch(Pred, Succ, SuccSym) && MarkInvalid) 748 BB.markValid(BB.pred_size() != 0 || BB.isLandingPad() || 749 BB.isEntryPoint()); 750 } 751 } 752 753 return NumDoubleJumps; 754 } 755 } // namespace 756 757 bool SimplifyConditionalTailCalls::shouldRewriteBranch( 758 const BinaryBasicBlock *PredBB, const MCInst &CondBranch, 759 const BinaryBasicBlock *BB, const bool DirectionFlag) { 760 if (BeenOptimized.count(PredBB)) 761 return false; 762 763 const bool IsForward = BinaryFunction::isForwardBranch(PredBB, BB); 764 765 if (IsForward) 766 ++NumOrigForwardBranches; 767 else 768 ++NumOrigBackwardBranches; 769 770 if (opts::SctcMode == opts::SctcAlways) 771 return true; 772 773 if (opts::SctcMode == opts::SctcPreserveDirection) 774 return IsForward == DirectionFlag; 775 776 const ErrorOr<std::pair<double, double>> Frequency = 777 PredBB->getBranchStats(BB); 778 779 // It's ok to rewrite the conditional branch if the new target will be 780 // a backward branch. 781 782 // If no data available for these branches, then it should be ok to 783 // do the optimization since it will reduce code size. 784 if (Frequency.getError()) 785 return true; 786 787 // TODO: should this use misprediction frequency instead? 788 const bool Result = (IsForward && Frequency.get().first >= 0.5) || 789 (!IsForward && Frequency.get().first <= 0.5); 790 791 return Result == DirectionFlag; 792 } 793 794 uint64_t SimplifyConditionalTailCalls::fixTailCalls(BinaryFunction &BF) { 795 // Need updated indices to correctly detect branch' direction. 796 BF.updateLayoutIndices(); 797 BF.markUnreachableBlocks(); 798 799 MCPlusBuilder *MIB = BF.getBinaryContext().MIB.get(); 800 MCContext *Ctx = BF.getBinaryContext().Ctx.get(); 801 uint64_t NumLocalCTCCandidates = 0; 802 uint64_t NumLocalCTCs = 0; 803 uint64_t LocalCTCTakenCount = 0; 804 uint64_t LocalCTCExecCount = 0; 805 std::vector<std::pair<BinaryBasicBlock *, const BinaryBasicBlock *>> 806 NeedsUncondBranch; 807 808 // Will block be deleted by UCE? 809 auto isValid = [](const BinaryBasicBlock *BB) { 810 return (BB->pred_size() != 0 || BB->isLandingPad() || BB->isEntryPoint()); 811 }; 812 813 for (BinaryBasicBlock *BB : BF.layout()) { 814 // Locate BB with a single direct tail-call instruction. 815 if (BB->getNumNonPseudos() != 1) 816 continue; 817 818 MCInst *Instr = BB->getFirstNonPseudoInstr(); 819 if (!MIB->isTailCall(*Instr) || MIB->isConditionalBranch(*Instr)) 820 continue; 821 822 const MCSymbol *CalleeSymbol = MIB->getTargetSymbol(*Instr); 823 if (!CalleeSymbol) 824 continue; 825 826 // Detect direction of the possible conditional tail call. 827 const bool IsForwardCTC = BF.isForwardCall(CalleeSymbol); 828 829 // Iterate through all predecessors. 830 for (BinaryBasicBlock *PredBB : BB->predecessors()) { 831 BinaryBasicBlock *CondSucc = PredBB->getConditionalSuccessor(true); 832 if (!CondSucc) 833 continue; 834 835 ++NumLocalCTCCandidates; 836 837 const MCSymbol *TBB = nullptr; 838 const MCSymbol *FBB = nullptr; 839 MCInst *CondBranch = nullptr; 840 MCInst *UncondBranch = nullptr; 841 bool Result = PredBB->analyzeBranch(TBB, FBB, CondBranch, UncondBranch); 842 843 // analyzeBranch() can fail due to unusual branch instructions, e.g. jrcxz 844 if (!Result) { 845 LLVM_DEBUG(dbgs() << "analyzeBranch failed in SCTC in block:\n"; 846 PredBB->dump()); 847 continue; 848 } 849 850 assert(Result && "internal error analyzing conditional branch"); 851 assert(CondBranch && "conditional branch expected"); 852 853 // It's possible that PredBB is also a successor to BB that may have 854 // been processed by a previous iteration of the SCTC loop, in which 855 // case it may have been marked invalid. We should skip rewriting in 856 // this case. 857 if (!PredBB->isValid()) { 858 assert(PredBB->isSuccessor(BB) && 859 "PredBB should be valid if it is not a successor to BB"); 860 continue; 861 } 862 863 // We don't want to reverse direction of the branch in new order 864 // without further profile analysis. 865 const bool DirectionFlag = CondSucc == BB ? IsForwardCTC : !IsForwardCTC; 866 if (!shouldRewriteBranch(PredBB, *CondBranch, BB, DirectionFlag)) 867 continue; 868 869 // Record this block so that we don't try to optimize it twice. 870 BeenOptimized.insert(PredBB); 871 872 uint64_t Count = 0; 873 if (CondSucc != BB) { 874 // Patch the new target address into the conditional branch. 875 MIB->reverseBranchCondition(*CondBranch, CalleeSymbol, Ctx); 876 // Since we reversed the condition on the branch we need to change 877 // the target for the unconditional branch or add a unconditional 878 // branch to the old target. This has to be done manually since 879 // fixupBranches is not called after SCTC. 880 NeedsUncondBranch.emplace_back(PredBB, CondSucc); 881 Count = PredBB->getFallthroughBranchInfo().Count; 882 } else { 883 // Change destination of the conditional branch. 884 MIB->replaceBranchTarget(*CondBranch, CalleeSymbol, Ctx); 885 Count = PredBB->getTakenBranchInfo().Count; 886 } 887 const uint64_t CTCTakenFreq = 888 Count == BinaryBasicBlock::COUNT_NO_PROFILE ? 0 : Count; 889 890 // Annotate it, so "isCall" returns true for this jcc 891 MIB->setConditionalTailCall(*CondBranch); 892 // Add info abount the conditional tail call frequency, otherwise this 893 // info will be lost when we delete the associated BranchInfo entry 894 auto &CTCAnnotation = 895 MIB->getOrCreateAnnotationAs<uint64_t>(*CondBranch, "CTCTakenCount"); 896 CTCAnnotation = CTCTakenFreq; 897 898 // Remove the unused successor which may be eliminated later 899 // if there are no other users. 900 PredBB->removeSuccessor(BB); 901 // Update BB execution count 902 if (CTCTakenFreq && CTCTakenFreq <= BB->getKnownExecutionCount()) 903 BB->setExecutionCount(BB->getExecutionCount() - CTCTakenFreq); 904 else if (CTCTakenFreq > BB->getKnownExecutionCount()) 905 BB->setExecutionCount(0); 906 907 ++NumLocalCTCs; 908 LocalCTCTakenCount += CTCTakenFreq; 909 LocalCTCExecCount += PredBB->getKnownExecutionCount(); 910 } 911 912 // Remove the block from CFG if all predecessors were removed. 913 BB->markValid(isValid(BB)); 914 } 915 916 // Add unconditional branches at the end of BBs to new successors 917 // as long as the successor is not a fallthrough. 918 for (auto &Entry : NeedsUncondBranch) { 919 BinaryBasicBlock *PredBB = Entry.first; 920 const BinaryBasicBlock *CondSucc = Entry.second; 921 922 const MCSymbol *TBB = nullptr; 923 const MCSymbol *FBB = nullptr; 924 MCInst *CondBranch = nullptr; 925 MCInst *UncondBranch = nullptr; 926 PredBB->analyzeBranch(TBB, FBB, CondBranch, UncondBranch); 927 928 // Find the next valid block. Invalid blocks will be deleted 929 // so they shouldn't be considered fallthrough targets. 930 const BinaryBasicBlock *NextBlock = BF.getBasicBlockAfter(PredBB, false); 931 while (NextBlock && !isValid(NextBlock)) 932 NextBlock = BF.getBasicBlockAfter(NextBlock, false); 933 934 // Get the unconditional successor to this block. 935 const BinaryBasicBlock *PredSucc = PredBB->getSuccessor(); 936 assert(PredSucc && "The other branch should be a tail call"); 937 938 const bool HasFallthrough = (NextBlock && PredSucc == NextBlock); 939 940 if (UncondBranch) { 941 if (HasFallthrough) 942 PredBB->eraseInstruction(PredBB->findInstruction(UncondBranch)); 943 else 944 MIB->replaceBranchTarget(*UncondBranch, CondSucc->getLabel(), Ctx); 945 } else if (!HasFallthrough) { 946 MCInst Branch; 947 MIB->createUncondBranch(Branch, CondSucc->getLabel(), Ctx); 948 PredBB->addInstruction(Branch); 949 } 950 } 951 952 if (NumLocalCTCs > 0) { 953 NumDoubleJumps += fixDoubleJumps(BF, true); 954 // Clean-up unreachable tail-call blocks. 955 const std::pair<unsigned, uint64_t> Stats = BF.eraseInvalidBBs(); 956 DeletedBlocks += Stats.first; 957 DeletedBytes += Stats.second; 958 959 assert(BF.validateCFG()); 960 } 961 962 LLVM_DEBUG(dbgs() << "BOLT: created " << NumLocalCTCs 963 << " conditional tail calls from a total of " 964 << NumLocalCTCCandidates << " candidates in function " << BF 965 << ". CTCs execution count for this function is " 966 << LocalCTCExecCount << " and CTC taken count is " 967 << LocalCTCTakenCount << "\n";); 968 969 NumTailCallsPatched += NumLocalCTCs; 970 NumCandidateTailCalls += NumLocalCTCCandidates; 971 CTCExecCount += LocalCTCExecCount; 972 CTCTakenCount += LocalCTCTakenCount; 973 974 return NumLocalCTCs > 0; 975 } 976 977 void SimplifyConditionalTailCalls::runOnFunctions(BinaryContext &BC) { 978 if (!BC.isX86()) 979 return; 980 981 for (auto &It : BC.getBinaryFunctions()) { 982 BinaryFunction &Function = It.second; 983 984 if (!shouldOptimize(Function)) 985 continue; 986 987 if (fixTailCalls(Function)) { 988 Modified.insert(&Function); 989 Function.setHasCanonicalCFG(false); 990 } 991 } 992 993 outs() << "BOLT-INFO: SCTC: patched " << NumTailCallsPatched 994 << " tail calls (" << NumOrigForwardBranches << " forward)" 995 << " tail calls (" << NumOrigBackwardBranches << " backward)" 996 << " from a total of " << NumCandidateTailCalls << " while removing " 997 << NumDoubleJumps << " double jumps" 998 << " and removing " << DeletedBlocks << " basic blocks" 999 << " totalling " << DeletedBytes 1000 << " bytes of code. CTCs total execution count is " << CTCExecCount 1001 << " and the number of times CTCs are taken is " << CTCTakenCount 1002 << ".\n"; 1003 } 1004 1005 uint64_t ShortenInstructions::shortenInstructions(BinaryFunction &Function) { 1006 uint64_t Count = 0; 1007 const BinaryContext &BC = Function.getBinaryContext(); 1008 for (BinaryBasicBlock &BB : Function) { 1009 for (MCInst &Inst : BB) { 1010 MCInst OriginalInst; 1011 if (opts::Verbosity > 2) 1012 OriginalInst = Inst; 1013 1014 if (!BC.MIB->shortenInstruction(Inst)) 1015 continue; 1016 1017 if (opts::Verbosity > 2) { 1018 outs() << "BOLT-INFO: shortening:\nBOLT-INFO: "; 1019 BC.printInstruction(outs(), OriginalInst, 0, &Function); 1020 outs() << "BOLT-INFO: to:"; 1021 BC.printInstruction(outs(), Inst, 0, &Function); 1022 } 1023 1024 ++Count; 1025 } 1026 } 1027 1028 return Count; 1029 } 1030 1031 void ShortenInstructions::runOnFunctions(BinaryContext &BC) { 1032 std::atomic<uint64_t> NumShortened{0}; 1033 if (!BC.isX86()) 1034 return; 1035 1036 ParallelUtilities::runOnEachFunction( 1037 BC, ParallelUtilities::SchedulingPolicy::SP_INST_LINEAR, 1038 [&](BinaryFunction &BF) { NumShortened += shortenInstructions(BF); }, 1039 nullptr, "ShortenInstructions"); 1040 1041 outs() << "BOLT-INFO: " << NumShortened << " instructions were shortened\n"; 1042 } 1043 1044 void Peepholes::addTailcallTraps(BinaryFunction &Function) { 1045 MCPlusBuilder *MIB = Function.getBinaryContext().MIB.get(); 1046 for (BinaryBasicBlock &BB : Function) { 1047 MCInst *Inst = BB.getLastNonPseudoInstr(); 1048 if (Inst && MIB->isTailCall(*Inst) && MIB->isIndirectBranch(*Inst)) { 1049 MCInst Trap; 1050 if (MIB->createTrap(Trap)) { 1051 BB.addInstruction(Trap); 1052 ++TailCallTraps; 1053 } 1054 } 1055 } 1056 } 1057 1058 void Peepholes::removeUselessCondBranches(BinaryFunction &Function) { 1059 for (BinaryBasicBlock &BB : Function) { 1060 if (BB.succ_size() != 2) 1061 continue; 1062 1063 BinaryBasicBlock *CondBB = BB.getConditionalSuccessor(true); 1064 BinaryBasicBlock *UncondBB = BB.getConditionalSuccessor(false); 1065 if (CondBB != UncondBB) 1066 continue; 1067 1068 const MCSymbol *TBB = nullptr; 1069 const MCSymbol *FBB = nullptr; 1070 MCInst *CondBranch = nullptr; 1071 MCInst *UncondBranch = nullptr; 1072 bool Result = BB.analyzeBranch(TBB, FBB, CondBranch, UncondBranch); 1073 1074 // analyzeBranch() can fail due to unusual branch instructions, 1075 // e.g. jrcxz, or jump tables (indirect jump). 1076 if (!Result || !CondBranch) 1077 continue; 1078 1079 BB.removeDuplicateConditionalSuccessor(CondBranch); 1080 ++NumUselessCondBranches; 1081 } 1082 } 1083 1084 void Peepholes::runOnFunctions(BinaryContext &BC) { 1085 const char Opts = 1086 std::accumulate(opts::Peepholes.begin(), opts::Peepholes.end(), 0, 1087 [](const char A, const PeepholeOpts B) { return A | B; }); 1088 if (Opts == PEEP_NONE) 1089 return; 1090 1091 for (auto &It : BC.getBinaryFunctions()) { 1092 BinaryFunction &Function = It.second; 1093 if (shouldOptimize(Function)) { 1094 if (Opts & PEEP_DOUBLE_JUMPS) 1095 NumDoubleJumps += fixDoubleJumps(Function, false); 1096 if (Opts & PEEP_TAILCALL_TRAPS) 1097 addTailcallTraps(Function); 1098 if (Opts & PEEP_USELESS_BRANCHES) 1099 removeUselessCondBranches(Function); 1100 assert(Function.validateCFG()); 1101 } 1102 } 1103 outs() << "BOLT-INFO: Peephole: " << NumDoubleJumps 1104 << " double jumps patched.\n" 1105 << "BOLT-INFO: Peephole: " << TailCallTraps 1106 << " tail call traps inserted.\n" 1107 << "BOLT-INFO: Peephole: " << NumUselessCondBranches 1108 << " useless conditional branches removed.\n"; 1109 } 1110 1111 bool SimplifyRODataLoads::simplifyRODataLoads(BinaryFunction &BF) { 1112 BinaryContext &BC = BF.getBinaryContext(); 1113 MCPlusBuilder *MIB = BC.MIB.get(); 1114 1115 uint64_t NumLocalLoadsSimplified = 0; 1116 uint64_t NumDynamicLocalLoadsSimplified = 0; 1117 uint64_t NumLocalLoadsFound = 0; 1118 uint64_t NumDynamicLocalLoadsFound = 0; 1119 1120 for (BinaryBasicBlock *BB : BF.layout()) { 1121 for (MCInst &Inst : *BB) { 1122 unsigned Opcode = Inst.getOpcode(); 1123 const MCInstrDesc &Desc = BC.MII->get(Opcode); 1124 1125 // Skip instructions that do not load from memory. 1126 if (!Desc.mayLoad()) 1127 continue; 1128 1129 // Try to statically evaluate the target memory address; 1130 uint64_t TargetAddress; 1131 1132 if (MIB->hasPCRelOperand(Inst)) { 1133 // Try to find the symbol that corresponds to the PC-relative operand. 1134 MCOperand *DispOpI = MIB->getMemOperandDisp(Inst); 1135 assert(DispOpI != Inst.end() && "expected PC-relative displacement"); 1136 assert(DispOpI->isExpr() && 1137 "found PC-relative with non-symbolic displacement"); 1138 1139 // Get displacement symbol. 1140 const MCSymbol *DisplSymbol; 1141 uint64_t DisplOffset; 1142 1143 std::tie(DisplSymbol, DisplOffset) = 1144 MIB->getTargetSymbolInfo(DispOpI->getExpr()); 1145 1146 if (!DisplSymbol) 1147 continue; 1148 1149 // Look up the symbol address in the global symbols map of the binary 1150 // context object. 1151 BinaryData *BD = BC.getBinaryDataByName(DisplSymbol->getName()); 1152 if (!BD) 1153 continue; 1154 TargetAddress = BD->getAddress() + DisplOffset; 1155 } else if (!MIB->evaluateMemOperandTarget(Inst, TargetAddress)) { 1156 continue; 1157 } 1158 1159 // Get the contents of the section containing the target address of the 1160 // memory operand. We are only interested in read-only sections. 1161 ErrorOr<BinarySection &> DataSection = 1162 BC.getSectionForAddress(TargetAddress); 1163 if (!DataSection || !DataSection->isReadOnly()) 1164 continue; 1165 1166 if (BC.getRelocationAt(TargetAddress) || 1167 BC.getDynamicRelocationAt(TargetAddress)) 1168 continue; 1169 1170 uint32_t Offset = TargetAddress - DataSection->getAddress(); 1171 StringRef ConstantData = DataSection->getContents(); 1172 1173 ++NumLocalLoadsFound; 1174 if (BB->hasProfile()) 1175 NumDynamicLocalLoadsFound += BB->getExecutionCount(); 1176 1177 if (MIB->replaceMemOperandWithImm(Inst, ConstantData, Offset)) { 1178 ++NumLocalLoadsSimplified; 1179 if (BB->hasProfile()) 1180 NumDynamicLocalLoadsSimplified += BB->getExecutionCount(); 1181 } 1182 } 1183 } 1184 1185 NumLoadsFound += NumLocalLoadsFound; 1186 NumDynamicLoadsFound += NumDynamicLocalLoadsFound; 1187 NumLoadsSimplified += NumLocalLoadsSimplified; 1188 NumDynamicLoadsSimplified += NumDynamicLocalLoadsSimplified; 1189 1190 return NumLocalLoadsSimplified > 0; 1191 } 1192 1193 void SimplifyRODataLoads::runOnFunctions(BinaryContext &BC) { 1194 for (auto &It : BC.getBinaryFunctions()) { 1195 BinaryFunction &Function = It.second; 1196 if (shouldOptimize(Function) && simplifyRODataLoads(Function)) 1197 Modified.insert(&Function); 1198 } 1199 1200 outs() << "BOLT-INFO: simplified " << NumLoadsSimplified << " out of " 1201 << NumLoadsFound << " loads from a statically computed address.\n" 1202 << "BOLT-INFO: dynamic loads simplified: " << NumDynamicLoadsSimplified 1203 << "\n" 1204 << "BOLT-INFO: dynamic loads found: " << NumDynamicLoadsFound << "\n"; 1205 } 1206 1207 void AssignSections::runOnFunctions(BinaryContext &BC) { 1208 for (BinaryFunction *Function : BC.getInjectedBinaryFunctions()) { 1209 Function->setCodeSectionName(BC.getInjectedCodeSectionName()); 1210 Function->setColdCodeSectionName(BC.getInjectedColdCodeSectionName()); 1211 } 1212 1213 // In non-relocation mode functions have pre-assigned section names. 1214 if (!BC.HasRelocations) 1215 return; 1216 1217 const bool UseColdSection = 1218 BC.NumProfiledFuncs > 0 || 1219 opts::ReorderFunctions == ReorderFunctions::RT_USER; 1220 for (auto &BFI : BC.getBinaryFunctions()) { 1221 BinaryFunction &Function = BFI.second; 1222 if (opts::isHotTextMover(Function)) { 1223 Function.setCodeSectionName(BC.getHotTextMoverSectionName()); 1224 Function.setColdCodeSectionName(BC.getHotTextMoverSectionName()); 1225 continue; 1226 } 1227 1228 if (!UseColdSection || Function.hasValidIndex() || 1229 Function.hasValidProfile()) 1230 Function.setCodeSectionName(BC.getMainCodeSectionName()); 1231 else 1232 Function.setCodeSectionName(BC.getColdCodeSectionName()); 1233 1234 if (Function.isSplit()) 1235 Function.setColdCodeSectionName(BC.getColdCodeSectionName()); 1236 } 1237 } 1238 1239 void PrintProfileStats::runOnFunctions(BinaryContext &BC) { 1240 double FlowImbalanceMean = 0.0; 1241 size_t NumBlocksConsidered = 0; 1242 double WorstBias = 0.0; 1243 const BinaryFunction *WorstBiasFunc = nullptr; 1244 1245 // For each function CFG, we fill an IncomingMap with the sum of the frequency 1246 // of incoming edges for each BB. Likewise for each OutgoingMap and the sum 1247 // of the frequency of outgoing edges. 1248 using FlowMapTy = std::unordered_map<const BinaryBasicBlock *, uint64_t>; 1249 std::unordered_map<const BinaryFunction *, FlowMapTy> TotalIncomingMaps; 1250 std::unordered_map<const BinaryFunction *, FlowMapTy> TotalOutgoingMaps; 1251 1252 // Compute mean 1253 for (const auto &BFI : BC.getBinaryFunctions()) { 1254 const BinaryFunction &Function = BFI.second; 1255 if (Function.empty() || !Function.isSimple()) 1256 continue; 1257 FlowMapTy &IncomingMap = TotalIncomingMaps[&Function]; 1258 FlowMapTy &OutgoingMap = TotalOutgoingMaps[&Function]; 1259 for (const BinaryBasicBlock &BB : Function) { 1260 uint64_t TotalOutgoing = 0ULL; 1261 auto SuccBIIter = BB.branch_info_begin(); 1262 for (BinaryBasicBlock *Succ : BB.successors()) { 1263 uint64_t Count = SuccBIIter->Count; 1264 if (Count == BinaryBasicBlock::COUNT_NO_PROFILE || Count == 0) { 1265 ++SuccBIIter; 1266 continue; 1267 } 1268 TotalOutgoing += Count; 1269 IncomingMap[Succ] += Count; 1270 ++SuccBIIter; 1271 } 1272 OutgoingMap[&BB] = TotalOutgoing; 1273 } 1274 1275 size_t NumBlocks = 0; 1276 double Mean = 0.0; 1277 for (const BinaryBasicBlock &BB : Function) { 1278 // Do not compute score for low frequency blocks, entry or exit blocks 1279 if (IncomingMap[&BB] < 100 || OutgoingMap[&BB] == 0 || BB.isEntryPoint()) 1280 continue; 1281 ++NumBlocks; 1282 const double Difference = (double)OutgoingMap[&BB] - IncomingMap[&BB]; 1283 Mean += fabs(Difference / IncomingMap[&BB]); 1284 } 1285 1286 FlowImbalanceMean += Mean; 1287 NumBlocksConsidered += NumBlocks; 1288 if (!NumBlocks) 1289 continue; 1290 double FuncMean = Mean / NumBlocks; 1291 if (FuncMean > WorstBias) { 1292 WorstBias = FuncMean; 1293 WorstBiasFunc = &Function; 1294 } 1295 } 1296 if (NumBlocksConsidered > 0) 1297 FlowImbalanceMean /= NumBlocksConsidered; 1298 1299 // Compute standard deviation 1300 NumBlocksConsidered = 0; 1301 double FlowImbalanceVar = 0.0; 1302 for (const auto &BFI : BC.getBinaryFunctions()) { 1303 const BinaryFunction &Function = BFI.second; 1304 if (Function.empty() || !Function.isSimple()) 1305 continue; 1306 FlowMapTy &IncomingMap = TotalIncomingMaps[&Function]; 1307 FlowMapTy &OutgoingMap = TotalOutgoingMaps[&Function]; 1308 for (const BinaryBasicBlock &BB : Function) { 1309 if (IncomingMap[&BB] < 100 || OutgoingMap[&BB] == 0) 1310 continue; 1311 ++NumBlocksConsidered; 1312 const double Difference = (double)OutgoingMap[&BB] - IncomingMap[&BB]; 1313 FlowImbalanceVar += 1314 pow(fabs(Difference / IncomingMap[&BB]) - FlowImbalanceMean, 2); 1315 } 1316 } 1317 if (NumBlocksConsidered) { 1318 FlowImbalanceVar /= NumBlocksConsidered; 1319 FlowImbalanceVar = sqrt(FlowImbalanceVar); 1320 } 1321 1322 // Report to user 1323 outs() << format("BOLT-INFO: Profile bias score: %.4lf%% StDev: %.4lf%%\n", 1324 (100.0 * FlowImbalanceMean), (100.0 * FlowImbalanceVar)); 1325 if (WorstBiasFunc && opts::Verbosity >= 1) { 1326 outs() << "Worst average bias observed in " << WorstBiasFunc->getPrintName() 1327 << "\n"; 1328 LLVM_DEBUG(WorstBiasFunc->dump()); 1329 } 1330 } 1331 1332 void PrintProgramStats::runOnFunctions(BinaryContext &BC) { 1333 uint64_t NumRegularFunctions = 0; 1334 uint64_t NumStaleProfileFunctions = 0; 1335 uint64_t NumNonSimpleProfiledFunctions = 0; 1336 uint64_t NumUnknownControlFlowFunctions = 0; 1337 uint64_t TotalSampleCount = 0; 1338 uint64_t StaleSampleCount = 0; 1339 std::vector<const BinaryFunction *> ProfiledFunctions; 1340 const char *StaleFuncsHeader = "BOLT-INFO: Functions with stale profile:\n"; 1341 for (auto &BFI : BC.getBinaryFunctions()) { 1342 const BinaryFunction &Function = BFI.second; 1343 1344 // Ignore PLT functions for stats. 1345 if (Function.isPLTFunction()) 1346 continue; 1347 1348 ++NumRegularFunctions; 1349 1350 if (!Function.isSimple()) { 1351 if (Function.hasProfile()) 1352 ++NumNonSimpleProfiledFunctions; 1353 continue; 1354 } 1355 1356 if (Function.hasUnknownControlFlow()) { 1357 if (opts::PrintUnknownCFG) 1358 Function.dump(); 1359 else if (opts::PrintUnknown) 1360 errs() << "function with unknown control flow: " << Function << '\n'; 1361 1362 ++NumUnknownControlFlowFunctions; 1363 } 1364 1365 if (!Function.hasProfile()) 1366 continue; 1367 1368 uint64_t SampleCount = Function.getRawBranchCount(); 1369 TotalSampleCount += SampleCount; 1370 1371 if (Function.hasValidProfile()) { 1372 ProfiledFunctions.push_back(&Function); 1373 } else { 1374 if (opts::ReportStaleFuncs) { 1375 outs() << StaleFuncsHeader; 1376 StaleFuncsHeader = ""; 1377 outs() << " " << Function << '\n'; 1378 } 1379 ++NumStaleProfileFunctions; 1380 StaleSampleCount += SampleCount; 1381 } 1382 } 1383 BC.NumProfiledFuncs = ProfiledFunctions.size(); 1384 1385 const size_t NumAllProfiledFunctions = 1386 ProfiledFunctions.size() + NumStaleProfileFunctions; 1387 outs() << "BOLT-INFO: " << NumAllProfiledFunctions << " out of " 1388 << NumRegularFunctions << " functions in the binary (" 1389 << format("%.1f", NumAllProfiledFunctions / 1390 (float)NumRegularFunctions * 100.0f) 1391 << "%) have non-empty execution profile\n"; 1392 if (NumNonSimpleProfiledFunctions) { 1393 outs() << "BOLT-INFO: " << NumNonSimpleProfiledFunctions << " function" 1394 << (NumNonSimpleProfiledFunctions == 1 ? "" : "s") 1395 << " with profile could not be optimized\n"; 1396 } 1397 if (NumStaleProfileFunctions) { 1398 const float PctStale = 1399 NumStaleProfileFunctions / (float)NumAllProfiledFunctions * 100.0f; 1400 auto printErrorOrWarning = [&]() { 1401 if (PctStale > opts::StaleThreshold) 1402 errs() << "BOLT-ERROR: "; 1403 else 1404 errs() << "BOLT-WARNING: "; 1405 }; 1406 printErrorOrWarning(); 1407 errs() << NumStaleProfileFunctions 1408 << format(" (%.1f%% of all profiled)", PctStale) << " function" 1409 << (NumStaleProfileFunctions == 1 ? "" : "s") 1410 << " have invalid (possibly stale) profile." 1411 " Use -report-stale to see the list.\n"; 1412 if (TotalSampleCount > 0) { 1413 printErrorOrWarning(); 1414 errs() << StaleSampleCount << " out of " << TotalSampleCount 1415 << " samples in the binary (" 1416 << format("%.1f", ((100.0f * StaleSampleCount) / TotalSampleCount)) 1417 << "%) belong to functions with invalid" 1418 " (possibly stale) profile.\n"; 1419 } 1420 if (PctStale > opts::StaleThreshold) { 1421 errs() << "BOLT-ERROR: stale functions exceed specified threshold of " 1422 << opts::StaleThreshold << "%. Exiting.\n"; 1423 exit(1); 1424 } 1425 } 1426 1427 if (const uint64_t NumUnusedObjects = BC.getNumUnusedProfiledObjects()) { 1428 outs() << "BOLT-INFO: profile for " << NumUnusedObjects 1429 << " objects was ignored\n"; 1430 } 1431 1432 if (ProfiledFunctions.size() > 10) { 1433 if (opts::Verbosity >= 1) { 1434 outs() << "BOLT-INFO: top called functions are:\n"; 1435 std::sort(ProfiledFunctions.begin(), ProfiledFunctions.end(), 1436 [](const BinaryFunction *A, const BinaryFunction *B) { 1437 return B->getExecutionCount() < A->getExecutionCount(); 1438 }); 1439 auto SFI = ProfiledFunctions.begin(); 1440 auto SFIend = ProfiledFunctions.end(); 1441 for (unsigned I = 0u; I < opts::TopCalledLimit && SFI != SFIend; 1442 ++SFI, ++I) 1443 outs() << " " << **SFI << " : " << (*SFI)->getExecutionCount() << '\n'; 1444 } 1445 } 1446 1447 if (!opts::PrintSortedBy.empty() && 1448 std::find(opts::PrintSortedBy.begin(), opts::PrintSortedBy.end(), 1449 DynoStats::FIRST_DYNO_STAT) == opts::PrintSortedBy.end()) { 1450 1451 std::vector<const BinaryFunction *> Functions; 1452 std::map<const BinaryFunction *, DynoStats> Stats; 1453 1454 for (const auto &BFI : BC.getBinaryFunctions()) { 1455 const BinaryFunction &BF = BFI.second; 1456 if (shouldOptimize(BF) && BF.hasValidProfile()) { 1457 Functions.push_back(&BF); 1458 Stats.emplace(&BF, getDynoStats(BF)); 1459 } 1460 } 1461 1462 const bool SortAll = 1463 std::find(opts::PrintSortedBy.begin(), opts::PrintSortedBy.end(), 1464 DynoStats::LAST_DYNO_STAT) != opts::PrintSortedBy.end(); 1465 1466 const bool Ascending = 1467 opts::DynoStatsSortOrderOpt == opts::DynoStatsSortOrder::Ascending; 1468 1469 if (SortAll) { 1470 std::stable_sort(Functions.begin(), Functions.end(), 1471 [Ascending, &Stats](const BinaryFunction *A, 1472 const BinaryFunction *B) { 1473 return Ascending ? Stats.at(A) < Stats.at(B) 1474 : Stats.at(B) < Stats.at(A); 1475 }); 1476 } else { 1477 std::stable_sort( 1478 Functions.begin(), Functions.end(), 1479 [Ascending, &Stats](const BinaryFunction *A, 1480 const BinaryFunction *B) { 1481 const DynoStats &StatsA = Stats.at(A); 1482 const DynoStats &StatsB = Stats.at(B); 1483 return Ascending ? StatsA.lessThan(StatsB, opts::PrintSortedBy) 1484 : StatsB.lessThan(StatsA, opts::PrintSortedBy); 1485 }); 1486 } 1487 1488 outs() << "BOLT-INFO: top functions sorted by "; 1489 if (SortAll) { 1490 outs() << "dyno stats"; 1491 } else { 1492 outs() << "("; 1493 bool PrintComma = false; 1494 for (const DynoStats::Category Category : opts::PrintSortedBy) { 1495 if (PrintComma) 1496 outs() << ", "; 1497 outs() << DynoStats::Description(Category); 1498 PrintComma = true; 1499 } 1500 outs() << ")"; 1501 } 1502 1503 outs() << " are:\n"; 1504 auto SFI = Functions.begin(); 1505 for (unsigned I = 0; I < 100 && SFI != Functions.end(); ++SFI, ++I) { 1506 const DynoStats Stats = getDynoStats(**SFI); 1507 outs() << " " << **SFI; 1508 if (!SortAll) { 1509 outs() << " ("; 1510 bool PrintComma = false; 1511 for (const DynoStats::Category Category : opts::PrintSortedBy) { 1512 if (PrintComma) 1513 outs() << ", "; 1514 outs() << dynoStatsOptName(Category) << "=" << Stats[Category]; 1515 PrintComma = true; 1516 } 1517 outs() << ")"; 1518 } 1519 outs() << "\n"; 1520 } 1521 } 1522 1523 if (!BC.TrappedFunctions.empty()) { 1524 errs() << "BOLT-WARNING: " << BC.TrappedFunctions.size() << " function" 1525 << (BC.TrappedFunctions.size() > 1 ? "s" : "") 1526 << " will trap on entry. Use -trap-avx512=0 to disable" 1527 " traps."; 1528 if (opts::Verbosity >= 1 || BC.TrappedFunctions.size() <= 5) { 1529 errs() << '\n'; 1530 for (const BinaryFunction *Function : BC.TrappedFunctions) 1531 errs() << " " << *Function << '\n'; 1532 } else { 1533 errs() << " Use -v=1 to see the list.\n"; 1534 } 1535 } 1536 1537 // Print information on missed macro-fusion opportunities seen on input. 1538 if (BC.MissedMacroFusionPairs) { 1539 outs() << "BOLT-INFO: the input contains " << BC.MissedMacroFusionPairs 1540 << " (dynamic count : " << BC.MissedMacroFusionExecCount 1541 << ") opportunities for macro-fusion optimization"; 1542 switch (opts::AlignMacroOpFusion) { 1543 case MFT_NONE: 1544 outs() << ". Use -align-macro-fusion to fix.\n"; 1545 break; 1546 case MFT_HOT: 1547 outs() << ". Will fix instances on a hot path.\n"; 1548 break; 1549 case MFT_ALL: 1550 outs() << " that are going to be fixed\n"; 1551 break; 1552 } 1553 } 1554 1555 // Collect and print information about suboptimal code layout on input. 1556 if (opts::ReportBadLayout) { 1557 std::vector<const BinaryFunction *> SuboptimalFuncs; 1558 for (auto &BFI : BC.getBinaryFunctions()) { 1559 const BinaryFunction &BF = BFI.second; 1560 if (!BF.hasValidProfile()) 1561 continue; 1562 1563 const uint64_t HotThreshold = 1564 std::max<uint64_t>(BF.getKnownExecutionCount(), 1); 1565 bool HotSeen = false; 1566 for (const BinaryBasicBlock *BB : BF.rlayout()) { 1567 if (!HotSeen && BB->getKnownExecutionCount() > HotThreshold) { 1568 HotSeen = true; 1569 continue; 1570 } 1571 if (HotSeen && BB->getKnownExecutionCount() == 0) { 1572 SuboptimalFuncs.push_back(&BF); 1573 break; 1574 } 1575 } 1576 } 1577 1578 if (!SuboptimalFuncs.empty()) { 1579 std::sort(SuboptimalFuncs.begin(), SuboptimalFuncs.end(), 1580 [](const BinaryFunction *A, const BinaryFunction *B) { 1581 return A->getKnownExecutionCount() / A->getSize() > 1582 B->getKnownExecutionCount() / B->getSize(); 1583 }); 1584 1585 outs() << "BOLT-INFO: " << SuboptimalFuncs.size() 1586 << " functions have " 1587 "cold code in the middle of hot code. Top functions are:\n"; 1588 for (unsigned I = 0; 1589 I < std::min(static_cast<size_t>(opts::ReportBadLayout), 1590 SuboptimalFuncs.size()); 1591 ++I) 1592 SuboptimalFuncs[I]->print(outs()); 1593 } 1594 } 1595 1596 if (NumUnknownControlFlowFunctions) { 1597 outs() << "BOLT-INFO: " << NumUnknownControlFlowFunctions 1598 << " functions have instructions with unknown control flow"; 1599 if (!opts::PrintUnknown) 1600 outs() << ". Use -print-unknown to see the list."; 1601 outs() << '\n'; 1602 } 1603 } 1604 1605 void InstructionLowering::runOnFunctions(BinaryContext &BC) { 1606 for (auto &BFI : BC.getBinaryFunctions()) 1607 for (BinaryBasicBlock &BB : BFI.second) 1608 for (MCInst &Instruction : BB) 1609 BC.MIB->lowerTailCall(Instruction); 1610 } 1611 1612 void StripRepRet::runOnFunctions(BinaryContext &BC) { 1613 uint64_t NumPrefixesRemoved = 0; 1614 uint64_t NumBytesSaved = 0; 1615 for (auto &BFI : BC.getBinaryFunctions()) { 1616 for (BinaryBasicBlock &BB : BFI.second) { 1617 auto LastInstRIter = BB.getLastNonPseudo(); 1618 if (LastInstRIter == BB.rend() || !BC.MIB->isReturn(*LastInstRIter) || 1619 !BC.MIB->deleteREPPrefix(*LastInstRIter)) 1620 continue; 1621 1622 NumPrefixesRemoved += BB.getKnownExecutionCount(); 1623 ++NumBytesSaved; 1624 } 1625 } 1626 1627 if (NumBytesSaved) 1628 outs() << "BOLT-INFO: removed " << NumBytesSaved 1629 << " 'repz' prefixes" 1630 " with estimated execution count of " 1631 << NumPrefixesRemoved << " times.\n"; 1632 } 1633 1634 void InlineMemcpy::runOnFunctions(BinaryContext &BC) { 1635 if (!BC.isX86()) 1636 return; 1637 1638 uint64_t NumInlined = 0; 1639 uint64_t NumInlinedDyno = 0; 1640 for (auto &BFI : BC.getBinaryFunctions()) { 1641 for (BinaryBasicBlock &BB : BFI.second) { 1642 for (auto II = BB.begin(); II != BB.end(); ++II) { 1643 MCInst &Inst = *II; 1644 1645 if (!BC.MIB->isCall(Inst) || MCPlus::getNumPrimeOperands(Inst) != 1 || 1646 !Inst.getOperand(0).isExpr()) 1647 continue; 1648 1649 const MCSymbol *CalleeSymbol = BC.MIB->getTargetSymbol(Inst); 1650 if (CalleeSymbol->getName() != "memcpy" && 1651 CalleeSymbol->getName() != "memcpy@PLT" && 1652 CalleeSymbol->getName() != "_memcpy8") 1653 continue; 1654 1655 const bool IsMemcpy8 = (CalleeSymbol->getName() == "_memcpy8"); 1656 const bool IsTailCall = BC.MIB->isTailCall(Inst); 1657 1658 const InstructionListType NewCode = 1659 BC.MIB->createInlineMemcpy(IsMemcpy8); 1660 II = BB.replaceInstruction(II, NewCode); 1661 std::advance(II, NewCode.size() - 1); 1662 if (IsTailCall) { 1663 MCInst Return; 1664 BC.MIB->createReturn(Return); 1665 II = BB.insertInstruction(std::next(II), std::move(Return)); 1666 } 1667 1668 ++NumInlined; 1669 NumInlinedDyno += BB.getKnownExecutionCount(); 1670 } 1671 } 1672 } 1673 1674 if (NumInlined) { 1675 outs() << "BOLT-INFO: inlined " << NumInlined << " memcpy() calls"; 1676 if (NumInlinedDyno) 1677 outs() << ". The calls were executed " << NumInlinedDyno 1678 << " times based on profile."; 1679 outs() << '\n'; 1680 } 1681 } 1682 1683 bool SpecializeMemcpy1::shouldOptimize(const BinaryFunction &Function) const { 1684 if (!BinaryFunctionPass::shouldOptimize(Function)) 1685 return false; 1686 1687 for (const std::string &FunctionSpec : Spec) { 1688 StringRef FunctionName = StringRef(FunctionSpec).split(':').first; 1689 if (Function.hasNameRegex(FunctionName)) 1690 return true; 1691 } 1692 1693 return false; 1694 } 1695 1696 std::set<size_t> SpecializeMemcpy1::getCallSitesToOptimize( 1697 const BinaryFunction &Function) const { 1698 StringRef SitesString; 1699 for (const std::string &FunctionSpec : Spec) { 1700 StringRef FunctionName; 1701 std::tie(FunctionName, SitesString) = StringRef(FunctionSpec).split(':'); 1702 if (Function.hasNameRegex(FunctionName)) 1703 break; 1704 SitesString = ""; 1705 } 1706 1707 std::set<size_t> Sites; 1708 SmallVector<StringRef, 4> SitesVec; 1709 SitesString.split(SitesVec, ':'); 1710 for (StringRef SiteString : SitesVec) { 1711 if (SiteString.empty()) 1712 continue; 1713 size_t Result; 1714 if (!SiteString.getAsInteger(10, Result)) 1715 Sites.emplace(Result); 1716 } 1717 1718 return Sites; 1719 } 1720 1721 void SpecializeMemcpy1::runOnFunctions(BinaryContext &BC) { 1722 if (!BC.isX86()) 1723 return; 1724 1725 uint64_t NumSpecialized = 0; 1726 uint64_t NumSpecializedDyno = 0; 1727 for (auto &BFI : BC.getBinaryFunctions()) { 1728 BinaryFunction &Function = BFI.second; 1729 if (!shouldOptimize(Function)) 1730 continue; 1731 1732 std::set<size_t> CallsToOptimize = getCallSitesToOptimize(Function); 1733 auto shouldOptimize = [&](size_t N) { 1734 return CallsToOptimize.empty() || CallsToOptimize.count(N); 1735 }; 1736 1737 std::vector<BinaryBasicBlock *> Blocks(Function.pbegin(), Function.pend()); 1738 size_t CallSiteID = 0; 1739 for (BinaryBasicBlock *CurBB : Blocks) { 1740 for (auto II = CurBB->begin(); II != CurBB->end(); ++II) { 1741 MCInst &Inst = *II; 1742 1743 if (!BC.MIB->isCall(Inst) || MCPlus::getNumPrimeOperands(Inst) != 1 || 1744 !Inst.getOperand(0).isExpr()) 1745 continue; 1746 1747 const MCSymbol *CalleeSymbol = BC.MIB->getTargetSymbol(Inst); 1748 if (CalleeSymbol->getName() != "memcpy" && 1749 CalleeSymbol->getName() != "memcpy@PLT") 1750 continue; 1751 1752 if (BC.MIB->isTailCall(Inst)) 1753 continue; 1754 1755 ++CallSiteID; 1756 1757 if (!shouldOptimize(CallSiteID)) 1758 continue; 1759 1760 // Create a copy of a call to memcpy(dest, src, size). 1761 MCInst MemcpyInstr = Inst; 1762 1763 BinaryBasicBlock *OneByteMemcpyBB = CurBB->splitAt(II); 1764 1765 BinaryBasicBlock *NextBB = nullptr; 1766 if (OneByteMemcpyBB->getNumNonPseudos() > 1) { 1767 NextBB = OneByteMemcpyBB->splitAt(OneByteMemcpyBB->begin()); 1768 NextBB->eraseInstruction(NextBB->begin()); 1769 } else { 1770 NextBB = OneByteMemcpyBB->getSuccessor(); 1771 OneByteMemcpyBB->eraseInstruction(OneByteMemcpyBB->begin()); 1772 assert(NextBB && "unexpected call to memcpy() with no return"); 1773 } 1774 1775 BinaryBasicBlock *MemcpyBB = 1776 Function.addBasicBlock(CurBB->getInputOffset()); 1777 InstructionListType CmpJCC = 1778 BC.MIB->createCmpJE(BC.MIB->getIntArgRegister(2), 1, 1779 OneByteMemcpyBB->getLabel(), BC.Ctx.get()); 1780 CurBB->addInstructions(CmpJCC); 1781 CurBB->addSuccessor(MemcpyBB); 1782 1783 MemcpyBB->addInstruction(std::move(MemcpyInstr)); 1784 MemcpyBB->addSuccessor(NextBB); 1785 MemcpyBB->setCFIState(NextBB->getCFIState()); 1786 MemcpyBB->setExecutionCount(0); 1787 1788 // To prevent the actual call from being moved to cold, we set its 1789 // execution count to 1. 1790 if (CurBB->getKnownExecutionCount() > 0) 1791 MemcpyBB->setExecutionCount(1); 1792 1793 InstructionListType OneByteMemcpy = BC.MIB->createOneByteMemcpy(); 1794 OneByteMemcpyBB->addInstructions(OneByteMemcpy); 1795 1796 ++NumSpecialized; 1797 NumSpecializedDyno += CurBB->getKnownExecutionCount(); 1798 1799 CurBB = NextBB; 1800 1801 // Note: we don't expect the next instruction to be a call to memcpy. 1802 II = CurBB->begin(); 1803 } 1804 } 1805 } 1806 1807 if (NumSpecialized) { 1808 outs() << "BOLT-INFO: specialized " << NumSpecialized 1809 << " memcpy() call sites for size 1"; 1810 if (NumSpecializedDyno) 1811 outs() << ". The calls were executed " << NumSpecializedDyno 1812 << " times based on profile."; 1813 outs() << '\n'; 1814 } 1815 } 1816 1817 void RemoveNops::runOnFunction(BinaryFunction &BF) { 1818 const BinaryContext &BC = BF.getBinaryContext(); 1819 for (BinaryBasicBlock &BB : BF) { 1820 for (int64_t I = BB.size() - 1; I >= 0; --I) { 1821 MCInst &Inst = BB.getInstructionAtIndex(I); 1822 if (BC.MIB->isNoop(Inst) && BC.MIB->hasAnnotation(Inst, "NOP")) 1823 BB.eraseInstructionAtIndex(I); 1824 } 1825 } 1826 } 1827 1828 void RemoveNops::runOnFunctions(BinaryContext &BC) { 1829 ParallelUtilities::WorkFuncTy WorkFun = [&](BinaryFunction &BF) { 1830 runOnFunction(BF); 1831 }; 1832 1833 ParallelUtilities::PredicateTy SkipFunc = [&](const BinaryFunction &BF) { 1834 return BF.shouldPreserveNops(); 1835 }; 1836 1837 ParallelUtilities::runOnEachFunction( 1838 BC, ParallelUtilities::SchedulingPolicy::SP_INST_LINEAR, WorkFun, 1839 SkipFunc, "RemoveNops"); 1840 } 1841 1842 } // namespace bolt 1843 } // namespace llvm 1844