1 //===- bolt/Passes/BinaryPasses.cpp - Binary-level passes -----------------===// 2 // 3 // Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. 4 // See https://llvm.org/LICENSE.txt for license information. 5 // SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception 6 // 7 //===----------------------------------------------------------------------===// 8 // 9 // This file implements multiple passes for binary optimization and analysis. 10 // 11 //===----------------------------------------------------------------------===// 12 13 #include "bolt/Passes/BinaryPasses.h" 14 #include "bolt/Core/ParallelUtilities.h" 15 #include "bolt/Passes/ReorderAlgorithm.h" 16 #include "bolt/Passes/ReorderFunctions.h" 17 #include "llvm/Support/CommandLine.h" 18 19 #include <numeric> 20 #include <vector> 21 22 #define DEBUG_TYPE "bolt-opts" 23 24 using namespace llvm; 25 using namespace bolt; 26 27 namespace { 28 29 const char *dynoStatsOptName(const bolt::DynoStats::Category C) { 30 if (C == bolt::DynoStats::FIRST_DYNO_STAT) 31 return "none"; 32 else if (C == bolt::DynoStats::LAST_DYNO_STAT) 33 return "all"; 34 35 static std::string OptNames[bolt::DynoStats::LAST_DYNO_STAT + 1]; 36 37 OptNames[C] = bolt::DynoStats::Description(C); 38 39 std::replace(OptNames[C].begin(), OptNames[C].end(), ' ', '-'); 40 41 return OptNames[C].c_str(); 42 } 43 44 const char *dynoStatsOptDesc(const bolt::DynoStats::Category C) { 45 if (C == bolt::DynoStats::FIRST_DYNO_STAT) 46 return "unsorted"; 47 else if (C == bolt::DynoStats::LAST_DYNO_STAT) 48 return "sorted by all stats"; 49 50 return bolt::DynoStats::Description(C); 51 } 52 53 } 54 55 namespace opts { 56 57 extern cl::OptionCategory BoltCategory; 58 extern cl::OptionCategory BoltOptCategory; 59 60 extern cl::opt<bolt::MacroFusionType> AlignMacroOpFusion; 61 extern cl::opt<unsigned> Verbosity; 62 extern cl::opt<bool> EnableBAT; 63 extern cl::opt<unsigned> ExecutionCountThreshold; 64 extern cl::opt<bool> UpdateDebugSections; 65 extern cl::opt<bolt::ReorderFunctions::ReorderType> ReorderFunctions; 66 67 enum DynoStatsSortOrder : char { 68 Ascending, 69 Descending 70 }; 71 72 static cl::opt<DynoStatsSortOrder> DynoStatsSortOrderOpt( 73 "print-sorted-by-order", 74 cl::desc("use ascending or descending order when printing functions " 75 "ordered by dyno stats"), 76 cl::init(DynoStatsSortOrder::Descending), cl::cat(BoltOptCategory)); 77 78 cl::list<std::string> 79 HotTextMoveSections("hot-text-move-sections", 80 cl::desc("list of sections containing functions used for hugifying hot text. " 81 "BOLT makes sure these functions are not placed on the same page as " 82 "the hot text. (default=\'.stub,.mover\')."), 83 cl::value_desc("sec1,sec2,sec3,..."), 84 cl::CommaSeparated, 85 cl::ZeroOrMore, 86 cl::cat(BoltCategory)); 87 88 bool isHotTextMover(const BinaryFunction &Function) { 89 for (std::string &SectionName : opts::HotTextMoveSections) { 90 if (Function.getOriginSectionName() && 91 *Function.getOriginSectionName() == SectionName) 92 return true; 93 } 94 95 return false; 96 } 97 98 static cl::opt<bool> MinBranchClusters( 99 "min-branch-clusters", 100 cl::desc("use a modified clustering algorithm geared towards minimizing " 101 "branches"), 102 cl::Hidden, cl::cat(BoltOptCategory)); 103 104 static cl::list<Peepholes::PeepholeOpts> Peepholes( 105 "peepholes", cl::CommaSeparated, cl::desc("enable peephole optimizations"), 106 cl::value_desc("opt1,opt2,opt3,..."), 107 cl::values(clEnumValN(Peepholes::PEEP_NONE, "none", "disable peepholes"), 108 clEnumValN(Peepholes::PEEP_DOUBLE_JUMPS, "double-jumps", 109 "remove double jumps when able"), 110 clEnumValN(Peepholes::PEEP_TAILCALL_TRAPS, "tailcall-traps", 111 "insert tail call traps"), 112 clEnumValN(Peepholes::PEEP_USELESS_BRANCHES, "useless-branches", 113 "remove useless conditional branches"), 114 clEnumValN(Peepholes::PEEP_ALL, "all", 115 "enable all peephole optimizations")), 116 cl::ZeroOrMore, cl::cat(BoltOptCategory)); 117 118 static cl::opt<unsigned> 119 PrintFuncStat("print-function-statistics", 120 cl::desc("print statistics about basic block ordering"), 121 cl::init(0), cl::cat(BoltOptCategory)); 122 123 static cl::list<bolt::DynoStats::Category> 124 PrintSortedBy("print-sorted-by", 125 cl::CommaSeparated, 126 cl::desc("print functions sorted by order of dyno stats"), 127 cl::value_desc("key1,key2,key3,..."), 128 cl::values( 129 #define D(name, ...) \ 130 clEnumValN(bolt::DynoStats::name, \ 131 dynoStatsOptName(bolt::DynoStats::name), \ 132 dynoStatsOptDesc(bolt::DynoStats::name)), 133 DYNO_STATS 134 #undef D 135 clEnumValN(0xffff, ".", ".") 136 ), 137 cl::ZeroOrMore, 138 cl::cat(BoltOptCategory)); 139 140 static cl::opt<bool> 141 PrintUnknown("print-unknown", 142 cl::desc("print names of functions with unknown control flow"), 143 cl::cat(BoltCategory), cl::Hidden); 144 145 static cl::opt<bool> 146 PrintUnknownCFG("print-unknown-cfg", 147 cl::desc("dump CFG of functions with unknown control flow"), 148 cl::cat(BoltCategory), cl::ReallyHidden); 149 150 cl::opt<bolt::ReorderBasicBlocks::LayoutType> ReorderBlocks( 151 "reorder-blocks", cl::desc("change layout of basic blocks in a function"), 152 cl::init(bolt::ReorderBasicBlocks::LT_NONE), 153 cl::values( 154 clEnumValN(bolt::ReorderBasicBlocks::LT_NONE, "none", 155 "do not reorder basic blocks"), 156 clEnumValN(bolt::ReorderBasicBlocks::LT_REVERSE, "reverse", 157 "layout blocks in reverse order"), 158 clEnumValN(bolt::ReorderBasicBlocks::LT_OPTIMIZE, "normal", 159 "perform optimal layout based on profile"), 160 clEnumValN(bolt::ReorderBasicBlocks::LT_OPTIMIZE_BRANCH, 161 "branch-predictor", 162 "perform optimal layout prioritizing branch " 163 "predictions"), 164 clEnumValN(bolt::ReorderBasicBlocks::LT_OPTIMIZE_CACHE, "cache", 165 "perform optimal layout prioritizing I-cache " 166 "behavior"), 167 clEnumValN(bolt::ReorderBasicBlocks::LT_OPTIMIZE_CACHE_PLUS, "cache+", 168 "perform layout optimizing I-cache behavior"), 169 clEnumValN(bolt::ReorderBasicBlocks::LT_OPTIMIZE_EXT_TSP, "ext-tsp", 170 "perform layout optimizing I-cache behavior"), 171 clEnumValN(bolt::ReorderBasicBlocks::LT_OPTIMIZE_SHUFFLE, 172 "cluster-shuffle", "perform random layout of clusters")), 173 cl::ZeroOrMore, cl::cat(BoltOptCategory), 174 cl::callback([](const bolt::ReorderBasicBlocks::LayoutType &option) { 175 if (option == bolt::ReorderBasicBlocks::LT_OPTIMIZE_CACHE_PLUS) { 176 WithColor::warning() 177 << "'-reorder-blocks=cache+' is deprecated, " 178 << "please use '-reorder-blocks=ext-tsp' instead\n"; 179 ReorderBlocks = bolt::ReorderBasicBlocks::LT_OPTIMIZE_EXT_TSP; 180 } 181 })); 182 183 static cl::opt<unsigned> ReportBadLayout( 184 "report-bad-layout", 185 cl::desc("print top <uint> functions with suboptimal code layout on input"), 186 cl::init(0), cl::Hidden, cl::cat(BoltOptCategory)); 187 188 static cl::opt<bool> 189 ReportStaleFuncs("report-stale", 190 cl::desc("print the list of functions with stale profile"), 191 cl::Hidden, cl::cat(BoltOptCategory)); 192 193 enum SctcModes : char { 194 SctcAlways, 195 SctcPreserveDirection, 196 SctcHeuristic 197 }; 198 199 static cl::opt<SctcModes> 200 SctcMode("sctc-mode", 201 cl::desc("mode for simplify conditional tail calls"), 202 cl::init(SctcAlways), 203 cl::values(clEnumValN(SctcAlways, "always", "always perform sctc"), 204 clEnumValN(SctcPreserveDirection, 205 "preserve", 206 "only perform sctc when branch direction is " 207 "preserved"), 208 clEnumValN(SctcHeuristic, 209 "heuristic", 210 "use branch prediction data to control sctc")), 211 cl::ZeroOrMore, 212 cl::cat(BoltOptCategory)); 213 214 static cl::opt<unsigned> 215 StaleThreshold("stale-threshold", 216 cl::desc( 217 "maximum percentage of stale functions to tolerate (default: 100)"), 218 cl::init(100), 219 cl::Hidden, 220 cl::cat(BoltOptCategory)); 221 222 static cl::opt<unsigned> TSPThreshold( 223 "tsp-threshold", 224 cl::desc( 225 "maximum number of hot basic blocks in a function for which to use " 226 "a precise TSP solution while re-ordering basic blocks"), 227 cl::init(10), cl::Hidden, cl::cat(BoltOptCategory)); 228 229 static cl::opt<unsigned> TopCalledLimit( 230 "top-called-limit", 231 cl::desc("maximum number of functions to print in top called " 232 "functions section"), 233 cl::init(100), cl::Hidden, cl::cat(BoltCategory)); 234 235 } // namespace opts 236 237 namespace llvm { 238 namespace bolt { 239 240 bool BinaryFunctionPass::shouldOptimize(const BinaryFunction &BF) const { 241 return BF.isSimple() && BF.getState() == BinaryFunction::State::CFG && 242 !BF.isIgnored(); 243 } 244 245 bool BinaryFunctionPass::shouldPrint(const BinaryFunction &BF) const { 246 return BF.isSimple() && !BF.isIgnored(); 247 } 248 249 void NormalizeCFG::runOnFunction(BinaryFunction &BF) { 250 uint64_t NumRemoved = 0; 251 uint64_t NumDuplicateEdges = 0; 252 uint64_t NeedsFixBranches = 0; 253 for (BinaryBasicBlock &BB : BF) { 254 if (!BB.empty()) 255 continue; 256 257 if (BB.isEntryPoint() || BB.isLandingPad()) 258 continue; 259 260 // Handle a dangling empty block. 261 if (BB.succ_size() == 0) { 262 // If an empty dangling basic block has a predecessor, it could be a 263 // result of codegen for __builtin_unreachable. In such case, do not 264 // remove the block. 265 if (BB.pred_size() == 0) { 266 BB.markValid(false); 267 ++NumRemoved; 268 } 269 continue; 270 } 271 272 // The block should have just one successor. 273 BinaryBasicBlock *Successor = BB.getSuccessor(); 274 assert(Successor && "invalid CFG encountered"); 275 276 // Redirect all predecessors to the successor block. 277 while (!BB.pred_empty()) { 278 BinaryBasicBlock *Predecessor = *BB.pred_begin(); 279 if (Predecessor->hasJumpTable()) 280 break; 281 282 if (Predecessor == Successor) 283 break; 284 285 BinaryBasicBlock::BinaryBranchInfo &BI = Predecessor->getBranchInfo(BB); 286 Predecessor->replaceSuccessor(&BB, Successor, BI.Count, 287 BI.MispredictedCount); 288 // We need to fix branches even if we failed to replace all successors 289 // and remove the block. 290 NeedsFixBranches = true; 291 } 292 293 if (BB.pred_empty()) { 294 BB.removeAllSuccessors(); 295 BB.markValid(false); 296 ++NumRemoved; 297 } 298 } 299 300 if (NumRemoved) 301 BF.eraseInvalidBBs(); 302 303 // Check for duplicate successors. Do it after the empty block elimination as 304 // we can get more duplicate successors. 305 for (BinaryBasicBlock &BB : BF) 306 if (!BB.hasJumpTable() && BB.succ_size() == 2 && 307 BB.getConditionalSuccessor(false) == BB.getConditionalSuccessor(true)) 308 ++NumDuplicateEdges; 309 310 // fixBranches() will get rid of duplicate edges and update jump instructions. 311 if (NumDuplicateEdges || NeedsFixBranches) 312 BF.fixBranches(); 313 314 NumDuplicateEdgesMerged += NumDuplicateEdges; 315 NumBlocksRemoved += NumRemoved; 316 } 317 318 void NormalizeCFG::runOnFunctions(BinaryContext &BC) { 319 ParallelUtilities::runOnEachFunction( 320 BC, ParallelUtilities::SchedulingPolicy::SP_BB_LINEAR, 321 [&](BinaryFunction &BF) { runOnFunction(BF); }, 322 [&](const BinaryFunction &BF) { return !shouldOptimize(BF); }, 323 "NormalizeCFG"); 324 if (NumBlocksRemoved) 325 outs() << "BOLT-INFO: removed " << NumBlocksRemoved << " empty block" 326 << (NumBlocksRemoved == 1 ? "" : "s") << '\n'; 327 if (NumDuplicateEdgesMerged) 328 outs() << "BOLT-INFO: merged " << NumDuplicateEdgesMerged 329 << " duplicate CFG edge" << (NumDuplicateEdgesMerged == 1 ? "" : "s") 330 << '\n'; 331 } 332 333 void EliminateUnreachableBlocks::runOnFunction(BinaryFunction &Function) { 334 if (Function.layout_size() > 0) { 335 unsigned Count; 336 uint64_t Bytes; 337 Function.markUnreachableBlocks(); 338 LLVM_DEBUG({ 339 for (BinaryBasicBlock *BB : Function.layout()) { 340 if (!BB->isValid()) { 341 dbgs() << "BOLT-INFO: UCE found unreachable block " << BB->getName() 342 << " in function " << Function << "\n"; 343 Function.dump(); 344 } 345 } 346 }); 347 std::tie(Count, Bytes) = Function.eraseInvalidBBs(); 348 DeletedBlocks += Count; 349 DeletedBytes += Bytes; 350 if (Count) { 351 Modified.insert(&Function); 352 if (opts::Verbosity > 0) 353 outs() << "BOLT-INFO: Removed " << Count 354 << " dead basic block(s) accounting for " << Bytes 355 << " bytes in function " << Function << '\n'; 356 } 357 } 358 } 359 360 void EliminateUnreachableBlocks::runOnFunctions(BinaryContext &BC) { 361 for (auto &It : BC.getBinaryFunctions()) { 362 BinaryFunction &Function = It.second; 363 if (shouldOptimize(Function)) 364 runOnFunction(Function); 365 } 366 367 outs() << "BOLT-INFO: UCE removed " << DeletedBlocks << " blocks and " 368 << DeletedBytes << " bytes of code.\n"; 369 } 370 371 bool ReorderBasicBlocks::shouldPrint(const BinaryFunction &BF) const { 372 return (BinaryFunctionPass::shouldPrint(BF) && 373 opts::ReorderBlocks != ReorderBasicBlocks::LT_NONE); 374 } 375 376 bool ReorderBasicBlocks::shouldOptimize(const BinaryFunction &BF) const { 377 // Apply execution count threshold 378 if (BF.getKnownExecutionCount() < opts::ExecutionCountThreshold) 379 return false; 380 381 return BinaryFunctionPass::shouldOptimize(BF); 382 } 383 384 void ReorderBasicBlocks::runOnFunctions(BinaryContext &BC) { 385 if (opts::ReorderBlocks == ReorderBasicBlocks::LT_NONE) 386 return; 387 388 std::atomic<uint64_t> ModifiedFuncCount{0}; 389 390 ParallelUtilities::WorkFuncTy WorkFun = [&](BinaryFunction &BF) { 391 modifyFunctionLayout(BF, opts::ReorderBlocks, opts::MinBranchClusters); 392 if (BF.hasLayoutChanged()) 393 ++ModifiedFuncCount; 394 }; 395 396 ParallelUtilities::PredicateTy SkipFunc = [&](const BinaryFunction &BF) { 397 return !shouldOptimize(BF); 398 }; 399 400 ParallelUtilities::runOnEachFunction( 401 BC, ParallelUtilities::SchedulingPolicy::SP_BB_LINEAR, WorkFun, SkipFunc, 402 "ReorderBasicBlocks"); 403 404 outs() << "BOLT-INFO: basic block reordering modified layout of " 405 << format("%zu (%.2lf%%) functions\n", ModifiedFuncCount.load(), 406 100.0 * ModifiedFuncCount.load() / 407 BC.getBinaryFunctions().size()); 408 409 if (opts::PrintFuncStat > 0) { 410 raw_ostream &OS = outs(); 411 // Copy all the values into vector in order to sort them 412 std::map<uint64_t, BinaryFunction &> ScoreMap; 413 auto &BFs = BC.getBinaryFunctions(); 414 for (auto It = BFs.begin(); It != BFs.end(); ++It) 415 ScoreMap.insert(std::pair<uint64_t, BinaryFunction &>( 416 It->second.getFunctionScore(), It->second)); 417 418 OS << "\nBOLT-INFO: Printing Function Statistics:\n\n"; 419 OS << " There are " << BFs.size() << " functions in total. \n"; 420 OS << " Number of functions being modified: " 421 << ModifiedFuncCount.load() << "\n"; 422 OS << " User asks for detailed information on top " 423 << opts::PrintFuncStat << " functions. (Ranked by function score)" 424 << "\n\n"; 425 uint64_t I = 0; 426 for (std::map<uint64_t, BinaryFunction &>::reverse_iterator Rit = 427 ScoreMap.rbegin(); 428 Rit != ScoreMap.rend() && I < opts::PrintFuncStat; ++Rit, ++I) { 429 BinaryFunction &Function = Rit->second; 430 431 OS << " Information for function of top: " << (I + 1) << ": \n"; 432 OS << " Function Score is: " << Function.getFunctionScore() 433 << "\n"; 434 OS << " There are " << Function.size() 435 << " number of blocks in this function.\n"; 436 OS << " There are " << Function.getInstructionCount() 437 << " number of instructions in this function.\n"; 438 OS << " The edit distance for this function is: " 439 << Function.getEditDistance() << "\n\n"; 440 } 441 } 442 } 443 444 void ReorderBasicBlocks::modifyFunctionLayout(BinaryFunction &BF, 445 LayoutType Type, 446 bool MinBranchClusters) const { 447 if (BF.size() == 0 || Type == LT_NONE) 448 return; 449 450 BinaryFunction::BasicBlockOrderType NewLayout; 451 std::unique_ptr<ReorderAlgorithm> Algo; 452 453 // Cannot do optimal layout without profile. 454 if (Type != LT_REVERSE && !BF.hasValidProfile()) 455 return; 456 457 if (Type == LT_REVERSE) { 458 Algo.reset(new ReverseReorderAlgorithm()); 459 } else if (BF.size() <= opts::TSPThreshold && Type != LT_OPTIMIZE_SHUFFLE) { 460 // Work on optimal solution if problem is small enough 461 LLVM_DEBUG(dbgs() << "finding optimal block layout for " << BF << "\n"); 462 Algo.reset(new TSPReorderAlgorithm()); 463 } else { 464 LLVM_DEBUG(dbgs() << "running block layout heuristics on " << BF << "\n"); 465 466 std::unique_ptr<ClusterAlgorithm> CAlgo; 467 if (MinBranchClusters) 468 CAlgo.reset(new MinBranchGreedyClusterAlgorithm()); 469 else 470 CAlgo.reset(new PHGreedyClusterAlgorithm()); 471 472 switch (Type) { 473 case LT_OPTIMIZE: 474 Algo.reset(new OptimizeReorderAlgorithm(std::move(CAlgo))); 475 break; 476 477 case LT_OPTIMIZE_BRANCH: 478 Algo.reset(new OptimizeBranchReorderAlgorithm(std::move(CAlgo))); 479 break; 480 481 case LT_OPTIMIZE_CACHE: 482 Algo.reset(new OptimizeCacheReorderAlgorithm(std::move(CAlgo))); 483 break; 484 485 case LT_OPTIMIZE_EXT_TSP: 486 Algo.reset(new ExtTSPReorderAlgorithm()); 487 break; 488 489 case LT_OPTIMIZE_SHUFFLE: 490 Algo.reset(new RandomClusterReorderAlgorithm(std::move(CAlgo))); 491 break; 492 493 default: 494 llvm_unreachable("unexpected layout type"); 495 } 496 } 497 498 Algo->reorderBasicBlocks(BF, NewLayout); 499 500 BF.updateBasicBlockLayout(NewLayout); 501 } 502 503 void FixupBranches::runOnFunctions(BinaryContext &BC) { 504 for (auto &It : BC.getBinaryFunctions()) { 505 BinaryFunction &Function = It.second; 506 if (!BC.shouldEmit(Function) || !Function.isSimple()) 507 continue; 508 509 Function.fixBranches(); 510 } 511 } 512 513 void FinalizeFunctions::runOnFunctions(BinaryContext &BC) { 514 ParallelUtilities::WorkFuncTy WorkFun = [&](BinaryFunction &BF) { 515 if (!BF.finalizeCFIState()) { 516 if (BC.HasRelocations) { 517 errs() << "BOLT-ERROR: unable to fix CFI state for function " << BF 518 << ". Exiting.\n"; 519 exit(1); 520 } 521 BF.setSimple(false); 522 return; 523 } 524 525 BF.setFinalized(); 526 527 // Update exception handling information. 528 BF.updateEHRanges(); 529 }; 530 531 ParallelUtilities::PredicateTy SkipPredicate = [&](const BinaryFunction &BF) { 532 return !BC.shouldEmit(BF); 533 }; 534 535 ParallelUtilities::runOnEachFunction( 536 BC, ParallelUtilities::SchedulingPolicy::SP_CONSTANT, WorkFun, 537 SkipPredicate, "FinalizeFunctions"); 538 } 539 540 void CheckLargeFunctions::runOnFunctions(BinaryContext &BC) { 541 if (BC.HasRelocations) 542 return; 543 544 if (!opts::UpdateDebugSections) 545 return; 546 547 // If the function wouldn't fit, mark it as non-simple. Otherwise, we may emit 548 // incorrect debug info. 549 ParallelUtilities::WorkFuncTy WorkFun = [&](BinaryFunction &BF) { 550 uint64_t HotSize, ColdSize; 551 std::tie(HotSize, ColdSize) = 552 BC.calculateEmittedSize(BF, /*FixBranches=*/false); 553 if (HotSize > BF.getMaxSize()) 554 BF.setSimple(false); 555 }; 556 557 ParallelUtilities::PredicateTy SkipFunc = [&](const BinaryFunction &BF) { 558 return !shouldOptimize(BF); 559 }; 560 561 ParallelUtilities::runOnEachFunction( 562 BC, ParallelUtilities::SchedulingPolicy::SP_INST_LINEAR, WorkFun, 563 SkipFunc, "CheckLargeFunctions"); 564 } 565 566 bool CheckLargeFunctions::shouldOptimize(const BinaryFunction &BF) const { 567 // Unlike other passes, allow functions in non-CFG state. 568 return BF.isSimple() && !BF.isIgnored(); 569 } 570 571 void LowerAnnotations::runOnFunctions(BinaryContext &BC) { 572 std::vector<std::pair<MCInst *, uint32_t>> PreservedOffsetAnnotations; 573 574 for (auto &It : BC.getBinaryFunctions()) { 575 BinaryFunction &BF = It.second; 576 int64_t CurrentGnuArgsSize = 0; 577 578 // Have we crossed hot/cold border for split functions? 579 bool SeenCold = false; 580 581 for (BinaryBasicBlock *BB : BF.layout()) { 582 if (BB->isCold() && !SeenCold) { 583 SeenCold = true; 584 CurrentGnuArgsSize = 0; 585 } 586 587 // First convert GnuArgsSize annotations into CFIs. This may change instr 588 // pointers, so do it before recording ptrs for preserved annotations 589 if (BF.usesGnuArgsSize()) { 590 for (auto II = BB->begin(); II != BB->end(); ++II) { 591 if (!BC.MIB->isInvoke(*II)) 592 continue; 593 const int64_t NewGnuArgsSize = BC.MIB->getGnuArgsSize(*II); 594 assert(NewGnuArgsSize >= 0 && "expected non-negative GNU_args_size"); 595 if (NewGnuArgsSize != CurrentGnuArgsSize) { 596 auto InsertII = BF.addCFIInstruction( 597 BB, II, 598 MCCFIInstruction::createGnuArgsSize(nullptr, NewGnuArgsSize)); 599 CurrentGnuArgsSize = NewGnuArgsSize; 600 II = std::next(InsertII); 601 } 602 } 603 } 604 605 // Now record preserved annotations separately and then strip annotations. 606 for (auto II = BB->begin(); II != BB->end(); ++II) { 607 if (BF.requiresAddressTranslation() && BC.MIB->getOffset(*II)) 608 PreservedOffsetAnnotations.emplace_back(&(*II), 609 *BC.MIB->getOffset(*II)); 610 BC.MIB->stripAnnotations(*II); 611 } 612 } 613 } 614 for (BinaryFunction *BF : BC.getInjectedBinaryFunctions()) 615 for (BinaryBasicBlock &BB : *BF) 616 for (MCInst &Instruction : BB) 617 BC.MIB->stripAnnotations(Instruction); 618 619 // Release all memory taken by annotations 620 BC.MIB->freeAnnotations(); 621 622 // Reinsert preserved annotations we need during code emission. 623 for (const std::pair<MCInst *, uint32_t> &Item : PreservedOffsetAnnotations) 624 BC.MIB->setOffset(*Item.first, Item.second); 625 } 626 627 namespace { 628 629 // This peephole fixes jump instructions that jump to another basic 630 // block with a single jump instruction, e.g. 631 // 632 // B0: ... 633 // jmp B1 (or jcc B1) 634 // 635 // B1: jmp B2 636 // 637 // -> 638 // 639 // B0: ... 640 // jmp B2 (or jcc B2) 641 // 642 uint64_t fixDoubleJumps(BinaryFunction &Function, bool MarkInvalid) { 643 uint64_t NumDoubleJumps = 0; 644 645 MCContext *Ctx = Function.getBinaryContext().Ctx.get(); 646 MCPlusBuilder *MIB = Function.getBinaryContext().MIB.get(); 647 for (BinaryBasicBlock &BB : Function) { 648 auto checkAndPatch = [&](BinaryBasicBlock *Pred, BinaryBasicBlock *Succ, 649 const MCSymbol *SuccSym) { 650 // Ignore infinite loop jumps or fallthrough tail jumps. 651 if (Pred == Succ || Succ == &BB) 652 return false; 653 654 if (Succ) { 655 const MCSymbol *TBB = nullptr; 656 const MCSymbol *FBB = nullptr; 657 MCInst *CondBranch = nullptr; 658 MCInst *UncondBranch = nullptr; 659 bool Res = Pred->analyzeBranch(TBB, FBB, CondBranch, UncondBranch); 660 if (!Res) { 661 LLVM_DEBUG(dbgs() << "analyzeBranch failed in peepholes in block:\n"; 662 Pred->dump()); 663 return false; 664 } 665 Pred->replaceSuccessor(&BB, Succ); 666 667 // We must patch up any existing branch instructions to match up 668 // with the new successor. 669 assert((CondBranch || (!CondBranch && Pred->succ_size() == 1)) && 670 "Predecessor block has inconsistent number of successors"); 671 if (CondBranch && MIB->getTargetSymbol(*CondBranch) == BB.getLabel()) { 672 MIB->replaceBranchTarget(*CondBranch, Succ->getLabel(), Ctx); 673 } else if (UncondBranch && 674 MIB->getTargetSymbol(*UncondBranch) == BB.getLabel()) { 675 MIB->replaceBranchTarget(*UncondBranch, Succ->getLabel(), Ctx); 676 } else if (!UncondBranch) { 677 assert(Function.getBasicBlockAfter(Pred, false) != Succ && 678 "Don't add an explicit jump to a fallthrough block."); 679 Pred->addBranchInstruction(Succ); 680 } 681 } else { 682 // Succ will be null in the tail call case. In this case we 683 // need to explicitly add a tail call instruction. 684 MCInst *Branch = Pred->getLastNonPseudoInstr(); 685 if (Branch && MIB->isUnconditionalBranch(*Branch)) { 686 assert(MIB->getTargetSymbol(*Branch) == BB.getLabel()); 687 Pred->removeSuccessor(&BB); 688 Pred->eraseInstruction(Pred->findInstruction(Branch)); 689 Pred->addTailCallInstruction(SuccSym); 690 } else { 691 return false; 692 } 693 } 694 695 ++NumDoubleJumps; 696 LLVM_DEBUG(dbgs() << "Removed double jump in " << Function << " from " 697 << Pred->getName() << " -> " << BB.getName() << " to " 698 << Pred->getName() << " -> " << SuccSym->getName() 699 << (!Succ ? " (tail)\n" : "\n")); 700 701 return true; 702 }; 703 704 if (BB.getNumNonPseudos() != 1 || BB.isLandingPad()) 705 continue; 706 707 MCInst *Inst = BB.getFirstNonPseudoInstr(); 708 const bool IsTailCall = MIB->isTailCall(*Inst); 709 710 if (!MIB->isUnconditionalBranch(*Inst) && !IsTailCall) 711 continue; 712 713 // If we operate after SCTC make sure it's not a conditional tail call. 714 if (IsTailCall && MIB->isConditionalBranch(*Inst)) 715 continue; 716 717 const MCSymbol *SuccSym = MIB->getTargetSymbol(*Inst); 718 BinaryBasicBlock *Succ = BB.getSuccessor(); 719 720 if (((!Succ || &BB == Succ) && !IsTailCall) || (IsTailCall && !SuccSym)) 721 continue; 722 723 std::vector<BinaryBasicBlock *> Preds = {BB.pred_begin(), BB.pred_end()}; 724 725 for (BinaryBasicBlock *Pred : Preds) { 726 if (Pred->isLandingPad()) 727 continue; 728 729 if (Pred->getSuccessor() == &BB || 730 (Pred->getConditionalSuccessor(true) == &BB && !IsTailCall) || 731 Pred->getConditionalSuccessor(false) == &BB) 732 if (checkAndPatch(Pred, Succ, SuccSym) && MarkInvalid) 733 BB.markValid(BB.pred_size() != 0 || BB.isLandingPad() || 734 BB.isEntryPoint()); 735 } 736 } 737 738 return NumDoubleJumps; 739 } 740 } // namespace 741 742 bool SimplifyConditionalTailCalls::shouldRewriteBranch( 743 const BinaryBasicBlock *PredBB, const MCInst &CondBranch, 744 const BinaryBasicBlock *BB, const bool DirectionFlag) { 745 if (BeenOptimized.count(PredBB)) 746 return false; 747 748 const bool IsForward = BinaryFunction::isForwardBranch(PredBB, BB); 749 750 if (IsForward) 751 ++NumOrigForwardBranches; 752 else 753 ++NumOrigBackwardBranches; 754 755 if (opts::SctcMode == opts::SctcAlways) 756 return true; 757 758 if (opts::SctcMode == opts::SctcPreserveDirection) 759 return IsForward == DirectionFlag; 760 761 const ErrorOr<std::pair<double, double>> Frequency = 762 PredBB->getBranchStats(BB); 763 764 // It's ok to rewrite the conditional branch if the new target will be 765 // a backward branch. 766 767 // If no data available for these branches, then it should be ok to 768 // do the optimization since it will reduce code size. 769 if (Frequency.getError()) 770 return true; 771 772 // TODO: should this use misprediction frequency instead? 773 const bool Result = (IsForward && Frequency.get().first >= 0.5) || 774 (!IsForward && Frequency.get().first <= 0.5); 775 776 return Result == DirectionFlag; 777 } 778 779 uint64_t SimplifyConditionalTailCalls::fixTailCalls(BinaryFunction &BF) { 780 // Need updated indices to correctly detect branch' direction. 781 BF.updateLayoutIndices(); 782 BF.markUnreachableBlocks(); 783 784 MCPlusBuilder *MIB = BF.getBinaryContext().MIB.get(); 785 MCContext *Ctx = BF.getBinaryContext().Ctx.get(); 786 uint64_t NumLocalCTCCandidates = 0; 787 uint64_t NumLocalCTCs = 0; 788 uint64_t LocalCTCTakenCount = 0; 789 uint64_t LocalCTCExecCount = 0; 790 std::vector<std::pair<BinaryBasicBlock *, const BinaryBasicBlock *>> 791 NeedsUncondBranch; 792 793 // Will block be deleted by UCE? 794 auto isValid = [](const BinaryBasicBlock *BB) { 795 return (BB->pred_size() != 0 || BB->isLandingPad() || BB->isEntryPoint()); 796 }; 797 798 for (BinaryBasicBlock *BB : BF.layout()) { 799 // Locate BB with a single direct tail-call instruction. 800 if (BB->getNumNonPseudos() != 1) 801 continue; 802 803 MCInst *Instr = BB->getFirstNonPseudoInstr(); 804 if (!MIB->isTailCall(*Instr) || MIB->isConditionalBranch(*Instr)) 805 continue; 806 807 const MCSymbol *CalleeSymbol = MIB->getTargetSymbol(*Instr); 808 if (!CalleeSymbol) 809 continue; 810 811 // Detect direction of the possible conditional tail call. 812 const bool IsForwardCTC = BF.isForwardCall(CalleeSymbol); 813 814 // Iterate through all predecessors. 815 for (BinaryBasicBlock *PredBB : BB->predecessors()) { 816 BinaryBasicBlock *CondSucc = PredBB->getConditionalSuccessor(true); 817 if (!CondSucc) 818 continue; 819 820 ++NumLocalCTCCandidates; 821 822 const MCSymbol *TBB = nullptr; 823 const MCSymbol *FBB = nullptr; 824 MCInst *CondBranch = nullptr; 825 MCInst *UncondBranch = nullptr; 826 bool Result = PredBB->analyzeBranch(TBB, FBB, CondBranch, UncondBranch); 827 828 // analyzeBranch() can fail due to unusual branch instructions, e.g. jrcxz 829 if (!Result) { 830 LLVM_DEBUG(dbgs() << "analyzeBranch failed in SCTC in block:\n"; 831 PredBB->dump()); 832 continue; 833 } 834 835 assert(Result && "internal error analyzing conditional branch"); 836 assert(CondBranch && "conditional branch expected"); 837 838 // It's possible that PredBB is also a successor to BB that may have 839 // been processed by a previous iteration of the SCTC loop, in which 840 // case it may have been marked invalid. We should skip rewriting in 841 // this case. 842 if (!PredBB->isValid()) { 843 assert(PredBB->isSuccessor(BB) && 844 "PredBB should be valid if it is not a successor to BB"); 845 continue; 846 } 847 848 // We don't want to reverse direction of the branch in new order 849 // without further profile analysis. 850 const bool DirectionFlag = CondSucc == BB ? IsForwardCTC : !IsForwardCTC; 851 if (!shouldRewriteBranch(PredBB, *CondBranch, BB, DirectionFlag)) 852 continue; 853 854 // Record this block so that we don't try to optimize it twice. 855 BeenOptimized.insert(PredBB); 856 857 uint64_t Count = 0; 858 if (CondSucc != BB) { 859 // Patch the new target address into the conditional branch. 860 MIB->reverseBranchCondition(*CondBranch, CalleeSymbol, Ctx); 861 // Since we reversed the condition on the branch we need to change 862 // the target for the unconditional branch or add a unconditional 863 // branch to the old target. This has to be done manually since 864 // fixupBranches is not called after SCTC. 865 NeedsUncondBranch.emplace_back(PredBB, CondSucc); 866 Count = PredBB->getFallthroughBranchInfo().Count; 867 } else { 868 // Change destination of the conditional branch. 869 MIB->replaceBranchTarget(*CondBranch, CalleeSymbol, Ctx); 870 Count = PredBB->getTakenBranchInfo().Count; 871 } 872 const uint64_t CTCTakenFreq = 873 Count == BinaryBasicBlock::COUNT_NO_PROFILE ? 0 : Count; 874 875 // Annotate it, so "isCall" returns true for this jcc 876 MIB->setConditionalTailCall(*CondBranch); 877 // Add info abount the conditional tail call frequency, otherwise this 878 // info will be lost when we delete the associated BranchInfo entry 879 auto &CTCAnnotation = 880 MIB->getOrCreateAnnotationAs<uint64_t>(*CondBranch, "CTCTakenCount"); 881 CTCAnnotation = CTCTakenFreq; 882 883 // Remove the unused successor which may be eliminated later 884 // if there are no other users. 885 PredBB->removeSuccessor(BB); 886 // Update BB execution count 887 if (CTCTakenFreq && CTCTakenFreq <= BB->getKnownExecutionCount()) 888 BB->setExecutionCount(BB->getExecutionCount() - CTCTakenFreq); 889 else if (CTCTakenFreq > BB->getKnownExecutionCount()) 890 BB->setExecutionCount(0); 891 892 ++NumLocalCTCs; 893 LocalCTCTakenCount += CTCTakenFreq; 894 LocalCTCExecCount += PredBB->getKnownExecutionCount(); 895 } 896 897 // Remove the block from CFG if all predecessors were removed. 898 BB->markValid(isValid(BB)); 899 } 900 901 // Add unconditional branches at the end of BBs to new successors 902 // as long as the successor is not a fallthrough. 903 for (auto &Entry : NeedsUncondBranch) { 904 BinaryBasicBlock *PredBB = Entry.first; 905 const BinaryBasicBlock *CondSucc = Entry.second; 906 907 const MCSymbol *TBB = nullptr; 908 const MCSymbol *FBB = nullptr; 909 MCInst *CondBranch = nullptr; 910 MCInst *UncondBranch = nullptr; 911 PredBB->analyzeBranch(TBB, FBB, CondBranch, UncondBranch); 912 913 // Find the next valid block. Invalid blocks will be deleted 914 // so they shouldn't be considered fallthrough targets. 915 const BinaryBasicBlock *NextBlock = BF.getBasicBlockAfter(PredBB, false); 916 while (NextBlock && !isValid(NextBlock)) 917 NextBlock = BF.getBasicBlockAfter(NextBlock, false); 918 919 // Get the unconditional successor to this block. 920 const BinaryBasicBlock *PredSucc = PredBB->getSuccessor(); 921 assert(PredSucc && "The other branch should be a tail call"); 922 923 const bool HasFallthrough = (NextBlock && PredSucc == NextBlock); 924 925 if (UncondBranch) { 926 if (HasFallthrough) 927 PredBB->eraseInstruction(PredBB->findInstruction(UncondBranch)); 928 else 929 MIB->replaceBranchTarget(*UncondBranch, CondSucc->getLabel(), Ctx); 930 } else if (!HasFallthrough) { 931 MCInst Branch; 932 MIB->createUncondBranch(Branch, CondSucc->getLabel(), Ctx); 933 PredBB->addInstruction(Branch); 934 } 935 } 936 937 if (NumLocalCTCs > 0) { 938 NumDoubleJumps += fixDoubleJumps(BF, true); 939 // Clean-up unreachable tail-call blocks. 940 const std::pair<unsigned, uint64_t> Stats = BF.eraseInvalidBBs(); 941 DeletedBlocks += Stats.first; 942 DeletedBytes += Stats.second; 943 944 assert(BF.validateCFG()); 945 } 946 947 LLVM_DEBUG(dbgs() << "BOLT: created " << NumLocalCTCs 948 << " conditional tail calls from a total of " 949 << NumLocalCTCCandidates << " candidates in function " << BF 950 << ". CTCs execution count for this function is " 951 << LocalCTCExecCount << " and CTC taken count is " 952 << LocalCTCTakenCount << "\n";); 953 954 NumTailCallsPatched += NumLocalCTCs; 955 NumCandidateTailCalls += NumLocalCTCCandidates; 956 CTCExecCount += LocalCTCExecCount; 957 CTCTakenCount += LocalCTCTakenCount; 958 959 return NumLocalCTCs > 0; 960 } 961 962 void SimplifyConditionalTailCalls::runOnFunctions(BinaryContext &BC) { 963 if (!BC.isX86()) 964 return; 965 966 for (auto &It : BC.getBinaryFunctions()) { 967 BinaryFunction &Function = It.second; 968 969 if (!shouldOptimize(Function)) 970 continue; 971 972 if (fixTailCalls(Function)) { 973 Modified.insert(&Function); 974 Function.setHasCanonicalCFG(false); 975 } 976 } 977 978 outs() << "BOLT-INFO: SCTC: patched " << NumTailCallsPatched 979 << " tail calls (" << NumOrigForwardBranches << " forward)" 980 << " tail calls (" << NumOrigBackwardBranches << " backward)" 981 << " from a total of " << NumCandidateTailCalls << " while removing " 982 << NumDoubleJumps << " double jumps" 983 << " and removing " << DeletedBlocks << " basic blocks" 984 << " totalling " << DeletedBytes 985 << " bytes of code. CTCs total execution count is " << CTCExecCount 986 << " and the number of times CTCs are taken is " << CTCTakenCount 987 << ".\n"; 988 } 989 990 uint64_t ShortenInstructions::shortenInstructions(BinaryFunction &Function) { 991 uint64_t Count = 0; 992 const BinaryContext &BC = Function.getBinaryContext(); 993 for (BinaryBasicBlock &BB : Function) { 994 for (MCInst &Inst : BB) { 995 MCInst OriginalInst; 996 if (opts::Verbosity > 2) 997 OriginalInst = Inst; 998 999 if (!BC.MIB->shortenInstruction(Inst, *BC.STI)) 1000 continue; 1001 1002 if (opts::Verbosity > 2) { 1003 BC.scopeLock(); 1004 outs() << "BOLT-INFO: shortening:\nBOLT-INFO: "; 1005 BC.printInstruction(outs(), OriginalInst, 0, &Function); 1006 outs() << "BOLT-INFO: to:"; 1007 BC.printInstruction(outs(), Inst, 0, &Function); 1008 } 1009 1010 ++Count; 1011 } 1012 } 1013 1014 return Count; 1015 } 1016 1017 void ShortenInstructions::runOnFunctions(BinaryContext &BC) { 1018 std::atomic<uint64_t> NumShortened{0}; 1019 if (!BC.isX86()) 1020 return; 1021 1022 ParallelUtilities::runOnEachFunction( 1023 BC, ParallelUtilities::SchedulingPolicy::SP_INST_LINEAR, 1024 [&](BinaryFunction &BF) { NumShortened += shortenInstructions(BF); }, 1025 nullptr, "ShortenInstructions"); 1026 1027 outs() << "BOLT-INFO: " << NumShortened << " instructions were shortened\n"; 1028 } 1029 1030 void Peepholes::addTailcallTraps(BinaryFunction &Function) { 1031 MCPlusBuilder *MIB = Function.getBinaryContext().MIB.get(); 1032 for (BinaryBasicBlock &BB : Function) { 1033 MCInst *Inst = BB.getLastNonPseudoInstr(); 1034 if (Inst && MIB->isTailCall(*Inst) && MIB->isIndirectBranch(*Inst)) { 1035 MCInst Trap; 1036 if (MIB->createTrap(Trap)) { 1037 BB.addInstruction(Trap); 1038 ++TailCallTraps; 1039 } 1040 } 1041 } 1042 } 1043 1044 void Peepholes::removeUselessCondBranches(BinaryFunction &Function) { 1045 for (BinaryBasicBlock &BB : Function) { 1046 if (BB.succ_size() != 2) 1047 continue; 1048 1049 BinaryBasicBlock *CondBB = BB.getConditionalSuccessor(true); 1050 BinaryBasicBlock *UncondBB = BB.getConditionalSuccessor(false); 1051 if (CondBB != UncondBB) 1052 continue; 1053 1054 const MCSymbol *TBB = nullptr; 1055 const MCSymbol *FBB = nullptr; 1056 MCInst *CondBranch = nullptr; 1057 MCInst *UncondBranch = nullptr; 1058 bool Result = BB.analyzeBranch(TBB, FBB, CondBranch, UncondBranch); 1059 1060 // analyzeBranch() can fail due to unusual branch instructions, 1061 // e.g. jrcxz, or jump tables (indirect jump). 1062 if (!Result || !CondBranch) 1063 continue; 1064 1065 BB.removeDuplicateConditionalSuccessor(CondBranch); 1066 ++NumUselessCondBranches; 1067 } 1068 } 1069 1070 void Peepholes::runOnFunctions(BinaryContext &BC) { 1071 const char Opts = 1072 std::accumulate(opts::Peepholes.begin(), opts::Peepholes.end(), 0, 1073 [](const char A, const PeepholeOpts B) { return A | B; }); 1074 if (Opts == PEEP_NONE) 1075 return; 1076 1077 for (auto &It : BC.getBinaryFunctions()) { 1078 BinaryFunction &Function = It.second; 1079 if (shouldOptimize(Function)) { 1080 if (Opts & PEEP_DOUBLE_JUMPS) 1081 NumDoubleJumps += fixDoubleJumps(Function, false); 1082 if (Opts & PEEP_TAILCALL_TRAPS) 1083 addTailcallTraps(Function); 1084 if (Opts & PEEP_USELESS_BRANCHES) 1085 removeUselessCondBranches(Function); 1086 assert(Function.validateCFG()); 1087 } 1088 } 1089 outs() << "BOLT-INFO: Peephole: " << NumDoubleJumps 1090 << " double jumps patched.\n" 1091 << "BOLT-INFO: Peephole: " << TailCallTraps 1092 << " tail call traps inserted.\n" 1093 << "BOLT-INFO: Peephole: " << NumUselessCondBranches 1094 << " useless conditional branches removed.\n"; 1095 } 1096 1097 bool SimplifyRODataLoads::simplifyRODataLoads(BinaryFunction &BF) { 1098 BinaryContext &BC = BF.getBinaryContext(); 1099 MCPlusBuilder *MIB = BC.MIB.get(); 1100 1101 uint64_t NumLocalLoadsSimplified = 0; 1102 uint64_t NumDynamicLocalLoadsSimplified = 0; 1103 uint64_t NumLocalLoadsFound = 0; 1104 uint64_t NumDynamicLocalLoadsFound = 0; 1105 1106 for (BinaryBasicBlock *BB : BF.layout()) { 1107 for (MCInst &Inst : *BB) { 1108 unsigned Opcode = Inst.getOpcode(); 1109 const MCInstrDesc &Desc = BC.MII->get(Opcode); 1110 1111 // Skip instructions that do not load from memory. 1112 if (!Desc.mayLoad()) 1113 continue; 1114 1115 // Try to statically evaluate the target memory address; 1116 uint64_t TargetAddress; 1117 1118 if (MIB->hasPCRelOperand(Inst)) { 1119 // Try to find the symbol that corresponds to the PC-relative operand. 1120 MCOperand *DispOpI = MIB->getMemOperandDisp(Inst); 1121 assert(DispOpI != Inst.end() && "expected PC-relative displacement"); 1122 assert(DispOpI->isExpr() && 1123 "found PC-relative with non-symbolic displacement"); 1124 1125 // Get displacement symbol. 1126 const MCSymbol *DisplSymbol; 1127 uint64_t DisplOffset; 1128 1129 std::tie(DisplSymbol, DisplOffset) = 1130 MIB->getTargetSymbolInfo(DispOpI->getExpr()); 1131 1132 if (!DisplSymbol) 1133 continue; 1134 1135 // Look up the symbol address in the global symbols map of the binary 1136 // context object. 1137 BinaryData *BD = BC.getBinaryDataByName(DisplSymbol->getName()); 1138 if (!BD) 1139 continue; 1140 TargetAddress = BD->getAddress() + DisplOffset; 1141 } else if (!MIB->evaluateMemOperandTarget(Inst, TargetAddress)) { 1142 continue; 1143 } 1144 1145 // Get the contents of the section containing the target address of the 1146 // memory operand. We are only interested in read-only sections. 1147 ErrorOr<BinarySection &> DataSection = 1148 BC.getSectionForAddress(TargetAddress); 1149 if (!DataSection || !DataSection->isReadOnly()) 1150 continue; 1151 1152 if (BC.getRelocationAt(TargetAddress) || 1153 BC.getDynamicRelocationAt(TargetAddress)) 1154 continue; 1155 1156 uint32_t Offset = TargetAddress - DataSection->getAddress(); 1157 StringRef ConstantData = DataSection->getContents(); 1158 1159 ++NumLocalLoadsFound; 1160 if (BB->hasProfile()) 1161 NumDynamicLocalLoadsFound += BB->getExecutionCount(); 1162 1163 if (MIB->replaceMemOperandWithImm(Inst, ConstantData, Offset)) { 1164 ++NumLocalLoadsSimplified; 1165 if (BB->hasProfile()) 1166 NumDynamicLocalLoadsSimplified += BB->getExecutionCount(); 1167 } 1168 } 1169 } 1170 1171 NumLoadsFound += NumLocalLoadsFound; 1172 NumDynamicLoadsFound += NumDynamicLocalLoadsFound; 1173 NumLoadsSimplified += NumLocalLoadsSimplified; 1174 NumDynamicLoadsSimplified += NumDynamicLocalLoadsSimplified; 1175 1176 return NumLocalLoadsSimplified > 0; 1177 } 1178 1179 void SimplifyRODataLoads::runOnFunctions(BinaryContext &BC) { 1180 for (auto &It : BC.getBinaryFunctions()) { 1181 BinaryFunction &Function = It.second; 1182 if (shouldOptimize(Function) && simplifyRODataLoads(Function)) 1183 Modified.insert(&Function); 1184 } 1185 1186 outs() << "BOLT-INFO: simplified " << NumLoadsSimplified << " out of " 1187 << NumLoadsFound << " loads from a statically computed address.\n" 1188 << "BOLT-INFO: dynamic loads simplified: " << NumDynamicLoadsSimplified 1189 << "\n" 1190 << "BOLT-INFO: dynamic loads found: " << NumDynamicLoadsFound << "\n"; 1191 } 1192 1193 void AssignSections::runOnFunctions(BinaryContext &BC) { 1194 for (BinaryFunction *Function : BC.getInjectedBinaryFunctions()) { 1195 Function->setCodeSectionName(BC.getInjectedCodeSectionName()); 1196 Function->setColdCodeSectionName(BC.getInjectedColdCodeSectionName()); 1197 } 1198 1199 // In non-relocation mode functions have pre-assigned section names. 1200 if (!BC.HasRelocations) 1201 return; 1202 1203 const bool UseColdSection = 1204 BC.NumProfiledFuncs > 0 || 1205 opts::ReorderFunctions == ReorderFunctions::RT_USER; 1206 for (auto &BFI : BC.getBinaryFunctions()) { 1207 BinaryFunction &Function = BFI.second; 1208 if (opts::isHotTextMover(Function)) { 1209 Function.setCodeSectionName(BC.getHotTextMoverSectionName()); 1210 Function.setColdCodeSectionName(BC.getHotTextMoverSectionName()); 1211 continue; 1212 } 1213 1214 if (!UseColdSection || Function.hasValidIndex()) 1215 Function.setCodeSectionName(BC.getMainCodeSectionName()); 1216 else 1217 Function.setCodeSectionName(BC.getColdCodeSectionName()); 1218 1219 if (Function.isSplit()) 1220 Function.setColdCodeSectionName(BC.getColdCodeSectionName()); 1221 } 1222 } 1223 1224 void PrintProfileStats::runOnFunctions(BinaryContext &BC) { 1225 double FlowImbalanceMean = 0.0; 1226 size_t NumBlocksConsidered = 0; 1227 double WorstBias = 0.0; 1228 const BinaryFunction *WorstBiasFunc = nullptr; 1229 1230 // For each function CFG, we fill an IncomingMap with the sum of the frequency 1231 // of incoming edges for each BB. Likewise for each OutgoingMap and the sum 1232 // of the frequency of outgoing edges. 1233 using FlowMapTy = std::unordered_map<const BinaryBasicBlock *, uint64_t>; 1234 std::unordered_map<const BinaryFunction *, FlowMapTy> TotalIncomingMaps; 1235 std::unordered_map<const BinaryFunction *, FlowMapTy> TotalOutgoingMaps; 1236 1237 // Compute mean 1238 for (const auto &BFI : BC.getBinaryFunctions()) { 1239 const BinaryFunction &Function = BFI.second; 1240 if (Function.empty() || !Function.isSimple()) 1241 continue; 1242 FlowMapTy &IncomingMap = TotalIncomingMaps[&Function]; 1243 FlowMapTy &OutgoingMap = TotalOutgoingMaps[&Function]; 1244 for (const BinaryBasicBlock &BB : Function) { 1245 uint64_t TotalOutgoing = 0ULL; 1246 auto SuccBIIter = BB.branch_info_begin(); 1247 for (BinaryBasicBlock *Succ : BB.successors()) { 1248 uint64_t Count = SuccBIIter->Count; 1249 if (Count == BinaryBasicBlock::COUNT_NO_PROFILE || Count == 0) { 1250 ++SuccBIIter; 1251 continue; 1252 } 1253 TotalOutgoing += Count; 1254 IncomingMap[Succ] += Count; 1255 ++SuccBIIter; 1256 } 1257 OutgoingMap[&BB] = TotalOutgoing; 1258 } 1259 1260 size_t NumBlocks = 0; 1261 double Mean = 0.0; 1262 for (const BinaryBasicBlock &BB : Function) { 1263 // Do not compute score for low frequency blocks, entry or exit blocks 1264 if (IncomingMap[&BB] < 100 || OutgoingMap[&BB] == 0 || BB.isEntryPoint()) 1265 continue; 1266 ++NumBlocks; 1267 const double Difference = (double)OutgoingMap[&BB] - IncomingMap[&BB]; 1268 Mean += fabs(Difference / IncomingMap[&BB]); 1269 } 1270 1271 FlowImbalanceMean += Mean; 1272 NumBlocksConsidered += NumBlocks; 1273 if (!NumBlocks) 1274 continue; 1275 double FuncMean = Mean / NumBlocks; 1276 if (FuncMean > WorstBias) { 1277 WorstBias = FuncMean; 1278 WorstBiasFunc = &Function; 1279 } 1280 } 1281 if (NumBlocksConsidered > 0) 1282 FlowImbalanceMean /= NumBlocksConsidered; 1283 1284 // Compute standard deviation 1285 NumBlocksConsidered = 0; 1286 double FlowImbalanceVar = 0.0; 1287 for (const auto &BFI : BC.getBinaryFunctions()) { 1288 const BinaryFunction &Function = BFI.second; 1289 if (Function.empty() || !Function.isSimple()) 1290 continue; 1291 FlowMapTy &IncomingMap = TotalIncomingMaps[&Function]; 1292 FlowMapTy &OutgoingMap = TotalOutgoingMaps[&Function]; 1293 for (const BinaryBasicBlock &BB : Function) { 1294 if (IncomingMap[&BB] < 100 || OutgoingMap[&BB] == 0) 1295 continue; 1296 ++NumBlocksConsidered; 1297 const double Difference = (double)OutgoingMap[&BB] - IncomingMap[&BB]; 1298 FlowImbalanceVar += 1299 pow(fabs(Difference / IncomingMap[&BB]) - FlowImbalanceMean, 2); 1300 } 1301 } 1302 if (NumBlocksConsidered) { 1303 FlowImbalanceVar /= NumBlocksConsidered; 1304 FlowImbalanceVar = sqrt(FlowImbalanceVar); 1305 } 1306 1307 // Report to user 1308 outs() << format("BOLT-INFO: Profile bias score: %.4lf%% StDev: %.4lf%%\n", 1309 (100.0 * FlowImbalanceMean), (100.0 * FlowImbalanceVar)); 1310 if (WorstBiasFunc && opts::Verbosity >= 1) { 1311 outs() << "Worst average bias observed in " << WorstBiasFunc->getPrintName() 1312 << "\n"; 1313 LLVM_DEBUG(WorstBiasFunc->dump()); 1314 } 1315 } 1316 1317 void PrintProgramStats::runOnFunctions(BinaryContext &BC) { 1318 uint64_t NumRegularFunctions = 0; 1319 uint64_t NumStaleProfileFunctions = 0; 1320 uint64_t NumNonSimpleProfiledFunctions = 0; 1321 uint64_t NumUnknownControlFlowFunctions = 0; 1322 uint64_t TotalSampleCount = 0; 1323 uint64_t StaleSampleCount = 0; 1324 std::vector<const BinaryFunction *> ProfiledFunctions; 1325 const char *StaleFuncsHeader = "BOLT-INFO: Functions with stale profile:\n"; 1326 for (auto &BFI : BC.getBinaryFunctions()) { 1327 const BinaryFunction &Function = BFI.second; 1328 1329 // Ignore PLT functions for stats. 1330 if (Function.isPLTFunction()) 1331 continue; 1332 1333 ++NumRegularFunctions; 1334 1335 if (!Function.isSimple()) { 1336 if (Function.hasProfile()) 1337 ++NumNonSimpleProfiledFunctions; 1338 continue; 1339 } 1340 1341 if (Function.hasUnknownControlFlow()) { 1342 if (opts::PrintUnknownCFG) 1343 Function.dump(); 1344 else if (opts::PrintUnknown) 1345 errs() << "function with unknown control flow: " << Function << '\n'; 1346 1347 ++NumUnknownControlFlowFunctions; 1348 } 1349 1350 if (!Function.hasProfile()) 1351 continue; 1352 1353 uint64_t SampleCount = Function.getRawBranchCount(); 1354 TotalSampleCount += SampleCount; 1355 1356 if (Function.hasValidProfile()) { 1357 ProfiledFunctions.push_back(&Function); 1358 } else { 1359 if (opts::ReportStaleFuncs) { 1360 outs() << StaleFuncsHeader; 1361 StaleFuncsHeader = ""; 1362 outs() << " " << Function << '\n'; 1363 } 1364 ++NumStaleProfileFunctions; 1365 StaleSampleCount += SampleCount; 1366 } 1367 } 1368 BC.NumProfiledFuncs = ProfiledFunctions.size(); 1369 1370 const size_t NumAllProfiledFunctions = 1371 ProfiledFunctions.size() + NumStaleProfileFunctions; 1372 outs() << "BOLT-INFO: " << NumAllProfiledFunctions << " out of " 1373 << NumRegularFunctions << " functions in the binary (" 1374 << format("%.1f", NumAllProfiledFunctions / 1375 (float)NumRegularFunctions * 100.0f) 1376 << "%) have non-empty execution profile\n"; 1377 if (NumNonSimpleProfiledFunctions) { 1378 outs() << "BOLT-INFO: " << NumNonSimpleProfiledFunctions << " function" 1379 << (NumNonSimpleProfiledFunctions == 1 ? "" : "s") 1380 << " with profile could not be optimized\n"; 1381 } 1382 if (NumStaleProfileFunctions) { 1383 const float PctStale = 1384 NumStaleProfileFunctions / (float)NumAllProfiledFunctions * 100.0f; 1385 auto printErrorOrWarning = [&]() { 1386 if (PctStale > opts::StaleThreshold) 1387 errs() << "BOLT-ERROR: "; 1388 else 1389 errs() << "BOLT-WARNING: "; 1390 }; 1391 printErrorOrWarning(); 1392 errs() << NumStaleProfileFunctions 1393 << format(" (%.1f%% of all profiled)", PctStale) << " function" 1394 << (NumStaleProfileFunctions == 1 ? "" : "s") 1395 << " have invalid (possibly stale) profile." 1396 " Use -report-stale to see the list.\n"; 1397 if (TotalSampleCount > 0) { 1398 printErrorOrWarning(); 1399 errs() << StaleSampleCount << " out of " << TotalSampleCount 1400 << " samples in the binary (" 1401 << format("%.1f", ((100.0f * StaleSampleCount) / TotalSampleCount)) 1402 << "%) belong to functions with invalid" 1403 " (possibly stale) profile.\n"; 1404 } 1405 if (PctStale > opts::StaleThreshold) { 1406 errs() << "BOLT-ERROR: stale functions exceed specified threshold of " 1407 << opts::StaleThreshold << "%. Exiting.\n"; 1408 exit(1); 1409 } 1410 } 1411 1412 if (const uint64_t NumUnusedObjects = BC.getNumUnusedProfiledObjects()) { 1413 outs() << "BOLT-INFO: profile for " << NumUnusedObjects 1414 << " objects was ignored\n"; 1415 } 1416 1417 if (ProfiledFunctions.size() > 10) { 1418 if (opts::Verbosity >= 1) { 1419 outs() << "BOLT-INFO: top called functions are:\n"; 1420 std::sort(ProfiledFunctions.begin(), ProfiledFunctions.end(), 1421 [](const BinaryFunction *A, const BinaryFunction *B) { 1422 return B->getExecutionCount() < A->getExecutionCount(); 1423 }); 1424 auto SFI = ProfiledFunctions.begin(); 1425 auto SFIend = ProfiledFunctions.end(); 1426 for (unsigned I = 0u; I < opts::TopCalledLimit && SFI != SFIend; 1427 ++SFI, ++I) 1428 outs() << " " << **SFI << " : " << (*SFI)->getExecutionCount() << '\n'; 1429 } 1430 } 1431 1432 if (!opts::PrintSortedBy.empty() && 1433 std::find(opts::PrintSortedBy.begin(), opts::PrintSortedBy.end(), 1434 DynoStats::FIRST_DYNO_STAT) == opts::PrintSortedBy.end()) { 1435 1436 std::vector<const BinaryFunction *> Functions; 1437 std::map<const BinaryFunction *, DynoStats> Stats; 1438 1439 for (const auto &BFI : BC.getBinaryFunctions()) { 1440 const BinaryFunction &BF = BFI.second; 1441 if (shouldOptimize(BF) && BF.hasValidProfile()) { 1442 Functions.push_back(&BF); 1443 Stats.emplace(&BF, getDynoStats(BF)); 1444 } 1445 } 1446 1447 const bool SortAll = 1448 std::find(opts::PrintSortedBy.begin(), opts::PrintSortedBy.end(), 1449 DynoStats::LAST_DYNO_STAT) != opts::PrintSortedBy.end(); 1450 1451 const bool Ascending = 1452 opts::DynoStatsSortOrderOpt == opts::DynoStatsSortOrder::Ascending; 1453 1454 if (SortAll) { 1455 std::stable_sort(Functions.begin(), Functions.end(), 1456 [Ascending, &Stats](const BinaryFunction *A, 1457 const BinaryFunction *B) { 1458 return Ascending ? Stats.at(A) < Stats.at(B) 1459 : Stats.at(B) < Stats.at(A); 1460 }); 1461 } else { 1462 std::stable_sort( 1463 Functions.begin(), Functions.end(), 1464 [Ascending, &Stats](const BinaryFunction *A, 1465 const BinaryFunction *B) { 1466 const DynoStats &StatsA = Stats.at(A); 1467 const DynoStats &StatsB = Stats.at(B); 1468 return Ascending ? StatsA.lessThan(StatsB, opts::PrintSortedBy) 1469 : StatsB.lessThan(StatsA, opts::PrintSortedBy); 1470 }); 1471 } 1472 1473 outs() << "BOLT-INFO: top functions sorted by "; 1474 if (SortAll) { 1475 outs() << "dyno stats"; 1476 } else { 1477 outs() << "("; 1478 bool PrintComma = false; 1479 for (const DynoStats::Category Category : opts::PrintSortedBy) { 1480 if (PrintComma) 1481 outs() << ", "; 1482 outs() << DynoStats::Description(Category); 1483 PrintComma = true; 1484 } 1485 outs() << ")"; 1486 } 1487 1488 outs() << " are:\n"; 1489 auto SFI = Functions.begin(); 1490 for (unsigned I = 0; I < 100 && SFI != Functions.end(); ++SFI, ++I) { 1491 const DynoStats Stats = getDynoStats(**SFI); 1492 outs() << " " << **SFI; 1493 if (!SortAll) { 1494 outs() << " ("; 1495 bool PrintComma = false; 1496 for (const DynoStats::Category Category : opts::PrintSortedBy) { 1497 if (PrintComma) 1498 outs() << ", "; 1499 outs() << dynoStatsOptName(Category) << "=" << Stats[Category]; 1500 PrintComma = true; 1501 } 1502 outs() << ")"; 1503 } 1504 outs() << "\n"; 1505 } 1506 } 1507 1508 if (!BC.TrappedFunctions.empty()) { 1509 errs() << "BOLT-WARNING: " << BC.TrappedFunctions.size() << " function" 1510 << (BC.TrappedFunctions.size() > 1 ? "s" : "") 1511 << " will trap on entry. Use -trap-avx512=0 to disable" 1512 " traps."; 1513 if (opts::Verbosity >= 1 || BC.TrappedFunctions.size() <= 5) { 1514 errs() << '\n'; 1515 for (const BinaryFunction *Function : BC.TrappedFunctions) 1516 errs() << " " << *Function << '\n'; 1517 } else { 1518 errs() << " Use -v=1 to see the list.\n"; 1519 } 1520 } 1521 1522 // Print information on missed macro-fusion opportunities seen on input. 1523 if (BC.MissedMacroFusionPairs) { 1524 outs() << "BOLT-INFO: the input contains " << BC.MissedMacroFusionPairs 1525 << " (dynamic count : " << BC.MissedMacroFusionExecCount 1526 << ") opportunities for macro-fusion optimization"; 1527 switch (opts::AlignMacroOpFusion) { 1528 case MFT_NONE: 1529 outs() << ". Use -align-macro-fusion to fix.\n"; 1530 break; 1531 case MFT_HOT: 1532 outs() << ". Will fix instances on a hot path.\n"; 1533 break; 1534 case MFT_ALL: 1535 outs() << " that are going to be fixed\n"; 1536 break; 1537 } 1538 } 1539 1540 // Collect and print information about suboptimal code layout on input. 1541 if (opts::ReportBadLayout) { 1542 std::vector<const BinaryFunction *> SuboptimalFuncs; 1543 for (auto &BFI : BC.getBinaryFunctions()) { 1544 const BinaryFunction &BF = BFI.second; 1545 if (!BF.hasValidProfile()) 1546 continue; 1547 1548 const uint64_t HotThreshold = 1549 std::max<uint64_t>(BF.getKnownExecutionCount(), 1); 1550 bool HotSeen = false; 1551 for (const BinaryBasicBlock *BB : BF.rlayout()) { 1552 if (!HotSeen && BB->getKnownExecutionCount() > HotThreshold) { 1553 HotSeen = true; 1554 continue; 1555 } 1556 if (HotSeen && BB->getKnownExecutionCount() == 0) { 1557 SuboptimalFuncs.push_back(&BF); 1558 break; 1559 } 1560 } 1561 } 1562 1563 if (!SuboptimalFuncs.empty()) { 1564 std::sort(SuboptimalFuncs.begin(), SuboptimalFuncs.end(), 1565 [](const BinaryFunction *A, const BinaryFunction *B) { 1566 return A->getKnownExecutionCount() / A->getSize() > 1567 B->getKnownExecutionCount() / B->getSize(); 1568 }); 1569 1570 outs() << "BOLT-INFO: " << SuboptimalFuncs.size() 1571 << " functions have " 1572 "cold code in the middle of hot code. Top functions are:\n"; 1573 for (unsigned I = 0; 1574 I < std::min(static_cast<size_t>(opts::ReportBadLayout), 1575 SuboptimalFuncs.size()); 1576 ++I) 1577 SuboptimalFuncs[I]->print(outs()); 1578 } 1579 } 1580 1581 if (NumUnknownControlFlowFunctions) { 1582 outs() << "BOLT-INFO: " << NumUnknownControlFlowFunctions 1583 << " functions have instructions with unknown control flow"; 1584 if (!opts::PrintUnknown) 1585 outs() << ". Use -print-unknown to see the list."; 1586 outs() << '\n'; 1587 } 1588 } 1589 1590 void InstructionLowering::runOnFunctions(BinaryContext &BC) { 1591 for (auto &BFI : BC.getBinaryFunctions()) 1592 for (BinaryBasicBlock &BB : BFI.second) 1593 for (MCInst &Instruction : BB) 1594 BC.MIB->lowerTailCall(Instruction); 1595 } 1596 1597 void StripRepRet::runOnFunctions(BinaryContext &BC) { 1598 uint64_t NumPrefixesRemoved = 0; 1599 uint64_t NumBytesSaved = 0; 1600 for (auto &BFI : BC.getBinaryFunctions()) { 1601 for (BinaryBasicBlock &BB : BFI.second) { 1602 auto LastInstRIter = BB.getLastNonPseudo(); 1603 if (LastInstRIter == BB.rend() || !BC.MIB->isReturn(*LastInstRIter) || 1604 !BC.MIB->deleteREPPrefix(*LastInstRIter)) 1605 continue; 1606 1607 NumPrefixesRemoved += BB.getKnownExecutionCount(); 1608 ++NumBytesSaved; 1609 } 1610 } 1611 1612 if (NumBytesSaved) 1613 outs() << "BOLT-INFO: removed " << NumBytesSaved 1614 << " 'repz' prefixes" 1615 " with estimated execution count of " 1616 << NumPrefixesRemoved << " times.\n"; 1617 } 1618 1619 void InlineMemcpy::runOnFunctions(BinaryContext &BC) { 1620 if (!BC.isX86()) 1621 return; 1622 1623 uint64_t NumInlined = 0; 1624 uint64_t NumInlinedDyno = 0; 1625 for (auto &BFI : BC.getBinaryFunctions()) { 1626 for (BinaryBasicBlock &BB : BFI.second) { 1627 for (auto II = BB.begin(); II != BB.end(); ++II) { 1628 MCInst &Inst = *II; 1629 1630 if (!BC.MIB->isCall(Inst) || MCPlus::getNumPrimeOperands(Inst) != 1 || 1631 !Inst.getOperand(0).isExpr()) 1632 continue; 1633 1634 const MCSymbol *CalleeSymbol = BC.MIB->getTargetSymbol(Inst); 1635 if (CalleeSymbol->getName() != "memcpy" && 1636 CalleeSymbol->getName() != "memcpy@PLT" && 1637 CalleeSymbol->getName() != "_memcpy8") 1638 continue; 1639 1640 const bool IsMemcpy8 = (CalleeSymbol->getName() == "_memcpy8"); 1641 const bool IsTailCall = BC.MIB->isTailCall(Inst); 1642 1643 const InstructionListType NewCode = 1644 BC.MIB->createInlineMemcpy(IsMemcpy8); 1645 II = BB.replaceInstruction(II, NewCode); 1646 std::advance(II, NewCode.size() - 1); 1647 if (IsTailCall) { 1648 MCInst Return; 1649 BC.MIB->createReturn(Return); 1650 II = BB.insertInstruction(std::next(II), std::move(Return)); 1651 } 1652 1653 ++NumInlined; 1654 NumInlinedDyno += BB.getKnownExecutionCount(); 1655 } 1656 } 1657 } 1658 1659 if (NumInlined) { 1660 outs() << "BOLT-INFO: inlined " << NumInlined << " memcpy() calls"; 1661 if (NumInlinedDyno) 1662 outs() << ". The calls were executed " << NumInlinedDyno 1663 << " times based on profile."; 1664 outs() << '\n'; 1665 } 1666 } 1667 1668 bool SpecializeMemcpy1::shouldOptimize(const BinaryFunction &Function) const { 1669 if (!BinaryFunctionPass::shouldOptimize(Function)) 1670 return false; 1671 1672 for (const std::string &FunctionSpec : Spec) { 1673 StringRef FunctionName = StringRef(FunctionSpec).split(':').first; 1674 if (Function.hasNameRegex(FunctionName)) 1675 return true; 1676 } 1677 1678 return false; 1679 } 1680 1681 std::set<size_t> SpecializeMemcpy1::getCallSitesToOptimize( 1682 const BinaryFunction &Function) const { 1683 StringRef SitesString; 1684 for (const std::string &FunctionSpec : Spec) { 1685 StringRef FunctionName; 1686 std::tie(FunctionName, SitesString) = StringRef(FunctionSpec).split(':'); 1687 if (Function.hasNameRegex(FunctionName)) 1688 break; 1689 SitesString = ""; 1690 } 1691 1692 std::set<size_t> Sites; 1693 SmallVector<StringRef, 4> SitesVec; 1694 SitesString.split(SitesVec, ':'); 1695 for (StringRef SiteString : SitesVec) { 1696 if (SiteString.empty()) 1697 continue; 1698 size_t Result; 1699 if (!SiteString.getAsInteger(10, Result)) 1700 Sites.emplace(Result); 1701 } 1702 1703 return Sites; 1704 } 1705 1706 void SpecializeMemcpy1::runOnFunctions(BinaryContext &BC) { 1707 if (!BC.isX86()) 1708 return; 1709 1710 uint64_t NumSpecialized = 0; 1711 uint64_t NumSpecializedDyno = 0; 1712 for (auto &BFI : BC.getBinaryFunctions()) { 1713 BinaryFunction &Function = BFI.second; 1714 if (!shouldOptimize(Function)) 1715 continue; 1716 1717 std::set<size_t> CallsToOptimize = getCallSitesToOptimize(Function); 1718 auto shouldOptimize = [&](size_t N) { 1719 return CallsToOptimize.empty() || CallsToOptimize.count(N); 1720 }; 1721 1722 std::vector<BinaryBasicBlock *> Blocks(Function.pbegin(), Function.pend()); 1723 size_t CallSiteID = 0; 1724 for (BinaryBasicBlock *CurBB : Blocks) { 1725 for (auto II = CurBB->begin(); II != CurBB->end(); ++II) { 1726 MCInst &Inst = *II; 1727 1728 if (!BC.MIB->isCall(Inst) || MCPlus::getNumPrimeOperands(Inst) != 1 || 1729 !Inst.getOperand(0).isExpr()) 1730 continue; 1731 1732 const MCSymbol *CalleeSymbol = BC.MIB->getTargetSymbol(Inst); 1733 if (CalleeSymbol->getName() != "memcpy" && 1734 CalleeSymbol->getName() != "memcpy@PLT") 1735 continue; 1736 1737 if (BC.MIB->isTailCall(Inst)) 1738 continue; 1739 1740 ++CallSiteID; 1741 1742 if (!shouldOptimize(CallSiteID)) 1743 continue; 1744 1745 // Create a copy of a call to memcpy(dest, src, size). 1746 MCInst MemcpyInstr = Inst; 1747 1748 BinaryBasicBlock *OneByteMemcpyBB = CurBB->splitAt(II); 1749 1750 BinaryBasicBlock *NextBB = nullptr; 1751 if (OneByteMemcpyBB->getNumNonPseudos() > 1) { 1752 NextBB = OneByteMemcpyBB->splitAt(OneByteMemcpyBB->begin()); 1753 NextBB->eraseInstruction(NextBB->begin()); 1754 } else { 1755 NextBB = OneByteMemcpyBB->getSuccessor(); 1756 OneByteMemcpyBB->eraseInstruction(OneByteMemcpyBB->begin()); 1757 assert(NextBB && "unexpected call to memcpy() with no return"); 1758 } 1759 1760 BinaryBasicBlock *MemcpyBB = 1761 Function.addBasicBlock(CurBB->getInputOffset()); 1762 InstructionListType CmpJCC = 1763 BC.MIB->createCmpJE(BC.MIB->getIntArgRegister(2), 1, 1764 OneByteMemcpyBB->getLabel(), BC.Ctx.get()); 1765 CurBB->addInstructions(CmpJCC); 1766 CurBB->addSuccessor(MemcpyBB); 1767 1768 MemcpyBB->addInstruction(std::move(MemcpyInstr)); 1769 MemcpyBB->addSuccessor(NextBB); 1770 MemcpyBB->setCFIState(NextBB->getCFIState()); 1771 MemcpyBB->setExecutionCount(0); 1772 1773 // To prevent the actual call from being moved to cold, we set its 1774 // execution count to 1. 1775 if (CurBB->getKnownExecutionCount() > 0) 1776 MemcpyBB->setExecutionCount(1); 1777 1778 InstructionListType OneByteMemcpy = BC.MIB->createOneByteMemcpy(); 1779 OneByteMemcpyBB->addInstructions(OneByteMemcpy); 1780 1781 ++NumSpecialized; 1782 NumSpecializedDyno += CurBB->getKnownExecutionCount(); 1783 1784 CurBB = NextBB; 1785 1786 // Note: we don't expect the next instruction to be a call to memcpy. 1787 II = CurBB->begin(); 1788 } 1789 } 1790 } 1791 1792 if (NumSpecialized) { 1793 outs() << "BOLT-INFO: specialized " << NumSpecialized 1794 << " memcpy() call sites for size 1"; 1795 if (NumSpecializedDyno) 1796 outs() << ". The calls were executed " << NumSpecializedDyno 1797 << " times based on profile."; 1798 outs() << '\n'; 1799 } 1800 } 1801 1802 void RemoveNops::runOnFunction(BinaryFunction &BF) { 1803 const BinaryContext &BC = BF.getBinaryContext(); 1804 for (BinaryBasicBlock &BB : BF) { 1805 for (int64_t I = BB.size() - 1; I >= 0; --I) { 1806 MCInst &Inst = BB.getInstructionAtIndex(I); 1807 if (BC.MIB->isNoop(Inst) && BC.MIB->hasAnnotation(Inst, "NOP")) 1808 BB.eraseInstructionAtIndex(I); 1809 } 1810 } 1811 } 1812 1813 void RemoveNops::runOnFunctions(BinaryContext &BC) { 1814 ParallelUtilities::WorkFuncTy WorkFun = [&](BinaryFunction &BF) { 1815 runOnFunction(BF); 1816 }; 1817 1818 ParallelUtilities::PredicateTy SkipFunc = [&](const BinaryFunction &BF) { 1819 return BF.shouldPreserveNops(); 1820 }; 1821 1822 ParallelUtilities::runOnEachFunction( 1823 BC, ParallelUtilities::SchedulingPolicy::SP_INST_LINEAR, WorkFun, 1824 SkipFunc, "RemoveNops"); 1825 } 1826 1827 } // namespace bolt 1828 } // namespace llvm 1829