1 //===- bolt/Passes/BinaryPasses.cpp - Binary-level passes -----------------===// 2 // 3 // Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. 4 // See https://llvm.org/LICENSE.txt for license information. 5 // SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception 6 // 7 //===----------------------------------------------------------------------===// 8 // 9 // This file implements multiple passes for binary optimization and analysis. 10 // 11 //===----------------------------------------------------------------------===// 12 13 #include "bolt/Passes/BinaryPasses.h" 14 #include "bolt/Core/ParallelUtilities.h" 15 #include "bolt/Passes/ReorderAlgorithm.h" 16 #include "bolt/Passes/ReorderFunctions.h" 17 #include "llvm/Support/CommandLine.h" 18 19 #include <numeric> 20 #include <vector> 21 22 #define DEBUG_TYPE "bolt-opts" 23 24 using namespace llvm; 25 using namespace bolt; 26 27 namespace { 28 29 const char *dynoStatsOptName(const bolt::DynoStats::Category C) { 30 if (C == bolt::DynoStats::FIRST_DYNO_STAT) 31 return "none"; 32 else if (C == bolt::DynoStats::LAST_DYNO_STAT) 33 return "all"; 34 35 static std::string OptNames[bolt::DynoStats::LAST_DYNO_STAT + 1]; 36 37 OptNames[C] = bolt::DynoStats::Description(C); 38 39 std::replace(OptNames[C].begin(), OptNames[C].end(), ' ', '-'); 40 41 return OptNames[C].c_str(); 42 } 43 44 const char *dynoStatsOptDesc(const bolt::DynoStats::Category C) { 45 if (C == bolt::DynoStats::FIRST_DYNO_STAT) 46 return "unsorted"; 47 else if (C == bolt::DynoStats::LAST_DYNO_STAT) 48 return "sorted by all stats"; 49 50 return bolt::DynoStats::Description(C); 51 } 52 53 } 54 55 namespace opts { 56 57 extern cl::OptionCategory BoltCategory; 58 extern cl::OptionCategory BoltOptCategory; 59 60 extern cl::opt<bolt::MacroFusionType> AlignMacroOpFusion; 61 extern cl::opt<unsigned> Verbosity; 62 extern cl::opt<bool> EnableBAT; 63 extern cl::opt<unsigned> ExecutionCountThreshold; 64 extern cl::opt<bool> UpdateDebugSections; 65 extern cl::opt<bolt::ReorderFunctions::ReorderType> ReorderFunctions; 66 67 enum DynoStatsSortOrder : char { 68 Ascending, 69 Descending 70 }; 71 72 static cl::opt<DynoStatsSortOrder> DynoStatsSortOrderOpt( 73 "print-sorted-by-order", 74 cl::desc("use ascending or descending order when printing functions " 75 "ordered by dyno stats"), 76 cl::init(DynoStatsSortOrder::Descending), cl::cat(BoltOptCategory)); 77 78 cl::list<std::string> 79 HotTextMoveSections("hot-text-move-sections", 80 cl::desc("list of sections containing functions used for hugifying hot text. " 81 "BOLT makes sure these functions are not placed on the same page as " 82 "the hot text. (default=\'.stub,.mover\')."), 83 cl::value_desc("sec1,sec2,sec3,..."), 84 cl::CommaSeparated, 85 cl::ZeroOrMore, 86 cl::cat(BoltCategory)); 87 88 bool isHotTextMover(const BinaryFunction &Function) { 89 for (std::string &SectionName : opts::HotTextMoveSections) { 90 if (Function.getOriginSectionName() && 91 *Function.getOriginSectionName() == SectionName) 92 return true; 93 } 94 95 return false; 96 } 97 98 static cl::opt<bool> MinBranchClusters( 99 "min-branch-clusters", 100 cl::desc("use a modified clustering algorithm geared towards minimizing " 101 "branches"), 102 cl::Hidden, cl::cat(BoltOptCategory)); 103 104 static cl::list<Peepholes::PeepholeOpts> Peepholes( 105 "peepholes", cl::CommaSeparated, cl::desc("enable peephole optimizations"), 106 cl::value_desc("opt1,opt2,opt3,..."), 107 cl::values(clEnumValN(Peepholes::PEEP_NONE, "none", "disable peepholes"), 108 clEnumValN(Peepholes::PEEP_DOUBLE_JUMPS, "double-jumps", 109 "remove double jumps when able"), 110 clEnumValN(Peepholes::PEEP_TAILCALL_TRAPS, "tailcall-traps", 111 "insert tail call traps"), 112 clEnumValN(Peepholes::PEEP_USELESS_BRANCHES, "useless-branches", 113 "remove useless conditional branches"), 114 clEnumValN(Peepholes::PEEP_ALL, "all", 115 "enable all peephole optimizations")), 116 cl::ZeroOrMore, cl::cat(BoltOptCategory)); 117 118 static cl::opt<unsigned> 119 PrintFuncStat("print-function-statistics", 120 cl::desc("print statistics about basic block ordering"), 121 cl::init(0), cl::cat(BoltOptCategory)); 122 123 static cl::list<bolt::DynoStats::Category> 124 PrintSortedBy("print-sorted-by", 125 cl::CommaSeparated, 126 cl::desc("print functions sorted by order of dyno stats"), 127 cl::value_desc("key1,key2,key3,..."), 128 cl::values( 129 #define D(name, ...) \ 130 clEnumValN(bolt::DynoStats::name, \ 131 dynoStatsOptName(bolt::DynoStats::name), \ 132 dynoStatsOptDesc(bolt::DynoStats::name)), 133 DYNO_STATS 134 #undef D 135 clEnumValN(0xffff, ".", ".") 136 ), 137 cl::ZeroOrMore, 138 cl::cat(BoltOptCategory)); 139 140 static cl::opt<bool> 141 PrintUnknown("print-unknown", 142 cl::desc("print names of functions with unknown control flow"), 143 cl::cat(BoltCategory), cl::Hidden); 144 145 static cl::opt<bool> 146 PrintUnknownCFG("print-unknown-cfg", 147 cl::desc("dump CFG of functions with unknown control flow"), 148 cl::cat(BoltCategory), cl::ReallyHidden); 149 150 // Please MSVC19 with a forward declaration: otherwise it reports an error about 151 // an undeclared variable inside a callback. 152 extern cl::opt<bolt::ReorderBasicBlocks::LayoutType> ReorderBlocks; 153 cl::opt<bolt::ReorderBasicBlocks::LayoutType> ReorderBlocks( 154 "reorder-blocks", cl::desc("change layout of basic blocks in a function"), 155 cl::init(bolt::ReorderBasicBlocks::LT_NONE), 156 cl::values( 157 clEnumValN(bolt::ReorderBasicBlocks::LT_NONE, "none", 158 "do not reorder basic blocks"), 159 clEnumValN(bolt::ReorderBasicBlocks::LT_REVERSE, "reverse", 160 "layout blocks in reverse order"), 161 clEnumValN(bolt::ReorderBasicBlocks::LT_OPTIMIZE, "normal", 162 "perform optimal layout based on profile"), 163 clEnumValN(bolt::ReorderBasicBlocks::LT_OPTIMIZE_BRANCH, 164 "branch-predictor", 165 "perform optimal layout prioritizing branch " 166 "predictions"), 167 clEnumValN(bolt::ReorderBasicBlocks::LT_OPTIMIZE_CACHE, "cache", 168 "perform optimal layout prioritizing I-cache " 169 "behavior"), 170 clEnumValN(bolt::ReorderBasicBlocks::LT_OPTIMIZE_CACHE_PLUS, "cache+", 171 "perform layout optimizing I-cache behavior"), 172 clEnumValN(bolt::ReorderBasicBlocks::LT_OPTIMIZE_EXT_TSP, "ext-tsp", 173 "perform layout optimizing I-cache behavior"), 174 clEnumValN(bolt::ReorderBasicBlocks::LT_OPTIMIZE_SHUFFLE, 175 "cluster-shuffle", "perform random layout of clusters")), 176 cl::ZeroOrMore, cl::cat(BoltOptCategory), 177 cl::callback([](const bolt::ReorderBasicBlocks::LayoutType &option) { 178 if (option == bolt::ReorderBasicBlocks::LT_OPTIMIZE_CACHE_PLUS) { 179 WithColor::warning() 180 << "'-reorder-blocks=cache+' is deprecated, " 181 << "please use '-reorder-blocks=ext-tsp' instead\n"; 182 ReorderBlocks = bolt::ReorderBasicBlocks::LT_OPTIMIZE_EXT_TSP; 183 } 184 })); 185 186 static cl::opt<unsigned> ReportBadLayout( 187 "report-bad-layout", 188 cl::desc("print top <uint> functions with suboptimal code layout on input"), 189 cl::init(0), cl::Hidden, cl::cat(BoltOptCategory)); 190 191 static cl::opt<bool> 192 ReportStaleFuncs("report-stale", 193 cl::desc("print the list of functions with stale profile"), 194 cl::Hidden, cl::cat(BoltOptCategory)); 195 196 enum SctcModes : char { 197 SctcAlways, 198 SctcPreserveDirection, 199 SctcHeuristic 200 }; 201 202 static cl::opt<SctcModes> 203 SctcMode("sctc-mode", 204 cl::desc("mode for simplify conditional tail calls"), 205 cl::init(SctcAlways), 206 cl::values(clEnumValN(SctcAlways, "always", "always perform sctc"), 207 clEnumValN(SctcPreserveDirection, 208 "preserve", 209 "only perform sctc when branch direction is " 210 "preserved"), 211 clEnumValN(SctcHeuristic, 212 "heuristic", 213 "use branch prediction data to control sctc")), 214 cl::ZeroOrMore, 215 cl::cat(BoltOptCategory)); 216 217 static cl::opt<unsigned> 218 StaleThreshold("stale-threshold", 219 cl::desc( 220 "maximum percentage of stale functions to tolerate (default: 100)"), 221 cl::init(100), 222 cl::Hidden, 223 cl::cat(BoltOptCategory)); 224 225 static cl::opt<unsigned> TSPThreshold( 226 "tsp-threshold", 227 cl::desc( 228 "maximum number of hot basic blocks in a function for which to use " 229 "a precise TSP solution while re-ordering basic blocks"), 230 cl::init(10), cl::Hidden, cl::cat(BoltOptCategory)); 231 232 static cl::opt<unsigned> TopCalledLimit( 233 "top-called-limit", 234 cl::desc("maximum number of functions to print in top called " 235 "functions section"), 236 cl::init(100), cl::Hidden, cl::cat(BoltCategory)); 237 238 } // namespace opts 239 240 namespace llvm { 241 namespace bolt { 242 243 bool BinaryFunctionPass::shouldOptimize(const BinaryFunction &BF) const { 244 return BF.isSimple() && BF.getState() == BinaryFunction::State::CFG && 245 !BF.isIgnored(); 246 } 247 248 bool BinaryFunctionPass::shouldPrint(const BinaryFunction &BF) const { 249 return BF.isSimple() && !BF.isIgnored(); 250 } 251 252 void NormalizeCFG::runOnFunction(BinaryFunction &BF) { 253 uint64_t NumRemoved = 0; 254 uint64_t NumDuplicateEdges = 0; 255 uint64_t NeedsFixBranches = 0; 256 for (BinaryBasicBlock &BB : BF) { 257 if (!BB.empty()) 258 continue; 259 260 if (BB.isEntryPoint() || BB.isLandingPad()) 261 continue; 262 263 // Handle a dangling empty block. 264 if (BB.succ_size() == 0) { 265 // If an empty dangling basic block has a predecessor, it could be a 266 // result of codegen for __builtin_unreachable. In such case, do not 267 // remove the block. 268 if (BB.pred_size() == 0) { 269 BB.markValid(false); 270 ++NumRemoved; 271 } 272 continue; 273 } 274 275 // The block should have just one successor. 276 BinaryBasicBlock *Successor = BB.getSuccessor(); 277 assert(Successor && "invalid CFG encountered"); 278 279 // Redirect all predecessors to the successor block. 280 while (!BB.pred_empty()) { 281 BinaryBasicBlock *Predecessor = *BB.pred_begin(); 282 if (Predecessor->hasJumpTable()) 283 break; 284 285 if (Predecessor == Successor) 286 break; 287 288 BinaryBasicBlock::BinaryBranchInfo &BI = Predecessor->getBranchInfo(BB); 289 Predecessor->replaceSuccessor(&BB, Successor, BI.Count, 290 BI.MispredictedCount); 291 // We need to fix branches even if we failed to replace all successors 292 // and remove the block. 293 NeedsFixBranches = true; 294 } 295 296 if (BB.pred_empty()) { 297 BB.removeAllSuccessors(); 298 BB.markValid(false); 299 ++NumRemoved; 300 } 301 } 302 303 if (NumRemoved) 304 BF.eraseInvalidBBs(); 305 306 // Check for duplicate successors. Do it after the empty block elimination as 307 // we can get more duplicate successors. 308 for (BinaryBasicBlock &BB : BF) 309 if (!BB.hasJumpTable() && BB.succ_size() == 2 && 310 BB.getConditionalSuccessor(false) == BB.getConditionalSuccessor(true)) 311 ++NumDuplicateEdges; 312 313 // fixBranches() will get rid of duplicate edges and update jump instructions. 314 if (NumDuplicateEdges || NeedsFixBranches) 315 BF.fixBranches(); 316 317 NumDuplicateEdgesMerged += NumDuplicateEdges; 318 NumBlocksRemoved += NumRemoved; 319 } 320 321 void NormalizeCFG::runOnFunctions(BinaryContext &BC) { 322 ParallelUtilities::runOnEachFunction( 323 BC, ParallelUtilities::SchedulingPolicy::SP_BB_LINEAR, 324 [&](BinaryFunction &BF) { runOnFunction(BF); }, 325 [&](const BinaryFunction &BF) { return !shouldOptimize(BF); }, 326 "NormalizeCFG"); 327 if (NumBlocksRemoved) 328 outs() << "BOLT-INFO: removed " << NumBlocksRemoved << " empty block" 329 << (NumBlocksRemoved == 1 ? "" : "s") << '\n'; 330 if (NumDuplicateEdgesMerged) 331 outs() << "BOLT-INFO: merged " << NumDuplicateEdgesMerged 332 << " duplicate CFG edge" << (NumDuplicateEdgesMerged == 1 ? "" : "s") 333 << '\n'; 334 } 335 336 void EliminateUnreachableBlocks::runOnFunction(BinaryFunction &Function) { 337 if (Function.layout_size() > 0) { 338 unsigned Count; 339 uint64_t Bytes; 340 Function.markUnreachableBlocks(); 341 LLVM_DEBUG({ 342 for (BinaryBasicBlock *BB : Function.layout()) { 343 if (!BB->isValid()) { 344 dbgs() << "BOLT-INFO: UCE found unreachable block " << BB->getName() 345 << " in function " << Function << "\n"; 346 Function.dump(); 347 } 348 } 349 }); 350 std::tie(Count, Bytes) = Function.eraseInvalidBBs(); 351 DeletedBlocks += Count; 352 DeletedBytes += Bytes; 353 if (Count) { 354 Modified.insert(&Function); 355 if (opts::Verbosity > 0) 356 outs() << "BOLT-INFO: Removed " << Count 357 << " dead basic block(s) accounting for " << Bytes 358 << " bytes in function " << Function << '\n'; 359 } 360 } 361 } 362 363 void EliminateUnreachableBlocks::runOnFunctions(BinaryContext &BC) { 364 for (auto &It : BC.getBinaryFunctions()) { 365 BinaryFunction &Function = It.second; 366 if (shouldOptimize(Function)) 367 runOnFunction(Function); 368 } 369 370 outs() << "BOLT-INFO: UCE removed " << DeletedBlocks << " blocks and " 371 << DeletedBytes << " bytes of code.\n"; 372 } 373 374 bool ReorderBasicBlocks::shouldPrint(const BinaryFunction &BF) const { 375 return (BinaryFunctionPass::shouldPrint(BF) && 376 opts::ReorderBlocks != ReorderBasicBlocks::LT_NONE); 377 } 378 379 bool ReorderBasicBlocks::shouldOptimize(const BinaryFunction &BF) const { 380 // Apply execution count threshold 381 if (BF.getKnownExecutionCount() < opts::ExecutionCountThreshold) 382 return false; 383 384 return BinaryFunctionPass::shouldOptimize(BF); 385 } 386 387 void ReorderBasicBlocks::runOnFunctions(BinaryContext &BC) { 388 if (opts::ReorderBlocks == ReorderBasicBlocks::LT_NONE) 389 return; 390 391 std::atomic<uint64_t> ModifiedFuncCount{0}; 392 393 ParallelUtilities::WorkFuncTy WorkFun = [&](BinaryFunction &BF) { 394 modifyFunctionLayout(BF, opts::ReorderBlocks, opts::MinBranchClusters); 395 if (BF.hasLayoutChanged()) 396 ++ModifiedFuncCount; 397 }; 398 399 ParallelUtilities::PredicateTy SkipFunc = [&](const BinaryFunction &BF) { 400 return !shouldOptimize(BF); 401 }; 402 403 ParallelUtilities::runOnEachFunction( 404 BC, ParallelUtilities::SchedulingPolicy::SP_BB_LINEAR, WorkFun, SkipFunc, 405 "ReorderBasicBlocks"); 406 407 outs() << "BOLT-INFO: basic block reordering modified layout of " 408 << format("%zu (%.2lf%%) functions\n", ModifiedFuncCount.load(), 409 100.0 * ModifiedFuncCount.load() / 410 BC.getBinaryFunctions().size()); 411 412 if (opts::PrintFuncStat > 0) { 413 raw_ostream &OS = outs(); 414 // Copy all the values into vector in order to sort them 415 std::map<uint64_t, BinaryFunction &> ScoreMap; 416 auto &BFs = BC.getBinaryFunctions(); 417 for (auto It = BFs.begin(); It != BFs.end(); ++It) 418 ScoreMap.insert(std::pair<uint64_t, BinaryFunction &>( 419 It->second.getFunctionScore(), It->second)); 420 421 OS << "\nBOLT-INFO: Printing Function Statistics:\n\n"; 422 OS << " There are " << BFs.size() << " functions in total. \n"; 423 OS << " Number of functions being modified: " 424 << ModifiedFuncCount.load() << "\n"; 425 OS << " User asks for detailed information on top " 426 << opts::PrintFuncStat << " functions. (Ranked by function score)" 427 << "\n\n"; 428 uint64_t I = 0; 429 for (std::map<uint64_t, BinaryFunction &>::reverse_iterator Rit = 430 ScoreMap.rbegin(); 431 Rit != ScoreMap.rend() && I < opts::PrintFuncStat; ++Rit, ++I) { 432 BinaryFunction &Function = Rit->second; 433 434 OS << " Information for function of top: " << (I + 1) << ": \n"; 435 OS << " Function Score is: " << Function.getFunctionScore() 436 << "\n"; 437 OS << " There are " << Function.size() 438 << " number of blocks in this function.\n"; 439 OS << " There are " << Function.getInstructionCount() 440 << " number of instructions in this function.\n"; 441 OS << " The edit distance for this function is: " 442 << Function.getEditDistance() << "\n\n"; 443 } 444 } 445 } 446 447 void ReorderBasicBlocks::modifyFunctionLayout(BinaryFunction &BF, 448 LayoutType Type, 449 bool MinBranchClusters) const { 450 if (BF.size() == 0 || Type == LT_NONE) 451 return; 452 453 BinaryFunction::BasicBlockOrderType NewLayout; 454 std::unique_ptr<ReorderAlgorithm> Algo; 455 456 // Cannot do optimal layout without profile. 457 if (Type != LT_REVERSE && !BF.hasValidProfile()) 458 return; 459 460 if (Type == LT_REVERSE) { 461 Algo.reset(new ReverseReorderAlgorithm()); 462 } else if (BF.size() <= opts::TSPThreshold && Type != LT_OPTIMIZE_SHUFFLE) { 463 // Work on optimal solution if problem is small enough 464 LLVM_DEBUG(dbgs() << "finding optimal block layout for " << BF << "\n"); 465 Algo.reset(new TSPReorderAlgorithm()); 466 } else { 467 LLVM_DEBUG(dbgs() << "running block layout heuristics on " << BF << "\n"); 468 469 std::unique_ptr<ClusterAlgorithm> CAlgo; 470 if (MinBranchClusters) 471 CAlgo.reset(new MinBranchGreedyClusterAlgorithm()); 472 else 473 CAlgo.reset(new PHGreedyClusterAlgorithm()); 474 475 switch (Type) { 476 case LT_OPTIMIZE: 477 Algo.reset(new OptimizeReorderAlgorithm(std::move(CAlgo))); 478 break; 479 480 case LT_OPTIMIZE_BRANCH: 481 Algo.reset(new OptimizeBranchReorderAlgorithm(std::move(CAlgo))); 482 break; 483 484 case LT_OPTIMIZE_CACHE: 485 Algo.reset(new OptimizeCacheReorderAlgorithm(std::move(CAlgo))); 486 break; 487 488 case LT_OPTIMIZE_EXT_TSP: 489 Algo.reset(new ExtTSPReorderAlgorithm()); 490 break; 491 492 case LT_OPTIMIZE_SHUFFLE: 493 Algo.reset(new RandomClusterReorderAlgorithm(std::move(CAlgo))); 494 break; 495 496 default: 497 llvm_unreachable("unexpected layout type"); 498 } 499 } 500 501 Algo->reorderBasicBlocks(BF, NewLayout); 502 503 BF.updateBasicBlockLayout(NewLayout); 504 } 505 506 void FixupBranches::runOnFunctions(BinaryContext &BC) { 507 for (auto &It : BC.getBinaryFunctions()) { 508 BinaryFunction &Function = It.second; 509 if (!BC.shouldEmit(Function) || !Function.isSimple()) 510 continue; 511 512 Function.fixBranches(); 513 } 514 } 515 516 void FinalizeFunctions::runOnFunctions(BinaryContext &BC) { 517 ParallelUtilities::WorkFuncTy WorkFun = [&](BinaryFunction &BF) { 518 if (!BF.finalizeCFIState()) { 519 if (BC.HasRelocations) { 520 errs() << "BOLT-ERROR: unable to fix CFI state for function " << BF 521 << ". Exiting.\n"; 522 exit(1); 523 } 524 BF.setSimple(false); 525 return; 526 } 527 528 BF.setFinalized(); 529 530 // Update exception handling information. 531 BF.updateEHRanges(); 532 }; 533 534 ParallelUtilities::PredicateTy SkipPredicate = [&](const BinaryFunction &BF) { 535 return !BC.shouldEmit(BF); 536 }; 537 538 ParallelUtilities::runOnEachFunction( 539 BC, ParallelUtilities::SchedulingPolicy::SP_CONSTANT, WorkFun, 540 SkipPredicate, "FinalizeFunctions"); 541 } 542 543 void CheckLargeFunctions::runOnFunctions(BinaryContext &BC) { 544 if (BC.HasRelocations) 545 return; 546 547 if (!opts::UpdateDebugSections) 548 return; 549 550 // If the function wouldn't fit, mark it as non-simple. Otherwise, we may emit 551 // incorrect debug info. 552 ParallelUtilities::WorkFuncTy WorkFun = [&](BinaryFunction &BF) { 553 uint64_t HotSize, ColdSize; 554 std::tie(HotSize, ColdSize) = 555 BC.calculateEmittedSize(BF, /*FixBranches=*/false); 556 if (HotSize > BF.getMaxSize()) 557 BF.setSimple(false); 558 }; 559 560 ParallelUtilities::PredicateTy SkipFunc = [&](const BinaryFunction &BF) { 561 return !shouldOptimize(BF); 562 }; 563 564 ParallelUtilities::runOnEachFunction( 565 BC, ParallelUtilities::SchedulingPolicy::SP_INST_LINEAR, WorkFun, 566 SkipFunc, "CheckLargeFunctions"); 567 } 568 569 bool CheckLargeFunctions::shouldOptimize(const BinaryFunction &BF) const { 570 // Unlike other passes, allow functions in non-CFG state. 571 return BF.isSimple() && !BF.isIgnored(); 572 } 573 574 void LowerAnnotations::runOnFunctions(BinaryContext &BC) { 575 std::vector<std::pair<MCInst *, uint32_t>> PreservedOffsetAnnotations; 576 577 for (auto &It : BC.getBinaryFunctions()) { 578 BinaryFunction &BF = It.second; 579 int64_t CurrentGnuArgsSize = 0; 580 581 // Have we crossed hot/cold border for split functions? 582 bool SeenCold = false; 583 584 for (BinaryBasicBlock *BB : BF.layout()) { 585 if (BB->isCold() && !SeenCold) { 586 SeenCold = true; 587 CurrentGnuArgsSize = 0; 588 } 589 590 // First convert GnuArgsSize annotations into CFIs. This may change instr 591 // pointers, so do it before recording ptrs for preserved annotations 592 if (BF.usesGnuArgsSize()) { 593 for (auto II = BB->begin(); II != BB->end(); ++II) { 594 if (!BC.MIB->isInvoke(*II)) 595 continue; 596 const int64_t NewGnuArgsSize = BC.MIB->getGnuArgsSize(*II); 597 assert(NewGnuArgsSize >= 0 && "expected non-negative GNU_args_size"); 598 if (NewGnuArgsSize != CurrentGnuArgsSize) { 599 auto InsertII = BF.addCFIInstruction( 600 BB, II, 601 MCCFIInstruction::createGnuArgsSize(nullptr, NewGnuArgsSize)); 602 CurrentGnuArgsSize = NewGnuArgsSize; 603 II = std::next(InsertII); 604 } 605 } 606 } 607 608 // Now record preserved annotations separately and then strip annotations. 609 for (auto II = BB->begin(); II != BB->end(); ++II) { 610 if (BF.requiresAddressTranslation() && BC.MIB->getOffset(*II)) 611 PreservedOffsetAnnotations.emplace_back(&(*II), 612 *BC.MIB->getOffset(*II)); 613 BC.MIB->stripAnnotations(*II); 614 } 615 } 616 } 617 for (BinaryFunction *BF : BC.getInjectedBinaryFunctions()) 618 for (BinaryBasicBlock &BB : *BF) 619 for (MCInst &Instruction : BB) 620 BC.MIB->stripAnnotations(Instruction); 621 622 // Release all memory taken by annotations 623 BC.MIB->freeAnnotations(); 624 625 // Reinsert preserved annotations we need during code emission. 626 for (const std::pair<MCInst *, uint32_t> &Item : PreservedOffsetAnnotations) 627 BC.MIB->setOffset(*Item.first, Item.second); 628 } 629 630 namespace { 631 632 // This peephole fixes jump instructions that jump to another basic 633 // block with a single jump instruction, e.g. 634 // 635 // B0: ... 636 // jmp B1 (or jcc B1) 637 // 638 // B1: jmp B2 639 // 640 // -> 641 // 642 // B0: ... 643 // jmp B2 (or jcc B2) 644 // 645 uint64_t fixDoubleJumps(BinaryFunction &Function, bool MarkInvalid) { 646 uint64_t NumDoubleJumps = 0; 647 648 MCContext *Ctx = Function.getBinaryContext().Ctx.get(); 649 MCPlusBuilder *MIB = Function.getBinaryContext().MIB.get(); 650 for (BinaryBasicBlock &BB : Function) { 651 auto checkAndPatch = [&](BinaryBasicBlock *Pred, BinaryBasicBlock *Succ, 652 const MCSymbol *SuccSym) { 653 // Ignore infinite loop jumps or fallthrough tail jumps. 654 if (Pred == Succ || Succ == &BB) 655 return false; 656 657 if (Succ) { 658 const MCSymbol *TBB = nullptr; 659 const MCSymbol *FBB = nullptr; 660 MCInst *CondBranch = nullptr; 661 MCInst *UncondBranch = nullptr; 662 bool Res = Pred->analyzeBranch(TBB, FBB, CondBranch, UncondBranch); 663 if (!Res) { 664 LLVM_DEBUG(dbgs() << "analyzeBranch failed in peepholes in block:\n"; 665 Pred->dump()); 666 return false; 667 } 668 Pred->replaceSuccessor(&BB, Succ); 669 670 // We must patch up any existing branch instructions to match up 671 // with the new successor. 672 assert((CondBranch || (!CondBranch && Pred->succ_size() == 1)) && 673 "Predecessor block has inconsistent number of successors"); 674 if (CondBranch && MIB->getTargetSymbol(*CondBranch) == BB.getLabel()) { 675 MIB->replaceBranchTarget(*CondBranch, Succ->getLabel(), Ctx); 676 } else if (UncondBranch && 677 MIB->getTargetSymbol(*UncondBranch) == BB.getLabel()) { 678 MIB->replaceBranchTarget(*UncondBranch, Succ->getLabel(), Ctx); 679 } else if (!UncondBranch) { 680 assert(Function.getBasicBlockAfter(Pred, false) != Succ && 681 "Don't add an explicit jump to a fallthrough block."); 682 Pred->addBranchInstruction(Succ); 683 } 684 } else { 685 // Succ will be null in the tail call case. In this case we 686 // need to explicitly add a tail call instruction. 687 MCInst *Branch = Pred->getLastNonPseudoInstr(); 688 if (Branch && MIB->isUnconditionalBranch(*Branch)) { 689 assert(MIB->getTargetSymbol(*Branch) == BB.getLabel()); 690 Pred->removeSuccessor(&BB); 691 Pred->eraseInstruction(Pred->findInstruction(Branch)); 692 Pred->addTailCallInstruction(SuccSym); 693 } else { 694 return false; 695 } 696 } 697 698 ++NumDoubleJumps; 699 LLVM_DEBUG(dbgs() << "Removed double jump in " << Function << " from " 700 << Pred->getName() << " -> " << BB.getName() << " to " 701 << Pred->getName() << " -> " << SuccSym->getName() 702 << (!Succ ? " (tail)\n" : "\n")); 703 704 return true; 705 }; 706 707 if (BB.getNumNonPseudos() != 1 || BB.isLandingPad()) 708 continue; 709 710 MCInst *Inst = BB.getFirstNonPseudoInstr(); 711 const bool IsTailCall = MIB->isTailCall(*Inst); 712 713 if (!MIB->isUnconditionalBranch(*Inst) && !IsTailCall) 714 continue; 715 716 // If we operate after SCTC make sure it's not a conditional tail call. 717 if (IsTailCall && MIB->isConditionalBranch(*Inst)) 718 continue; 719 720 const MCSymbol *SuccSym = MIB->getTargetSymbol(*Inst); 721 BinaryBasicBlock *Succ = BB.getSuccessor(); 722 723 if (((!Succ || &BB == Succ) && !IsTailCall) || (IsTailCall && !SuccSym)) 724 continue; 725 726 std::vector<BinaryBasicBlock *> Preds = {BB.pred_begin(), BB.pred_end()}; 727 728 for (BinaryBasicBlock *Pred : Preds) { 729 if (Pred->isLandingPad()) 730 continue; 731 732 if (Pred->getSuccessor() == &BB || 733 (Pred->getConditionalSuccessor(true) == &BB && !IsTailCall) || 734 Pred->getConditionalSuccessor(false) == &BB) 735 if (checkAndPatch(Pred, Succ, SuccSym) && MarkInvalid) 736 BB.markValid(BB.pred_size() != 0 || BB.isLandingPad() || 737 BB.isEntryPoint()); 738 } 739 } 740 741 return NumDoubleJumps; 742 } 743 } // namespace 744 745 bool SimplifyConditionalTailCalls::shouldRewriteBranch( 746 const BinaryBasicBlock *PredBB, const MCInst &CondBranch, 747 const BinaryBasicBlock *BB, const bool DirectionFlag) { 748 if (BeenOptimized.count(PredBB)) 749 return false; 750 751 const bool IsForward = BinaryFunction::isForwardBranch(PredBB, BB); 752 753 if (IsForward) 754 ++NumOrigForwardBranches; 755 else 756 ++NumOrigBackwardBranches; 757 758 if (opts::SctcMode == opts::SctcAlways) 759 return true; 760 761 if (opts::SctcMode == opts::SctcPreserveDirection) 762 return IsForward == DirectionFlag; 763 764 const ErrorOr<std::pair<double, double>> Frequency = 765 PredBB->getBranchStats(BB); 766 767 // It's ok to rewrite the conditional branch if the new target will be 768 // a backward branch. 769 770 // If no data available for these branches, then it should be ok to 771 // do the optimization since it will reduce code size. 772 if (Frequency.getError()) 773 return true; 774 775 // TODO: should this use misprediction frequency instead? 776 const bool Result = (IsForward && Frequency.get().first >= 0.5) || 777 (!IsForward && Frequency.get().first <= 0.5); 778 779 return Result == DirectionFlag; 780 } 781 782 uint64_t SimplifyConditionalTailCalls::fixTailCalls(BinaryFunction &BF) { 783 // Need updated indices to correctly detect branch' direction. 784 BF.updateLayoutIndices(); 785 BF.markUnreachableBlocks(); 786 787 MCPlusBuilder *MIB = BF.getBinaryContext().MIB.get(); 788 MCContext *Ctx = BF.getBinaryContext().Ctx.get(); 789 uint64_t NumLocalCTCCandidates = 0; 790 uint64_t NumLocalCTCs = 0; 791 uint64_t LocalCTCTakenCount = 0; 792 uint64_t LocalCTCExecCount = 0; 793 std::vector<std::pair<BinaryBasicBlock *, const BinaryBasicBlock *>> 794 NeedsUncondBranch; 795 796 // Will block be deleted by UCE? 797 auto isValid = [](const BinaryBasicBlock *BB) { 798 return (BB->pred_size() != 0 || BB->isLandingPad() || BB->isEntryPoint()); 799 }; 800 801 for (BinaryBasicBlock *BB : BF.layout()) { 802 // Locate BB with a single direct tail-call instruction. 803 if (BB->getNumNonPseudos() != 1) 804 continue; 805 806 MCInst *Instr = BB->getFirstNonPseudoInstr(); 807 if (!MIB->isTailCall(*Instr) || MIB->isConditionalBranch(*Instr)) 808 continue; 809 810 const MCSymbol *CalleeSymbol = MIB->getTargetSymbol(*Instr); 811 if (!CalleeSymbol) 812 continue; 813 814 // Detect direction of the possible conditional tail call. 815 const bool IsForwardCTC = BF.isForwardCall(CalleeSymbol); 816 817 // Iterate through all predecessors. 818 for (BinaryBasicBlock *PredBB : BB->predecessors()) { 819 BinaryBasicBlock *CondSucc = PredBB->getConditionalSuccessor(true); 820 if (!CondSucc) 821 continue; 822 823 ++NumLocalCTCCandidates; 824 825 const MCSymbol *TBB = nullptr; 826 const MCSymbol *FBB = nullptr; 827 MCInst *CondBranch = nullptr; 828 MCInst *UncondBranch = nullptr; 829 bool Result = PredBB->analyzeBranch(TBB, FBB, CondBranch, UncondBranch); 830 831 // analyzeBranch() can fail due to unusual branch instructions, e.g. jrcxz 832 if (!Result) { 833 LLVM_DEBUG(dbgs() << "analyzeBranch failed in SCTC in block:\n"; 834 PredBB->dump()); 835 continue; 836 } 837 838 assert(Result && "internal error analyzing conditional branch"); 839 assert(CondBranch && "conditional branch expected"); 840 841 // It's possible that PredBB is also a successor to BB that may have 842 // been processed by a previous iteration of the SCTC loop, in which 843 // case it may have been marked invalid. We should skip rewriting in 844 // this case. 845 if (!PredBB->isValid()) { 846 assert(PredBB->isSuccessor(BB) && 847 "PredBB should be valid if it is not a successor to BB"); 848 continue; 849 } 850 851 // We don't want to reverse direction of the branch in new order 852 // without further profile analysis. 853 const bool DirectionFlag = CondSucc == BB ? IsForwardCTC : !IsForwardCTC; 854 if (!shouldRewriteBranch(PredBB, *CondBranch, BB, DirectionFlag)) 855 continue; 856 857 // Record this block so that we don't try to optimize it twice. 858 BeenOptimized.insert(PredBB); 859 860 uint64_t Count = 0; 861 if (CondSucc != BB) { 862 // Patch the new target address into the conditional branch. 863 MIB->reverseBranchCondition(*CondBranch, CalleeSymbol, Ctx); 864 // Since we reversed the condition on the branch we need to change 865 // the target for the unconditional branch or add a unconditional 866 // branch to the old target. This has to be done manually since 867 // fixupBranches is not called after SCTC. 868 NeedsUncondBranch.emplace_back(PredBB, CondSucc); 869 Count = PredBB->getFallthroughBranchInfo().Count; 870 } else { 871 // Change destination of the conditional branch. 872 MIB->replaceBranchTarget(*CondBranch, CalleeSymbol, Ctx); 873 Count = PredBB->getTakenBranchInfo().Count; 874 } 875 const uint64_t CTCTakenFreq = 876 Count == BinaryBasicBlock::COUNT_NO_PROFILE ? 0 : Count; 877 878 // Annotate it, so "isCall" returns true for this jcc 879 MIB->setConditionalTailCall(*CondBranch); 880 // Add info abount the conditional tail call frequency, otherwise this 881 // info will be lost when we delete the associated BranchInfo entry 882 auto &CTCAnnotation = 883 MIB->getOrCreateAnnotationAs<uint64_t>(*CondBranch, "CTCTakenCount"); 884 CTCAnnotation = CTCTakenFreq; 885 886 // Remove the unused successor which may be eliminated later 887 // if there are no other users. 888 PredBB->removeSuccessor(BB); 889 // Update BB execution count 890 if (CTCTakenFreq && CTCTakenFreq <= BB->getKnownExecutionCount()) 891 BB->setExecutionCount(BB->getExecutionCount() - CTCTakenFreq); 892 else if (CTCTakenFreq > BB->getKnownExecutionCount()) 893 BB->setExecutionCount(0); 894 895 ++NumLocalCTCs; 896 LocalCTCTakenCount += CTCTakenFreq; 897 LocalCTCExecCount += PredBB->getKnownExecutionCount(); 898 } 899 900 // Remove the block from CFG if all predecessors were removed. 901 BB->markValid(isValid(BB)); 902 } 903 904 // Add unconditional branches at the end of BBs to new successors 905 // as long as the successor is not a fallthrough. 906 for (auto &Entry : NeedsUncondBranch) { 907 BinaryBasicBlock *PredBB = Entry.first; 908 const BinaryBasicBlock *CondSucc = Entry.second; 909 910 const MCSymbol *TBB = nullptr; 911 const MCSymbol *FBB = nullptr; 912 MCInst *CondBranch = nullptr; 913 MCInst *UncondBranch = nullptr; 914 PredBB->analyzeBranch(TBB, FBB, CondBranch, UncondBranch); 915 916 // Find the next valid block. Invalid blocks will be deleted 917 // so they shouldn't be considered fallthrough targets. 918 const BinaryBasicBlock *NextBlock = BF.getBasicBlockAfter(PredBB, false); 919 while (NextBlock && !isValid(NextBlock)) 920 NextBlock = BF.getBasicBlockAfter(NextBlock, false); 921 922 // Get the unconditional successor to this block. 923 const BinaryBasicBlock *PredSucc = PredBB->getSuccessor(); 924 assert(PredSucc && "The other branch should be a tail call"); 925 926 const bool HasFallthrough = (NextBlock && PredSucc == NextBlock); 927 928 if (UncondBranch) { 929 if (HasFallthrough) 930 PredBB->eraseInstruction(PredBB->findInstruction(UncondBranch)); 931 else 932 MIB->replaceBranchTarget(*UncondBranch, CondSucc->getLabel(), Ctx); 933 } else if (!HasFallthrough) { 934 MCInst Branch; 935 MIB->createUncondBranch(Branch, CondSucc->getLabel(), Ctx); 936 PredBB->addInstruction(Branch); 937 } 938 } 939 940 if (NumLocalCTCs > 0) { 941 NumDoubleJumps += fixDoubleJumps(BF, true); 942 // Clean-up unreachable tail-call blocks. 943 const std::pair<unsigned, uint64_t> Stats = BF.eraseInvalidBBs(); 944 DeletedBlocks += Stats.first; 945 DeletedBytes += Stats.second; 946 947 assert(BF.validateCFG()); 948 } 949 950 LLVM_DEBUG(dbgs() << "BOLT: created " << NumLocalCTCs 951 << " conditional tail calls from a total of " 952 << NumLocalCTCCandidates << " candidates in function " << BF 953 << ". CTCs execution count for this function is " 954 << LocalCTCExecCount << " and CTC taken count is " 955 << LocalCTCTakenCount << "\n";); 956 957 NumTailCallsPatched += NumLocalCTCs; 958 NumCandidateTailCalls += NumLocalCTCCandidates; 959 CTCExecCount += LocalCTCExecCount; 960 CTCTakenCount += LocalCTCTakenCount; 961 962 return NumLocalCTCs > 0; 963 } 964 965 void SimplifyConditionalTailCalls::runOnFunctions(BinaryContext &BC) { 966 if (!BC.isX86()) 967 return; 968 969 for (auto &It : BC.getBinaryFunctions()) { 970 BinaryFunction &Function = It.second; 971 972 if (!shouldOptimize(Function)) 973 continue; 974 975 if (fixTailCalls(Function)) { 976 Modified.insert(&Function); 977 Function.setHasCanonicalCFG(false); 978 } 979 } 980 981 outs() << "BOLT-INFO: SCTC: patched " << NumTailCallsPatched 982 << " tail calls (" << NumOrigForwardBranches << " forward)" 983 << " tail calls (" << NumOrigBackwardBranches << " backward)" 984 << " from a total of " << NumCandidateTailCalls << " while removing " 985 << NumDoubleJumps << " double jumps" 986 << " and removing " << DeletedBlocks << " basic blocks" 987 << " totalling " << DeletedBytes 988 << " bytes of code. CTCs total execution count is " << CTCExecCount 989 << " and the number of times CTCs are taken is " << CTCTakenCount 990 << ".\n"; 991 } 992 993 uint64_t ShortenInstructions::shortenInstructions(BinaryFunction &Function) { 994 uint64_t Count = 0; 995 const BinaryContext &BC = Function.getBinaryContext(); 996 for (BinaryBasicBlock &BB : Function) { 997 for (MCInst &Inst : BB) { 998 MCInst OriginalInst; 999 if (opts::Verbosity > 2) 1000 OriginalInst = Inst; 1001 1002 if (!BC.MIB->shortenInstruction(Inst, *BC.STI)) 1003 continue; 1004 1005 if (opts::Verbosity > 2) { 1006 BC.scopeLock(); 1007 outs() << "BOLT-INFO: shortening:\nBOLT-INFO: "; 1008 BC.printInstruction(outs(), OriginalInst, 0, &Function); 1009 outs() << "BOLT-INFO: to:"; 1010 BC.printInstruction(outs(), Inst, 0, &Function); 1011 } 1012 1013 ++Count; 1014 } 1015 } 1016 1017 return Count; 1018 } 1019 1020 void ShortenInstructions::runOnFunctions(BinaryContext &BC) { 1021 std::atomic<uint64_t> NumShortened{0}; 1022 if (!BC.isX86()) 1023 return; 1024 1025 ParallelUtilities::runOnEachFunction( 1026 BC, ParallelUtilities::SchedulingPolicy::SP_INST_LINEAR, 1027 [&](BinaryFunction &BF) { NumShortened += shortenInstructions(BF); }, 1028 nullptr, "ShortenInstructions"); 1029 1030 outs() << "BOLT-INFO: " << NumShortened << " instructions were shortened\n"; 1031 } 1032 1033 void Peepholes::addTailcallTraps(BinaryFunction &Function) { 1034 MCPlusBuilder *MIB = Function.getBinaryContext().MIB.get(); 1035 for (BinaryBasicBlock &BB : Function) { 1036 MCInst *Inst = BB.getLastNonPseudoInstr(); 1037 if (Inst && MIB->isTailCall(*Inst) && MIB->isIndirectBranch(*Inst)) { 1038 MCInst Trap; 1039 if (MIB->createTrap(Trap)) { 1040 BB.addInstruction(Trap); 1041 ++TailCallTraps; 1042 } 1043 } 1044 } 1045 } 1046 1047 void Peepholes::removeUselessCondBranches(BinaryFunction &Function) { 1048 for (BinaryBasicBlock &BB : Function) { 1049 if (BB.succ_size() != 2) 1050 continue; 1051 1052 BinaryBasicBlock *CondBB = BB.getConditionalSuccessor(true); 1053 BinaryBasicBlock *UncondBB = BB.getConditionalSuccessor(false); 1054 if (CondBB != UncondBB) 1055 continue; 1056 1057 const MCSymbol *TBB = nullptr; 1058 const MCSymbol *FBB = nullptr; 1059 MCInst *CondBranch = nullptr; 1060 MCInst *UncondBranch = nullptr; 1061 bool Result = BB.analyzeBranch(TBB, FBB, CondBranch, UncondBranch); 1062 1063 // analyzeBranch() can fail due to unusual branch instructions, 1064 // e.g. jrcxz, or jump tables (indirect jump). 1065 if (!Result || !CondBranch) 1066 continue; 1067 1068 BB.removeDuplicateConditionalSuccessor(CondBranch); 1069 ++NumUselessCondBranches; 1070 } 1071 } 1072 1073 void Peepholes::runOnFunctions(BinaryContext &BC) { 1074 const char Opts = 1075 std::accumulate(opts::Peepholes.begin(), opts::Peepholes.end(), 0, 1076 [](const char A, const PeepholeOpts B) { return A | B; }); 1077 if (Opts == PEEP_NONE) 1078 return; 1079 1080 for (auto &It : BC.getBinaryFunctions()) { 1081 BinaryFunction &Function = It.second; 1082 if (shouldOptimize(Function)) { 1083 if (Opts & PEEP_DOUBLE_JUMPS) 1084 NumDoubleJumps += fixDoubleJumps(Function, false); 1085 if (Opts & PEEP_TAILCALL_TRAPS) 1086 addTailcallTraps(Function); 1087 if (Opts & PEEP_USELESS_BRANCHES) 1088 removeUselessCondBranches(Function); 1089 assert(Function.validateCFG()); 1090 } 1091 } 1092 outs() << "BOLT-INFO: Peephole: " << NumDoubleJumps 1093 << " double jumps patched.\n" 1094 << "BOLT-INFO: Peephole: " << TailCallTraps 1095 << " tail call traps inserted.\n" 1096 << "BOLT-INFO: Peephole: " << NumUselessCondBranches 1097 << " useless conditional branches removed.\n"; 1098 } 1099 1100 bool SimplifyRODataLoads::simplifyRODataLoads(BinaryFunction &BF) { 1101 BinaryContext &BC = BF.getBinaryContext(); 1102 MCPlusBuilder *MIB = BC.MIB.get(); 1103 1104 uint64_t NumLocalLoadsSimplified = 0; 1105 uint64_t NumDynamicLocalLoadsSimplified = 0; 1106 uint64_t NumLocalLoadsFound = 0; 1107 uint64_t NumDynamicLocalLoadsFound = 0; 1108 1109 for (BinaryBasicBlock *BB : BF.layout()) { 1110 for (MCInst &Inst : *BB) { 1111 unsigned Opcode = Inst.getOpcode(); 1112 const MCInstrDesc &Desc = BC.MII->get(Opcode); 1113 1114 // Skip instructions that do not load from memory. 1115 if (!Desc.mayLoad()) 1116 continue; 1117 1118 // Try to statically evaluate the target memory address; 1119 uint64_t TargetAddress; 1120 1121 if (MIB->hasPCRelOperand(Inst)) { 1122 // Try to find the symbol that corresponds to the PC-relative operand. 1123 MCOperand *DispOpI = MIB->getMemOperandDisp(Inst); 1124 assert(DispOpI != Inst.end() && "expected PC-relative displacement"); 1125 assert(DispOpI->isExpr() && 1126 "found PC-relative with non-symbolic displacement"); 1127 1128 // Get displacement symbol. 1129 const MCSymbol *DisplSymbol; 1130 uint64_t DisplOffset; 1131 1132 std::tie(DisplSymbol, DisplOffset) = 1133 MIB->getTargetSymbolInfo(DispOpI->getExpr()); 1134 1135 if (!DisplSymbol) 1136 continue; 1137 1138 // Look up the symbol address in the global symbols map of the binary 1139 // context object. 1140 BinaryData *BD = BC.getBinaryDataByName(DisplSymbol->getName()); 1141 if (!BD) 1142 continue; 1143 TargetAddress = BD->getAddress() + DisplOffset; 1144 } else if (!MIB->evaluateMemOperandTarget(Inst, TargetAddress)) { 1145 continue; 1146 } 1147 1148 // Get the contents of the section containing the target address of the 1149 // memory operand. We are only interested in read-only sections. 1150 ErrorOr<BinarySection &> DataSection = 1151 BC.getSectionForAddress(TargetAddress); 1152 if (!DataSection || !DataSection->isReadOnly()) 1153 continue; 1154 1155 if (BC.getRelocationAt(TargetAddress) || 1156 BC.getDynamicRelocationAt(TargetAddress)) 1157 continue; 1158 1159 uint32_t Offset = TargetAddress - DataSection->getAddress(); 1160 StringRef ConstantData = DataSection->getContents(); 1161 1162 ++NumLocalLoadsFound; 1163 if (BB->hasProfile()) 1164 NumDynamicLocalLoadsFound += BB->getExecutionCount(); 1165 1166 if (MIB->replaceMemOperandWithImm(Inst, ConstantData, Offset)) { 1167 ++NumLocalLoadsSimplified; 1168 if (BB->hasProfile()) 1169 NumDynamicLocalLoadsSimplified += BB->getExecutionCount(); 1170 } 1171 } 1172 } 1173 1174 NumLoadsFound += NumLocalLoadsFound; 1175 NumDynamicLoadsFound += NumDynamicLocalLoadsFound; 1176 NumLoadsSimplified += NumLocalLoadsSimplified; 1177 NumDynamicLoadsSimplified += NumDynamicLocalLoadsSimplified; 1178 1179 return NumLocalLoadsSimplified > 0; 1180 } 1181 1182 void SimplifyRODataLoads::runOnFunctions(BinaryContext &BC) { 1183 for (auto &It : BC.getBinaryFunctions()) { 1184 BinaryFunction &Function = It.second; 1185 if (shouldOptimize(Function) && simplifyRODataLoads(Function)) 1186 Modified.insert(&Function); 1187 } 1188 1189 outs() << "BOLT-INFO: simplified " << NumLoadsSimplified << " out of " 1190 << NumLoadsFound << " loads from a statically computed address.\n" 1191 << "BOLT-INFO: dynamic loads simplified: " << NumDynamicLoadsSimplified 1192 << "\n" 1193 << "BOLT-INFO: dynamic loads found: " << NumDynamicLoadsFound << "\n"; 1194 } 1195 1196 void AssignSections::runOnFunctions(BinaryContext &BC) { 1197 for (BinaryFunction *Function : BC.getInjectedBinaryFunctions()) { 1198 Function->setCodeSectionName(BC.getInjectedCodeSectionName()); 1199 Function->setColdCodeSectionName(BC.getInjectedColdCodeSectionName()); 1200 } 1201 1202 // In non-relocation mode functions have pre-assigned section names. 1203 if (!BC.HasRelocations) 1204 return; 1205 1206 const bool UseColdSection = 1207 BC.NumProfiledFuncs > 0 || 1208 opts::ReorderFunctions == ReorderFunctions::RT_USER; 1209 for (auto &BFI : BC.getBinaryFunctions()) { 1210 BinaryFunction &Function = BFI.second; 1211 if (opts::isHotTextMover(Function)) { 1212 Function.setCodeSectionName(BC.getHotTextMoverSectionName()); 1213 Function.setColdCodeSectionName(BC.getHotTextMoverSectionName()); 1214 continue; 1215 } 1216 1217 if (!UseColdSection || Function.hasValidIndex()) 1218 Function.setCodeSectionName(BC.getMainCodeSectionName()); 1219 else 1220 Function.setCodeSectionName(BC.getColdCodeSectionName()); 1221 1222 if (Function.isSplit()) 1223 Function.setColdCodeSectionName(BC.getColdCodeSectionName()); 1224 } 1225 } 1226 1227 void PrintProfileStats::runOnFunctions(BinaryContext &BC) { 1228 double FlowImbalanceMean = 0.0; 1229 size_t NumBlocksConsidered = 0; 1230 double WorstBias = 0.0; 1231 const BinaryFunction *WorstBiasFunc = nullptr; 1232 1233 // For each function CFG, we fill an IncomingMap with the sum of the frequency 1234 // of incoming edges for each BB. Likewise for each OutgoingMap and the sum 1235 // of the frequency of outgoing edges. 1236 using FlowMapTy = std::unordered_map<const BinaryBasicBlock *, uint64_t>; 1237 std::unordered_map<const BinaryFunction *, FlowMapTy> TotalIncomingMaps; 1238 std::unordered_map<const BinaryFunction *, FlowMapTy> TotalOutgoingMaps; 1239 1240 // Compute mean 1241 for (const auto &BFI : BC.getBinaryFunctions()) { 1242 const BinaryFunction &Function = BFI.second; 1243 if (Function.empty() || !Function.isSimple()) 1244 continue; 1245 FlowMapTy &IncomingMap = TotalIncomingMaps[&Function]; 1246 FlowMapTy &OutgoingMap = TotalOutgoingMaps[&Function]; 1247 for (const BinaryBasicBlock &BB : Function) { 1248 uint64_t TotalOutgoing = 0ULL; 1249 auto SuccBIIter = BB.branch_info_begin(); 1250 for (BinaryBasicBlock *Succ : BB.successors()) { 1251 uint64_t Count = SuccBIIter->Count; 1252 if (Count == BinaryBasicBlock::COUNT_NO_PROFILE || Count == 0) { 1253 ++SuccBIIter; 1254 continue; 1255 } 1256 TotalOutgoing += Count; 1257 IncomingMap[Succ] += Count; 1258 ++SuccBIIter; 1259 } 1260 OutgoingMap[&BB] = TotalOutgoing; 1261 } 1262 1263 size_t NumBlocks = 0; 1264 double Mean = 0.0; 1265 for (const BinaryBasicBlock &BB : Function) { 1266 // Do not compute score for low frequency blocks, entry or exit blocks 1267 if (IncomingMap[&BB] < 100 || OutgoingMap[&BB] == 0 || BB.isEntryPoint()) 1268 continue; 1269 ++NumBlocks; 1270 const double Difference = (double)OutgoingMap[&BB] - IncomingMap[&BB]; 1271 Mean += fabs(Difference / IncomingMap[&BB]); 1272 } 1273 1274 FlowImbalanceMean += Mean; 1275 NumBlocksConsidered += NumBlocks; 1276 if (!NumBlocks) 1277 continue; 1278 double FuncMean = Mean / NumBlocks; 1279 if (FuncMean > WorstBias) { 1280 WorstBias = FuncMean; 1281 WorstBiasFunc = &Function; 1282 } 1283 } 1284 if (NumBlocksConsidered > 0) 1285 FlowImbalanceMean /= NumBlocksConsidered; 1286 1287 // Compute standard deviation 1288 NumBlocksConsidered = 0; 1289 double FlowImbalanceVar = 0.0; 1290 for (const auto &BFI : BC.getBinaryFunctions()) { 1291 const BinaryFunction &Function = BFI.second; 1292 if (Function.empty() || !Function.isSimple()) 1293 continue; 1294 FlowMapTy &IncomingMap = TotalIncomingMaps[&Function]; 1295 FlowMapTy &OutgoingMap = TotalOutgoingMaps[&Function]; 1296 for (const BinaryBasicBlock &BB : Function) { 1297 if (IncomingMap[&BB] < 100 || OutgoingMap[&BB] == 0) 1298 continue; 1299 ++NumBlocksConsidered; 1300 const double Difference = (double)OutgoingMap[&BB] - IncomingMap[&BB]; 1301 FlowImbalanceVar += 1302 pow(fabs(Difference / IncomingMap[&BB]) - FlowImbalanceMean, 2); 1303 } 1304 } 1305 if (NumBlocksConsidered) { 1306 FlowImbalanceVar /= NumBlocksConsidered; 1307 FlowImbalanceVar = sqrt(FlowImbalanceVar); 1308 } 1309 1310 // Report to user 1311 outs() << format("BOLT-INFO: Profile bias score: %.4lf%% StDev: %.4lf%%\n", 1312 (100.0 * FlowImbalanceMean), (100.0 * FlowImbalanceVar)); 1313 if (WorstBiasFunc && opts::Verbosity >= 1) { 1314 outs() << "Worst average bias observed in " << WorstBiasFunc->getPrintName() 1315 << "\n"; 1316 LLVM_DEBUG(WorstBiasFunc->dump()); 1317 } 1318 } 1319 1320 void PrintProgramStats::runOnFunctions(BinaryContext &BC) { 1321 uint64_t NumRegularFunctions = 0; 1322 uint64_t NumStaleProfileFunctions = 0; 1323 uint64_t NumNonSimpleProfiledFunctions = 0; 1324 uint64_t NumUnknownControlFlowFunctions = 0; 1325 uint64_t TotalSampleCount = 0; 1326 uint64_t StaleSampleCount = 0; 1327 std::vector<const BinaryFunction *> ProfiledFunctions; 1328 const char *StaleFuncsHeader = "BOLT-INFO: Functions with stale profile:\n"; 1329 for (auto &BFI : BC.getBinaryFunctions()) { 1330 const BinaryFunction &Function = BFI.second; 1331 1332 // Ignore PLT functions for stats. 1333 if (Function.isPLTFunction()) 1334 continue; 1335 1336 ++NumRegularFunctions; 1337 1338 if (!Function.isSimple()) { 1339 if (Function.hasProfile()) 1340 ++NumNonSimpleProfiledFunctions; 1341 continue; 1342 } 1343 1344 if (Function.hasUnknownControlFlow()) { 1345 if (opts::PrintUnknownCFG) 1346 Function.dump(); 1347 else if (opts::PrintUnknown) 1348 errs() << "function with unknown control flow: " << Function << '\n'; 1349 1350 ++NumUnknownControlFlowFunctions; 1351 } 1352 1353 if (!Function.hasProfile()) 1354 continue; 1355 1356 uint64_t SampleCount = Function.getRawBranchCount(); 1357 TotalSampleCount += SampleCount; 1358 1359 if (Function.hasValidProfile()) { 1360 ProfiledFunctions.push_back(&Function); 1361 } else { 1362 if (opts::ReportStaleFuncs) { 1363 outs() << StaleFuncsHeader; 1364 StaleFuncsHeader = ""; 1365 outs() << " " << Function << '\n'; 1366 } 1367 ++NumStaleProfileFunctions; 1368 StaleSampleCount += SampleCount; 1369 } 1370 } 1371 BC.NumProfiledFuncs = ProfiledFunctions.size(); 1372 1373 const size_t NumAllProfiledFunctions = 1374 ProfiledFunctions.size() + NumStaleProfileFunctions; 1375 outs() << "BOLT-INFO: " << NumAllProfiledFunctions << " out of " 1376 << NumRegularFunctions << " functions in the binary (" 1377 << format("%.1f", NumAllProfiledFunctions / 1378 (float)NumRegularFunctions * 100.0f) 1379 << "%) have non-empty execution profile\n"; 1380 if (NumNonSimpleProfiledFunctions) { 1381 outs() << "BOLT-INFO: " << NumNonSimpleProfiledFunctions << " function" 1382 << (NumNonSimpleProfiledFunctions == 1 ? "" : "s") 1383 << " with profile could not be optimized\n"; 1384 } 1385 if (NumStaleProfileFunctions) { 1386 const float PctStale = 1387 NumStaleProfileFunctions / (float)NumAllProfiledFunctions * 100.0f; 1388 auto printErrorOrWarning = [&]() { 1389 if (PctStale > opts::StaleThreshold) 1390 errs() << "BOLT-ERROR: "; 1391 else 1392 errs() << "BOLT-WARNING: "; 1393 }; 1394 printErrorOrWarning(); 1395 errs() << NumStaleProfileFunctions 1396 << format(" (%.1f%% of all profiled)", PctStale) << " function" 1397 << (NumStaleProfileFunctions == 1 ? "" : "s") 1398 << " have invalid (possibly stale) profile." 1399 " Use -report-stale to see the list.\n"; 1400 if (TotalSampleCount > 0) { 1401 printErrorOrWarning(); 1402 errs() << StaleSampleCount << " out of " << TotalSampleCount 1403 << " samples in the binary (" 1404 << format("%.1f", ((100.0f * StaleSampleCount) / TotalSampleCount)) 1405 << "%) belong to functions with invalid" 1406 " (possibly stale) profile.\n"; 1407 } 1408 if (PctStale > opts::StaleThreshold) { 1409 errs() << "BOLT-ERROR: stale functions exceed specified threshold of " 1410 << opts::StaleThreshold << "%. Exiting.\n"; 1411 exit(1); 1412 } 1413 } 1414 1415 if (const uint64_t NumUnusedObjects = BC.getNumUnusedProfiledObjects()) { 1416 outs() << "BOLT-INFO: profile for " << NumUnusedObjects 1417 << " objects was ignored\n"; 1418 } 1419 1420 if (ProfiledFunctions.size() > 10) { 1421 if (opts::Verbosity >= 1) { 1422 outs() << "BOLT-INFO: top called functions are:\n"; 1423 llvm::sort(ProfiledFunctions, 1424 [](const BinaryFunction *A, const BinaryFunction *B) { 1425 return B->getExecutionCount() < A->getExecutionCount(); 1426 }); 1427 auto SFI = ProfiledFunctions.begin(); 1428 auto SFIend = ProfiledFunctions.end(); 1429 for (unsigned I = 0u; I < opts::TopCalledLimit && SFI != SFIend; 1430 ++SFI, ++I) 1431 outs() << " " << **SFI << " : " << (*SFI)->getExecutionCount() << '\n'; 1432 } 1433 } 1434 1435 if (!opts::PrintSortedBy.empty() && 1436 !llvm::is_contained(opts::PrintSortedBy, DynoStats::FIRST_DYNO_STAT)) { 1437 1438 std::vector<const BinaryFunction *> Functions; 1439 std::map<const BinaryFunction *, DynoStats> Stats; 1440 1441 for (const auto &BFI : BC.getBinaryFunctions()) { 1442 const BinaryFunction &BF = BFI.second; 1443 if (shouldOptimize(BF) && BF.hasValidProfile()) { 1444 Functions.push_back(&BF); 1445 Stats.emplace(&BF, getDynoStats(BF)); 1446 } 1447 } 1448 1449 const bool SortAll = 1450 llvm::is_contained(opts::PrintSortedBy, DynoStats::LAST_DYNO_STAT); 1451 1452 const bool Ascending = 1453 opts::DynoStatsSortOrderOpt == opts::DynoStatsSortOrder::Ascending; 1454 1455 if (SortAll) { 1456 llvm::stable_sort(Functions, 1457 [Ascending, &Stats](const BinaryFunction *A, 1458 const BinaryFunction *B) { 1459 return Ascending ? Stats.at(A) < Stats.at(B) 1460 : Stats.at(B) < Stats.at(A); 1461 }); 1462 } else { 1463 llvm::stable_sort( 1464 Functions, [Ascending, &Stats](const BinaryFunction *A, 1465 const BinaryFunction *B) { 1466 const DynoStats &StatsA = Stats.at(A); 1467 const DynoStats &StatsB = Stats.at(B); 1468 return Ascending ? StatsA.lessThan(StatsB, opts::PrintSortedBy) 1469 : StatsB.lessThan(StatsA, opts::PrintSortedBy); 1470 }); 1471 } 1472 1473 outs() << "BOLT-INFO: top functions sorted by "; 1474 if (SortAll) { 1475 outs() << "dyno stats"; 1476 } else { 1477 outs() << "("; 1478 bool PrintComma = false; 1479 for (const DynoStats::Category Category : opts::PrintSortedBy) { 1480 if (PrintComma) 1481 outs() << ", "; 1482 outs() << DynoStats::Description(Category); 1483 PrintComma = true; 1484 } 1485 outs() << ")"; 1486 } 1487 1488 outs() << " are:\n"; 1489 auto SFI = Functions.begin(); 1490 for (unsigned I = 0; I < 100 && SFI != Functions.end(); ++SFI, ++I) { 1491 const DynoStats Stats = getDynoStats(**SFI); 1492 outs() << " " << **SFI; 1493 if (!SortAll) { 1494 outs() << " ("; 1495 bool PrintComma = false; 1496 for (const DynoStats::Category Category : opts::PrintSortedBy) { 1497 if (PrintComma) 1498 outs() << ", "; 1499 outs() << dynoStatsOptName(Category) << "=" << Stats[Category]; 1500 PrintComma = true; 1501 } 1502 outs() << ")"; 1503 } 1504 outs() << "\n"; 1505 } 1506 } 1507 1508 if (!BC.TrappedFunctions.empty()) { 1509 errs() << "BOLT-WARNING: " << BC.TrappedFunctions.size() << " function" 1510 << (BC.TrappedFunctions.size() > 1 ? "s" : "") 1511 << " will trap on entry. Use -trap-avx512=0 to disable" 1512 " traps."; 1513 if (opts::Verbosity >= 1 || BC.TrappedFunctions.size() <= 5) { 1514 errs() << '\n'; 1515 for (const BinaryFunction *Function : BC.TrappedFunctions) 1516 errs() << " " << *Function << '\n'; 1517 } else { 1518 errs() << " Use -v=1 to see the list.\n"; 1519 } 1520 } 1521 1522 // Print information on missed macro-fusion opportunities seen on input. 1523 if (BC.MissedMacroFusionPairs) { 1524 outs() << "BOLT-INFO: the input contains " << BC.MissedMacroFusionPairs 1525 << " (dynamic count : " << BC.MissedMacroFusionExecCount 1526 << ") opportunities for macro-fusion optimization"; 1527 switch (opts::AlignMacroOpFusion) { 1528 case MFT_NONE: 1529 outs() << ". Use -align-macro-fusion to fix.\n"; 1530 break; 1531 case MFT_HOT: 1532 outs() << ". Will fix instances on a hot path.\n"; 1533 break; 1534 case MFT_ALL: 1535 outs() << " that are going to be fixed\n"; 1536 break; 1537 } 1538 } 1539 1540 // Collect and print information about suboptimal code layout on input. 1541 if (opts::ReportBadLayout) { 1542 std::vector<const BinaryFunction *> SuboptimalFuncs; 1543 for (auto &BFI : BC.getBinaryFunctions()) { 1544 const BinaryFunction &BF = BFI.second; 1545 if (!BF.hasValidProfile()) 1546 continue; 1547 1548 const uint64_t HotThreshold = 1549 std::max<uint64_t>(BF.getKnownExecutionCount(), 1); 1550 bool HotSeen = false; 1551 for (const BinaryBasicBlock *BB : BF.rlayout()) { 1552 if (!HotSeen && BB->getKnownExecutionCount() > HotThreshold) { 1553 HotSeen = true; 1554 continue; 1555 } 1556 if (HotSeen && BB->getKnownExecutionCount() == 0) { 1557 SuboptimalFuncs.push_back(&BF); 1558 break; 1559 } 1560 } 1561 } 1562 1563 if (!SuboptimalFuncs.empty()) { 1564 llvm::sort(SuboptimalFuncs, 1565 [](const BinaryFunction *A, const BinaryFunction *B) { 1566 return A->getKnownExecutionCount() / A->getSize() > 1567 B->getKnownExecutionCount() / B->getSize(); 1568 }); 1569 1570 outs() << "BOLT-INFO: " << SuboptimalFuncs.size() 1571 << " functions have " 1572 "cold code in the middle of hot code. Top functions are:\n"; 1573 for (unsigned I = 0; 1574 I < std::min(static_cast<size_t>(opts::ReportBadLayout), 1575 SuboptimalFuncs.size()); 1576 ++I) 1577 SuboptimalFuncs[I]->print(outs()); 1578 } 1579 } 1580 1581 if (NumUnknownControlFlowFunctions) { 1582 outs() << "BOLT-INFO: " << NumUnknownControlFlowFunctions 1583 << " functions have instructions with unknown control flow"; 1584 if (!opts::PrintUnknown) 1585 outs() << ". Use -print-unknown to see the list."; 1586 outs() << '\n'; 1587 } 1588 } 1589 1590 void InstructionLowering::runOnFunctions(BinaryContext &BC) { 1591 for (auto &BFI : BC.getBinaryFunctions()) 1592 for (BinaryBasicBlock &BB : BFI.second) 1593 for (MCInst &Instruction : BB) 1594 BC.MIB->lowerTailCall(Instruction); 1595 } 1596 1597 void StripRepRet::runOnFunctions(BinaryContext &BC) { 1598 if (!BC.isX86()) 1599 return; 1600 1601 uint64_t NumPrefixesRemoved = 0; 1602 uint64_t NumBytesSaved = 0; 1603 for (auto &BFI : BC.getBinaryFunctions()) { 1604 for (BinaryBasicBlock &BB : BFI.second) { 1605 auto LastInstRIter = BB.getLastNonPseudo(); 1606 if (LastInstRIter == BB.rend() || !BC.MIB->isReturn(*LastInstRIter) || 1607 !BC.MIB->deleteREPPrefix(*LastInstRIter)) 1608 continue; 1609 1610 NumPrefixesRemoved += BB.getKnownExecutionCount(); 1611 ++NumBytesSaved; 1612 } 1613 } 1614 1615 if (NumBytesSaved) 1616 outs() << "BOLT-INFO: removed " << NumBytesSaved 1617 << " 'repz' prefixes" 1618 " with estimated execution count of " 1619 << NumPrefixesRemoved << " times.\n"; 1620 } 1621 1622 void InlineMemcpy::runOnFunctions(BinaryContext &BC) { 1623 if (!BC.isX86()) 1624 return; 1625 1626 uint64_t NumInlined = 0; 1627 uint64_t NumInlinedDyno = 0; 1628 for (auto &BFI : BC.getBinaryFunctions()) { 1629 for (BinaryBasicBlock &BB : BFI.second) { 1630 for (auto II = BB.begin(); II != BB.end(); ++II) { 1631 MCInst &Inst = *II; 1632 1633 if (!BC.MIB->isCall(Inst) || MCPlus::getNumPrimeOperands(Inst) != 1 || 1634 !Inst.getOperand(0).isExpr()) 1635 continue; 1636 1637 const MCSymbol *CalleeSymbol = BC.MIB->getTargetSymbol(Inst); 1638 if (CalleeSymbol->getName() != "memcpy" && 1639 CalleeSymbol->getName() != "memcpy@PLT" && 1640 CalleeSymbol->getName() != "_memcpy8") 1641 continue; 1642 1643 const bool IsMemcpy8 = (CalleeSymbol->getName() == "_memcpy8"); 1644 const bool IsTailCall = BC.MIB->isTailCall(Inst); 1645 1646 const InstructionListType NewCode = 1647 BC.MIB->createInlineMemcpy(IsMemcpy8); 1648 II = BB.replaceInstruction(II, NewCode); 1649 std::advance(II, NewCode.size() - 1); 1650 if (IsTailCall) { 1651 MCInst Return; 1652 BC.MIB->createReturn(Return); 1653 II = BB.insertInstruction(std::next(II), std::move(Return)); 1654 } 1655 1656 ++NumInlined; 1657 NumInlinedDyno += BB.getKnownExecutionCount(); 1658 } 1659 } 1660 } 1661 1662 if (NumInlined) { 1663 outs() << "BOLT-INFO: inlined " << NumInlined << " memcpy() calls"; 1664 if (NumInlinedDyno) 1665 outs() << ". The calls were executed " << NumInlinedDyno 1666 << " times based on profile."; 1667 outs() << '\n'; 1668 } 1669 } 1670 1671 bool SpecializeMemcpy1::shouldOptimize(const BinaryFunction &Function) const { 1672 if (!BinaryFunctionPass::shouldOptimize(Function)) 1673 return false; 1674 1675 for (const std::string &FunctionSpec : Spec) { 1676 StringRef FunctionName = StringRef(FunctionSpec).split(':').first; 1677 if (Function.hasNameRegex(FunctionName)) 1678 return true; 1679 } 1680 1681 return false; 1682 } 1683 1684 std::set<size_t> SpecializeMemcpy1::getCallSitesToOptimize( 1685 const BinaryFunction &Function) const { 1686 StringRef SitesString; 1687 for (const std::string &FunctionSpec : Spec) { 1688 StringRef FunctionName; 1689 std::tie(FunctionName, SitesString) = StringRef(FunctionSpec).split(':'); 1690 if (Function.hasNameRegex(FunctionName)) 1691 break; 1692 SitesString = ""; 1693 } 1694 1695 std::set<size_t> Sites; 1696 SmallVector<StringRef, 4> SitesVec; 1697 SitesString.split(SitesVec, ':'); 1698 for (StringRef SiteString : SitesVec) { 1699 if (SiteString.empty()) 1700 continue; 1701 size_t Result; 1702 if (!SiteString.getAsInteger(10, Result)) 1703 Sites.emplace(Result); 1704 } 1705 1706 return Sites; 1707 } 1708 1709 void SpecializeMemcpy1::runOnFunctions(BinaryContext &BC) { 1710 if (!BC.isX86()) 1711 return; 1712 1713 uint64_t NumSpecialized = 0; 1714 uint64_t NumSpecializedDyno = 0; 1715 for (auto &BFI : BC.getBinaryFunctions()) { 1716 BinaryFunction &Function = BFI.second; 1717 if (!shouldOptimize(Function)) 1718 continue; 1719 1720 std::set<size_t> CallsToOptimize = getCallSitesToOptimize(Function); 1721 auto shouldOptimize = [&](size_t N) { 1722 return CallsToOptimize.empty() || CallsToOptimize.count(N); 1723 }; 1724 1725 std::vector<BinaryBasicBlock *> Blocks(Function.pbegin(), Function.pend()); 1726 size_t CallSiteID = 0; 1727 for (BinaryBasicBlock *CurBB : Blocks) { 1728 for (auto II = CurBB->begin(); II != CurBB->end(); ++II) { 1729 MCInst &Inst = *II; 1730 1731 if (!BC.MIB->isCall(Inst) || MCPlus::getNumPrimeOperands(Inst) != 1 || 1732 !Inst.getOperand(0).isExpr()) 1733 continue; 1734 1735 const MCSymbol *CalleeSymbol = BC.MIB->getTargetSymbol(Inst); 1736 if (CalleeSymbol->getName() != "memcpy" && 1737 CalleeSymbol->getName() != "memcpy@PLT") 1738 continue; 1739 1740 if (BC.MIB->isTailCall(Inst)) 1741 continue; 1742 1743 ++CallSiteID; 1744 1745 if (!shouldOptimize(CallSiteID)) 1746 continue; 1747 1748 // Create a copy of a call to memcpy(dest, src, size). 1749 MCInst MemcpyInstr = Inst; 1750 1751 BinaryBasicBlock *OneByteMemcpyBB = CurBB->splitAt(II); 1752 1753 BinaryBasicBlock *NextBB = nullptr; 1754 if (OneByteMemcpyBB->getNumNonPseudos() > 1) { 1755 NextBB = OneByteMemcpyBB->splitAt(OneByteMemcpyBB->begin()); 1756 NextBB->eraseInstruction(NextBB->begin()); 1757 } else { 1758 NextBB = OneByteMemcpyBB->getSuccessor(); 1759 OneByteMemcpyBB->eraseInstruction(OneByteMemcpyBB->begin()); 1760 assert(NextBB && "unexpected call to memcpy() with no return"); 1761 } 1762 1763 BinaryBasicBlock *MemcpyBB = Function.addBasicBlock(); 1764 MemcpyBB->setOffset(CurBB->getInputOffset()); 1765 InstructionListType CmpJCC = 1766 BC.MIB->createCmpJE(BC.MIB->getIntArgRegister(2), 1, 1767 OneByteMemcpyBB->getLabel(), BC.Ctx.get()); 1768 CurBB->addInstructions(CmpJCC); 1769 CurBB->addSuccessor(MemcpyBB); 1770 1771 MemcpyBB->addInstruction(std::move(MemcpyInstr)); 1772 MemcpyBB->addSuccessor(NextBB); 1773 MemcpyBB->setCFIState(NextBB->getCFIState()); 1774 MemcpyBB->setExecutionCount(0); 1775 1776 // To prevent the actual call from being moved to cold, we set its 1777 // execution count to 1. 1778 if (CurBB->getKnownExecutionCount() > 0) 1779 MemcpyBB->setExecutionCount(1); 1780 1781 InstructionListType OneByteMemcpy = BC.MIB->createOneByteMemcpy(); 1782 OneByteMemcpyBB->addInstructions(OneByteMemcpy); 1783 1784 ++NumSpecialized; 1785 NumSpecializedDyno += CurBB->getKnownExecutionCount(); 1786 1787 CurBB = NextBB; 1788 1789 // Note: we don't expect the next instruction to be a call to memcpy. 1790 II = CurBB->begin(); 1791 } 1792 } 1793 } 1794 1795 if (NumSpecialized) { 1796 outs() << "BOLT-INFO: specialized " << NumSpecialized 1797 << " memcpy() call sites for size 1"; 1798 if (NumSpecializedDyno) 1799 outs() << ". The calls were executed " << NumSpecializedDyno 1800 << " times based on profile."; 1801 outs() << '\n'; 1802 } 1803 } 1804 1805 void RemoveNops::runOnFunction(BinaryFunction &BF) { 1806 const BinaryContext &BC = BF.getBinaryContext(); 1807 for (BinaryBasicBlock &BB : BF) { 1808 for (int64_t I = BB.size() - 1; I >= 0; --I) { 1809 MCInst &Inst = BB.getInstructionAtIndex(I); 1810 if (BC.MIB->isNoop(Inst) && BC.MIB->hasAnnotation(Inst, "NOP")) 1811 BB.eraseInstructionAtIndex(I); 1812 } 1813 } 1814 } 1815 1816 void RemoveNops::runOnFunctions(BinaryContext &BC) { 1817 ParallelUtilities::WorkFuncTy WorkFun = [&](BinaryFunction &BF) { 1818 runOnFunction(BF); 1819 }; 1820 1821 ParallelUtilities::PredicateTy SkipFunc = [&](const BinaryFunction &BF) { 1822 return BF.shouldPreserveNops(); 1823 }; 1824 1825 ParallelUtilities::runOnEachFunction( 1826 BC, ParallelUtilities::SchedulingPolicy::SP_INST_LINEAR, WorkFun, 1827 SkipFunc, "RemoveNops"); 1828 } 1829 1830 } // namespace bolt 1831 } // namespace llvm 1832