1 //===- bolt/Passes/BinaryPasses.cpp - Binary-level passes -----------------===//
2 //
3 // Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
4 // See https://llvm.org/LICENSE.txt for license information.
5 // SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
6 //
7 //===----------------------------------------------------------------------===//
8 //
9 // This file implements multiple passes for binary optimization and analysis.
10 //
11 //===----------------------------------------------------------------------===//
12 
13 #include "bolt/Passes/BinaryPasses.h"
14 #include "bolt/Core/ParallelUtilities.h"
15 #include "bolt/Passes/ReorderAlgorithm.h"
16 #include "bolt/Passes/ReorderFunctions.h"
17 #include "llvm/Support/CommandLine.h"
18 
19 #include <numeric>
20 #include <vector>
21 
22 #define DEBUG_TYPE "bolt-opts"
23 
24 using namespace llvm;
25 using namespace bolt;
26 
27 namespace {
28 
29 const char *dynoStatsOptName(const bolt::DynoStats::Category C) {
30   if (C == bolt::DynoStats::FIRST_DYNO_STAT)
31     return "none";
32   else if (C == bolt::DynoStats::LAST_DYNO_STAT)
33     return "all";
34 
35   static std::string OptNames[bolt::DynoStats::LAST_DYNO_STAT + 1];
36 
37   OptNames[C] = bolt::DynoStats::Description(C);
38 
39   std::replace(OptNames[C].begin(), OptNames[C].end(), ' ', '-');
40 
41   return OptNames[C].c_str();
42 }
43 
44 const char *dynoStatsOptDesc(const bolt::DynoStats::Category C) {
45   if (C == bolt::DynoStats::FIRST_DYNO_STAT)
46     return "unsorted";
47   else if (C == bolt::DynoStats::LAST_DYNO_STAT)
48     return "sorted by all stats";
49 
50   return bolt::DynoStats::Description(C);
51 }
52 
53 }
54 
55 namespace opts {
56 
57 extern cl::OptionCategory BoltCategory;
58 extern cl::OptionCategory BoltOptCategory;
59 
60 extern cl::opt<bolt::MacroFusionType> AlignMacroOpFusion;
61 extern cl::opt<unsigned> Verbosity;
62 extern cl::opt<bool> EnableBAT;
63 extern cl::opt<unsigned> ExecutionCountThreshold;
64 extern cl::opt<bool> UpdateDebugSections;
65 extern cl::opt<bolt::ReorderFunctions::ReorderType> ReorderFunctions;
66 
67 enum DynoStatsSortOrder : char {
68   Ascending,
69   Descending
70 };
71 
72 static cl::opt<DynoStatsSortOrder>
73 DynoStatsSortOrderOpt("print-sorted-by-order",
74   cl::desc("use ascending or descending order when printing functions "
75            "ordered by dyno stats"),
76   cl::ZeroOrMore,
77   cl::init(DynoStatsSortOrder::Descending),
78   cl::cat(BoltOptCategory));
79 
80 cl::list<std::string>
81 HotTextMoveSections("hot-text-move-sections",
82   cl::desc("list of sections containing functions used for hugifying hot text. "
83            "BOLT makes sure these functions are not placed on the same page as "
84            "the hot text. (default=\'.stub,.mover\')."),
85   cl::value_desc("sec1,sec2,sec3,..."),
86   cl::CommaSeparated,
87   cl::ZeroOrMore,
88   cl::cat(BoltCategory));
89 
90 bool isHotTextMover(const BinaryFunction &Function) {
91   for (std::string &SectionName : opts::HotTextMoveSections) {
92     if (Function.getOriginSectionName() &&
93         *Function.getOriginSectionName() == SectionName)
94       return true;
95   }
96 
97   return false;
98 }
99 
100 static cl::opt<bool>
101 MinBranchClusters("min-branch-clusters",
102   cl::desc("use a modified clustering algorithm geared towards minimizing "
103            "branches"),
104   cl::ZeroOrMore,
105   cl::Hidden,
106   cl::cat(BoltOptCategory));
107 
108 enum PeepholeOpts : char {
109   PEEP_NONE             = 0x0,
110   PEEP_DOUBLE_JUMPS     = 0x2,
111   PEEP_TAILCALL_TRAPS   = 0x4,
112   PEEP_USELESS_BRANCHES = 0x8,
113   PEEP_ALL              = 0xf
114 };
115 
116 static cl::list<PeepholeOpts>
117 Peepholes("peepholes",
118   cl::CommaSeparated,
119   cl::desc("enable peephole optimizations"),
120   cl::value_desc("opt1,opt2,opt3,..."),
121   cl::values(
122     clEnumValN(PEEP_NONE, "none", "disable peepholes"),
123     clEnumValN(PEEP_DOUBLE_JUMPS, "double-jumps",
124                "remove double jumps when able"),
125     clEnumValN(PEEP_TAILCALL_TRAPS, "tailcall-traps", "insert tail call traps"),
126     clEnumValN(PEEP_USELESS_BRANCHES, "useless-branches",
127                "remove useless conditional branches"),
128     clEnumValN(PEEP_ALL, "all", "enable all peephole optimizations")),
129   cl::ZeroOrMore,
130   cl::cat(BoltOptCategory));
131 
132 static cl::opt<unsigned>
133 PrintFuncStat("print-function-statistics",
134   cl::desc("print statistics about basic block ordering"),
135   cl::init(0),
136   cl::ZeroOrMore,
137   cl::cat(BoltOptCategory));
138 
139 static cl::list<bolt::DynoStats::Category>
140 PrintSortedBy("print-sorted-by",
141   cl::CommaSeparated,
142   cl::desc("print functions sorted by order of dyno stats"),
143   cl::value_desc("key1,key2,key3,..."),
144   cl::values(
145 #define D(name, ...)                                        \
146     clEnumValN(bolt::DynoStats::name,                     \
147                dynoStatsOptName(bolt::DynoStats::name),   \
148                dynoStatsOptDesc(bolt::DynoStats::name)),
149     DYNO_STATS
150 #undef D
151     clEnumValN(0xffff, ".", ".")
152     ),
153   cl::ZeroOrMore,
154   cl::cat(BoltOptCategory));
155 
156 static cl::opt<bool>
157 PrintUnknown("print-unknown",
158   cl::desc("print names of functions with unknown control flow"),
159   cl::init(false),
160   cl::ZeroOrMore,
161   cl::cat(BoltCategory),
162   cl::Hidden);
163 
164 static cl::opt<bool>
165 PrintUnknownCFG("print-unknown-cfg",
166   cl::desc("dump CFG of functions with unknown control flow"),
167   cl::init(false),
168   cl::ZeroOrMore,
169   cl::cat(BoltCategory),
170   cl::ReallyHidden);
171 
172 cl::opt<bolt::ReorderBasicBlocks::LayoutType> ReorderBlocks(
173     "reorder-blocks", cl::desc("change layout of basic blocks in a function"),
174     cl::init(bolt::ReorderBasicBlocks::LT_NONE),
175     cl::values(
176         clEnumValN(bolt::ReorderBasicBlocks::LT_NONE, "none",
177                    "do not reorder basic blocks"),
178         clEnumValN(bolt::ReorderBasicBlocks::LT_REVERSE, "reverse",
179                    "layout blocks in reverse order"),
180         clEnumValN(bolt::ReorderBasicBlocks::LT_OPTIMIZE, "normal",
181                    "perform optimal layout based on profile"),
182         clEnumValN(bolt::ReorderBasicBlocks::LT_OPTIMIZE_BRANCH,
183                    "branch-predictor",
184                    "perform optimal layout prioritizing branch "
185                    "predictions"),
186         clEnumValN(bolt::ReorderBasicBlocks::LT_OPTIMIZE_CACHE, "cache",
187                    "perform optimal layout prioritizing I-cache "
188                    "behavior"),
189         clEnumValN(bolt::ReorderBasicBlocks::LT_OPTIMIZE_EXT_TSP, "cache+",
190                    "perform layout optimizing I-cache behavior"),
191         clEnumValN(bolt::ReorderBasicBlocks::LT_OPTIMIZE_EXT_TSP, "ext-tsp",
192                    "perform layout optimizing I-cache behavior"),
193         clEnumValN(bolt::ReorderBasicBlocks::LT_OPTIMIZE_SHUFFLE,
194                    "cluster-shuffle", "perform random layout of clusters")),
195     cl::ZeroOrMore, cl::cat(BoltOptCategory));
196 
197 static cl::opt<unsigned>
198 ReportBadLayout("report-bad-layout",
199   cl::desc("print top <uint> functions with suboptimal code layout on input"),
200   cl::init(0),
201   cl::ZeroOrMore,
202   cl::Hidden,
203   cl::cat(BoltOptCategory));
204 
205 static cl::opt<bool>
206 ReportStaleFuncs("report-stale",
207   cl::desc("print the list of functions with stale profile"),
208   cl::init(false),
209   cl::ZeroOrMore,
210   cl::Hidden,
211   cl::cat(BoltOptCategory));
212 
213 enum SctcModes : char {
214   SctcAlways,
215   SctcPreserveDirection,
216   SctcHeuristic
217 };
218 
219 static cl::opt<SctcModes>
220 SctcMode("sctc-mode",
221   cl::desc("mode for simplify conditional tail calls"),
222   cl::init(SctcAlways),
223   cl::values(clEnumValN(SctcAlways, "always", "always perform sctc"),
224     clEnumValN(SctcPreserveDirection,
225       "preserve",
226       "only perform sctc when branch direction is "
227       "preserved"),
228     clEnumValN(SctcHeuristic,
229       "heuristic",
230       "use branch prediction data to control sctc")),
231   cl::ZeroOrMore,
232   cl::cat(BoltOptCategory));
233 
234 static cl::opt<unsigned>
235 StaleThreshold("stale-threshold",
236     cl::desc(
237       "maximum percentage of stale functions to tolerate (default: 100)"),
238     cl::init(100),
239     cl::Hidden,
240     cl::cat(BoltOptCategory));
241 
242 static cl::opt<unsigned>
243 TSPThreshold("tsp-threshold",
244   cl::desc("maximum number of hot basic blocks in a function for which to use "
245            "a precise TSP solution while re-ordering basic blocks"),
246   cl::init(10),
247   cl::ZeroOrMore,
248   cl::Hidden,
249   cl::cat(BoltOptCategory));
250 
251 static cl::opt<unsigned>
252 TopCalledLimit("top-called-limit",
253   cl::desc("maximum number of functions to print in top called "
254            "functions section"),
255   cl::init(100),
256   cl::ZeroOrMore,
257   cl::Hidden,
258   cl::cat(BoltCategory));
259 
260 } // namespace opts
261 
262 namespace llvm {
263 namespace bolt {
264 
265 bool BinaryFunctionPass::shouldOptimize(const BinaryFunction &BF) const {
266   return BF.isSimple() && BF.getState() == BinaryFunction::State::CFG &&
267          !BF.isIgnored();
268 }
269 
270 bool BinaryFunctionPass::shouldPrint(const BinaryFunction &BF) const {
271   return BF.isSimple() && !BF.isIgnored();
272 }
273 
274 void NormalizeCFG::runOnFunction(BinaryFunction &BF) {
275   uint64_t NumRemoved = 0;
276   uint64_t NumDuplicateEdges = 0;
277   uint64_t NeedsFixBranches = 0;
278   for (BinaryBasicBlock &BB : BF) {
279     if (!BB.empty())
280       continue;
281 
282     if (BB.isEntryPoint() || BB.isLandingPad())
283       continue;
284 
285     // Handle a dangling empty block.
286     if (BB.succ_size() == 0) {
287       // If an empty dangling basic block has a predecessor, it could be a
288       // result of codegen for __builtin_unreachable. In such case, do not
289       // remove the block.
290       if (BB.pred_size() == 0) {
291         BB.markValid(false);
292         ++NumRemoved;
293       }
294       continue;
295     }
296 
297     // The block should have just one successor.
298     BinaryBasicBlock *Successor = BB.getSuccessor();
299     assert(Successor && "invalid CFG encountered");
300 
301     // Redirect all predecessors to the successor block.
302     while (!BB.pred_empty()) {
303       BinaryBasicBlock *Predecessor = *BB.pred_begin();
304       if (Predecessor->hasJumpTable())
305         break;
306 
307       if (Predecessor == Successor)
308         break;
309 
310       BinaryBasicBlock::BinaryBranchInfo &BI = Predecessor->getBranchInfo(BB);
311       Predecessor->replaceSuccessor(&BB, Successor, BI.Count,
312                                     BI.MispredictedCount);
313       // We need to fix branches even if we failed to replace all successors
314       // and remove the block.
315       NeedsFixBranches = true;
316     }
317 
318     if (BB.pred_empty()) {
319       BB.removeAllSuccessors();
320       BB.markValid(false);
321       ++NumRemoved;
322     }
323   }
324 
325   if (NumRemoved)
326     BF.eraseInvalidBBs();
327 
328   // Check for duplicate successors. Do it after the empty block elimination as
329   // we can get more duplicate successors.
330   for (BinaryBasicBlock &BB : BF)
331     if (!BB.hasJumpTable() && BB.succ_size() == 2 &&
332         BB.getConditionalSuccessor(false) == BB.getConditionalSuccessor(true))
333       ++NumDuplicateEdges;
334 
335   // fixBranches() will get rid of duplicate edges and update jump instructions.
336   if (NumDuplicateEdges || NeedsFixBranches)
337     BF.fixBranches();
338 
339   NumDuplicateEdgesMerged += NumDuplicateEdges;
340   NumBlocksRemoved += NumRemoved;
341 }
342 
343 void NormalizeCFG::runOnFunctions(BinaryContext &BC) {
344   ParallelUtilities::runOnEachFunction(
345       BC, ParallelUtilities::SchedulingPolicy::SP_BB_LINEAR,
346       [&](BinaryFunction &BF) { runOnFunction(BF); },
347       [&](const BinaryFunction &BF) { return !shouldOptimize(BF); },
348       "NormalizeCFG");
349   if (NumBlocksRemoved)
350     outs() << "BOLT-INFO: removed " << NumBlocksRemoved << " empty block"
351            << (NumBlocksRemoved == 1 ? "" : "s") << '\n';
352   if (NumDuplicateEdgesMerged)
353     outs() << "BOLT-INFO: merged " << NumDuplicateEdgesMerged
354            << " duplicate CFG edge" << (NumDuplicateEdgesMerged == 1 ? "" : "s")
355            << '\n';
356 }
357 
358 void EliminateUnreachableBlocks::runOnFunction(BinaryFunction &Function) {
359   if (Function.layout_size() > 0) {
360     unsigned Count;
361     uint64_t Bytes;
362     Function.markUnreachableBlocks();
363     LLVM_DEBUG({
364       for (BinaryBasicBlock *BB : Function.layout()) {
365         if (!BB->isValid()) {
366           dbgs() << "BOLT-INFO: UCE found unreachable block " << BB->getName()
367                  << " in function " << Function << "\n";
368           Function.dump();
369         }
370       }
371     });
372     std::tie(Count, Bytes) = Function.eraseInvalidBBs();
373     DeletedBlocks += Count;
374     DeletedBytes += Bytes;
375     if (Count) {
376       Modified.insert(&Function);
377       if (opts::Verbosity > 0)
378         outs() << "BOLT-INFO: Removed " << Count
379                << " dead basic block(s) accounting for " << Bytes
380                << " bytes in function " << Function << '\n';
381     }
382   }
383 }
384 
385 void EliminateUnreachableBlocks::runOnFunctions(BinaryContext &BC) {
386   for (auto &It : BC.getBinaryFunctions()) {
387     BinaryFunction &Function = It.second;
388     if (shouldOptimize(Function))
389       runOnFunction(Function);
390   }
391 
392   outs() << "BOLT-INFO: UCE removed " << DeletedBlocks << " blocks and "
393          << DeletedBytes << " bytes of code.\n";
394 }
395 
396 bool ReorderBasicBlocks::shouldPrint(const BinaryFunction &BF) const {
397   return (BinaryFunctionPass::shouldPrint(BF) &&
398           opts::ReorderBlocks != ReorderBasicBlocks::LT_NONE);
399 }
400 
401 bool ReorderBasicBlocks::shouldOptimize(const BinaryFunction &BF) const {
402   // Apply execution count threshold
403   if (BF.getKnownExecutionCount() < opts::ExecutionCountThreshold)
404     return false;
405 
406   return BinaryFunctionPass::shouldOptimize(BF);
407 }
408 
409 void ReorderBasicBlocks::runOnFunctions(BinaryContext &BC) {
410   if (opts::ReorderBlocks == ReorderBasicBlocks::LT_NONE)
411     return;
412 
413   std::atomic<uint64_t> ModifiedFuncCount{0};
414 
415   ParallelUtilities::WorkFuncTy WorkFun = [&](BinaryFunction &BF) {
416     modifyFunctionLayout(BF, opts::ReorderBlocks, opts::MinBranchClusters);
417     if (BF.hasLayoutChanged())
418       ++ModifiedFuncCount;
419   };
420 
421   ParallelUtilities::PredicateTy SkipFunc = [&](const BinaryFunction &BF) {
422     return !shouldOptimize(BF);
423   };
424 
425   ParallelUtilities::runOnEachFunction(
426       BC, ParallelUtilities::SchedulingPolicy::SP_BB_LINEAR, WorkFun, SkipFunc,
427       "ReorderBasicBlocks");
428 
429   outs() << "BOLT-INFO: basic block reordering modified layout of "
430          << format("%zu (%.2lf%%) functions\n", ModifiedFuncCount.load(),
431                    100.0 * ModifiedFuncCount.load() /
432                        BC.getBinaryFunctions().size());
433 
434   if (opts::PrintFuncStat > 0) {
435     raw_ostream &OS = outs();
436     // Copy all the values into vector in order to sort them
437     std::map<uint64_t, BinaryFunction &> ScoreMap;
438     auto &BFs = BC.getBinaryFunctions();
439     for (auto It = BFs.begin(); It != BFs.end(); ++It)
440       ScoreMap.insert(std::pair<uint64_t, BinaryFunction &>(
441           It->second.getFunctionScore(), It->second));
442 
443     OS << "\nBOLT-INFO: Printing Function Statistics:\n\n";
444     OS << "           There are " << BFs.size() << " functions in total. \n";
445     OS << "           Number of functions being modified: "
446        << ModifiedFuncCount.load() << "\n";
447     OS << "           User asks for detailed information on top "
448        << opts::PrintFuncStat << " functions. (Ranked by function score)"
449        << "\n\n";
450     uint64_t I = 0;
451     for (std::map<uint64_t, BinaryFunction &>::reverse_iterator Rit =
452              ScoreMap.rbegin();
453          Rit != ScoreMap.rend() && I < opts::PrintFuncStat; ++Rit, ++I) {
454       BinaryFunction &Function = Rit->second;
455 
456       OS << "           Information for function of top: " << (I + 1) << ": \n";
457       OS << "             Function Score is: " << Function.getFunctionScore()
458          << "\n";
459       OS << "             There are " << Function.size()
460          << " number of blocks in this function.\n";
461       OS << "             There are " << Function.getInstructionCount()
462          << " number of instructions in this function.\n";
463       OS << "             The edit distance for this function is: "
464          << Function.getEditDistance() << "\n\n";
465     }
466   }
467 }
468 
469 void ReorderBasicBlocks::modifyFunctionLayout(BinaryFunction &BF,
470                                               LayoutType Type,
471                                               bool MinBranchClusters) const {
472   if (BF.size() == 0 || Type == LT_NONE)
473     return;
474 
475   BinaryFunction::BasicBlockOrderType NewLayout;
476   std::unique_ptr<ReorderAlgorithm> Algo;
477 
478   // Cannot do optimal layout without profile.
479   if (Type != LT_REVERSE && !BF.hasValidProfile())
480     return;
481 
482   if (Type == LT_REVERSE) {
483     Algo.reset(new ReverseReorderAlgorithm());
484   } else if (BF.size() <= opts::TSPThreshold && Type != LT_OPTIMIZE_SHUFFLE) {
485     // Work on optimal solution if problem is small enough
486     LLVM_DEBUG(dbgs() << "finding optimal block layout for " << BF << "\n");
487     Algo.reset(new TSPReorderAlgorithm());
488   } else {
489     LLVM_DEBUG(dbgs() << "running block layout heuristics on " << BF << "\n");
490 
491     std::unique_ptr<ClusterAlgorithm> CAlgo;
492     if (MinBranchClusters)
493       CAlgo.reset(new MinBranchGreedyClusterAlgorithm());
494     else
495       CAlgo.reset(new PHGreedyClusterAlgorithm());
496 
497     switch (Type) {
498     case LT_OPTIMIZE:
499       Algo.reset(new OptimizeReorderAlgorithm(std::move(CAlgo)));
500       break;
501 
502     case LT_OPTIMIZE_BRANCH:
503       Algo.reset(new OptimizeBranchReorderAlgorithm(std::move(CAlgo)));
504       break;
505 
506     case LT_OPTIMIZE_CACHE:
507       Algo.reset(new OptimizeCacheReorderAlgorithm(std::move(CAlgo)));
508       break;
509 
510     case LT_OPTIMIZE_EXT_TSP:
511       Algo.reset(new ExtTSPReorderAlgorithm());
512       break;
513 
514     case LT_OPTIMIZE_SHUFFLE:
515       Algo.reset(new RandomClusterReorderAlgorithm(std::move(CAlgo)));
516       break;
517 
518     default:
519       llvm_unreachable("unexpected layout type");
520     }
521   }
522 
523   Algo->reorderBasicBlocks(BF, NewLayout);
524 
525   BF.updateBasicBlockLayout(NewLayout);
526 }
527 
528 void FixupBranches::runOnFunctions(BinaryContext &BC) {
529   for (auto &It : BC.getBinaryFunctions()) {
530     BinaryFunction &Function = It.second;
531     if (!BC.shouldEmit(Function) || !Function.isSimple())
532       continue;
533 
534     Function.fixBranches();
535   }
536 }
537 
538 void FinalizeFunctions::runOnFunctions(BinaryContext &BC) {
539   ParallelUtilities::WorkFuncTy WorkFun = [&](BinaryFunction &BF) {
540     if (!BF.finalizeCFIState()) {
541       if (BC.HasRelocations) {
542         errs() << "BOLT-ERROR: unable to fix CFI state for function " << BF
543                << ". Exiting.\n";
544         exit(1);
545       }
546       BF.setSimple(false);
547       return;
548     }
549 
550     BF.setFinalized();
551 
552     // Update exception handling information.
553     BF.updateEHRanges();
554   };
555 
556   ParallelUtilities::PredicateTy SkipPredicate = [&](const BinaryFunction &BF) {
557     return !BC.shouldEmit(BF);
558   };
559 
560   ParallelUtilities::runOnEachFunction(
561       BC, ParallelUtilities::SchedulingPolicy::SP_CONSTANT, WorkFun,
562       SkipPredicate, "FinalizeFunctions");
563 }
564 
565 void CheckLargeFunctions::runOnFunctions(BinaryContext &BC) {
566   if (BC.HasRelocations)
567     return;
568 
569   if (!opts::UpdateDebugSections)
570     return;
571 
572   // If the function wouldn't fit, mark it as non-simple. Otherwise, we may emit
573   // incorrect debug info.
574   ParallelUtilities::WorkFuncTy WorkFun = [&](BinaryFunction &BF) {
575     uint64_t HotSize, ColdSize;
576     std::tie(HotSize, ColdSize) =
577         BC.calculateEmittedSize(BF, /*FixBranches=*/false);
578     if (HotSize > BF.getMaxSize())
579       BF.setSimple(false);
580   };
581 
582   ParallelUtilities::PredicateTy SkipFunc = [&](const BinaryFunction &BF) {
583     return !shouldOptimize(BF);
584   };
585 
586   ParallelUtilities::runOnEachFunction(
587       BC, ParallelUtilities::SchedulingPolicy::SP_INST_LINEAR, WorkFun,
588       SkipFunc, "CheckLargeFunctions");
589 }
590 
591 bool CheckLargeFunctions::shouldOptimize(const BinaryFunction &BF) const {
592   // Unlike other passes, allow functions in non-CFG state.
593   return BF.isSimple() && !BF.isIgnored();
594 }
595 
596 void LowerAnnotations::runOnFunctions(BinaryContext &BC) {
597   std::vector<std::pair<MCInst *, uint32_t>> PreservedOffsetAnnotations;
598 
599   for (auto &It : BC.getBinaryFunctions()) {
600     BinaryFunction &BF = It.second;
601     int64_t CurrentGnuArgsSize = 0;
602 
603     // Have we crossed hot/cold border for split functions?
604     bool SeenCold = false;
605 
606     for (BinaryBasicBlock *BB : BF.layout()) {
607       if (BB->isCold() && !SeenCold) {
608         SeenCold = true;
609         CurrentGnuArgsSize = 0;
610       }
611 
612       // First convert GnuArgsSize annotations into CFIs. This may change instr
613       // pointers, so do it before recording ptrs for preserved annotations
614       if (BF.usesGnuArgsSize()) {
615         for (auto II = BB->begin(); II != BB->end(); ++II) {
616           if (!BC.MIB->isInvoke(*II))
617             continue;
618           const int64_t NewGnuArgsSize = BC.MIB->getGnuArgsSize(*II);
619           assert(NewGnuArgsSize >= 0 && "expected non-negative GNU_args_size");
620           if (NewGnuArgsSize != CurrentGnuArgsSize) {
621             auto InsertII = BF.addCFIInstruction(
622                 BB, II,
623                 MCCFIInstruction::createGnuArgsSize(nullptr, NewGnuArgsSize));
624             CurrentGnuArgsSize = NewGnuArgsSize;
625             II = std::next(InsertII);
626           }
627         }
628       }
629 
630       // Now record preserved annotations separately and then strip annotations.
631       for (auto II = BB->begin(); II != BB->end(); ++II) {
632         if (BF.requiresAddressTranslation() &&
633             BC.MIB->hasAnnotation(*II, "Offset"))
634           PreservedOffsetAnnotations.emplace_back(
635               &(*II), BC.MIB->getAnnotationAs<uint32_t>(*II, "Offset"));
636         BC.MIB->stripAnnotations(*II);
637       }
638     }
639   }
640   for (BinaryFunction *BF : BC.getInjectedBinaryFunctions())
641     for (BinaryBasicBlock &BB : *BF)
642       for (MCInst &Instruction : BB)
643         BC.MIB->stripAnnotations(Instruction);
644 
645   // Release all memory taken by annotations
646   BC.MIB->freeAnnotations();
647 
648   // Reinsert preserved annotations we need during code emission.
649   for (const std::pair<MCInst *, uint32_t> &Item : PreservedOffsetAnnotations)
650     BC.MIB->addAnnotation<uint32_t>(*Item.first, "Offset", Item.second);
651 }
652 
653 namespace {
654 
655 // This peephole fixes jump instructions that jump to another basic
656 // block with a single jump instruction, e.g.
657 //
658 // B0: ...
659 //     jmp  B1   (or jcc B1)
660 //
661 // B1: jmp  B2
662 //
663 // ->
664 //
665 // B0: ...
666 //     jmp  B2   (or jcc B2)
667 //
668 uint64_t fixDoubleJumps(BinaryFunction &Function, bool MarkInvalid) {
669   uint64_t NumDoubleJumps = 0;
670 
671   MCContext *Ctx = Function.getBinaryContext().Ctx.get();
672   MCPlusBuilder *MIB = Function.getBinaryContext().MIB.get();
673   for (BinaryBasicBlock &BB : Function) {
674     auto checkAndPatch = [&](BinaryBasicBlock *Pred, BinaryBasicBlock *Succ,
675                              const MCSymbol *SuccSym) {
676       // Ignore infinite loop jumps or fallthrough tail jumps.
677       if (Pred == Succ || Succ == &BB)
678         return false;
679 
680       if (Succ) {
681         const MCSymbol *TBB = nullptr;
682         const MCSymbol *FBB = nullptr;
683         MCInst *CondBranch = nullptr;
684         MCInst *UncondBranch = nullptr;
685         bool Res = Pred->analyzeBranch(TBB, FBB, CondBranch, UncondBranch);
686         if (!Res) {
687           LLVM_DEBUG(dbgs() << "analyzeBranch failed in peepholes in block:\n";
688                      Pred->dump());
689           return false;
690         }
691         Pred->replaceSuccessor(&BB, Succ);
692 
693         // We must patch up any existing branch instructions to match up
694         // with the new successor.
695         assert((CondBranch || (!CondBranch && Pred->succ_size() == 1)) &&
696                "Predecessor block has inconsistent number of successors");
697         if (CondBranch && MIB->getTargetSymbol(*CondBranch) == BB.getLabel()) {
698           MIB->replaceBranchTarget(*CondBranch, Succ->getLabel(), Ctx);
699         } else if (UncondBranch &&
700                    MIB->getTargetSymbol(*UncondBranch) == BB.getLabel()) {
701           MIB->replaceBranchTarget(*UncondBranch, Succ->getLabel(), Ctx);
702         } else if (!UncondBranch) {
703           assert(Function.getBasicBlockAfter(Pred, false) != Succ &&
704                  "Don't add an explicit jump to a fallthrough block.");
705           Pred->addBranchInstruction(Succ);
706         }
707       } else {
708         // Succ will be null in the tail call case.  In this case we
709         // need to explicitly add a tail call instruction.
710         MCInst *Branch = Pred->getLastNonPseudoInstr();
711         if (Branch && MIB->isUnconditionalBranch(*Branch)) {
712           assert(MIB->getTargetSymbol(*Branch) == BB.getLabel());
713           Pred->removeSuccessor(&BB);
714           Pred->eraseInstruction(Pred->findInstruction(Branch));
715           Pred->addTailCallInstruction(SuccSym);
716         } else {
717           return false;
718         }
719       }
720 
721       ++NumDoubleJumps;
722       LLVM_DEBUG(dbgs() << "Removed double jump in " << Function << " from "
723                         << Pred->getName() << " -> " << BB.getName() << " to "
724                         << Pred->getName() << " -> " << SuccSym->getName()
725                         << (!Succ ? " (tail)\n" : "\n"));
726 
727       return true;
728     };
729 
730     if (BB.getNumNonPseudos() != 1 || BB.isLandingPad())
731       continue;
732 
733     MCInst *Inst = BB.getFirstNonPseudoInstr();
734     const bool IsTailCall = MIB->isTailCall(*Inst);
735 
736     if (!MIB->isUnconditionalBranch(*Inst) && !IsTailCall)
737       continue;
738 
739     // If we operate after SCTC make sure it's not a conditional tail call.
740     if (IsTailCall && MIB->isConditionalBranch(*Inst))
741       continue;
742 
743     const MCSymbol *SuccSym = MIB->getTargetSymbol(*Inst);
744     BinaryBasicBlock *Succ = BB.getSuccessor();
745 
746     if (((!Succ || &BB == Succ) && !IsTailCall) || (IsTailCall && !SuccSym))
747       continue;
748 
749     std::vector<BinaryBasicBlock *> Preds = {BB.pred_begin(), BB.pred_end()};
750 
751     for (BinaryBasicBlock *Pred : Preds) {
752       if (Pred->isLandingPad())
753         continue;
754 
755       if (Pred->getSuccessor() == &BB ||
756           (Pred->getConditionalSuccessor(true) == &BB && !IsTailCall) ||
757           Pred->getConditionalSuccessor(false) == &BB)
758         if (checkAndPatch(Pred, Succ, SuccSym) && MarkInvalid)
759           BB.markValid(BB.pred_size() != 0 || BB.isLandingPad() ||
760                        BB.isEntryPoint());
761     }
762   }
763 
764   return NumDoubleJumps;
765 }
766 } // namespace
767 
768 bool SimplifyConditionalTailCalls::shouldRewriteBranch(
769     const BinaryBasicBlock *PredBB, const MCInst &CondBranch,
770     const BinaryBasicBlock *BB, const bool DirectionFlag) {
771   if (BeenOptimized.count(PredBB))
772     return false;
773 
774   const bool IsForward = BinaryFunction::isForwardBranch(PredBB, BB);
775 
776   if (IsForward)
777     ++NumOrigForwardBranches;
778   else
779     ++NumOrigBackwardBranches;
780 
781   if (opts::SctcMode == opts::SctcAlways)
782     return true;
783 
784   if (opts::SctcMode == opts::SctcPreserveDirection)
785     return IsForward == DirectionFlag;
786 
787   const ErrorOr<std::pair<double, double>> Frequency =
788       PredBB->getBranchStats(BB);
789 
790   // It's ok to rewrite the conditional branch if the new target will be
791   // a backward branch.
792 
793   // If no data available for these branches, then it should be ok to
794   // do the optimization since it will reduce code size.
795   if (Frequency.getError())
796     return true;
797 
798   // TODO: should this use misprediction frequency instead?
799   const bool Result = (IsForward && Frequency.get().first >= 0.5) ||
800                       (!IsForward && Frequency.get().first <= 0.5);
801 
802   return Result == DirectionFlag;
803 }
804 
805 uint64_t SimplifyConditionalTailCalls::fixTailCalls(BinaryFunction &BF) {
806   // Need updated indices to correctly detect branch' direction.
807   BF.updateLayoutIndices();
808   BF.markUnreachableBlocks();
809 
810   MCPlusBuilder *MIB = BF.getBinaryContext().MIB.get();
811   MCContext *Ctx = BF.getBinaryContext().Ctx.get();
812   uint64_t NumLocalCTCCandidates = 0;
813   uint64_t NumLocalCTCs = 0;
814   uint64_t LocalCTCTakenCount = 0;
815   uint64_t LocalCTCExecCount = 0;
816   std::vector<std::pair<BinaryBasicBlock *, const BinaryBasicBlock *>>
817       NeedsUncondBranch;
818 
819   // Will block be deleted by UCE?
820   auto isValid = [](const BinaryBasicBlock *BB) {
821     return (BB->pred_size() != 0 || BB->isLandingPad() || BB->isEntryPoint());
822   };
823 
824   for (BinaryBasicBlock *BB : BF.layout()) {
825     // Locate BB with a single direct tail-call instruction.
826     if (BB->getNumNonPseudos() != 1)
827       continue;
828 
829     MCInst *Instr = BB->getFirstNonPseudoInstr();
830     if (!MIB->isTailCall(*Instr) || MIB->isConditionalBranch(*Instr))
831       continue;
832 
833     const MCSymbol *CalleeSymbol = MIB->getTargetSymbol(*Instr);
834     if (!CalleeSymbol)
835       continue;
836 
837     // Detect direction of the possible conditional tail call.
838     const bool IsForwardCTC = BF.isForwardCall(CalleeSymbol);
839 
840     // Iterate through all predecessors.
841     for (BinaryBasicBlock *PredBB : BB->predecessors()) {
842       BinaryBasicBlock *CondSucc = PredBB->getConditionalSuccessor(true);
843       if (!CondSucc)
844         continue;
845 
846       ++NumLocalCTCCandidates;
847 
848       const MCSymbol *TBB = nullptr;
849       const MCSymbol *FBB = nullptr;
850       MCInst *CondBranch = nullptr;
851       MCInst *UncondBranch = nullptr;
852       bool Result = PredBB->analyzeBranch(TBB, FBB, CondBranch, UncondBranch);
853 
854       // analyzeBranch() can fail due to unusual branch instructions, e.g. jrcxz
855       if (!Result) {
856         LLVM_DEBUG(dbgs() << "analyzeBranch failed in SCTC in block:\n";
857                    PredBB->dump());
858         continue;
859       }
860 
861       assert(Result && "internal error analyzing conditional branch");
862       assert(CondBranch && "conditional branch expected");
863 
864       // It's possible that PredBB is also a successor to BB that may have
865       // been processed by a previous iteration of the SCTC loop, in which
866       // case it may have been marked invalid.  We should skip rewriting in
867       // this case.
868       if (!PredBB->isValid()) {
869         assert(PredBB->isSuccessor(BB) &&
870                "PredBB should be valid if it is not a successor to BB");
871         continue;
872       }
873 
874       // We don't want to reverse direction of the branch in new order
875       // without further profile analysis.
876       const bool DirectionFlag = CondSucc == BB ? IsForwardCTC : !IsForwardCTC;
877       if (!shouldRewriteBranch(PredBB, *CondBranch, BB, DirectionFlag))
878         continue;
879 
880       // Record this block so that we don't try to optimize it twice.
881       BeenOptimized.insert(PredBB);
882 
883       uint64_t Count = 0;
884       if (CondSucc != BB) {
885         // Patch the new target address into the conditional branch.
886         MIB->reverseBranchCondition(*CondBranch, CalleeSymbol, Ctx);
887         // Since we reversed the condition on the branch we need to change
888         // the target for the unconditional branch or add a unconditional
889         // branch to the old target.  This has to be done manually since
890         // fixupBranches is not called after SCTC.
891         NeedsUncondBranch.emplace_back(PredBB, CondSucc);
892         Count = PredBB->getFallthroughBranchInfo().Count;
893       } else {
894         // Change destination of the conditional branch.
895         MIB->replaceBranchTarget(*CondBranch, CalleeSymbol, Ctx);
896         Count = PredBB->getTakenBranchInfo().Count;
897       }
898       const uint64_t CTCTakenFreq =
899           Count == BinaryBasicBlock::COUNT_NO_PROFILE ? 0 : Count;
900 
901       // Annotate it, so "isCall" returns true for this jcc
902       MIB->setConditionalTailCall(*CondBranch);
903       // Add info abount the conditional tail call frequency, otherwise this
904       // info will be lost when we delete the associated BranchInfo entry
905       auto &CTCAnnotation =
906           MIB->getOrCreateAnnotationAs<uint64_t>(*CondBranch, "CTCTakenCount");
907       CTCAnnotation = CTCTakenFreq;
908 
909       // Remove the unused successor which may be eliminated later
910       // if there are no other users.
911       PredBB->removeSuccessor(BB);
912       // Update BB execution count
913       if (CTCTakenFreq && CTCTakenFreq <= BB->getKnownExecutionCount())
914         BB->setExecutionCount(BB->getExecutionCount() - CTCTakenFreq);
915       else if (CTCTakenFreq > BB->getKnownExecutionCount())
916         BB->setExecutionCount(0);
917 
918       ++NumLocalCTCs;
919       LocalCTCTakenCount += CTCTakenFreq;
920       LocalCTCExecCount += PredBB->getKnownExecutionCount();
921     }
922 
923     // Remove the block from CFG if all predecessors were removed.
924     BB->markValid(isValid(BB));
925   }
926 
927   // Add unconditional branches at the end of BBs to new successors
928   // as long as the successor is not a fallthrough.
929   for (auto &Entry : NeedsUncondBranch) {
930     BinaryBasicBlock *PredBB = Entry.first;
931     const BinaryBasicBlock *CondSucc = Entry.second;
932 
933     const MCSymbol *TBB = nullptr;
934     const MCSymbol *FBB = nullptr;
935     MCInst *CondBranch = nullptr;
936     MCInst *UncondBranch = nullptr;
937     PredBB->analyzeBranch(TBB, FBB, CondBranch, UncondBranch);
938 
939     // Find the next valid block.  Invalid blocks will be deleted
940     // so they shouldn't be considered fallthrough targets.
941     const BinaryBasicBlock *NextBlock = BF.getBasicBlockAfter(PredBB, false);
942     while (NextBlock && !isValid(NextBlock))
943       NextBlock = BF.getBasicBlockAfter(NextBlock, false);
944 
945     // Get the unconditional successor to this block.
946     const BinaryBasicBlock *PredSucc = PredBB->getSuccessor();
947     assert(PredSucc && "The other branch should be a tail call");
948 
949     const bool HasFallthrough = (NextBlock && PredSucc == NextBlock);
950 
951     if (UncondBranch) {
952       if (HasFallthrough)
953         PredBB->eraseInstruction(PredBB->findInstruction(UncondBranch));
954       else
955         MIB->replaceBranchTarget(*UncondBranch, CondSucc->getLabel(), Ctx);
956     } else if (!HasFallthrough) {
957       MCInst Branch;
958       MIB->createUncondBranch(Branch, CondSucc->getLabel(), Ctx);
959       PredBB->addInstruction(Branch);
960     }
961   }
962 
963   if (NumLocalCTCs > 0) {
964     NumDoubleJumps += fixDoubleJumps(BF, true);
965     // Clean-up unreachable tail-call blocks.
966     const std::pair<unsigned, uint64_t> Stats = BF.eraseInvalidBBs();
967     DeletedBlocks += Stats.first;
968     DeletedBytes += Stats.second;
969 
970     assert(BF.validateCFG());
971   }
972 
973   LLVM_DEBUG(dbgs() << "BOLT: created " << NumLocalCTCs
974                     << " conditional tail calls from a total of "
975                     << NumLocalCTCCandidates << " candidates in function " << BF
976                     << ". CTCs execution count for this function is "
977                     << LocalCTCExecCount << " and CTC taken count is "
978                     << LocalCTCTakenCount << "\n";);
979 
980   NumTailCallsPatched += NumLocalCTCs;
981   NumCandidateTailCalls += NumLocalCTCCandidates;
982   CTCExecCount += LocalCTCExecCount;
983   CTCTakenCount += LocalCTCTakenCount;
984 
985   return NumLocalCTCs > 0;
986 }
987 
988 void SimplifyConditionalTailCalls::runOnFunctions(BinaryContext &BC) {
989   if (!BC.isX86())
990     return;
991 
992   for (auto &It : BC.getBinaryFunctions()) {
993     BinaryFunction &Function = It.second;
994 
995     if (!shouldOptimize(Function))
996       continue;
997 
998     if (fixTailCalls(Function)) {
999       Modified.insert(&Function);
1000       Function.setHasCanonicalCFG(false);
1001     }
1002   }
1003 
1004   outs() << "BOLT-INFO: SCTC: patched " << NumTailCallsPatched
1005          << " tail calls (" << NumOrigForwardBranches << " forward)"
1006          << " tail calls (" << NumOrigBackwardBranches << " backward)"
1007          << " from a total of " << NumCandidateTailCalls << " while removing "
1008          << NumDoubleJumps << " double jumps"
1009          << " and removing " << DeletedBlocks << " basic blocks"
1010          << " totalling " << DeletedBytes
1011          << " bytes of code. CTCs total execution count is " << CTCExecCount
1012          << " and the number of times CTCs are taken is " << CTCTakenCount
1013          << ".\n";
1014 }
1015 
1016 uint64_t ShortenInstructions::shortenInstructions(BinaryFunction &Function) {
1017   uint64_t Count = 0;
1018   const BinaryContext &BC = Function.getBinaryContext();
1019   for (BinaryBasicBlock &BB : Function) {
1020     for (MCInst &Inst : BB) {
1021       MCInst OriginalInst;
1022       if (opts::Verbosity > 2)
1023         OriginalInst = Inst;
1024 
1025       if (!BC.MIB->shortenInstruction(Inst))
1026         continue;
1027 
1028       if (opts::Verbosity > 2) {
1029         outs() << "BOLT-INFO: shortening:\nBOLT-INFO:    ";
1030         BC.printInstruction(outs(), OriginalInst, 0, &Function);
1031         outs() << "BOLT-INFO: to:";
1032         BC.printInstruction(outs(), Inst, 0, &Function);
1033       }
1034 
1035       ++Count;
1036     }
1037   }
1038 
1039   return Count;
1040 }
1041 
1042 void ShortenInstructions::runOnFunctions(BinaryContext &BC) {
1043   std::atomic<uint64_t> NumShortened{0};
1044   if (!BC.isX86())
1045     return;
1046 
1047   ParallelUtilities::runOnEachFunction(
1048       BC, ParallelUtilities::SchedulingPolicy::SP_INST_LINEAR,
1049       [&](BinaryFunction &BF) { NumShortened += shortenInstructions(BF); },
1050       nullptr, "ShortenInstructions");
1051 
1052   outs() << "BOLT-INFO: " << NumShortened << " instructions were shortened\n";
1053 }
1054 
1055 void Peepholes::addTailcallTraps(BinaryFunction &Function) {
1056   MCPlusBuilder *MIB = Function.getBinaryContext().MIB.get();
1057   for (BinaryBasicBlock &BB : Function) {
1058     MCInst *Inst = BB.getLastNonPseudoInstr();
1059     if (Inst && MIB->isTailCall(*Inst) && MIB->isIndirectBranch(*Inst)) {
1060       MCInst Trap;
1061       if (MIB->createTrap(Trap)) {
1062         BB.addInstruction(Trap);
1063         ++TailCallTraps;
1064       }
1065     }
1066   }
1067 }
1068 
1069 void Peepholes::removeUselessCondBranches(BinaryFunction &Function) {
1070   for (BinaryBasicBlock &BB : Function) {
1071     if (BB.succ_size() != 2)
1072       continue;
1073 
1074     BinaryBasicBlock *CondBB = BB.getConditionalSuccessor(true);
1075     BinaryBasicBlock *UncondBB = BB.getConditionalSuccessor(false);
1076     if (CondBB != UncondBB)
1077       continue;
1078 
1079     const MCSymbol *TBB = nullptr;
1080     const MCSymbol *FBB = nullptr;
1081     MCInst *CondBranch = nullptr;
1082     MCInst *UncondBranch = nullptr;
1083     bool Result = BB.analyzeBranch(TBB, FBB, CondBranch, UncondBranch);
1084 
1085     // analyzeBranch() can fail due to unusual branch instructions,
1086     // e.g. jrcxz, or jump tables (indirect jump).
1087     if (!Result || !CondBranch)
1088       continue;
1089 
1090     BB.removeDuplicateConditionalSuccessor(CondBranch);
1091     ++NumUselessCondBranches;
1092   }
1093 }
1094 
1095 void Peepholes::runOnFunctions(BinaryContext &BC) {
1096   const char Opts = std::accumulate(
1097       opts::Peepholes.begin(), opts::Peepholes.end(), 0,
1098       [](const char A, const opts::PeepholeOpts B) { return A | B; });
1099   if (Opts == opts::PEEP_NONE || !BC.isX86())
1100     return;
1101 
1102   for (auto &It : BC.getBinaryFunctions()) {
1103     BinaryFunction &Function = It.second;
1104     if (shouldOptimize(Function)) {
1105       if (Opts & opts::PEEP_DOUBLE_JUMPS)
1106         NumDoubleJumps += fixDoubleJumps(Function, false);
1107       if (Opts & opts::PEEP_TAILCALL_TRAPS)
1108         addTailcallTraps(Function);
1109       if (Opts & opts::PEEP_USELESS_BRANCHES)
1110         removeUselessCondBranches(Function);
1111       assert(Function.validateCFG());
1112     }
1113   }
1114   outs() << "BOLT-INFO: Peephole: " << NumDoubleJumps
1115          << " double jumps patched.\n"
1116          << "BOLT-INFO: Peephole: " << TailCallTraps
1117          << " tail call traps inserted.\n"
1118          << "BOLT-INFO: Peephole: " << NumUselessCondBranches
1119          << " useless conditional branches removed.\n";
1120 }
1121 
1122 bool SimplifyRODataLoads::simplifyRODataLoads(BinaryFunction &BF) {
1123   BinaryContext &BC = BF.getBinaryContext();
1124   MCPlusBuilder *MIB = BC.MIB.get();
1125 
1126   uint64_t NumLocalLoadsSimplified = 0;
1127   uint64_t NumDynamicLocalLoadsSimplified = 0;
1128   uint64_t NumLocalLoadsFound = 0;
1129   uint64_t NumDynamicLocalLoadsFound = 0;
1130 
1131   for (BinaryBasicBlock *BB : BF.layout()) {
1132     for (MCInst &Inst : *BB) {
1133       unsigned Opcode = Inst.getOpcode();
1134       const MCInstrDesc &Desc = BC.MII->get(Opcode);
1135 
1136       // Skip instructions that do not load from memory.
1137       if (!Desc.mayLoad())
1138         continue;
1139 
1140       // Try to statically evaluate the target memory address;
1141       uint64_t TargetAddress;
1142 
1143       if (MIB->hasPCRelOperand(Inst)) {
1144         // Try to find the symbol that corresponds to the PC-relative operand.
1145         MCOperand *DispOpI = MIB->getMemOperandDisp(Inst);
1146         assert(DispOpI != Inst.end() && "expected PC-relative displacement");
1147         assert(DispOpI->isExpr() &&
1148                "found PC-relative with non-symbolic displacement");
1149 
1150         // Get displacement symbol.
1151         const MCSymbol *DisplSymbol;
1152         uint64_t DisplOffset;
1153 
1154         std::tie(DisplSymbol, DisplOffset) =
1155             MIB->getTargetSymbolInfo(DispOpI->getExpr());
1156 
1157         if (!DisplSymbol)
1158           continue;
1159 
1160         // Look up the symbol address in the global symbols map of the binary
1161         // context object.
1162         BinaryData *BD = BC.getBinaryDataByName(DisplSymbol->getName());
1163         if (!BD)
1164           continue;
1165         TargetAddress = BD->getAddress() + DisplOffset;
1166       } else if (!MIB->evaluateMemOperandTarget(Inst, TargetAddress)) {
1167         continue;
1168       }
1169 
1170       // Get the contents of the section containing the target address of the
1171       // memory operand. We are only interested in read-only sections.
1172       ErrorOr<BinarySection &> DataSection =
1173           BC.getSectionForAddress(TargetAddress);
1174       if (!DataSection || !DataSection->isReadOnly())
1175         continue;
1176 
1177       if (BC.getRelocationAt(TargetAddress) ||
1178           BC.getDynamicRelocationAt(TargetAddress))
1179         continue;
1180 
1181       uint32_t Offset = TargetAddress - DataSection->getAddress();
1182       StringRef ConstantData = DataSection->getContents();
1183 
1184       ++NumLocalLoadsFound;
1185       if (BB->hasProfile())
1186         NumDynamicLocalLoadsFound += BB->getExecutionCount();
1187 
1188       if (MIB->replaceMemOperandWithImm(Inst, ConstantData, Offset)) {
1189         ++NumLocalLoadsSimplified;
1190         if (BB->hasProfile())
1191           NumDynamicLocalLoadsSimplified += BB->getExecutionCount();
1192       }
1193     }
1194   }
1195 
1196   NumLoadsFound += NumLocalLoadsFound;
1197   NumDynamicLoadsFound += NumDynamicLocalLoadsFound;
1198   NumLoadsSimplified += NumLocalLoadsSimplified;
1199   NumDynamicLoadsSimplified += NumDynamicLocalLoadsSimplified;
1200 
1201   return NumLocalLoadsSimplified > 0;
1202 }
1203 
1204 void SimplifyRODataLoads::runOnFunctions(BinaryContext &BC) {
1205   for (auto &It : BC.getBinaryFunctions()) {
1206     BinaryFunction &Function = It.second;
1207     if (shouldOptimize(Function) && simplifyRODataLoads(Function))
1208       Modified.insert(&Function);
1209   }
1210 
1211   outs() << "BOLT-INFO: simplified " << NumLoadsSimplified << " out of "
1212          << NumLoadsFound << " loads from a statically computed address.\n"
1213          << "BOLT-INFO: dynamic loads simplified: " << NumDynamicLoadsSimplified
1214          << "\n"
1215          << "BOLT-INFO: dynamic loads found: " << NumDynamicLoadsFound << "\n";
1216 }
1217 
1218 void AssignSections::runOnFunctions(BinaryContext &BC) {
1219   for (BinaryFunction *Function : BC.getInjectedBinaryFunctions()) {
1220     Function->setCodeSectionName(BC.getInjectedCodeSectionName());
1221     Function->setColdCodeSectionName(BC.getInjectedColdCodeSectionName());
1222   }
1223 
1224   // In non-relocation mode functions have pre-assigned section names.
1225   if (!BC.HasRelocations)
1226     return;
1227 
1228   const bool UseColdSection =
1229       BC.NumProfiledFuncs > 0 ||
1230       opts::ReorderFunctions == ReorderFunctions::RT_USER;
1231   for (auto &BFI : BC.getBinaryFunctions()) {
1232     BinaryFunction &Function = BFI.second;
1233     if (opts::isHotTextMover(Function)) {
1234       Function.setCodeSectionName(BC.getHotTextMoverSectionName());
1235       Function.setColdCodeSectionName(BC.getHotTextMoverSectionName());
1236       continue;
1237     }
1238 
1239     if (!UseColdSection || Function.hasValidIndex() ||
1240         Function.hasValidProfile())
1241       Function.setCodeSectionName(BC.getMainCodeSectionName());
1242     else
1243       Function.setCodeSectionName(BC.getColdCodeSectionName());
1244 
1245     if (Function.isSplit())
1246       Function.setColdCodeSectionName(BC.getColdCodeSectionName());
1247   }
1248 }
1249 
1250 void PrintProfileStats::runOnFunctions(BinaryContext &BC) {
1251   double FlowImbalanceMean = 0.0;
1252   size_t NumBlocksConsidered = 0;
1253   double WorstBias = 0.0;
1254   const BinaryFunction *WorstBiasFunc = nullptr;
1255 
1256   // For each function CFG, we fill an IncomingMap with the sum of the frequency
1257   // of incoming edges for each BB. Likewise for each OutgoingMap and the sum
1258   // of the frequency of outgoing edges.
1259   using FlowMapTy = std::unordered_map<const BinaryBasicBlock *, uint64_t>;
1260   std::unordered_map<const BinaryFunction *, FlowMapTy> TotalIncomingMaps;
1261   std::unordered_map<const BinaryFunction *, FlowMapTy> TotalOutgoingMaps;
1262 
1263   // Compute mean
1264   for (const auto &BFI : BC.getBinaryFunctions()) {
1265     const BinaryFunction &Function = BFI.second;
1266     if (Function.empty() || !Function.isSimple())
1267       continue;
1268     FlowMapTy &IncomingMap = TotalIncomingMaps[&Function];
1269     FlowMapTy &OutgoingMap = TotalOutgoingMaps[&Function];
1270     for (const BinaryBasicBlock &BB : Function) {
1271       uint64_t TotalOutgoing = 0ULL;
1272       auto SuccBIIter = BB.branch_info_begin();
1273       for (BinaryBasicBlock *Succ : BB.successors()) {
1274         uint64_t Count = SuccBIIter->Count;
1275         if (Count == BinaryBasicBlock::COUNT_NO_PROFILE || Count == 0) {
1276           ++SuccBIIter;
1277           continue;
1278         }
1279         TotalOutgoing += Count;
1280         IncomingMap[Succ] += Count;
1281         ++SuccBIIter;
1282       }
1283       OutgoingMap[&BB] = TotalOutgoing;
1284     }
1285 
1286     size_t NumBlocks = 0;
1287     double Mean = 0.0;
1288     for (const BinaryBasicBlock &BB : Function) {
1289       // Do not compute score for low frequency blocks, entry or exit blocks
1290       if (IncomingMap[&BB] < 100 || OutgoingMap[&BB] == 0 || BB.isEntryPoint())
1291         continue;
1292       ++NumBlocks;
1293       const double Difference = (double)OutgoingMap[&BB] - IncomingMap[&BB];
1294       Mean += fabs(Difference / IncomingMap[&BB]);
1295     }
1296 
1297     FlowImbalanceMean += Mean;
1298     NumBlocksConsidered += NumBlocks;
1299     if (!NumBlocks)
1300       continue;
1301     double FuncMean = Mean / NumBlocks;
1302     if (FuncMean > WorstBias) {
1303       WorstBias = FuncMean;
1304       WorstBiasFunc = &Function;
1305     }
1306   }
1307   if (NumBlocksConsidered > 0)
1308     FlowImbalanceMean /= NumBlocksConsidered;
1309 
1310   // Compute standard deviation
1311   NumBlocksConsidered = 0;
1312   double FlowImbalanceVar = 0.0;
1313   for (const auto &BFI : BC.getBinaryFunctions()) {
1314     const BinaryFunction &Function = BFI.second;
1315     if (Function.empty() || !Function.isSimple())
1316       continue;
1317     FlowMapTy &IncomingMap = TotalIncomingMaps[&Function];
1318     FlowMapTy &OutgoingMap = TotalOutgoingMaps[&Function];
1319     for (const BinaryBasicBlock &BB : Function) {
1320       if (IncomingMap[&BB] < 100 || OutgoingMap[&BB] == 0)
1321         continue;
1322       ++NumBlocksConsidered;
1323       const double Difference = (double)OutgoingMap[&BB] - IncomingMap[&BB];
1324       FlowImbalanceVar +=
1325           pow(fabs(Difference / IncomingMap[&BB]) - FlowImbalanceMean, 2);
1326     }
1327   }
1328   if (NumBlocksConsidered) {
1329     FlowImbalanceVar /= NumBlocksConsidered;
1330     FlowImbalanceVar = sqrt(FlowImbalanceVar);
1331   }
1332 
1333   // Report to user
1334   outs() << format("BOLT-INFO: Profile bias score: %.4lf%% StDev: %.4lf%%\n",
1335                    (100.0 * FlowImbalanceMean), (100.0 * FlowImbalanceVar));
1336   if (WorstBiasFunc && opts::Verbosity >= 1) {
1337     outs() << "Worst average bias observed in " << WorstBiasFunc->getPrintName()
1338            << "\n";
1339     LLVM_DEBUG(WorstBiasFunc->dump());
1340   }
1341 }
1342 
1343 void PrintProgramStats::runOnFunctions(BinaryContext &BC) {
1344   uint64_t NumRegularFunctions = 0;
1345   uint64_t NumStaleProfileFunctions = 0;
1346   uint64_t NumNonSimpleProfiledFunctions = 0;
1347   uint64_t NumUnknownControlFlowFunctions = 0;
1348   uint64_t TotalSampleCount = 0;
1349   uint64_t StaleSampleCount = 0;
1350   std::vector<const BinaryFunction *> ProfiledFunctions;
1351   const char *StaleFuncsHeader = "BOLT-INFO: Functions with stale profile:\n";
1352   for (auto &BFI : BC.getBinaryFunctions()) {
1353     const BinaryFunction &Function = BFI.second;
1354 
1355     // Ignore PLT functions for stats.
1356     if (Function.isPLTFunction())
1357       continue;
1358 
1359     ++NumRegularFunctions;
1360 
1361     if (!Function.isSimple()) {
1362       if (Function.hasProfile())
1363         ++NumNonSimpleProfiledFunctions;
1364       continue;
1365     }
1366 
1367     if (Function.hasUnknownControlFlow()) {
1368       if (opts::PrintUnknownCFG)
1369         Function.dump();
1370       else if (opts::PrintUnknown)
1371         errs() << "function with unknown control flow: " << Function << '\n';
1372 
1373       ++NumUnknownControlFlowFunctions;
1374     }
1375 
1376     if (!Function.hasProfile())
1377       continue;
1378 
1379     uint64_t SampleCount = Function.getRawBranchCount();
1380     TotalSampleCount += SampleCount;
1381 
1382     if (Function.hasValidProfile()) {
1383       ProfiledFunctions.push_back(&Function);
1384     } else {
1385       if (opts::ReportStaleFuncs) {
1386         outs() << StaleFuncsHeader;
1387         StaleFuncsHeader = "";
1388         outs() << "  " << Function << '\n';
1389       }
1390       ++NumStaleProfileFunctions;
1391       StaleSampleCount += SampleCount;
1392     }
1393   }
1394   BC.NumProfiledFuncs = ProfiledFunctions.size();
1395 
1396   const size_t NumAllProfiledFunctions =
1397       ProfiledFunctions.size() + NumStaleProfileFunctions;
1398   outs() << "BOLT-INFO: " << NumAllProfiledFunctions << " out of "
1399          << NumRegularFunctions << " functions in the binary ("
1400          << format("%.1f", NumAllProfiledFunctions /
1401                                (float)NumRegularFunctions * 100.0f)
1402          << "%) have non-empty execution profile\n";
1403   if (NumNonSimpleProfiledFunctions) {
1404     outs() << "BOLT-INFO: " << NumNonSimpleProfiledFunctions << " function"
1405            << (NumNonSimpleProfiledFunctions == 1 ? "" : "s")
1406            << " with profile could not be optimized\n";
1407   }
1408   if (NumStaleProfileFunctions) {
1409     const float PctStale =
1410         NumStaleProfileFunctions / (float)NumAllProfiledFunctions * 100.0f;
1411     auto printErrorOrWarning = [&]() {
1412       if (PctStale > opts::StaleThreshold)
1413         errs() << "BOLT-ERROR: ";
1414       else
1415         errs() << "BOLT-WARNING: ";
1416     };
1417     printErrorOrWarning();
1418     errs() << NumStaleProfileFunctions
1419            << format(" (%.1f%% of all profiled)", PctStale) << " function"
1420            << (NumStaleProfileFunctions == 1 ? "" : "s")
1421            << " have invalid (possibly stale) profile."
1422               " Use -report-stale to see the list.\n";
1423     if (TotalSampleCount > 0) {
1424       printErrorOrWarning();
1425       errs() << StaleSampleCount << " out of " << TotalSampleCount
1426              << " samples in the binary ("
1427              << format("%.1f", ((100.0f * StaleSampleCount) / TotalSampleCount))
1428              << "%) belong to functions with invalid"
1429                 " (possibly stale) profile.\n";
1430     }
1431     if (PctStale > opts::StaleThreshold) {
1432       errs() << "BOLT-ERROR: stale functions exceed specified threshold of "
1433              << opts::StaleThreshold << "%. Exiting.\n";
1434       exit(1);
1435     }
1436   }
1437 
1438   if (const uint64_t NumUnusedObjects = BC.getNumUnusedProfiledObjects()) {
1439     outs() << "BOLT-INFO: profile for " << NumUnusedObjects
1440            << " objects was ignored\n";
1441   }
1442 
1443   if (ProfiledFunctions.size() > 10) {
1444     if (opts::Verbosity >= 1) {
1445       outs() << "BOLT-INFO: top called functions are:\n";
1446       std::sort(ProfiledFunctions.begin(), ProfiledFunctions.end(),
1447                 [](const BinaryFunction *A, const BinaryFunction *B) {
1448                   return B->getExecutionCount() < A->getExecutionCount();
1449                 });
1450       auto SFI = ProfiledFunctions.begin();
1451       auto SFIend = ProfiledFunctions.end();
1452       for (unsigned I = 0u; I < opts::TopCalledLimit && SFI != SFIend;
1453            ++SFI, ++I)
1454         outs() << "  " << **SFI << " : " << (*SFI)->getExecutionCount() << '\n';
1455     }
1456   }
1457 
1458   if (!opts::PrintSortedBy.empty() &&
1459       std::find(opts::PrintSortedBy.begin(), opts::PrintSortedBy.end(),
1460                 DynoStats::FIRST_DYNO_STAT) == opts::PrintSortedBy.end()) {
1461 
1462     std::vector<const BinaryFunction *> Functions;
1463     std::map<const BinaryFunction *, DynoStats> Stats;
1464 
1465     for (const auto &BFI : BC.getBinaryFunctions()) {
1466       const BinaryFunction &BF = BFI.second;
1467       if (shouldOptimize(BF) && BF.hasValidProfile()) {
1468         Functions.push_back(&BF);
1469         Stats.emplace(&BF, getDynoStats(BF));
1470       }
1471     }
1472 
1473     const bool SortAll =
1474         std::find(opts::PrintSortedBy.begin(), opts::PrintSortedBy.end(),
1475                   DynoStats::LAST_DYNO_STAT) != opts::PrintSortedBy.end();
1476 
1477     const bool Ascending =
1478         opts::DynoStatsSortOrderOpt == opts::DynoStatsSortOrder::Ascending;
1479 
1480     if (SortAll) {
1481       std::stable_sort(Functions.begin(), Functions.end(),
1482                        [Ascending, &Stats](const BinaryFunction *A,
1483                                            const BinaryFunction *B) {
1484                          return Ascending ? Stats.at(A) < Stats.at(B)
1485                                           : Stats.at(B) < Stats.at(A);
1486                        });
1487     } else {
1488       std::stable_sort(
1489           Functions.begin(), Functions.end(),
1490           [Ascending, &Stats](const BinaryFunction *A,
1491                               const BinaryFunction *B) {
1492             const DynoStats &StatsA = Stats.at(A);
1493             const DynoStats &StatsB = Stats.at(B);
1494             return Ascending ? StatsA.lessThan(StatsB, opts::PrintSortedBy)
1495                              : StatsB.lessThan(StatsA, opts::PrintSortedBy);
1496           });
1497     }
1498 
1499     outs() << "BOLT-INFO: top functions sorted by ";
1500     if (SortAll) {
1501       outs() << "dyno stats";
1502     } else {
1503       outs() << "(";
1504       bool PrintComma = false;
1505       for (const DynoStats::Category Category : opts::PrintSortedBy) {
1506         if (PrintComma)
1507           outs() << ", ";
1508         outs() << DynoStats::Description(Category);
1509         PrintComma = true;
1510       }
1511       outs() << ")";
1512     }
1513 
1514     outs() << " are:\n";
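    // Print up to 100 functions, annotating each with the stats it was sorted
    // by unless sorting by all stats was requested.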
1515     auto SFI = Functions.begin();
1516     for (unsigned I = 0; I < 100 && SFI != Functions.end(); ++SFI, ++I) {
1517       const DynoStats Stats = getDynoStats(**SFI);
1518       outs() << "  " << **SFI;
1519       if (!SortAll) {
1520         outs() << " (";
1521         bool PrintComma = false;
1522         for (const DynoStats::Category Category : opts::PrintSortedBy) {
1523           if (PrintComma)
1524             outs() << ", ";
1525           outs() << dynoStatsOptName(Category) << "=" << Stats[Category];
1526           PrintComma = true;
1527         }
1528         outs() << ")";
1529       }
1530       outs() << "\n";
1531     }
1532   }
1533 
1534   if (!BC.TrappedFunctions.empty()) {
1535     errs() << "BOLT-WARNING: " << BC.TrappedFunctions.size() << " function"
1536            << (BC.TrappedFunctions.size() > 1 ? "s" : "")
1537            << " will trap on entry. Use -trap-avx512=0 to disable"
1538               " traps.";
1539     if (opts::Verbosity >= 1 || BC.TrappedFunctions.size() <= 5) {
1540       errs() << '\n';
1541       for (const BinaryFunction *Function : BC.TrappedFunctions)
1542         errs() << "  " << *Function << '\n';
1543     } else {
1544       errs() << " Use -v=1 to see the list.\n";
1545     }
1546   }
1547 
1548   // Print information on missed macro-fusion opportunities seen on input.
1549   if (BC.MissedMacroFusionPairs) {
1550     outs() << "BOLT-INFO: the input contains " << BC.MissedMacroFusionPairs
1551            << " (dynamic count: " << BC.MissedMacroFusionExecCount
1552            << ") opportunities for macro-fusion optimization";
1553     switch (opts::AlignMacroOpFusion) {
1554     case MFT_NONE:
1555       outs() << ". Use -align-macro-fusion to fix.\n";
1556       break;
1557     case MFT_HOT:
1558       outs() << ". Will fix instances on a hot path.\n";
1559       break;
1560     case MFT_ALL:
1561       outs() << " that are going to be fixed.\n";
1562       break;
1563     }
1564   }
1565 
1566   // Collect and print information about suboptimal code layout on input.
1567   if (opts::ReportBadLayout) {
1568     std::vector<const BinaryFunction *> SuboptimalFuncs;
1569     for (auto &BFI : BC.getBinaryFunctions()) {
1570       const BinaryFunction &BF = BFI.second;
1571       if (!BF.hasValidProfile())
1572         continue;
1573 
1574       const uint64_t HotThreshold =
1575           std::max<uint64_t>(BF.getKnownExecutionCount(), 1);
1576       bool HotSeen = false;
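      // Walk the layout in reverse: a zero-count block that appears (in layout
      // order) before a block hotter than the function's entry count means
      // cold code sits in the middle of hot code.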
1577       for (const BinaryBasicBlock *BB : BF.rlayout()) {
1578         if (!HotSeen && BB->getKnownExecutionCount() > HotThreshold) {
1579           HotSeen = true;
1580           continue;
1581         }
1582         if (HotSeen && BB->getKnownExecutionCount() == 0) {
1583           SuboptimalFuncs.push_back(&BF);
1584           break;
1585         }
1586       }
1587     }
1588 
1589     if (!SuboptimalFuncs.empty()) {
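      // Rank the offenders by dynamic density: execution count per byte of
      // function size.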
1590       std::sort(SuboptimalFuncs.begin(), SuboptimalFuncs.end(),
1591                 [](const BinaryFunction *A, const BinaryFunction *B) {
1592                   return A->getKnownExecutionCount() / A->getSize() >
1593                          B->getKnownExecutionCount() / B->getSize();
1594                 });
1595 
1596       outs() << "BOLT-INFO: " << SuboptimalFuncs.size()
1597              << " functions have "
1598                 "cold code in the middle of hot code. Top functions are:\n";
1599       for (unsigned I = 0;
1600            I < std::min(static_cast<size_t>(opts::ReportBadLayout),
1601                         SuboptimalFuncs.size());
1602            ++I)
1603         SuboptimalFuncs[I]->print(outs());
1604     }
1605   }
1606 
1607   if (NumUnknownControlFlowFunctions) {
1608     outs() << "BOLT-INFO: " << NumUnknownControlFlowFunctions
1609            << " functions have instructions with unknown control flow";
1610     if (!opts::PrintUnknown)
1611       outs() << ". Use -print-unknown to see the list.";
1612     outs() << '\n';
1613   }
1614 }
1615 
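// Run MCPlusBuilder's tail-call lowering on every instruction in the binary.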
1616 void InstructionLowering::runOnFunctions(BinaryContext &BC) {
1617   for (auto &BFI : BC.getBinaryFunctions())
1618     for (BinaryBasicBlock &BB : BFI.second)
1619       for (MCInst &Instruction : BB)
1620         BC.MIB->lowerTailCall(Instruction);
1621 }
1622 
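// Strip REP prefixes from return instructions ('repz ret'). Each removed
// prefix saves one byte; the owning block's execution count estimates how many
// dynamic occurrences were affected.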
1623 void StripRepRet::runOnFunctions(BinaryContext &BC) {
1624   uint64_t NumPrefixesRemoved = 0;
1625   uint64_t NumBytesSaved = 0;
1626   for (auto &BFI : BC.getBinaryFunctions()) {
1627     for (BinaryBasicBlock &BB : BFI.second) {
1628       auto LastInstRIter = BB.getLastNonPseudo();
1629       if (LastInstRIter == BB.rend() || !BC.MIB->isReturn(*LastInstRIter) ||
1630           !BC.MIB->deleteREPPrefix(*LastInstRIter))
1631         continue;
1632 
1633       NumPrefixesRemoved += BB.getKnownExecutionCount();
1634       ++NumBytesSaved;
1635     }
1636   }
1637 
1638   if (NumBytesSaved)
1639     outs() << "BOLT-INFO: removed " << NumBytesSaved
1640            << " 'repz' prefixes"
1641               " with an estimated execution count of "
1642            << NumPrefixesRemoved << ".\n";
1643 }
1644 
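// Inline calls to memcpy() (including the _memcpy8 variant) on x86 by
// replacing each call with an inline copy sequence; profiled block counts give
// an estimate of how often the inlined calls execute.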
1645 void InlineMemcpy::runOnFunctions(BinaryContext &BC) {
1646   if (!BC.isX86())
1647     return;
1648 
1649   uint64_t NumInlined = 0;
1650   uint64_t NumInlinedDyno = 0;
1651   for (auto &BFI : BC.getBinaryFunctions()) {
1652     for (BinaryBasicBlock &BB : BFI.second) {
1653       for (auto II = BB.begin(); II != BB.end(); ++II) {
1654         MCInst &Inst = *II;
1655 
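        // Only direct calls with a single symbolic target operand are
        // candidates for inlining.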
1656         if (!BC.MIB->isCall(Inst) || MCPlus::getNumPrimeOperands(Inst) != 1 ||
1657             !Inst.getOperand(0).isExpr())
1658           continue;
1659 
1660         const MCSymbol *CalleeSymbol = BC.MIB->getTargetSymbol(Inst);
1661         if (CalleeSymbol->getName() != "memcpy" &&
1662             CalleeSymbol->getName() != "memcpy@PLT" &&
1663             CalleeSymbol->getName() != "_memcpy8")
1664           continue;
1665 
1666         const bool IsMemcpy8 = (CalleeSymbol->getName() == "_memcpy8");
1667         const bool IsTailCall = BC.MIB->isTailCall(Inst);
1668 
1669         const InstructionListType NewCode =
1670             BC.MIB->createInlineMemcpy(IsMemcpy8);
1671         II = BB.replaceInstruction(II, NewCode);
1672         std::advance(II, NewCode.size() - 1);
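        // If the original call was a tail call, the inlined sequence must be
        // followed by an explicit return.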
1673         if (IsTailCall) {
1674           MCInst Return;
1675           BC.MIB->createReturn(Return);
1676           II = BB.insertInstruction(std::next(II), std::move(Return));
1677         }
1678 
1679         ++NumInlined;
1680         NumInlinedDyno += BB.getKnownExecutionCount();
1681       }
1682     }
1683   }
1684 
1685   if (NumInlined) {
1686     outs() << "BOLT-INFO: inlined " << NumInlined << " memcpy() calls";
1687     if (NumInlinedDyno)
1688       outs() << ". The calls were executed " << NumInlinedDyno
1689              << " times based on profile.";
1690     outs() << '\n';
1691   }
1692 }
1693 
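// A function qualifies for memcpy() specialization if its name matches one of
// the specs, each of the form '<function>[:<site>[:<site>...]]'.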
1694 bool SpecializeMemcpy1::shouldOptimize(const BinaryFunction &Function) const {
1695   if (!BinaryFunctionPass::shouldOptimize(Function))
1696     return false;
1697 
1698   for (const std::string &FunctionSpec : Spec) {
1699     StringRef FunctionName = StringRef(FunctionSpec).split(':').first;
1700     if (Function.hasNameRegex(FunctionName))
1701       return true;
1702   }
1703 
1704   return false;
1705 }
1706 
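// Extract the call-site IDs listed for this function in its matching spec.
// An empty set means every memcpy() call site in the function is optimized.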
1707 std::set<size_t> SpecializeMemcpy1::getCallSitesToOptimize(
1708     const BinaryFunction &Function) const {
1709   StringRef SitesString;
1710   for (const std::string &FunctionSpec : Spec) {
1711     StringRef FunctionName;
1712     std::tie(FunctionName, SitesString) = StringRef(FunctionSpec).split(':');
1713     if (Function.hasNameRegex(FunctionName))
1714       break;
1715     SitesString = "";
1716   }
1717 
1718   std::set<size_t> Sites;
1719   SmallVector<StringRef, 4> SitesVec;
1720   SitesString.split(SitesVec, ':');
1721   for (StringRef SiteString : SitesVec) {
1722     if (SiteString.empty())
1723       continue;
1724     size_t Result;
1725     if (!SiteString.getAsInteger(10, Result))
1726       Sites.emplace(Result);
1727   }
1728 
1729   return Sites;
1730 }
1731 
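// Specialize selected memcpy() call sites for size 1: guard each call with a
// comparison of the size argument against 1, executing an inline one-byte copy
// when it matches and the original call otherwise.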
1732 void SpecializeMemcpy1::runOnFunctions(BinaryContext &BC) {
1733   if (!BC.isX86())
1734     return;
1735 
1736   uint64_t NumSpecialized = 0;
1737   uint64_t NumSpecializedDyno = 0;
1738   for (auto &BFI : BC.getBinaryFunctions()) {
1739     BinaryFunction &Function = BFI.second;
1740     if (!shouldOptimize(Function))
1741       continue;
1742 
1743     std::set<size_t> CallsToOptimize = getCallSitesToOptimize(Function);
1744     auto shouldOptimize = [&](size_t N) {
1745       return CallsToOptimize.empty() || CallsToOptimize.count(N);
1746     };
1747 
1748     std::vector<BinaryBasicBlock *> Blocks(Function.pbegin(), Function.pend());
1749     size_t CallSiteID = 0;
1750     for (BinaryBasicBlock *CurBB : Blocks) {
1751       for (auto II = CurBB->begin(); II != CurBB->end(); ++II) {
1752         MCInst &Inst = *II;
1753 
1754         if (!BC.MIB->isCall(Inst) || MCPlus::getNumPrimeOperands(Inst) != 1 ||
1755             !Inst.getOperand(0).isExpr())
1756           continue;
1757 
1758         const MCSymbol *CalleeSymbol = BC.MIB->getTargetSymbol(Inst);
1759         if (CalleeSymbol->getName() != "memcpy" &&
1760             CalleeSymbol->getName() != "memcpy@PLT")
1761           continue;
1762 
1763         if (BC.MIB->isTailCall(Inst))
1764           continue;
1765 
1766         ++CallSiteID;
1767 
1768         if (!shouldOptimize(CallSiteID))
1769           continue;
1770 
1771         // Create a copy of a call to memcpy(dest, src, size).
1772         MCInst MemcpyInstr = Inst;
1773 
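        // Split the original block around the call and drop the call from it;
        // a copy of the call is re-inserted below in its own block (MemcpyBB)
        // that is reached only when the size is not 1.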
1774         BinaryBasicBlock *OneByteMemcpyBB = CurBB->splitAt(II);
1775 
1776         BinaryBasicBlock *NextBB = nullptr;
1777         if (OneByteMemcpyBB->getNumNonPseudos() > 1) {
1778           NextBB = OneByteMemcpyBB->splitAt(OneByteMemcpyBB->begin());
1779           NextBB->eraseInstruction(NextBB->begin());
1780         } else {
1781           NextBB = OneByteMemcpyBB->getSuccessor();
1782           OneByteMemcpyBB->eraseInstruction(OneByteMemcpyBB->begin());
1783           assert(NextBB && "unexpected call to memcpy() with no return");
1784         }
1785 
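        // Build the specialized control flow: CurBB compares the size argument
        // (the third integer argument) against 1 and jumps to OneByteMemcpyBB
        // on equality; otherwise control goes to MemcpyBB, which holds the
        // original call. Both paths continue to NextBB.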
1786         BinaryBasicBlock *MemcpyBB =
1787             Function.addBasicBlock(CurBB->getInputOffset());
1788         InstructionListType CmpJCC =
1789             BC.MIB->createCmpJE(BC.MIB->getIntArgRegister(2), 1,
1790                                 OneByteMemcpyBB->getLabel(), BC.Ctx.get());
1791         CurBB->addInstructions(CmpJCC);
1792         CurBB->addSuccessor(MemcpyBB);
1793 
1794         MemcpyBB->addInstruction(std::move(MemcpyInstr));
1795         MemcpyBB->addSuccessor(NextBB);
1796         MemcpyBB->setCFIState(NextBB->getCFIState());
1797         MemcpyBB->setExecutionCount(0);
1798 
1799         // To prevent the block with the original call from being moved to
1800         // the cold section, set its execution count to 1.
1801         if (CurBB->getKnownExecutionCount() > 0)
1802           MemcpyBB->setExecutionCount(1);
1803 
1804         InstructionListType OneByteMemcpy = BC.MIB->createOneByteMemcpy();
1805         OneByteMemcpyBB->addInstructions(OneByteMemcpy);
1806 
1807         ++NumSpecialized;
1808         NumSpecializedDyno += CurBB->getKnownExecutionCount();
1809 
1810         CurBB = NextBB;
1811 
1812         // Note: we don't expect the next instruction to be a call to memcpy.
1813         II = CurBB->begin();
1814       }
1815     }
1816   }
1817 
1818   if (NumSpecialized) {
1819     outs() << "BOLT-INFO: specialized " << NumSpecialized
1820            << " memcpy() call sites for size 1";
1821     if (NumSpecializedDyno)
1822       outs() << ". The calls were executed " << NumSpecializedDyno
1823              << " times based on profile.";
1824     outs() << '\n';
1825   }
1826 }
1827 
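// Remove no-op instructions that carry BOLT's "NOP" annotation. Blocks are
// scanned backwards so that erasing by index does not invalidate the positions
// still to be visited.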
1828 void RemoveNops::runOnFunction(BinaryFunction &BF) {
1829   const BinaryContext &BC = BF.getBinaryContext();
1830   for (BinaryBasicBlock &BB : BF) {
1831     for (int64_t I = BB.size() - 1; I >= 0; --I) {
1832       MCInst &Inst = BB.getInstructionAtIndex(I);
1833       if (BC.MIB->isNoop(Inst) && BC.MIB->hasAnnotation(Inst, "NOP"))
1834         BB.eraseInstructionAtIndex(I);
1835     }
1836   }
1837 }
1838 
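// Run NOP removal over all functions in parallel, skipping functions whose
// NOPs must be preserved.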
1839 void RemoveNops::runOnFunctions(BinaryContext &BC) {
1840   ParallelUtilities::WorkFuncTy WorkFun = [&](BinaryFunction &BF) {
1841     runOnFunction(BF);
1842   };
1843 
1844   ParallelUtilities::PredicateTy SkipFunc = [&](const BinaryFunction &BF) {
1845     return BF.shouldPreserveNops();
1846   };
1847 
1848   ParallelUtilities::runOnEachFunction(
1849       BC, ParallelUtilities::SchedulingPolicy::SP_INST_LINEAR, WorkFun,
1850       SkipFunc, "RemoveNops");
1851 }
1852 
1853 } // namespace bolt
1854 } // namespace llvm
1855