1 //===- bolt/Passes/BinaryPasses.cpp - Binary-level passes -----------------===//
2 //
3 // Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
4 // See https://llvm.org/LICENSE.txt for license information.
5 // SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
6 //
7 //===----------------------------------------------------------------------===//
8 //
9 // This file implements multiple passes for binary optimization and analysis.
10 //
11 //===----------------------------------------------------------------------===//
12 
13 #include "bolt/Passes/BinaryPasses.h"
14 #include "bolt/Core/ParallelUtilities.h"
15 #include "bolt/Passes/ReorderAlgorithm.h"
16 #include "bolt/Passes/ReorderFunctions.h"
17 #include "llvm/Support/CommandLine.h"
18 
19 #include <numeric>
20 #include <vector>
21 
22 #define DEBUG_TYPE "bolt-opts"
23 
24 using namespace llvm;
25 using namespace bolt;
26 
27 namespace {
28 
29 const char *dynoStatsOptName(const bolt::DynoStats::Category C) {
30   if (C == bolt::DynoStats::FIRST_DYNO_STAT)
31     return "none";
32   else if (C == bolt::DynoStats::LAST_DYNO_STAT)
33     return "all";
34 
35   static std::string OptNames[bolt::DynoStats::LAST_DYNO_STAT + 1];
36 
37   OptNames[C] = bolt::DynoStats::Description(C);
38 
39   std::replace(OptNames[C].begin(), OptNames[C].end(), ' ', '-');
40 
41   return OptNames[C].c_str();
42 }
43 
44 const char *dynoStatsOptDesc(const bolt::DynoStats::Category C) {
45   if (C == bolt::DynoStats::FIRST_DYNO_STAT)
46     return "unsorted";
47   else if (C == bolt::DynoStats::LAST_DYNO_STAT)
48     return "sorted by all stats";
49 
50   return bolt::DynoStats::Description(C);
51 }
52 
53 }
54 
55 namespace opts {
56 
57 extern cl::OptionCategory BoltCategory;
58 extern cl::OptionCategory BoltOptCategory;
59 
60 extern cl::opt<bolt::MacroFusionType> AlignMacroOpFusion;
61 extern cl::opt<unsigned> Verbosity;
62 extern cl::opt<bool> EnableBAT;
63 extern cl::opt<unsigned> ExecutionCountThreshold;
64 extern cl::opt<bool> UpdateDebugSections;
65 extern cl::opt<bolt::ReorderFunctions::ReorderType> ReorderFunctions;
66 
67 enum DynoStatsSortOrder : char {
68   Ascending,
69   Descending
70 };
71 
72 static cl::opt<DynoStatsSortOrder> DynoStatsSortOrderOpt(
73     "print-sorted-by-order",
74     cl::desc("use ascending or descending order when printing functions "
75              "ordered by dyno stats"),
76     cl::init(DynoStatsSortOrder::Descending), cl::cat(BoltOptCategory));
77 
78 cl::list<std::string>
79 HotTextMoveSections("hot-text-move-sections",
80   cl::desc("list of sections containing functions used for hugifying hot text. "
81            "BOLT makes sure these functions are not placed on the same page as "
82            "the hot text. (default=\'.stub,.mover\')."),
83   cl::value_desc("sec1,sec2,sec3,..."),
84   cl::CommaSeparated,
85   cl::ZeroOrMore,
86   cl::cat(BoltCategory));
87 
88 bool isHotTextMover(const BinaryFunction &Function) {
89   for (std::string &SectionName : opts::HotTextMoveSections) {
90     if (Function.getOriginSectionName() &&
91         *Function.getOriginSectionName() == SectionName)
92       return true;
93   }
94 
95   return false;
96 }
97 
98 static cl::opt<bool> MinBranchClusters(
99     "min-branch-clusters",
100     cl::desc("use a modified clustering algorithm geared towards minimizing "
101              "branches"),
102     cl::Hidden, cl::cat(BoltOptCategory));
103 
104 static cl::list<Peepholes::PeepholeOpts> Peepholes(
105     "peepholes", cl::CommaSeparated, cl::desc("enable peephole optimizations"),
106     cl::value_desc("opt1,opt2,opt3,..."),
107     cl::values(clEnumValN(Peepholes::PEEP_NONE, "none", "disable peepholes"),
108                clEnumValN(Peepholes::PEEP_DOUBLE_JUMPS, "double-jumps",
109                           "remove double jumps when able"),
110                clEnumValN(Peepholes::PEEP_TAILCALL_TRAPS, "tailcall-traps",
111                           "insert tail call traps"),
112                clEnumValN(Peepholes::PEEP_USELESS_BRANCHES, "useless-branches",
113                           "remove useless conditional branches"),
114                clEnumValN(Peepholes::PEEP_ALL, "all",
115                           "enable all peephole optimizations")),
116     cl::ZeroOrMore, cl::cat(BoltOptCategory));
117 
118 static cl::opt<unsigned>
119     PrintFuncStat("print-function-statistics",
120                   cl::desc("print statistics about basic block ordering"),
121                   cl::init(0), cl::cat(BoltOptCategory));
122 
123 static cl::list<bolt::DynoStats::Category>
124 PrintSortedBy("print-sorted-by",
125   cl::CommaSeparated,
126   cl::desc("print functions sorted by order of dyno stats"),
127   cl::value_desc("key1,key2,key3,..."),
128   cl::values(
129 #define D(name, ...)                                        \
130     clEnumValN(bolt::DynoStats::name,                     \
131                dynoStatsOptName(bolt::DynoStats::name),   \
132                dynoStatsOptDesc(bolt::DynoStats::name)),
133     DYNO_STATS
134 #undef D
135     clEnumValN(0xffff, ".", ".")
136     ),
137   cl::ZeroOrMore,
138   cl::cat(BoltOptCategory));
139 
140 static cl::opt<bool>
141     PrintUnknown("print-unknown",
142                  cl::desc("print names of functions with unknown control flow"),
143                  cl::cat(BoltCategory), cl::Hidden);
144 
145 static cl::opt<bool>
146     PrintUnknownCFG("print-unknown-cfg",
147                     cl::desc("dump CFG of functions with unknown control flow"),
148                     cl::cat(BoltCategory), cl::ReallyHidden);
149 
150 cl::opt<bolt::ReorderBasicBlocks::LayoutType> ReorderBlocks(
151     "reorder-blocks", cl::desc("change layout of basic blocks in a function"),
152     cl::init(bolt::ReorderBasicBlocks::LT_NONE),
153     cl::values(
154         clEnumValN(bolt::ReorderBasicBlocks::LT_NONE, "none",
155                    "do not reorder basic blocks"),
156         clEnumValN(bolt::ReorderBasicBlocks::LT_REVERSE, "reverse",
157                    "layout blocks in reverse order"),
158         clEnumValN(bolt::ReorderBasicBlocks::LT_OPTIMIZE, "normal",
159                    "perform optimal layout based on profile"),
160         clEnumValN(bolt::ReorderBasicBlocks::LT_OPTIMIZE_BRANCH,
161                    "branch-predictor",
162                    "perform optimal layout prioritizing branch "
163                    "predictions"),
164         clEnumValN(bolt::ReorderBasicBlocks::LT_OPTIMIZE_CACHE, "cache",
165                    "perform optimal layout prioritizing I-cache "
166                    "behavior"),
167         clEnumValN(bolt::ReorderBasicBlocks::LT_OPTIMIZE_CACHE_PLUS, "cache+",
168                    "perform layout optimizing I-cache behavior"),
169         clEnumValN(bolt::ReorderBasicBlocks::LT_OPTIMIZE_EXT_TSP, "ext-tsp",
170                    "perform layout optimizing I-cache behavior"),
171         clEnumValN(bolt::ReorderBasicBlocks::LT_OPTIMIZE_SHUFFLE,
172                    "cluster-shuffle", "perform random layout of clusters")),
173     cl::ZeroOrMore, cl::cat(BoltOptCategory),
174     cl::callback([](const bolt::ReorderBasicBlocks::LayoutType &option) {
175       if (option == bolt::ReorderBasicBlocks::LT_OPTIMIZE_CACHE_PLUS) {
176         WithColor::warning()
177             << "'-reorder-blocks=cache+' is deprecated, "
178             << "please use '-reorder-blocks=ext-tsp' instead\n";
179         ReorderBlocks = bolt::ReorderBasicBlocks::LT_OPTIMIZE_EXT_TSP;
180       }
181     }));
182 
183 static cl::opt<unsigned> ReportBadLayout(
184     "report-bad-layout",
185     cl::desc("print top <uint> functions with suboptimal code layout on input"),
186     cl::init(0), cl::Hidden, cl::cat(BoltOptCategory));
187 
188 static cl::opt<bool>
189     ReportStaleFuncs("report-stale",
190                      cl::desc("print the list of functions with stale profile"),
191                      cl::Hidden, cl::cat(BoltOptCategory));
192 
193 enum SctcModes : char {
194   SctcAlways,
195   SctcPreserveDirection,
196   SctcHeuristic
197 };
198 
199 static cl::opt<SctcModes>
200 SctcMode("sctc-mode",
201   cl::desc("mode for simplify conditional tail calls"),
202   cl::init(SctcAlways),
203   cl::values(clEnumValN(SctcAlways, "always", "always perform sctc"),
204     clEnumValN(SctcPreserveDirection,
205       "preserve",
206       "only perform sctc when branch direction is "
207       "preserved"),
208     clEnumValN(SctcHeuristic,
209       "heuristic",
210       "use branch prediction data to control sctc")),
211   cl::ZeroOrMore,
212   cl::cat(BoltOptCategory));
213 
214 static cl::opt<unsigned>
215 StaleThreshold("stale-threshold",
216     cl::desc(
217       "maximum percentage of stale functions to tolerate (default: 100)"),
218     cl::init(100),
219     cl::Hidden,
220     cl::cat(BoltOptCategory));
221 
222 static cl::opt<unsigned> TSPThreshold(
223     "tsp-threshold",
224     cl::desc(
225         "maximum number of hot basic blocks in a function for which to use "
226         "a precise TSP solution while re-ordering basic blocks"),
227     cl::init(10), cl::Hidden, cl::cat(BoltOptCategory));
228 
229 static cl::opt<unsigned> TopCalledLimit(
230     "top-called-limit",
231     cl::desc("maximum number of functions to print in top called "
232              "functions section"),
233     cl::init(100), cl::Hidden, cl::cat(BoltCategory));
234 
235 } // namespace opts
236 
237 namespace llvm {
238 namespace bolt {
239 
240 bool BinaryFunctionPass::shouldOptimize(const BinaryFunction &BF) const {
241   return BF.isSimple() && BF.getState() == BinaryFunction::State::CFG &&
242          !BF.isIgnored();
243 }
244 
245 bool BinaryFunctionPass::shouldPrint(const BinaryFunction &BF) const {
246   return BF.isSimple() && !BF.isIgnored();
247 }
248 
249 void NormalizeCFG::runOnFunction(BinaryFunction &BF) {
250   uint64_t NumRemoved = 0;
251   uint64_t NumDuplicateEdges = 0;
252   uint64_t NeedsFixBranches = 0;
253   for (BinaryBasicBlock &BB : BF) {
254     if (!BB.empty())
255       continue;
256 
257     if (BB.isEntryPoint() || BB.isLandingPad())
258       continue;
259 
260     // Handle a dangling empty block.
261     if (BB.succ_size() == 0) {
262       // If an empty dangling basic block has a predecessor, it could be a
263       // result of codegen for __builtin_unreachable. In such case, do not
264       // remove the block.
265       if (BB.pred_size() == 0) {
266         BB.markValid(false);
267         ++NumRemoved;
268       }
269       continue;
270     }
271 
272     // The block should have just one successor.
273     BinaryBasicBlock *Successor = BB.getSuccessor();
274     assert(Successor && "invalid CFG encountered");
275 
276     // Redirect all predecessors to the successor block.
277     while (!BB.pred_empty()) {
278       BinaryBasicBlock *Predecessor = *BB.pred_begin();
279       if (Predecessor->hasJumpTable())
280         break;
281 
282       if (Predecessor == Successor)
283         break;
284 
285       BinaryBasicBlock::BinaryBranchInfo &BI = Predecessor->getBranchInfo(BB);
286       Predecessor->replaceSuccessor(&BB, Successor, BI.Count,
287                                     BI.MispredictedCount);
288       // We need to fix branches even if we failed to replace all successors
289       // and remove the block.
290       NeedsFixBranches = true;
291     }
292 
293     if (BB.pred_empty()) {
294       BB.removeAllSuccessors();
295       BB.markValid(false);
296       ++NumRemoved;
297     }
298   }
299 
300   if (NumRemoved)
301     BF.eraseInvalidBBs();
302 
303   // Check for duplicate successors. Do it after the empty block elimination as
304   // we can get more duplicate successors.
305   for (BinaryBasicBlock &BB : BF)
306     if (!BB.hasJumpTable() && BB.succ_size() == 2 &&
307         BB.getConditionalSuccessor(false) == BB.getConditionalSuccessor(true))
308       ++NumDuplicateEdges;
309 
310   // fixBranches() will get rid of duplicate edges and update jump instructions.
311   if (NumDuplicateEdges || NeedsFixBranches)
312     BF.fixBranches();
313 
314   NumDuplicateEdgesMerged += NumDuplicateEdges;
315   NumBlocksRemoved += NumRemoved;
316 }
317 
318 void NormalizeCFG::runOnFunctions(BinaryContext &BC) {
319   ParallelUtilities::runOnEachFunction(
320       BC, ParallelUtilities::SchedulingPolicy::SP_BB_LINEAR,
321       [&](BinaryFunction &BF) { runOnFunction(BF); },
322       [&](const BinaryFunction &BF) { return !shouldOptimize(BF); },
323       "NormalizeCFG");
324   if (NumBlocksRemoved)
325     outs() << "BOLT-INFO: removed " << NumBlocksRemoved << " empty block"
326            << (NumBlocksRemoved == 1 ? "" : "s") << '\n';
327   if (NumDuplicateEdgesMerged)
328     outs() << "BOLT-INFO: merged " << NumDuplicateEdgesMerged
329            << " duplicate CFG edge" << (NumDuplicateEdgesMerged == 1 ? "" : "s")
330            << '\n';
331 }
332 
333 void EliminateUnreachableBlocks::runOnFunction(BinaryFunction &Function) {
334   if (Function.layout_size() > 0) {
335     unsigned Count;
336     uint64_t Bytes;
337     Function.markUnreachableBlocks();
338     LLVM_DEBUG({
339       for (BinaryBasicBlock *BB : Function.layout()) {
340         if (!BB->isValid()) {
341           dbgs() << "BOLT-INFO: UCE found unreachable block " << BB->getName()
342                  << " in function " << Function << "\n";
343           Function.dump();
344         }
345       }
346     });
347     std::tie(Count, Bytes) = Function.eraseInvalidBBs();
348     DeletedBlocks += Count;
349     DeletedBytes += Bytes;
350     if (Count) {
351       Modified.insert(&Function);
352       if (opts::Verbosity > 0)
353         outs() << "BOLT-INFO: Removed " << Count
354                << " dead basic block(s) accounting for " << Bytes
355                << " bytes in function " << Function << '\n';
356     }
357   }
358 }
359 
360 void EliminateUnreachableBlocks::runOnFunctions(BinaryContext &BC) {
361   for (auto &It : BC.getBinaryFunctions()) {
362     BinaryFunction &Function = It.second;
363     if (shouldOptimize(Function))
364       runOnFunction(Function);
365   }
366 
367   outs() << "BOLT-INFO: UCE removed " << DeletedBlocks << " blocks and "
368          << DeletedBytes << " bytes of code.\n";
369 }
370 
371 bool ReorderBasicBlocks::shouldPrint(const BinaryFunction &BF) const {
372   return (BinaryFunctionPass::shouldPrint(BF) &&
373           opts::ReorderBlocks != ReorderBasicBlocks::LT_NONE);
374 }
375 
376 bool ReorderBasicBlocks::shouldOptimize(const BinaryFunction &BF) const {
377   // Apply execution count threshold
378   if (BF.getKnownExecutionCount() < opts::ExecutionCountThreshold)
379     return false;
380 
381   return BinaryFunctionPass::shouldOptimize(BF);
382 }
383 
384 void ReorderBasicBlocks::runOnFunctions(BinaryContext &BC) {
385   if (opts::ReorderBlocks == ReorderBasicBlocks::LT_NONE)
386     return;
387 
388   std::atomic<uint64_t> ModifiedFuncCount{0};
389 
390   ParallelUtilities::WorkFuncTy WorkFun = [&](BinaryFunction &BF) {
391     modifyFunctionLayout(BF, opts::ReorderBlocks, opts::MinBranchClusters);
392     if (BF.hasLayoutChanged())
393       ++ModifiedFuncCount;
394   };
395 
396   ParallelUtilities::PredicateTy SkipFunc = [&](const BinaryFunction &BF) {
397     return !shouldOptimize(BF);
398   };
399 
400   ParallelUtilities::runOnEachFunction(
401       BC, ParallelUtilities::SchedulingPolicy::SP_BB_LINEAR, WorkFun, SkipFunc,
402       "ReorderBasicBlocks");
403 
404   outs() << "BOLT-INFO: basic block reordering modified layout of "
405          << format("%zu (%.2lf%%) functions\n", ModifiedFuncCount.load(),
406                    100.0 * ModifiedFuncCount.load() /
407                        BC.getBinaryFunctions().size());
408 
409   if (opts::PrintFuncStat > 0) {
410     raw_ostream &OS = outs();
411     // Copy all the values into vector in order to sort them
412     std::map<uint64_t, BinaryFunction &> ScoreMap;
413     auto &BFs = BC.getBinaryFunctions();
414     for (auto It = BFs.begin(); It != BFs.end(); ++It)
415       ScoreMap.insert(std::pair<uint64_t, BinaryFunction &>(
416           It->second.getFunctionScore(), It->second));
417 
418     OS << "\nBOLT-INFO: Printing Function Statistics:\n\n";
419     OS << "           There are " << BFs.size() << " functions in total. \n";
420     OS << "           Number of functions being modified: "
421        << ModifiedFuncCount.load() << "\n";
422     OS << "           User asks for detailed information on top "
423        << opts::PrintFuncStat << " functions. (Ranked by function score)"
424        << "\n\n";
425     uint64_t I = 0;
426     for (std::map<uint64_t, BinaryFunction &>::reverse_iterator Rit =
427              ScoreMap.rbegin();
428          Rit != ScoreMap.rend() && I < opts::PrintFuncStat; ++Rit, ++I) {
429       BinaryFunction &Function = Rit->second;
430 
431       OS << "           Information for function of top: " << (I + 1) << ": \n";
432       OS << "             Function Score is: " << Function.getFunctionScore()
433          << "\n";
434       OS << "             There are " << Function.size()
435          << " number of blocks in this function.\n";
436       OS << "             There are " << Function.getInstructionCount()
437          << " number of instructions in this function.\n";
438       OS << "             The edit distance for this function is: "
439          << Function.getEditDistance() << "\n\n";
440     }
441   }
442 }
443 
444 void ReorderBasicBlocks::modifyFunctionLayout(BinaryFunction &BF,
445                                               LayoutType Type,
446                                               bool MinBranchClusters) const {
447   if (BF.size() == 0 || Type == LT_NONE)
448     return;
449 
450   BinaryFunction::BasicBlockOrderType NewLayout;
451   std::unique_ptr<ReorderAlgorithm> Algo;
452 
453   // Cannot do optimal layout without profile.
454   if (Type != LT_REVERSE && !BF.hasValidProfile())
455     return;
456 
457   if (Type == LT_REVERSE) {
458     Algo.reset(new ReverseReorderAlgorithm());
459   } else if (BF.size() <= opts::TSPThreshold && Type != LT_OPTIMIZE_SHUFFLE) {
460     // Work on optimal solution if problem is small enough
461     LLVM_DEBUG(dbgs() << "finding optimal block layout for " << BF << "\n");
462     Algo.reset(new TSPReorderAlgorithm());
463   } else {
464     LLVM_DEBUG(dbgs() << "running block layout heuristics on " << BF << "\n");
465 
466     std::unique_ptr<ClusterAlgorithm> CAlgo;
467     if (MinBranchClusters)
468       CAlgo.reset(new MinBranchGreedyClusterAlgorithm());
469     else
470       CAlgo.reset(new PHGreedyClusterAlgorithm());
471 
472     switch (Type) {
473     case LT_OPTIMIZE:
474       Algo.reset(new OptimizeReorderAlgorithm(std::move(CAlgo)));
475       break;
476 
477     case LT_OPTIMIZE_BRANCH:
478       Algo.reset(new OptimizeBranchReorderAlgorithm(std::move(CAlgo)));
479       break;
480 
481     case LT_OPTIMIZE_CACHE:
482       Algo.reset(new OptimizeCacheReorderAlgorithm(std::move(CAlgo)));
483       break;
484 
485     case LT_OPTIMIZE_EXT_TSP:
486       Algo.reset(new ExtTSPReorderAlgorithm());
487       break;
488 
489     case LT_OPTIMIZE_SHUFFLE:
490       Algo.reset(new RandomClusterReorderAlgorithm(std::move(CAlgo)));
491       break;
492 
493     default:
494       llvm_unreachable("unexpected layout type");
495     }
496   }
497 
498   Algo->reorderBasicBlocks(BF, NewLayout);
499 
500   BF.updateBasicBlockLayout(NewLayout);
501 }
502 
503 void FixupBranches::runOnFunctions(BinaryContext &BC) {
504   for (auto &It : BC.getBinaryFunctions()) {
505     BinaryFunction &Function = It.second;
506     if (!BC.shouldEmit(Function) || !Function.isSimple())
507       continue;
508 
509     Function.fixBranches();
510   }
511 }
512 
513 void FinalizeFunctions::runOnFunctions(BinaryContext &BC) {
514   ParallelUtilities::WorkFuncTy WorkFun = [&](BinaryFunction &BF) {
515     if (!BF.finalizeCFIState()) {
516       if (BC.HasRelocations) {
517         errs() << "BOLT-ERROR: unable to fix CFI state for function " << BF
518                << ". Exiting.\n";
519         exit(1);
520       }
521       BF.setSimple(false);
522       return;
523     }
524 
525     BF.setFinalized();
526 
527     // Update exception handling information.
528     BF.updateEHRanges();
529   };
530 
531   ParallelUtilities::PredicateTy SkipPredicate = [&](const BinaryFunction &BF) {
532     return !BC.shouldEmit(BF);
533   };
534 
535   ParallelUtilities::runOnEachFunction(
536       BC, ParallelUtilities::SchedulingPolicy::SP_CONSTANT, WorkFun,
537       SkipPredicate, "FinalizeFunctions");
538 }
539 
540 void CheckLargeFunctions::runOnFunctions(BinaryContext &BC) {
541   if (BC.HasRelocations)
542     return;
543 
544   if (!opts::UpdateDebugSections)
545     return;
546 
547   // If the function wouldn't fit, mark it as non-simple. Otherwise, we may emit
548   // incorrect debug info.
549   ParallelUtilities::WorkFuncTy WorkFun = [&](BinaryFunction &BF) {
550     uint64_t HotSize, ColdSize;
551     std::tie(HotSize, ColdSize) =
552         BC.calculateEmittedSize(BF, /*FixBranches=*/false);
553     if (HotSize > BF.getMaxSize())
554       BF.setSimple(false);
555   };
556 
557   ParallelUtilities::PredicateTy SkipFunc = [&](const BinaryFunction &BF) {
558     return !shouldOptimize(BF);
559   };
560 
561   ParallelUtilities::runOnEachFunction(
562       BC, ParallelUtilities::SchedulingPolicy::SP_INST_LINEAR, WorkFun,
563       SkipFunc, "CheckLargeFunctions");
564 }
565 
566 bool CheckLargeFunctions::shouldOptimize(const BinaryFunction &BF) const {
567   // Unlike other passes, allow functions in non-CFG state.
568   return BF.isSimple() && !BF.isIgnored();
569 }
570 
571 void LowerAnnotations::runOnFunctions(BinaryContext &BC) {
572   std::vector<std::pair<MCInst *, uint32_t>> PreservedOffsetAnnotations;
573 
574   for (auto &It : BC.getBinaryFunctions()) {
575     BinaryFunction &BF = It.second;
576     int64_t CurrentGnuArgsSize = 0;
577 
578     // Have we crossed hot/cold border for split functions?
579     bool SeenCold = false;
580 
581     for (BinaryBasicBlock *BB : BF.layout()) {
582       if (BB->isCold() && !SeenCold) {
583         SeenCold = true;
584         CurrentGnuArgsSize = 0;
585       }
586 
587       // First convert GnuArgsSize annotations into CFIs. This may change instr
588       // pointers, so do it before recording ptrs for preserved annotations
589       if (BF.usesGnuArgsSize()) {
590         for (auto II = BB->begin(); II != BB->end(); ++II) {
591           if (!BC.MIB->isInvoke(*II))
592             continue;
593           const int64_t NewGnuArgsSize = BC.MIB->getGnuArgsSize(*II);
594           assert(NewGnuArgsSize >= 0 && "expected non-negative GNU_args_size");
595           if (NewGnuArgsSize != CurrentGnuArgsSize) {
596             auto InsertII = BF.addCFIInstruction(
597                 BB, II,
598                 MCCFIInstruction::createGnuArgsSize(nullptr, NewGnuArgsSize));
599             CurrentGnuArgsSize = NewGnuArgsSize;
600             II = std::next(InsertII);
601           }
602         }
603       }
604 
605       // Now record preserved annotations separately and then strip annotations.
606       for (auto II = BB->begin(); II != BB->end(); ++II) {
607         if (BF.requiresAddressTranslation() && BC.MIB->getOffset(*II))
608           PreservedOffsetAnnotations.emplace_back(&(*II),
609                                                   *BC.MIB->getOffset(*II));
610         BC.MIB->stripAnnotations(*II);
611       }
612     }
613   }
614   for (BinaryFunction *BF : BC.getInjectedBinaryFunctions())
615     for (BinaryBasicBlock &BB : *BF)
616       for (MCInst &Instruction : BB)
617         BC.MIB->stripAnnotations(Instruction);
618 
619   // Release all memory taken by annotations
620   BC.MIB->freeAnnotations();
621 
622   // Reinsert preserved annotations we need during code emission.
623   for (const std::pair<MCInst *, uint32_t> &Item : PreservedOffsetAnnotations)
624     BC.MIB->setOffset(*Item.first, Item.second);
625 }
626 
627 namespace {
628 
629 // This peephole fixes jump instructions that jump to another basic
630 // block with a single jump instruction, e.g.
631 //
632 // B0: ...
633 //     jmp  B1   (or jcc B1)
634 //
635 // B1: jmp  B2
636 //
637 // ->
638 //
639 // B0: ...
640 //     jmp  B2   (or jcc B2)
641 //
642 uint64_t fixDoubleJumps(BinaryFunction &Function, bool MarkInvalid) {
643   uint64_t NumDoubleJumps = 0;
644 
645   MCContext *Ctx = Function.getBinaryContext().Ctx.get();
646   MCPlusBuilder *MIB = Function.getBinaryContext().MIB.get();
647   for (BinaryBasicBlock &BB : Function) {
648     auto checkAndPatch = [&](BinaryBasicBlock *Pred, BinaryBasicBlock *Succ,
649                              const MCSymbol *SuccSym) {
650       // Ignore infinite loop jumps or fallthrough tail jumps.
651       if (Pred == Succ || Succ == &BB)
652         return false;
653 
654       if (Succ) {
655         const MCSymbol *TBB = nullptr;
656         const MCSymbol *FBB = nullptr;
657         MCInst *CondBranch = nullptr;
658         MCInst *UncondBranch = nullptr;
659         bool Res = Pred->analyzeBranch(TBB, FBB, CondBranch, UncondBranch);
660         if (!Res) {
661           LLVM_DEBUG(dbgs() << "analyzeBranch failed in peepholes in block:\n";
662                      Pred->dump());
663           return false;
664         }
665         Pred->replaceSuccessor(&BB, Succ);
666 
667         // We must patch up any existing branch instructions to match up
668         // with the new successor.
669         assert((CondBranch || (!CondBranch && Pred->succ_size() == 1)) &&
670                "Predecessor block has inconsistent number of successors");
671         if (CondBranch && MIB->getTargetSymbol(*CondBranch) == BB.getLabel()) {
672           MIB->replaceBranchTarget(*CondBranch, Succ->getLabel(), Ctx);
673         } else if (UncondBranch &&
674                    MIB->getTargetSymbol(*UncondBranch) == BB.getLabel()) {
675           MIB->replaceBranchTarget(*UncondBranch, Succ->getLabel(), Ctx);
676         } else if (!UncondBranch) {
677           assert(Function.getBasicBlockAfter(Pred, false) != Succ &&
678                  "Don't add an explicit jump to a fallthrough block.");
679           Pred->addBranchInstruction(Succ);
680         }
681       } else {
682         // Succ will be null in the tail call case.  In this case we
683         // need to explicitly add a tail call instruction.
684         MCInst *Branch = Pred->getLastNonPseudoInstr();
685         if (Branch && MIB->isUnconditionalBranch(*Branch)) {
686           assert(MIB->getTargetSymbol(*Branch) == BB.getLabel());
687           Pred->removeSuccessor(&BB);
688           Pred->eraseInstruction(Pred->findInstruction(Branch));
689           Pred->addTailCallInstruction(SuccSym);
690         } else {
691           return false;
692         }
693       }
694 
695       ++NumDoubleJumps;
696       LLVM_DEBUG(dbgs() << "Removed double jump in " << Function << " from "
697                         << Pred->getName() << " -> " << BB.getName() << " to "
698                         << Pred->getName() << " -> " << SuccSym->getName()
699                         << (!Succ ? " (tail)\n" : "\n"));
700 
701       return true;
702     };
703 
704     if (BB.getNumNonPseudos() != 1 || BB.isLandingPad())
705       continue;
706 
707     MCInst *Inst = BB.getFirstNonPseudoInstr();
708     const bool IsTailCall = MIB->isTailCall(*Inst);
709 
710     if (!MIB->isUnconditionalBranch(*Inst) && !IsTailCall)
711       continue;
712 
713     // If we operate after SCTC make sure it's not a conditional tail call.
714     if (IsTailCall && MIB->isConditionalBranch(*Inst))
715       continue;
716 
717     const MCSymbol *SuccSym = MIB->getTargetSymbol(*Inst);
718     BinaryBasicBlock *Succ = BB.getSuccessor();
719 
720     if (((!Succ || &BB == Succ) && !IsTailCall) || (IsTailCall && !SuccSym))
721       continue;
722 
723     std::vector<BinaryBasicBlock *> Preds = {BB.pred_begin(), BB.pred_end()};
724 
725     for (BinaryBasicBlock *Pred : Preds) {
726       if (Pred->isLandingPad())
727         continue;
728 
729       if (Pred->getSuccessor() == &BB ||
730           (Pred->getConditionalSuccessor(true) == &BB && !IsTailCall) ||
731           Pred->getConditionalSuccessor(false) == &BB)
732         if (checkAndPatch(Pred, Succ, SuccSym) && MarkInvalid)
733           BB.markValid(BB.pred_size() != 0 || BB.isLandingPad() ||
734                        BB.isEntryPoint());
735     }
736   }
737 
738   return NumDoubleJumps;
739 }
740 } // namespace
741 
742 bool SimplifyConditionalTailCalls::shouldRewriteBranch(
743     const BinaryBasicBlock *PredBB, const MCInst &CondBranch,
744     const BinaryBasicBlock *BB, const bool DirectionFlag) {
745   if (BeenOptimized.count(PredBB))
746     return false;
747 
748   const bool IsForward = BinaryFunction::isForwardBranch(PredBB, BB);
749 
750   if (IsForward)
751     ++NumOrigForwardBranches;
752   else
753     ++NumOrigBackwardBranches;
754 
755   if (opts::SctcMode == opts::SctcAlways)
756     return true;
757 
758   if (opts::SctcMode == opts::SctcPreserveDirection)
759     return IsForward == DirectionFlag;
760 
761   const ErrorOr<std::pair<double, double>> Frequency =
762       PredBB->getBranchStats(BB);
763 
764   // It's ok to rewrite the conditional branch if the new target will be
765   // a backward branch.
766 
767   // If no data available for these branches, then it should be ok to
768   // do the optimization since it will reduce code size.
769   if (Frequency.getError())
770     return true;
771 
772   // TODO: should this use misprediction frequency instead?
773   const bool Result = (IsForward && Frequency.get().first >= 0.5) ||
774                       (!IsForward && Frequency.get().first <= 0.5);
775 
776   return Result == DirectionFlag;
777 }
778 
779 uint64_t SimplifyConditionalTailCalls::fixTailCalls(BinaryFunction &BF) {
780   // Need updated indices to correctly detect branch' direction.
781   BF.updateLayoutIndices();
782   BF.markUnreachableBlocks();
783 
784   MCPlusBuilder *MIB = BF.getBinaryContext().MIB.get();
785   MCContext *Ctx = BF.getBinaryContext().Ctx.get();
786   uint64_t NumLocalCTCCandidates = 0;
787   uint64_t NumLocalCTCs = 0;
788   uint64_t LocalCTCTakenCount = 0;
789   uint64_t LocalCTCExecCount = 0;
790   std::vector<std::pair<BinaryBasicBlock *, const BinaryBasicBlock *>>
791       NeedsUncondBranch;
792 
793   // Will block be deleted by UCE?
794   auto isValid = [](const BinaryBasicBlock *BB) {
795     return (BB->pred_size() != 0 || BB->isLandingPad() || BB->isEntryPoint());
796   };
797 
798   for (BinaryBasicBlock *BB : BF.layout()) {
799     // Locate BB with a single direct tail-call instruction.
800     if (BB->getNumNonPseudos() != 1)
801       continue;
802 
803     MCInst *Instr = BB->getFirstNonPseudoInstr();
804     if (!MIB->isTailCall(*Instr) || MIB->isConditionalBranch(*Instr))
805       continue;
806 
807     const MCSymbol *CalleeSymbol = MIB->getTargetSymbol(*Instr);
808     if (!CalleeSymbol)
809       continue;
810 
811     // Detect direction of the possible conditional tail call.
812     const bool IsForwardCTC = BF.isForwardCall(CalleeSymbol);
813 
814     // Iterate through all predecessors.
815     for (BinaryBasicBlock *PredBB : BB->predecessors()) {
816       BinaryBasicBlock *CondSucc = PredBB->getConditionalSuccessor(true);
817       if (!CondSucc)
818         continue;
819 
820       ++NumLocalCTCCandidates;
821 
822       const MCSymbol *TBB = nullptr;
823       const MCSymbol *FBB = nullptr;
824       MCInst *CondBranch = nullptr;
825       MCInst *UncondBranch = nullptr;
826       bool Result = PredBB->analyzeBranch(TBB, FBB, CondBranch, UncondBranch);
827 
828       // analyzeBranch() can fail due to unusual branch instructions, e.g. jrcxz
829       if (!Result) {
830         LLVM_DEBUG(dbgs() << "analyzeBranch failed in SCTC in block:\n";
831                    PredBB->dump());
832         continue;
833       }
834 
835       assert(Result && "internal error analyzing conditional branch");
836       assert(CondBranch && "conditional branch expected");
837 
838       // It's possible that PredBB is also a successor to BB that may have
839       // been processed by a previous iteration of the SCTC loop, in which
840       // case it may have been marked invalid.  We should skip rewriting in
841       // this case.
842       if (!PredBB->isValid()) {
843         assert(PredBB->isSuccessor(BB) &&
844                "PredBB should be valid if it is not a successor to BB");
845         continue;
846       }
847 
848       // We don't want to reverse direction of the branch in new order
849       // without further profile analysis.
850       const bool DirectionFlag = CondSucc == BB ? IsForwardCTC : !IsForwardCTC;
851       if (!shouldRewriteBranch(PredBB, *CondBranch, BB, DirectionFlag))
852         continue;
853 
854       // Record this block so that we don't try to optimize it twice.
855       BeenOptimized.insert(PredBB);
856 
857       uint64_t Count = 0;
858       if (CondSucc != BB) {
859         // Patch the new target address into the conditional branch.
860         MIB->reverseBranchCondition(*CondBranch, CalleeSymbol, Ctx);
861         // Since we reversed the condition on the branch we need to change
862         // the target for the unconditional branch or add a unconditional
863         // branch to the old target.  This has to be done manually since
864         // fixupBranches is not called after SCTC.
865         NeedsUncondBranch.emplace_back(PredBB, CondSucc);
866         Count = PredBB->getFallthroughBranchInfo().Count;
867       } else {
868         // Change destination of the conditional branch.
869         MIB->replaceBranchTarget(*CondBranch, CalleeSymbol, Ctx);
870         Count = PredBB->getTakenBranchInfo().Count;
871       }
872       const uint64_t CTCTakenFreq =
873           Count == BinaryBasicBlock::COUNT_NO_PROFILE ? 0 : Count;
874 
875       // Annotate it, so "isCall" returns true for this jcc
876       MIB->setConditionalTailCall(*CondBranch);
877       // Add info abount the conditional tail call frequency, otherwise this
878       // info will be lost when we delete the associated BranchInfo entry
879       auto &CTCAnnotation =
880           MIB->getOrCreateAnnotationAs<uint64_t>(*CondBranch, "CTCTakenCount");
881       CTCAnnotation = CTCTakenFreq;
882 
883       // Remove the unused successor which may be eliminated later
884       // if there are no other users.
885       PredBB->removeSuccessor(BB);
886       // Update BB execution count
887       if (CTCTakenFreq && CTCTakenFreq <= BB->getKnownExecutionCount())
888         BB->setExecutionCount(BB->getExecutionCount() - CTCTakenFreq);
889       else if (CTCTakenFreq > BB->getKnownExecutionCount())
890         BB->setExecutionCount(0);
891 
892       ++NumLocalCTCs;
893       LocalCTCTakenCount += CTCTakenFreq;
894       LocalCTCExecCount += PredBB->getKnownExecutionCount();
895     }
896 
897     // Remove the block from CFG if all predecessors were removed.
898     BB->markValid(isValid(BB));
899   }
900 
901   // Add unconditional branches at the end of BBs to new successors
902   // as long as the successor is not a fallthrough.
903   for (auto &Entry : NeedsUncondBranch) {
904     BinaryBasicBlock *PredBB = Entry.first;
905     const BinaryBasicBlock *CondSucc = Entry.second;
906 
907     const MCSymbol *TBB = nullptr;
908     const MCSymbol *FBB = nullptr;
909     MCInst *CondBranch = nullptr;
910     MCInst *UncondBranch = nullptr;
911     PredBB->analyzeBranch(TBB, FBB, CondBranch, UncondBranch);
912 
913     // Find the next valid block.  Invalid blocks will be deleted
914     // so they shouldn't be considered fallthrough targets.
915     const BinaryBasicBlock *NextBlock = BF.getBasicBlockAfter(PredBB, false);
916     while (NextBlock && !isValid(NextBlock))
917       NextBlock = BF.getBasicBlockAfter(NextBlock, false);
918 
919     // Get the unconditional successor to this block.
920     const BinaryBasicBlock *PredSucc = PredBB->getSuccessor();
921     assert(PredSucc && "The other branch should be a tail call");
922 
923     const bool HasFallthrough = (NextBlock && PredSucc == NextBlock);
924 
925     if (UncondBranch) {
926       if (HasFallthrough)
927         PredBB->eraseInstruction(PredBB->findInstruction(UncondBranch));
928       else
929         MIB->replaceBranchTarget(*UncondBranch, CondSucc->getLabel(), Ctx);
930     } else if (!HasFallthrough) {
931       MCInst Branch;
932       MIB->createUncondBranch(Branch, CondSucc->getLabel(), Ctx);
933       PredBB->addInstruction(Branch);
934     }
935   }
936 
937   if (NumLocalCTCs > 0) {
938     NumDoubleJumps += fixDoubleJumps(BF, true);
939     // Clean-up unreachable tail-call blocks.
940     const std::pair<unsigned, uint64_t> Stats = BF.eraseInvalidBBs();
941     DeletedBlocks += Stats.first;
942     DeletedBytes += Stats.second;
943 
944     assert(BF.validateCFG());
945   }
946 
947   LLVM_DEBUG(dbgs() << "BOLT: created " << NumLocalCTCs
948                     << " conditional tail calls from a total of "
949                     << NumLocalCTCCandidates << " candidates in function " << BF
950                     << ". CTCs execution count for this function is "
951                     << LocalCTCExecCount << " and CTC taken count is "
952                     << LocalCTCTakenCount << "\n";);
953 
954   NumTailCallsPatched += NumLocalCTCs;
955   NumCandidateTailCalls += NumLocalCTCCandidates;
956   CTCExecCount += LocalCTCExecCount;
957   CTCTakenCount += LocalCTCTakenCount;
958 
959   return NumLocalCTCs > 0;
960 }
961 
962 void SimplifyConditionalTailCalls::runOnFunctions(BinaryContext &BC) {
963   if (!BC.isX86())
964     return;
965 
966   for (auto &It : BC.getBinaryFunctions()) {
967     BinaryFunction &Function = It.second;
968 
969     if (!shouldOptimize(Function))
970       continue;
971 
972     if (fixTailCalls(Function)) {
973       Modified.insert(&Function);
974       Function.setHasCanonicalCFG(false);
975     }
976   }
977 
978   outs() << "BOLT-INFO: SCTC: patched " << NumTailCallsPatched
979          << " tail calls (" << NumOrigForwardBranches << " forward)"
980          << " tail calls (" << NumOrigBackwardBranches << " backward)"
981          << " from a total of " << NumCandidateTailCalls << " while removing "
982          << NumDoubleJumps << " double jumps"
983          << " and removing " << DeletedBlocks << " basic blocks"
984          << " totalling " << DeletedBytes
985          << " bytes of code. CTCs total execution count is " << CTCExecCount
986          << " and the number of times CTCs are taken is " << CTCTakenCount
987          << ".\n";
988 }
989 
990 uint64_t ShortenInstructions::shortenInstructions(BinaryFunction &Function) {
991   uint64_t Count = 0;
992   const BinaryContext &BC = Function.getBinaryContext();
993   for (BinaryBasicBlock &BB : Function) {
994     for (MCInst &Inst : BB) {
995       MCInst OriginalInst;
996       if (opts::Verbosity > 2)
997         OriginalInst = Inst;
998 
999       if (!BC.MIB->shortenInstruction(Inst, *BC.STI))
1000         continue;
1001 
1002       if (opts::Verbosity > 2) {
1003         BC.scopeLock();
1004         outs() << "BOLT-INFO: shortening:\nBOLT-INFO:    ";
1005         BC.printInstruction(outs(), OriginalInst, 0, &Function);
1006         outs() << "BOLT-INFO: to:";
1007         BC.printInstruction(outs(), Inst, 0, &Function);
1008       }
1009 
1010       ++Count;
1011     }
1012   }
1013 
1014   return Count;
1015 }
1016 
1017 void ShortenInstructions::runOnFunctions(BinaryContext &BC) {
1018   std::atomic<uint64_t> NumShortened{0};
1019   if (!BC.isX86())
1020     return;
1021 
1022   ParallelUtilities::runOnEachFunction(
1023       BC, ParallelUtilities::SchedulingPolicy::SP_INST_LINEAR,
1024       [&](BinaryFunction &BF) { NumShortened += shortenInstructions(BF); },
1025       nullptr, "ShortenInstructions");
1026 
1027   outs() << "BOLT-INFO: " << NumShortened << " instructions were shortened\n";
1028 }
1029 
1030 void Peepholes::addTailcallTraps(BinaryFunction &Function) {
1031   MCPlusBuilder *MIB = Function.getBinaryContext().MIB.get();
1032   for (BinaryBasicBlock &BB : Function) {
1033     MCInst *Inst = BB.getLastNonPseudoInstr();
1034     if (Inst && MIB->isTailCall(*Inst) && MIB->isIndirectBranch(*Inst)) {
1035       MCInst Trap;
1036       if (MIB->createTrap(Trap)) {
1037         BB.addInstruction(Trap);
1038         ++TailCallTraps;
1039       }
1040     }
1041   }
1042 }
1043 
1044 void Peepholes::removeUselessCondBranches(BinaryFunction &Function) {
1045   for (BinaryBasicBlock &BB : Function) {
1046     if (BB.succ_size() != 2)
1047       continue;
1048 
1049     BinaryBasicBlock *CondBB = BB.getConditionalSuccessor(true);
1050     BinaryBasicBlock *UncondBB = BB.getConditionalSuccessor(false);
1051     if (CondBB != UncondBB)
1052       continue;
1053 
1054     const MCSymbol *TBB = nullptr;
1055     const MCSymbol *FBB = nullptr;
1056     MCInst *CondBranch = nullptr;
1057     MCInst *UncondBranch = nullptr;
1058     bool Result = BB.analyzeBranch(TBB, FBB, CondBranch, UncondBranch);
1059 
1060     // analyzeBranch() can fail due to unusual branch instructions,
1061     // e.g. jrcxz, or jump tables (indirect jump).
1062     if (!Result || !CondBranch)
1063       continue;
1064 
1065     BB.removeDuplicateConditionalSuccessor(CondBranch);
1066     ++NumUselessCondBranches;
1067   }
1068 }
1069 
1070 void Peepholes::runOnFunctions(BinaryContext &BC) {
1071   const char Opts =
1072       std::accumulate(opts::Peepholes.begin(), opts::Peepholes.end(), 0,
1073                       [](const char A, const PeepholeOpts B) { return A | B; });
1074   if (Opts == PEEP_NONE)
1075     return;
1076 
1077   for (auto &It : BC.getBinaryFunctions()) {
1078     BinaryFunction &Function = It.second;
1079     if (shouldOptimize(Function)) {
1080       if (Opts & PEEP_DOUBLE_JUMPS)
1081         NumDoubleJumps += fixDoubleJumps(Function, false);
1082       if (Opts & PEEP_TAILCALL_TRAPS)
1083         addTailcallTraps(Function);
1084       if (Opts & PEEP_USELESS_BRANCHES)
1085         removeUselessCondBranches(Function);
1086       assert(Function.validateCFG());
1087     }
1088   }
1089   outs() << "BOLT-INFO: Peephole: " << NumDoubleJumps
1090          << " double jumps patched.\n"
1091          << "BOLT-INFO: Peephole: " << TailCallTraps
1092          << " tail call traps inserted.\n"
1093          << "BOLT-INFO: Peephole: " << NumUselessCondBranches
1094          << " useless conditional branches removed.\n";
1095 }
1096 
1097 bool SimplifyRODataLoads::simplifyRODataLoads(BinaryFunction &BF) {
1098   BinaryContext &BC = BF.getBinaryContext();
1099   MCPlusBuilder *MIB = BC.MIB.get();
1100 
1101   uint64_t NumLocalLoadsSimplified = 0;
1102   uint64_t NumDynamicLocalLoadsSimplified = 0;
1103   uint64_t NumLocalLoadsFound = 0;
1104   uint64_t NumDynamicLocalLoadsFound = 0;
1105 
1106   for (BinaryBasicBlock *BB : BF.layout()) {
1107     for (MCInst &Inst : *BB) {
1108       unsigned Opcode = Inst.getOpcode();
1109       const MCInstrDesc &Desc = BC.MII->get(Opcode);
1110 
1111       // Skip instructions that do not load from memory.
1112       if (!Desc.mayLoad())
1113         continue;
1114 
1115       // Try to statically evaluate the target memory address;
1116       uint64_t TargetAddress;
1117 
1118       if (MIB->hasPCRelOperand(Inst)) {
1119         // Try to find the symbol that corresponds to the PC-relative operand.
1120         MCOperand *DispOpI = MIB->getMemOperandDisp(Inst);
1121         assert(DispOpI != Inst.end() && "expected PC-relative displacement");
1122         assert(DispOpI->isExpr() &&
1123                "found PC-relative with non-symbolic displacement");
1124 
1125         // Get displacement symbol.
1126         const MCSymbol *DisplSymbol;
1127         uint64_t DisplOffset;
1128 
1129         std::tie(DisplSymbol, DisplOffset) =
1130             MIB->getTargetSymbolInfo(DispOpI->getExpr());
1131 
1132         if (!DisplSymbol)
1133           continue;
1134 
1135         // Look up the symbol address in the global symbols map of the binary
1136         // context object.
1137         BinaryData *BD = BC.getBinaryDataByName(DisplSymbol->getName());
1138         if (!BD)
1139           continue;
1140         TargetAddress = BD->getAddress() + DisplOffset;
1141       } else if (!MIB->evaluateMemOperandTarget(Inst, TargetAddress)) {
1142         continue;
1143       }
1144 
1145       // Get the contents of the section containing the target address of the
1146       // memory operand. We are only interested in read-only sections.
1147       ErrorOr<BinarySection &> DataSection =
1148           BC.getSectionForAddress(TargetAddress);
1149       if (!DataSection || !DataSection->isReadOnly())
1150         continue;
1151 
1152       if (BC.getRelocationAt(TargetAddress) ||
1153           BC.getDynamicRelocationAt(TargetAddress))
1154         continue;
1155 
1156       uint32_t Offset = TargetAddress - DataSection->getAddress();
1157       StringRef ConstantData = DataSection->getContents();
1158 
1159       ++NumLocalLoadsFound;
1160       if (BB->hasProfile())
1161         NumDynamicLocalLoadsFound += BB->getExecutionCount();
1162 
1163       if (MIB->replaceMemOperandWithImm(Inst, ConstantData, Offset)) {
1164         ++NumLocalLoadsSimplified;
1165         if (BB->hasProfile())
1166           NumDynamicLocalLoadsSimplified += BB->getExecutionCount();
1167       }
1168     }
1169   }
1170 
1171   NumLoadsFound += NumLocalLoadsFound;
1172   NumDynamicLoadsFound += NumDynamicLocalLoadsFound;
1173   NumLoadsSimplified += NumLocalLoadsSimplified;
1174   NumDynamicLoadsSimplified += NumDynamicLocalLoadsSimplified;
1175 
1176   return NumLocalLoadsSimplified > 0;
1177 }
1178 
1179 void SimplifyRODataLoads::runOnFunctions(BinaryContext &BC) {
1180   for (auto &It : BC.getBinaryFunctions()) {
1181     BinaryFunction &Function = It.second;
1182     if (shouldOptimize(Function) && simplifyRODataLoads(Function))
1183       Modified.insert(&Function);
1184   }
1185 
1186   outs() << "BOLT-INFO: simplified " << NumLoadsSimplified << " out of "
1187          << NumLoadsFound << " loads from a statically computed address.\n"
1188          << "BOLT-INFO: dynamic loads simplified: " << NumDynamicLoadsSimplified
1189          << "\n"
1190          << "BOLT-INFO: dynamic loads found: " << NumDynamicLoadsFound << "\n";
1191 }
1192 
1193 void AssignSections::runOnFunctions(BinaryContext &BC) {
1194   for (BinaryFunction *Function : BC.getInjectedBinaryFunctions()) {
1195     Function->setCodeSectionName(BC.getInjectedCodeSectionName());
1196     Function->setColdCodeSectionName(BC.getInjectedColdCodeSectionName());
1197   }
1198 
1199   // In non-relocation mode functions have pre-assigned section names.
1200   if (!BC.HasRelocations)
1201     return;
1202 
1203   const bool UseColdSection =
1204       BC.NumProfiledFuncs > 0 ||
1205       opts::ReorderFunctions == ReorderFunctions::RT_USER;
1206   for (auto &BFI : BC.getBinaryFunctions()) {
1207     BinaryFunction &Function = BFI.second;
1208     if (opts::isHotTextMover(Function)) {
1209       Function.setCodeSectionName(BC.getHotTextMoverSectionName());
1210       Function.setColdCodeSectionName(BC.getHotTextMoverSectionName());
1211       continue;
1212     }
1213 
1214     if (!UseColdSection || Function.hasValidIndex())
1215       Function.setCodeSectionName(BC.getMainCodeSectionName());
1216     else
1217       Function.setCodeSectionName(BC.getColdCodeSectionName());
1218 
1219     if (Function.isSplit())
1220       Function.setColdCodeSectionName(BC.getColdCodeSectionName());
1221   }
1222 }
1223 
1224 void PrintProfileStats::runOnFunctions(BinaryContext &BC) {
1225   double FlowImbalanceMean = 0.0;
1226   size_t NumBlocksConsidered = 0;
1227   double WorstBias = 0.0;
1228   const BinaryFunction *WorstBiasFunc = nullptr;
1229 
1230   // For each function CFG, we fill an IncomingMap with the sum of the frequency
1231   // of incoming edges for each BB. Likewise for each OutgoingMap and the sum
1232   // of the frequency of outgoing edges.
1233   using FlowMapTy = std::unordered_map<const BinaryBasicBlock *, uint64_t>;
1234   std::unordered_map<const BinaryFunction *, FlowMapTy> TotalIncomingMaps;
1235   std::unordered_map<const BinaryFunction *, FlowMapTy> TotalOutgoingMaps;
1236 
1237   // Compute mean
1238   for (const auto &BFI : BC.getBinaryFunctions()) {
1239     const BinaryFunction &Function = BFI.second;
1240     if (Function.empty() || !Function.isSimple())
1241       continue;
1242     FlowMapTy &IncomingMap = TotalIncomingMaps[&Function];
1243     FlowMapTy &OutgoingMap = TotalOutgoingMaps[&Function];
1244     for (const BinaryBasicBlock &BB : Function) {
1245       uint64_t TotalOutgoing = 0ULL;
1246       auto SuccBIIter = BB.branch_info_begin();
1247       for (BinaryBasicBlock *Succ : BB.successors()) {
1248         uint64_t Count = SuccBIIter->Count;
1249         if (Count == BinaryBasicBlock::COUNT_NO_PROFILE || Count == 0) {
1250           ++SuccBIIter;
1251           continue;
1252         }
1253         TotalOutgoing += Count;
1254         IncomingMap[Succ] += Count;
1255         ++SuccBIIter;
1256       }
1257       OutgoingMap[&BB] = TotalOutgoing;
1258     }
1259 
1260     size_t NumBlocks = 0;
1261     double Mean = 0.0;
1262     for (const BinaryBasicBlock &BB : Function) {
1263       // Do not compute score for low frequency blocks, entry or exit blocks
1264       if (IncomingMap[&BB] < 100 || OutgoingMap[&BB] == 0 || BB.isEntryPoint())
1265         continue;
1266       ++NumBlocks;
1267       const double Difference = (double)OutgoingMap[&BB] - IncomingMap[&BB];
1268       Mean += fabs(Difference / IncomingMap[&BB]);
1269     }
1270 
1271     FlowImbalanceMean += Mean;
1272     NumBlocksConsidered += NumBlocks;
1273     if (!NumBlocks)
1274       continue;
1275     double FuncMean = Mean / NumBlocks;
1276     if (FuncMean > WorstBias) {
1277       WorstBias = FuncMean;
1278       WorstBiasFunc = &Function;
1279     }
1280   }
1281   if (NumBlocksConsidered > 0)
1282     FlowImbalanceMean /= NumBlocksConsidered;
1283 
1284   // Compute standard deviation
1285   NumBlocksConsidered = 0;
1286   double FlowImbalanceVar = 0.0;
1287   for (const auto &BFI : BC.getBinaryFunctions()) {
1288     const BinaryFunction &Function = BFI.second;
1289     if (Function.empty() || !Function.isSimple())
1290       continue;
1291     FlowMapTy &IncomingMap = TotalIncomingMaps[&Function];
1292     FlowMapTy &OutgoingMap = TotalOutgoingMaps[&Function];
1293     for (const BinaryBasicBlock &BB : Function) {
1294       if (IncomingMap[&BB] < 100 || OutgoingMap[&BB] == 0)
1295         continue;
1296       ++NumBlocksConsidered;
1297       const double Difference = (double)OutgoingMap[&BB] - IncomingMap[&BB];
1298       FlowImbalanceVar +=
1299           pow(fabs(Difference / IncomingMap[&BB]) - FlowImbalanceMean, 2);
1300     }
1301   }
1302   if (NumBlocksConsidered) {
1303     FlowImbalanceVar /= NumBlocksConsidered;
1304     FlowImbalanceVar = sqrt(FlowImbalanceVar);
1305   }
1306 
1307   // Report to user
1308   outs() << format("BOLT-INFO: Profile bias score: %.4lf%% StDev: %.4lf%%\n",
1309                    (100.0 * FlowImbalanceMean), (100.0 * FlowImbalanceVar));
1310   if (WorstBiasFunc && opts::Verbosity >= 1) {
1311     outs() << "Worst average bias observed in " << WorstBiasFunc->getPrintName()
1312            << "\n";
1313     LLVM_DEBUG(WorstBiasFunc->dump());
1314   }
1315 }
1316 
1317 void PrintProgramStats::runOnFunctions(BinaryContext &BC) {
1318   uint64_t NumRegularFunctions = 0;
1319   uint64_t NumStaleProfileFunctions = 0;
1320   uint64_t NumNonSimpleProfiledFunctions = 0;
1321   uint64_t NumUnknownControlFlowFunctions = 0;
1322   uint64_t TotalSampleCount = 0;
1323   uint64_t StaleSampleCount = 0;
1324   std::vector<const BinaryFunction *> ProfiledFunctions;
1325   const char *StaleFuncsHeader = "BOLT-INFO: Functions with stale profile:\n";
1326   for (auto &BFI : BC.getBinaryFunctions()) {
1327     const BinaryFunction &Function = BFI.second;
1328 
1329     // Ignore PLT functions for stats.
1330     if (Function.isPLTFunction())
1331       continue;
1332 
1333     ++NumRegularFunctions;
1334 
1335     if (!Function.isSimple()) {
1336       if (Function.hasProfile())
1337         ++NumNonSimpleProfiledFunctions;
1338       continue;
1339     }
1340 
1341     if (Function.hasUnknownControlFlow()) {
1342       if (opts::PrintUnknownCFG)
1343         Function.dump();
1344       else if (opts::PrintUnknown)
1345         errs() << "function with unknown control flow: " << Function << '\n';
1346 
1347       ++NumUnknownControlFlowFunctions;
1348     }
1349 
1350     if (!Function.hasProfile())
1351       continue;
1352 
1353     uint64_t SampleCount = Function.getRawBranchCount();
1354     TotalSampleCount += SampleCount;
1355 
1356     if (Function.hasValidProfile()) {
1357       ProfiledFunctions.push_back(&Function);
1358     } else {
1359       if (opts::ReportStaleFuncs) {
1360         outs() << StaleFuncsHeader;
1361         StaleFuncsHeader = "";
1362         outs() << "  " << Function << '\n';
1363       }
1364       ++NumStaleProfileFunctions;
1365       StaleSampleCount += SampleCount;
1366     }
1367   }
1368   BC.NumProfiledFuncs = ProfiledFunctions.size();
1369 
1370   const size_t NumAllProfiledFunctions =
1371       ProfiledFunctions.size() + NumStaleProfileFunctions;
1372   outs() << "BOLT-INFO: " << NumAllProfiledFunctions << " out of "
1373          << NumRegularFunctions << " functions in the binary ("
1374          << format("%.1f", NumAllProfiledFunctions /
1375                                (float)NumRegularFunctions * 100.0f)
1376          << "%) have non-empty execution profile\n";
1377   if (NumNonSimpleProfiledFunctions) {
1378     outs() << "BOLT-INFO: " << NumNonSimpleProfiledFunctions << " function"
1379            << (NumNonSimpleProfiledFunctions == 1 ? "" : "s")
1380            << " with profile could not be optimized\n";
1381   }
1382   if (NumStaleProfileFunctions) {
1383     const float PctStale =
1384         NumStaleProfileFunctions / (float)NumAllProfiledFunctions * 100.0f;
1385     auto printErrorOrWarning = [&]() {
1386       if (PctStale > opts::StaleThreshold)
1387         errs() << "BOLT-ERROR: ";
1388       else
1389         errs() << "BOLT-WARNING: ";
1390     };
1391     printErrorOrWarning();
1392     errs() << NumStaleProfileFunctions
1393            << format(" (%.1f%% of all profiled)", PctStale) << " function"
1394            << (NumStaleProfileFunctions == 1 ? "" : "s")
1395            << " have invalid (possibly stale) profile."
1396               " Use -report-stale to see the list.\n";
1397     if (TotalSampleCount > 0) {
1398       printErrorOrWarning();
1399       errs() << StaleSampleCount << " out of " << TotalSampleCount
1400              << " samples in the binary ("
1401              << format("%.1f", ((100.0f * StaleSampleCount) / TotalSampleCount))
1402              << "%) belong to functions with invalid"
1403                 " (possibly stale) profile.\n";
1404     }
1405     if (PctStale > opts::StaleThreshold) {
1406       errs() << "BOLT-ERROR: stale functions exceed specified threshold of "
1407              << opts::StaleThreshold << "%. Exiting.\n";
1408       exit(1);
1409     }
1410   }
1411 
1412   if (const uint64_t NumUnusedObjects = BC.getNumUnusedProfiledObjects()) {
1413     outs() << "BOLT-INFO: profile for " << NumUnusedObjects
1414            << " objects was ignored\n";
1415   }
1416 
1417   if (ProfiledFunctions.size() > 10) {
1418     if (opts::Verbosity >= 1) {
1419       outs() << "BOLT-INFO: top called functions are:\n";
1420       std::sort(ProfiledFunctions.begin(), ProfiledFunctions.end(),
1421                 [](const BinaryFunction *A, const BinaryFunction *B) {
1422                   return B->getExecutionCount() < A->getExecutionCount();
1423                 });
1424       auto SFI = ProfiledFunctions.begin();
1425       auto SFIend = ProfiledFunctions.end();
1426       for (unsigned I = 0u; I < opts::TopCalledLimit && SFI != SFIend;
1427            ++SFI, ++I)
1428         outs() << "  " << **SFI << " : " << (*SFI)->getExecutionCount() << '\n';
1429     }
1430   }
1431 
1432   if (!opts::PrintSortedBy.empty() &&
1433       std::find(opts::PrintSortedBy.begin(), opts::PrintSortedBy.end(),
1434                 DynoStats::FIRST_DYNO_STAT) == opts::PrintSortedBy.end()) {
1435 
1436     std::vector<const BinaryFunction *> Functions;
1437     std::map<const BinaryFunction *, DynoStats> Stats;
1438 
1439     for (const auto &BFI : BC.getBinaryFunctions()) {
1440       const BinaryFunction &BF = BFI.second;
1441       if (shouldOptimize(BF) && BF.hasValidProfile()) {
1442         Functions.push_back(&BF);
1443         Stats.emplace(&BF, getDynoStats(BF));
1444       }
1445     }
1446 
1447     const bool SortAll =
1448         std::find(opts::PrintSortedBy.begin(), opts::PrintSortedBy.end(),
1449                   DynoStats::LAST_DYNO_STAT) != opts::PrintSortedBy.end();
1450 
1451     const bool Ascending =
1452         opts::DynoStatsSortOrderOpt == opts::DynoStatsSortOrder::Ascending;
1453 
1454     if (SortAll) {
1455       std::stable_sort(Functions.begin(), Functions.end(),
1456                        [Ascending, &Stats](const BinaryFunction *A,
1457                                            const BinaryFunction *B) {
1458                          return Ascending ? Stats.at(A) < Stats.at(B)
1459                                           : Stats.at(B) < Stats.at(A);
1460                        });
1461     } else {
1462       std::stable_sort(
1463           Functions.begin(), Functions.end(),
1464           [Ascending, &Stats](const BinaryFunction *A,
1465                               const BinaryFunction *B) {
1466             const DynoStats &StatsA = Stats.at(A);
1467             const DynoStats &StatsB = Stats.at(B);
1468             return Ascending ? StatsA.lessThan(StatsB, opts::PrintSortedBy)
1469                              : StatsB.lessThan(StatsA, opts::PrintSortedBy);
1470           });
1471     }
1472 
1473     outs() << "BOLT-INFO: top functions sorted by ";
1474     if (SortAll) {
1475       outs() << "dyno stats";
1476     } else {
1477       outs() << "(";
1478       bool PrintComma = false;
1479       for (const DynoStats::Category Category : opts::PrintSortedBy) {
1480         if (PrintComma)
1481           outs() << ", ";
1482         outs() << DynoStats::Description(Category);
1483         PrintComma = true;
1484       }
1485       outs() << ")";
1486     }
1487 
1488     outs() << " are:\n";
1489     auto SFI = Functions.begin();
1490     for (unsigned I = 0; I < 100 && SFI != Functions.end(); ++SFI, ++I) {
1491       const DynoStats Stats = getDynoStats(**SFI);
1492       outs() << "  " << **SFI;
1493       if (!SortAll) {
1494         outs() << " (";
1495         bool PrintComma = false;
1496         for (const DynoStats::Category Category : opts::PrintSortedBy) {
1497           if (PrintComma)
1498             outs() << ", ";
1499           outs() << dynoStatsOptName(Category) << "=" << Stats[Category];
1500           PrintComma = true;
1501         }
1502         outs() << ")";
1503       }
1504       outs() << "\n";
1505     }
1506   }
1507 
1508   if (!BC.TrappedFunctions.empty()) {
1509     errs() << "BOLT-WARNING: " << BC.TrappedFunctions.size() << " function"
1510            << (BC.TrappedFunctions.size() > 1 ? "s" : "")
1511            << " will trap on entry. Use -trap-avx512=0 to disable"
1512               " traps.";
1513     if (opts::Verbosity >= 1 || BC.TrappedFunctions.size() <= 5) {
1514       errs() << '\n';
1515       for (const BinaryFunction *Function : BC.TrappedFunctions)
1516         errs() << "  " << *Function << '\n';
1517     } else {
1518       errs() << " Use -v=1 to see the list.\n";
1519     }
1520   }
1521 
1522   // Print information on missed macro-fusion opportunities seen on input.
1523   if (BC.MissedMacroFusionPairs) {
1524     outs() << "BOLT-INFO: the input contains " << BC.MissedMacroFusionPairs
1525            << " (dynamic count : " << BC.MissedMacroFusionExecCount
1526            << ") opportunities for macro-fusion optimization";
1527     switch (opts::AlignMacroOpFusion) {
1528     case MFT_NONE:
1529       outs() << ". Use -align-macro-fusion to fix.\n";
1530       break;
1531     case MFT_HOT:
1532       outs() << ". Will fix instances on a hot path.\n";
1533       break;
1534     case MFT_ALL:
1535       outs() << " that are going to be fixed\n";
1536       break;
1537     }
1538   }
1539 
1540   // Collect and print information about suboptimal code layout on input.
1541   if (opts::ReportBadLayout) {
1542     std::vector<const BinaryFunction *> SuboptimalFuncs;
1543     for (auto &BFI : BC.getBinaryFunctions()) {
1544       const BinaryFunction &BF = BFI.second;
1545       if (!BF.hasValidProfile())
1546         continue;
1547 
1548       const uint64_t HotThreshold =
1549           std::max<uint64_t>(BF.getKnownExecutionCount(), 1);
1550       bool HotSeen = false;
1551       for (const BinaryBasicBlock *BB : BF.rlayout()) {
1552         if (!HotSeen && BB->getKnownExecutionCount() > HotThreshold) {
1553           HotSeen = true;
1554           continue;
1555         }
1556         if (HotSeen && BB->getKnownExecutionCount() == 0) {
1557           SuboptimalFuncs.push_back(&BF);
1558           break;
1559         }
1560       }
1561     }
1562 
1563     if (!SuboptimalFuncs.empty()) {
1564       std::sort(SuboptimalFuncs.begin(), SuboptimalFuncs.end(),
1565                 [](const BinaryFunction *A, const BinaryFunction *B) {
1566                   return A->getKnownExecutionCount() / A->getSize() >
1567                          B->getKnownExecutionCount() / B->getSize();
1568                 });
1569 
1570       outs() << "BOLT-INFO: " << SuboptimalFuncs.size()
1571              << " functions have "
1572                 "cold code in the middle of hot code. Top functions are:\n";
1573       for (unsigned I = 0;
1574            I < std::min(static_cast<size_t>(opts::ReportBadLayout),
1575                         SuboptimalFuncs.size());
1576            ++I)
1577         SuboptimalFuncs[I]->print(outs());
1578     }
1579   }
1580 
1581   if (NumUnknownControlFlowFunctions) {
1582     outs() << "BOLT-INFO: " << NumUnknownControlFlowFunctions
1583            << " functions have instructions with unknown control flow";
1584     if (!opts::PrintUnknown)
1585       outs() << ". Use -print-unknown to see the list.";
1586     outs() << '\n';
1587   }
1588 }
1589 
1590 void InstructionLowering::runOnFunctions(BinaryContext &BC) {
1591   for (auto &BFI : BC.getBinaryFunctions())
1592     for (BinaryBasicBlock &BB : BFI.second)
1593       for (MCInst &Instruction : BB)
1594         BC.MIB->lowerTailCall(Instruction);
1595 }
1596 
1597 void StripRepRet::runOnFunctions(BinaryContext &BC) {
1598   uint64_t NumPrefixesRemoved = 0;
1599   uint64_t NumBytesSaved = 0;
1600   for (auto &BFI : BC.getBinaryFunctions()) {
1601     for (BinaryBasicBlock &BB : BFI.second) {
1602       auto LastInstRIter = BB.getLastNonPseudo();
1603       if (LastInstRIter == BB.rend() || !BC.MIB->isReturn(*LastInstRIter) ||
1604           !BC.MIB->deleteREPPrefix(*LastInstRIter))
1605         continue;
1606 
1607       NumPrefixesRemoved += BB.getKnownExecutionCount();
1608       ++NumBytesSaved;
1609     }
1610   }
1611 
1612   if (NumBytesSaved)
1613     outs() << "BOLT-INFO: removed " << NumBytesSaved
1614            << " 'repz' prefixes"
1615               " with estimated execution count of "
1616            << NumPrefixesRemoved << " times.\n";
1617 }
1618 
1619 void InlineMemcpy::runOnFunctions(BinaryContext &BC) {
1620   if (!BC.isX86())
1621     return;
1622 
1623   uint64_t NumInlined = 0;
1624   uint64_t NumInlinedDyno = 0;
1625   for (auto &BFI : BC.getBinaryFunctions()) {
1626     for (BinaryBasicBlock &BB : BFI.second) {
1627       for (auto II = BB.begin(); II != BB.end(); ++II) {
1628         MCInst &Inst = *II;
1629 
1630         if (!BC.MIB->isCall(Inst) || MCPlus::getNumPrimeOperands(Inst) != 1 ||
1631             !Inst.getOperand(0).isExpr())
1632           continue;
1633 
1634         const MCSymbol *CalleeSymbol = BC.MIB->getTargetSymbol(Inst);
1635         if (CalleeSymbol->getName() != "memcpy" &&
1636             CalleeSymbol->getName() != "memcpy@PLT" &&
1637             CalleeSymbol->getName() != "_memcpy8")
1638           continue;
1639 
1640         const bool IsMemcpy8 = (CalleeSymbol->getName() == "_memcpy8");
1641         const bool IsTailCall = BC.MIB->isTailCall(Inst);
1642 
1643         const InstructionListType NewCode =
1644             BC.MIB->createInlineMemcpy(IsMemcpy8);
1645         II = BB.replaceInstruction(II, NewCode);
1646         std::advance(II, NewCode.size() - 1);
1647         if (IsTailCall) {
1648           MCInst Return;
1649           BC.MIB->createReturn(Return);
1650           II = BB.insertInstruction(std::next(II), std::move(Return));
1651         }
1652 
1653         ++NumInlined;
1654         NumInlinedDyno += BB.getKnownExecutionCount();
1655       }
1656     }
1657   }
1658 
1659   if (NumInlined) {
1660     outs() << "BOLT-INFO: inlined " << NumInlined << " memcpy() calls";
1661     if (NumInlinedDyno)
1662       outs() << ". The calls were executed " << NumInlinedDyno
1663              << " times based on profile.";
1664     outs() << '\n';
1665   }
1666 }
1667 
1668 bool SpecializeMemcpy1::shouldOptimize(const BinaryFunction &Function) const {
1669   if (!BinaryFunctionPass::shouldOptimize(Function))
1670     return false;
1671 
1672   for (const std::string &FunctionSpec : Spec) {
1673     StringRef FunctionName = StringRef(FunctionSpec).split(':').first;
1674     if (Function.hasNameRegex(FunctionName))
1675       return true;
1676   }
1677 
1678   return false;
1679 }
1680 
1681 std::set<size_t> SpecializeMemcpy1::getCallSitesToOptimize(
1682     const BinaryFunction &Function) const {
1683   StringRef SitesString;
1684   for (const std::string &FunctionSpec : Spec) {
1685     StringRef FunctionName;
1686     std::tie(FunctionName, SitesString) = StringRef(FunctionSpec).split(':');
1687     if (Function.hasNameRegex(FunctionName))
1688       break;
1689     SitesString = "";
1690   }
1691 
1692   std::set<size_t> Sites;
1693   SmallVector<StringRef, 4> SitesVec;
1694   SitesString.split(SitesVec, ':');
1695   for (StringRef SiteString : SitesVec) {
1696     if (SiteString.empty())
1697       continue;
1698     size_t Result;
1699     if (!SiteString.getAsInteger(10, Result))
1700       Sites.emplace(Result);
1701   }
1702 
1703   return Sites;
1704 }
1705 
1706 void SpecializeMemcpy1::runOnFunctions(BinaryContext &BC) {
1707   if (!BC.isX86())
1708     return;
1709 
1710   uint64_t NumSpecialized = 0;
1711   uint64_t NumSpecializedDyno = 0;
1712   for (auto &BFI : BC.getBinaryFunctions()) {
1713     BinaryFunction &Function = BFI.second;
1714     if (!shouldOptimize(Function))
1715       continue;
1716 
1717     std::set<size_t> CallsToOptimize = getCallSitesToOptimize(Function);
1718     auto shouldOptimize = [&](size_t N) {
1719       return CallsToOptimize.empty() || CallsToOptimize.count(N);
1720     };
1721 
1722     std::vector<BinaryBasicBlock *> Blocks(Function.pbegin(), Function.pend());
1723     size_t CallSiteID = 0;
1724     for (BinaryBasicBlock *CurBB : Blocks) {
1725       for (auto II = CurBB->begin(); II != CurBB->end(); ++II) {
1726         MCInst &Inst = *II;
1727 
1728         if (!BC.MIB->isCall(Inst) || MCPlus::getNumPrimeOperands(Inst) != 1 ||
1729             !Inst.getOperand(0).isExpr())
1730           continue;
1731 
1732         const MCSymbol *CalleeSymbol = BC.MIB->getTargetSymbol(Inst);
1733         if (CalleeSymbol->getName() != "memcpy" &&
1734             CalleeSymbol->getName() != "memcpy@PLT")
1735           continue;
1736 
1737         if (BC.MIB->isTailCall(Inst))
1738           continue;
1739 
1740         ++CallSiteID;
1741 
1742         if (!shouldOptimize(CallSiteID))
1743           continue;
1744 
1745         // Create a copy of a call to memcpy(dest, src, size).
1746         MCInst MemcpyInstr = Inst;
1747 
1748         BinaryBasicBlock *OneByteMemcpyBB = CurBB->splitAt(II);
1749 
1750         BinaryBasicBlock *NextBB = nullptr;
1751         if (OneByteMemcpyBB->getNumNonPseudos() > 1) {
1752           NextBB = OneByteMemcpyBB->splitAt(OneByteMemcpyBB->begin());
1753           NextBB->eraseInstruction(NextBB->begin());
1754         } else {
1755           NextBB = OneByteMemcpyBB->getSuccessor();
1756           OneByteMemcpyBB->eraseInstruction(OneByteMemcpyBB->begin());
1757           assert(NextBB && "unexpected call to memcpy() with no return");
1758         }
1759 
1760         BinaryBasicBlock *MemcpyBB =
1761             Function.addBasicBlock(CurBB->getInputOffset());
1762         InstructionListType CmpJCC =
1763             BC.MIB->createCmpJE(BC.MIB->getIntArgRegister(2), 1,
1764                                 OneByteMemcpyBB->getLabel(), BC.Ctx.get());
1765         CurBB->addInstructions(CmpJCC);
1766         CurBB->addSuccessor(MemcpyBB);
1767 
1768         MemcpyBB->addInstruction(std::move(MemcpyInstr));
1769         MemcpyBB->addSuccessor(NextBB);
1770         MemcpyBB->setCFIState(NextBB->getCFIState());
1771         MemcpyBB->setExecutionCount(0);
1772 
1773         // To prevent the actual call from being moved to cold, we set its
1774         // execution count to 1.
1775         if (CurBB->getKnownExecutionCount() > 0)
1776           MemcpyBB->setExecutionCount(1);
1777 
1778         InstructionListType OneByteMemcpy = BC.MIB->createOneByteMemcpy();
1779         OneByteMemcpyBB->addInstructions(OneByteMemcpy);
1780 
1781         ++NumSpecialized;
1782         NumSpecializedDyno += CurBB->getKnownExecutionCount();
1783 
1784         CurBB = NextBB;
1785 
1786         // Note: we don't expect the next instruction to be a call to memcpy.
1787         II = CurBB->begin();
1788       }
1789     }
1790   }
1791 
1792   if (NumSpecialized) {
1793     outs() << "BOLT-INFO: specialized " << NumSpecialized
1794            << " memcpy() call sites for size 1";
1795     if (NumSpecializedDyno)
1796       outs() << ". The calls were executed " << NumSpecializedDyno
1797              << " times based on profile.";
1798     outs() << '\n';
1799   }
1800 }
1801 
1802 void RemoveNops::runOnFunction(BinaryFunction &BF) {
1803   const BinaryContext &BC = BF.getBinaryContext();
1804   for (BinaryBasicBlock &BB : BF) {
1805     for (int64_t I = BB.size() - 1; I >= 0; --I) {
1806       MCInst &Inst = BB.getInstructionAtIndex(I);
1807       if (BC.MIB->isNoop(Inst) && BC.MIB->hasAnnotation(Inst, "NOP"))
1808         BB.eraseInstructionAtIndex(I);
1809     }
1810   }
1811 }
1812 
1813 void RemoveNops::runOnFunctions(BinaryContext &BC) {
1814   ParallelUtilities::WorkFuncTy WorkFun = [&](BinaryFunction &BF) {
1815     runOnFunction(BF);
1816   };
1817 
1818   ParallelUtilities::PredicateTy SkipFunc = [&](const BinaryFunction &BF) {
1819     return BF.shouldPreserveNops();
1820   };
1821 
1822   ParallelUtilities::runOnEachFunction(
1823       BC, ParallelUtilities::SchedulingPolicy::SP_INST_LINEAR, WorkFun,
1824       SkipFunc, "RemoveNops");
1825 }
1826 
1827 } // namespace bolt
1828 } // namespace llvm
1829