1 //===- bolt/Passes/BinaryPasses.cpp - Binary-level passes -----------------===//
2 //
3 // Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
4 // See https://llvm.org/LICENSE.txt for license information.
5 // SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
6 //
7 //===----------------------------------------------------------------------===//
8 //
9 // This file implements multiple passes for binary optimization and analysis.
10 //
11 //===----------------------------------------------------------------------===//
12 
13 #include "bolt/Passes/BinaryPasses.h"
14 #include "bolt/Core/ParallelUtilities.h"
15 #include "bolt/Passes/ReorderAlgorithm.h"
16 #include "bolt/Passes/ReorderFunctions.h"
17 #include "llvm/Support/CommandLine.h"
18 
19 #include <numeric>
20 #include <vector>
21 
22 #define DEBUG_TYPE "bolt-opts"
23 
24 using namespace llvm;
25 using namespace bolt;
26 
27 namespace {
28 
29 const char *dynoStatsOptName(const bolt::DynoStats::Category C) {
30   if (C == bolt::DynoStats::FIRST_DYNO_STAT)
31     return "none";
32   else if (C == bolt::DynoStats::LAST_DYNO_STAT)
33     return "all";
34 
35   static std::string OptNames[bolt::DynoStats::LAST_DYNO_STAT + 1];
36 
37   OptNames[C] = bolt::DynoStats::Description(C);
38 
39   std::replace(OptNames[C].begin(), OptNames[C].end(), ' ', '-');
40 
41   return OptNames[C].c_str();
42 }
43 
44 const char *dynoStatsOptDesc(const bolt::DynoStats::Category C) {
45   if (C == bolt::DynoStats::FIRST_DYNO_STAT)
46     return "unsorted";
47   else if (C == bolt::DynoStats::LAST_DYNO_STAT)
48     return "sorted by all stats";
49 
50   return bolt::DynoStats::Description(C);
51 }
52 
53 }
54 
55 namespace opts {
56 
57 extern cl::OptionCategory BoltCategory;
58 extern cl::OptionCategory BoltOptCategory;
59 
60 extern cl::opt<bolt::MacroFusionType> AlignMacroOpFusion;
61 extern cl::opt<unsigned> Verbosity;
62 extern cl::opt<bool> EnableBAT;
63 extern cl::opt<unsigned> ExecutionCountThreshold;
64 extern cl::opt<bool> UpdateDebugSections;
65 extern cl::opt<bolt::ReorderFunctions::ReorderType> ReorderFunctions;
66 
67 enum DynoStatsSortOrder : char {
68   Ascending,
69   Descending
70 };
71 
72 static cl::opt<DynoStatsSortOrder>
73 DynoStatsSortOrderOpt("print-sorted-by-order",
74   cl::desc("use ascending or descending order when printing functions "
75            "ordered by dyno stats"),
76   cl::ZeroOrMore,
77   cl::init(DynoStatsSortOrder::Descending),
78   cl::cat(BoltOptCategory));
79 
80 cl::list<std::string>
81 HotTextMoveSections("hot-text-move-sections",
82   cl::desc("list of sections containing functions used for hugifying hot text. "
83            "BOLT makes sure these functions are not placed on the same page as "
84            "the hot text. (default=\'.stub,.mover\')."),
85   cl::value_desc("sec1,sec2,sec3,..."),
86   cl::CommaSeparated,
87   cl::ZeroOrMore,
88   cl::cat(BoltCategory));
89 
90 bool isHotTextMover(const BinaryFunction &Function) {
91   for (std::string &SectionName : opts::HotTextMoveSections) {
92     if (Function.getOriginSectionName() &&
93         *Function.getOriginSectionName() == SectionName)
94       return true;
95   }
96 
97   return false;
98 }
99 
100 static cl::opt<bool>
101 MinBranchClusters("min-branch-clusters",
102   cl::desc("use a modified clustering algorithm geared towards minimizing "
103            "branches"),
104   cl::ZeroOrMore,
105   cl::Hidden,
106   cl::cat(BoltOptCategory));
107 
108 static cl::list<Peepholes::PeepholeOpts> Peepholes(
109     "peepholes", cl::CommaSeparated, cl::desc("enable peephole optimizations"),
110     cl::value_desc("opt1,opt2,opt3,..."),
111     cl::values(clEnumValN(Peepholes::PEEP_NONE, "none", "disable peepholes"),
112                clEnumValN(Peepholes::PEEP_DOUBLE_JUMPS, "double-jumps",
113                           "remove double jumps when able"),
114                clEnumValN(Peepholes::PEEP_TAILCALL_TRAPS, "tailcall-traps",
115                           "insert tail call traps"),
116                clEnumValN(Peepholes::PEEP_USELESS_BRANCHES, "useless-branches",
117                           "remove useless conditional branches"),
118                clEnumValN(Peepholes::PEEP_ALL, "all",
119                           "enable all peephole optimizations")),
120     cl::ZeroOrMore, cl::cat(BoltOptCategory));
121 
122 static cl::opt<unsigned>
123 PrintFuncStat("print-function-statistics",
124   cl::desc("print statistics about basic block ordering"),
125   cl::init(0),
126   cl::ZeroOrMore,
127   cl::cat(BoltOptCategory));
128 
129 static cl::list<bolt::DynoStats::Category>
130 PrintSortedBy("print-sorted-by",
131   cl::CommaSeparated,
132   cl::desc("print functions sorted by order of dyno stats"),
133   cl::value_desc("key1,key2,key3,..."),
134   cl::values(
135 #define D(name, ...)                                        \
136     clEnumValN(bolt::DynoStats::name,                     \
137                dynoStatsOptName(bolt::DynoStats::name),   \
138                dynoStatsOptDesc(bolt::DynoStats::name)),
139     DYNO_STATS
140 #undef D
141     clEnumValN(0xffff, ".", ".")
142     ),
143   cl::ZeroOrMore,
144   cl::cat(BoltOptCategory));
145 
146 static cl::opt<bool>
147 PrintUnknown("print-unknown",
148   cl::desc("print names of functions with unknown control flow"),
149   cl::init(false),
150   cl::ZeroOrMore,
151   cl::cat(BoltCategory),
152   cl::Hidden);
153 
154 static cl::opt<bool>
155 PrintUnknownCFG("print-unknown-cfg",
156   cl::desc("dump CFG of functions with unknown control flow"),
157   cl::init(false),
158   cl::ZeroOrMore,
159   cl::cat(BoltCategory),
160   cl::ReallyHidden);
161 
162 cl::opt<bolt::ReorderBasicBlocks::LayoutType> ReorderBlocks(
163     "reorder-blocks", cl::desc("change layout of basic blocks in a function"),
164     cl::init(bolt::ReorderBasicBlocks::LT_NONE),
165     cl::values(
166         clEnumValN(bolt::ReorderBasicBlocks::LT_NONE, "none",
167                    "do not reorder basic blocks"),
168         clEnumValN(bolt::ReorderBasicBlocks::LT_REVERSE, "reverse",
169                    "layout blocks in reverse order"),
170         clEnumValN(bolt::ReorderBasicBlocks::LT_OPTIMIZE, "normal",
171                    "perform optimal layout based on profile"),
172         clEnumValN(bolt::ReorderBasicBlocks::LT_OPTIMIZE_BRANCH,
173                    "branch-predictor",
174                    "perform optimal layout prioritizing branch "
175                    "predictions"),
176         clEnumValN(bolt::ReorderBasicBlocks::LT_OPTIMIZE_CACHE, "cache",
177                    "perform optimal layout prioritizing I-cache "
178                    "behavior"),
179         clEnumValN(bolt::ReorderBasicBlocks::LT_OPTIMIZE_EXT_TSP, "cache+",
180                    "perform layout optimizing I-cache behavior"),
181         clEnumValN(bolt::ReorderBasicBlocks::LT_OPTIMIZE_EXT_TSP, "ext-tsp",
182                    "perform layout optimizing I-cache behavior"),
183         clEnumValN(bolt::ReorderBasicBlocks::LT_OPTIMIZE_SHUFFLE,
184                    "cluster-shuffle", "perform random layout of clusters")),
185     cl::ZeroOrMore, cl::cat(BoltOptCategory));
186 
187 static cl::opt<unsigned>
188 ReportBadLayout("report-bad-layout",
189   cl::desc("print top <uint> functions with suboptimal code layout on input"),
190   cl::init(0),
191   cl::ZeroOrMore,
192   cl::Hidden,
193   cl::cat(BoltOptCategory));
194 
195 static cl::opt<bool>
196 ReportStaleFuncs("report-stale",
197   cl::desc("print the list of functions with stale profile"),
198   cl::init(false),
199   cl::ZeroOrMore,
200   cl::Hidden,
201   cl::cat(BoltOptCategory));
202 
203 enum SctcModes : char {
204   SctcAlways,
205   SctcPreserveDirection,
206   SctcHeuristic
207 };
208 
209 static cl::opt<SctcModes>
210 SctcMode("sctc-mode",
211   cl::desc("mode for simplify conditional tail calls"),
212   cl::init(SctcAlways),
213   cl::values(clEnumValN(SctcAlways, "always", "always perform sctc"),
214     clEnumValN(SctcPreserveDirection,
215       "preserve",
216       "only perform sctc when branch direction is "
217       "preserved"),
218     clEnumValN(SctcHeuristic,
219       "heuristic",
220       "use branch prediction data to control sctc")),
221   cl::ZeroOrMore,
222   cl::cat(BoltOptCategory));
223 
224 static cl::opt<unsigned>
225 StaleThreshold("stale-threshold",
226     cl::desc(
227       "maximum percentage of stale functions to tolerate (default: 100)"),
228     cl::init(100),
229     cl::Hidden,
230     cl::cat(BoltOptCategory));
231 
232 static cl::opt<unsigned>
233 TSPThreshold("tsp-threshold",
234   cl::desc("maximum number of hot basic blocks in a function for which to use "
235            "a precise TSP solution while re-ordering basic blocks"),
236   cl::init(10),
237   cl::ZeroOrMore,
238   cl::Hidden,
239   cl::cat(BoltOptCategory));
240 
241 static cl::opt<unsigned>
242 TopCalledLimit("top-called-limit",
243   cl::desc("maximum number of functions to print in top called "
244            "functions section"),
245   cl::init(100),
246   cl::ZeroOrMore,
247   cl::Hidden,
248   cl::cat(BoltCategory));
249 
250 } // namespace opts
251 
252 namespace llvm {
253 namespace bolt {
254 
255 bool BinaryFunctionPass::shouldOptimize(const BinaryFunction &BF) const {
256   return BF.isSimple() && BF.getState() == BinaryFunction::State::CFG &&
257          !BF.isIgnored();
258 }
259 
260 bool BinaryFunctionPass::shouldPrint(const BinaryFunction &BF) const {
261   return BF.isSimple() && !BF.isIgnored();
262 }
263 
264 void NormalizeCFG::runOnFunction(BinaryFunction &BF) {
265   uint64_t NumRemoved = 0;
266   uint64_t NumDuplicateEdges = 0;
267   uint64_t NeedsFixBranches = 0;
268   for (BinaryBasicBlock &BB : BF) {
269     if (!BB.empty())
270       continue;
271 
272     if (BB.isEntryPoint() || BB.isLandingPad())
273       continue;
274 
275     // Handle a dangling empty block.
276     if (BB.succ_size() == 0) {
277       // If an empty dangling basic block has a predecessor, it could be a
278       // result of codegen for __builtin_unreachable. In such case, do not
279       // remove the block.
280       if (BB.pred_size() == 0) {
281         BB.markValid(false);
282         ++NumRemoved;
283       }
284       continue;
285     }
286 
287     // The block should have just one successor.
288     BinaryBasicBlock *Successor = BB.getSuccessor();
289     assert(Successor && "invalid CFG encountered");
290 
291     // Redirect all predecessors to the successor block.
292     while (!BB.pred_empty()) {
293       BinaryBasicBlock *Predecessor = *BB.pred_begin();
294       if (Predecessor->hasJumpTable())
295         break;
296 
297       if (Predecessor == Successor)
298         break;
299 
300       BinaryBasicBlock::BinaryBranchInfo &BI = Predecessor->getBranchInfo(BB);
301       Predecessor->replaceSuccessor(&BB, Successor, BI.Count,
302                                     BI.MispredictedCount);
303       // We need to fix branches even if we failed to replace all successors
304       // and remove the block.
305       NeedsFixBranches = true;
306     }
307 
308     if (BB.pred_empty()) {
309       BB.removeAllSuccessors();
310       BB.markValid(false);
311       ++NumRemoved;
312     }
313   }
314 
315   if (NumRemoved)
316     BF.eraseInvalidBBs();
317 
318   // Check for duplicate successors. Do it after the empty block elimination as
319   // we can get more duplicate successors.
320   for (BinaryBasicBlock &BB : BF)
321     if (!BB.hasJumpTable() && BB.succ_size() == 2 &&
322         BB.getConditionalSuccessor(false) == BB.getConditionalSuccessor(true))
323       ++NumDuplicateEdges;
324 
325   // fixBranches() will get rid of duplicate edges and update jump instructions.
326   if (NumDuplicateEdges || NeedsFixBranches)
327     BF.fixBranches();
328 
329   NumDuplicateEdgesMerged += NumDuplicateEdges;
330   NumBlocksRemoved += NumRemoved;
331 }
332 
333 void NormalizeCFG::runOnFunctions(BinaryContext &BC) {
334   ParallelUtilities::runOnEachFunction(
335       BC, ParallelUtilities::SchedulingPolicy::SP_BB_LINEAR,
336       [&](BinaryFunction &BF) { runOnFunction(BF); },
337       [&](const BinaryFunction &BF) { return !shouldOptimize(BF); },
338       "NormalizeCFG");
339   if (NumBlocksRemoved)
340     outs() << "BOLT-INFO: removed " << NumBlocksRemoved << " empty block"
341            << (NumBlocksRemoved == 1 ? "" : "s") << '\n';
342   if (NumDuplicateEdgesMerged)
343     outs() << "BOLT-INFO: merged " << NumDuplicateEdgesMerged
344            << " duplicate CFG edge" << (NumDuplicateEdgesMerged == 1 ? "" : "s")
345            << '\n';
346 }
347 
348 void EliminateUnreachableBlocks::runOnFunction(BinaryFunction &Function) {
349   if (Function.layout_size() > 0) {
350     unsigned Count;
351     uint64_t Bytes;
352     Function.markUnreachableBlocks();
353     LLVM_DEBUG({
354       for (BinaryBasicBlock *BB : Function.layout()) {
355         if (!BB->isValid()) {
356           dbgs() << "BOLT-INFO: UCE found unreachable block " << BB->getName()
357                  << " in function " << Function << "\n";
358           Function.dump();
359         }
360       }
361     });
362     std::tie(Count, Bytes) = Function.eraseInvalidBBs();
363     DeletedBlocks += Count;
364     DeletedBytes += Bytes;
365     if (Count) {
366       Modified.insert(&Function);
367       if (opts::Verbosity > 0)
368         outs() << "BOLT-INFO: Removed " << Count
369                << " dead basic block(s) accounting for " << Bytes
370                << " bytes in function " << Function << '\n';
371     }
372   }
373 }
374 
375 void EliminateUnreachableBlocks::runOnFunctions(BinaryContext &BC) {
376   for (auto &It : BC.getBinaryFunctions()) {
377     BinaryFunction &Function = It.second;
378     if (shouldOptimize(Function))
379       runOnFunction(Function);
380   }
381 
382   outs() << "BOLT-INFO: UCE removed " << DeletedBlocks << " blocks and "
383          << DeletedBytes << " bytes of code.\n";
384 }
385 
386 bool ReorderBasicBlocks::shouldPrint(const BinaryFunction &BF) const {
387   return (BinaryFunctionPass::shouldPrint(BF) &&
388           opts::ReorderBlocks != ReorderBasicBlocks::LT_NONE);
389 }
390 
391 bool ReorderBasicBlocks::shouldOptimize(const BinaryFunction &BF) const {
392   // Apply execution count threshold
393   if (BF.getKnownExecutionCount() < opts::ExecutionCountThreshold)
394     return false;
395 
396   return BinaryFunctionPass::shouldOptimize(BF);
397 }
398 
399 void ReorderBasicBlocks::runOnFunctions(BinaryContext &BC) {
400   if (opts::ReorderBlocks == ReorderBasicBlocks::LT_NONE)
401     return;
402 
403   std::atomic<uint64_t> ModifiedFuncCount{0};
404 
405   ParallelUtilities::WorkFuncTy WorkFun = [&](BinaryFunction &BF) {
406     modifyFunctionLayout(BF, opts::ReorderBlocks, opts::MinBranchClusters);
407     if (BF.hasLayoutChanged())
408       ++ModifiedFuncCount;
409   };
410 
411   ParallelUtilities::PredicateTy SkipFunc = [&](const BinaryFunction &BF) {
412     return !shouldOptimize(BF);
413   };
414 
415   ParallelUtilities::runOnEachFunction(
416       BC, ParallelUtilities::SchedulingPolicy::SP_BB_LINEAR, WorkFun, SkipFunc,
417       "ReorderBasicBlocks");
418 
419   outs() << "BOLT-INFO: basic block reordering modified layout of "
420          << format("%zu (%.2lf%%) functions\n", ModifiedFuncCount.load(),
421                    100.0 * ModifiedFuncCount.load() /
422                        BC.getBinaryFunctions().size());
423 
424   if (opts::PrintFuncStat > 0) {
425     raw_ostream &OS = outs();
426     // Copy all the values into vector in order to sort them
427     std::map<uint64_t, BinaryFunction &> ScoreMap;
428     auto &BFs = BC.getBinaryFunctions();
429     for (auto It = BFs.begin(); It != BFs.end(); ++It)
430       ScoreMap.insert(std::pair<uint64_t, BinaryFunction &>(
431           It->second.getFunctionScore(), It->second));
432 
433     OS << "\nBOLT-INFO: Printing Function Statistics:\n\n";
434     OS << "           There are " << BFs.size() << " functions in total. \n";
435     OS << "           Number of functions being modified: "
436        << ModifiedFuncCount.load() << "\n";
437     OS << "           User asks for detailed information on top "
438        << opts::PrintFuncStat << " functions. (Ranked by function score)"
439        << "\n\n";
440     uint64_t I = 0;
441     for (std::map<uint64_t, BinaryFunction &>::reverse_iterator Rit =
442              ScoreMap.rbegin();
443          Rit != ScoreMap.rend() && I < opts::PrintFuncStat; ++Rit, ++I) {
444       BinaryFunction &Function = Rit->second;
445 
446       OS << "           Information for function of top: " << (I + 1) << ": \n";
447       OS << "             Function Score is: " << Function.getFunctionScore()
448          << "\n";
449       OS << "             There are " << Function.size()
450          << " number of blocks in this function.\n";
451       OS << "             There are " << Function.getInstructionCount()
452          << " number of instructions in this function.\n";
453       OS << "             The edit distance for this function is: "
454          << Function.getEditDistance() << "\n\n";
455     }
456   }
457 }
458 
459 void ReorderBasicBlocks::modifyFunctionLayout(BinaryFunction &BF,
460                                               LayoutType Type,
461                                               bool MinBranchClusters) const {
462   if (BF.size() == 0 || Type == LT_NONE)
463     return;
464 
465   BinaryFunction::BasicBlockOrderType NewLayout;
466   std::unique_ptr<ReorderAlgorithm> Algo;
467 
468   // Cannot do optimal layout without profile.
469   if (Type != LT_REVERSE && !BF.hasValidProfile())
470     return;
471 
472   if (Type == LT_REVERSE) {
473     Algo.reset(new ReverseReorderAlgorithm());
474   } else if (BF.size() <= opts::TSPThreshold && Type != LT_OPTIMIZE_SHUFFLE) {
475     // Work on optimal solution if problem is small enough
476     LLVM_DEBUG(dbgs() << "finding optimal block layout for " << BF << "\n");
477     Algo.reset(new TSPReorderAlgorithm());
478   } else {
479     LLVM_DEBUG(dbgs() << "running block layout heuristics on " << BF << "\n");
480 
481     std::unique_ptr<ClusterAlgorithm> CAlgo;
482     if (MinBranchClusters)
483       CAlgo.reset(new MinBranchGreedyClusterAlgorithm());
484     else
485       CAlgo.reset(new PHGreedyClusterAlgorithm());
486 
487     switch (Type) {
488     case LT_OPTIMIZE:
489       Algo.reset(new OptimizeReorderAlgorithm(std::move(CAlgo)));
490       break;
491 
492     case LT_OPTIMIZE_BRANCH:
493       Algo.reset(new OptimizeBranchReorderAlgorithm(std::move(CAlgo)));
494       break;
495 
496     case LT_OPTIMIZE_CACHE:
497       Algo.reset(new OptimizeCacheReorderAlgorithm(std::move(CAlgo)));
498       break;
499 
500     case LT_OPTIMIZE_EXT_TSP:
501       Algo.reset(new ExtTSPReorderAlgorithm());
502       break;
503 
504     case LT_OPTIMIZE_SHUFFLE:
505       Algo.reset(new RandomClusterReorderAlgorithm(std::move(CAlgo)));
506       break;
507 
508     default:
509       llvm_unreachable("unexpected layout type");
510     }
511   }
512 
513   Algo->reorderBasicBlocks(BF, NewLayout);
514 
515   BF.updateBasicBlockLayout(NewLayout);
516 }
517 
518 void FixupBranches::runOnFunctions(BinaryContext &BC) {
519   for (auto &It : BC.getBinaryFunctions()) {
520     BinaryFunction &Function = It.second;
521     if (!BC.shouldEmit(Function) || !Function.isSimple())
522       continue;
523 
524     Function.fixBranches();
525   }
526 }
527 
528 void FinalizeFunctions::runOnFunctions(BinaryContext &BC) {
529   ParallelUtilities::WorkFuncTy WorkFun = [&](BinaryFunction &BF) {
530     if (!BF.finalizeCFIState()) {
531       if (BC.HasRelocations) {
532         errs() << "BOLT-ERROR: unable to fix CFI state for function " << BF
533                << ". Exiting.\n";
534         exit(1);
535       }
536       BF.setSimple(false);
537       return;
538     }
539 
540     BF.setFinalized();
541 
542     // Update exception handling information.
543     BF.updateEHRanges();
544   };
545 
546   ParallelUtilities::PredicateTy SkipPredicate = [&](const BinaryFunction &BF) {
547     return !BC.shouldEmit(BF);
548   };
549 
550   ParallelUtilities::runOnEachFunction(
551       BC, ParallelUtilities::SchedulingPolicy::SP_CONSTANT, WorkFun,
552       SkipPredicate, "FinalizeFunctions");
553 }
554 
555 void CheckLargeFunctions::runOnFunctions(BinaryContext &BC) {
556   if (BC.HasRelocations)
557     return;
558 
559   if (!opts::UpdateDebugSections)
560     return;
561 
562   // If the function wouldn't fit, mark it as non-simple. Otherwise, we may emit
563   // incorrect debug info.
564   ParallelUtilities::WorkFuncTy WorkFun = [&](BinaryFunction &BF) {
565     uint64_t HotSize, ColdSize;
566     std::tie(HotSize, ColdSize) =
567         BC.calculateEmittedSize(BF, /*FixBranches=*/false);
568     if (HotSize > BF.getMaxSize())
569       BF.setSimple(false);
570   };
571 
572   ParallelUtilities::PredicateTy SkipFunc = [&](const BinaryFunction &BF) {
573     return !shouldOptimize(BF);
574   };
575 
576   ParallelUtilities::runOnEachFunction(
577       BC, ParallelUtilities::SchedulingPolicy::SP_INST_LINEAR, WorkFun,
578       SkipFunc, "CheckLargeFunctions");
579 }
580 
581 bool CheckLargeFunctions::shouldOptimize(const BinaryFunction &BF) const {
582   // Unlike other passes, allow functions in non-CFG state.
583   return BF.isSimple() && !BF.isIgnored();
584 }
585 
586 void LowerAnnotations::runOnFunctions(BinaryContext &BC) {
587   std::vector<std::pair<MCInst *, uint32_t>> PreservedOffsetAnnotations;
588 
589   for (auto &It : BC.getBinaryFunctions()) {
590     BinaryFunction &BF = It.second;
591     int64_t CurrentGnuArgsSize = 0;
592 
593     // Have we crossed hot/cold border for split functions?
594     bool SeenCold = false;
595 
596     for (BinaryBasicBlock *BB : BF.layout()) {
597       if (BB->isCold() && !SeenCold) {
598         SeenCold = true;
599         CurrentGnuArgsSize = 0;
600       }
601 
602       // First convert GnuArgsSize annotations into CFIs. This may change instr
603       // pointers, so do it before recording ptrs for preserved annotations
604       if (BF.usesGnuArgsSize()) {
605         for (auto II = BB->begin(); II != BB->end(); ++II) {
606           if (!BC.MIB->isInvoke(*II))
607             continue;
608           const int64_t NewGnuArgsSize = BC.MIB->getGnuArgsSize(*II);
609           assert(NewGnuArgsSize >= 0 && "expected non-negative GNU_args_size");
610           if (NewGnuArgsSize != CurrentGnuArgsSize) {
611             auto InsertII = BF.addCFIInstruction(
612                 BB, II,
613                 MCCFIInstruction::createGnuArgsSize(nullptr, NewGnuArgsSize));
614             CurrentGnuArgsSize = NewGnuArgsSize;
615             II = std::next(InsertII);
616           }
617         }
618       }
619 
620       // Now record preserved annotations separately and then strip annotations.
621       for (auto II = BB->begin(); II != BB->end(); ++II) {
622         if (BF.requiresAddressTranslation() && BC.MIB->getOffset(*II))
623           PreservedOffsetAnnotations.emplace_back(&(*II),
624                                                   *BC.MIB->getOffset(*II));
625         BC.MIB->stripAnnotations(*II);
626       }
627     }
628   }
629   for (BinaryFunction *BF : BC.getInjectedBinaryFunctions())
630     for (BinaryBasicBlock &BB : *BF)
631       for (MCInst &Instruction : BB)
632         BC.MIB->stripAnnotations(Instruction);
633 
634   // Release all memory taken by annotations
635   BC.MIB->freeAnnotations();
636 
637   // Reinsert preserved annotations we need during code emission.
638   for (const std::pair<MCInst *, uint32_t> &Item : PreservedOffsetAnnotations)
639     BC.MIB->setOffset(*Item.first, Item.second);
640 }
641 
642 namespace {
643 
644 // This peephole fixes jump instructions that jump to another basic
645 // block with a single jump instruction, e.g.
646 //
647 // B0: ...
648 //     jmp  B1   (or jcc B1)
649 //
650 // B1: jmp  B2
651 //
652 // ->
653 //
654 // B0: ...
655 //     jmp  B2   (or jcc B2)
656 //
657 uint64_t fixDoubleJumps(BinaryFunction &Function, bool MarkInvalid) {
658   uint64_t NumDoubleJumps = 0;
659 
660   MCContext *Ctx = Function.getBinaryContext().Ctx.get();
661   MCPlusBuilder *MIB = Function.getBinaryContext().MIB.get();
662   for (BinaryBasicBlock &BB : Function) {
663     auto checkAndPatch = [&](BinaryBasicBlock *Pred, BinaryBasicBlock *Succ,
664                              const MCSymbol *SuccSym) {
665       // Ignore infinite loop jumps or fallthrough tail jumps.
666       if (Pred == Succ || Succ == &BB)
667         return false;
668 
669       if (Succ) {
670         const MCSymbol *TBB = nullptr;
671         const MCSymbol *FBB = nullptr;
672         MCInst *CondBranch = nullptr;
673         MCInst *UncondBranch = nullptr;
674         bool Res = Pred->analyzeBranch(TBB, FBB, CondBranch, UncondBranch);
675         if (!Res) {
676           LLVM_DEBUG(dbgs() << "analyzeBranch failed in peepholes in block:\n";
677                      Pred->dump());
678           return false;
679         }
680         Pred->replaceSuccessor(&BB, Succ);
681 
682         // We must patch up any existing branch instructions to match up
683         // with the new successor.
684         assert((CondBranch || (!CondBranch && Pred->succ_size() == 1)) &&
685                "Predecessor block has inconsistent number of successors");
686         if (CondBranch && MIB->getTargetSymbol(*CondBranch) == BB.getLabel()) {
687           MIB->replaceBranchTarget(*CondBranch, Succ->getLabel(), Ctx);
688         } else if (UncondBranch &&
689                    MIB->getTargetSymbol(*UncondBranch) == BB.getLabel()) {
690           MIB->replaceBranchTarget(*UncondBranch, Succ->getLabel(), Ctx);
691         } else if (!UncondBranch) {
692           assert(Function.getBasicBlockAfter(Pred, false) != Succ &&
693                  "Don't add an explicit jump to a fallthrough block.");
694           Pred->addBranchInstruction(Succ);
695         }
696       } else {
697         // Succ will be null in the tail call case.  In this case we
698         // need to explicitly add a tail call instruction.
699         MCInst *Branch = Pred->getLastNonPseudoInstr();
700         if (Branch && MIB->isUnconditionalBranch(*Branch)) {
701           assert(MIB->getTargetSymbol(*Branch) == BB.getLabel());
702           Pred->removeSuccessor(&BB);
703           Pred->eraseInstruction(Pred->findInstruction(Branch));
704           Pred->addTailCallInstruction(SuccSym);
705         } else {
706           return false;
707         }
708       }
709 
710       ++NumDoubleJumps;
711       LLVM_DEBUG(dbgs() << "Removed double jump in " << Function << " from "
712                         << Pred->getName() << " -> " << BB.getName() << " to "
713                         << Pred->getName() << " -> " << SuccSym->getName()
714                         << (!Succ ? " (tail)\n" : "\n"));
715 
716       return true;
717     };
718 
719     if (BB.getNumNonPseudos() != 1 || BB.isLandingPad())
720       continue;
721 
722     MCInst *Inst = BB.getFirstNonPseudoInstr();
723     const bool IsTailCall = MIB->isTailCall(*Inst);
724 
725     if (!MIB->isUnconditionalBranch(*Inst) && !IsTailCall)
726       continue;
727 
728     // If we operate after SCTC make sure it's not a conditional tail call.
729     if (IsTailCall && MIB->isConditionalBranch(*Inst))
730       continue;
731 
732     const MCSymbol *SuccSym = MIB->getTargetSymbol(*Inst);
733     BinaryBasicBlock *Succ = BB.getSuccessor();
734 
735     if (((!Succ || &BB == Succ) && !IsTailCall) || (IsTailCall && !SuccSym))
736       continue;
737 
738     std::vector<BinaryBasicBlock *> Preds = {BB.pred_begin(), BB.pred_end()};
739 
740     for (BinaryBasicBlock *Pred : Preds) {
741       if (Pred->isLandingPad())
742         continue;
743 
744       if (Pred->getSuccessor() == &BB ||
745           (Pred->getConditionalSuccessor(true) == &BB && !IsTailCall) ||
746           Pred->getConditionalSuccessor(false) == &BB)
747         if (checkAndPatch(Pred, Succ, SuccSym) && MarkInvalid)
748           BB.markValid(BB.pred_size() != 0 || BB.isLandingPad() ||
749                        BB.isEntryPoint());
750     }
751   }
752 
753   return NumDoubleJumps;
754 }
755 } // namespace
756 
757 bool SimplifyConditionalTailCalls::shouldRewriteBranch(
758     const BinaryBasicBlock *PredBB, const MCInst &CondBranch,
759     const BinaryBasicBlock *BB, const bool DirectionFlag) {
760   if (BeenOptimized.count(PredBB))
761     return false;
762 
763   const bool IsForward = BinaryFunction::isForwardBranch(PredBB, BB);
764 
765   if (IsForward)
766     ++NumOrigForwardBranches;
767   else
768     ++NumOrigBackwardBranches;
769 
770   if (opts::SctcMode == opts::SctcAlways)
771     return true;
772 
773   if (opts::SctcMode == opts::SctcPreserveDirection)
774     return IsForward == DirectionFlag;
775 
776   const ErrorOr<std::pair<double, double>> Frequency =
777       PredBB->getBranchStats(BB);
778 
779   // It's ok to rewrite the conditional branch if the new target will be
780   // a backward branch.
781 
782   // If no data available for these branches, then it should be ok to
783   // do the optimization since it will reduce code size.
784   if (Frequency.getError())
785     return true;
786 
787   // TODO: should this use misprediction frequency instead?
788   const bool Result = (IsForward && Frequency.get().first >= 0.5) ||
789                       (!IsForward && Frequency.get().first <= 0.5);
790 
791   return Result == DirectionFlag;
792 }
793 
794 uint64_t SimplifyConditionalTailCalls::fixTailCalls(BinaryFunction &BF) {
795   // Need updated indices to correctly detect branch' direction.
796   BF.updateLayoutIndices();
797   BF.markUnreachableBlocks();
798 
799   MCPlusBuilder *MIB = BF.getBinaryContext().MIB.get();
800   MCContext *Ctx = BF.getBinaryContext().Ctx.get();
801   uint64_t NumLocalCTCCandidates = 0;
802   uint64_t NumLocalCTCs = 0;
803   uint64_t LocalCTCTakenCount = 0;
804   uint64_t LocalCTCExecCount = 0;
805   std::vector<std::pair<BinaryBasicBlock *, const BinaryBasicBlock *>>
806       NeedsUncondBranch;
807 
808   // Will block be deleted by UCE?
809   auto isValid = [](const BinaryBasicBlock *BB) {
810     return (BB->pred_size() != 0 || BB->isLandingPad() || BB->isEntryPoint());
811   };
812 
813   for (BinaryBasicBlock *BB : BF.layout()) {
814     // Locate BB with a single direct tail-call instruction.
815     if (BB->getNumNonPseudos() != 1)
816       continue;
817 
818     MCInst *Instr = BB->getFirstNonPseudoInstr();
819     if (!MIB->isTailCall(*Instr) || MIB->isConditionalBranch(*Instr))
820       continue;
821 
822     const MCSymbol *CalleeSymbol = MIB->getTargetSymbol(*Instr);
823     if (!CalleeSymbol)
824       continue;
825 
826     // Detect direction of the possible conditional tail call.
827     const bool IsForwardCTC = BF.isForwardCall(CalleeSymbol);
828 
829     // Iterate through all predecessors.
830     for (BinaryBasicBlock *PredBB : BB->predecessors()) {
831       BinaryBasicBlock *CondSucc = PredBB->getConditionalSuccessor(true);
832       if (!CondSucc)
833         continue;
834 
835       ++NumLocalCTCCandidates;
836 
837       const MCSymbol *TBB = nullptr;
838       const MCSymbol *FBB = nullptr;
839       MCInst *CondBranch = nullptr;
840       MCInst *UncondBranch = nullptr;
841       bool Result = PredBB->analyzeBranch(TBB, FBB, CondBranch, UncondBranch);
842 
843       // analyzeBranch() can fail due to unusual branch instructions, e.g. jrcxz
844       if (!Result) {
845         LLVM_DEBUG(dbgs() << "analyzeBranch failed in SCTC in block:\n";
846                    PredBB->dump());
847         continue;
848       }
849 
850       assert(Result && "internal error analyzing conditional branch");
851       assert(CondBranch && "conditional branch expected");
852 
853       // It's possible that PredBB is also a successor to BB that may have
854       // been processed by a previous iteration of the SCTC loop, in which
855       // case it may have been marked invalid.  We should skip rewriting in
856       // this case.
857       if (!PredBB->isValid()) {
858         assert(PredBB->isSuccessor(BB) &&
859                "PredBB should be valid if it is not a successor to BB");
860         continue;
861       }
862 
863       // We don't want to reverse direction of the branch in new order
864       // without further profile analysis.
865       const bool DirectionFlag = CondSucc == BB ? IsForwardCTC : !IsForwardCTC;
866       if (!shouldRewriteBranch(PredBB, *CondBranch, BB, DirectionFlag))
867         continue;
868 
869       // Record this block so that we don't try to optimize it twice.
870       BeenOptimized.insert(PredBB);
871 
872       uint64_t Count = 0;
873       if (CondSucc != BB) {
874         // Patch the new target address into the conditional branch.
875         MIB->reverseBranchCondition(*CondBranch, CalleeSymbol, Ctx);
876         // Since we reversed the condition on the branch we need to change
877         // the target for the unconditional branch or add a unconditional
878         // branch to the old target.  This has to be done manually since
879         // fixupBranches is not called after SCTC.
880         NeedsUncondBranch.emplace_back(PredBB, CondSucc);
881         Count = PredBB->getFallthroughBranchInfo().Count;
882       } else {
883         // Change destination of the conditional branch.
884         MIB->replaceBranchTarget(*CondBranch, CalleeSymbol, Ctx);
885         Count = PredBB->getTakenBranchInfo().Count;
886       }
887       const uint64_t CTCTakenFreq =
888           Count == BinaryBasicBlock::COUNT_NO_PROFILE ? 0 : Count;
889 
890       // Annotate it, so "isCall" returns true for this jcc
891       MIB->setConditionalTailCall(*CondBranch);
892       // Add info abount the conditional tail call frequency, otherwise this
893       // info will be lost when we delete the associated BranchInfo entry
894       auto &CTCAnnotation =
895           MIB->getOrCreateAnnotationAs<uint64_t>(*CondBranch, "CTCTakenCount");
896       CTCAnnotation = CTCTakenFreq;
897 
898       // Remove the unused successor which may be eliminated later
899       // if there are no other users.
900       PredBB->removeSuccessor(BB);
901       // Update BB execution count
902       if (CTCTakenFreq && CTCTakenFreq <= BB->getKnownExecutionCount())
903         BB->setExecutionCount(BB->getExecutionCount() - CTCTakenFreq);
904       else if (CTCTakenFreq > BB->getKnownExecutionCount())
905         BB->setExecutionCount(0);
906 
907       ++NumLocalCTCs;
908       LocalCTCTakenCount += CTCTakenFreq;
909       LocalCTCExecCount += PredBB->getKnownExecutionCount();
910     }
911 
912     // Remove the block from CFG if all predecessors were removed.
913     BB->markValid(isValid(BB));
914   }
915 
916   // Add unconditional branches at the end of BBs to new successors
917   // as long as the successor is not a fallthrough.
918   for (auto &Entry : NeedsUncondBranch) {
919     BinaryBasicBlock *PredBB = Entry.first;
920     const BinaryBasicBlock *CondSucc = Entry.second;
921 
922     const MCSymbol *TBB = nullptr;
923     const MCSymbol *FBB = nullptr;
924     MCInst *CondBranch = nullptr;
925     MCInst *UncondBranch = nullptr;
926     PredBB->analyzeBranch(TBB, FBB, CondBranch, UncondBranch);
927 
928     // Find the next valid block.  Invalid blocks will be deleted
929     // so they shouldn't be considered fallthrough targets.
930     const BinaryBasicBlock *NextBlock = BF.getBasicBlockAfter(PredBB, false);
931     while (NextBlock && !isValid(NextBlock))
932       NextBlock = BF.getBasicBlockAfter(NextBlock, false);
933 
934     // Get the unconditional successor to this block.
935     const BinaryBasicBlock *PredSucc = PredBB->getSuccessor();
936     assert(PredSucc && "The other branch should be a tail call");
937 
938     const bool HasFallthrough = (NextBlock && PredSucc == NextBlock);
939 
940     if (UncondBranch) {
941       if (HasFallthrough)
942         PredBB->eraseInstruction(PredBB->findInstruction(UncondBranch));
943       else
944         MIB->replaceBranchTarget(*UncondBranch, CondSucc->getLabel(), Ctx);
945     } else if (!HasFallthrough) {
946       MCInst Branch;
947       MIB->createUncondBranch(Branch, CondSucc->getLabel(), Ctx);
948       PredBB->addInstruction(Branch);
949     }
950   }
951 
952   if (NumLocalCTCs > 0) {
953     NumDoubleJumps += fixDoubleJumps(BF, true);
954     // Clean-up unreachable tail-call blocks.
955     const std::pair<unsigned, uint64_t> Stats = BF.eraseInvalidBBs();
956     DeletedBlocks += Stats.first;
957     DeletedBytes += Stats.second;
958 
959     assert(BF.validateCFG());
960   }
961 
962   LLVM_DEBUG(dbgs() << "BOLT: created " << NumLocalCTCs
963                     << " conditional tail calls from a total of "
964                     << NumLocalCTCCandidates << " candidates in function " << BF
965                     << ". CTCs execution count for this function is "
966                     << LocalCTCExecCount << " and CTC taken count is "
967                     << LocalCTCTakenCount << "\n";);
968 
969   NumTailCallsPatched += NumLocalCTCs;
970   NumCandidateTailCalls += NumLocalCTCCandidates;
971   CTCExecCount += LocalCTCExecCount;
972   CTCTakenCount += LocalCTCTakenCount;
973 
974   return NumLocalCTCs > 0;
975 }
976 
977 void SimplifyConditionalTailCalls::runOnFunctions(BinaryContext &BC) {
978   if (!BC.isX86())
979     return;
980 
981   for (auto &It : BC.getBinaryFunctions()) {
982     BinaryFunction &Function = It.second;
983 
984     if (!shouldOptimize(Function))
985       continue;
986 
987     if (fixTailCalls(Function)) {
988       Modified.insert(&Function);
989       Function.setHasCanonicalCFG(false);
990     }
991   }
992 
993   outs() << "BOLT-INFO: SCTC: patched " << NumTailCallsPatched
994          << " tail calls (" << NumOrigForwardBranches << " forward)"
995          << " tail calls (" << NumOrigBackwardBranches << " backward)"
996          << " from a total of " << NumCandidateTailCalls << " while removing "
997          << NumDoubleJumps << " double jumps"
998          << " and removing " << DeletedBlocks << " basic blocks"
999          << " totalling " << DeletedBytes
1000          << " bytes of code. CTCs total execution count is " << CTCExecCount
1001          << " and the number of times CTCs are taken is " << CTCTakenCount
1002          << ".\n";
1003 }
1004 
1005 uint64_t ShortenInstructions::shortenInstructions(BinaryFunction &Function) {
1006   uint64_t Count = 0;
1007   const BinaryContext &BC = Function.getBinaryContext();
1008   for (BinaryBasicBlock &BB : Function) {
1009     for (MCInst &Inst : BB) {
1010       MCInst OriginalInst;
1011       if (opts::Verbosity > 2)
1012         OriginalInst = Inst;
1013 
1014       if (!BC.MIB->shortenInstruction(Inst))
1015         continue;
1016 
1017       if (opts::Verbosity > 2) {
1018         outs() << "BOLT-INFO: shortening:\nBOLT-INFO:    ";
1019         BC.printInstruction(outs(), OriginalInst, 0, &Function);
1020         outs() << "BOLT-INFO: to:";
1021         BC.printInstruction(outs(), Inst, 0, &Function);
1022       }
1023 
1024       ++Count;
1025     }
1026   }
1027 
1028   return Count;
1029 }
1030 
1031 void ShortenInstructions::runOnFunctions(BinaryContext &BC) {
1032   std::atomic<uint64_t> NumShortened{0};
1033   if (!BC.isX86())
1034     return;
1035 
1036   ParallelUtilities::runOnEachFunction(
1037       BC, ParallelUtilities::SchedulingPolicy::SP_INST_LINEAR,
1038       [&](BinaryFunction &BF) { NumShortened += shortenInstructions(BF); },
1039       nullptr, "ShortenInstructions");
1040 
1041   outs() << "BOLT-INFO: " << NumShortened << " instructions were shortened\n";
1042 }
1043 
1044 void Peepholes::addTailcallTraps(BinaryFunction &Function) {
1045   MCPlusBuilder *MIB = Function.getBinaryContext().MIB.get();
1046   for (BinaryBasicBlock &BB : Function) {
1047     MCInst *Inst = BB.getLastNonPseudoInstr();
1048     if (Inst && MIB->isTailCall(*Inst) && MIB->isIndirectBranch(*Inst)) {
1049       MCInst Trap;
1050       if (MIB->createTrap(Trap)) {
1051         BB.addInstruction(Trap);
1052         ++TailCallTraps;
1053       }
1054     }
1055   }
1056 }
1057 
1058 void Peepholes::removeUselessCondBranches(BinaryFunction &Function) {
1059   for (BinaryBasicBlock &BB : Function) {
1060     if (BB.succ_size() != 2)
1061       continue;
1062 
1063     BinaryBasicBlock *CondBB = BB.getConditionalSuccessor(true);
1064     BinaryBasicBlock *UncondBB = BB.getConditionalSuccessor(false);
1065     if (CondBB != UncondBB)
1066       continue;
1067 
1068     const MCSymbol *TBB = nullptr;
1069     const MCSymbol *FBB = nullptr;
1070     MCInst *CondBranch = nullptr;
1071     MCInst *UncondBranch = nullptr;
1072     bool Result = BB.analyzeBranch(TBB, FBB, CondBranch, UncondBranch);
1073 
1074     // analyzeBranch() can fail due to unusual branch instructions,
1075     // e.g. jrcxz, or jump tables (indirect jump).
1076     if (!Result || !CondBranch)
1077       continue;
1078 
1079     BB.removeDuplicateConditionalSuccessor(CondBranch);
1080     ++NumUselessCondBranches;
1081   }
1082 }
1083 
1084 void Peepholes::runOnFunctions(BinaryContext &BC) {
1085   const char Opts =
1086       std::accumulate(opts::Peepholes.begin(), opts::Peepholes.end(), 0,
1087                       [](const char A, const PeepholeOpts B) { return A | B; });
1088   if (Opts == PEEP_NONE)
1089     return;
1090 
1091   for (auto &It : BC.getBinaryFunctions()) {
1092     BinaryFunction &Function = It.second;
1093     if (shouldOptimize(Function)) {
1094       if (Opts & PEEP_DOUBLE_JUMPS)
1095         NumDoubleJumps += fixDoubleJumps(Function, false);
1096       if (Opts & PEEP_TAILCALL_TRAPS)
1097         addTailcallTraps(Function);
1098       if (Opts & PEEP_USELESS_BRANCHES)
1099         removeUselessCondBranches(Function);
1100       assert(Function.validateCFG());
1101     }
1102   }
1103   outs() << "BOLT-INFO: Peephole: " << NumDoubleJumps
1104          << " double jumps patched.\n"
1105          << "BOLT-INFO: Peephole: " << TailCallTraps
1106          << " tail call traps inserted.\n"
1107          << "BOLT-INFO: Peephole: " << NumUselessCondBranches
1108          << " useless conditional branches removed.\n";
1109 }
1110 
1111 bool SimplifyRODataLoads::simplifyRODataLoads(BinaryFunction &BF) {
1112   BinaryContext &BC = BF.getBinaryContext();
1113   MCPlusBuilder *MIB = BC.MIB.get();
1114 
1115   uint64_t NumLocalLoadsSimplified = 0;
1116   uint64_t NumDynamicLocalLoadsSimplified = 0;
1117   uint64_t NumLocalLoadsFound = 0;
1118   uint64_t NumDynamicLocalLoadsFound = 0;
1119 
1120   for (BinaryBasicBlock *BB : BF.layout()) {
1121     for (MCInst &Inst : *BB) {
1122       unsigned Opcode = Inst.getOpcode();
1123       const MCInstrDesc &Desc = BC.MII->get(Opcode);
1124 
1125       // Skip instructions that do not load from memory.
1126       if (!Desc.mayLoad())
1127         continue;
1128 
1129       // Try to statically evaluate the target memory address;
1130       uint64_t TargetAddress;
1131 
1132       if (MIB->hasPCRelOperand(Inst)) {
1133         // Try to find the symbol that corresponds to the PC-relative operand.
1134         MCOperand *DispOpI = MIB->getMemOperandDisp(Inst);
1135         assert(DispOpI != Inst.end() && "expected PC-relative displacement");
1136         assert(DispOpI->isExpr() &&
1137                "found PC-relative with non-symbolic displacement");
1138 
1139         // Get displacement symbol.
1140         const MCSymbol *DisplSymbol;
1141         uint64_t DisplOffset;
1142 
1143         std::tie(DisplSymbol, DisplOffset) =
1144             MIB->getTargetSymbolInfo(DispOpI->getExpr());
1145 
1146         if (!DisplSymbol)
1147           continue;
1148 
1149         // Look up the symbol address in the global symbols map of the binary
1150         // context object.
1151         BinaryData *BD = BC.getBinaryDataByName(DisplSymbol->getName());
1152         if (!BD)
1153           continue;
1154         TargetAddress = BD->getAddress() + DisplOffset;
1155       } else if (!MIB->evaluateMemOperandTarget(Inst, TargetAddress)) {
1156         continue;
1157       }
1158 
1159       // Get the contents of the section containing the target address of the
1160       // memory operand. We are only interested in read-only sections.
1161       ErrorOr<BinarySection &> DataSection =
1162           BC.getSectionForAddress(TargetAddress);
1163       if (!DataSection || !DataSection->isReadOnly())
1164         continue;
1165 
1166       if (BC.getRelocationAt(TargetAddress) ||
1167           BC.getDynamicRelocationAt(TargetAddress))
1168         continue;
1169 
1170       uint32_t Offset = TargetAddress - DataSection->getAddress();
1171       StringRef ConstantData = DataSection->getContents();
1172 
1173       ++NumLocalLoadsFound;
1174       if (BB->hasProfile())
1175         NumDynamicLocalLoadsFound += BB->getExecutionCount();
1176 
1177       if (MIB->replaceMemOperandWithImm(Inst, ConstantData, Offset)) {
1178         ++NumLocalLoadsSimplified;
1179         if (BB->hasProfile())
1180           NumDynamicLocalLoadsSimplified += BB->getExecutionCount();
1181       }
1182     }
1183   }
1184 
1185   NumLoadsFound += NumLocalLoadsFound;
1186   NumDynamicLoadsFound += NumDynamicLocalLoadsFound;
1187   NumLoadsSimplified += NumLocalLoadsSimplified;
1188   NumDynamicLoadsSimplified += NumDynamicLocalLoadsSimplified;
1189 
1190   return NumLocalLoadsSimplified > 0;
1191 }
1192 
1193 void SimplifyRODataLoads::runOnFunctions(BinaryContext &BC) {
1194   for (auto &It : BC.getBinaryFunctions()) {
1195     BinaryFunction &Function = It.second;
1196     if (shouldOptimize(Function) && simplifyRODataLoads(Function))
1197       Modified.insert(&Function);
1198   }
1199 
1200   outs() << "BOLT-INFO: simplified " << NumLoadsSimplified << " out of "
1201          << NumLoadsFound << " loads from a statically computed address.\n"
1202          << "BOLT-INFO: dynamic loads simplified: " << NumDynamicLoadsSimplified
1203          << "\n"
1204          << "BOLT-INFO: dynamic loads found: " << NumDynamicLoadsFound << "\n";
1205 }
1206 
1207 void AssignSections::runOnFunctions(BinaryContext &BC) {
1208   for (BinaryFunction *Function : BC.getInjectedBinaryFunctions()) {
1209     Function->setCodeSectionName(BC.getInjectedCodeSectionName());
1210     Function->setColdCodeSectionName(BC.getInjectedColdCodeSectionName());
1211   }
1212 
1213   // In non-relocation mode functions have pre-assigned section names.
1214   if (!BC.HasRelocations)
1215     return;
1216 
1217   const bool UseColdSection =
1218       BC.NumProfiledFuncs > 0 ||
1219       opts::ReorderFunctions == ReorderFunctions::RT_USER;
1220   for (auto &BFI : BC.getBinaryFunctions()) {
1221     BinaryFunction &Function = BFI.second;
1222     if (opts::isHotTextMover(Function)) {
1223       Function.setCodeSectionName(BC.getHotTextMoverSectionName());
1224       Function.setColdCodeSectionName(BC.getHotTextMoverSectionName());
1225       continue;
1226     }
1227 
1228     if (!UseColdSection || Function.hasValidIndex() ||
1229         Function.hasValidProfile())
1230       Function.setCodeSectionName(BC.getMainCodeSectionName());
1231     else
1232       Function.setCodeSectionName(BC.getColdCodeSectionName());
1233 
1234     if (Function.isSplit())
1235       Function.setColdCodeSectionName(BC.getColdCodeSectionName());
1236   }
1237 }
1238 
1239 void PrintProfileStats::runOnFunctions(BinaryContext &BC) {
1240   double FlowImbalanceMean = 0.0;
1241   size_t NumBlocksConsidered = 0;
1242   double WorstBias = 0.0;
1243   const BinaryFunction *WorstBiasFunc = nullptr;
1244 
1245   // For each function CFG, we fill an IncomingMap with the sum of the frequency
1246   // of incoming edges for each BB. Likewise for each OutgoingMap and the sum
1247   // of the frequency of outgoing edges.
1248   using FlowMapTy = std::unordered_map<const BinaryBasicBlock *, uint64_t>;
1249   std::unordered_map<const BinaryFunction *, FlowMapTy> TotalIncomingMaps;
1250   std::unordered_map<const BinaryFunction *, FlowMapTy> TotalOutgoingMaps;
1251 
1252   // Compute mean
1253   for (const auto &BFI : BC.getBinaryFunctions()) {
1254     const BinaryFunction &Function = BFI.second;
1255     if (Function.empty() || !Function.isSimple())
1256       continue;
1257     FlowMapTy &IncomingMap = TotalIncomingMaps[&Function];
1258     FlowMapTy &OutgoingMap = TotalOutgoingMaps[&Function];
1259     for (const BinaryBasicBlock &BB : Function) {
1260       uint64_t TotalOutgoing = 0ULL;
1261       auto SuccBIIter = BB.branch_info_begin();
1262       for (BinaryBasicBlock *Succ : BB.successors()) {
1263         uint64_t Count = SuccBIIter->Count;
1264         if (Count == BinaryBasicBlock::COUNT_NO_PROFILE || Count == 0) {
1265           ++SuccBIIter;
1266           continue;
1267         }
1268         TotalOutgoing += Count;
1269         IncomingMap[Succ] += Count;
1270         ++SuccBIIter;
1271       }
1272       OutgoingMap[&BB] = TotalOutgoing;
1273     }
1274 
1275     size_t NumBlocks = 0;
1276     double Mean = 0.0;
1277     for (const BinaryBasicBlock &BB : Function) {
1278       // Do not compute score for low frequency blocks, entry or exit blocks
1279       if (IncomingMap[&BB] < 100 || OutgoingMap[&BB] == 0 || BB.isEntryPoint())
1280         continue;
1281       ++NumBlocks;
1282       const double Difference = (double)OutgoingMap[&BB] - IncomingMap[&BB];
1283       Mean += fabs(Difference / IncomingMap[&BB]);
1284     }
1285 
1286     FlowImbalanceMean += Mean;
1287     NumBlocksConsidered += NumBlocks;
1288     if (!NumBlocks)
1289       continue;
1290     double FuncMean = Mean / NumBlocks;
1291     if (FuncMean > WorstBias) {
1292       WorstBias = FuncMean;
1293       WorstBiasFunc = &Function;
1294     }
1295   }
1296   if (NumBlocksConsidered > 0)
1297     FlowImbalanceMean /= NumBlocksConsidered;
1298 
1299   // Compute standard deviation
1300   NumBlocksConsidered = 0;
1301   double FlowImbalanceVar = 0.0;
1302   for (const auto &BFI : BC.getBinaryFunctions()) {
1303     const BinaryFunction &Function = BFI.second;
1304     if (Function.empty() || !Function.isSimple())
1305       continue;
1306     FlowMapTy &IncomingMap = TotalIncomingMaps[&Function];
1307     FlowMapTy &OutgoingMap = TotalOutgoingMaps[&Function];
1308     for (const BinaryBasicBlock &BB : Function) {
1309       if (IncomingMap[&BB] < 100 || OutgoingMap[&BB] == 0)
1310         continue;
1311       ++NumBlocksConsidered;
1312       const double Difference = (double)OutgoingMap[&BB] - IncomingMap[&BB];
1313       FlowImbalanceVar +=
1314           pow(fabs(Difference / IncomingMap[&BB]) - FlowImbalanceMean, 2);
1315     }
1316   }
1317   if (NumBlocksConsidered) {
1318     FlowImbalanceVar /= NumBlocksConsidered;
1319     FlowImbalanceVar = sqrt(FlowImbalanceVar);
1320   }
1321 
1322   // Report to user
1323   outs() << format("BOLT-INFO: Profile bias score: %.4lf%% StDev: %.4lf%%\n",
1324                    (100.0 * FlowImbalanceMean), (100.0 * FlowImbalanceVar));
1325   if (WorstBiasFunc && opts::Verbosity >= 1) {
1326     outs() << "Worst average bias observed in " << WorstBiasFunc->getPrintName()
1327            << "\n";
1328     LLVM_DEBUG(WorstBiasFunc->dump());
1329   }
1330 }
1331 
1332 void PrintProgramStats::runOnFunctions(BinaryContext &BC) {
1333   uint64_t NumRegularFunctions = 0;
1334   uint64_t NumStaleProfileFunctions = 0;
1335   uint64_t NumNonSimpleProfiledFunctions = 0;
1336   uint64_t NumUnknownControlFlowFunctions = 0;
1337   uint64_t TotalSampleCount = 0;
1338   uint64_t StaleSampleCount = 0;
1339   std::vector<const BinaryFunction *> ProfiledFunctions;
1340   const char *StaleFuncsHeader = "BOLT-INFO: Functions with stale profile:\n";
1341   for (auto &BFI : BC.getBinaryFunctions()) {
1342     const BinaryFunction &Function = BFI.second;
1343 
1344     // Ignore PLT functions for stats.
1345     if (Function.isPLTFunction())
1346       continue;
1347 
1348     ++NumRegularFunctions;
1349 
1350     if (!Function.isSimple()) {
1351       if (Function.hasProfile())
1352         ++NumNonSimpleProfiledFunctions;
1353       continue;
1354     }
1355 
1356     if (Function.hasUnknownControlFlow()) {
1357       if (opts::PrintUnknownCFG)
1358         Function.dump();
1359       else if (opts::PrintUnknown)
1360         errs() << "function with unknown control flow: " << Function << '\n';
1361 
1362       ++NumUnknownControlFlowFunctions;
1363     }
1364 
1365     if (!Function.hasProfile())
1366       continue;
1367 
1368     uint64_t SampleCount = Function.getRawBranchCount();
1369     TotalSampleCount += SampleCount;
1370 
1371     if (Function.hasValidProfile()) {
1372       ProfiledFunctions.push_back(&Function);
1373     } else {
1374       if (opts::ReportStaleFuncs) {
1375         outs() << StaleFuncsHeader;
1376         StaleFuncsHeader = "";
1377         outs() << "  " << Function << '\n';
1378       }
1379       ++NumStaleProfileFunctions;
1380       StaleSampleCount += SampleCount;
1381     }
1382   }
1383   BC.NumProfiledFuncs = ProfiledFunctions.size();
1384 
1385   const size_t NumAllProfiledFunctions =
1386       ProfiledFunctions.size() + NumStaleProfileFunctions;
1387   outs() << "BOLT-INFO: " << NumAllProfiledFunctions << " out of "
1388          << NumRegularFunctions << " functions in the binary ("
1389          << format("%.1f", NumAllProfiledFunctions /
1390                                (float)NumRegularFunctions * 100.0f)
1391          << "%) have non-empty execution profile\n";
1392   if (NumNonSimpleProfiledFunctions) {
1393     outs() << "BOLT-INFO: " << NumNonSimpleProfiledFunctions << " function"
1394            << (NumNonSimpleProfiledFunctions == 1 ? "" : "s")
1395            << " with profile could not be optimized\n";
1396   }
1397   if (NumStaleProfileFunctions) {
1398     const float PctStale =
1399         NumStaleProfileFunctions / (float)NumAllProfiledFunctions * 100.0f;
1400     auto printErrorOrWarning = [&]() {
1401       if (PctStale > opts::StaleThreshold)
1402         errs() << "BOLT-ERROR: ";
1403       else
1404         errs() << "BOLT-WARNING: ";
1405     };
1406     printErrorOrWarning();
1407     errs() << NumStaleProfileFunctions
1408            << format(" (%.1f%% of all profiled)", PctStale) << " function"
1409            << (NumStaleProfileFunctions == 1 ? "" : "s")
1410            << " have invalid (possibly stale) profile."
1411               " Use -report-stale to see the list.\n";
1412     if (TotalSampleCount > 0) {
1413       printErrorOrWarning();
1414       errs() << StaleSampleCount << " out of " << TotalSampleCount
1415              << " samples in the binary ("
1416              << format("%.1f", ((100.0f * StaleSampleCount) / TotalSampleCount))
1417              << "%) belong to functions with invalid"
1418                 " (possibly stale) profile.\n";
1419     }
1420     if (PctStale > opts::StaleThreshold) {
1421       errs() << "BOLT-ERROR: stale functions exceed specified threshold of "
1422              << opts::StaleThreshold << "%. Exiting.\n";
1423       exit(1);
1424     }
1425   }
1426 
1427   if (const uint64_t NumUnusedObjects = BC.getNumUnusedProfiledObjects()) {
1428     outs() << "BOLT-INFO: profile for " << NumUnusedObjects
1429            << " objects was ignored\n";
1430   }
1431 
1432   if (ProfiledFunctions.size() > 10) {
1433     if (opts::Verbosity >= 1) {
1434       outs() << "BOLT-INFO: top called functions are:\n";
1435       std::sort(ProfiledFunctions.begin(), ProfiledFunctions.end(),
1436                 [](const BinaryFunction *A, const BinaryFunction *B) {
1437                   return B->getExecutionCount() < A->getExecutionCount();
1438                 });
1439       auto SFI = ProfiledFunctions.begin();
1440       auto SFIend = ProfiledFunctions.end();
1441       for (unsigned I = 0u; I < opts::TopCalledLimit && SFI != SFIend;
1442            ++SFI, ++I)
1443         outs() << "  " << **SFI << " : " << (*SFI)->getExecutionCount() << '\n';
1444     }
1445   }
1446 
1447   if (!opts::PrintSortedBy.empty() &&
1448       std::find(opts::PrintSortedBy.begin(), opts::PrintSortedBy.end(),
1449                 DynoStats::FIRST_DYNO_STAT) == opts::PrintSortedBy.end()) {
1450 
1451     std::vector<const BinaryFunction *> Functions;
1452     std::map<const BinaryFunction *, DynoStats> Stats;
1453 
1454     for (const auto &BFI : BC.getBinaryFunctions()) {
1455       const BinaryFunction &BF = BFI.second;
1456       if (shouldOptimize(BF) && BF.hasValidProfile()) {
1457         Functions.push_back(&BF);
1458         Stats.emplace(&BF, getDynoStats(BF));
1459       }
1460     }
1461 
1462     const bool SortAll =
1463         std::find(opts::PrintSortedBy.begin(), opts::PrintSortedBy.end(),
1464                   DynoStats::LAST_DYNO_STAT) != opts::PrintSortedBy.end();
1465 
1466     const bool Ascending =
1467         opts::DynoStatsSortOrderOpt == opts::DynoStatsSortOrder::Ascending;
1468 
1469     if (SortAll) {
1470       std::stable_sort(Functions.begin(), Functions.end(),
1471                        [Ascending, &Stats](const BinaryFunction *A,
1472                                            const BinaryFunction *B) {
1473                          return Ascending ? Stats.at(A) < Stats.at(B)
1474                                           : Stats.at(B) < Stats.at(A);
1475                        });
1476     } else {
1477       std::stable_sort(
1478           Functions.begin(), Functions.end(),
1479           [Ascending, &Stats](const BinaryFunction *A,
1480                               const BinaryFunction *B) {
1481             const DynoStats &StatsA = Stats.at(A);
1482             const DynoStats &StatsB = Stats.at(B);
1483             return Ascending ? StatsA.lessThan(StatsB, opts::PrintSortedBy)
1484                              : StatsB.lessThan(StatsA, opts::PrintSortedBy);
1485           });
1486     }
1487 
1488     outs() << "BOLT-INFO: top functions sorted by ";
1489     if (SortAll) {
1490       outs() << "dyno stats";
1491     } else {
1492       outs() << "(";
1493       bool PrintComma = false;
1494       for (const DynoStats::Category Category : opts::PrintSortedBy) {
1495         if (PrintComma)
1496           outs() << ", ";
1497         outs() << DynoStats::Description(Category);
1498         PrintComma = true;
1499       }
1500       outs() << ")";
1501     }
1502 
1503     outs() << " are:\n";
1504     auto SFI = Functions.begin();
1505     for (unsigned I = 0; I < 100 && SFI != Functions.end(); ++SFI, ++I) {
1506       const DynoStats Stats = getDynoStats(**SFI);
1507       outs() << "  " << **SFI;
1508       if (!SortAll) {
1509         outs() << " (";
1510         bool PrintComma = false;
1511         for (const DynoStats::Category Category : opts::PrintSortedBy) {
1512           if (PrintComma)
1513             outs() << ", ";
1514           outs() << dynoStatsOptName(Category) << "=" << Stats[Category];
1515           PrintComma = true;
1516         }
1517         outs() << ")";
1518       }
1519       outs() << "\n";
1520     }
1521   }
1522 
1523   if (!BC.TrappedFunctions.empty()) {
1524     errs() << "BOLT-WARNING: " << BC.TrappedFunctions.size() << " function"
1525            << (BC.TrappedFunctions.size() > 1 ? "s" : "")
1526            << " will trap on entry. Use -trap-avx512=0 to disable"
1527               " traps.";
1528     if (opts::Verbosity >= 1 || BC.TrappedFunctions.size() <= 5) {
1529       errs() << '\n';
1530       for (const BinaryFunction *Function : BC.TrappedFunctions)
1531         errs() << "  " << *Function << '\n';
1532     } else {
1533       errs() << " Use -v=1 to see the list.\n";
1534     }
1535   }
1536 
1537   // Print information on missed macro-fusion opportunities seen on input.
1538   if (BC.MissedMacroFusionPairs) {
1539     outs() << "BOLT-INFO: the input contains " << BC.MissedMacroFusionPairs
1540            << " (dynamic count : " << BC.MissedMacroFusionExecCount
1541            << ") opportunities for macro-fusion optimization";
1542     switch (opts::AlignMacroOpFusion) {
1543     case MFT_NONE:
1544       outs() << ". Use -align-macro-fusion to fix.\n";
1545       break;
1546     case MFT_HOT:
1547       outs() << ". Will fix instances on a hot path.\n";
1548       break;
1549     case MFT_ALL:
1550       outs() << " that are going to be fixed\n";
1551       break;
1552     }
1553   }
1554 
1555   // Collect and print information about suboptimal code layout on input.
1556   if (opts::ReportBadLayout) {
1557     std::vector<const BinaryFunction *> SuboptimalFuncs;
1558     for (auto &BFI : BC.getBinaryFunctions()) {
1559       const BinaryFunction &BF = BFI.second;
1560       if (!BF.hasValidProfile())
1561         continue;
1562 
1563       const uint64_t HotThreshold =
1564           std::max<uint64_t>(BF.getKnownExecutionCount(), 1);
1565       bool HotSeen = false;
1566       for (const BinaryBasicBlock *BB : BF.rlayout()) {
1567         if (!HotSeen && BB->getKnownExecutionCount() > HotThreshold) {
1568           HotSeen = true;
1569           continue;
1570         }
1571         if (HotSeen && BB->getKnownExecutionCount() == 0) {
1572           SuboptimalFuncs.push_back(&BF);
1573           break;
1574         }
1575       }
1576     }
1577 
1578     if (!SuboptimalFuncs.empty()) {
1579       std::sort(SuboptimalFuncs.begin(), SuboptimalFuncs.end(),
1580                 [](const BinaryFunction *A, const BinaryFunction *B) {
1581                   return A->getKnownExecutionCount() / A->getSize() >
1582                          B->getKnownExecutionCount() / B->getSize();
1583                 });
1584 
1585       outs() << "BOLT-INFO: " << SuboptimalFuncs.size()
1586              << " functions have "
1587                 "cold code in the middle of hot code. Top functions are:\n";
1588       for (unsigned I = 0;
1589            I < std::min(static_cast<size_t>(opts::ReportBadLayout),
1590                         SuboptimalFuncs.size());
1591            ++I)
1592         SuboptimalFuncs[I]->print(outs());
1593     }
1594   }
1595 
1596   if (NumUnknownControlFlowFunctions) {
1597     outs() << "BOLT-INFO: " << NumUnknownControlFlowFunctions
1598            << " functions have instructions with unknown control flow";
1599     if (!opts::PrintUnknown)
1600       outs() << ". Use -print-unknown to see the list.";
1601     outs() << '\n';
1602   }
1603 }
1604 
1605 void InstructionLowering::runOnFunctions(BinaryContext &BC) {
1606   for (auto &BFI : BC.getBinaryFunctions())
1607     for (BinaryBasicBlock &BB : BFI.second)
1608       for (MCInst &Instruction : BB)
1609         BC.MIB->lowerTailCall(Instruction);
1610 }
1611 
1612 void StripRepRet::runOnFunctions(BinaryContext &BC) {
1613   uint64_t NumPrefixesRemoved = 0;
1614   uint64_t NumBytesSaved = 0;
1615   for (auto &BFI : BC.getBinaryFunctions()) {
1616     for (BinaryBasicBlock &BB : BFI.second) {
1617       auto LastInstRIter = BB.getLastNonPseudo();
1618       if (LastInstRIter == BB.rend() || !BC.MIB->isReturn(*LastInstRIter) ||
1619           !BC.MIB->deleteREPPrefix(*LastInstRIter))
1620         continue;
1621 
1622       NumPrefixesRemoved += BB.getKnownExecutionCount();
1623       ++NumBytesSaved;
1624     }
1625   }
1626 
1627   if (NumBytesSaved)
1628     outs() << "BOLT-INFO: removed " << NumBytesSaved
1629            << " 'repz' prefixes"
1630               " with estimated execution count of "
1631            << NumPrefixesRemoved << " times.\n";
1632 }
1633 
1634 void InlineMemcpy::runOnFunctions(BinaryContext &BC) {
1635   if (!BC.isX86())
1636     return;
1637 
1638   uint64_t NumInlined = 0;
1639   uint64_t NumInlinedDyno = 0;
1640   for (auto &BFI : BC.getBinaryFunctions()) {
1641     for (BinaryBasicBlock &BB : BFI.second) {
1642       for (auto II = BB.begin(); II != BB.end(); ++II) {
1643         MCInst &Inst = *II;
1644 
1645         if (!BC.MIB->isCall(Inst) || MCPlus::getNumPrimeOperands(Inst) != 1 ||
1646             !Inst.getOperand(0).isExpr())
1647           continue;
1648 
1649         const MCSymbol *CalleeSymbol = BC.MIB->getTargetSymbol(Inst);
1650         if (CalleeSymbol->getName() != "memcpy" &&
1651             CalleeSymbol->getName() != "memcpy@PLT" &&
1652             CalleeSymbol->getName() != "_memcpy8")
1653           continue;
1654 
1655         const bool IsMemcpy8 = (CalleeSymbol->getName() == "_memcpy8");
1656         const bool IsTailCall = BC.MIB->isTailCall(Inst);
1657 
1658         const InstructionListType NewCode =
1659             BC.MIB->createInlineMemcpy(IsMemcpy8);
1660         II = BB.replaceInstruction(II, NewCode);
1661         std::advance(II, NewCode.size() - 1);
1662         if (IsTailCall) {
1663           MCInst Return;
1664           BC.MIB->createReturn(Return);
1665           II = BB.insertInstruction(std::next(II), std::move(Return));
1666         }
1667 
1668         ++NumInlined;
1669         NumInlinedDyno += BB.getKnownExecutionCount();
1670       }
1671     }
1672   }
1673 
1674   if (NumInlined) {
1675     outs() << "BOLT-INFO: inlined " << NumInlined << " memcpy() calls";
1676     if (NumInlinedDyno)
1677       outs() << ". The calls were executed " << NumInlinedDyno
1678              << " times based on profile.";
1679     outs() << '\n';
1680   }
1681 }
1682 
1683 bool SpecializeMemcpy1::shouldOptimize(const BinaryFunction &Function) const {
1684   if (!BinaryFunctionPass::shouldOptimize(Function))
1685     return false;
1686 
1687   for (const std::string &FunctionSpec : Spec) {
1688     StringRef FunctionName = StringRef(FunctionSpec).split(':').first;
1689     if (Function.hasNameRegex(FunctionName))
1690       return true;
1691   }
1692 
1693   return false;
1694 }
1695 
1696 std::set<size_t> SpecializeMemcpy1::getCallSitesToOptimize(
1697     const BinaryFunction &Function) const {
1698   StringRef SitesString;
1699   for (const std::string &FunctionSpec : Spec) {
1700     StringRef FunctionName;
1701     std::tie(FunctionName, SitesString) = StringRef(FunctionSpec).split(':');
1702     if (Function.hasNameRegex(FunctionName))
1703       break;
1704     SitesString = "";
1705   }
1706 
1707   std::set<size_t> Sites;
1708   SmallVector<StringRef, 4> SitesVec;
1709   SitesString.split(SitesVec, ':');
1710   for (StringRef SiteString : SitesVec) {
1711     if (SiteString.empty())
1712       continue;
1713     size_t Result;
1714     if (!SiteString.getAsInteger(10, Result))
1715       Sites.emplace(Result);
1716   }
1717 
1718   return Sites;
1719 }
1720 
1721 void SpecializeMemcpy1::runOnFunctions(BinaryContext &BC) {
1722   if (!BC.isX86())
1723     return;
1724 
1725   uint64_t NumSpecialized = 0;
1726   uint64_t NumSpecializedDyno = 0;
1727   for (auto &BFI : BC.getBinaryFunctions()) {
1728     BinaryFunction &Function = BFI.second;
1729     if (!shouldOptimize(Function))
1730       continue;
1731 
1732     std::set<size_t> CallsToOptimize = getCallSitesToOptimize(Function);
1733     auto shouldOptimize = [&](size_t N) {
1734       return CallsToOptimize.empty() || CallsToOptimize.count(N);
1735     };
1736 
1737     std::vector<BinaryBasicBlock *> Blocks(Function.pbegin(), Function.pend());
1738     size_t CallSiteID = 0;
1739     for (BinaryBasicBlock *CurBB : Blocks) {
1740       for (auto II = CurBB->begin(); II != CurBB->end(); ++II) {
1741         MCInst &Inst = *II;
1742 
1743         if (!BC.MIB->isCall(Inst) || MCPlus::getNumPrimeOperands(Inst) != 1 ||
1744             !Inst.getOperand(0).isExpr())
1745           continue;
1746 
1747         const MCSymbol *CalleeSymbol = BC.MIB->getTargetSymbol(Inst);
1748         if (CalleeSymbol->getName() != "memcpy" &&
1749             CalleeSymbol->getName() != "memcpy@PLT")
1750           continue;
1751 
1752         if (BC.MIB->isTailCall(Inst))
1753           continue;
1754 
1755         ++CallSiteID;
1756 
1757         if (!shouldOptimize(CallSiteID))
1758           continue;
1759 
1760         // Create a copy of a call to memcpy(dest, src, size).
1761         MCInst MemcpyInstr = Inst;
1762 
1763         BinaryBasicBlock *OneByteMemcpyBB = CurBB->splitAt(II);
1764 
1765         BinaryBasicBlock *NextBB = nullptr;
1766         if (OneByteMemcpyBB->getNumNonPseudos() > 1) {
1767           NextBB = OneByteMemcpyBB->splitAt(OneByteMemcpyBB->begin());
1768           NextBB->eraseInstruction(NextBB->begin());
1769         } else {
1770           NextBB = OneByteMemcpyBB->getSuccessor();
1771           OneByteMemcpyBB->eraseInstruction(OneByteMemcpyBB->begin());
1772           assert(NextBB && "unexpected call to memcpy() with no return");
1773         }
1774 
1775         BinaryBasicBlock *MemcpyBB =
1776             Function.addBasicBlock(CurBB->getInputOffset());
1777         InstructionListType CmpJCC =
1778             BC.MIB->createCmpJE(BC.MIB->getIntArgRegister(2), 1,
1779                                 OneByteMemcpyBB->getLabel(), BC.Ctx.get());
1780         CurBB->addInstructions(CmpJCC);
1781         CurBB->addSuccessor(MemcpyBB);
1782 
1783         MemcpyBB->addInstruction(std::move(MemcpyInstr));
1784         MemcpyBB->addSuccessor(NextBB);
1785         MemcpyBB->setCFIState(NextBB->getCFIState());
1786         MemcpyBB->setExecutionCount(0);
1787 
1788         // To prevent the actual call from being moved to cold, we set its
1789         // execution count to 1.
1790         if (CurBB->getKnownExecutionCount() > 0)
1791           MemcpyBB->setExecutionCount(1);
1792 
1793         InstructionListType OneByteMemcpy = BC.MIB->createOneByteMemcpy();
1794         OneByteMemcpyBB->addInstructions(OneByteMemcpy);
1795 
1796         ++NumSpecialized;
1797         NumSpecializedDyno += CurBB->getKnownExecutionCount();
1798 
1799         CurBB = NextBB;
1800 
1801         // Note: we don't expect the next instruction to be a call to memcpy.
1802         II = CurBB->begin();
1803       }
1804     }
1805   }
1806 
1807   if (NumSpecialized) {
1808     outs() << "BOLT-INFO: specialized " << NumSpecialized
1809            << " memcpy() call sites for size 1";
1810     if (NumSpecializedDyno)
1811       outs() << ". The calls were executed " << NumSpecializedDyno
1812              << " times based on profile.";
1813     outs() << '\n';
1814   }
1815 }
1816 
1817 void RemoveNops::runOnFunction(BinaryFunction &BF) {
1818   const BinaryContext &BC = BF.getBinaryContext();
1819   for (BinaryBasicBlock &BB : BF) {
1820     for (int64_t I = BB.size() - 1; I >= 0; --I) {
1821       MCInst &Inst = BB.getInstructionAtIndex(I);
1822       if (BC.MIB->isNoop(Inst) && BC.MIB->hasAnnotation(Inst, "NOP"))
1823         BB.eraseInstructionAtIndex(I);
1824     }
1825   }
1826 }
1827 
1828 void RemoveNops::runOnFunctions(BinaryContext &BC) {
1829   ParallelUtilities::WorkFuncTy WorkFun = [&](BinaryFunction &BF) {
1830     runOnFunction(BF);
1831   };
1832 
1833   ParallelUtilities::PredicateTy SkipFunc = [&](const BinaryFunction &BF) {
1834     return BF.shouldPreserveNops();
1835   };
1836 
1837   ParallelUtilities::runOnEachFunction(
1838       BC, ParallelUtilities::SchedulingPolicy::SP_INST_LINEAR, WorkFun,
1839       SkipFunc, "RemoveNops");
1840 }
1841 
1842 } // namespace bolt
1843 } // namespace llvm
1844