1 //===- bolt/Passes/BinaryPasses.cpp - Binary-level passes -----------------===//
2 //
3 // Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
4 // See https://llvm.org/LICENSE.txt for license information.
5 // SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
6 //
7 //===----------------------------------------------------------------------===//
8 //
9 // This file implements multiple passes for binary optimization and analysis.
10 //
11 //===----------------------------------------------------------------------===//
12 
13 #include "bolt/Passes/BinaryPasses.h"
14 #include "bolt/Core/ParallelUtilities.h"
15 #include "bolt/Passes/ReorderAlgorithm.h"
16 #include "bolt/Passes/ReorderFunctions.h"
17 #include "llvm/Support/CommandLine.h"
18 
19 #include <numeric>
20 #include <vector>
21 
22 #define DEBUG_TYPE "bolt-opts"
23 
24 using namespace llvm;
25 using namespace bolt;
26 
27 namespace {
28 
dynoStatsOptName(const bolt::DynoStats::Category C)29 const char *dynoStatsOptName(const bolt::DynoStats::Category C) {
30   if (C == bolt::DynoStats::FIRST_DYNO_STAT)
31     return "none";
32   else if (C == bolt::DynoStats::LAST_DYNO_STAT)
33     return "all";
34 
35   static std::string OptNames[bolt::DynoStats::LAST_DYNO_STAT + 1];
36 
37   OptNames[C] = bolt::DynoStats::Description(C);
38 
39   std::replace(OptNames[C].begin(), OptNames[C].end(), ' ', '-');
40 
41   return OptNames[C].c_str();
42 }
43 
dynoStatsOptDesc(const bolt::DynoStats::Category C)44 const char *dynoStatsOptDesc(const bolt::DynoStats::Category C) {
45   if (C == bolt::DynoStats::FIRST_DYNO_STAT)
46     return "unsorted";
47   else if (C == bolt::DynoStats::LAST_DYNO_STAT)
48     return "sorted by all stats";
49 
50   return bolt::DynoStats::Description(C);
51 }
52 
53 }
54 
55 namespace opts {
56 
57 extern cl::OptionCategory BoltCategory;
58 extern cl::OptionCategory BoltOptCategory;
59 
60 extern cl::opt<bolt::MacroFusionType> AlignMacroOpFusion;
61 extern cl::opt<unsigned> Verbosity;
62 extern cl::opt<bool> EnableBAT;
63 extern cl::opt<unsigned> ExecutionCountThreshold;
64 extern cl::opt<bool> UpdateDebugSections;
65 extern cl::opt<bolt::ReorderFunctions::ReorderType> ReorderFunctions;
66 
// Sort direction used by the -print-sorted-by dyno-stats report.
enum DynoStatsSortOrder : char {
  Ascending,
  Descending
};

// Controls whether the dyno-stats report lists functions in ascending or
// descending stat order (default: descending).
static cl::opt<DynoStatsSortOrder> DynoStatsSortOrderOpt(
    "print-sorted-by-order",
    cl::desc("use ascending or descending order when printing functions "
             "ordered by dyno stats"),
    cl::init(DynoStatsSortOrder::Descending), cl::cat(BoltOptCategory));

// Sections holding functions that relocate ("move") hot text; consulted by
// isHotTextMover() to keep such functions off the hot-text huge page.
cl::list<std::string>
HotTextMoveSections("hot-text-move-sections",
  cl::desc("list of sections containing functions used for hugifying hot text. "
           "BOLT makes sure these functions are not placed on the same page as "
           "the hot text. (default=\'.stub,.mover\')."),
  cl::value_desc("sec1,sec2,sec3,..."),
  cl::CommaSeparated,
  cl::ZeroOrMore,
  cl::cat(BoltCategory));
87 
isHotTextMover(const BinaryFunction & Function)88 bool isHotTextMover(const BinaryFunction &Function) {
89   for (std::string &SectionName : opts::HotTextMoveSections) {
90     if (Function.getOriginSectionName() &&
91         *Function.getOriginSectionName() == SectionName)
92       return true;
93   }
94 
95   return false;
96 }
97 
// Clustering variant used by basic-block reordering (see -reorder-blocks).
static cl::opt<bool> MinBranchClusters(
    "min-branch-clusters",
    cl::desc("use a modified clustering algorithm geared towards minimizing "
             "branches"),
    cl::Hidden, cl::cat(BoltOptCategory));

// Individually selectable peephole optimizations; "all" enables every one.
static cl::list<Peepholes::PeepholeOpts> Peepholes(
    "peepholes", cl::CommaSeparated, cl::desc("enable peephole optimizations"),
    cl::value_desc("opt1,opt2,opt3,..."),
    cl::values(clEnumValN(Peepholes::PEEP_NONE, "none", "disable peepholes"),
               clEnumValN(Peepholes::PEEP_DOUBLE_JUMPS, "double-jumps",
                          "remove double jumps when able"),
               clEnumValN(Peepholes::PEEP_TAILCALL_TRAPS, "tailcall-traps",
                          "insert tail call traps"),
               clEnumValN(Peepholes::PEEP_USELESS_BRANCHES, "useless-branches",
                          "remove useless conditional branches"),
               clEnumValN(Peepholes::PEEP_ALL, "all",
                          "enable all peephole optimizations")),
    cl::ZeroOrMore, cl::cat(BoltOptCategory));

// Number of top-scoring functions to describe in detail after block
// reordering (see ReorderBasicBlocks::runOnFunctions).
static cl::opt<unsigned>
    PrintFuncStat("print-function-statistics",
                  cl::desc("print statistics about basic block ordering"),
                  cl::init(0), cl::cat(BoltOptCategory));

// Dyno-stat categories to sort the function report by. The D macro expands
// DYNO_STATS into one clEnumValN entry per category; the final
// clEnumValN(0xffff, ".", ".") entry exists only to terminate the list
// after the trailing comma produced by the macro expansion.
static cl::list<bolt::DynoStats::Category>
PrintSortedBy("print-sorted-by",
  cl::CommaSeparated,
  cl::desc("print functions sorted by order of dyno stats"),
  cl::value_desc("key1,key2,key3,..."),
  cl::values(
#define D(name, ...)                                        \
    clEnumValN(bolt::DynoStats::name,                     \
               dynoStatsOptName(bolt::DynoStats::name),   \
               dynoStatsOptDesc(bolt::DynoStats::name)),
    DYNO_STATS
#undef D
    clEnumValN(0xffff, ".", ".")
    ),
  cl::ZeroOrMore,
  cl::cat(BoltOptCategory));

// Diagnostics for functions whose control flow could not be recovered.
static cl::opt<bool>
    PrintUnknown("print-unknown",
                 cl::desc("print names of functions with unknown control flow"),
                 cl::cat(BoltCategory), cl::Hidden);

static cl::opt<bool>
    PrintUnknownCFG("print-unknown-cfg",
                    cl::desc("dump CFG of functions with unknown control flow"),
                    cl::cat(BoltCategory), cl::ReallyHidden);
149 
// Please MSVC19 with a forward declaration: otherwise it reports an error about
// an undeclared variable inside a callback.
extern cl::opt<bolt::ReorderBasicBlocks::LayoutType> ReorderBlocks;
cl::opt<bolt::ReorderBasicBlocks::LayoutType> ReorderBlocks(
    "reorder-blocks", cl::desc("change layout of basic blocks in a function"),
    cl::init(bolt::ReorderBasicBlocks::LT_NONE),
    cl::values(
        clEnumValN(bolt::ReorderBasicBlocks::LT_NONE, "none",
                   "do not reorder basic blocks"),
        clEnumValN(bolt::ReorderBasicBlocks::LT_REVERSE, "reverse",
                   "layout blocks in reverse order"),
        clEnumValN(bolt::ReorderBasicBlocks::LT_OPTIMIZE, "normal",
                   "perform optimal layout based on profile"),
        clEnumValN(bolt::ReorderBasicBlocks::LT_OPTIMIZE_BRANCH,
                   "branch-predictor",
                   "perform optimal layout prioritizing branch "
                   "predictions"),
        clEnumValN(bolt::ReorderBasicBlocks::LT_OPTIMIZE_CACHE, "cache",
                   "perform optimal layout prioritizing I-cache "
                   "behavior"),
        clEnumValN(bolt::ReorderBasicBlocks::LT_OPTIMIZE_CACHE_PLUS, "cache+",
                   "perform layout optimizing I-cache behavior"),
        clEnumValN(bolt::ReorderBasicBlocks::LT_OPTIMIZE_EXT_TSP, "ext-tsp",
                   "perform layout optimizing I-cache behavior"),
        clEnumValN(bolt::ReorderBasicBlocks::LT_OPTIMIZE_SHUFFLE,
                   "cluster-shuffle", "perform random layout of clusters")),
    cl::ZeroOrMore, cl::cat(BoltOptCategory),
    // Remap the deprecated "cache+" value to "ext-tsp" at parse time,
    // emitting a user-visible warning.
    cl::callback([](const bolt::ReorderBasicBlocks::LayoutType &option) {
      if (option == bolt::ReorderBasicBlocks::LT_OPTIMIZE_CACHE_PLUS) {
        WithColor::warning()
            << "'-reorder-blocks=cache+' is deprecated, "
            << "please use '-reorder-blocks=ext-tsp' instead\n";
        ReorderBlocks = bolt::ReorderBasicBlocks::LT_OPTIMIZE_EXT_TSP;
      }
    }));

// Input-quality diagnostics: report functions whose original layout looks
// suboptimal, and functions whose profile no longer matches the code.
static cl::opt<unsigned> ReportBadLayout(
    "report-bad-layout",
    cl::desc("print top <uint> functions with suboptimal code layout on input"),
    cl::init(0), cl::Hidden, cl::cat(BoltOptCategory));

static cl::opt<bool>
    ReportStaleFuncs("report-stale",
                     cl::desc("print the list of functions with stale profile"),
                     cl::Hidden, cl::cat(BoltOptCategory));

// Policy for the SimplifyConditionalTailCalls (SCTC) pass.
enum SctcModes : char {
  SctcAlways,
  SctcPreserveDirection,
  SctcHeuristic
};

static cl::opt<SctcModes>
SctcMode("sctc-mode",
  cl::desc("mode for simplify conditional tail calls"),
  cl::init(SctcAlways),
  cl::values(clEnumValN(SctcAlways, "always", "always perform sctc"),
    clEnumValN(SctcPreserveDirection,
      "preserve",
      "only perform sctc when branch direction is "
      "preserved"),
    clEnumValN(SctcHeuristic,
      "heuristic",
      "use branch prediction data to control sctc")),
  cl::ZeroOrMore,
  cl::cat(BoltOptCategory));

// Maximum tolerated share of functions with stale profile
// (NOTE(review): presumably enforced by a pass outside this chunk — verify).
static cl::opt<unsigned>
StaleThreshold("stale-threshold",
    cl::desc(
      "maximum percentage of stale functions to tolerate (default: 100)"),
    cl::init(100),
    cl::Hidden,
    cl::cat(BoltOptCategory));

// Functions at or below this many blocks get an exact TSP layout solution
// in ReorderBasicBlocks::modifyFunctionLayout (exponential beyond that).
static cl::opt<unsigned> TSPThreshold(
    "tsp-threshold",
    cl::desc(
        "maximum number of hot basic blocks in a function for which to use "
        "a precise TSP solution while re-ordering basic blocks"),
    cl::init(10), cl::Hidden, cl::cat(BoltOptCategory));

// Cap for the "top called functions" report section.
static cl::opt<unsigned> TopCalledLimit(
    "top-called-limit",
    cl::desc("maximum number of functions to print in top called "
             "functions section"),
    cl::init(100), cl::Hidden, cl::cat(BoltCategory));
237 
238 } // namespace opts
239 
240 namespace llvm {
241 namespace bolt {
242 
shouldOptimize(const BinaryFunction & BF) const243 bool BinaryFunctionPass::shouldOptimize(const BinaryFunction &BF) const {
244   return BF.isSimple() && BF.getState() == BinaryFunction::State::CFG &&
245          !BF.isIgnored();
246 }
247 
shouldPrint(const BinaryFunction & BF) const248 bool BinaryFunctionPass::shouldPrint(const BinaryFunction &BF) const {
249   return BF.isSimple() && !BF.isIgnored();
250 }
251 
runOnFunction(BinaryFunction & BF)252 void NormalizeCFG::runOnFunction(BinaryFunction &BF) {
253   uint64_t NumRemoved = 0;
254   uint64_t NumDuplicateEdges = 0;
255   uint64_t NeedsFixBranches = 0;
256   for (BinaryBasicBlock &BB : BF) {
257     if (!BB.empty())
258       continue;
259 
260     if (BB.isEntryPoint() || BB.isLandingPad())
261       continue;
262 
263     // Handle a dangling empty block.
264     if (BB.succ_size() == 0) {
265       // If an empty dangling basic block has a predecessor, it could be a
266       // result of codegen for __builtin_unreachable. In such case, do not
267       // remove the block.
268       if (BB.pred_size() == 0) {
269         BB.markValid(false);
270         ++NumRemoved;
271       }
272       continue;
273     }
274 
275     // The block should have just one successor.
276     BinaryBasicBlock *Successor = BB.getSuccessor();
277     assert(Successor && "invalid CFG encountered");
278 
279     // Redirect all predecessors to the successor block.
280     while (!BB.pred_empty()) {
281       BinaryBasicBlock *Predecessor = *BB.pred_begin();
282       if (Predecessor->hasJumpTable())
283         break;
284 
285       if (Predecessor == Successor)
286         break;
287 
288       BinaryBasicBlock::BinaryBranchInfo &BI = Predecessor->getBranchInfo(BB);
289       Predecessor->replaceSuccessor(&BB, Successor, BI.Count,
290                                     BI.MispredictedCount);
291       // We need to fix branches even if we failed to replace all successors
292       // and remove the block.
293       NeedsFixBranches = true;
294     }
295 
296     if (BB.pred_empty()) {
297       BB.removeAllSuccessors();
298       BB.markValid(false);
299       ++NumRemoved;
300     }
301   }
302 
303   if (NumRemoved)
304     BF.eraseInvalidBBs();
305 
306   // Check for duplicate successors. Do it after the empty block elimination as
307   // we can get more duplicate successors.
308   for (BinaryBasicBlock &BB : BF)
309     if (!BB.hasJumpTable() && BB.succ_size() == 2 &&
310         BB.getConditionalSuccessor(false) == BB.getConditionalSuccessor(true))
311       ++NumDuplicateEdges;
312 
313   // fixBranches() will get rid of duplicate edges and update jump instructions.
314   if (NumDuplicateEdges || NeedsFixBranches)
315     BF.fixBranches();
316 
317   NumDuplicateEdgesMerged += NumDuplicateEdges;
318   NumBlocksRemoved += NumRemoved;
319 }
320 
runOnFunctions(BinaryContext & BC)321 void NormalizeCFG::runOnFunctions(BinaryContext &BC) {
322   ParallelUtilities::runOnEachFunction(
323       BC, ParallelUtilities::SchedulingPolicy::SP_BB_LINEAR,
324       [&](BinaryFunction &BF) { runOnFunction(BF); },
325       [&](const BinaryFunction &BF) { return !shouldOptimize(BF); },
326       "NormalizeCFG");
327   if (NumBlocksRemoved)
328     outs() << "BOLT-INFO: removed " << NumBlocksRemoved << " empty block"
329            << (NumBlocksRemoved == 1 ? "" : "s") << '\n';
330   if (NumDuplicateEdgesMerged)
331     outs() << "BOLT-INFO: merged " << NumDuplicateEdgesMerged
332            << " duplicate CFG edge" << (NumDuplicateEdgesMerged == 1 ? "" : "s")
333            << '\n';
334 }
335 
runOnFunction(BinaryFunction & Function)336 void EliminateUnreachableBlocks::runOnFunction(BinaryFunction &Function) {
337   if (!Function.getLayout().block_empty()) {
338     unsigned Count;
339     uint64_t Bytes;
340     Function.markUnreachableBlocks();
341     LLVM_DEBUG({
342       for (BinaryBasicBlock &BB : Function) {
343         if (!BB.isValid()) {
344           dbgs() << "BOLT-INFO: UCE found unreachable block " << BB.getName()
345                  << " in function " << Function << "\n";
346           Function.dump();
347         }
348       }
349     });
350     std::tie(Count, Bytes) = Function.eraseInvalidBBs();
351     DeletedBlocks += Count;
352     DeletedBytes += Bytes;
353     if (Count) {
354       Modified.insert(&Function);
355       if (opts::Verbosity > 0)
356         outs() << "BOLT-INFO: Removed " << Count
357                << " dead basic block(s) accounting for " << Bytes
358                << " bytes in function " << Function << '\n';
359     }
360   }
361 }
362 
runOnFunctions(BinaryContext & BC)363 void EliminateUnreachableBlocks::runOnFunctions(BinaryContext &BC) {
364   for (auto &It : BC.getBinaryFunctions()) {
365     BinaryFunction &Function = It.second;
366     if (shouldOptimize(Function))
367       runOnFunction(Function);
368   }
369 
370   outs() << "BOLT-INFO: UCE removed " << DeletedBlocks << " blocks and "
371          << DeletedBytes << " bytes of code.\n";
372 }
373 
shouldPrint(const BinaryFunction & BF) const374 bool ReorderBasicBlocks::shouldPrint(const BinaryFunction &BF) const {
375   return (BinaryFunctionPass::shouldPrint(BF) &&
376           opts::ReorderBlocks != ReorderBasicBlocks::LT_NONE);
377 }
378 
shouldOptimize(const BinaryFunction & BF) const379 bool ReorderBasicBlocks::shouldOptimize(const BinaryFunction &BF) const {
380   // Apply execution count threshold
381   if (BF.getKnownExecutionCount() < opts::ExecutionCountThreshold)
382     return false;
383 
384   return BinaryFunctionPass::shouldOptimize(BF);
385 }
386 
runOnFunctions(BinaryContext & BC)387 void ReorderBasicBlocks::runOnFunctions(BinaryContext &BC) {
388   if (opts::ReorderBlocks == ReorderBasicBlocks::LT_NONE)
389     return;
390 
391   std::atomic<uint64_t> ModifiedFuncCount{0};
392 
393   ParallelUtilities::WorkFuncTy WorkFun = [&](BinaryFunction &BF) {
394     modifyFunctionLayout(BF, opts::ReorderBlocks, opts::MinBranchClusters);
395     if (BF.getLayout().hasLayoutChanged())
396       ++ModifiedFuncCount;
397   };
398 
399   ParallelUtilities::PredicateTy SkipFunc = [&](const BinaryFunction &BF) {
400     return !shouldOptimize(BF);
401   };
402 
403   ParallelUtilities::runOnEachFunction(
404       BC, ParallelUtilities::SchedulingPolicy::SP_BB_LINEAR, WorkFun, SkipFunc,
405       "ReorderBasicBlocks");
406 
407   outs() << "BOLT-INFO: basic block reordering modified layout of "
408          << format("%zu (%.2lf%%) functions\n", ModifiedFuncCount.load(),
409                    100.0 * ModifiedFuncCount.load() /
410                        BC.getBinaryFunctions().size());
411 
412   if (opts::PrintFuncStat > 0) {
413     raw_ostream &OS = outs();
414     // Copy all the values into vector in order to sort them
415     std::map<uint64_t, BinaryFunction &> ScoreMap;
416     auto &BFs = BC.getBinaryFunctions();
417     for (auto It = BFs.begin(); It != BFs.end(); ++It)
418       ScoreMap.insert(std::pair<uint64_t, BinaryFunction &>(
419           It->second.getFunctionScore(), It->second));
420 
421     OS << "\nBOLT-INFO: Printing Function Statistics:\n\n";
422     OS << "           There are " << BFs.size() << " functions in total. \n";
423     OS << "           Number of functions being modified: "
424        << ModifiedFuncCount.load() << "\n";
425     OS << "           User asks for detailed information on top "
426        << opts::PrintFuncStat << " functions. (Ranked by function score)"
427        << "\n\n";
428     uint64_t I = 0;
429     for (std::map<uint64_t, BinaryFunction &>::reverse_iterator Rit =
430              ScoreMap.rbegin();
431          Rit != ScoreMap.rend() && I < opts::PrintFuncStat; ++Rit, ++I) {
432       BinaryFunction &Function = Rit->second;
433 
434       OS << "           Information for function of top: " << (I + 1) << ": \n";
435       OS << "             Function Score is: " << Function.getFunctionScore()
436          << "\n";
437       OS << "             There are " << Function.size()
438          << " number of blocks in this function.\n";
439       OS << "             There are " << Function.getInstructionCount()
440          << " number of instructions in this function.\n";
441       OS << "             The edit distance for this function is: "
442          << Function.getLayout().getEditDistance() << "\n\n";
443     }
444   }
445 }
446 
modifyFunctionLayout(BinaryFunction & BF,LayoutType Type,bool MinBranchClusters) const447 void ReorderBasicBlocks::modifyFunctionLayout(BinaryFunction &BF,
448                                               LayoutType Type,
449                                               bool MinBranchClusters) const {
450   if (BF.size() == 0 || Type == LT_NONE)
451     return;
452 
453   BinaryFunction::BasicBlockOrderType NewLayout;
454   std::unique_ptr<ReorderAlgorithm> Algo;
455 
456   // Cannot do optimal layout without profile.
457   if (Type != LT_REVERSE && !BF.hasValidProfile())
458     return;
459 
460   if (Type == LT_REVERSE) {
461     Algo.reset(new ReverseReorderAlgorithm());
462   } else if (BF.size() <= opts::TSPThreshold && Type != LT_OPTIMIZE_SHUFFLE) {
463     // Work on optimal solution if problem is small enough
464     LLVM_DEBUG(dbgs() << "finding optimal block layout for " << BF << "\n");
465     Algo.reset(new TSPReorderAlgorithm());
466   } else {
467     LLVM_DEBUG(dbgs() << "running block layout heuristics on " << BF << "\n");
468 
469     std::unique_ptr<ClusterAlgorithm> CAlgo;
470     if (MinBranchClusters)
471       CAlgo.reset(new MinBranchGreedyClusterAlgorithm());
472     else
473       CAlgo.reset(new PHGreedyClusterAlgorithm());
474 
475     switch (Type) {
476     case LT_OPTIMIZE:
477       Algo.reset(new OptimizeReorderAlgorithm(std::move(CAlgo)));
478       break;
479 
480     case LT_OPTIMIZE_BRANCH:
481       Algo.reset(new OptimizeBranchReorderAlgorithm(std::move(CAlgo)));
482       break;
483 
484     case LT_OPTIMIZE_CACHE:
485       Algo.reset(new OptimizeCacheReorderAlgorithm(std::move(CAlgo)));
486       break;
487 
488     case LT_OPTIMIZE_EXT_TSP:
489       Algo.reset(new ExtTSPReorderAlgorithm());
490       break;
491 
492     case LT_OPTIMIZE_SHUFFLE:
493       Algo.reset(new RandomClusterReorderAlgorithm(std::move(CAlgo)));
494       break;
495 
496     default:
497       llvm_unreachable("unexpected layout type");
498     }
499   }
500 
501   Algo->reorderBasicBlocks(BF, NewLayout);
502 
503   BF.getLayout().update(NewLayout);
504 }
505 
runOnFunctions(BinaryContext & BC)506 void FixupBranches::runOnFunctions(BinaryContext &BC) {
507   for (auto &It : BC.getBinaryFunctions()) {
508     BinaryFunction &Function = It.second;
509     if (!BC.shouldEmit(Function) || !Function.isSimple())
510       continue;
511 
512     Function.fixBranches();
513   }
514 }
515 
runOnFunctions(BinaryContext & BC)516 void FinalizeFunctions::runOnFunctions(BinaryContext &BC) {
517   ParallelUtilities::WorkFuncTy WorkFun = [&](BinaryFunction &BF) {
518     if (!BF.finalizeCFIState()) {
519       if (BC.HasRelocations) {
520         errs() << "BOLT-ERROR: unable to fix CFI state for function " << BF
521                << ". Exiting.\n";
522         exit(1);
523       }
524       BF.setSimple(false);
525       return;
526     }
527 
528     BF.setFinalized();
529 
530     // Update exception handling information.
531     BF.updateEHRanges();
532   };
533 
534   ParallelUtilities::PredicateTy SkipPredicate = [&](const BinaryFunction &BF) {
535     return !BC.shouldEmit(BF);
536   };
537 
538   ParallelUtilities::runOnEachFunction(
539       BC, ParallelUtilities::SchedulingPolicy::SP_CONSTANT, WorkFun,
540       SkipPredicate, "FinalizeFunctions");
541 }
542 
runOnFunctions(BinaryContext & BC)543 void CheckLargeFunctions::runOnFunctions(BinaryContext &BC) {
544   if (BC.HasRelocations)
545     return;
546 
547   if (!opts::UpdateDebugSections)
548     return;
549 
550   // If the function wouldn't fit, mark it as non-simple. Otherwise, we may emit
551   // incorrect debug info.
552   ParallelUtilities::WorkFuncTy WorkFun = [&](BinaryFunction &BF) {
553     uint64_t HotSize, ColdSize;
554     std::tie(HotSize, ColdSize) =
555         BC.calculateEmittedSize(BF, /*FixBranches=*/false);
556     if (HotSize > BF.getMaxSize())
557       BF.setSimple(false);
558   };
559 
560   ParallelUtilities::PredicateTy SkipFunc = [&](const BinaryFunction &BF) {
561     return !shouldOptimize(BF);
562   };
563 
564   ParallelUtilities::runOnEachFunction(
565       BC, ParallelUtilities::SchedulingPolicy::SP_INST_LINEAR, WorkFun,
566       SkipFunc, "CheckLargeFunctions");
567 }
568 
shouldOptimize(const BinaryFunction & BF) const569 bool CheckLargeFunctions::shouldOptimize(const BinaryFunction &BF) const {
570   // Unlike other passes, allow functions in non-CFG state.
571   return BF.isSimple() && !BF.isIgnored();
572 }
573 
/// Lower MCInst annotations before emission: convert GnuArgsSize annotations
/// into CFI instructions, strip all remaining annotations, and re-attach the
/// offset annotations needed for address translation.
void LowerAnnotations::runOnFunctions(BinaryContext &BC) {
  // Offset annotations that must survive stripping; re-attached at the end.
  std::vector<std::pair<MCInst *, uint32_t>> PreservedOffsetAnnotations;

  for (auto &It : BC.getBinaryFunctions()) {
    BinaryFunction &BF = It.second;
    int64_t CurrentGnuArgsSize = 0;

    // Have we crossed hot/cold border for split functions?
    bool SeenCold = false;

    for (BinaryBasicBlock *BB : BF.getLayout().blocks()) {
      if (BB->isCold() && !SeenCold) {
        SeenCold = true;
        // NOTE(review): the running GNU_args_size value is reset at the
        // hot/cold boundary — presumably because the cold fragment starts
        // from a clean CFI state; confirm against CFI emission.
        CurrentGnuArgsSize = 0;
      }

      // First convert GnuArgsSize annotations into CFIs. This may change instr
      // pointers, so do it before recording ptrs for preserved annotations
      if (BF.usesGnuArgsSize()) {
        for (auto II = BB->begin(); II != BB->end(); ++II) {
          if (!BC.MIB->isInvoke(*II))
            continue;
          // Only emit a CFI when the value differs from the running state.
          const int64_t NewGnuArgsSize = BC.MIB->getGnuArgsSize(*II);
          assert(NewGnuArgsSize >= 0 && "expected non-negative GNU_args_size");
          if (NewGnuArgsSize != CurrentGnuArgsSize) {
            auto InsertII = BF.addCFIInstruction(
                BB, II,
                MCCFIInstruction::createGnuArgsSize(nullptr, NewGnuArgsSize));
            CurrentGnuArgsSize = NewGnuArgsSize;
            // Resume scanning after the newly inserted CFI instruction.
            II = std::next(InsertII);
          }
        }
      }

      // Now record preserved annotations separately and then strip annotations.
      for (auto II = BB->begin(); II != BB->end(); ++II) {
        // Offsets are kept for functions that require address translation.
        if (BF.requiresAddressTranslation() && BC.MIB->getOffset(*II))
          PreservedOffsetAnnotations.emplace_back(&(*II),
                                                  *BC.MIB->getOffset(*II));
        BC.MIB->stripAnnotations(*II);
      }
    }
  }
  // Injected functions have no annotations to preserve; just strip them.
  for (BinaryFunction *BF : BC.getInjectedBinaryFunctions())
    for (BinaryBasicBlock &BB : *BF)
      for (MCInst &Instruction : BB)
        BC.MIB->stripAnnotations(Instruction);

  // Release all memory taken by annotations
  BC.MIB->freeAnnotations();

  // Reinsert preserved annotations we need during code emission.
  for (const std::pair<MCInst *, uint32_t> &Item : PreservedOffsetAnnotations)
    BC.MIB->setOffset(*Item.first, Item.second);
}
629 
630 namespace {
631 
632 // This peephole fixes jump instructions that jump to another basic
633 // block with a single jump instruction, e.g.
634 //
635 // B0: ...
636 //     jmp  B1   (or jcc B1)
637 //
638 // B1: jmp  B2
639 //
640 // ->
641 //
642 // B0: ...
643 //     jmp  B2   (or jcc B2)
644 //
/// Remove "double jumps": for each block BB consisting solely of an
/// unconditional jump (or tail call), redirect BB's predecessors straight to
/// BB's target. If \p MarkInvalid is set, blocks left without predecessors
/// are marked invalid so later UCE can erase them. Returns the number of
/// double jumps removed.
uint64_t fixDoubleJumps(BinaryFunction &Function, bool MarkInvalid) {
  uint64_t NumDoubleJumps = 0;

  MCContext *Ctx = Function.getBinaryContext().Ctx.get();
  MCPlusBuilder *MIB = Function.getBinaryContext().MIB.get();
  for (BinaryBasicBlock &BB : Function) {
    // Rewire Pred's edge and branch targeting BB so they go directly to
    // Succ, or become a tail call to SuccSym when Succ is null. Returns
    // true on success.
    auto checkAndPatch = [&](BinaryBasicBlock *Pred, BinaryBasicBlock *Succ,
                             const MCSymbol *SuccSym) {
      // Ignore infinite loop jumps or fallthrough tail jumps.
      if (Pred == Succ || Succ == &BB)
        return false;

      if (Succ) {
        const MCSymbol *TBB = nullptr;
        const MCSymbol *FBB = nullptr;
        MCInst *CondBranch = nullptr;
        MCInst *UncondBranch = nullptr;
        bool Res = Pred->analyzeBranch(TBB, FBB, CondBranch, UncondBranch);
        if (!Res) {
          LLVM_DEBUG(dbgs() << "analyzeBranch failed in peepholes in block:\n";
                     Pred->dump());
          return false;
        }
        Pred->replaceSuccessor(&BB, Succ);

        // We must patch up any existing branch instructions to match up
        // with the new successor.
        assert((CondBranch || (!CondBranch && Pred->succ_size() == 1)) &&
               "Predecessor block has inconsistent number of successors");
        if (CondBranch && MIB->getTargetSymbol(*CondBranch) == BB.getLabel()) {
          MIB->replaceBranchTarget(*CondBranch, Succ->getLabel(), Ctx);
        } else if (UncondBranch &&
                   MIB->getTargetSymbol(*UncondBranch) == BB.getLabel()) {
          MIB->replaceBranchTarget(*UncondBranch, Succ->getLabel(), Ctx);
        } else if (!UncondBranch) {
          // No branch at all: Pred fell through to BB. An explicit jump to
          // the new target is required unless Succ is the fallthrough.
          assert(Function.getLayout().getBasicBlockAfter(Pred, false) != Succ &&
                 "Don't add an explicit jump to a fallthrough block.");
          Pred->addBranchInstruction(Succ);
        }
      } else {
        // Succ will be null in the tail call case.  In this case we
        // need to explicitly add a tail call instruction.
        MCInst *Branch = Pred->getLastNonPseudoInstr();
        if (Branch && MIB->isUnconditionalBranch(*Branch)) {
          assert(MIB->getTargetSymbol(*Branch) == BB.getLabel());
          Pred->removeSuccessor(&BB);
          Pred->eraseInstruction(Pred->findInstruction(Branch));
          Pred->addTailCallInstruction(SuccSym);
        } else {
          return false;
        }
      }

      ++NumDoubleJumps;
      LLVM_DEBUG(dbgs() << "Removed double jump in " << Function << " from "
                        << Pred->getName() << " -> " << BB.getName() << " to "
                        << Pred->getName() << " -> " << SuccSym->getName()
                        << (!Succ ? " (tail)\n" : "\n"));

      return true;
    };

    // Only consider blocks whose sole non-pseudo instruction is a jump or
    // tail call; landing pads are left alone.
    if (BB.getNumNonPseudos() != 1 || BB.isLandingPad())
      continue;

    MCInst *Inst = BB.getFirstNonPseudoInstr();
    const bool IsTailCall = MIB->isTailCall(*Inst);

    if (!MIB->isUnconditionalBranch(*Inst) && !IsTailCall)
      continue;

    // If we operate after SCTC make sure it's not a conditional tail call.
    if (IsTailCall && MIB->isConditionalBranch(*Inst))
      continue;

    const MCSymbol *SuccSym = MIB->getTargetSymbol(*Inst);
    BinaryBasicBlock *Succ = BB.getSuccessor();

    // Need either a CFG successor (plain jump) or a target symbol (tail call).
    if (((!Succ || &BB == Succ) && !IsTailCall) || (IsTailCall && !SuccSym))
      continue;

    // Take a snapshot: checkAndPatch() mutates BB's predecessor list.
    std::vector<BinaryBasicBlock *> Preds = {BB.pred_begin(), BB.pred_end()};

    for (BinaryBasicBlock *Pred : Preds) {
      if (Pred->isLandingPad())
        continue;

      if (Pred->getSuccessor() == &BB ||
          (Pred->getConditionalSuccessor(true) == &BB && !IsTailCall) ||
          Pred->getConditionalSuccessor(false) == &BB)
        if (checkAndPatch(Pred, Succ, SuccSym) && MarkInvalid)
          // BB may now be unreachable: keep it valid only while it still
          // has predecessors or is otherwise externally anchored.
          BB.markValid(BB.pred_size() != 0 || BB.isLandingPad() ||
                       BB.isEntryPoint());
    }
  }

  return NumDoubleJumps;
}
743 } // namespace
744 
shouldRewriteBranch(const BinaryBasicBlock * PredBB,const MCInst & CondBranch,const BinaryBasicBlock * BB,const bool DirectionFlag)745 bool SimplifyConditionalTailCalls::shouldRewriteBranch(
746     const BinaryBasicBlock *PredBB, const MCInst &CondBranch,
747     const BinaryBasicBlock *BB, const bool DirectionFlag) {
748   if (BeenOptimized.count(PredBB))
749     return false;
750 
751   const bool IsForward = BinaryFunction::isForwardBranch(PredBB, BB);
752 
753   if (IsForward)
754     ++NumOrigForwardBranches;
755   else
756     ++NumOrigBackwardBranches;
757 
758   if (opts::SctcMode == opts::SctcAlways)
759     return true;
760 
761   if (opts::SctcMode == opts::SctcPreserveDirection)
762     return IsForward == DirectionFlag;
763 
764   const ErrorOr<std::pair<double, double>> Frequency =
765       PredBB->getBranchStats(BB);
766 
767   // It's ok to rewrite the conditional branch if the new target will be
768   // a backward branch.
769 
770   // If no data available for these branches, then it should be ok to
771   // do the optimization since it will reduce code size.
772   if (Frequency.getError())
773     return true;
774 
775   // TODO: should this use misprediction frequency instead?
776   const bool Result = (IsForward && Frequency.get().first >= 0.5) ||
777                       (!IsForward && Frequency.get().first <= 0.5);
778 
779   return Result == DirectionFlag;
780 }
781 
// Rewrite conditional branches that target a block consisting of a single
// direct tail call so that the conditional branch itself becomes a
// conditional tail call (CTC), making the intermediate block dead.
// Returns non-zero iff at least one CTC was created in BF.
uint64_t SimplifyConditionalTailCalls::fixTailCalls(BinaryFunction &BF) {
  // Need updated indices to correctly detect branches' direction.
  BF.getLayout().updateLayoutIndices();
  BF.markUnreachableBlocks();

  MCPlusBuilder *MIB = BF.getBinaryContext().MIB.get();
  MCContext *Ctx = BF.getBinaryContext().Ctx.get();
  uint64_t NumLocalCTCCandidates = 0;
  uint64_t NumLocalCTCs = 0;
  uint64_t LocalCTCTakenCount = 0;
  uint64_t LocalCTCExecCount = 0;
  // Predecessors whose branch condition was reversed and therefore still
  // need their fall-through patched with an unconditional branch. Fixed up
  // in the second pass below.
  std::vector<std::pair<BinaryBasicBlock *, const BinaryBasicBlock *>>
      NeedsUncondBranch;

  // Will block be deleted by UCE?
  auto isValid = [](const BinaryBasicBlock *BB) {
    return (BB->pred_size() != 0 || BB->isLandingPad() || BB->isEntryPoint());
  };

  for (BinaryBasicBlock *BB : BF.getLayout().blocks()) {
    // Locate BB with a single direct tail-call instruction.
    if (BB->getNumNonPseudos() != 1)
      continue;

    MCInst *Instr = BB->getFirstNonPseudoInstr();
    if (!MIB->isTailCall(*Instr) || MIB->isConditionalBranch(*Instr))
      continue;

    const MCSymbol *CalleeSymbol = MIB->getTargetSymbol(*Instr);
    if (!CalleeSymbol)
      continue;

    // Detect direction of the possible conditional tail call.
    const bool IsForwardCTC = BF.isForwardCall(CalleeSymbol);

    // Iterate through all predecessors.
    for (BinaryBasicBlock *PredBB : BB->predecessors()) {
      BinaryBasicBlock *CondSucc = PredBB->getConditionalSuccessor(true);
      if (!CondSucc)
        continue;

      ++NumLocalCTCCandidates;

      const MCSymbol *TBB = nullptr;
      const MCSymbol *FBB = nullptr;
      MCInst *CondBranch = nullptr;
      MCInst *UncondBranch = nullptr;
      bool Result = PredBB->analyzeBranch(TBB, FBB, CondBranch, UncondBranch);

      // analyzeBranch() can fail due to unusual branch instructions, e.g. jrcxz
      if (!Result) {
        LLVM_DEBUG(dbgs() << "analyzeBranch failed in SCTC in block:\n";
                   PredBB->dump());
        continue;
      }

      assert(Result && "internal error analyzing conditional branch");
      assert(CondBranch && "conditional branch expected");

      // It's possible that PredBB is also a successor to BB that may have
      // been processed by a previous iteration of the SCTC loop, in which
      // case it may have been marked invalid.  We should skip rewriting in
      // this case.
      if (!PredBB->isValid()) {
        assert(PredBB->isSuccessor(BB) &&
               "PredBB should be valid if it is not a successor to BB");
        continue;
      }

      // We don't want to reverse direction of the branch in new order
      // without further profile analysis.
      const bool DirectionFlag = CondSucc == BB ? IsForwardCTC : !IsForwardCTC;
      if (!shouldRewriteBranch(PredBB, *CondBranch, BB, DirectionFlag))
        continue;

      // Record this block so that we don't try to optimize it twice.
      BeenOptimized.insert(PredBB);

      uint64_t Count = 0;
      if (CondSucc != BB) {
        // Patch the new target address into the conditional branch.
        MIB->reverseBranchCondition(*CondBranch, CalleeSymbol, Ctx);
        // Since we reversed the condition on the branch we need to change
        // the target for the unconditional branch or add a unconditional
        // branch to the old target.  This has to be done manually since
        // fixupBranches is not called after SCTC.
        NeedsUncondBranch.emplace_back(PredBB, CondSucc);
        Count = PredBB->getFallthroughBranchInfo().Count;
      } else {
        // Change destination of the conditional branch.
        MIB->replaceBranchTarget(*CondBranch, CalleeSymbol, Ctx);
        Count = PredBB->getTakenBranchInfo().Count;
      }
      // Treat a missing profile count as zero for the CTC frequency.
      const uint64_t CTCTakenFreq =
          Count == BinaryBasicBlock::COUNT_NO_PROFILE ? 0 : Count;

      // Annotate it, so "isCall" returns true for this jcc
      MIB->setConditionalTailCall(*CondBranch);
      // Add info about the conditional tail call frequency, otherwise this
      // info will be lost when we delete the associated BranchInfo entry
      auto &CTCAnnotation =
          MIB->getOrCreateAnnotationAs<uint64_t>(*CondBranch, "CTCTakenCount");
      CTCAnnotation = CTCTakenFreq;

      // Remove the unused successor which may be eliminated later
      // if there are no other users.
      PredBB->removeSuccessor(BB);
      // Update BB execution count: the CTC diverts CTCTakenFreq executions
      // away from BB; clamp at zero if the profile is inconsistent.
      if (CTCTakenFreq && CTCTakenFreq <= BB->getKnownExecutionCount())
        BB->setExecutionCount(BB->getExecutionCount() - CTCTakenFreq);
      else if (CTCTakenFreq > BB->getKnownExecutionCount())
        BB->setExecutionCount(0);

      ++NumLocalCTCs;
      LocalCTCTakenCount += CTCTakenFreq;
      LocalCTCExecCount += PredBB->getKnownExecutionCount();
    }

    // Remove the block from CFG if all predecessors were removed.
    BB->markValid(isValid(BB));
  }

  // Add unconditional branches at the end of BBs to new successors
  // as long as the successor is not a fallthrough.
  for (auto &Entry : NeedsUncondBranch) {
    BinaryBasicBlock *PredBB = Entry.first;
    const BinaryBasicBlock *CondSucc = Entry.second;

    const MCSymbol *TBB = nullptr;
    const MCSymbol *FBB = nullptr;
    MCInst *CondBranch = nullptr;
    MCInst *UncondBranch = nullptr;
    PredBB->analyzeBranch(TBB, FBB, CondBranch, UncondBranch);

    // Find the next valid block.  Invalid blocks will be deleted
    // so they shouldn't be considered fallthrough targets.
    const BinaryBasicBlock *NextBlock =
        BF.getLayout().getBasicBlockAfter(PredBB, false);
    while (NextBlock && !isValid(NextBlock))
      NextBlock = BF.getLayout().getBasicBlockAfter(NextBlock, false);

    // Get the unconditional successor to this block.
    const BinaryBasicBlock *PredSucc = PredBB->getSuccessor();
    assert(PredSucc && "The other branch should be a tail call");

    const bool HasFallthrough = (NextBlock && PredSucc == NextBlock);

    // Either retarget the existing unconditional branch, delete it if the
    // successor is now the fall-through, or synthesize a new branch.
    if (UncondBranch) {
      if (HasFallthrough)
        PredBB->eraseInstruction(PredBB->findInstruction(UncondBranch));
      else
        MIB->replaceBranchTarget(*UncondBranch, CondSucc->getLabel(), Ctx);
    } else if (!HasFallthrough) {
      MCInst Branch;
      MIB->createUncondBranch(Branch, CondSucc->getLabel(), Ctx);
      PredBB->addInstruction(Branch);
    }
  }

  if (NumLocalCTCs > 0) {
    NumDoubleJumps += fixDoubleJumps(BF, true);
    // Clean-up unreachable tail-call blocks.
    const std::pair<unsigned, uint64_t> Stats = BF.eraseInvalidBBs();
    DeletedBlocks += Stats.first;
    DeletedBytes += Stats.second;

    assert(BF.validateCFG());
  }

  LLVM_DEBUG(dbgs() << "BOLT: created " << NumLocalCTCs
                    << " conditional tail calls from a total of "
                    << NumLocalCTCCandidates << " candidates in function " << BF
                    << ". CTCs execution count for this function is "
                    << LocalCTCExecCount << " and CTC taken count is "
                    << LocalCTCTakenCount << "\n";);

  // Fold the per-function counters into the pass-wide statistics.
  NumTailCallsPatched += NumLocalCTCs;
  NumCandidateTailCalls += NumLocalCTCCandidates;
  CTCExecCount += LocalCTCExecCount;
  CTCTakenCount += LocalCTCTakenCount;

  return NumLocalCTCs > 0;
}
965 
runOnFunctions(BinaryContext & BC)966 void SimplifyConditionalTailCalls::runOnFunctions(BinaryContext &BC) {
967   if (!BC.isX86())
968     return;
969 
970   for (auto &It : BC.getBinaryFunctions()) {
971     BinaryFunction &Function = It.second;
972 
973     if (!shouldOptimize(Function))
974       continue;
975 
976     if (fixTailCalls(Function)) {
977       Modified.insert(&Function);
978       Function.setHasCanonicalCFG(false);
979     }
980   }
981 
982   outs() << "BOLT-INFO: SCTC: patched " << NumTailCallsPatched
983          << " tail calls (" << NumOrigForwardBranches << " forward)"
984          << " tail calls (" << NumOrigBackwardBranches << " backward)"
985          << " from a total of " << NumCandidateTailCalls << " while removing "
986          << NumDoubleJumps << " double jumps"
987          << " and removing " << DeletedBlocks << " basic blocks"
988          << " totalling " << DeletedBytes
989          << " bytes of code. CTCs total execution count is " << CTCExecCount
990          << " and the number of times CTCs are taken is " << CTCTakenCount
991          << ".\n";
992 }
993 
shortenInstructions(BinaryFunction & Function)994 uint64_t ShortenInstructions::shortenInstructions(BinaryFunction &Function) {
995   uint64_t Count = 0;
996   const BinaryContext &BC = Function.getBinaryContext();
997   for (BinaryBasicBlock &BB : Function) {
998     for (MCInst &Inst : BB) {
999       MCInst OriginalInst;
1000       if (opts::Verbosity > 2)
1001         OriginalInst = Inst;
1002 
1003       if (!BC.MIB->shortenInstruction(Inst, *BC.STI))
1004         continue;
1005 
1006       if (opts::Verbosity > 2) {
1007         BC.scopeLock();
1008         outs() << "BOLT-INFO: shortening:\nBOLT-INFO:    ";
1009         BC.printInstruction(outs(), OriginalInst, 0, &Function);
1010         outs() << "BOLT-INFO: to:";
1011         BC.printInstruction(outs(), Inst, 0, &Function);
1012       }
1013 
1014       ++Count;
1015     }
1016   }
1017 
1018   return Count;
1019 }
1020 
runOnFunctions(BinaryContext & BC)1021 void ShortenInstructions::runOnFunctions(BinaryContext &BC) {
1022   std::atomic<uint64_t> NumShortened{0};
1023   if (!BC.isX86())
1024     return;
1025 
1026   ParallelUtilities::runOnEachFunction(
1027       BC, ParallelUtilities::SchedulingPolicy::SP_INST_LINEAR,
1028       [&](BinaryFunction &BF) { NumShortened += shortenInstructions(BF); },
1029       nullptr, "ShortenInstructions");
1030 
1031   outs() << "BOLT-INFO: " << NumShortened << " instructions were shortened\n";
1032 }
1033 
addTailcallTraps(BinaryFunction & Function)1034 void Peepholes::addTailcallTraps(BinaryFunction &Function) {
1035   MCPlusBuilder *MIB = Function.getBinaryContext().MIB.get();
1036   for (BinaryBasicBlock &BB : Function) {
1037     MCInst *Inst = BB.getLastNonPseudoInstr();
1038     if (Inst && MIB->isTailCall(*Inst) && MIB->isIndirectBranch(*Inst)) {
1039       MCInst Trap;
1040       if (MIB->createTrap(Trap)) {
1041         BB.addInstruction(Trap);
1042         ++TailCallTraps;
1043       }
1044     }
1045   }
1046 }
1047 
removeUselessCondBranches(BinaryFunction & Function)1048 void Peepholes::removeUselessCondBranches(BinaryFunction &Function) {
1049   for (BinaryBasicBlock &BB : Function) {
1050     if (BB.succ_size() != 2)
1051       continue;
1052 
1053     BinaryBasicBlock *CondBB = BB.getConditionalSuccessor(true);
1054     BinaryBasicBlock *UncondBB = BB.getConditionalSuccessor(false);
1055     if (CondBB != UncondBB)
1056       continue;
1057 
1058     const MCSymbol *TBB = nullptr;
1059     const MCSymbol *FBB = nullptr;
1060     MCInst *CondBranch = nullptr;
1061     MCInst *UncondBranch = nullptr;
1062     bool Result = BB.analyzeBranch(TBB, FBB, CondBranch, UncondBranch);
1063 
1064     // analyzeBranch() can fail due to unusual branch instructions,
1065     // e.g. jrcxz, or jump tables (indirect jump).
1066     if (!Result || !CondBranch)
1067       continue;
1068 
1069     BB.removeDuplicateConditionalSuccessor(CondBranch);
1070     ++NumUselessCondBranches;
1071   }
1072 }
1073 
runOnFunctions(BinaryContext & BC)1074 void Peepholes::runOnFunctions(BinaryContext &BC) {
1075   const char Opts =
1076       std::accumulate(opts::Peepholes.begin(), opts::Peepholes.end(), 0,
1077                       [](const char A, const PeepholeOpts B) { return A | B; });
1078   if (Opts == PEEP_NONE)
1079     return;
1080 
1081   for (auto &It : BC.getBinaryFunctions()) {
1082     BinaryFunction &Function = It.second;
1083     if (shouldOptimize(Function)) {
1084       if (Opts & PEEP_DOUBLE_JUMPS)
1085         NumDoubleJumps += fixDoubleJumps(Function, false);
1086       if (Opts & PEEP_TAILCALL_TRAPS)
1087         addTailcallTraps(Function);
1088       if (Opts & PEEP_USELESS_BRANCHES)
1089         removeUselessCondBranches(Function);
1090       assert(Function.validateCFG());
1091     }
1092   }
1093   outs() << "BOLT-INFO: Peephole: " << NumDoubleJumps
1094          << " double jumps patched.\n"
1095          << "BOLT-INFO: Peephole: " << TailCallTraps
1096          << " tail call traps inserted.\n"
1097          << "BOLT-INFO: Peephole: " << NumUselessCondBranches
1098          << " useless conditional branches removed.\n";
1099 }
1100 
// Replace loads from read-only data sections with immediate operands when
// the target address can be computed statically and the bytes at that
// address cannot change (no relocations). Returns true iff at least one
// load in BF was simplified.
bool SimplifyRODataLoads::simplifyRODataLoads(BinaryFunction &BF) {
  BinaryContext &BC = BF.getBinaryContext();
  MCPlusBuilder *MIB = BC.MIB.get();

  // Per-function counters, folded into the pass-wide totals at the end.
  uint64_t NumLocalLoadsSimplified = 0;
  uint64_t NumDynamicLocalLoadsSimplified = 0;
  uint64_t NumLocalLoadsFound = 0;
  uint64_t NumDynamicLocalLoadsFound = 0;

  for (BinaryBasicBlock *BB : BF.getLayout().blocks()) {
    for (MCInst &Inst : *BB) {
      unsigned Opcode = Inst.getOpcode();
      const MCInstrDesc &Desc = BC.MII->get(Opcode);

      // Skip instructions that do not load from memory.
      if (!Desc.mayLoad())
        continue;

      // Try to statically evaluate the target memory address.
      uint64_t TargetAddress;

      if (MIB->hasPCRelOperand(Inst)) {
        // Try to find the symbol that corresponds to the PC-relative operand.
        MCOperand *DispOpI = MIB->getMemOperandDisp(Inst);
        assert(DispOpI != Inst.end() && "expected PC-relative displacement");
        assert(DispOpI->isExpr() &&
               "found PC-relative with non-symbolic displacement");

        // Get displacement symbol.
        const MCSymbol *DisplSymbol;
        uint64_t DisplOffset;

        std::tie(DisplSymbol, DisplOffset) =
            MIB->getTargetSymbolInfo(DispOpI->getExpr());

        if (!DisplSymbol)
          continue;

        // Look up the symbol address in the global symbols map of the binary
        // context object.
        BinaryData *BD = BC.getBinaryDataByName(DisplSymbol->getName());
        if (!BD)
          continue;
        TargetAddress = BD->getAddress() + DisplOffset;
      } else if (!MIB->evaluateMemOperandTarget(Inst, TargetAddress)) {
        // Neither PC-relative nor otherwise statically evaluable.
        continue;
      }

      // Get the contents of the section containing the target address of the
      // memory operand. We are only interested in read-only sections.
      ErrorOr<BinarySection &> DataSection =
          BC.getSectionForAddress(TargetAddress);
      if (!DataSection || !DataSection->isReadOnly())
        continue;

      // A (static or dynamic) relocation at the target means the bytes can
      // change at link/load time, so the load cannot be folded to a constant.
      if (BC.getRelocationAt(TargetAddress) ||
          BC.getDynamicRelocationAt(TargetAddress))
        continue;

      uint32_t Offset = TargetAddress - DataSection->getAddress();
      StringRef ConstantData = DataSection->getContents();

      ++NumLocalLoadsFound;
      if (BB->hasProfile())
        NumDynamicLocalLoadsFound += BB->getExecutionCount();

      // Rewrite the memory operand with the immediate value if the target
      // supports an equivalent encoding for this instruction.
      if (MIB->replaceMemOperandWithImm(Inst, ConstantData, Offset)) {
        ++NumLocalLoadsSimplified;
        if (BB->hasProfile())
          NumDynamicLocalLoadsSimplified += BB->getExecutionCount();
      }
    }
  }

  NumLoadsFound += NumLocalLoadsFound;
  NumDynamicLoadsFound += NumDynamicLocalLoadsFound;
  NumLoadsSimplified += NumLocalLoadsSimplified;
  NumDynamicLoadsSimplified += NumDynamicLocalLoadsSimplified;

  return NumLocalLoadsSimplified > 0;
}
1182 
runOnFunctions(BinaryContext & BC)1183 void SimplifyRODataLoads::runOnFunctions(BinaryContext &BC) {
1184   for (auto &It : BC.getBinaryFunctions()) {
1185     BinaryFunction &Function = It.second;
1186     if (shouldOptimize(Function) && simplifyRODataLoads(Function))
1187       Modified.insert(&Function);
1188   }
1189 
1190   outs() << "BOLT-INFO: simplified " << NumLoadsSimplified << " out of "
1191          << NumLoadsFound << " loads from a statically computed address.\n"
1192          << "BOLT-INFO: dynamic loads simplified: " << NumDynamicLoadsSimplified
1193          << "\n"
1194          << "BOLT-INFO: dynamic loads found: " << NumDynamicLoadsFound << "\n";
1195 }
1196 
runOnFunctions(BinaryContext & BC)1197 void AssignSections::runOnFunctions(BinaryContext &BC) {
1198   for (BinaryFunction *Function : BC.getInjectedBinaryFunctions()) {
1199     Function->setCodeSectionName(BC.getInjectedCodeSectionName());
1200     Function->setColdCodeSectionName(BC.getInjectedColdCodeSectionName());
1201   }
1202 
1203   // In non-relocation mode functions have pre-assigned section names.
1204   if (!BC.HasRelocations)
1205     return;
1206 
1207   const bool UseColdSection =
1208       BC.NumProfiledFuncs > 0 ||
1209       opts::ReorderFunctions == ReorderFunctions::RT_USER;
1210   for (auto &BFI : BC.getBinaryFunctions()) {
1211     BinaryFunction &Function = BFI.second;
1212     if (opts::isHotTextMover(Function)) {
1213       Function.setCodeSectionName(BC.getHotTextMoverSectionName());
1214       Function.setColdCodeSectionName(BC.getHotTextMoverSectionName());
1215       continue;
1216     }
1217 
1218     if (!UseColdSection || Function.hasValidIndex())
1219       Function.setCodeSectionName(BC.getMainCodeSectionName());
1220     else
1221       Function.setCodeSectionName(BC.getColdCodeSectionName());
1222 
1223     if (Function.isSplit())
1224       Function.setColdCodeSectionName(BC.getColdCodeSectionName());
1225   }
1226 }
1227 
runOnFunctions(BinaryContext & BC)1228 void PrintProfileStats::runOnFunctions(BinaryContext &BC) {
1229   double FlowImbalanceMean = 0.0;
1230   size_t NumBlocksConsidered = 0;
1231   double WorstBias = 0.0;
1232   const BinaryFunction *WorstBiasFunc = nullptr;
1233 
1234   // For each function CFG, we fill an IncomingMap with the sum of the frequency
1235   // of incoming edges for each BB. Likewise for each OutgoingMap and the sum
1236   // of the frequency of outgoing edges.
1237   using FlowMapTy = std::unordered_map<const BinaryBasicBlock *, uint64_t>;
1238   std::unordered_map<const BinaryFunction *, FlowMapTy> TotalIncomingMaps;
1239   std::unordered_map<const BinaryFunction *, FlowMapTy> TotalOutgoingMaps;
1240 
1241   // Compute mean
1242   for (const auto &BFI : BC.getBinaryFunctions()) {
1243     const BinaryFunction &Function = BFI.second;
1244     if (Function.empty() || !Function.isSimple())
1245       continue;
1246     FlowMapTy &IncomingMap = TotalIncomingMaps[&Function];
1247     FlowMapTy &OutgoingMap = TotalOutgoingMaps[&Function];
1248     for (const BinaryBasicBlock &BB : Function) {
1249       uint64_t TotalOutgoing = 0ULL;
1250       auto SuccBIIter = BB.branch_info_begin();
1251       for (BinaryBasicBlock *Succ : BB.successors()) {
1252         uint64_t Count = SuccBIIter->Count;
1253         if (Count == BinaryBasicBlock::COUNT_NO_PROFILE || Count == 0) {
1254           ++SuccBIIter;
1255           continue;
1256         }
1257         TotalOutgoing += Count;
1258         IncomingMap[Succ] += Count;
1259         ++SuccBIIter;
1260       }
1261       OutgoingMap[&BB] = TotalOutgoing;
1262     }
1263 
1264     size_t NumBlocks = 0;
1265     double Mean = 0.0;
1266     for (const BinaryBasicBlock &BB : Function) {
1267       // Do not compute score for low frequency blocks, entry or exit blocks
1268       if (IncomingMap[&BB] < 100 || OutgoingMap[&BB] == 0 || BB.isEntryPoint())
1269         continue;
1270       ++NumBlocks;
1271       const double Difference = (double)OutgoingMap[&BB] - IncomingMap[&BB];
1272       Mean += fabs(Difference / IncomingMap[&BB]);
1273     }
1274 
1275     FlowImbalanceMean += Mean;
1276     NumBlocksConsidered += NumBlocks;
1277     if (!NumBlocks)
1278       continue;
1279     double FuncMean = Mean / NumBlocks;
1280     if (FuncMean > WorstBias) {
1281       WorstBias = FuncMean;
1282       WorstBiasFunc = &Function;
1283     }
1284   }
1285   if (NumBlocksConsidered > 0)
1286     FlowImbalanceMean /= NumBlocksConsidered;
1287 
1288   // Compute standard deviation
1289   NumBlocksConsidered = 0;
1290   double FlowImbalanceVar = 0.0;
1291   for (const auto &BFI : BC.getBinaryFunctions()) {
1292     const BinaryFunction &Function = BFI.second;
1293     if (Function.empty() || !Function.isSimple())
1294       continue;
1295     FlowMapTy &IncomingMap = TotalIncomingMaps[&Function];
1296     FlowMapTy &OutgoingMap = TotalOutgoingMaps[&Function];
1297     for (const BinaryBasicBlock &BB : Function) {
1298       if (IncomingMap[&BB] < 100 || OutgoingMap[&BB] == 0)
1299         continue;
1300       ++NumBlocksConsidered;
1301       const double Difference = (double)OutgoingMap[&BB] - IncomingMap[&BB];
1302       FlowImbalanceVar +=
1303           pow(fabs(Difference / IncomingMap[&BB]) - FlowImbalanceMean, 2);
1304     }
1305   }
1306   if (NumBlocksConsidered) {
1307     FlowImbalanceVar /= NumBlocksConsidered;
1308     FlowImbalanceVar = sqrt(FlowImbalanceVar);
1309   }
1310 
1311   // Report to user
1312   outs() << format("BOLT-INFO: Profile bias score: %.4lf%% StDev: %.4lf%%\n",
1313                    (100.0 * FlowImbalanceMean), (100.0 * FlowImbalanceVar));
1314   if (WorstBiasFunc && opts::Verbosity >= 1) {
1315     outs() << "Worst average bias observed in " << WorstBiasFunc->getPrintName()
1316            << "\n";
1317     LLVM_DEBUG(WorstBiasFunc->dump());
1318   }
1319 }
1320 
// Print a summary of profile coverage and input-binary quality: counts of
// profiled / stale / non-simple functions, top-called functions, functions
// ranked by dyno stats, trapped functions, missed macro-fusion
// opportunities, suboptimal code layout, and unknown control flow.
// Exits with an error when the stale-profile percentage exceeds the
// user-specified threshold.
void PrintProgramStats::runOnFunctions(BinaryContext &BC) {
  uint64_t NumRegularFunctions = 0;
  uint64_t NumStaleProfileFunctions = 0;
  uint64_t NumNonSimpleProfiledFunctions = 0;
  uint64_t NumUnknownControlFlowFunctions = 0;
  uint64_t TotalSampleCount = 0;
  uint64_t StaleSampleCount = 0;
  std::vector<const BinaryFunction *> ProfiledFunctions;
  const char *StaleFuncsHeader = "BOLT-INFO: Functions with stale profile:\n";
  for (auto &BFI : BC.getBinaryFunctions()) {
    const BinaryFunction &Function = BFI.second;

    // Ignore PLT functions for stats.
    if (Function.isPLTFunction())
      continue;

    ++NumRegularFunctions;

    if (!Function.isSimple()) {
      if (Function.hasProfile())
        ++NumNonSimpleProfiledFunctions;
      continue;
    }

    if (Function.hasUnknownControlFlow()) {
      if (opts::PrintUnknownCFG)
        Function.dump();
      else if (opts::PrintUnknown)
        errs() << "function with unknown control flow: " << Function << '\n';

      ++NumUnknownControlFlowFunctions;
    }

    if (!Function.hasProfile())
      continue;

    uint64_t SampleCount = Function.getRawBranchCount();
    TotalSampleCount += SampleCount;

    if (Function.hasValidProfile()) {
      ProfiledFunctions.push_back(&Function);
    } else {
      // Print the header lazily so it appears only once, before the first
      // stale function reported.
      if (opts::ReportStaleFuncs) {
        outs() << StaleFuncsHeader;
        StaleFuncsHeader = "";
        outs() << "  " << Function << '\n';
      }
      ++NumStaleProfileFunctions;
      StaleSampleCount += SampleCount;
    }
  }
  // Make the profiled-function count available to later passes.
  BC.NumProfiledFuncs = ProfiledFunctions.size();

  const size_t NumAllProfiledFunctions =
      ProfiledFunctions.size() + NumStaleProfileFunctions;
  outs() << "BOLT-INFO: " << NumAllProfiledFunctions << " out of "
         << NumRegularFunctions << " functions in the binary ("
         << format("%.1f", NumAllProfiledFunctions /
                               (float)NumRegularFunctions * 100.0f)
         << "%) have non-empty execution profile\n";
  if (NumNonSimpleProfiledFunctions) {
    outs() << "BOLT-INFO: " << NumNonSimpleProfiledFunctions << " function"
           << (NumNonSimpleProfiledFunctions == 1 ? "" : "s")
           << " with profile could not be optimized\n";
  }
  if (NumStaleProfileFunctions) {
    const float PctStale =
        NumStaleProfileFunctions / (float)NumAllProfiledFunctions * 100.0f;
    // Above the threshold the stale profile is an error; below, a warning.
    auto printErrorOrWarning = [&]() {
      if (PctStale > opts::StaleThreshold)
        errs() << "BOLT-ERROR: ";
      else
        errs() << "BOLT-WARNING: ";
    };
    printErrorOrWarning();
    errs() << NumStaleProfileFunctions
           << format(" (%.1f%% of all profiled)", PctStale) << " function"
           << (NumStaleProfileFunctions == 1 ? "" : "s")
           << " have invalid (possibly stale) profile."
              " Use -report-stale to see the list.\n";
    if (TotalSampleCount > 0) {
      printErrorOrWarning();
      errs() << StaleSampleCount << " out of " << TotalSampleCount
             << " samples in the binary ("
             << format("%.1f", ((100.0f * StaleSampleCount) / TotalSampleCount))
             << "%) belong to functions with invalid"
                " (possibly stale) profile.\n";
    }
    if (PctStale > opts::StaleThreshold) {
      // Hard failure: the profile is too stale to proceed.
      errs() << "BOLT-ERROR: stale functions exceed specified threshold of "
             << opts::StaleThreshold << "%. Exiting.\n";
      exit(1);
    }
  }

  if (const uint64_t NumUnusedObjects = BC.getNumUnusedProfiledObjects()) {
    outs() << "BOLT-INFO: profile for " << NumUnusedObjects
           << " objects was ignored\n";
  }

  // Print the hottest functions, but only when there are enough of them to
  // make the ranking interesting.
  if (ProfiledFunctions.size() > 10) {
    if (opts::Verbosity >= 1) {
      outs() << "BOLT-INFO: top called functions are:\n";
      // Sort by execution count, descending.
      llvm::sort(ProfiledFunctions,
                 [](const BinaryFunction *A, const BinaryFunction *B) {
                   return B->getExecutionCount() < A->getExecutionCount();
                 });
      auto SFI = ProfiledFunctions.begin();
      auto SFIend = ProfiledFunctions.end();
      for (unsigned I = 0u; I < opts::TopCalledLimit && SFI != SFIend;
           ++SFI, ++I)
        outs() << "  " << **SFI << " : " << (*SFI)->getExecutionCount() << '\n';
    }
  }

  // Rank functions by the dyno stats categories requested via
  // -print-sorted-by (FIRST_DYNO_STAT means "none").
  if (!opts::PrintSortedBy.empty() &&
      !llvm::is_contained(opts::PrintSortedBy, DynoStats::FIRST_DYNO_STAT)) {

    std::vector<const BinaryFunction *> Functions;
    std::map<const BinaryFunction *, DynoStats> Stats;

    // Collect dyno stats for optimizable functions with a valid profile.
    for (const auto &BFI : BC.getBinaryFunctions()) {
      const BinaryFunction &BF = BFI.second;
      if (shouldOptimize(BF) && BF.hasValidProfile()) {
        Functions.push_back(&BF);
        Stats.emplace(&BF, getDynoStats(BF));
      }
    }

    // LAST_DYNO_STAT selects sorting by all stats at once.
    const bool SortAll =
        llvm::is_contained(opts::PrintSortedBy, DynoStats::LAST_DYNO_STAT);

    const bool Ascending =
        opts::DynoStatsSortOrderOpt == opts::DynoStatsSortOrder::Ascending;

    if (SortAll) {
      llvm::stable_sort(Functions,
                        [Ascending, &Stats](const BinaryFunction *A,
                                            const BinaryFunction *B) {
                          return Ascending ? Stats.at(A) < Stats.at(B)
                                           : Stats.at(B) < Stats.at(A);
                        });
    } else {
      llvm::stable_sort(
          Functions, [Ascending, &Stats](const BinaryFunction *A,
                                         const BinaryFunction *B) {
            const DynoStats &StatsA = Stats.at(A);
            const DynoStats &StatsB = Stats.at(B);
            return Ascending ? StatsA.lessThan(StatsB, opts::PrintSortedBy)
                             : StatsB.lessThan(StatsA, opts::PrintSortedBy);
          });
    }

    outs() << "BOLT-INFO: top functions sorted by ";
    if (SortAll) {
      outs() << "dyno stats";
    } else {
      outs() << "(";
      bool PrintComma = false;
      for (const DynoStats::Category Category : opts::PrintSortedBy) {
        if (PrintComma)
          outs() << ", ";
        outs() << DynoStats::Description(Category);
        PrintComma = true;
      }
      outs() << ")";
    }

    // Print up to the top 100 functions with their per-category values.
    outs() << " are:\n";
    auto SFI = Functions.begin();
    for (unsigned I = 0; I < 100 && SFI != Functions.end(); ++SFI, ++I) {
      const DynoStats Stats = getDynoStats(**SFI);
      outs() << "  " << **SFI;
      if (!SortAll) {
        outs() << " (";
        bool PrintComma = false;
        for (const DynoStats::Category Category : opts::PrintSortedBy) {
          if (PrintComma)
            outs() << ", ";
          outs() << dynoStatsOptName(Category) << "=" << Stats[Category];
          PrintComma = true;
        }
        outs() << ")";
      }
      outs() << "\n";
    }
  }

  if (!BC.TrappedFunctions.empty()) {
    errs() << "BOLT-WARNING: " << BC.TrappedFunctions.size() << " function"
           << (BC.TrappedFunctions.size() > 1 ? "s" : "")
           << " will trap on entry. Use -trap-avx512=0 to disable"
              " traps.";
    // Only list the functions when the list is short or verbosity is raised.
    if (opts::Verbosity >= 1 || BC.TrappedFunctions.size() <= 5) {
      errs() << '\n';
      for (const BinaryFunction *Function : BC.TrappedFunctions)
        errs() << "  " << *Function << '\n';
    } else {
      errs() << " Use -v=1 to see the list.\n";
    }
  }

  // Print information on missed macro-fusion opportunities seen on input.
  if (BC.MissedMacroFusionPairs) {
    outs() << "BOLT-INFO: the input contains " << BC.MissedMacroFusionPairs
           << " (dynamic count : " << BC.MissedMacroFusionExecCount
           << ") opportunities for macro-fusion optimization";
    switch (opts::AlignMacroOpFusion) {
    case MFT_NONE:
      outs() << ". Use -align-macro-fusion to fix.\n";
      break;
    case MFT_HOT:
      outs() << ". Will fix instances on a hot path.\n";
      break;
    case MFT_ALL:
      outs() << " that are going to be fixed\n";
      break;
    }
  }

  // Collect and print information about suboptimal code layout on input.
  if (opts::ReportBadLayout) {
    std::vector<const BinaryFunction *> SuboptimalFuncs;
    for (auto &BFI : BC.getBinaryFunctions()) {
      const BinaryFunction &BF = BFI.second;
      if (!BF.hasValidProfile())
        continue;

      const uint64_t HotThreshold =
          std::max<uint64_t>(BF.getKnownExecutionCount(), 1);
      bool HotSeen = false;
      // Walk the layout in reverse: encountering a hot block and then
      // (earlier in layout order) a zero-count block flags the function as
      // having cold code placed before/among hot code.
      for (const BinaryBasicBlock *BB : BF.getLayout().rblocks()) {
        if (!HotSeen && BB->getKnownExecutionCount() > HotThreshold) {
          HotSeen = true;
          continue;
        }
        if (HotSeen && BB->getKnownExecutionCount() == 0) {
          SuboptimalFuncs.push_back(&BF);
          break;
        }
      }
    }

    if (!SuboptimalFuncs.empty()) {
      // Rank by execution density (count per byte), descending.
      llvm::sort(SuboptimalFuncs,
                 [](const BinaryFunction *A, const BinaryFunction *B) {
                   return A->getKnownExecutionCount() / A->getSize() >
                          B->getKnownExecutionCount() / B->getSize();
                 });

      outs() << "BOLT-INFO: " << SuboptimalFuncs.size()
             << " functions have "
                "cold code in the middle of hot code. Top functions are:\n";
      for (unsigned I = 0;
           I < std::min(static_cast<size_t>(opts::ReportBadLayout),
                        SuboptimalFuncs.size());
           ++I)
        SuboptimalFuncs[I]->print(outs());
    }
  }

  if (NumUnknownControlFlowFunctions) {
    outs() << "BOLT-INFO: " << NumUnknownControlFlowFunctions
           << " functions have instructions with unknown control flow";
    if (!opts::PrintUnknown)
      outs() << ". Use -print-unknown to see the list.";
    outs() << '\n';
  }
}
1590 
runOnFunctions(BinaryContext & BC)1591 void InstructionLowering::runOnFunctions(BinaryContext &BC) {
1592   for (auto &BFI : BC.getBinaryFunctions())
1593     for (BinaryBasicBlock &BB : BFI.second)
1594       for (MCInst &Instruction : BB)
1595         BC.MIB->lowerTailCall(Instruction);
1596 }
1597 
runOnFunctions(BinaryContext & BC)1598 void StripRepRet::runOnFunctions(BinaryContext &BC) {
1599   if (!BC.isX86())
1600     return;
1601 
1602   uint64_t NumPrefixesRemoved = 0;
1603   uint64_t NumBytesSaved = 0;
1604   for (auto &BFI : BC.getBinaryFunctions()) {
1605     for (BinaryBasicBlock &BB : BFI.second) {
1606       auto LastInstRIter = BB.getLastNonPseudo();
1607       if (LastInstRIter == BB.rend() || !BC.MIB->isReturn(*LastInstRIter) ||
1608           !BC.MIB->deleteREPPrefix(*LastInstRIter))
1609         continue;
1610 
1611       NumPrefixesRemoved += BB.getKnownExecutionCount();
1612       ++NumBytesSaved;
1613     }
1614   }
1615 
1616   if (NumBytesSaved)
1617     outs() << "BOLT-INFO: removed " << NumBytesSaved
1618            << " 'repz' prefixes"
1619               " with estimated execution count of "
1620            << NumPrefixesRemoved << " times.\n";
1621 }
1622 
runOnFunctions(BinaryContext & BC)1623 void InlineMemcpy::runOnFunctions(BinaryContext &BC) {
1624   if (!BC.isX86())
1625     return;
1626 
1627   uint64_t NumInlined = 0;
1628   uint64_t NumInlinedDyno = 0;
1629   for (auto &BFI : BC.getBinaryFunctions()) {
1630     for (BinaryBasicBlock &BB : BFI.second) {
1631       for (auto II = BB.begin(); II != BB.end(); ++II) {
1632         MCInst &Inst = *II;
1633 
1634         if (!BC.MIB->isCall(Inst) || MCPlus::getNumPrimeOperands(Inst) != 1 ||
1635             !Inst.getOperand(0).isExpr())
1636           continue;
1637 
1638         const MCSymbol *CalleeSymbol = BC.MIB->getTargetSymbol(Inst);
1639         if (CalleeSymbol->getName() != "memcpy" &&
1640             CalleeSymbol->getName() != "memcpy@PLT" &&
1641             CalleeSymbol->getName() != "_memcpy8")
1642           continue;
1643 
1644         const bool IsMemcpy8 = (CalleeSymbol->getName() == "_memcpy8");
1645         const bool IsTailCall = BC.MIB->isTailCall(Inst);
1646 
1647         const InstructionListType NewCode =
1648             BC.MIB->createInlineMemcpy(IsMemcpy8);
1649         II = BB.replaceInstruction(II, NewCode);
1650         std::advance(II, NewCode.size() - 1);
1651         if (IsTailCall) {
1652           MCInst Return;
1653           BC.MIB->createReturn(Return);
1654           II = BB.insertInstruction(std::next(II), std::move(Return));
1655         }
1656 
1657         ++NumInlined;
1658         NumInlinedDyno += BB.getKnownExecutionCount();
1659       }
1660     }
1661   }
1662 
1663   if (NumInlined) {
1664     outs() << "BOLT-INFO: inlined " << NumInlined << " memcpy() calls";
1665     if (NumInlinedDyno)
1666       outs() << ". The calls were executed " << NumInlinedDyno
1667              << " times based on profile.";
1668     outs() << '\n';
1669   }
1670 }
1671 
shouldOptimize(const BinaryFunction & Function) const1672 bool SpecializeMemcpy1::shouldOptimize(const BinaryFunction &Function) const {
1673   if (!BinaryFunctionPass::shouldOptimize(Function))
1674     return false;
1675 
1676   for (const std::string &FunctionSpec : Spec) {
1677     StringRef FunctionName = StringRef(FunctionSpec).split(':').first;
1678     if (Function.hasNameRegex(FunctionName))
1679       return true;
1680   }
1681 
1682   return false;
1683 }
1684 
getCallSitesToOptimize(const BinaryFunction & Function) const1685 std::set<size_t> SpecializeMemcpy1::getCallSitesToOptimize(
1686     const BinaryFunction &Function) const {
1687   StringRef SitesString;
1688   for (const std::string &FunctionSpec : Spec) {
1689     StringRef FunctionName;
1690     std::tie(FunctionName, SitesString) = StringRef(FunctionSpec).split(':');
1691     if (Function.hasNameRegex(FunctionName))
1692       break;
1693     SitesString = "";
1694   }
1695 
1696   std::set<size_t> Sites;
1697   SmallVector<StringRef, 4> SitesVec;
1698   SitesString.split(SitesVec, ':');
1699   for (StringRef SiteString : SitesVec) {
1700     if (SiteString.empty())
1701       continue;
1702     size_t Result;
1703     if (!SiteString.getAsInteger(10, Result))
1704       Sites.emplace(Result);
1705   }
1706 
1707   return Sites;
1708 }
1709 
void SpecializeMemcpy1::runOnFunctions(BinaryContext &BC) {
  // This CFG transformation is implemented for x86 only.
  if (!BC.isX86())
    return;

  uint64_t NumSpecialized = 0;     // call sites rewritten
  uint64_t NumSpecializedDyno = 0; // profile-estimated executions of them
  for (auto &BFI : BC.getBinaryFunctions()) {
    BinaryFunction &Function = BFI.second;
    if (!shouldOptimize(Function))
      continue;

    // Empty set means "specialize every memcpy call site in this function".
    std::set<size_t> CallsToOptimize = getCallSitesToOptimize(Function);
    auto shouldOptimize = [&](size_t N) {
      return CallsToOptimize.empty() || CallsToOptimize.count(N);
    };

    // Snapshot the block list: the loop below adds/splits blocks, which
    // would invalidate iteration over the function's live block sequence.
    std::vector<BinaryBasicBlock *> Blocks(Function.pbegin(), Function.pend());
    size_t CallSiteID = 0;
    for (BinaryBasicBlock *CurBB : Blocks) {
      for (auto II = CurBB->begin(); II != CurBB->end(); ++II) {
        MCInst &Inst = *II;

        // Candidates are direct calls with a single expression operand.
        if (!BC.MIB->isCall(Inst) || MCPlus::getNumPrimeOperands(Inst) != 1 ||
            !Inst.getOperand(0).isExpr())
          continue;

        const MCSymbol *CalleeSymbol = BC.MIB->getTargetSymbol(Inst);
        if (CalleeSymbol->getName() != "memcpy" &&
            CalleeSymbol->getName() != "memcpy@PLT")
          continue;

        // Tail calls have no fall-through block to rejoin, so skip them.
        if (BC.MIB->isTailCall(Inst))
          continue;

        // Site IDs are 1-based and counted over non-tail memcpy calls only,
        // matching the numbering users give in the spec option.
        ++CallSiteID;

        if (!shouldOptimize(CallSiteID))
          continue;

        // Create a copy of a call to memcpy(dest, src, size).
        MCInst MemcpyInstr = Inst;

        // Split so that OneByteMemcpyBB starts at the original call; it will
        // hold the one-byte fast path, with the call itself removed below.
        BinaryBasicBlock *OneByteMemcpyBB = CurBB->splitAt(II);

        // NextBB is the join point after the call: either split off the
        // remainder of OneByteMemcpyBB, or reuse its sole successor when the
        // call was the last real instruction in it.
        BinaryBasicBlock *NextBB = nullptr;
        if (OneByteMemcpyBB->getNumNonPseudos() > 1) {
          NextBB = OneByteMemcpyBB->splitAt(OneByteMemcpyBB->begin());
          NextBB->eraseInstruction(NextBB->begin());
        } else {
          NextBB = OneByteMemcpyBB->getSuccessor();
          OneByteMemcpyBB->eraseInstruction(OneByteMemcpyBB->begin());
          assert(NextBB && "unexpected call to memcpy() with no return");
        }

        // MemcpyBB holds the original (slow-path) call, taken when size != 1.
        // Compare the third integer argument (size) against 1 and branch to
        // the one-byte fast path on equality; fall through to MemcpyBB.
        BinaryBasicBlock *MemcpyBB = Function.addBasicBlock();
        MemcpyBB->setOffset(CurBB->getInputOffset());
        InstructionListType CmpJCC =
            BC.MIB->createCmpJE(BC.MIB->getIntArgRegister(2), 1,
                                OneByteMemcpyBB->getLabel(), BC.Ctx.get());
        CurBB->addInstructions(CmpJCC);
        CurBB->addSuccessor(MemcpyBB);

        MemcpyBB->addInstruction(std::move(MemcpyInstr));
        MemcpyBB->addSuccessor(NextBB);
        MemcpyBB->setCFIState(NextBB->getCFIState());
        MemcpyBB->setExecutionCount(0);

        // To prevent the actual call from being moved to cold, we set its
        // execution count to 1.
        if (CurBB->getKnownExecutionCount() > 0)
          MemcpyBB->setExecutionCount(1);

        // Emit the inline one-byte copy into the fast-path block.
        InstructionListType OneByteMemcpy = BC.MIB->createOneByteMemcpy();
        OneByteMemcpyBB->addInstructions(OneByteMemcpy);

        ++NumSpecialized;
        NumSpecializedDyno += CurBB->getKnownExecutionCount();

        // Resume scanning from the join block; the remainder of the old
        // CurBB has been carved into the new blocks above.
        CurBB = NextBB;

        // Note: we don't expect the next instruction to be a call to memcpy.
        II = CurBB->begin();
      }
    }
  }

  if (NumSpecialized) {
    outs() << "BOLT-INFO: specialized " << NumSpecialized
           << " memcpy() call sites for size 1";
    if (NumSpecializedDyno)
      outs() << ". The calls were executed " << NumSpecializedDyno
             << " times based on profile.";
    outs() << '\n';
  }
}
1805 
runOnFunction(BinaryFunction & BF)1806 void RemoveNops::runOnFunction(BinaryFunction &BF) {
1807   const BinaryContext &BC = BF.getBinaryContext();
1808   for (BinaryBasicBlock &BB : BF) {
1809     for (int64_t I = BB.size() - 1; I >= 0; --I) {
1810       MCInst &Inst = BB.getInstructionAtIndex(I);
1811       if (BC.MIB->isNoop(Inst) && BC.MIB->hasAnnotation(Inst, "NOP"))
1812         BB.eraseInstructionAtIndex(I);
1813     }
1814   }
1815 }
1816 
runOnFunctions(BinaryContext & BC)1817 void RemoveNops::runOnFunctions(BinaryContext &BC) {
1818   ParallelUtilities::WorkFuncTy WorkFun = [&](BinaryFunction &BF) {
1819     runOnFunction(BF);
1820   };
1821 
1822   ParallelUtilities::PredicateTy SkipFunc = [&](const BinaryFunction &BF) {
1823     return BF.shouldPreserveNops();
1824   };
1825 
1826   ParallelUtilities::runOnEachFunction(
1827       BC, ParallelUtilities::SchedulingPolicy::SP_INST_LINEAR, WorkFun,
1828       SkipFunc, "RemoveNops");
1829 }
1830 
1831 } // namespace bolt
1832 } // namespace llvm
1833