1 //===- bolt/Passes/BinaryPasses.cpp - Binary-level passes -----------------===//
2 //
3 // Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
4 // See https://llvm.org/LICENSE.txt for license information.
5 // SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
6 //
7 //===----------------------------------------------------------------------===//
8 //
9 // This file implements multiple passes for binary optimization and analysis.
10 //
11 //===----------------------------------------------------------------------===//
12
#include "bolt/Passes/BinaryPasses.h"
#include "bolt/Core/ParallelUtilities.h"
#include "bolt/Passes/ReorderAlgorithm.h"
#include "bolt/Passes/ReorderFunctions.h"
#include "llvm/Support/CommandLine.h"

#include <map>
#include <numeric>
#include <vector>
21
22 #define DEBUG_TYPE "bolt-opts"
23
24 using namespace llvm;
25 using namespace bolt;
26
27 namespace {
28
dynoStatsOptName(const bolt::DynoStats::Category C)29 const char *dynoStatsOptName(const bolt::DynoStats::Category C) {
30 if (C == bolt::DynoStats::FIRST_DYNO_STAT)
31 return "none";
32 else if (C == bolt::DynoStats::LAST_DYNO_STAT)
33 return "all";
34
35 static std::string OptNames[bolt::DynoStats::LAST_DYNO_STAT + 1];
36
37 OptNames[C] = bolt::DynoStats::Description(C);
38
39 std::replace(OptNames[C].begin(), OptNames[C].end(), ' ', '-');
40
41 return OptNames[C].c_str();
42 }
43
dynoStatsOptDesc(const bolt::DynoStats::Category C)44 const char *dynoStatsOptDesc(const bolt::DynoStats::Category C) {
45 if (C == bolt::DynoStats::FIRST_DYNO_STAT)
46 return "unsorted";
47 else if (C == bolt::DynoStats::LAST_DYNO_STAT)
48 return "sorted by all stats";
49
50 return bolt::DynoStats::Description(C);
51 }
52
53 }
54
55 namespace opts {
56
57 extern cl::OptionCategory BoltCategory;
58 extern cl::OptionCategory BoltOptCategory;
59
60 extern cl::opt<bolt::MacroFusionType> AlignMacroOpFusion;
61 extern cl::opt<unsigned> Verbosity;
62 extern cl::opt<bool> EnableBAT;
63 extern cl::opt<unsigned> ExecutionCountThreshold;
64 extern cl::opt<bool> UpdateDebugSections;
65 extern cl::opt<bolt::ReorderFunctions::ReorderType> ReorderFunctions;
66
// Sort direction used when printing functions ordered by dyno stats.
enum DynoStatsSortOrder : char {
  Ascending,
  Descending
};

// Controls the direction used by the -print-sorted-by report.
static cl::opt<DynoStatsSortOrder> DynoStatsSortOrderOpt(
    "print-sorted-by-order",
    cl::desc("use ascending or descending order when printing functions "
             "ordered by dyno stats"),
    cl::init(DynoStatsSortOrder::Descending), cl::cat(BoltOptCategory));

// Sections whose functions act as "movers" for hot text; consulted by
// isHotTextMover() below.
cl::list<std::string>
HotTextMoveSections("hot-text-move-sections",
  cl::desc("list of sections containing functions used for hugifying hot text. "
           "BOLT makes sure these functions are not placed on the same page as "
           "the hot text. (default=\'.stub,.mover\')."),
  cl::value_desc("sec1,sec2,sec3,..."),
  cl::CommaSeparated,
  cl::ZeroOrMore,
  cl::cat(BoltCategory));
87
isHotTextMover(const BinaryFunction & Function)88 bool isHotTextMover(const BinaryFunction &Function) {
89 for (std::string &SectionName : opts::HotTextMoveSections) {
90 if (Function.getOriginSectionName() &&
91 *Function.getOriginSectionName() == SectionName)
92 return true;
93 }
94
95 return false;
96 }
97
// Prefer the clustering variant geared towards minimizing branch count;
// consumed by ReorderBasicBlocks::modifyFunctionLayout().
static cl::opt<bool> MinBranchClusters(
    "min-branch-clusters",
    cl::desc("use a modified clustering algorithm geared towards minimizing "
             "branches"),
    cl::Hidden, cl::cat(BoltOptCategory));

// Set of peephole optimizations to enable (comma-separated).
static cl::list<Peepholes::PeepholeOpts> Peepholes(
    "peepholes", cl::CommaSeparated, cl::desc("enable peephole optimizations"),
    cl::value_desc("opt1,opt2,opt3,..."),
    cl::values(clEnumValN(Peepholes::PEEP_NONE, "none", "disable peepholes"),
               clEnumValN(Peepholes::PEEP_DOUBLE_JUMPS, "double-jumps",
                          "remove double jumps when able"),
               clEnumValN(Peepholes::PEEP_TAILCALL_TRAPS, "tailcall-traps",
                          "insert tail call traps"),
               clEnumValN(Peepholes::PEEP_USELESS_BRANCHES, "useless-branches",
                          "remove useless conditional branches"),
               clEnumValN(Peepholes::PEEP_ALL, "all",
                          "enable all peephole optimizations")),
    cl::ZeroOrMore, cl::cat(BoltOptCategory));

// Number of top-scoring functions to report in detail after basic-block
// reordering (0 disables the report); see ReorderBasicBlocks::runOnFunctions.
static cl::opt<unsigned>
PrintFuncStat("print-function-statistics",
  cl::desc("print statistics about basic block ordering"),
  cl::init(0), cl::cat(BoltOptCategory));

// Dyno-stats categories to sort functions by when printing. The D() macro
// expansion of DYNO_STATS generates one enum option per category, using the
// helpers dynoStatsOptName()/dynoStatsOptDesc() above.
static cl::list<bolt::DynoStats::Category>
PrintSortedBy("print-sorted-by",
  cl::CommaSeparated,
  cl::desc("print functions sorted by order of dyno stats"),
  cl::value_desc("key1,key2,key3,..."),
  cl::values(
#define D(name, ...)                                                           \
  clEnumValN(bolt::DynoStats::name,                                            \
             dynoStatsOptName(bolt::DynoStats::name),                          \
             dynoStatsOptDesc(bolt::DynoStats::name)),
  DYNO_STATS
#undef D
    clEnumValN(0xffff, ".", ".")
  ),
  cl::ZeroOrMore,
  cl::cat(BoltOptCategory));

// Report functions whose control flow could not be fully reconstructed.
static cl::opt<bool>
PrintUnknown("print-unknown",
  cl::desc("print names of functions with unknown control flow"),
  cl::cat(BoltCategory), cl::Hidden);

// Additionally dump the CFG of such functions (developer aid).
static cl::opt<bool>
PrintUnknownCFG("print-unknown-cfg",
  cl::desc("dump CFG of functions with unknown control flow"),
  cl::cat(BoltCategory), cl::ReallyHidden);
149
// Please MSVC19 with a forward declaration: otherwise it reports an error about
// an undeclared variable inside a callback.
extern cl::opt<bolt::ReorderBasicBlocks::LayoutType> ReorderBlocks;
// Selects the basic-block layout algorithm used by ReorderBasicBlocks.
cl::opt<bolt::ReorderBasicBlocks::LayoutType> ReorderBlocks(
    "reorder-blocks", cl::desc("change layout of basic blocks in a function"),
    cl::init(bolt::ReorderBasicBlocks::LT_NONE),
    cl::values(
        clEnumValN(bolt::ReorderBasicBlocks::LT_NONE, "none",
                   "do not reorder basic blocks"),
        clEnumValN(bolt::ReorderBasicBlocks::LT_REVERSE, "reverse",
                   "layout blocks in reverse order"),
        clEnumValN(bolt::ReorderBasicBlocks::LT_OPTIMIZE, "normal",
                   "perform optimal layout based on profile"),
        clEnumValN(bolt::ReorderBasicBlocks::LT_OPTIMIZE_BRANCH,
                   "branch-predictor",
                   "perform optimal layout prioritizing branch "
                   "predictions"),
        clEnumValN(bolt::ReorderBasicBlocks::LT_OPTIMIZE_CACHE, "cache",
                   "perform optimal layout prioritizing I-cache "
                   "behavior"),
        clEnumValN(bolt::ReorderBasicBlocks::LT_OPTIMIZE_CACHE_PLUS, "cache+",
                   "perform layout optimizing I-cache behavior"),
        clEnumValN(bolt::ReorderBasicBlocks::LT_OPTIMIZE_EXT_TSP, "ext-tsp",
                   "perform layout optimizing I-cache behavior"),
        clEnumValN(bolt::ReorderBasicBlocks::LT_OPTIMIZE_SHUFFLE,
                   "cluster-shuffle", "perform random layout of clusters")),
    cl::ZeroOrMore, cl::cat(BoltOptCategory),
    // "cache+" is deprecated: the callback rewrites it to ext-tsp and warns,
    // so LT_OPTIMIZE_CACHE_PLUS never reaches the layout code.
    cl::callback([](const bolt::ReorderBasicBlocks::LayoutType &option) {
      if (option == bolt::ReorderBasicBlocks::LT_OPTIMIZE_CACHE_PLUS) {
        WithColor::warning()
            << "'-reorder-blocks=cache+' is deprecated, "
            << "please use '-reorder-blocks=ext-tsp' instead\n";
        ReorderBlocks = bolt::ReorderBasicBlocks::LT_OPTIMIZE_EXT_TSP;
      }
    }));

// Print the top <uint> functions whose input layout looks suboptimal.
static cl::opt<unsigned> ReportBadLayout(
    "report-bad-layout",
    cl::desc("print top <uint> functions with suboptimal code layout on input"),
    cl::init(0), cl::Hidden, cl::cat(BoltOptCategory));

// List functions whose profile no longer matches the binary.
static cl::opt<bool>
ReportStaleFuncs("report-stale",
  cl::desc("print the list of functions with stale profile"),
  cl::Hidden, cl::cat(BoltOptCategory));
195
// Strategies for deciding when SimplifyConditionalTailCalls may rewrite a
// conditional branch into a conditional tail call.
enum SctcModes : char {
  SctcAlways,            // rewrite whenever structurally possible
  SctcPreserveDirection, // only when the branch direction is preserved
  SctcHeuristic          // decide based on branch profile data
};

// Mode selector consumed by SimplifyConditionalTailCalls::shouldRewriteBranch.
static cl::opt<SctcModes>
SctcMode("sctc-mode",
  cl::desc("mode for simplify conditional tail calls"),
  cl::init(SctcAlways),
  cl::values(clEnumValN(SctcAlways, "always", "always perform sctc"),
             clEnumValN(SctcPreserveDirection,
                        "preserve",
                        "only perform sctc when branch direction is "
                        "preserved"),
             clEnumValN(SctcHeuristic,
                        "heuristic",
                        "use branch prediction data to control sctc")),
  cl::ZeroOrMore,
  cl::cat(BoltOptCategory));

// Maximum tolerated percentage of functions with stale profile.
static cl::opt<unsigned>
StaleThreshold("stale-threshold",
  cl::desc(
    "maximum percentage of stale functions to tolerate (default: 100)"),
  cl::init(100),
  cl::Hidden,
  cl::cat(BoltOptCategory));

// Functions with at most this many hot blocks get the exact TSP layout
// solver; larger functions fall back to heuristics (see
// ReorderBasicBlocks::modifyFunctionLayout).
static cl::opt<unsigned> TSPThreshold(
    "tsp-threshold",
    cl::desc(
        "maximum number of hot basic blocks in a function for which to use "
        "a precise TSP solution while re-ordering basic blocks"),
    cl::init(10), cl::Hidden, cl::cat(BoltOptCategory));

// Cap on the number of entries in the "top called functions" report.
static cl::opt<unsigned> TopCalledLimit(
    "top-called-limit",
    cl::desc("maximum number of functions to print in top called "
             "functions section"),
    cl::init(100), cl::Hidden, cl::cat(BoltCategory));
237
238 } // namespace opts
239
240 namespace llvm {
241 namespace bolt {
242
shouldOptimize(const BinaryFunction & BF) const243 bool BinaryFunctionPass::shouldOptimize(const BinaryFunction &BF) const {
244 return BF.isSimple() && BF.getState() == BinaryFunction::State::CFG &&
245 !BF.isIgnored();
246 }
247
shouldPrint(const BinaryFunction & BF) const248 bool BinaryFunctionPass::shouldPrint(const BinaryFunction &BF) const {
249 return BF.isSimple() && !BF.isIgnored();
250 }
251
runOnFunction(BinaryFunction & BF)252 void NormalizeCFG::runOnFunction(BinaryFunction &BF) {
253 uint64_t NumRemoved = 0;
254 uint64_t NumDuplicateEdges = 0;
255 uint64_t NeedsFixBranches = 0;
256 for (BinaryBasicBlock &BB : BF) {
257 if (!BB.empty())
258 continue;
259
260 if (BB.isEntryPoint() || BB.isLandingPad())
261 continue;
262
263 // Handle a dangling empty block.
264 if (BB.succ_size() == 0) {
265 // If an empty dangling basic block has a predecessor, it could be a
266 // result of codegen for __builtin_unreachable. In such case, do not
267 // remove the block.
268 if (BB.pred_size() == 0) {
269 BB.markValid(false);
270 ++NumRemoved;
271 }
272 continue;
273 }
274
275 // The block should have just one successor.
276 BinaryBasicBlock *Successor = BB.getSuccessor();
277 assert(Successor && "invalid CFG encountered");
278
279 // Redirect all predecessors to the successor block.
280 while (!BB.pred_empty()) {
281 BinaryBasicBlock *Predecessor = *BB.pred_begin();
282 if (Predecessor->hasJumpTable())
283 break;
284
285 if (Predecessor == Successor)
286 break;
287
288 BinaryBasicBlock::BinaryBranchInfo &BI = Predecessor->getBranchInfo(BB);
289 Predecessor->replaceSuccessor(&BB, Successor, BI.Count,
290 BI.MispredictedCount);
291 // We need to fix branches even if we failed to replace all successors
292 // and remove the block.
293 NeedsFixBranches = true;
294 }
295
296 if (BB.pred_empty()) {
297 BB.removeAllSuccessors();
298 BB.markValid(false);
299 ++NumRemoved;
300 }
301 }
302
303 if (NumRemoved)
304 BF.eraseInvalidBBs();
305
306 // Check for duplicate successors. Do it after the empty block elimination as
307 // we can get more duplicate successors.
308 for (BinaryBasicBlock &BB : BF)
309 if (!BB.hasJumpTable() && BB.succ_size() == 2 &&
310 BB.getConditionalSuccessor(false) == BB.getConditionalSuccessor(true))
311 ++NumDuplicateEdges;
312
313 // fixBranches() will get rid of duplicate edges and update jump instructions.
314 if (NumDuplicateEdges || NeedsFixBranches)
315 BF.fixBranches();
316
317 NumDuplicateEdgesMerged += NumDuplicateEdges;
318 NumBlocksRemoved += NumRemoved;
319 }
320
runOnFunctions(BinaryContext & BC)321 void NormalizeCFG::runOnFunctions(BinaryContext &BC) {
322 ParallelUtilities::runOnEachFunction(
323 BC, ParallelUtilities::SchedulingPolicy::SP_BB_LINEAR,
324 [&](BinaryFunction &BF) { runOnFunction(BF); },
325 [&](const BinaryFunction &BF) { return !shouldOptimize(BF); },
326 "NormalizeCFG");
327 if (NumBlocksRemoved)
328 outs() << "BOLT-INFO: removed " << NumBlocksRemoved << " empty block"
329 << (NumBlocksRemoved == 1 ? "" : "s") << '\n';
330 if (NumDuplicateEdgesMerged)
331 outs() << "BOLT-INFO: merged " << NumDuplicateEdgesMerged
332 << " duplicate CFG edge" << (NumDuplicateEdgesMerged == 1 ? "" : "s")
333 << '\n';
334 }
335
/// Remove blocks of \p Function that are unreachable from its entry points,
/// accumulating the number of deleted blocks and bytes in pass counters.
void EliminateUnreachableBlocks::runOnFunction(BinaryFunction &Function) {
  if (!Function.getLayout().block_empty()) {
    unsigned Count;
    uint64_t Bytes;
    // Mark unreachable blocks invalid, then erase them below.
    Function.markUnreachableBlocks();
    LLVM_DEBUG({
      for (BinaryBasicBlock &BB : Function) {
        if (!BB.isValid()) {
          dbgs() << "BOLT-INFO: UCE found unreachable block " << BB.getName()
                 << " in function " << Function << "\n";
          Function.dump();
        }
      }
    });
    std::tie(Count, Bytes) = Function.eraseInvalidBBs();
    DeletedBlocks += Count;
    DeletedBytes += Bytes;
    if (Count) {
      // Remember modified functions so later queries (via Modified) can
      // tell which functions lost blocks.
      Modified.insert(&Function);
      if (opts::Verbosity > 0)
        outs() << "BOLT-INFO: Removed " << Count
               << " dead basic block(s) accounting for " << Bytes
               << " bytes in function " << Function << '\n';
    }
  }
}
362
runOnFunctions(BinaryContext & BC)363 void EliminateUnreachableBlocks::runOnFunctions(BinaryContext &BC) {
364 for (auto &It : BC.getBinaryFunctions()) {
365 BinaryFunction &Function = It.second;
366 if (shouldOptimize(Function))
367 runOnFunction(Function);
368 }
369
370 outs() << "BOLT-INFO: UCE removed " << DeletedBlocks << " blocks and "
371 << DeletedBytes << " bytes of code.\n";
372 }
373
shouldPrint(const BinaryFunction & BF) const374 bool ReorderBasicBlocks::shouldPrint(const BinaryFunction &BF) const {
375 return (BinaryFunctionPass::shouldPrint(BF) &&
376 opts::ReorderBlocks != ReorderBasicBlocks::LT_NONE);
377 }
378
shouldOptimize(const BinaryFunction & BF) const379 bool ReorderBasicBlocks::shouldOptimize(const BinaryFunction &BF) const {
380 // Apply execution count threshold
381 if (BF.getKnownExecutionCount() < opts::ExecutionCountThreshold)
382 return false;
383
384 return BinaryFunctionPass::shouldOptimize(BF);
385 }
386
runOnFunctions(BinaryContext & BC)387 void ReorderBasicBlocks::runOnFunctions(BinaryContext &BC) {
388 if (opts::ReorderBlocks == ReorderBasicBlocks::LT_NONE)
389 return;
390
391 std::atomic<uint64_t> ModifiedFuncCount{0};
392
393 ParallelUtilities::WorkFuncTy WorkFun = [&](BinaryFunction &BF) {
394 modifyFunctionLayout(BF, opts::ReorderBlocks, opts::MinBranchClusters);
395 if (BF.getLayout().hasLayoutChanged())
396 ++ModifiedFuncCount;
397 };
398
399 ParallelUtilities::PredicateTy SkipFunc = [&](const BinaryFunction &BF) {
400 return !shouldOptimize(BF);
401 };
402
403 ParallelUtilities::runOnEachFunction(
404 BC, ParallelUtilities::SchedulingPolicy::SP_BB_LINEAR, WorkFun, SkipFunc,
405 "ReorderBasicBlocks");
406
407 outs() << "BOLT-INFO: basic block reordering modified layout of "
408 << format("%zu (%.2lf%%) functions\n", ModifiedFuncCount.load(),
409 100.0 * ModifiedFuncCount.load() /
410 BC.getBinaryFunctions().size());
411
412 if (opts::PrintFuncStat > 0) {
413 raw_ostream &OS = outs();
414 // Copy all the values into vector in order to sort them
415 std::map<uint64_t, BinaryFunction &> ScoreMap;
416 auto &BFs = BC.getBinaryFunctions();
417 for (auto It = BFs.begin(); It != BFs.end(); ++It)
418 ScoreMap.insert(std::pair<uint64_t, BinaryFunction &>(
419 It->second.getFunctionScore(), It->second));
420
421 OS << "\nBOLT-INFO: Printing Function Statistics:\n\n";
422 OS << " There are " << BFs.size() << " functions in total. \n";
423 OS << " Number of functions being modified: "
424 << ModifiedFuncCount.load() << "\n";
425 OS << " User asks for detailed information on top "
426 << opts::PrintFuncStat << " functions. (Ranked by function score)"
427 << "\n\n";
428 uint64_t I = 0;
429 for (std::map<uint64_t, BinaryFunction &>::reverse_iterator Rit =
430 ScoreMap.rbegin();
431 Rit != ScoreMap.rend() && I < opts::PrintFuncStat; ++Rit, ++I) {
432 BinaryFunction &Function = Rit->second;
433
434 OS << " Information for function of top: " << (I + 1) << ": \n";
435 OS << " Function Score is: " << Function.getFunctionScore()
436 << "\n";
437 OS << " There are " << Function.size()
438 << " number of blocks in this function.\n";
439 OS << " There are " << Function.getInstructionCount()
440 << " number of instructions in this function.\n";
441 OS << " The edit distance for this function is: "
442 << Function.getLayout().getEditDistance() << "\n\n";
443 }
444 }
445 }
446
/// Compute a new basic-block order for \p BF according to \p Type and install
/// it via BF.getLayout().update(). \p MinBranchClusters selects the greedy
/// clustering variant used by the heuristic algorithms.
void ReorderBasicBlocks::modifyFunctionLayout(BinaryFunction &BF,
                                              LayoutType Type,
                                              bool MinBranchClusters) const {
  // Nothing to do for empty functions or when reordering is disabled.
  if (BF.size() == 0 || Type == LT_NONE)
    return;

  BinaryFunction::BasicBlockOrderType NewLayout;
  std::unique_ptr<ReorderAlgorithm> Algo;

  // Cannot do optimal layout without profile.
  if (Type != LT_REVERSE && !BF.hasValidProfile())
    return;

  if (Type == LT_REVERSE) {
    Algo.reset(new ReverseReorderAlgorithm());
  } else if (BF.size() <= opts::TSPThreshold && Type != LT_OPTIMIZE_SHUFFLE) {
    // Work on optimal solution if problem is small enough
    LLVM_DEBUG(dbgs() << "finding optimal block layout for " << BF << "\n");
    Algo.reset(new TSPReorderAlgorithm());
  } else {
    LLVM_DEBUG(dbgs() << "running block layout heuristics on " << BF << "\n");

    // Heuristic algorithms are parameterized by a clustering strategy;
    // choose it first.
    std::unique_ptr<ClusterAlgorithm> CAlgo;
    if (MinBranchClusters)
      CAlgo.reset(new MinBranchGreedyClusterAlgorithm());
    else
      CAlgo.reset(new PHGreedyClusterAlgorithm());

    switch (Type) {
    case LT_OPTIMIZE:
      Algo.reset(new OptimizeReorderAlgorithm(std::move(CAlgo)));
      break;

    case LT_OPTIMIZE_BRANCH:
      Algo.reset(new OptimizeBranchReorderAlgorithm(std::move(CAlgo)));
      break;

    case LT_OPTIMIZE_CACHE:
      Algo.reset(new OptimizeCacheReorderAlgorithm(std::move(CAlgo)));
      break;

    case LT_OPTIMIZE_EXT_TSP:
      Algo.reset(new ExtTSPReorderAlgorithm());
      break;

    case LT_OPTIMIZE_SHUFFLE:
      Algo.reset(new RandomClusterReorderAlgorithm(std::move(CAlgo)));
      break;

    default:
      // LT_OPTIMIZE_CACHE_PLUS is rewritten to LT_OPTIMIZE_EXT_TSP by the
      // -reorder-blocks option callback, so it should never reach here.
      llvm_unreachable("unexpected layout type");
    }
  }

  Algo->reorderBasicBlocks(BF, NewLayout);

  BF.getLayout().update(NewLayout);
}
505
runOnFunctions(BinaryContext & BC)506 void FixupBranches::runOnFunctions(BinaryContext &BC) {
507 for (auto &It : BC.getBinaryFunctions()) {
508 BinaryFunction &Function = It.second;
509 if (!BC.shouldEmit(Function) || !Function.isSimple())
510 continue;
511
512 Function.fixBranches();
513 }
514 }
515
/// Finalize every function scheduled for emission: settle its CFI state,
/// mark it finalized, and update its exception-handling ranges. In
/// relocation mode a CFI failure is fatal; otherwise the function is
/// demoted to non-simple and left as-is.
void FinalizeFunctions::runOnFunctions(BinaryContext &BC) {
  ParallelUtilities::WorkFuncTy WorkFun = [&](BinaryFunction &BF) {
    if (!BF.finalizeCFIState()) {
      if (BC.HasRelocations) {
        // In relocation mode every emitted function must have correct CFI;
        // bail out hard rather than emit a broken binary.
        errs() << "BOLT-ERROR: unable to fix CFI state for function " << BF
               << ". Exiting.\n";
        exit(1);
      }
      BF.setSimple(false);
      return;
    }

    BF.setFinalized();

    // Update exception handling information.
    BF.updateEHRanges();
  };

  ParallelUtilities::PredicateTy SkipPredicate = [&](const BinaryFunction &BF) {
    return !BC.shouldEmit(BF);
  };

  ParallelUtilities::runOnEachFunction(
      BC, ParallelUtilities::SchedulingPolicy::SP_CONSTANT, WorkFun,
      SkipPredicate, "FinalizeFunctions");
}
542
/// In non-relocation mode with debug-info updates enabled, pre-compute each
/// function's emitted size and demote functions that would not fit back into
/// their original slot, so that no incorrect debug info is produced.
void CheckLargeFunctions::runOnFunctions(BinaryContext &BC) {
  // In relocation mode functions can be moved/resized freely.
  if (BC.HasRelocations)
    return;

  // Only needed when debug sections will be rewritten.
  if (!opts::UpdateDebugSections)
    return;

  // If the function wouldn't fit, mark it as non-simple. Otherwise, we may emit
  // incorrect debug info.
  ParallelUtilities::WorkFuncTy WorkFun = [&](BinaryFunction &BF) {
    uint64_t HotSize, ColdSize;
    std::tie(HotSize, ColdSize) =
        BC.calculateEmittedSize(BF, /*FixBranches=*/false);
    if (HotSize > BF.getMaxSize())
      BF.setSimple(false);
  };

  ParallelUtilities::PredicateTy SkipFunc = [&](const BinaryFunction &BF) {
    return !shouldOptimize(BF);
  };

  ParallelUtilities::runOnEachFunction(
      BC, ParallelUtilities::SchedulingPolicy::SP_INST_LINEAR, WorkFun,
      SkipFunc, "CheckLargeFunctions");
}
568
shouldOptimize(const BinaryFunction & BF) const569 bool CheckLargeFunctions::shouldOptimize(const BinaryFunction &BF) const {
570 // Unlike other passes, allow functions in non-CFG state.
571 return BF.isSimple() && !BF.isIgnored();
572 }
573
/// Lower MCInst annotations before emission: convert GnuArgsSize annotations
/// into CFI instructions, record offset annotations that must survive for
/// address translation, strip all remaining annotations, and finally
/// re-apply the preserved offsets.
void LowerAnnotations::runOnFunctions(BinaryContext &BC) {
  // Offset annotations to re-apply after stripping (instruction pointer +
  // recorded offset value).
  std::vector<std::pair<MCInst *, uint32_t>> PreservedOffsetAnnotations;

  for (auto &It : BC.getBinaryFunctions()) {
    BinaryFunction &BF = It.second;
    int64_t CurrentGnuArgsSize = 0;

    // Have we crossed hot/cold border for split functions?
    bool SeenCold = false;

    for (BinaryBasicBlock *BB : BF.getLayout().blocks()) {
      if (BB->isCold() && !SeenCold) {
        SeenCold = true;
        // Reset the tracked GNU_args_size at the hot/cold boundary so the
        // cold fragment starts from a known value of 0.
        CurrentGnuArgsSize = 0;
      }

      // First convert GnuArgsSize annotations into CFIs. This may change instr
      // pointers, so do it before recording ptrs for preserved annotations
      if (BF.usesGnuArgsSize()) {
        for (auto II = BB->begin(); II != BB->end(); ++II) {
          if (!BC.MIB->isInvoke(*II))
            continue;
          const int64_t NewGnuArgsSize = BC.MIB->getGnuArgsSize(*II);
          assert(NewGnuArgsSize >= 0 && "expected non-negative GNU_args_size");
          // Emit a CFI only when the value actually changes.
          if (NewGnuArgsSize != CurrentGnuArgsSize) {
            auto InsertII = BF.addCFIInstruction(
                BB, II,
                MCCFIInstruction::createGnuArgsSize(nullptr, NewGnuArgsSize));
            CurrentGnuArgsSize = NewGnuArgsSize;
            // Resume iteration after the freshly inserted CFI instruction.
            II = std::next(InsertII);
          }
        }
      }

      // Now record preserved annotations separately and then strip annotations.
      for (auto II = BB->begin(); II != BB->end(); ++II) {
        if (BF.requiresAddressTranslation() && BC.MIB->getOffset(*II))
          PreservedOffsetAnnotations.emplace_back(&(*II),
                                                  *BC.MIB->getOffset(*II));
        BC.MIB->stripAnnotations(*II);
      }
    }
  }
  // Injected functions never carry preserved annotations; just strip.
  for (BinaryFunction *BF : BC.getInjectedBinaryFunctions())
    for (BinaryBasicBlock &BB : *BF)
      for (MCInst &Instruction : BB)
        BC.MIB->stripAnnotations(Instruction);

  // Release all memory taken by annotations
  BC.MIB->freeAnnotations();

  // Reinsert preserved annotations we need during code emission.
  for (const std::pair<MCInst *, uint32_t> &Item : PreservedOffsetAnnotations)
    BC.MIB->setOffset(*Item.first, Item.second);
}
629
630 namespace {
631
632 // This peephole fixes jump instructions that jump to another basic
633 // block with a single jump instruction, e.g.
634 //
635 // B0: ...
636 // jmp B1 (or jcc B1)
637 //
638 // B1: jmp B2
639 //
640 // ->
641 //
642 // B0: ...
643 // jmp B2 (or jcc B2)
644 //
/// Remove "double jumps": retarget predecessors of a block whose only real
/// instruction is an unconditional jump (or direct tail call) so they jump
/// straight to the final destination. When \p MarkInvalid is set, blocks
/// that lose all predecessors are marked invalid for later erasure.
/// Returns the number of double jumps removed.
uint64_t fixDoubleJumps(BinaryFunction &Function, bool MarkInvalid) {
  uint64_t NumDoubleJumps = 0;

  MCContext *Ctx = Function.getBinaryContext().Ctx.get();
  MCPlusBuilder *MIB = Function.getBinaryContext().MIB.get();
  for (BinaryBasicBlock &BB : Function) {
    // Try to retarget Pred from BB to Succ (or, for tail calls, to SuccSym).
    // Returns true when the patch succeeded.
    auto checkAndPatch = [&](BinaryBasicBlock *Pred, BinaryBasicBlock *Succ,
                             const MCSymbol *SuccSym) {
      // Ignore infinite loop jumps or fallthrough tail jumps.
      if (Pred == Succ || Succ == &BB)
        return false;

      if (Succ) {
        const MCSymbol *TBB = nullptr;
        const MCSymbol *FBB = nullptr;
        MCInst *CondBranch = nullptr;
        MCInst *UncondBranch = nullptr;
        bool Res = Pred->analyzeBranch(TBB, FBB, CondBranch, UncondBranch);
        if (!Res) {
          LLVM_DEBUG(dbgs() << "analyzeBranch failed in peepholes in block:\n";
                     Pred->dump());
          return false;
        }
        Pred->replaceSuccessor(&BB, Succ);

        // We must patch up any existing branch instructions to match up
        // with the new successor.
        assert((CondBranch || (!CondBranch && Pred->succ_size() == 1)) &&
               "Predecessor block has inconsistent number of successors");
        if (CondBranch && MIB->getTargetSymbol(*CondBranch) == BB.getLabel()) {
          MIB->replaceBranchTarget(*CondBranch, Succ->getLabel(), Ctx);
        } else if (UncondBranch &&
                   MIB->getTargetSymbol(*UncondBranch) == BB.getLabel()) {
          MIB->replaceBranchTarget(*UncondBranch, Succ->getLabel(), Ctx);
        } else if (!UncondBranch) {
          // Pred used to fall through to BB; it now needs an explicit jump
          // unless Succ happens to be the next block in the layout.
          assert(Function.getLayout().getBasicBlockAfter(Pred, false) != Succ &&
                 "Don't add an explicit jump to a fallthrough block.");
          Pred->addBranchInstruction(Succ);
        }
      } else {
        // Succ will be null in the tail call case. In this case we
        // need to explicitly add a tail call instruction.
        MCInst *Branch = Pred->getLastNonPseudoInstr();
        if (Branch && MIB->isUnconditionalBranch(*Branch)) {
          assert(MIB->getTargetSymbol(*Branch) == BB.getLabel());
          Pred->removeSuccessor(&BB);
          Pred->eraseInstruction(Pred->findInstruction(Branch));
          Pred->addTailCallInstruction(SuccSym);
        } else {
          return false;
        }
      }

      ++NumDoubleJumps;
      LLVM_DEBUG(dbgs() << "Removed double jump in " << Function << " from "
                        << Pred->getName() << " -> " << BB.getName() << " to "
                        << Pred->getName() << " -> " << SuccSym->getName()
                        << (!Succ ? " (tail)\n" : "\n"));

      return true;
    };

    // Only consider blocks whose single real instruction is a jump.
    if (BB.getNumNonPseudos() != 1 || BB.isLandingPad())
      continue;

    MCInst *Inst = BB.getFirstNonPseudoInstr();
    const bool IsTailCall = MIB->isTailCall(*Inst);

    if (!MIB->isUnconditionalBranch(*Inst) && !IsTailCall)
      continue;

    // If we operate after SCTC make sure it's not a conditional tail call.
    if (IsTailCall && MIB->isConditionalBranch(*Inst))
      continue;

    const MCSymbol *SuccSym = MIB->getTargetSymbol(*Inst);
    BinaryBasicBlock *Succ = BB.getSuccessor();

    // Need a real destination: a CFG successor other than BB itself, or a
    // tail-call target symbol.
    if (((!Succ || &BB == Succ) && !IsTailCall) || (IsTailCall && !SuccSym))
      continue;

    // Snapshot predecessors: checkAndPatch() mutates BB's predecessor list.
    std::vector<BinaryBasicBlock *> Preds = {BB.pred_begin(), BB.pred_end()};

    for (BinaryBasicBlock *Pred : Preds) {
      if (Pred->isLandingPad())
        continue;

      if (Pred->getSuccessor() == &BB ||
          (Pred->getConditionalSuccessor(true) == &BB && !IsTailCall) ||
          Pred->getConditionalSuccessor(false) == &BB)
        if (checkAndPatch(Pred, Succ, SuccSym) && MarkInvalid)
          // BB stays valid only if something still reaches or anchors it.
          BB.markValid(BB.pred_size() != 0 || BB.isLandingPad() ||
                       BB.isEntryPoint());
    }
  }

  return NumDoubleJumps;
}
743 } // namespace
744
/// Decide whether the conditional branch in \p PredBB targeting \p BB may be
/// rewritten into a conditional tail call, per the -sctc-mode policy.
/// \p DirectionFlag is the direction (forward/backward) the rewritten branch
/// would have. Also updates the forward/backward branch counters.
bool SimplifyConditionalTailCalls::shouldRewriteBranch(
    const BinaryBasicBlock *PredBB, const MCInst &CondBranch,
    const BinaryBasicBlock *BB, const bool DirectionFlag) {
  // Each predecessor block is rewritten at most once per pass.
  if (BeenOptimized.count(PredBB))
    return false;

  const bool IsForward = BinaryFunction::isForwardBranch(PredBB, BB);

  if (IsForward)
    ++NumOrigForwardBranches;
  else
    ++NumOrigBackwardBranches;

  if (opts::SctcMode == opts::SctcAlways)
    return true;

  if (opts::SctcMode == opts::SctcPreserveDirection)
    return IsForward == DirectionFlag;

  // SctcHeuristic: consult the branch profile.
  const ErrorOr<std::pair<double, double>> Frequency =
      PredBB->getBranchStats(BB);

  // It's ok to rewrite the conditional branch if the new target will be
  // a backward branch.

  // If no data available for these branches, then it should be ok to
  // do the optimization since it will reduce code size.
  if (Frequency.getError())
    return true;

  // TODO: should this use misprediction frequency instead?
  // Taken-frequency >= 0.5 favors keeping a forward branch; <= 0.5 favors
  // backward (both hold at exactly 0.5).
  const bool Result = (IsForward && Frequency.get().first >= 0.5) ||
                      (!IsForward && Frequency.get().first <= 0.5);

  return Result == DirectionFlag;
}
781
fixTailCalls(BinaryFunction & BF)782 uint64_t SimplifyConditionalTailCalls::fixTailCalls(BinaryFunction &BF) {
783 // Need updated indices to correctly detect branch' direction.
784 BF.getLayout().updateLayoutIndices();
785 BF.markUnreachableBlocks();
786
787 MCPlusBuilder *MIB = BF.getBinaryContext().MIB.get();
788 MCContext *Ctx = BF.getBinaryContext().Ctx.get();
789 uint64_t NumLocalCTCCandidates = 0;
790 uint64_t NumLocalCTCs = 0;
791 uint64_t LocalCTCTakenCount = 0;
792 uint64_t LocalCTCExecCount = 0;
793 std::vector<std::pair<BinaryBasicBlock *, const BinaryBasicBlock *>>
794 NeedsUncondBranch;
795
796 // Will block be deleted by UCE?
797 auto isValid = [](const BinaryBasicBlock *BB) {
798 return (BB->pred_size() != 0 || BB->isLandingPad() || BB->isEntryPoint());
799 };
800
801 for (BinaryBasicBlock *BB : BF.getLayout().blocks()) {
802 // Locate BB with a single direct tail-call instruction.
803 if (BB->getNumNonPseudos() != 1)
804 continue;
805
806 MCInst *Instr = BB->getFirstNonPseudoInstr();
807 if (!MIB->isTailCall(*Instr) || MIB->isConditionalBranch(*Instr))
808 continue;
809
810 const MCSymbol *CalleeSymbol = MIB->getTargetSymbol(*Instr);
811 if (!CalleeSymbol)
812 continue;
813
814 // Detect direction of the possible conditional tail call.
815 const bool IsForwardCTC = BF.isForwardCall(CalleeSymbol);
816
817 // Iterate through all predecessors.
818 for (BinaryBasicBlock *PredBB : BB->predecessors()) {
819 BinaryBasicBlock *CondSucc = PredBB->getConditionalSuccessor(true);
820 if (!CondSucc)
821 continue;
822
823 ++NumLocalCTCCandidates;
824
825 const MCSymbol *TBB = nullptr;
826 const MCSymbol *FBB = nullptr;
827 MCInst *CondBranch = nullptr;
828 MCInst *UncondBranch = nullptr;
829 bool Result = PredBB->analyzeBranch(TBB, FBB, CondBranch, UncondBranch);
830
831 // analyzeBranch() can fail due to unusual branch instructions, e.g. jrcxz
832 if (!Result) {
833 LLVM_DEBUG(dbgs() << "analyzeBranch failed in SCTC in block:\n";
834 PredBB->dump());
835 continue;
836 }
837
838 assert(Result && "internal error analyzing conditional branch");
839 assert(CondBranch && "conditional branch expected");
840
841 // It's possible that PredBB is also a successor to BB that may have
842 // been processed by a previous iteration of the SCTC loop, in which
843 // case it may have been marked invalid. We should skip rewriting in
844 // this case.
845 if (!PredBB->isValid()) {
846 assert(PredBB->isSuccessor(BB) &&
847 "PredBB should be valid if it is not a successor to BB");
848 continue;
849 }
850
851 // We don't want to reverse direction of the branch in new order
852 // without further profile analysis.
853 const bool DirectionFlag = CondSucc == BB ? IsForwardCTC : !IsForwardCTC;
854 if (!shouldRewriteBranch(PredBB, *CondBranch, BB, DirectionFlag))
855 continue;
856
857 // Record this block so that we don't try to optimize it twice.
858 BeenOptimized.insert(PredBB);
859
860 uint64_t Count = 0;
861 if (CondSucc != BB) {
862 // Patch the new target address into the conditional branch.
863 MIB->reverseBranchCondition(*CondBranch, CalleeSymbol, Ctx);
864 // Since we reversed the condition on the branch we need to change
865 // the target for the unconditional branch or add a unconditional
866 // branch to the old target. This has to be done manually since
867 // fixupBranches is not called after SCTC.
868 NeedsUncondBranch.emplace_back(PredBB, CondSucc);
869 Count = PredBB->getFallthroughBranchInfo().Count;
870 } else {
871 // Change destination of the conditional branch.
872 MIB->replaceBranchTarget(*CondBranch, CalleeSymbol, Ctx);
873 Count = PredBB->getTakenBranchInfo().Count;
874 }
875 const uint64_t CTCTakenFreq =
876 Count == BinaryBasicBlock::COUNT_NO_PROFILE ? 0 : Count;
877
878 // Annotate it, so "isCall" returns true for this jcc
879 MIB->setConditionalTailCall(*CondBranch);
880 // Add info abount the conditional tail call frequency, otherwise this
881 // info will be lost when we delete the associated BranchInfo entry
882 auto &CTCAnnotation =
883 MIB->getOrCreateAnnotationAs<uint64_t>(*CondBranch, "CTCTakenCount");
884 CTCAnnotation = CTCTakenFreq;
885
886 // Remove the unused successor which may be eliminated later
887 // if there are no other users.
888 PredBB->removeSuccessor(BB);
889 // Update BB execution count
890 if (CTCTakenFreq && CTCTakenFreq <= BB->getKnownExecutionCount())
891 BB->setExecutionCount(BB->getExecutionCount() - CTCTakenFreq);
892 else if (CTCTakenFreq > BB->getKnownExecutionCount())
893 BB->setExecutionCount(0);
894
895 ++NumLocalCTCs;
896 LocalCTCTakenCount += CTCTakenFreq;
897 LocalCTCExecCount += PredBB->getKnownExecutionCount();
898 }
899
900 // Remove the block from CFG if all predecessors were removed.
901 BB->markValid(isValid(BB));
902 }
903
904 // Add unconditional branches at the end of BBs to new successors
905 // as long as the successor is not a fallthrough.
906 for (auto &Entry : NeedsUncondBranch) {
907 BinaryBasicBlock *PredBB = Entry.first;
908 const BinaryBasicBlock *CondSucc = Entry.second;
909
910 const MCSymbol *TBB = nullptr;
911 const MCSymbol *FBB = nullptr;
912 MCInst *CondBranch = nullptr;
913 MCInst *UncondBranch = nullptr;
914 PredBB->analyzeBranch(TBB, FBB, CondBranch, UncondBranch);
915
916 // Find the next valid block. Invalid blocks will be deleted
917 // so they shouldn't be considered fallthrough targets.
918 const BinaryBasicBlock *NextBlock =
919 BF.getLayout().getBasicBlockAfter(PredBB, false);
920 while (NextBlock && !isValid(NextBlock))
921 NextBlock = BF.getLayout().getBasicBlockAfter(NextBlock, false);
922
923 // Get the unconditional successor to this block.
924 const BinaryBasicBlock *PredSucc = PredBB->getSuccessor();
925 assert(PredSucc && "The other branch should be a tail call");
926
927 const bool HasFallthrough = (NextBlock && PredSucc == NextBlock);
928
929 if (UncondBranch) {
930 if (HasFallthrough)
931 PredBB->eraseInstruction(PredBB->findInstruction(UncondBranch));
932 else
933 MIB->replaceBranchTarget(*UncondBranch, CondSucc->getLabel(), Ctx);
934 } else if (!HasFallthrough) {
935 MCInst Branch;
936 MIB->createUncondBranch(Branch, CondSucc->getLabel(), Ctx);
937 PredBB->addInstruction(Branch);
938 }
939 }
940
941 if (NumLocalCTCs > 0) {
942 NumDoubleJumps += fixDoubleJumps(BF, true);
943 // Clean-up unreachable tail-call blocks.
944 const std::pair<unsigned, uint64_t> Stats = BF.eraseInvalidBBs();
945 DeletedBlocks += Stats.first;
946 DeletedBytes += Stats.second;
947
948 assert(BF.validateCFG());
949 }
950
951 LLVM_DEBUG(dbgs() << "BOLT: created " << NumLocalCTCs
952 << " conditional tail calls from a total of "
953 << NumLocalCTCCandidates << " candidates in function " << BF
954 << ". CTCs execution count for this function is "
955 << LocalCTCExecCount << " and CTC taken count is "
956 << LocalCTCTakenCount << "\n";);
957
958 NumTailCallsPatched += NumLocalCTCs;
959 NumCandidateTailCalls += NumLocalCTCCandidates;
960 CTCExecCount += LocalCTCExecCount;
961 CTCTakenCount += LocalCTCTakenCount;
962
963 return NumLocalCTCs > 0;
964 }
965
runOnFunctions(BinaryContext & BC)966 void SimplifyConditionalTailCalls::runOnFunctions(BinaryContext &BC) {
967 if (!BC.isX86())
968 return;
969
970 for (auto &It : BC.getBinaryFunctions()) {
971 BinaryFunction &Function = It.second;
972
973 if (!shouldOptimize(Function))
974 continue;
975
976 if (fixTailCalls(Function)) {
977 Modified.insert(&Function);
978 Function.setHasCanonicalCFG(false);
979 }
980 }
981
982 outs() << "BOLT-INFO: SCTC: patched " << NumTailCallsPatched
983 << " tail calls (" << NumOrigForwardBranches << " forward)"
984 << " tail calls (" << NumOrigBackwardBranches << " backward)"
985 << " from a total of " << NumCandidateTailCalls << " while removing "
986 << NumDoubleJumps << " double jumps"
987 << " and removing " << DeletedBlocks << " basic blocks"
988 << " totalling " << DeletedBytes
989 << " bytes of code. CTCs total execution count is " << CTCExecCount
990 << " and the number of times CTCs are taken is " << CTCTakenCount
991 << ".\n";
992 }
993
shortenInstructions(BinaryFunction & Function)994 uint64_t ShortenInstructions::shortenInstructions(BinaryFunction &Function) {
995 uint64_t Count = 0;
996 const BinaryContext &BC = Function.getBinaryContext();
997 for (BinaryBasicBlock &BB : Function) {
998 for (MCInst &Inst : BB) {
999 MCInst OriginalInst;
1000 if (opts::Verbosity > 2)
1001 OriginalInst = Inst;
1002
1003 if (!BC.MIB->shortenInstruction(Inst, *BC.STI))
1004 continue;
1005
1006 if (opts::Verbosity > 2) {
1007 BC.scopeLock();
1008 outs() << "BOLT-INFO: shortening:\nBOLT-INFO: ";
1009 BC.printInstruction(outs(), OriginalInst, 0, &Function);
1010 outs() << "BOLT-INFO: to:";
1011 BC.printInstruction(outs(), Inst, 0, &Function);
1012 }
1013
1014 ++Count;
1015 }
1016 }
1017
1018 return Count;
1019 }
1020
runOnFunctions(BinaryContext & BC)1021 void ShortenInstructions::runOnFunctions(BinaryContext &BC) {
1022 std::atomic<uint64_t> NumShortened{0};
1023 if (!BC.isX86())
1024 return;
1025
1026 ParallelUtilities::runOnEachFunction(
1027 BC, ParallelUtilities::SchedulingPolicy::SP_INST_LINEAR,
1028 [&](BinaryFunction &BF) { NumShortened += shortenInstructions(BF); },
1029 nullptr, "ShortenInstructions");
1030
1031 outs() << "BOLT-INFO: " << NumShortened << " instructions were shortened\n";
1032 }
1033
addTailcallTraps(BinaryFunction & Function)1034 void Peepholes::addTailcallTraps(BinaryFunction &Function) {
1035 MCPlusBuilder *MIB = Function.getBinaryContext().MIB.get();
1036 for (BinaryBasicBlock &BB : Function) {
1037 MCInst *Inst = BB.getLastNonPseudoInstr();
1038 if (Inst && MIB->isTailCall(*Inst) && MIB->isIndirectBranch(*Inst)) {
1039 MCInst Trap;
1040 if (MIB->createTrap(Trap)) {
1041 BB.addInstruction(Trap);
1042 ++TailCallTraps;
1043 }
1044 }
1045 }
1046 }
1047
removeUselessCondBranches(BinaryFunction & Function)1048 void Peepholes::removeUselessCondBranches(BinaryFunction &Function) {
1049 for (BinaryBasicBlock &BB : Function) {
1050 if (BB.succ_size() != 2)
1051 continue;
1052
1053 BinaryBasicBlock *CondBB = BB.getConditionalSuccessor(true);
1054 BinaryBasicBlock *UncondBB = BB.getConditionalSuccessor(false);
1055 if (CondBB != UncondBB)
1056 continue;
1057
1058 const MCSymbol *TBB = nullptr;
1059 const MCSymbol *FBB = nullptr;
1060 MCInst *CondBranch = nullptr;
1061 MCInst *UncondBranch = nullptr;
1062 bool Result = BB.analyzeBranch(TBB, FBB, CondBranch, UncondBranch);
1063
1064 // analyzeBranch() can fail due to unusual branch instructions,
1065 // e.g. jrcxz, or jump tables (indirect jump).
1066 if (!Result || !CondBranch)
1067 continue;
1068
1069 BB.removeDuplicateConditionalSuccessor(CondBranch);
1070 ++NumUselessCondBranches;
1071 }
1072 }
1073
runOnFunctions(BinaryContext & BC)1074 void Peepholes::runOnFunctions(BinaryContext &BC) {
1075 const char Opts =
1076 std::accumulate(opts::Peepholes.begin(), opts::Peepholes.end(), 0,
1077 [](const char A, const PeepholeOpts B) { return A | B; });
1078 if (Opts == PEEP_NONE)
1079 return;
1080
1081 for (auto &It : BC.getBinaryFunctions()) {
1082 BinaryFunction &Function = It.second;
1083 if (shouldOptimize(Function)) {
1084 if (Opts & PEEP_DOUBLE_JUMPS)
1085 NumDoubleJumps += fixDoubleJumps(Function, false);
1086 if (Opts & PEEP_TAILCALL_TRAPS)
1087 addTailcallTraps(Function);
1088 if (Opts & PEEP_USELESS_BRANCHES)
1089 removeUselessCondBranches(Function);
1090 assert(Function.validateCFG());
1091 }
1092 }
1093 outs() << "BOLT-INFO: Peephole: " << NumDoubleJumps
1094 << " double jumps patched.\n"
1095 << "BOLT-INFO: Peephole: " << TailCallTraps
1096 << " tail call traps inserted.\n"
1097 << "BOLT-INFO: Peephole: " << NumUselessCondBranches
1098 << " useless conditional branches removed.\n";
1099 }
1100
/// Replace loads from read-only data sections with the equivalent immediate
/// value whenever the load's target address can be computed statically.
/// Updates the pass-wide static/dynamic counters and \returns true if at
/// least one load in \p BF was simplified.
bool SimplifyRODataLoads::simplifyRODataLoads(BinaryFunction &BF) {
  BinaryContext &BC = BF.getBinaryContext();
  MCPlusBuilder *MIB = BC.MIB.get();

  // Per-function counters; folded into the pass totals at the end.
  uint64_t NumLocalLoadsSimplified = 0;
  uint64_t NumDynamicLocalLoadsSimplified = 0;
  uint64_t NumLocalLoadsFound = 0;
  uint64_t NumDynamicLocalLoadsFound = 0;

  for (BinaryBasicBlock *BB : BF.getLayout().blocks()) {
    for (MCInst &Inst : *BB) {
      unsigned Opcode = Inst.getOpcode();
      const MCInstrDesc &Desc = BC.MII->get(Opcode);

      // Skip instructions that do not load from memory.
      if (!Desc.mayLoad())
        continue;

      // Try to statically evaluate the target memory address;
      uint64_t TargetAddress;

      if (MIB->hasPCRelOperand(Inst)) {
        // Try to find the symbol that corresponds to the PC-relative operand.
        MCOperand *DispOpI = MIB->getMemOperandDisp(Inst);
        assert(DispOpI != Inst.end() && "expected PC-relative displacement");
        assert(DispOpI->isExpr() &&
               "found PC-relative with non-symbolic displacement");

        // Get displacement symbol.
        const MCSymbol *DisplSymbol;
        uint64_t DisplOffset;

        std::tie(DisplSymbol, DisplOffset) =
            MIB->getTargetSymbolInfo(DispOpI->getExpr());

        // No symbol means the displacement cannot be resolved statically.
        if (!DisplSymbol)
          continue;

        // Look up the symbol address in the global symbols map of the binary
        // context object.
        BinaryData *BD = BC.getBinaryDataByName(DisplSymbol->getName());
        if (!BD)
          continue;
        TargetAddress = BD->getAddress() + DisplOffset;
      } else if (!MIB->evaluateMemOperandTarget(Inst, TargetAddress)) {
        // Neither PC-relative nor statically evaluable: leave the load alone.
        continue;
      }

      // Get the contents of the section containing the target address of the
      // memory operand. We are only interested in read-only sections.
      ErrorOr<BinarySection &> DataSection =
          BC.getSectionForAddress(TargetAddress);
      if (!DataSection || !DataSection->isReadOnly())
        continue;

      // A relocation at the target means the bytes may change at link/load
      // time, so the current contents cannot be baked into an immediate.
      if (BC.getRelocationAt(TargetAddress) ||
          BC.getDynamicRelocationAt(TargetAddress))
        continue;

      uint32_t Offset = TargetAddress - DataSection->getAddress();
      StringRef ConstantData = DataSection->getContents();

      // Count the candidate; dynamic counters are weighted by block frequency.
      ++NumLocalLoadsFound;
      if (BB->hasProfile())
        NumDynamicLocalLoadsFound += BB->getExecutionCount();

      // Rewrite the memory operand as an immediate when the target supports
      // an equivalent encoding for this instruction.
      if (MIB->replaceMemOperandWithImm(Inst, ConstantData, Offset)) {
        ++NumLocalLoadsSimplified;
        if (BB->hasProfile())
          NumDynamicLocalLoadsSimplified += BB->getExecutionCount();
      }
    }
  }

  // Fold per-function results into the pass-wide statistics.
  NumLoadsFound += NumLocalLoadsFound;
  NumDynamicLoadsFound += NumDynamicLocalLoadsFound;
  NumLoadsSimplified += NumLocalLoadsSimplified;
  NumDynamicLoadsSimplified += NumDynamicLocalLoadsSimplified;

  return NumLocalLoadsSimplified > 0;
}
1182
runOnFunctions(BinaryContext & BC)1183 void SimplifyRODataLoads::runOnFunctions(BinaryContext &BC) {
1184 for (auto &It : BC.getBinaryFunctions()) {
1185 BinaryFunction &Function = It.second;
1186 if (shouldOptimize(Function) && simplifyRODataLoads(Function))
1187 Modified.insert(&Function);
1188 }
1189
1190 outs() << "BOLT-INFO: simplified " << NumLoadsSimplified << " out of "
1191 << NumLoadsFound << " loads from a statically computed address.\n"
1192 << "BOLT-INFO: dynamic loads simplified: " << NumDynamicLoadsSimplified
1193 << "\n"
1194 << "BOLT-INFO: dynamic loads found: " << NumDynamicLoadsFound << "\n";
1195 }
1196
runOnFunctions(BinaryContext & BC)1197 void AssignSections::runOnFunctions(BinaryContext &BC) {
1198 for (BinaryFunction *Function : BC.getInjectedBinaryFunctions()) {
1199 Function->setCodeSectionName(BC.getInjectedCodeSectionName());
1200 Function->setColdCodeSectionName(BC.getInjectedColdCodeSectionName());
1201 }
1202
1203 // In non-relocation mode functions have pre-assigned section names.
1204 if (!BC.HasRelocations)
1205 return;
1206
1207 const bool UseColdSection =
1208 BC.NumProfiledFuncs > 0 ||
1209 opts::ReorderFunctions == ReorderFunctions::RT_USER;
1210 for (auto &BFI : BC.getBinaryFunctions()) {
1211 BinaryFunction &Function = BFI.second;
1212 if (opts::isHotTextMover(Function)) {
1213 Function.setCodeSectionName(BC.getHotTextMoverSectionName());
1214 Function.setColdCodeSectionName(BC.getHotTextMoverSectionName());
1215 continue;
1216 }
1217
1218 if (!UseColdSection || Function.hasValidIndex())
1219 Function.setCodeSectionName(BC.getMainCodeSectionName());
1220 else
1221 Function.setCodeSectionName(BC.getColdCodeSectionName());
1222
1223 if (Function.isSplit())
1224 Function.setColdCodeSectionName(BC.getColdCodeSectionName());
1225 }
1226 }
1227
/// Compute and report a profile "bias" score: for each considered basic
/// block, the relative difference between total incoming and total outgoing
/// edge counts (flow conservation violation). Prints the mean and standard
/// deviation across all considered blocks, plus the worst-offending function.
void PrintProfileStats::runOnFunctions(BinaryContext &BC) {
  double FlowImbalanceMean = 0.0;
  size_t NumBlocksConsidered = 0;
  double WorstBias = 0.0;
  const BinaryFunction *WorstBiasFunc = nullptr;

  // For each function CFG, we fill an IncomingMap with the sum of the frequency
  // of incoming edges for each BB. Likewise for each OutgoingMap and the sum
  // of the frequency of outgoing edges.
  using FlowMapTy = std::unordered_map<const BinaryBasicBlock *, uint64_t>;
  std::unordered_map<const BinaryFunction *, FlowMapTy> TotalIncomingMaps;
  std::unordered_map<const BinaryFunction *, FlowMapTy> TotalOutgoingMaps;

  // Compute mean
  for (const auto &BFI : BC.getBinaryFunctions()) {
    const BinaryFunction &Function = BFI.second;
    if (Function.empty() || !Function.isSimple())
      continue;
    FlowMapTy &IncomingMap = TotalIncomingMaps[&Function];
    FlowMapTy &OutgoingMap = TotalOutgoingMaps[&Function];
    for (const BinaryBasicBlock &BB : Function) {
      uint64_t TotalOutgoing = 0ULL;
      auto SuccBIIter = BB.branch_info_begin();
      // Accumulate per-successor branch counts; edges with no profile or a
      // zero count contribute nothing to either map.
      for (BinaryBasicBlock *Succ : BB.successors()) {
        uint64_t Count = SuccBIIter->Count;
        if (Count == BinaryBasicBlock::COUNT_NO_PROFILE || Count == 0) {
          ++SuccBIIter;
          continue;
        }
        TotalOutgoing += Count;
        IncomingMap[Succ] += Count;
        ++SuccBIIter;
      }
      OutgoingMap[&BB] = TotalOutgoing;
    }

    size_t NumBlocks = 0;
    double Mean = 0.0;
    for (const BinaryBasicBlock &BB : Function) {
      // Do not compute score for low frequency blocks, entry or exit blocks
      if (IncomingMap[&BB] < 100 || OutgoingMap[&BB] == 0 || BB.isEntryPoint())
        continue;
      ++NumBlocks;
      const double Difference = (double)OutgoingMap[&BB] - IncomingMap[&BB];
      Mean += fabs(Difference / IncomingMap[&BB]);
    }

    FlowImbalanceMean += Mean;
    NumBlocksConsidered += NumBlocks;
    if (!NumBlocks)
      continue;
    double FuncMean = Mean / NumBlocks;
    if (FuncMean > WorstBias) {
      WorstBias = FuncMean;
      WorstBiasFunc = &Function;
    }
  }
  if (NumBlocksConsidered > 0)
    FlowImbalanceMean /= NumBlocksConsidered;

  // Compute standard deviation
  NumBlocksConsidered = 0;
  double FlowImbalanceVar = 0.0;
  for (const auto &BFI : BC.getBinaryFunctions()) {
    const BinaryFunction &Function = BFI.second;
    if (Function.empty() || !Function.isSimple())
      continue;
    FlowMapTy &IncomingMap = TotalIncomingMaps[&Function];
    FlowMapTy &OutgoingMap = TotalOutgoingMaps[&Function];
    for (const BinaryBasicBlock &BB : Function) {
      // NOTE(review): unlike the mean loop above, this filter does not
      // exclude entry points (no BB.isEntryPoint() term), so the two loops
      // may iterate over slightly different block populations — confirm
      // whether that is intentional.
      if (IncomingMap[&BB] < 100 || OutgoingMap[&BB] == 0)
        continue;
      ++NumBlocksConsidered;
      const double Difference = (double)OutgoingMap[&BB] - IncomingMap[&BB];
      FlowImbalanceVar +=
          pow(fabs(Difference / IncomingMap[&BB]) - FlowImbalanceMean, 2);
    }
  }
  if (NumBlocksConsidered) {
    FlowImbalanceVar /= NumBlocksConsidered;
    FlowImbalanceVar = sqrt(FlowImbalanceVar);
  }

  // Report to user
  outs() << format("BOLT-INFO: Profile bias score: %.4lf%% StDev: %.4lf%%\n",
                   (100.0 * FlowImbalanceMean), (100.0 * FlowImbalanceVar));
  if (WorstBiasFunc && opts::Verbosity >= 1) {
    outs() << "Worst average bias observed in " << WorstBiasFunc->getPrintName()
           << "\n";
    LLVM_DEBUG(WorstBiasFunc->dump());
  }
}
1320
/// Print a summary of profile coverage and quality for the whole binary:
/// counts of profiled/stale/non-simple functions, top called functions,
/// functions sorted by requested dyno stats, trapped functions, missed
/// macro-fusion opportunities, and suboptimal input layout. May terminate
/// the process if the stale-profile percentage exceeds -stale-threshold.
void PrintProgramStats::runOnFunctions(BinaryContext &BC) {
  uint64_t NumRegularFunctions = 0;
  uint64_t NumStaleProfileFunctions = 0;
  uint64_t NumNonSimpleProfiledFunctions = 0;
  uint64_t NumUnknownControlFlowFunctions = 0;
  uint64_t TotalSampleCount = 0;
  uint64_t StaleSampleCount = 0;
  std::vector<const BinaryFunction *> ProfiledFunctions;
  // Printed once before the first stale function, then cleared so the
  // header appears at most one time.
  const char *StaleFuncsHeader = "BOLT-INFO: Functions with stale profile:\n";
  for (auto &BFI : BC.getBinaryFunctions()) {
    const BinaryFunction &Function = BFI.second;

    // Ignore PLT functions for stats.
    if (Function.isPLTFunction())
      continue;

    ++NumRegularFunctions;

    if (!Function.isSimple()) {
      if (Function.hasProfile())
        ++NumNonSimpleProfiledFunctions;
      continue;
    }

    if (Function.hasUnknownControlFlow()) {
      if (opts::PrintUnknownCFG)
        Function.dump();
      else if (opts::PrintUnknown)
        errs() << "function with unknown control flow: " << Function << '\n';

      ++NumUnknownControlFlowFunctions;
    }

    if (!Function.hasProfile())
      continue;

    uint64_t SampleCount = Function.getRawBranchCount();
    TotalSampleCount += SampleCount;

    // A profile that no longer matches the function's CFG is "stale".
    if (Function.hasValidProfile()) {
      ProfiledFunctions.push_back(&Function);
    } else {
      if (opts::ReportStaleFuncs) {
        outs() << StaleFuncsHeader;
        StaleFuncsHeader = "";
        outs() << " " << Function << '\n';
      }
      ++NumStaleProfileFunctions;
      StaleSampleCount += SampleCount;
    }
  }
  BC.NumProfiledFuncs = ProfiledFunctions.size();

  // Coverage summary.
  // NOTE(review): if the binary had zero regular functions this would divide
  // by zero in floating point (prints inf/nan); presumably unreachable in
  // practice — confirm.
  const size_t NumAllProfiledFunctions =
      ProfiledFunctions.size() + NumStaleProfileFunctions;
  outs() << "BOLT-INFO: " << NumAllProfiledFunctions << " out of "
         << NumRegularFunctions << " functions in the binary ("
         << format("%.1f", NumAllProfiledFunctions /
                               (float)NumRegularFunctions * 100.0f)
         << "%) have non-empty execution profile\n";
  if (NumNonSimpleProfiledFunctions) {
    outs() << "BOLT-INFO: " << NumNonSimpleProfiledFunctions << " function"
           << (NumNonSimpleProfiledFunctions == 1 ? "" : "s")
           << " with profile could not be optimized\n";
  }
  if (NumStaleProfileFunctions) {
    const float PctStale =
        NumStaleProfileFunctions / (float)NumAllProfiledFunctions * 100.0f;
    // Severity depends on whether the stale percentage crosses the
    // user-specified threshold.
    auto printErrorOrWarning = [&]() {
      if (PctStale > opts::StaleThreshold)
        errs() << "BOLT-ERROR: ";
      else
        errs() << "BOLT-WARNING: ";
    };
    printErrorOrWarning();
    errs() << NumStaleProfileFunctions
           << format(" (%.1f%% of all profiled)", PctStale) << " function"
           << (NumStaleProfileFunctions == 1 ? "" : "s")
           << " have invalid (possibly stale) profile."
              " Use -report-stale to see the list.\n";
    if (TotalSampleCount > 0) {
      printErrorOrWarning();
      errs() << StaleSampleCount << " out of " << TotalSampleCount
             << " samples in the binary ("
             << format("%.1f", ((100.0f * StaleSampleCount) / TotalSampleCount))
             << "%) belong to functions with invalid"
                " (possibly stale) profile.\n";
    }
    // Too much of the profile is stale: abort rather than optimize with
    // unreliable data.
    if (PctStale > opts::StaleThreshold) {
      errs() << "BOLT-ERROR: stale functions exceed specified threshold of "
             << opts::StaleThreshold << "%. Exiting.\n";
      exit(1);
    }
  }

  if (const uint64_t NumUnusedObjects = BC.getNumUnusedProfiledObjects()) {
    outs() << "BOLT-INFO: profile for " << NumUnusedObjects
           << " objects was ignored\n";
  }

  // With enough profiled functions and -v=1, list the hottest ones by
  // execution count (descending).
  if (ProfiledFunctions.size() > 10) {
    if (opts::Verbosity >= 1) {
      outs() << "BOLT-INFO: top called functions are:\n";
      llvm::sort(ProfiledFunctions,
                 [](const BinaryFunction *A, const BinaryFunction *B) {
                   return B->getExecutionCount() < A->getExecutionCount();
                 });
      auto SFI = ProfiledFunctions.begin();
      auto SFIend = ProfiledFunctions.end();
      for (unsigned I = 0u; I < opts::TopCalledLimit && SFI != SFIend;
           ++SFI, ++I)
        outs() << " " << **SFI << " : " << (*SFI)->getExecutionCount() << '\n';
    }
  }

  // Optionally print functions sorted by the dyno-stat categories the user
  // requested via -print-sorted-by.
  if (!opts::PrintSortedBy.empty() &&
      !llvm::is_contained(opts::PrintSortedBy, DynoStats::FIRST_DYNO_STAT)) {

    std::vector<const BinaryFunction *> Functions;
    std::map<const BinaryFunction *, DynoStats> Stats;

    for (const auto &BFI : BC.getBinaryFunctions()) {
      const BinaryFunction &BF = BFI.second;
      if (shouldOptimize(BF) && BF.hasValidProfile()) {
        Functions.push_back(&BF);
        Stats.emplace(&BF, getDynoStats(BF));
      }
    }

    // LAST_DYNO_STAT in the list means "sort by all stats".
    const bool SortAll =
        llvm::is_contained(opts::PrintSortedBy, DynoStats::LAST_DYNO_STAT);

    const bool Ascending =
        opts::DynoStatsSortOrderOpt == opts::DynoStatsSortOrder::Ascending;

    if (SortAll) {
      llvm::stable_sort(Functions,
                        [Ascending, &Stats](const BinaryFunction *A,
                                            const BinaryFunction *B) {
                          return Ascending ? Stats.at(A) < Stats.at(B)
                                           : Stats.at(B) < Stats.at(A);
                        });
    } else {
      llvm::stable_sort(
          Functions, [Ascending, &Stats](const BinaryFunction *A,
                                         const BinaryFunction *B) {
            const DynoStats &StatsA = Stats.at(A);
            const DynoStats &StatsB = Stats.at(B);
            return Ascending ? StatsA.lessThan(StatsB, opts::PrintSortedBy)
                             : StatsB.lessThan(StatsA, opts::PrintSortedBy);
          });
    }

    outs() << "BOLT-INFO: top functions sorted by ";
    if (SortAll) {
      outs() << "dyno stats";
    } else {
      outs() << "(";
      bool PrintComma = false;
      for (const DynoStats::Category Category : opts::PrintSortedBy) {
        if (PrintComma)
          outs() << ", ";
        outs() << DynoStats::Description(Category);
        PrintComma = true;
      }
      outs() << ")";
    }

    // Print at most the first 100 functions in sorted order.
    outs() << " are:\n";
    auto SFI = Functions.begin();
    for (unsigned I = 0; I < 100 && SFI != Functions.end(); ++SFI, ++I) {
      const DynoStats Stats = getDynoStats(**SFI);
      outs() << " " << **SFI;
      if (!SortAll) {
        outs() << " (";
        bool PrintComma = false;
        for (const DynoStats::Category Category : opts::PrintSortedBy) {
          if (PrintComma)
            outs() << ", ";
          outs() << dynoStatsOptName(Category) << "=" << Stats[Category];
          PrintComma = true;
        }
        outs() << ")";
      }
      outs() << "\n";
    }
  }

  // Warn about functions that will trap on entry (e.g. -trap-avx512).
  if (!BC.TrappedFunctions.empty()) {
    errs() << "BOLT-WARNING: " << BC.TrappedFunctions.size() << " function"
           << (BC.TrappedFunctions.size() > 1 ? "s" : "")
           << " will trap on entry. Use -trap-avx512=0 to disable"
              " traps.";
    if (opts::Verbosity >= 1 || BC.TrappedFunctions.size() <= 5) {
      errs() << '\n';
      for (const BinaryFunction *Function : BC.TrappedFunctions)
        errs() << " " << *Function << '\n';
    } else {
      errs() << " Use -v=1 to see the list.\n";
    }
  }

  // Print information on missed macro-fusion opportunities seen on input.
  if (BC.MissedMacroFusionPairs) {
    outs() << "BOLT-INFO: the input contains " << BC.MissedMacroFusionPairs
           << " (dynamic count : " << BC.MissedMacroFusionExecCount
           << ") opportunities for macro-fusion optimization";
    switch (opts::AlignMacroOpFusion) {
    case MFT_NONE:
      outs() << ". Use -align-macro-fusion to fix.\n";
      break;
    case MFT_HOT:
      outs() << ". Will fix instances on a hot path.\n";
      break;
    case MFT_ALL:
      outs() << " that are going to be fixed\n";
      break;
    }
  }

  // Collect and print information about suboptimal code layout on input.
  if (opts::ReportBadLayout) {
    std::vector<const BinaryFunction *> SuboptimalFuncs;
    for (auto &BFI : BC.getBinaryFunctions()) {
      const BinaryFunction &BF = BFI.second;
      if (!BF.hasValidProfile())
        continue;

      const uint64_t HotThreshold =
          std::max<uint64_t>(BF.getKnownExecutionCount(), 1);
      bool HotSeen = false;
      // Scan the layout in reverse: a zero-count block following (in layout
      // order) a hot block indicates cold code in the middle of hot code.
      for (const BinaryBasicBlock *BB : BF.getLayout().rblocks()) {
        if (!HotSeen && BB->getKnownExecutionCount() > HotThreshold) {
          HotSeen = true;
          continue;
        }
        if (HotSeen && BB->getKnownExecutionCount() == 0) {
          SuboptimalFuncs.push_back(&BF);
          break;
        }
      }
    }

    if (!SuboptimalFuncs.empty()) {
      // Sort by execution density.
      // NOTE(review): this is integer division, so densities below 1 truncate
      // to 0, and a zero-size function would divide by zero — confirm
      // getSize() is always non-zero for functions with a valid profile.
      llvm::sort(SuboptimalFuncs,
                 [](const BinaryFunction *A, const BinaryFunction *B) {
                   return A->getKnownExecutionCount() / A->getSize() >
                          B->getKnownExecutionCount() / B->getSize();
                 });

      outs() << "BOLT-INFO: " << SuboptimalFuncs.size()
             << " functions have "
                "cold code in the middle of hot code. Top functions are:\n";
      for (unsigned I = 0;
           I < std::min(static_cast<size_t>(opts::ReportBadLayout),
                        SuboptimalFuncs.size());
           ++I)
        SuboptimalFuncs[I]->print(outs());
    }
  }

  if (NumUnknownControlFlowFunctions) {
    outs() << "BOLT-INFO: " << NumUnknownControlFlowFunctions
           << " functions have instructions with unknown control flow";
    if (!opts::PrintUnknown)
      outs() << ". Use -print-unknown to see the list.";
    outs() << '\n';
  }
}
1590
runOnFunctions(BinaryContext & BC)1591 void InstructionLowering::runOnFunctions(BinaryContext &BC) {
1592 for (auto &BFI : BC.getBinaryFunctions())
1593 for (BinaryBasicBlock &BB : BFI.second)
1594 for (MCInst &Instruction : BB)
1595 BC.MIB->lowerTailCall(Instruction);
1596 }
1597
runOnFunctions(BinaryContext & BC)1598 void StripRepRet::runOnFunctions(BinaryContext &BC) {
1599 if (!BC.isX86())
1600 return;
1601
1602 uint64_t NumPrefixesRemoved = 0;
1603 uint64_t NumBytesSaved = 0;
1604 for (auto &BFI : BC.getBinaryFunctions()) {
1605 for (BinaryBasicBlock &BB : BFI.second) {
1606 auto LastInstRIter = BB.getLastNonPseudo();
1607 if (LastInstRIter == BB.rend() || !BC.MIB->isReturn(*LastInstRIter) ||
1608 !BC.MIB->deleteREPPrefix(*LastInstRIter))
1609 continue;
1610
1611 NumPrefixesRemoved += BB.getKnownExecutionCount();
1612 ++NumBytesSaved;
1613 }
1614 }
1615
1616 if (NumBytesSaved)
1617 outs() << "BOLT-INFO: removed " << NumBytesSaved
1618 << " 'repz' prefixes"
1619 " with estimated execution count of "
1620 << NumPrefixesRemoved << " times.\n";
1621 }
1622
/// Replace direct calls to memcpy/_memcpy8 with an inline instruction
/// sequence. For tail calls, a return is appended after the inlined body
/// since the call no longer transfers control away from the function.
void InlineMemcpy::runOnFunctions(BinaryContext &BC) {
  // The inline sequence is produced by the x86 MCPlusBuilder only.
  if (!BC.isX86())
    return;

  uint64_t NumInlined = 0;
  uint64_t NumInlinedDyno = 0;
  for (auto &BFI : BC.getBinaryFunctions()) {
    for (BinaryBasicBlock &BB : BFI.second) {
      for (auto II = BB.begin(); II != BB.end(); ++II) {
        MCInst &Inst = *II;

        // Only direct calls with a single symbolic target operand qualify.
        if (!BC.MIB->isCall(Inst) || MCPlus::getNumPrimeOperands(Inst) != 1 ||
            !Inst.getOperand(0).isExpr())
          continue;

        const MCSymbol *CalleeSymbol = BC.MIB->getTargetSymbol(Inst);
        if (CalleeSymbol->getName() != "memcpy" &&
            CalleeSymbol->getName() != "memcpy@PLT" &&
            CalleeSymbol->getName() != "_memcpy8")
          continue;

        // _memcpy8 is a variant whose return-value convention differs from
        // plain memcpy; the builder emits a matching sequence for it.
        const bool IsMemcpy8 = (CalleeSymbol->getName() == "_memcpy8");
        // Record before replacing: the call instruction (and its tail-call
        // marker) is destroyed below.
        const bool IsTailCall = BC.MIB->isTailCall(Inst);

        const InstructionListType NewCode =
            BC.MIB->createInlineMemcpy(IsMemcpy8);
        // replaceInstruction() returns an iterator to the first inserted
        // instruction; advance to the last one so the loop's ++II continues
        // after the inlined sequence rather than re-visiting it.
        II = BB.replaceInstruction(II, NewCode);
        std::advance(II, NewCode.size() - 1);
        if (IsTailCall) {
          // The original instruction ended the function's control flow;
          // restore that by appending an explicit return.
          MCInst Return;
          BC.MIB->createReturn(Return);
          II = BB.insertInstruction(std::next(II), std::move(Return));
        }

        ++NumInlined;
        NumInlinedDyno += BB.getKnownExecutionCount();
      }
    }
  }

  if (NumInlined) {
    outs() << "BOLT-INFO: inlined " << NumInlined << " memcpy() calls";
    if (NumInlinedDyno)
      outs() << ". The calls were executed " << NumInlinedDyno
             << " times based on profile.";
    outs() << '\n';
  }
}
1671
shouldOptimize(const BinaryFunction & Function) const1672 bool SpecializeMemcpy1::shouldOptimize(const BinaryFunction &Function) const {
1673 if (!BinaryFunctionPass::shouldOptimize(Function))
1674 return false;
1675
1676 for (const std::string &FunctionSpec : Spec) {
1677 StringRef FunctionName = StringRef(FunctionSpec).split(':').first;
1678 if (Function.hasNameRegex(FunctionName))
1679 return true;
1680 }
1681
1682 return false;
1683 }
1684
getCallSitesToOptimize(const BinaryFunction & Function) const1685 std::set<size_t> SpecializeMemcpy1::getCallSitesToOptimize(
1686 const BinaryFunction &Function) const {
1687 StringRef SitesString;
1688 for (const std::string &FunctionSpec : Spec) {
1689 StringRef FunctionName;
1690 std::tie(FunctionName, SitesString) = StringRef(FunctionSpec).split(':');
1691 if (Function.hasNameRegex(FunctionName))
1692 break;
1693 SitesString = "";
1694 }
1695
1696 std::set<size_t> Sites;
1697 SmallVector<StringRef, 4> SitesVec;
1698 SitesString.split(SitesVec, ':');
1699 for (StringRef SiteString : SitesVec) {
1700 if (SiteString.empty())
1701 continue;
1702 size_t Result;
1703 if (!SiteString.getAsInteger(10, Result))
1704 Sites.emplace(Result);
1705 }
1706
1707 return Sites;
1708 }
1709
/// For selected call sites, guard calls to memcpy() with a size==1 check:
/// a compare-and-jump to an inlined one-byte copy, falling back to the real
/// call otherwise. Call sites are selected via getCallSitesToOptimize().
void SpecializeMemcpy1::runOnFunctions(BinaryContext &BC) {
  // The specialized sequence is generated by the x86 MCPlusBuilder only.
  if (!BC.isX86())
    return;

  uint64_t NumSpecialized = 0;
  uint64_t NumSpecializedDyno = 0;
  for (auto &BFI : BC.getBinaryFunctions()) {
    BinaryFunction &Function = BFI.second;
    if (!shouldOptimize(Function))
      continue;

    // Empty set means "optimize every memcpy call site in this function".
    std::set<size_t> CallsToOptimize = getCallSitesToOptimize(Function);
    auto shouldOptimize = [&](size_t N) {
      return CallsToOptimize.empty() || CallsToOptimize.count(N);
    };

    // Take a snapshot of the block list: splitAt() below adds new blocks,
    // and we must not iterate over those.
    std::vector<BinaryBasicBlock *> Blocks(Function.pbegin(), Function.pend());
    size_t CallSiteID = 0;
    for (BinaryBasicBlock *CurBB : Blocks) {
      for (auto II = CurBB->begin(); II != CurBB->end(); ++II) {
        MCInst &Inst = *II;

        // Only direct calls with a single symbolic target operand qualify.
        if (!BC.MIB->isCall(Inst) || MCPlus::getNumPrimeOperands(Inst) != 1 ||
            !Inst.getOperand(0).isExpr())
          continue;

        const MCSymbol *CalleeSymbol = BC.MIB->getTargetSymbol(Inst);
        if (CalleeSymbol->getName() != "memcpy" &&
            CalleeSymbol->getName() != "memcpy@PLT")
          continue;

        // A tail call has no local continuation to branch back to.
        if (BC.MIB->isTailCall(Inst))
          continue;

        // Call sites are numbered in visitation order, starting at 1.
        ++CallSiteID;

        if (!shouldOptimize(CallSiteID))
          continue;

        // Create a copy of a call to memcpy(dest, src, size).
        MCInst MemcpyInstr = Inst;

        // Split so the call starts its own block; that block will hold the
        // inlined one-byte copy instead.
        BinaryBasicBlock *OneByteMemcpyBB = CurBB->splitAt(II);

        // Carve out the continuation (everything after the call) into
        // NextBB, and drop the original call instruction.
        BinaryBasicBlock *NextBB = nullptr;
        if (OneByteMemcpyBB->getNumNonPseudos() > 1) {
          NextBB = OneByteMemcpyBB->splitAt(OneByteMemcpyBB->begin());
          NextBB->eraseInstruction(NextBB->begin());
        } else {
          NextBB = OneByteMemcpyBB->getSuccessor();
          OneByteMemcpyBB->eraseInstruction(OneByteMemcpyBB->begin());
          assert(NextBB && "unexpected call to memcpy() with no return");
        }

        // New block holding the original (general-size) memcpy call,
        // reached when size != 1.
        BinaryBasicBlock *MemcpyBB = Function.addBasicBlock();
        MemcpyBB->setOffset(CurBB->getInputOffset());
        // cmp third-argument-register, 1; je OneByteMemcpyBB
        InstructionListType CmpJCC =
            BC.MIB->createCmpJE(BC.MIB->getIntArgRegister(2), 1,
                                OneByteMemcpyBB->getLabel(), BC.Ctx.get());
        CurBB->addInstructions(CmpJCC);
        CurBB->addSuccessor(MemcpyBB);

        MemcpyBB->addInstruction(std::move(MemcpyInstr));
        MemcpyBB->addSuccessor(NextBB);
        MemcpyBB->setCFIState(NextBB->getCFIState());
        MemcpyBB->setExecutionCount(0);

        // To prevent the actual call from being moved to cold, we set its
        // execution count to 1.
        if (CurBB->getKnownExecutionCount() > 0)
          MemcpyBB->setExecutionCount(1);

        // Inlined single-byte copy for the size == 1 fast path.
        InstructionListType OneByteMemcpy = BC.MIB->createOneByteMemcpy();
        OneByteMemcpyBB->addInstructions(OneByteMemcpy);

        ++NumSpecialized;
        NumSpecializedDyno += CurBB->getKnownExecutionCount();

        // Continue scanning from the continuation block.
        CurBB = NextBB;

        // Note: we don't expect the next instruction to be a call to memcpy.
        II = CurBB->begin();
      }
    }
  }

  if (NumSpecialized) {
    outs() << "BOLT-INFO: specialized " << NumSpecialized
           << " memcpy() call sites for size 1";
    if (NumSpecializedDyno)
      outs() << ". The calls were executed " << NumSpecializedDyno
             << " times based on profile.";
    outs() << '\n';
  }
}
1805
runOnFunction(BinaryFunction & BF)1806 void RemoveNops::runOnFunction(BinaryFunction &BF) {
1807 const BinaryContext &BC = BF.getBinaryContext();
1808 for (BinaryBasicBlock &BB : BF) {
1809 for (int64_t I = BB.size() - 1; I >= 0; --I) {
1810 MCInst &Inst = BB.getInstructionAtIndex(I);
1811 if (BC.MIB->isNoop(Inst) && BC.MIB->hasAnnotation(Inst, "NOP"))
1812 BB.eraseInstructionAtIndex(I);
1813 }
1814 }
1815 }
1816
runOnFunctions(BinaryContext & BC)1817 void RemoveNops::runOnFunctions(BinaryContext &BC) {
1818 ParallelUtilities::WorkFuncTy WorkFun = [&](BinaryFunction &BF) {
1819 runOnFunction(BF);
1820 };
1821
1822 ParallelUtilities::PredicateTy SkipFunc = [&](const BinaryFunction &BF) {
1823 return BF.shouldPreserveNops();
1824 };
1825
1826 ParallelUtilities::runOnEachFunction(
1827 BC, ParallelUtilities::SchedulingPolicy::SP_INST_LINEAR, WorkFun,
1828 SkipFunc, "RemoveNops");
1829 }
1830
1831 } // namespace bolt
1832 } // namespace llvm
1833