1 //===- bolt/Passes/SplitFunctions.cpp - Pass for splitting function code --===//
2 //
3 // Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
4 // See https://llvm.org/LICENSE.txt for license information.
5 // SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
6 //
7 //===----------------------------------------------------------------------===//
8 //
9 // This file implements the SplitFunctions pass.
10 //
11 //===----------------------------------------------------------------------===//
12 
13 #include "bolt/Passes/SplitFunctions.h"
14 #include "bolt/Core/BinaryFunction.h"
15 #include "bolt/Core/ParallelUtilities.h"
16 #include "llvm/Support/CommandLine.h"
17 #include "llvm/Support/FormatVariadic.h"
18 
19 #include <vector>
20 
21 #define DEBUG_TYPE "bolt-opts"
22 
23 using namespace llvm;
24 using namespace bolt;
25 
26 namespace {
27 class DeprecatedSplitFunctionOptionParser : public cl::parser<bool> {
28 public:
29   explicit DeprecatedSplitFunctionOptionParser(cl::Option &O)
30       : cl::parser<bool>(O) {}
31 
32   bool parse(cl::Option &O, StringRef ArgName, StringRef Arg, bool &Value) {
33     if (Arg == "2" || Arg == "3") {
34       Value = true;
35       errs() << formatv("BOLT-WARNING: specifying non-boolean value \"{0}\" "
36                         "for option -{1} is deprecated\n",
37                         Arg, ArgName);
38       return false;
39     }
40     return cl::parser<bool>::parse(O, ArgName, Arg, Value);
41   }
42 };
43 } // namespace
44 
namespace opts {

// Shared BOLT option category and options defined in other translation units.
extern cl::OptionCategory BoltOptCategory;

// Whether landing pads / throwing blocks may be split (see splitFunction()).
extern cl::opt<bool> SplitEH;
// Minimum profiled execution count for a function to be optimized.
extern cl::opt<unsigned> ExecutionCountThreshold;

// When set, every eligible zero-count block is outlined, even blocks that sit
// between hot blocks in the current layout.
static cl::opt<bool> AggressiveSplitting(
    "split-all-cold", cl::desc("outline as many cold basic blocks as possible"),
    cl::cat(BoltOptCategory));

// Alignment applied to both pre- and post-split hot-size estimates before the
// profitability comparison, so sub-alignment savings do not trigger a split.
static cl::opt<unsigned> SplitAlignThreshold(
    "split-align-threshold",
    cl::desc("when deciding to split a function, apply this alignment "
             "while doing the size comparison (see -split-threshold). "
             "Default value: 2."),
    cl::init(2),

    cl::Hidden, cl::cat(BoltOptCategory));

// Master switch for the pass. Uses the custom parser above so the deprecated
// numeric values "2"/"3" still enable splitting.
static cl::opt<bool, false, DeprecatedSplitFunctionOptionParser>
    SplitFunctions("split-functions",
                   cl::desc("split functions into hot and cold regions"),
                   cl::cat(BoltOptCategory));

// Minimum number of bytes the hot fragment must shrink by for a split to be
// kept; otherwise the split is reverted (x86 only — see splitFunction()).
static cl::opt<unsigned> SplitThreshold(
    "split-threshold",
    cl::desc("split function only if its main size is reduced by more than "
             "given amount of bytes. Default value: 0, i.e. split iff the "
             "size is reduced. Note that on some architectures the size can "
             "increase after splitting."),
    cl::init(0), cl::Hidden, cl::cat(BoltOptCategory));

} // namespace opts
79 
80 namespace llvm {
81 namespace bolt {
82 
83 bool SplitFunctions::shouldOptimize(const BinaryFunction &BF) const {
84   // Apply execution count threshold
85   if (BF.getKnownExecutionCount() < opts::ExecutionCountThreshold)
86     return false;
87 
88   return BinaryFunctionPass::shouldOptimize(BF);
89 }
90 
91 void SplitFunctions::runOnFunctions(BinaryContext &BC) {
92   if (!opts::SplitFunctions)
93     return;
94 
95   ParallelUtilities::WorkFuncTy WorkFun = [&](BinaryFunction &BF) {
96     splitFunction(BF);
97   };
98 
99   ParallelUtilities::PredicateTy SkipFunc = [&](const BinaryFunction &BF) {
100     return !shouldOptimize(BF);
101   };
102 
103   ParallelUtilities::runOnEachFunction(
104       BC, ParallelUtilities::SchedulingPolicy::SP_BB_LINEAR, WorkFun, SkipFunc,
105       "SplitFunctions");
106 
107   if (SplitBytesHot + SplitBytesCold > 0)
108     outs() << "BOLT-INFO: splitting separates " << SplitBytesHot
109            << " hot bytes from " << SplitBytesCold << " cold bytes "
110            << format("(%.2lf%% of split functions is hot).\n",
111                      100.0 * SplitBytesHot / (SplitBytesHot + SplitBytesCold));
112 }
113 
/// Split \p BF into a hot and a cold fragment by marking a trailing run of
/// outline-able (zero-count) blocks in the layout as cold. On x86 the split is
/// reverted if the estimated hot-fragment size does not shrink by more than
/// -split-threshold (after -split-align-threshold alignment).
void SplitFunctions::splitFunction(BinaryFunction &BF) {
  // Nothing to do for an empty function.
  if (!BF.size())
    return;

  // Splitting decisions are profile-driven; bail without a valid profile.
  if (!BF.hasValidProfile())
    return;

  // Scan the layout: give up if any block has no recorded count, and note
  // whether at least one block is hot (non-zero count).
  bool AllCold = true;
  for (BinaryBasicBlock *BB : BF.layout()) {
    const uint64_t ExecCount = BB->getExecutionCount();
    if (ExecCount == BinaryBasicBlock::COUNT_NO_PROFILE)
      return;
    if (ExecCount != 0)
      AllCold = false;
  }

  // A fully-cold function has no hot/cold boundary worth creating.
  if (AllCold)
    return;

  // Snapshot the current layout so an unprofitable split can be undone below.
  BinaryFunction::BasicBlockOrderType PreSplitLayout = BF.getLayout();

  BinaryContext &BC = BF.getBinaryContext();
  size_t OriginalHotSize;
  size_t HotSize;
  size_t ColdSize;
  // Size estimation is only performed on x86. OriginalHotSize is read again
  // only under the matching BC.isX86() guard at the end of this function.
  if (BC.isX86()) {
    std::tie(OriginalHotSize, ColdSize) = BC.calculateEmittedSize(BF);
    LLVM_DEBUG(dbgs() << "Estimated size for function " << BF
                      << " pre-split is <0x"
                      << Twine::utohexstr(OriginalHotSize) << ", 0x"
                      << Twine::utohexstr(ColdSize) << ">\n");
  }

  // Never outline the first basic block.
  BF.layout_front()->setCanOutline(false);
  // Decide per-block eligibility for outlining.
  for (BinaryBasicBlock *BB : BF.layout()) {
    if (!BB->canOutline())
      continue;
    // Blocks with a non-zero count are hot and must stay in the main fragment.
    if (BB->getExecutionCount() != 0) {
      BB->setCanOutline(false);
      continue;
    }
    // Do not split extra entry points in aarch64. They can be referred by
    // using ADRs and when this happens, these blocks cannot be placed far
    // away due to the limited range in ADR instruction.
    if (BC.isAArch64() && BB->isEntryPoint()) {
      BB->setCanOutline(false);
      continue;
    }

    if (BF.hasEHRanges() && !opts::SplitEH) {
      // We cannot move landing pads (or rather entry points for landing pads).
      if (BB->isLandingPad()) {
        BB->setCanOutline(false);
        continue;
      }
      // We cannot move a block that can throw since exception-handling
      // runtime cannot deal with split functions. However, if we can guarantee
      // that the block never throws, it is safe to move the block to
      // decrease the size of the function.
      for (MCInst &Instr : *BB) {
        if (BC.MIB->isInvoke(Instr)) {
          BB->setCanOutline(false);
          break;
        }
      }
    }
  }

  if (opts::AggressiveSplitting) {
    // All blocks with 0 count that we can move go to the end of the function.
    // Even if they were natural to cluster formation and were seen in-between
    // hot basic blocks.
    // Stable sort keeps the relative order within each group intact.
    llvm::stable_sort(BF.layout(),
                      [&](BinaryBasicBlock *A, BinaryBasicBlock *B) {
                        return A->canOutline() < B->canOutline();
                      });
  } else if (BF.hasEHRanges() && !opts::SplitEH) {
    // Typically functions with exception handling have landing pads at the end.
    // We cannot move beginning of landing pads, but we can move 0-count blocks
    // comprising landing pads to the end and thus facilitate splitting.
    auto FirstLP = BF.layout_begin();
    while ((*FirstLP)->isLandingPad())
      ++FirstLP;

    std::stable_sort(FirstLP, BF.layout_end(),
                     [&](BinaryBasicBlock *A, BinaryBasicBlock *B) {
                       return A->canOutline() < B->canOutline();
                     });
  }

  // Separate hot from cold starting from the bottom. Stop at the first block
  // (from the end) that must stay in the hot fragment.
  for (auto I = BF.layout_rbegin(), E = BF.layout_rend(); I != E; ++I) {
    BinaryBasicBlock *BB = *I;
    if (!BB->canOutline())
      break;
    BB->setIsCold(true);
  }

  // For shared objects, place invoke instructions and corresponding landing
  // pads in the same fragment. To reduce hot code size, create trampoline
  // landing pads that will redirect the execution to the real LP.
  if (!BC.HasFixedLoadAddress && BF.hasEHRanges() && BF.isSplit())
    createEHTrampolines(BF);

  // Check the new size to see if it's worth splitting the function.
  if (BC.isX86() && BF.isSplit()) {
    std::tie(HotSize, ColdSize) = BC.calculateEmittedSize(BF);
    LLVM_DEBUG(dbgs() << "Estimated size for function " << BF
                      << " post-split is <0x" << Twine::utohexstr(HotSize)
                      << ", 0x" << Twine::utohexstr(ColdSize) << ">\n");
    // Revert the split unless the aligned hot size shrank by more than the
    // configured threshold.
    if (alignTo(OriginalHotSize, opts::SplitAlignThreshold) <=
        alignTo(HotSize, opts::SplitAlignThreshold) + opts::SplitThreshold) {
      LLVM_DEBUG(dbgs() << "Reversing splitting of function " << BF << ":\n  0x"
                        << Twine::utohexstr(HotSize) << ", 0x"
                        << Twine::utohexstr(ColdSize) << " -> 0x"
                        << Twine::utohexstr(OriginalHotSize) << '\n');

      // Restore the pre-split layout and clear the cold marks.
      BF.updateBasicBlockLayout(PreSplitLayout);
      for (BinaryBasicBlock &BB : BF)
        BB.setIsCold(false);
    } else {
      // Keep the split and account for it in the pass statistics.
      SplitBytesHot += HotSize;
      SplitBytesCold += ColdSize;
    }
  }
}
241 
/// For a split function \p BF, ensure every invoke and its landing pad live in
/// the same fragment by redirecting cross-fragment EH edges through newly
/// created trampoline blocks that jump to the real landing pad. Trampolines
/// are deduplicated per landing-pad label, placed at the end of the thrower's
/// fragment, and the function's branches and landing-pad CFG are rebuilt.
void SplitFunctions::createEHTrampolines(BinaryFunction &BF) const {
  const auto &MIB = BF.getBinaryContext().MIB;

  // Map real landing pads to the corresponding trampolines.
  std::unordered_map<const MCSymbol *, const MCSymbol *> LPTrampolines;

  // Iterate over the copy of basic blocks since we are adding new blocks to the
  // function which will invalidate its iterators.
  std::vector<BinaryBasicBlock *> Blocks(BF.pbegin(), BF.pend());
  for (BinaryBasicBlock *BB : Blocks) {
    for (MCInst &Instr : *BB) {
      const Optional<MCPlus::MCLandingPad> EHInfo = MIB->getEHInfo(Instr);
      // Skip instructions without an associated landing pad.
      if (!EHInfo || !EHInfo->first)
        continue;

      // Only cross-fragment invoke->LP edges need a trampoline.
      const MCSymbol *LPLabel = EHInfo->first;
      BinaryBasicBlock *LPBlock = BF.getBasicBlockForLabel(LPLabel);
      if (BB->isCold() == LPBlock->isCold())
        continue;

      // Reuse an existing trampoline for this landing pad if one was already
      // created; otherwise build a new one.
      const MCSymbol *TrampolineLabel = nullptr;
      auto Iter = LPTrampolines.find(LPLabel);
      if (Iter != LPTrampolines.end()) {
        TrampolineLabel = Iter->second;
      } else {
        // Create a trampoline basic block in the same fragment as the thrower.
        // Note: there's no need to insert the jump instruction, it will be
        // added by fixBranches().
        BinaryBasicBlock *TrampolineBB = BF.addBasicBlock();
        TrampolineBB->setIsCold(BB->isCold());
        TrampolineBB->setExecutionCount(LPBlock->getExecutionCount());
        TrampolineBB->addSuccessor(LPBlock, TrampolineBB->getExecutionCount());
        TrampolineBB->setCFIState(LPBlock->getCFIState());
        TrampolineLabel = TrampolineBB->getLabel();
        LPTrampolines.emplace(std::make_pair(LPLabel, TrampolineLabel));
      }

      // Substitute the landing pad with the trampoline.
      MIB->updateEHInfo(Instr,
                        MCPlus::MCLandingPad(TrampolineLabel, EHInfo->second));
    }
  }

  // No cross-fragment EH edges were found; the function is unchanged.
  if (LPTrampolines.empty())
    return;

  // All trampoline blocks were added to the end of the function. Place them at
  // the end of corresponding fragments.
  std::stable_sort(BF.layout_begin(), BF.layout_end(),
                   [&](BinaryBasicBlock *A, BinaryBasicBlock *B) {
                     return A->isCold() < B->isCold();
                   });

  // Conservatively introduce branch instructions.
  BF.fixBranches();

  // Update exception-handling CFG for the function.
  BF.recomputeLandingPads();
}
301 
302 } // namespace bolt
303 } // namespace llvm
304