1 //===- bolt/Passes/SplitFunctions.cpp - Pass for splitting function code --===//
2 //
3 // Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
4 // See https://llvm.org/LICENSE.txt for license information.
5 // SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
6 //
7 //===----------------------------------------------------------------------===//
8 //
9 // This file implements the SplitFunctions pass.
10 //
11 //===----------------------------------------------------------------------===//
12 
13 #include "bolt/Passes/SplitFunctions.h"
14 #include "bolt/Core/BinaryFunction.h"
15 #include "bolt/Core/ParallelUtilities.h"
16 #include "llvm/Support/CommandLine.h"
17 
18 #include <vector>
19 
20 #define DEBUG_TYPE "bolt-opts"
21 
22 using namespace llvm;
23 using namespace bolt;
24 
25 namespace opts {
26 
27 extern cl::OptionCategory BoltOptCategory;
28 
29 extern cl::opt<bool> SplitEH;
30 extern cl::opt<unsigned> ExecutionCountThreshold;
31 
32 static cl::opt<bool> AggressiveSplitting(
33     "split-all-cold", cl::desc("outline as many cold basic blocks as possible"),
34     cl::cat(BoltOptCategory));
35 
36 static cl::opt<unsigned> SplitAlignThreshold(
37     "split-align-threshold",
38     cl::desc("when deciding to split a function, apply this alignment "
39              "while doing the size comparison (see -split-threshold). "
40              "Default value: 2."),
41     cl::init(2),
42 
43     cl::Hidden, cl::cat(BoltOptCategory));
44 
45 static cl::opt<SplitFunctions::SplittingType>
46 SplitFunctions("split-functions",
47   cl::desc("split functions into hot and cold regions"),
48   cl::init(SplitFunctions::ST_NONE),
49   cl::values(clEnumValN(SplitFunctions::ST_NONE, "0",
50                         "do not split any function"),
51              clEnumValN(SplitFunctions::ST_LARGE, "1",
52                         "in non-relocation mode only split functions too large "
53                         "to fit into original code space"),
54              clEnumValN(SplitFunctions::ST_LARGE, "2",
55                         "same as 1 (backwards compatibility)"),
56              clEnumValN(SplitFunctions::ST_ALL, "3",
57                         "split all functions")),
58   cl::ZeroOrMore,
59   cl::cat(BoltOptCategory));
60 
61 static cl::opt<unsigned> SplitThreshold(
62     "split-threshold",
63     cl::desc("split function only if its main size is reduced by more than "
64              "given amount of bytes. Default value: 0, i.e. split iff the "
65              "size is reduced. Note that on some architectures the size can "
66              "increase after splitting."),
67     cl::init(0), cl::Hidden, cl::cat(BoltOptCategory));
68 
69 void syncOptions(BinaryContext &BC) {
70   if (!BC.HasRelocations && opts::SplitFunctions == SplitFunctions::ST_LARGE)
71     opts::SplitFunctions = SplitFunctions::ST_ALL;
72 }
73 
74 } // namespace opts
75 
76 namespace llvm {
77 namespace bolt {
78 
79 bool SplitFunctions::shouldOptimize(const BinaryFunction &BF) const {
80   // Apply execution count threshold
81   if (BF.getKnownExecutionCount() < opts::ExecutionCountThreshold)
82     return false;
83 
84   return BinaryFunctionPass::shouldOptimize(BF);
85 }
86 
87 void SplitFunctions::runOnFunctions(BinaryContext &BC) {
88   opts::syncOptions(BC);
89 
90   if (opts::SplitFunctions == SplitFunctions::ST_NONE)
91     return;
92 
93   ParallelUtilities::WorkFuncTy WorkFun = [&](BinaryFunction &BF) {
94     splitFunction(BF);
95   };
96 
97   ParallelUtilities::PredicateTy SkipFunc = [&](const BinaryFunction &BF) {
98     return !shouldOptimize(BF);
99   };
100 
101   ParallelUtilities::runOnEachFunction(
102       BC, ParallelUtilities::SchedulingPolicy::SP_BB_LINEAR, WorkFun, SkipFunc,
103       "SplitFunctions");
104 
105   if (SplitBytesHot + SplitBytesCold > 0)
106     outs() << "BOLT-INFO: splitting separates " << SplitBytesHot
107            << " hot bytes from " << SplitBytesCold << " cold bytes "
108            << format("(%.2lf%% of split functions is hot).\n",
109                      100.0 * SplitBytesHot / (SplitBytesHot + SplitBytesCold));
110 }
111 
112 void SplitFunctions::splitFunction(BinaryFunction &BF) {
113   if (!BF.size())
114     return;
115 
116   if (!BF.hasValidProfile())
117     return;
118 
119   bool AllCold = true;
120   for (BinaryBasicBlock *BB : BF.layout()) {
121     const uint64_t ExecCount = BB->getExecutionCount();
122     if (ExecCount == BinaryBasicBlock::COUNT_NO_PROFILE)
123       return;
124     if (ExecCount != 0)
125       AllCold = false;
126   }
127 
128   if (AllCold)
129     return;
130 
131   BinaryFunction::BasicBlockOrderType PreSplitLayout = BF.getLayout();
132 
133   BinaryContext &BC = BF.getBinaryContext();
134   size_t OriginalHotSize;
135   size_t HotSize;
136   size_t ColdSize;
137   if (BC.isX86()) {
138     std::tie(OriginalHotSize, ColdSize) = BC.calculateEmittedSize(BF);
139     LLVM_DEBUG(dbgs() << "Estimated size for function " << BF
140                       << " pre-split is <0x"
141                       << Twine::utohexstr(OriginalHotSize) << ", 0x"
142                       << Twine::utohexstr(ColdSize) << ">\n");
143     if (opts::SplitFunctions == SplitFunctions::ST_LARGE &&
144         !BC.HasRelocations) {
145       // Split only if the function wouldn't fit.
146       if (OriginalHotSize <= BF.getMaxSize())
147         return;
148     }
149   }
150 
151   // Never outline the first basic block.
152   BF.layout_front()->setCanOutline(false);
153   for (BinaryBasicBlock *BB : BF.layout()) {
154     if (!BB->canOutline())
155       continue;
156     if (BB->getExecutionCount() != 0) {
157       BB->setCanOutline(false);
158       continue;
159     }
160     // Do not split extra entry points in aarch64. They can be referred by
161     // using ADRs and when this happens, these blocks cannot be placed far
162     // away due to the limited range in ADR instruction.
163     if (BC.isAArch64() && BB->isEntryPoint()) {
164       BB->setCanOutline(false);
165       continue;
166     }
167 
168     if (BF.hasEHRanges() && !opts::SplitEH) {
169       // We cannot move landing pads (or rather entry points for landing pads).
170       if (BB->isLandingPad()) {
171         BB->setCanOutline(false);
172         continue;
173       }
174       // We cannot move a block that can throw since exception-handling
175       // runtime cannot deal with split functions. However, if we can guarantee
176       // that the block never throws, it is safe to move the block to
177       // decrease the size of the function.
178       for (MCInst &Instr : *BB) {
179         if (BC.MIB->isInvoke(Instr)) {
180           BB->setCanOutline(false);
181           break;
182         }
183       }
184     }
185   }
186 
187   if (opts::AggressiveSplitting) {
188     // All blocks with 0 count that we can move go to the end of the function.
189     // Even if they were natural to cluster formation and were seen in-between
190     // hot basic blocks.
191     std::stable_sort(BF.layout_begin(), BF.layout_end(),
192                      [&](BinaryBasicBlock *A, BinaryBasicBlock *B) {
193                        return A->canOutline() < B->canOutline();
194                      });
195   } else if (BF.hasEHRanges() && !opts::SplitEH) {
196     // Typically functions with exception handling have landing pads at the end.
197     // We cannot move beginning of landing pads, but we can move 0-count blocks
198     // comprising landing pads to the end and thus facilitate splitting.
199     auto FirstLP = BF.layout_begin();
200     while ((*FirstLP)->isLandingPad())
201       ++FirstLP;
202 
203     std::stable_sort(FirstLP, BF.layout_end(),
204                      [&](BinaryBasicBlock *A, BinaryBasicBlock *B) {
205                        return A->canOutline() < B->canOutline();
206                      });
207   }
208 
209   // Separate hot from cold starting from the bottom.
210   for (auto I = BF.layout_rbegin(), E = BF.layout_rend(); I != E; ++I) {
211     BinaryBasicBlock *BB = *I;
212     if (!BB->canOutline())
213       break;
214     BB->setIsCold(true);
215   }
216 
217   // For shared objects, place invoke instructions and corresponding landing
218   // pads in the same fragment. To reduce hot code size, create trampoline
219   // landing pads that will redirect the execution to the real LP.
220   if (!BC.HasFixedLoadAddress && BF.hasEHRanges() && BF.isSplit())
221     createEHTrampolines(BF);
222 
223   // Check the new size to see if it's worth splitting the function.
224   if (BC.isX86() && BF.isSplit()) {
225     std::tie(HotSize, ColdSize) = BC.calculateEmittedSize(BF);
226     LLVM_DEBUG(dbgs() << "Estimated size for function " << BF
227                       << " post-split is <0x" << Twine::utohexstr(HotSize)
228                       << ", 0x" << Twine::utohexstr(ColdSize) << ">\n");
229     if (alignTo(OriginalHotSize, opts::SplitAlignThreshold) <=
230         alignTo(HotSize, opts::SplitAlignThreshold) + opts::SplitThreshold) {
231       LLVM_DEBUG(dbgs() << "Reversing splitting of function " << BF << ":\n  0x"
232                         << Twine::utohexstr(HotSize) << ", 0x"
233                         << Twine::utohexstr(ColdSize) << " -> 0x"
234                         << Twine::utohexstr(OriginalHotSize) << '\n');
235 
236       BF.updateBasicBlockLayout(PreSplitLayout);
237       for (BinaryBasicBlock &BB : BF)
238         BB.setIsCold(false);
239     } else {
240       SplitBytesHot += HotSize;
241       SplitBytesCold += ColdSize;
242     }
243   }
244 }
245 
246 void SplitFunctions::createEHTrampolines(BinaryFunction &BF) const {
247   const auto &MIB = BF.getBinaryContext().MIB;
248 
249   // Map real landing pads to the corresponding trampolines.
250   std::unordered_map<const MCSymbol *, const MCSymbol *> LPTrampolines;
251 
252   // Iterate over the copy of basic blocks since we are adding new blocks to the
253   // function which will invalidate its iterators.
254   std::vector<BinaryBasicBlock *> Blocks(BF.pbegin(), BF.pend());
255   for (BinaryBasicBlock *BB : Blocks) {
256     for (MCInst &Instr : *BB) {
257       const Optional<MCPlus::MCLandingPad> EHInfo = MIB->getEHInfo(Instr);
258       if (!EHInfo || !EHInfo->first)
259         continue;
260 
261       const MCSymbol *LPLabel = EHInfo->first;
262       BinaryBasicBlock *LPBlock = BF.getBasicBlockForLabel(LPLabel);
263       if (BB->isCold() == LPBlock->isCold())
264         continue;
265 
266       const MCSymbol *TrampolineLabel = nullptr;
267       auto Iter = LPTrampolines.find(LPLabel);
268       if (Iter != LPTrampolines.end()) {
269         TrampolineLabel = Iter->second;
270       } else {
271         // Create a trampoline basic block in the same fragment as the thrower.
272         // Note: there's no need to insert the jump instruction, it will be
273         // added by fixBranches().
274         BinaryBasicBlock *TrampolineBB = BF.addBasicBlock();
275         TrampolineBB->setIsCold(BB->isCold());
276         TrampolineBB->setExecutionCount(LPBlock->getExecutionCount());
277         TrampolineBB->addSuccessor(LPBlock, TrampolineBB->getExecutionCount());
278         TrampolineBB->setCFIState(LPBlock->getCFIState());
279         TrampolineLabel = TrampolineBB->getLabel();
280         LPTrampolines.emplace(std::make_pair(LPLabel, TrampolineLabel));
281       }
282 
283       // Substitute the landing pad with the trampoline.
284       MIB->updateEHInfo(Instr,
285                         MCPlus::MCLandingPad(TrampolineLabel, EHInfo->second));
286     }
287   }
288 
289   if (LPTrampolines.empty())
290     return;
291 
292   // All trampoline blocks were added to the end of the function. Place them at
293   // the end of corresponding fragments.
294   std::stable_sort(BF.layout_begin(), BF.layout_end(),
295                    [&](BinaryBasicBlock *A, BinaryBasicBlock *B) {
296                      return A->isCold() < B->isCold();
297                    });
298 
299   // Conservatively introduce branch instructions.
300   BF.fixBranches();
301 
302   // Update exception-handling CFG for the function.
303   BF.recomputeLandingPads();
304 }
305 
306 } // namespace bolt
307 } // namespace llvm
308