//===- bolt/Passes/FrameOptimizer.cpp -------------------------------------===//
//
// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
// See https://llvm.org/LICENSE.txt for license information.
// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
//
//===----------------------------------------------------------------------===//
//
// This file implements the FrameOptimizerPass class.
//
//===----------------------------------------------------------------------===//
12
13 #include "bolt/Passes/FrameOptimizer.h"
14 #include "bolt/Core/ParallelUtilities.h"
15 #include "bolt/Passes/BinaryFunctionCallGraph.h"
16 #include "bolt/Passes/DataflowInfoManager.h"
17 #include "bolt/Passes/ShrinkWrapping.h"
18 #include "bolt/Passes/StackAvailableExpressions.h"
19 #include "bolt/Passes/StackReachingUses.h"
20 #include "bolt/Utils/CommandLineOpts.h"
21 #include "llvm/Support/Timer.h"
22 #include <deque>
23 #include <unordered_map>
24
25 #define DEBUG_TYPE "fop"
26
27 using namespace llvm;
28
29 namespace opts {
30 extern cl::opt<unsigned> Verbosity;
31 extern cl::opt<bool> TimeOpts;
32 extern cl::OptionCategory BoltOptCategory;
33
34 using namespace bolt;
35
36 cl::opt<FrameOptimizationType>
37 FrameOptimization("frame-opt",
38 cl::init(FOP_NONE),
39 cl::desc("optimize stack frame accesses"),
40 cl::values(
41 clEnumValN(FOP_NONE, "none", "do not perform frame optimization"),
42 clEnumValN(FOP_HOT, "hot", "perform FOP on hot functions"),
43 clEnumValN(FOP_ALL, "all", "perform FOP on all functions")),
44 cl::ZeroOrMore,
45 cl::cat(BoltOptCategory));
46
47 cl::opt<bool> RemoveStores(
48 "frame-opt-rm-stores", cl::init(FOP_NONE),
49 cl::desc("apply additional analysis to remove stores (experimental)"),
50 cl::cat(BoltOptCategory));
51
52 } // namespace opts
53
54 namespace llvm {
55 namespace bolt {
56
removeUnnecessaryLoads(const RegAnalysis & RA,const FrameAnalysis & FA,BinaryFunction & BF)57 void FrameOptimizerPass::removeUnnecessaryLoads(const RegAnalysis &RA,
58 const FrameAnalysis &FA,
59 BinaryFunction &BF) {
60 StackAvailableExpressions SAE(RA, FA, BF);
61 SAE.run();
62
63 LLVM_DEBUG(dbgs() << "Performing unnecessary loads removal\n");
64 std::deque<std::pair<BinaryBasicBlock *, MCInst *>> ToErase;
65 bool Changed = false;
66 const auto ExprEnd = SAE.expr_end();
67 MCPlusBuilder *MIB = BF.getBinaryContext().MIB.get();
68 for (BinaryBasicBlock &BB : BF) {
69 LLVM_DEBUG(dbgs() << "\tNow at BB " << BB.getName() << "\n");
70 const MCInst *Prev = nullptr;
71 for (MCInst &Inst : BB) {
72 LLVM_DEBUG({
73 dbgs() << "\t\tNow at ";
74 Inst.dump();
75 for (auto I = Prev ? SAE.expr_begin(*Prev) : SAE.expr_begin(BB);
76 I != ExprEnd; ++I) {
77 dbgs() << "\t\t\tReached by: ";
78 (*I)->dump();
79 }
80 });
81 // if Inst is a load from stack and the current available expressions show
82 // this value is available in a register or immediate, replace this load
83 // with move from register or from immediate.
84 ErrorOr<const FrameIndexEntry &> FIEX = FA.getFIEFor(Inst);
85 if (!FIEX) {
86 Prev = &Inst;
87 continue;
88 }
89 // FIXME: Change to remove IsSimple == 0. We're being conservative here,
90 // but once replaceMemOperandWithReg is ready, we should feed it with all
91 // sorts of complex instructions.
92 if (FIEX->IsLoad == false || FIEX->IsSimple == false ||
93 FIEX->StackOffset >= 0) {
94 Prev = &Inst;
95 continue;
96 }
97
98 for (auto I = Prev ? SAE.expr_begin(*Prev) : SAE.expr_begin(BB);
99 I != ExprEnd; ++I) {
100 const MCInst *AvailableInst = *I;
101 ErrorOr<const FrameIndexEntry &> FIEY = FA.getFIEFor(*AvailableInst);
102 if (!FIEY)
103 continue;
104 assert(FIEY->IsStore && FIEY->IsSimple);
105 if (FIEX->StackOffset != FIEY->StackOffset || FIEX->Size != FIEY->Size)
106 continue;
107 // TODO: Change push/pops to stack adjustment instruction
108 if (MIB->isPop(Inst))
109 continue;
110
111 ++NumRedundantLoads;
112 FreqRedundantLoads += BB.getKnownExecutionCount();
113 Changed = true;
114 LLVM_DEBUG(dbgs() << "Redundant load instruction: ");
115 LLVM_DEBUG(Inst.dump());
116 LLVM_DEBUG(dbgs() << "Related store instruction: ");
117 LLVM_DEBUG(AvailableInst->dump());
118 LLVM_DEBUG(dbgs() << "@BB: " << BB.getName() << "\n");
119 // Replace load
120 if (FIEY->IsStoreFromReg) {
121 if (!MIB->replaceMemOperandWithReg(Inst, FIEY->RegOrImm)) {
122 LLVM_DEBUG(dbgs() << "FAILED to change operand to a reg\n");
123 break;
124 }
125 FreqLoadsChangedToReg += BB.getKnownExecutionCount();
126 MIB->removeAnnotation(Inst, "FrameAccessEntry");
127 LLVM_DEBUG(dbgs() << "Changed operand to a reg\n");
128 if (MIB->isRedundantMove(Inst)) {
129 ++NumLoadsDeleted;
130 FreqLoadsDeleted += BB.getKnownExecutionCount();
131 LLVM_DEBUG(dbgs() << "Created a redundant move\n");
132 // Delete it!
133 ToErase.push_front(std::make_pair(&BB, &Inst));
134 }
135 } else {
136 char Buf[8] = {0, 0, 0, 0, 0, 0, 0, 0};
137 support::ulittle64_t::ref(Buf + 0) = FIEY->RegOrImm;
138 LLVM_DEBUG(dbgs() << "Changing operand to an imm... ");
139 if (!MIB->replaceMemOperandWithImm(Inst, StringRef(Buf, 8), 0)) {
140 LLVM_DEBUG(dbgs() << "FAILED\n");
141 } else {
142 FreqLoadsChangedToImm += BB.getKnownExecutionCount();
143 MIB->removeAnnotation(Inst, "FrameAccessEntry");
144 LLVM_DEBUG(dbgs() << "Ok\n");
145 }
146 }
147 LLVM_DEBUG(dbgs() << "Changed to: ");
148 LLVM_DEBUG(Inst.dump());
149 break;
150 }
151 Prev = &Inst;
152 }
153 }
154 if (Changed)
155 LLVM_DEBUG(dbgs() << "FOP modified \"" << BF.getPrintName() << "\"\n");
156
157 // TODO: Implement an interface of eraseInstruction that works out the
158 // complete list of elements to remove.
159 for (std::pair<BinaryBasicBlock *, MCInst *> I : ToErase)
160 I.first->eraseInstruction(I.first->findInstruction(I.second));
161 }
162
removeUnusedStores(const FrameAnalysis & FA,BinaryFunction & BF)163 void FrameOptimizerPass::removeUnusedStores(const FrameAnalysis &FA,
164 BinaryFunction &BF) {
165 StackReachingUses SRU(FA, BF);
166 SRU.run();
167
168 LLVM_DEBUG(dbgs() << "Performing unused stores removal\n");
169 std::vector<std::pair<BinaryBasicBlock *, MCInst *>> ToErase;
170 bool Changed = false;
171 for (BinaryBasicBlock &BB : BF) {
172 LLVM_DEBUG(dbgs() << "\tNow at BB " << BB.getName() << "\n");
173 const MCInst *Prev = nullptr;
174 for (auto I = BB.rbegin(), E = BB.rend(); I != E; ++I) {
175 MCInst &Inst = *I;
176 LLVM_DEBUG({
177 dbgs() << "\t\tNow at ";
178 Inst.dump();
179 for (auto I = Prev ? SRU.expr_begin(*Prev) : SRU.expr_begin(BB);
180 I != SRU.expr_end(); ++I) {
181 dbgs() << "\t\t\tReached by: ";
182 (*I)->dump();
183 }
184 });
185 ErrorOr<const FrameIndexEntry &> FIEX = FA.getFIEFor(Inst);
186 if (!FIEX) {
187 Prev = &Inst;
188 continue;
189 }
190 if (FIEX->IsLoad || !FIEX->IsSimple || FIEX->StackOffset >= 0) {
191 Prev = &Inst;
192 continue;
193 }
194
195 if (SRU.isStoreUsed(*FIEX,
196 Prev ? SRU.expr_begin(*Prev) : SRU.expr_begin(BB))) {
197 Prev = &Inst;
198 continue;
199 }
200 // TODO: Change push/pops to stack adjustment instruction
201 if (BF.getBinaryContext().MIB->isPush(Inst))
202 continue;
203
204 ++NumRedundantStores;
205 FreqRedundantStores += BB.getKnownExecutionCount();
206 Changed = true;
207 LLVM_DEBUG(dbgs() << "Unused store instruction: ");
208 LLVM_DEBUG(Inst.dump());
209 LLVM_DEBUG(dbgs() << "@BB: " << BB.getName() << "\n");
210 LLVM_DEBUG(dbgs() << "FIE offset = " << FIEX->StackOffset
211 << " size = " << (int)FIEX->Size << "\n");
212 // Delete it!
213 ToErase.emplace_back(&BB, &Inst);
214 Prev = &Inst;
215 }
216 }
217
218 for (std::pair<BinaryBasicBlock *, MCInst *> I : ToErase)
219 I.first->eraseInstruction(I.first->findInstruction(I.second));
220
221 if (Changed)
222 LLVM_DEBUG(dbgs() << "FOP modified \"" << BF.getPrintName() << "\"\n");
223 }
224
runOnFunctions(BinaryContext & BC)225 void FrameOptimizerPass::runOnFunctions(BinaryContext &BC) {
226 if (opts::FrameOptimization == FOP_NONE)
227 return;
228
229 std::unique_ptr<BinaryFunctionCallGraph> CG;
230 std::unique_ptr<FrameAnalysis> FA;
231 std::unique_ptr<RegAnalysis> RA;
232
233 {
234 NamedRegionTimer T1("callgraph", "create call graph", "FOP",
235 "FOP breakdown", opts::TimeOpts);
236 CG = std::make_unique<BinaryFunctionCallGraph>(buildCallGraph(BC));
237 }
238
239 {
240 NamedRegionTimer T1("frameanalysis", "frame analysis", "FOP",
241 "FOP breakdown", opts::TimeOpts);
242 FA = std::make_unique<FrameAnalysis>(BC, *CG);
243 }
244
245 {
246 NamedRegionTimer T1("reganalysis", "reg analysis", "FOP", "FOP breakdown",
247 opts::TimeOpts);
248 RA = std::make_unique<RegAnalysis>(BC, &BC.getBinaryFunctions(), CG.get());
249 }
250
251 // Perform caller-saved register optimizations, then callee-saved register
252 // optimizations (shrink wrapping)
253 for (auto &I : BC.getBinaryFunctions()) {
254 if (!FA->hasFrameInfo(I.second))
255 continue;
256 // Restrict pass execution if user asked to only run on hot functions
257 if (opts::FrameOptimization == FOP_HOT) {
258 if (I.second.getKnownExecutionCount() < BC.getHotThreshold())
259 continue;
260 LLVM_DEBUG(
261 dbgs() << "Considering " << I.second.getPrintName()
262 << " for frame optimizations because its execution count ( "
263 << I.second.getKnownExecutionCount()
264 << " ) exceeds our hotness threshold ( "
265 << BC.getHotThreshold() << " )\n");
266 }
267
268 {
269 NamedRegionTimer T1("removeloads", "remove loads", "FOP", "FOP breakdown",
270 opts::TimeOpts);
271 if (!FA->hasStackArithmetic(I.second))
272 removeUnnecessaryLoads(*RA, *FA, I.second);
273 }
274
275 if (opts::RemoveStores) {
276 NamedRegionTimer T1("removestores", "remove stores", "FOP",
277 "FOP breakdown", opts::TimeOpts);
278 if (!FA->hasStackArithmetic(I.second))
279 removeUnusedStores(*FA, I.second);
280 }
281 // Don't even start shrink wrapping if no profiling info is available
282 if (I.second.getKnownExecutionCount() == 0)
283 continue;
284 }
285
286 {
287 NamedRegionTimer T1("shrinkwrapping", "shrink wrapping", "FOP",
288 "FOP breakdown", opts::TimeOpts);
289 performShrinkWrapping(*RA, *FA, BC);
290 }
291
292 outs() << "BOLT-INFO: FOP optimized " << NumRedundantLoads
293 << " redundant load(s) and " << NumRedundantStores
294 << " unused store(s)\n";
295 outs() << "BOLT-INFO: Frequency of redundant loads is " << FreqRedundantLoads
296 << " and frequency of unused stores is " << FreqRedundantStores
297 << "\n";
298 outs() << "BOLT-INFO: Frequency of loads changed to use a register is "
299 << FreqLoadsChangedToReg
300 << " and frequency of loads changed to use an immediate is "
301 << FreqLoadsChangedToImm << "\n";
302 outs() << "BOLT-INFO: FOP deleted " << NumLoadsDeleted
303 << " load(s) (dyn count: " << FreqLoadsDeleted << ") and "
304 << NumRedundantStores << " store(s)\n";
305 FA->printStats();
306 ShrinkWrapping::printStats();
307 }
308
performShrinkWrapping(const RegAnalysis & RA,const FrameAnalysis & FA,BinaryContext & BC)309 void FrameOptimizerPass::performShrinkWrapping(const RegAnalysis &RA,
310 const FrameAnalysis &FA,
311 BinaryContext &BC) {
312 // Initialize necessary annotations to allow safe parallel accesses to
313 // annotation index in MIB
314 BC.MIB->getOrCreateAnnotationIndex(CalleeSavedAnalysis::getSaveTagName());
315 BC.MIB->getOrCreateAnnotationIndex(CalleeSavedAnalysis::getRestoreTagName());
316 BC.MIB->getOrCreateAnnotationIndex(StackLayoutModifier::getTodoTagName());
317 BC.MIB->getOrCreateAnnotationIndex(StackLayoutModifier::getSlotTagName());
318 BC.MIB->getOrCreateAnnotationIndex(
319 StackLayoutModifier::getOffsetCFIRegTagName());
320 BC.MIB->getOrCreateAnnotationIndex("ReachingDefs");
321 BC.MIB->getOrCreateAnnotationIndex("ReachingUses");
322 BC.MIB->getOrCreateAnnotationIndex("LivenessAnalysis");
323 BC.MIB->getOrCreateAnnotationIndex("StackReachingUses");
324 BC.MIB->getOrCreateAnnotationIndex("PostDominatorAnalysis");
325 BC.MIB->getOrCreateAnnotationIndex("DominatorAnalysis");
326 BC.MIB->getOrCreateAnnotationIndex("StackPointerTracking");
327 BC.MIB->getOrCreateAnnotationIndex("StackPointerTrackingForInternalCalls");
328 BC.MIB->getOrCreateAnnotationIndex("StackAvailableExpressions");
329 BC.MIB->getOrCreateAnnotationIndex("StackAllocationAnalysis");
330 BC.MIB->getOrCreateAnnotationIndex("ShrinkWrap-Todo");
331 BC.MIB->getOrCreateAnnotationIndex("PredictiveStackPointerTracking");
332 BC.MIB->getOrCreateAnnotationIndex("ReachingInsnsBackward");
333 BC.MIB->getOrCreateAnnotationIndex("ReachingInsns");
334 BC.MIB->getOrCreateAnnotationIndex("AccessesDeletedPos");
335 BC.MIB->getOrCreateAnnotationIndex("DeleteMe");
336
337 std::vector<std::pair<uint64_t, const BinaryFunction *>> Top10Funcs;
338 auto LogFunc = [&](BinaryFunction &BF) {
339 auto Lower = std::lower_bound(
340 Top10Funcs.begin(), Top10Funcs.end(), BF.getKnownExecutionCount(),
341 [](const std::pair<uint64_t, const BinaryFunction *> &Elmt,
342 uint64_t Value) { return Elmt.first > Value; });
343 if (Lower == Top10Funcs.end() && Top10Funcs.size() >= 10)
344 return;
345 Top10Funcs.insert(Lower,
346 std::make_pair<>(BF.getKnownExecutionCount(), &BF));
347 if (Top10Funcs.size() > 10)
348 Top10Funcs.resize(10);
349 };
350 (void)LogFunc;
351
352 ParallelUtilities::PredicateTy SkipPredicate = [&](const BinaryFunction &BF) {
353 if (BF.getFunctionScore() == 0)
354 return true;
355
356 return false;
357 };
358
359 const bool HotOnly = opts::FrameOptimization == FOP_HOT;
360
361 ParallelUtilities::WorkFuncWithAllocTy WorkFunction =
362 [&](BinaryFunction &BF, MCPlusBuilder::AllocatorIdTy AllocatorId) {
363 DataflowInfoManager Info(BF, &RA, &FA, AllocatorId);
364 ShrinkWrapping SW(FA, BF, Info, AllocatorId);
365
366 if (SW.perform(HotOnly)) {
367 std::lock_guard<std::mutex> Lock(FuncsChangedMutex);
368 FuncsChanged.insert(&BF);
369 LLVM_DEBUG(LogFunc(BF));
370 }
371 };
372
373 ParallelUtilities::runOnEachFunctionWithUniqueAllocId(
374 BC, ParallelUtilities::SchedulingPolicy::SP_INST_QUADRATIC, WorkFunction,
375 SkipPredicate, "shrink-wrapping");
376
377 if (!Top10Funcs.empty()) {
378 outs() << "BOLT-INFO: top 10 functions changed by shrink wrapping:\n";
379 for (const auto &Elmt : Top10Funcs)
380 outs() << Elmt.first << " : " << Elmt.second->getPrintName() << "\n";
381 }
382 }
383
384 } // namespace bolt
385 } // namespace llvm
386