1 //===- bolt/Passes/FrameOptimizer.cpp -------------------------------------===//
2 //
3 // Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
4 // See https://llvm.org/LICENSE.txt for license information.
5 // SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
6 //
7 //===----------------------------------------------------------------------===//
8 //
9 // This file implements the FrameOptimizerPass class.
10 //
11 //===----------------------------------------------------------------------===//
12 
13 #include "bolt/Passes/FrameOptimizer.h"
14 #include "bolt/Core/ParallelUtilities.h"
15 #include "bolt/Passes/BinaryFunctionCallGraph.h"
16 #include "bolt/Passes/DataflowInfoManager.h"
17 #include "bolt/Passes/ShrinkWrapping.h"
18 #include "bolt/Passes/StackAvailableExpressions.h"
19 #include "bolt/Passes/StackReachingUses.h"
20 #include "llvm/Support/Timer.h"
21 #include <deque>
22 #include <unordered_map>
23 
24 #define DEBUG_TYPE "fop"
25 
26 using namespace llvm;
27 
28 namespace opts {
29 extern cl::opt<unsigned> Verbosity;
30 extern cl::opt<bool> TimeOpts;
31 extern cl::OptionCategory BoltOptCategory;
32 
33 using namespace bolt;
34 
35 cl::opt<FrameOptimizationType>
36 FrameOptimization("frame-opt",
37   cl::init(FOP_NONE),
38   cl::desc("optimize stack frame accesses"),
39   cl::values(
40     clEnumValN(FOP_NONE, "none", "do not perform frame optimization"),
41     clEnumValN(FOP_HOT, "hot", "perform FOP on hot functions"),
42     clEnumValN(FOP_ALL, "all", "perform FOP on all functions")),
43   cl::ZeroOrMore,
44   cl::cat(BoltOptCategory));
45 
46 cl::opt<bool>
47 RemoveStores("frame-opt-rm-stores",
48   cl::init(FOP_NONE),
49   cl::desc("apply additional analysis to remove stores (experimental)"),
50   cl::init(false),
51   cl::ZeroOrMore,
52   cl::cat(BoltOptCategory));
53 
54 } // namespace opts
55 
56 namespace llvm {
57 namespace bolt {
58 
59 void FrameOptimizerPass::removeUnnecessaryLoads(const RegAnalysis &RA,
60                                                 const FrameAnalysis &FA,
61                                                 BinaryFunction &BF) {
62   StackAvailableExpressions SAE(RA, FA, BF);
63   SAE.run();
64 
65   LLVM_DEBUG(dbgs() << "Performing unnecessary loads removal\n");
66   std::deque<std::pair<BinaryBasicBlock *, MCInst *>> ToErase;
67   bool Changed = false;
68   const auto ExprEnd = SAE.expr_end();
69   MCPlusBuilder *MIB = BF.getBinaryContext().MIB.get();
70   for (BinaryBasicBlock &BB : BF) {
71     LLVM_DEBUG(dbgs() << "\tNow at BB " << BB.getName() << "\n");
72     const MCInst *Prev = nullptr;
73     for (MCInst &Inst : BB) {
74       LLVM_DEBUG({
75         dbgs() << "\t\tNow at ";
76         Inst.dump();
77         for (auto I = Prev ? SAE.expr_begin(*Prev) : SAE.expr_begin(BB);
78              I != ExprEnd; ++I) {
79           dbgs() << "\t\t\tReached by: ";
80           (*I)->dump();
81         }
82       });
83       // if Inst is a load from stack and the current available expressions show
84       // this value is available in a register or immediate, replace this load
85       // with move from register or from immediate.
86       ErrorOr<const FrameIndexEntry &> FIEX = FA.getFIEFor(Inst);
87       if (!FIEX) {
88         Prev = &Inst;
89         continue;
90       }
91       // FIXME: Change to remove IsSimple == 0. We're being conservative here,
92       // but once replaceMemOperandWithReg is ready, we should feed it with all
93       // sorts of complex instructions.
94       if (FIEX->IsLoad == false || FIEX->IsSimple == false ||
95           FIEX->StackOffset >= 0) {
96         Prev = &Inst;
97         continue;
98       }
99 
100       for (auto I = Prev ? SAE.expr_begin(*Prev) : SAE.expr_begin(BB);
101            I != ExprEnd; ++I) {
102         const MCInst *AvailableInst = *I;
103         ErrorOr<const FrameIndexEntry &> FIEY = FA.getFIEFor(*AvailableInst);
104         if (!FIEY)
105           continue;
106         assert(FIEY->IsStore && FIEY->IsSimple);
107         if (FIEX->StackOffset != FIEY->StackOffset || FIEX->Size != FIEY->Size)
108           continue;
109         // TODO: Change push/pops to stack adjustment instruction
110         if (MIB->isPop(Inst))
111           continue;
112 
113         ++NumRedundantLoads;
114         Changed = true;
115         LLVM_DEBUG(dbgs() << "Redundant load instruction: ");
116         LLVM_DEBUG(Inst.dump());
117         LLVM_DEBUG(dbgs() << "Related store instruction: ");
118         LLVM_DEBUG(AvailableInst->dump());
119         LLVM_DEBUG(dbgs() << "@BB: " << BB.getName() << "\n");
120         // Replace load
121         if (FIEY->IsStoreFromReg) {
122           if (!MIB->replaceMemOperandWithReg(Inst, FIEY->RegOrImm)) {
123             LLVM_DEBUG(dbgs() << "FAILED to change operand to a reg\n");
124             break;
125           }
126           ++NumLoadsChangedToReg;
127           MIB->removeAnnotation(Inst, "FrameAccessEntry");
128           LLVM_DEBUG(dbgs() << "Changed operand to a reg\n");
129           if (MIB->isRedundantMove(Inst)) {
130             ++NumLoadsDeleted;
131             LLVM_DEBUG(dbgs() << "Created a redundant move\n");
132             // Delete it!
133             ToErase.push_front(std::make_pair(&BB, &Inst));
134           }
135         } else {
136           char Buf[8] = {0, 0, 0, 0, 0, 0, 0, 0};
137           support::ulittle64_t::ref(Buf + 0) = FIEY->RegOrImm;
138           LLVM_DEBUG(dbgs() << "Changing operand to an imm... ");
139           if (!MIB->replaceMemOperandWithImm(Inst, StringRef(Buf, 8), 0)) {
140             LLVM_DEBUG(dbgs() << "FAILED\n");
141           } else {
142             ++NumLoadsChangedToImm;
143             MIB->removeAnnotation(Inst, "FrameAccessEntry");
144             LLVM_DEBUG(dbgs() << "Ok\n");
145           }
146         }
147         LLVM_DEBUG(dbgs() << "Changed to: ");
148         LLVM_DEBUG(Inst.dump());
149         break;
150       }
151       Prev = &Inst;
152     }
153   }
154   if (Changed) {
155     LLVM_DEBUG(dbgs() << "FOP modified \"" << BF.getPrintName() << "\"\n");
156   }
157   // TODO: Implement an interface of eraseInstruction that works out the
158   // complete list of elements to remove.
159   for (std::pair<BinaryBasicBlock *, MCInst *> I : ToErase) {
160     I.first->eraseInstruction(I.first->findInstruction(I.second));
161   }
162 }
163 
164 void FrameOptimizerPass::removeUnusedStores(const FrameAnalysis &FA,
165                                             BinaryFunction &BF) {
166   StackReachingUses SRU(FA, BF);
167   SRU.run();
168 
169   LLVM_DEBUG(dbgs() << "Performing unused stores removal\n");
170   std::vector<std::pair<BinaryBasicBlock *, MCInst *>> ToErase;
171   bool Changed = false;
172   for (BinaryBasicBlock &BB : BF) {
173     LLVM_DEBUG(dbgs() << "\tNow at BB " << BB.getName() << "\n");
174     const MCInst *Prev = nullptr;
175     for (auto I = BB.rbegin(), E = BB.rend(); I != E; ++I) {
176       MCInst &Inst = *I;
177       LLVM_DEBUG({
178         dbgs() << "\t\tNow at ";
179         Inst.dump();
180         for (auto I = Prev ? SRU.expr_begin(*Prev) : SRU.expr_begin(BB);
181              I != SRU.expr_end(); ++I) {
182           dbgs() << "\t\t\tReached by: ";
183           (*I)->dump();
184         }
185       });
186       ErrorOr<const FrameIndexEntry &> FIEX = FA.getFIEFor(Inst);
187       if (!FIEX) {
188         Prev = &Inst;
189         continue;
190       }
191       if (FIEX->IsLoad || !FIEX->IsSimple || FIEX->StackOffset >= 0) {
192         Prev = &Inst;
193         continue;
194       }
195 
196       if (SRU.isStoreUsed(*FIEX,
197                           Prev ? SRU.expr_begin(*Prev) : SRU.expr_begin(BB))) {
198         Prev = &Inst;
199         continue;
200       }
201       // TODO: Change push/pops to stack adjustment instruction
202       if (BF.getBinaryContext().MIB->isPush(Inst))
203         continue;
204 
205       ++NumRedundantStores;
206       Changed = true;
207       LLVM_DEBUG(dbgs() << "Unused store instruction: ");
208       LLVM_DEBUG(Inst.dump());
209       LLVM_DEBUG(dbgs() << "@BB: " << BB.getName() << "\n");
210       LLVM_DEBUG(dbgs() << "FIE offset = " << FIEX->StackOffset
211                         << " size = " << (int)FIEX->Size << "\n");
212       // Delete it!
213       ToErase.emplace_back(&BB, &Inst);
214       Prev = &Inst;
215     }
216   }
217 
218   for (std::pair<BinaryBasicBlock *, MCInst *> I : ToErase) {
219     I.first->eraseInstruction(I.first->findInstruction(I.second));
220   }
221   if (Changed) {
222     LLVM_DEBUG(dbgs() << "FOP modified \"" << BF.getPrintName() << "\"\n");
223   }
224 }
225 
226 void FrameOptimizerPass::runOnFunctions(BinaryContext &BC) {
227   if (opts::FrameOptimization == FOP_NONE)
228     return;
229 
230   std::unique_ptr<BinaryFunctionCallGraph> CG;
231   std::unique_ptr<FrameAnalysis> FA;
232   std::unique_ptr<RegAnalysis> RA;
233 
234   {
235     NamedRegionTimer T1("callgraph", "create call graph", "FOP",
236                         "FOP breakdown", opts::TimeOpts);
237     CG = std::make_unique<BinaryFunctionCallGraph>(buildCallGraph(BC));
238   }
239 
240   {
241     NamedRegionTimer T1("frameanalysis", "frame analysis", "FOP",
242                         "FOP breakdown", opts::TimeOpts);
243     FA = std::make_unique<FrameAnalysis>(BC, *CG);
244   }
245 
246   {
247     NamedRegionTimer T1("reganalysis", "reg analysis", "FOP", "FOP breakdown",
248                         opts::TimeOpts);
249     RA = std::make_unique<RegAnalysis>(BC, &BC.getBinaryFunctions(), CG.get());
250   }
251 
252   // Perform caller-saved register optimizations, then callee-saved register
253   // optimizations (shrink wrapping)
254   for (auto &I : BC.getBinaryFunctions()) {
255     if (!FA->hasFrameInfo(I.second))
256       continue;
257     // Restrict pass execution if user asked to only run on hot functions
258     if (opts::FrameOptimization == FOP_HOT) {
259       if (I.second.getKnownExecutionCount() < BC.getHotThreshold())
260         continue;
261       LLVM_DEBUG(
262           dbgs() << "Considering " << I.second.getPrintName()
263                  << " for frame optimizations because its execution count ( "
264                  << I.second.getKnownExecutionCount()
265                  << " ) exceeds our hotness threshold ( "
266                  << BC.getHotThreshold() << " )\n");
267     }
268 
269     {
270       NamedRegionTimer T1("removeloads", "remove loads", "FOP", "FOP breakdown",
271                           opts::TimeOpts);
272       removeUnnecessaryLoads(*RA, *FA, I.second);
273     }
274 
275     if (opts::RemoveStores) {
276       NamedRegionTimer T1("removestores", "remove stores", "FOP",
277                           "FOP breakdown", opts::TimeOpts);
278       removeUnusedStores(*FA, I.second);
279     }
280     // Don't even start shrink wrapping if no profiling info is available
281     if (I.second.getKnownExecutionCount() == 0)
282       continue;
283   }
284 
285   {
286     NamedRegionTimer T1("shrinkwrapping", "shrink wrapping", "FOP",
287                         "FOP breakdown", opts::TimeOpts);
288     performShrinkWrapping(*RA, *FA, BC);
289   }
290 
291   outs() << "BOLT-INFO: FOP optimized " << NumRedundantLoads
292          << " redundant load(s) and " << NumRedundantStores
293          << " unused store(s)\n";
294   outs() << "BOLT-INFO: FOP changed " << NumLoadsChangedToReg
295          << " load(s) to use a register instead of a stack access, and "
296          << NumLoadsChangedToImm << " to use an immediate.\n"
297          << "BOLT-INFO: FOP deleted " << NumLoadsDeleted << " load(s) and "
298          << NumRedundantStores << " store(s).\n";
299   FA->printStats();
300   ShrinkWrapping::printStats();
301 }
302 
303 void FrameOptimizerPass::performShrinkWrapping(const RegAnalysis &RA,
304                                                const FrameAnalysis &FA,
305                                                BinaryContext &BC) {
306   // Initialize necessary annotations to allow safe parallel accesses to
307   // annotation index in MIB
308   BC.MIB->getOrCreateAnnotationIndex(CalleeSavedAnalysis::getSaveTagName());
309   BC.MIB->getOrCreateAnnotationIndex(CalleeSavedAnalysis::getRestoreTagName());
310   BC.MIB->getOrCreateAnnotationIndex(StackLayoutModifier::getTodoTagName());
311   BC.MIB->getOrCreateAnnotationIndex(StackLayoutModifier::getSlotTagName());
312   BC.MIB->getOrCreateAnnotationIndex(
313       StackLayoutModifier::getOffsetCFIRegTagName());
314   BC.MIB->getOrCreateAnnotationIndex("ReachingDefs");
315   BC.MIB->getOrCreateAnnotationIndex("ReachingUses");
316   BC.MIB->getOrCreateAnnotationIndex("LivenessAnalysis");
317   BC.MIB->getOrCreateAnnotationIndex("StackReachingUses");
318   BC.MIB->getOrCreateAnnotationIndex("PostDominatorAnalysis");
319   BC.MIB->getOrCreateAnnotationIndex("DominatorAnalysis");
320   BC.MIB->getOrCreateAnnotationIndex("StackPointerTracking");
321   BC.MIB->getOrCreateAnnotationIndex("StackPointerTrackingForInternalCalls");
322   BC.MIB->getOrCreateAnnotationIndex("StackAvailableExpressions");
323   BC.MIB->getOrCreateAnnotationIndex("StackAllocationAnalysis");
324   BC.MIB->getOrCreateAnnotationIndex("ShrinkWrap-Todo");
325   BC.MIB->getOrCreateAnnotationIndex("PredictiveStackPointerTracking");
326   BC.MIB->getOrCreateAnnotationIndex("ReachingInsnsBackward");
327   BC.MIB->getOrCreateAnnotationIndex("ReachingInsns");
328   BC.MIB->getOrCreateAnnotationIndex("AccessesDeletedPos");
329   BC.MIB->getOrCreateAnnotationIndex("DeleteMe");
330 
331   ParallelUtilities::PredicateTy SkipPredicate = [&](const BinaryFunction &BF) {
332     if (!FA.hasFrameInfo(BF))
333       return true;
334 
335     if (opts::FrameOptimization == FOP_HOT &&
336         (BF.getKnownExecutionCount() < BC.getHotThreshold()))
337       return true;
338 
339     if (BF.getKnownExecutionCount() == 0)
340       return true;
341 
342     return false;
343   };
344 
345   ParallelUtilities::WorkFuncWithAllocTy WorkFunction =
346       [&](BinaryFunction &BF, MCPlusBuilder::AllocatorIdTy AllocatorId) {
347         DataflowInfoManager Info(BF, &RA, &FA, AllocatorId);
348         ShrinkWrapping SW(FA, BF, Info, AllocatorId);
349 
350         if (SW.perform()) {
351           std::lock_guard<std::mutex> Lock(FuncsChangedMutex);
352           FuncsChanged.insert(&BF);
353         }
354       };
355 
356   ParallelUtilities::runOnEachFunctionWithUniqueAllocId(
357       BC, ParallelUtilities::SchedulingPolicy::SP_INST_QUADRATIC, WorkFunction,
358       SkipPredicate, "shrink-wrapping");
359 }
360 
361 } // namespace bolt
362 } // namespace llvm
363