1 //===- bolt/Passes/FrameOptimizer.cpp -------------------------------------===//
2 //
3 // Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
4 // See https://llvm.org/LICENSE.txt for license information.
5 // SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
6 //
7 //===----------------------------------------------------------------------===//
8 //
9 // This file implements the FrameOptimizerPass class.
10 //
11 //===----------------------------------------------------------------------===//
12 
13 #include "bolt/Passes/FrameOptimizer.h"
14 #include "bolt/Core/ParallelUtilities.h"
15 #include "bolt/Passes/BinaryFunctionCallGraph.h"
16 #include "bolt/Passes/DataflowInfoManager.h"
17 #include "bolt/Passes/ShrinkWrapping.h"
18 #include "bolt/Passes/StackAvailableExpressions.h"
19 #include "bolt/Passes/StackReachingUses.h"
20 #include "llvm/Support/Timer.h"
21 #include <deque>
22 #include <unordered_map>
23 
24 #define DEBUG_TYPE "fop"
25 
26 using namespace llvm;
27 
28 namespace opts {
29 extern cl::opt<unsigned> Verbosity;
30 extern cl::opt<bool> TimeOpts;
31 extern cl::OptionCategory BoltOptCategory;
32 
33 using namespace bolt;
34 
35 cl::opt<FrameOptimizationType>
36 FrameOptimization("frame-opt",
37   cl::init(FOP_NONE),
38   cl::desc("optimize stack frame accesses"),
39   cl::values(
40     clEnumValN(FOP_NONE, "none", "do not perform frame optimization"),
41     clEnumValN(FOP_HOT, "hot", "perform FOP on hot functions"),
42     clEnumValN(FOP_ALL, "all", "perform FOP on all functions")),
43   cl::ZeroOrMore,
44   cl::cat(BoltOptCategory));
45 
46 cl::opt<bool> RemoveStores(
47     "frame-opt-rm-stores", cl::init(FOP_NONE),
48     cl::desc("apply additional analysis to remove stores (experimental)"),
49     cl::cat(BoltOptCategory));
50 
51 } // namespace opts
52 
53 namespace llvm {
54 namespace bolt {
55 
56 void FrameOptimizerPass::removeUnnecessaryLoads(const RegAnalysis &RA,
57                                                 const FrameAnalysis &FA,
58                                                 BinaryFunction &BF) {
59   StackAvailableExpressions SAE(RA, FA, BF);
60   SAE.run();
61 
62   LLVM_DEBUG(dbgs() << "Performing unnecessary loads removal\n");
63   std::deque<std::pair<BinaryBasicBlock *, MCInst *>> ToErase;
64   bool Changed = false;
65   const auto ExprEnd = SAE.expr_end();
66   MCPlusBuilder *MIB = BF.getBinaryContext().MIB.get();
67   for (BinaryBasicBlock &BB : BF) {
68     LLVM_DEBUG(dbgs() << "\tNow at BB " << BB.getName() << "\n");
69     const MCInst *Prev = nullptr;
70     for (MCInst &Inst : BB) {
71       LLVM_DEBUG({
72         dbgs() << "\t\tNow at ";
73         Inst.dump();
74         for (auto I = Prev ? SAE.expr_begin(*Prev) : SAE.expr_begin(BB);
75              I != ExprEnd; ++I) {
76           dbgs() << "\t\t\tReached by: ";
77           (*I)->dump();
78         }
79       });
80       // if Inst is a load from stack and the current available expressions show
81       // this value is available in a register or immediate, replace this load
82       // with move from register or from immediate.
83       ErrorOr<const FrameIndexEntry &> FIEX = FA.getFIEFor(Inst);
84       if (!FIEX) {
85         Prev = &Inst;
86         continue;
87       }
88       // FIXME: Change to remove IsSimple == 0. We're being conservative here,
89       // but once replaceMemOperandWithReg is ready, we should feed it with all
90       // sorts of complex instructions.
91       if (FIEX->IsLoad == false || FIEX->IsSimple == false ||
92           FIEX->StackOffset >= 0) {
93         Prev = &Inst;
94         continue;
95       }
96 
97       for (auto I = Prev ? SAE.expr_begin(*Prev) : SAE.expr_begin(BB);
98            I != ExprEnd; ++I) {
99         const MCInst *AvailableInst = *I;
100         ErrorOr<const FrameIndexEntry &> FIEY = FA.getFIEFor(*AvailableInst);
101         if (!FIEY)
102           continue;
103         assert(FIEY->IsStore && FIEY->IsSimple);
104         if (FIEX->StackOffset != FIEY->StackOffset || FIEX->Size != FIEY->Size)
105           continue;
106         // TODO: Change push/pops to stack adjustment instruction
107         if (MIB->isPop(Inst))
108           continue;
109 
110         ++NumRedundantLoads;
111         Changed = true;
112         LLVM_DEBUG(dbgs() << "Redundant load instruction: ");
113         LLVM_DEBUG(Inst.dump());
114         LLVM_DEBUG(dbgs() << "Related store instruction: ");
115         LLVM_DEBUG(AvailableInst->dump());
116         LLVM_DEBUG(dbgs() << "@BB: " << BB.getName() << "\n");
117         // Replace load
118         if (FIEY->IsStoreFromReg) {
119           if (!MIB->replaceMemOperandWithReg(Inst, FIEY->RegOrImm)) {
120             LLVM_DEBUG(dbgs() << "FAILED to change operand to a reg\n");
121             break;
122           }
123           ++NumLoadsChangedToReg;
124           MIB->removeAnnotation(Inst, "FrameAccessEntry");
125           LLVM_DEBUG(dbgs() << "Changed operand to a reg\n");
126           if (MIB->isRedundantMove(Inst)) {
127             ++NumLoadsDeleted;
128             LLVM_DEBUG(dbgs() << "Created a redundant move\n");
129             // Delete it!
130             ToErase.push_front(std::make_pair(&BB, &Inst));
131           }
132         } else {
133           char Buf[8] = {0, 0, 0, 0, 0, 0, 0, 0};
134           support::ulittle64_t::ref(Buf + 0) = FIEY->RegOrImm;
135           LLVM_DEBUG(dbgs() << "Changing operand to an imm... ");
136           if (!MIB->replaceMemOperandWithImm(Inst, StringRef(Buf, 8), 0)) {
137             LLVM_DEBUG(dbgs() << "FAILED\n");
138           } else {
139             ++NumLoadsChangedToImm;
140             MIB->removeAnnotation(Inst, "FrameAccessEntry");
141             LLVM_DEBUG(dbgs() << "Ok\n");
142           }
143         }
144         LLVM_DEBUG(dbgs() << "Changed to: ");
145         LLVM_DEBUG(Inst.dump());
146         break;
147       }
148       Prev = &Inst;
149     }
150   }
151   if (Changed)
152     LLVM_DEBUG(dbgs() << "FOP modified \"" << BF.getPrintName() << "\"\n");
153 
154   // TODO: Implement an interface of eraseInstruction that works out the
155   // complete list of elements to remove.
156   for (std::pair<BinaryBasicBlock *, MCInst *> I : ToErase)
157     I.first->eraseInstruction(I.first->findInstruction(I.second));
158 }
159 
160 void FrameOptimizerPass::removeUnusedStores(const FrameAnalysis &FA,
161                                             BinaryFunction &BF) {
162   StackReachingUses SRU(FA, BF);
163   SRU.run();
164 
165   LLVM_DEBUG(dbgs() << "Performing unused stores removal\n");
166   std::vector<std::pair<BinaryBasicBlock *, MCInst *>> ToErase;
167   bool Changed = false;
168   for (BinaryBasicBlock &BB : BF) {
169     LLVM_DEBUG(dbgs() << "\tNow at BB " << BB.getName() << "\n");
170     const MCInst *Prev = nullptr;
171     for (auto I = BB.rbegin(), E = BB.rend(); I != E; ++I) {
172       MCInst &Inst = *I;
173       LLVM_DEBUG({
174         dbgs() << "\t\tNow at ";
175         Inst.dump();
176         for (auto I = Prev ? SRU.expr_begin(*Prev) : SRU.expr_begin(BB);
177              I != SRU.expr_end(); ++I) {
178           dbgs() << "\t\t\tReached by: ";
179           (*I)->dump();
180         }
181       });
182       ErrorOr<const FrameIndexEntry &> FIEX = FA.getFIEFor(Inst);
183       if (!FIEX) {
184         Prev = &Inst;
185         continue;
186       }
187       if (FIEX->IsLoad || !FIEX->IsSimple || FIEX->StackOffset >= 0) {
188         Prev = &Inst;
189         continue;
190       }
191 
192       if (SRU.isStoreUsed(*FIEX,
193                           Prev ? SRU.expr_begin(*Prev) : SRU.expr_begin(BB))) {
194         Prev = &Inst;
195         continue;
196       }
197       // TODO: Change push/pops to stack adjustment instruction
198       if (BF.getBinaryContext().MIB->isPush(Inst))
199         continue;
200 
201       ++NumRedundantStores;
202       Changed = true;
203       LLVM_DEBUG(dbgs() << "Unused store instruction: ");
204       LLVM_DEBUG(Inst.dump());
205       LLVM_DEBUG(dbgs() << "@BB: " << BB.getName() << "\n");
206       LLVM_DEBUG(dbgs() << "FIE offset = " << FIEX->StackOffset
207                         << " size = " << (int)FIEX->Size << "\n");
208       // Delete it!
209       ToErase.emplace_back(&BB, &Inst);
210       Prev = &Inst;
211     }
212   }
213 
214   for (std::pair<BinaryBasicBlock *, MCInst *> I : ToErase)
215     I.first->eraseInstruction(I.first->findInstruction(I.second));
216 
217   if (Changed)
218     LLVM_DEBUG(dbgs() << "FOP modified \"" << BF.getPrintName() << "\"\n");
219 }
220 
221 void FrameOptimizerPass::runOnFunctions(BinaryContext &BC) {
222   if (opts::FrameOptimization == FOP_NONE)
223     return;
224 
225   std::unique_ptr<BinaryFunctionCallGraph> CG;
226   std::unique_ptr<FrameAnalysis> FA;
227   std::unique_ptr<RegAnalysis> RA;
228 
229   {
230     NamedRegionTimer T1("callgraph", "create call graph", "FOP",
231                         "FOP breakdown", opts::TimeOpts);
232     CG = std::make_unique<BinaryFunctionCallGraph>(buildCallGraph(BC));
233   }
234 
235   {
236     NamedRegionTimer T1("frameanalysis", "frame analysis", "FOP",
237                         "FOP breakdown", opts::TimeOpts);
238     FA = std::make_unique<FrameAnalysis>(BC, *CG);
239   }
240 
241   {
242     NamedRegionTimer T1("reganalysis", "reg analysis", "FOP", "FOP breakdown",
243                         opts::TimeOpts);
244     RA = std::make_unique<RegAnalysis>(BC, &BC.getBinaryFunctions(), CG.get());
245   }
246 
247   // Perform caller-saved register optimizations, then callee-saved register
248   // optimizations (shrink wrapping)
249   for (auto &I : BC.getBinaryFunctions()) {
250     if (!FA->hasFrameInfo(I.second))
251       continue;
252     // Restrict pass execution if user asked to only run on hot functions
253     if (opts::FrameOptimization == FOP_HOT) {
254       if (I.second.getKnownExecutionCount() < BC.getHotThreshold())
255         continue;
256       LLVM_DEBUG(
257           dbgs() << "Considering " << I.second.getPrintName()
258                  << " for frame optimizations because its execution count ( "
259                  << I.second.getKnownExecutionCount()
260                  << " ) exceeds our hotness threshold ( "
261                  << BC.getHotThreshold() << " )\n");
262     }
263 
264     {
265       NamedRegionTimer T1("removeloads", "remove loads", "FOP", "FOP breakdown",
266                           opts::TimeOpts);
267       removeUnnecessaryLoads(*RA, *FA, I.second);
268     }
269 
270     if (opts::RemoveStores) {
271       NamedRegionTimer T1("removestores", "remove stores", "FOP",
272                           "FOP breakdown", opts::TimeOpts);
273       removeUnusedStores(*FA, I.second);
274     }
275     // Don't even start shrink wrapping if no profiling info is available
276     if (I.second.getKnownExecutionCount() == 0)
277       continue;
278   }
279 
280   {
281     NamedRegionTimer T1("shrinkwrapping", "shrink wrapping", "FOP",
282                         "FOP breakdown", opts::TimeOpts);
283     performShrinkWrapping(*RA, *FA, BC);
284   }
285 
286   outs() << "BOLT-INFO: FOP optimized " << NumRedundantLoads
287          << " redundant load(s) and " << NumRedundantStores
288          << " unused store(s)\n";
289   outs() << "BOLT-INFO: FOP changed " << NumLoadsChangedToReg
290          << " load(s) to use a register instead of a stack access, and "
291          << NumLoadsChangedToImm << " to use an immediate.\n"
292          << "BOLT-INFO: FOP deleted " << NumLoadsDeleted << " load(s) and "
293          << NumRedundantStores << " store(s).\n";
294   FA->printStats();
295   ShrinkWrapping::printStats();
296 }
297 
298 void FrameOptimizerPass::performShrinkWrapping(const RegAnalysis &RA,
299                                                const FrameAnalysis &FA,
300                                                BinaryContext &BC) {
301   // Initialize necessary annotations to allow safe parallel accesses to
302   // annotation index in MIB
303   BC.MIB->getOrCreateAnnotationIndex(CalleeSavedAnalysis::getSaveTagName());
304   BC.MIB->getOrCreateAnnotationIndex(CalleeSavedAnalysis::getRestoreTagName());
305   BC.MIB->getOrCreateAnnotationIndex(StackLayoutModifier::getTodoTagName());
306   BC.MIB->getOrCreateAnnotationIndex(StackLayoutModifier::getSlotTagName());
307   BC.MIB->getOrCreateAnnotationIndex(
308       StackLayoutModifier::getOffsetCFIRegTagName());
309   BC.MIB->getOrCreateAnnotationIndex("ReachingDefs");
310   BC.MIB->getOrCreateAnnotationIndex("ReachingUses");
311   BC.MIB->getOrCreateAnnotationIndex("LivenessAnalysis");
312   BC.MIB->getOrCreateAnnotationIndex("StackReachingUses");
313   BC.MIB->getOrCreateAnnotationIndex("PostDominatorAnalysis");
314   BC.MIB->getOrCreateAnnotationIndex("DominatorAnalysis");
315   BC.MIB->getOrCreateAnnotationIndex("StackPointerTracking");
316   BC.MIB->getOrCreateAnnotationIndex("StackPointerTrackingForInternalCalls");
317   BC.MIB->getOrCreateAnnotationIndex("StackAvailableExpressions");
318   BC.MIB->getOrCreateAnnotationIndex("StackAllocationAnalysis");
319   BC.MIB->getOrCreateAnnotationIndex("ShrinkWrap-Todo");
320   BC.MIB->getOrCreateAnnotationIndex("PredictiveStackPointerTracking");
321   BC.MIB->getOrCreateAnnotationIndex("ReachingInsnsBackward");
322   BC.MIB->getOrCreateAnnotationIndex("ReachingInsns");
323   BC.MIB->getOrCreateAnnotationIndex("AccessesDeletedPos");
324   BC.MIB->getOrCreateAnnotationIndex("DeleteMe");
325 
326   ParallelUtilities::PredicateTy SkipPredicate = [&](const BinaryFunction &BF) {
327     if (!FA.hasFrameInfo(BF))
328       return true;
329 
330     if (opts::FrameOptimization == FOP_HOT &&
331         (BF.getKnownExecutionCount() < BC.getHotThreshold()))
332       return true;
333 
334     if (BF.getKnownExecutionCount() == 0)
335       return true;
336 
337     return false;
338   };
339 
340   ParallelUtilities::WorkFuncWithAllocTy WorkFunction =
341       [&](BinaryFunction &BF, MCPlusBuilder::AllocatorIdTy AllocatorId) {
342         DataflowInfoManager Info(BF, &RA, &FA, AllocatorId);
343         ShrinkWrapping SW(FA, BF, Info, AllocatorId);
344 
345         if (SW.perform()) {
346           std::lock_guard<std::mutex> Lock(FuncsChangedMutex);
347           FuncsChanged.insert(&BF);
348         }
349       };
350 
351   ParallelUtilities::runOnEachFunctionWithUniqueAllocId(
352       BC, ParallelUtilities::SchedulingPolicy::SP_INST_QUADRATIC, WorkFunction,
353       SkipPredicate, "shrink-wrapping");
354 }
355 
356 } // namespace bolt
357 } // namespace llvm
358