1 //===- bolt/Passes/FrameOptimizer.cpp -------------------------------------===// 2 // 3 // Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. 4 // See https://llvm.org/LICENSE.txt for license information. 5 // SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception 6 // 7 //===----------------------------------------------------------------------===// 8 // 9 // This file implements the FrameOptimizerPass class. 10 // 11 //===----------------------------------------------------------------------===// 12 13 #include "bolt/Passes/FrameOptimizer.h" 14 #include "bolt/Core/ParallelUtilities.h" 15 #include "bolt/Passes/BinaryFunctionCallGraph.h" 16 #include "bolt/Passes/DataflowInfoManager.h" 17 #include "bolt/Passes/ShrinkWrapping.h" 18 #include "bolt/Passes/StackAvailableExpressions.h" 19 #include "bolt/Passes/StackReachingUses.h" 20 #include "llvm/Support/Timer.h" 21 #include <deque> 22 #include <unordered_map> 23 24 #define DEBUG_TYPE "fop" 25 26 using namespace llvm; 27 28 namespace opts { 29 extern cl::opt<unsigned> Verbosity; 30 extern cl::opt<bool> TimeOpts; 31 extern cl::OptionCategory BoltOptCategory; 32 33 using namespace bolt; 34 35 cl::opt<FrameOptimizationType> 36 FrameOptimization("frame-opt", 37 cl::init(FOP_NONE), 38 cl::desc("optimize stack frame accesses"), 39 cl::values( 40 clEnumValN(FOP_NONE, "none", "do not perform frame optimization"), 41 clEnumValN(FOP_HOT, "hot", "perform FOP on hot functions"), 42 clEnumValN(FOP_ALL, "all", "perform FOP on all functions")), 43 cl::ZeroOrMore, 44 cl::cat(BoltOptCategory)); 45 46 cl::opt<bool> RemoveStores( 47 "frame-opt-rm-stores", cl::init(FOP_NONE), 48 cl::desc("apply additional analysis to remove stores (experimental)"), 49 cl::cat(BoltOptCategory)); 50 51 } // namespace opts 52 53 namespace llvm { 54 namespace bolt { 55 56 void FrameOptimizerPass::removeUnnecessaryLoads(const RegAnalysis &RA, 57 const FrameAnalysis &FA, 58 BinaryFunction &BF) { 59 StackAvailableExpressions SAE(RA, FA, BF); 60 SAE.run(); 61 62 LLVM_DEBUG(dbgs() << "Performing unnecessary loads removal\n"); 63 std::deque<std::pair<BinaryBasicBlock *, MCInst *>> ToErase; 64 bool Changed = false; 65 const auto ExprEnd = SAE.expr_end(); 66 MCPlusBuilder *MIB = BF.getBinaryContext().MIB.get(); 67 for (BinaryBasicBlock &BB : BF) { 68 LLVM_DEBUG(dbgs() << "\tNow at BB " << BB.getName() << "\n"); 69 const MCInst *Prev = nullptr; 70 for (MCInst &Inst : BB) { 71 LLVM_DEBUG({ 72 dbgs() << "\t\tNow at "; 73 Inst.dump(); 74 for (auto I = Prev ? SAE.expr_begin(*Prev) : SAE.expr_begin(BB); 75 I != ExprEnd; ++I) { 76 dbgs() << "\t\t\tReached by: "; 77 (*I)->dump(); 78 } 79 }); 80 // if Inst is a load from stack and the current available expressions show 81 // this value is available in a register or immediate, replace this load 82 // with move from register or from immediate. 83 ErrorOr<const FrameIndexEntry &> FIEX = FA.getFIEFor(Inst); 84 if (!FIEX) { 85 Prev = &Inst; 86 continue; 87 } 88 // FIXME: Change to remove IsSimple == 0. We're being conservative here, 89 // but once replaceMemOperandWithReg is ready, we should feed it with all 90 // sorts of complex instructions. 91 if (FIEX->IsLoad == false || FIEX->IsSimple == false || 92 FIEX->StackOffset >= 0) { 93 Prev = &Inst; 94 continue; 95 } 96 97 for (auto I = Prev ? SAE.expr_begin(*Prev) : SAE.expr_begin(BB); 98 I != ExprEnd; ++I) { 99 const MCInst *AvailableInst = *I; 100 ErrorOr<const FrameIndexEntry &> FIEY = FA.getFIEFor(*AvailableInst); 101 if (!FIEY) 102 continue; 103 assert(FIEY->IsStore && FIEY->IsSimple); 104 if (FIEX->StackOffset != FIEY->StackOffset || FIEX->Size != FIEY->Size) 105 continue; 106 // TODO: Change push/pops to stack adjustment instruction 107 if (MIB->isPop(Inst)) 108 continue; 109 110 ++NumRedundantLoads; 111 Changed = true; 112 LLVM_DEBUG(dbgs() << "Redundant load instruction: "); 113 LLVM_DEBUG(Inst.dump()); 114 LLVM_DEBUG(dbgs() << "Related store instruction: "); 115 LLVM_DEBUG(AvailableInst->dump()); 116 LLVM_DEBUG(dbgs() << "@BB: " << BB.getName() << "\n"); 117 // Replace load 118 if (FIEY->IsStoreFromReg) { 119 if (!MIB->replaceMemOperandWithReg(Inst, FIEY->RegOrImm)) { 120 LLVM_DEBUG(dbgs() << "FAILED to change operand to a reg\n"); 121 break; 122 } 123 ++NumLoadsChangedToReg; 124 MIB->removeAnnotation(Inst, "FrameAccessEntry"); 125 LLVM_DEBUG(dbgs() << "Changed operand to a reg\n"); 126 if (MIB->isRedundantMove(Inst)) { 127 ++NumLoadsDeleted; 128 LLVM_DEBUG(dbgs() << "Created a redundant move\n"); 129 // Delete it! 130 ToErase.push_front(std::make_pair(&BB, &Inst)); 131 } 132 } else { 133 char Buf[8] = {0, 0, 0, 0, 0, 0, 0, 0}; 134 support::ulittle64_t::ref(Buf + 0) = FIEY->RegOrImm; 135 LLVM_DEBUG(dbgs() << "Changing operand to an imm... "); 136 if (!MIB->replaceMemOperandWithImm(Inst, StringRef(Buf, 8), 0)) { 137 LLVM_DEBUG(dbgs() << "FAILED\n"); 138 } else { 139 ++NumLoadsChangedToImm; 140 MIB->removeAnnotation(Inst, "FrameAccessEntry"); 141 LLVM_DEBUG(dbgs() << "Ok\n"); 142 } 143 } 144 LLVM_DEBUG(dbgs() << "Changed to: "); 145 LLVM_DEBUG(Inst.dump()); 146 break; 147 } 148 Prev = &Inst; 149 } 150 } 151 if (Changed) 152 LLVM_DEBUG(dbgs() << "FOP modified \"" << BF.getPrintName() << "\"\n"); 153 154 // TODO: Implement an interface of eraseInstruction that works out the 155 // complete list of elements to remove. 156 for (std::pair<BinaryBasicBlock *, MCInst *> I : ToErase) 157 I.first->eraseInstruction(I.first->findInstruction(I.second)); 158 } 159 160 void FrameOptimizerPass::removeUnusedStores(const FrameAnalysis &FA, 161 BinaryFunction &BF) { 162 StackReachingUses SRU(FA, BF); 163 SRU.run(); 164 165 LLVM_DEBUG(dbgs() << "Performing unused stores removal\n"); 166 std::vector<std::pair<BinaryBasicBlock *, MCInst *>> ToErase; 167 bool Changed = false; 168 for (BinaryBasicBlock &BB : BF) { 169 LLVM_DEBUG(dbgs() << "\tNow at BB " << BB.getName() << "\n"); 170 const MCInst *Prev = nullptr; 171 for (auto I = BB.rbegin(), E = BB.rend(); I != E; ++I) { 172 MCInst &Inst = *I; 173 LLVM_DEBUG({ 174 dbgs() << "\t\tNow at "; 175 Inst.dump(); 176 for (auto I = Prev ? SRU.expr_begin(*Prev) : SRU.expr_begin(BB); 177 I != SRU.expr_end(); ++I) { 178 dbgs() << "\t\t\tReached by: "; 179 (*I)->dump(); 180 } 181 }); 182 ErrorOr<const FrameIndexEntry &> FIEX = FA.getFIEFor(Inst); 183 if (!FIEX) { 184 Prev = &Inst; 185 continue; 186 } 187 if (FIEX->IsLoad || !FIEX->IsSimple || FIEX->StackOffset >= 0) { 188 Prev = &Inst; 189 continue; 190 } 191 192 if (SRU.isStoreUsed(*FIEX, 193 Prev ? SRU.expr_begin(*Prev) : SRU.expr_begin(BB))) { 194 Prev = &Inst; 195 continue; 196 } 197 // TODO: Change push/pops to stack adjustment instruction 198 if (BF.getBinaryContext().MIB->isPush(Inst)) 199 continue; 200 201 ++NumRedundantStores; 202 Changed = true; 203 LLVM_DEBUG(dbgs() << "Unused store instruction: "); 204 LLVM_DEBUG(Inst.dump()); 205 LLVM_DEBUG(dbgs() << "@BB: " << BB.getName() << "\n"); 206 LLVM_DEBUG(dbgs() << "FIE offset = " << FIEX->StackOffset 207 << " size = " << (int)FIEX->Size << "\n"); 208 // Delete it! 209 ToErase.emplace_back(&BB, &Inst); 210 Prev = &Inst; 211 } 212 } 213 214 for (std::pair<BinaryBasicBlock *, MCInst *> I : ToErase) 215 I.first->eraseInstruction(I.first->findInstruction(I.second)); 216 217 if (Changed) 218 LLVM_DEBUG(dbgs() << "FOP modified \"" << BF.getPrintName() << "\"\n"); 219 } 220 221 void FrameOptimizerPass::runOnFunctions(BinaryContext &BC) { 222 if (opts::FrameOptimization == FOP_NONE) 223 return; 224 225 std::unique_ptr<BinaryFunctionCallGraph> CG; 226 std::unique_ptr<FrameAnalysis> FA; 227 std::unique_ptr<RegAnalysis> RA; 228 229 { 230 NamedRegionTimer T1("callgraph", "create call graph", "FOP", 231 "FOP breakdown", opts::TimeOpts); 232 CG = std::make_unique<BinaryFunctionCallGraph>(buildCallGraph(BC)); 233 } 234 235 { 236 NamedRegionTimer T1("frameanalysis", "frame analysis", "FOP", 237 "FOP breakdown", opts::TimeOpts); 238 FA = std::make_unique<FrameAnalysis>(BC, *CG); 239 } 240 241 { 242 NamedRegionTimer T1("reganalysis", "reg analysis", "FOP", "FOP breakdown", 243 opts::TimeOpts); 244 RA = std::make_unique<RegAnalysis>(BC, &BC.getBinaryFunctions(), CG.get()); 245 } 246 247 // Perform caller-saved register optimizations, then callee-saved register 248 // optimizations (shrink wrapping) 249 for (auto &I : BC.getBinaryFunctions()) { 250 if (!FA->hasFrameInfo(I.second)) 251 continue; 252 // Restrict pass execution if user asked to only run on hot functions 253 if (opts::FrameOptimization == FOP_HOT) { 254 if (I.second.getKnownExecutionCount() < BC.getHotThreshold()) 255 continue; 256 LLVM_DEBUG( 257 dbgs() << "Considering " << I.second.getPrintName() 258 << " for frame optimizations because its execution count ( " 259 << I.second.getKnownExecutionCount() 260 << " ) exceeds our hotness threshold ( " 261 << BC.getHotThreshold() << " )\n"); 262 } 263 264 { 265 NamedRegionTimer T1("removeloads", "remove loads", "FOP", "FOP breakdown", 266 opts::TimeOpts); 267 removeUnnecessaryLoads(*RA, *FA, I.second); 268 } 269 270 if (opts::RemoveStores) { 271 NamedRegionTimer T1("removestores", "remove stores", "FOP", 272 "FOP breakdown", opts::TimeOpts); 273 removeUnusedStores(*FA, I.second); 274 } 275 // Don't even start shrink wrapping if no profiling info is available 276 if (I.second.getKnownExecutionCount() == 0) 277 continue; 278 } 279 280 { 281 NamedRegionTimer T1("shrinkwrapping", "shrink wrapping", "FOP", 282 "FOP breakdown", opts::TimeOpts); 283 performShrinkWrapping(*RA, *FA, BC); 284 } 285 286 outs() << "BOLT-INFO: FOP optimized " << NumRedundantLoads 287 << " redundant load(s) and " << NumRedundantStores 288 << " unused store(s)\n"; 289 outs() << "BOLT-INFO: FOP changed " << NumLoadsChangedToReg 290 << " load(s) to use a register instead of a stack access, and " 291 << NumLoadsChangedToImm << " to use an immediate.\n" 292 << "BOLT-INFO: FOP deleted " << NumLoadsDeleted << " load(s) and " 293 << NumRedundantStores << " store(s).\n"; 294 FA->printStats(); 295 ShrinkWrapping::printStats(); 296 } 297 298 void FrameOptimizerPass::performShrinkWrapping(const RegAnalysis &RA, 299 const FrameAnalysis &FA, 300 BinaryContext &BC) { 301 // Initialize necessary annotations to allow safe parallel accesses to 302 // annotation index in MIB 303 BC.MIB->getOrCreateAnnotationIndex(CalleeSavedAnalysis::getSaveTagName()); 304 BC.MIB->getOrCreateAnnotationIndex(CalleeSavedAnalysis::getRestoreTagName()); 305 BC.MIB->getOrCreateAnnotationIndex(StackLayoutModifier::getTodoTagName()); 306 BC.MIB->getOrCreateAnnotationIndex(StackLayoutModifier::getSlotTagName()); 307 BC.MIB->getOrCreateAnnotationIndex( 308 StackLayoutModifier::getOffsetCFIRegTagName()); 309 BC.MIB->getOrCreateAnnotationIndex("ReachingDefs"); 310 BC.MIB->getOrCreateAnnotationIndex("ReachingUses"); 311 BC.MIB->getOrCreateAnnotationIndex("LivenessAnalysis"); 312 BC.MIB->getOrCreateAnnotationIndex("StackReachingUses"); 313 BC.MIB->getOrCreateAnnotationIndex("PostDominatorAnalysis"); 314 BC.MIB->getOrCreateAnnotationIndex("DominatorAnalysis"); 315 BC.MIB->getOrCreateAnnotationIndex("StackPointerTracking"); 316 BC.MIB->getOrCreateAnnotationIndex("StackPointerTrackingForInternalCalls"); 317 BC.MIB->getOrCreateAnnotationIndex("StackAvailableExpressions"); 318 BC.MIB->getOrCreateAnnotationIndex("StackAllocationAnalysis"); 319 BC.MIB->getOrCreateAnnotationIndex("ShrinkWrap-Todo"); 320 BC.MIB->getOrCreateAnnotationIndex("PredictiveStackPointerTracking"); 321 BC.MIB->getOrCreateAnnotationIndex("ReachingInsnsBackward"); 322 BC.MIB->getOrCreateAnnotationIndex("ReachingInsns"); 323 BC.MIB->getOrCreateAnnotationIndex("AccessesDeletedPos"); 324 BC.MIB->getOrCreateAnnotationIndex("DeleteMe"); 325 326 ParallelUtilities::PredicateTy SkipPredicate = [&](const BinaryFunction &BF) { 327 if (!FA.hasFrameInfo(BF)) 328 return true; 329 330 if (opts::FrameOptimization == FOP_HOT && 331 (BF.getKnownExecutionCount() < BC.getHotThreshold())) 332 return true; 333 334 if (BF.getKnownExecutionCount() == 0) 335 return true; 336 337 return false; 338 }; 339 340 ParallelUtilities::WorkFuncWithAllocTy WorkFunction = 341 [&](BinaryFunction &BF, MCPlusBuilder::AllocatorIdTy AllocatorId) { 342 DataflowInfoManager Info(BF, &RA, &FA, AllocatorId); 343 ShrinkWrapping SW(FA, BF, Info, AllocatorId); 344 345 if (SW.perform()) { 346 std::lock_guard<std::mutex> Lock(FuncsChangedMutex); 347 FuncsChanged.insert(&BF); 348 } 349 }; 350 351 ParallelUtilities::runOnEachFunctionWithUniqueAllocId( 352 BC, ParallelUtilities::SchedulingPolicy::SP_INST_QUADRATIC, WorkFunction, 353 SkipPredicate, "shrink-wrapping"); 354 } 355 356 } // namespace bolt 357 } // namespace llvm 358