1 //===- bolt/Passes/FrameOptimizer.cpp -------------------------------------===// 2 // 3 // Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. 4 // See https://llvm.org/LICENSE.txt for license information. 5 // SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception 6 // 7 //===----------------------------------------------------------------------===// 8 // 9 // This file implements the FrameOptimizerPass class. 10 // 11 //===----------------------------------------------------------------------===// 12 13 #include "bolt/Passes/FrameOptimizer.h" 14 #include "bolt/Core/ParallelUtilities.h" 15 #include "bolt/Passes/BinaryFunctionCallGraph.h" 16 #include "bolt/Passes/DataflowInfoManager.h" 17 #include "bolt/Passes/ShrinkWrapping.h" 18 #include "bolt/Passes/StackAvailableExpressions.h" 19 #include "bolt/Passes/StackReachingUses.h" 20 #include "llvm/Support/Timer.h" 21 #include <deque> 22 #include <unordered_map> 23 24 #define DEBUG_TYPE "fop" 25 26 using namespace llvm; 27 28 namespace opts { 29 extern cl::opt<unsigned> Verbosity; 30 extern cl::opt<bool> TimeOpts; 31 extern cl::OptionCategory BoltOptCategory; 32 33 using namespace bolt; 34 35 cl::opt<FrameOptimizationType> 36 FrameOptimization("frame-opt", 37 cl::init(FOP_NONE), 38 cl::desc("optimize stack frame accesses"), 39 cl::values( 40 clEnumValN(FOP_NONE, "none", "do not perform frame optimization"), 41 clEnumValN(FOP_HOT, "hot", "perform FOP on hot functions"), 42 clEnumValN(FOP_ALL, "all", "perform FOP on all functions")), 43 cl::ZeroOrMore, 44 cl::cat(BoltOptCategory)); 45 46 cl::opt<bool> 47 RemoveStores("frame-opt-rm-stores", 48 cl::init(FOP_NONE), 49 cl::desc("apply additional analysis to remove stores (experimental)"), 50 cl::init(false), 51 cl::ZeroOrMore, 52 cl::cat(BoltOptCategory)); 53 54 } // namespace opts 55 56 namespace llvm { 57 namespace bolt { 58 59 void FrameOptimizerPass::removeUnnecessaryLoads(const RegAnalysis &RA, 60 const FrameAnalysis &FA, 61 BinaryFunction &BF) { 62 StackAvailableExpressions SAE(RA, FA, BF); 63 SAE.run(); 64 65 LLVM_DEBUG(dbgs() << "Performing unnecessary loads removal\n"); 66 std::deque<std::pair<BinaryBasicBlock *, MCInst *>> ToErase; 67 bool Changed = false; 68 const auto ExprEnd = SAE.expr_end(); 69 MCPlusBuilder *MIB = BF.getBinaryContext().MIB.get(); 70 for (BinaryBasicBlock &BB : BF) { 71 LLVM_DEBUG(dbgs() << "\tNow at BB " << BB.getName() << "\n"); 72 const MCInst *Prev = nullptr; 73 for (MCInst &Inst : BB) { 74 LLVM_DEBUG({ 75 dbgs() << "\t\tNow at "; 76 Inst.dump(); 77 for (auto I = Prev ? SAE.expr_begin(*Prev) : SAE.expr_begin(BB); 78 I != ExprEnd; ++I) { 79 dbgs() << "\t\t\tReached by: "; 80 (*I)->dump(); 81 } 82 }); 83 // if Inst is a load from stack and the current available expressions show 84 // this value is available in a register or immediate, replace this load 85 // with move from register or from immediate. 86 ErrorOr<const FrameIndexEntry &> FIEX = FA.getFIEFor(Inst); 87 if (!FIEX) { 88 Prev = &Inst; 89 continue; 90 } 91 // FIXME: Change to remove IsSimple == 0. We're being conservative here, 92 // but once replaceMemOperandWithReg is ready, we should feed it with all 93 // sorts of complex instructions. 94 if (FIEX->IsLoad == false || FIEX->IsSimple == false || 95 FIEX->StackOffset >= 0) { 96 Prev = &Inst; 97 continue; 98 } 99 100 for (auto I = Prev ? SAE.expr_begin(*Prev) : SAE.expr_begin(BB); 101 I != ExprEnd; ++I) { 102 const MCInst *AvailableInst = *I; 103 ErrorOr<const FrameIndexEntry &> FIEY = FA.getFIEFor(*AvailableInst); 104 if (!FIEY) 105 continue; 106 assert(FIEY->IsStore && FIEY->IsSimple); 107 if (FIEX->StackOffset != FIEY->StackOffset || FIEX->Size != FIEY->Size) 108 continue; 109 // TODO: Change push/pops to stack adjustment instruction 110 if (MIB->isPop(Inst)) 111 continue; 112 113 ++NumRedundantLoads; 114 Changed = true; 115 LLVM_DEBUG(dbgs() << "Redundant load instruction: "); 116 LLVM_DEBUG(Inst.dump()); 117 LLVM_DEBUG(dbgs() << "Related store instruction: "); 118 LLVM_DEBUG(AvailableInst->dump()); 119 LLVM_DEBUG(dbgs() << "@BB: " << BB.getName() << "\n"); 120 // Replace load 121 if (FIEY->IsStoreFromReg) { 122 if (!MIB->replaceMemOperandWithReg(Inst, FIEY->RegOrImm)) { 123 LLVM_DEBUG(dbgs() << "FAILED to change operand to a reg\n"); 124 break; 125 } 126 ++NumLoadsChangedToReg; 127 MIB->removeAnnotation(Inst, "FrameAccessEntry"); 128 LLVM_DEBUG(dbgs() << "Changed operand to a reg\n"); 129 if (MIB->isRedundantMove(Inst)) { 130 ++NumLoadsDeleted; 131 LLVM_DEBUG(dbgs() << "Created a redundant move\n"); 132 // Delete it! 133 ToErase.push_front(std::make_pair(&BB, &Inst)); 134 } 135 } else { 136 char Buf[8] = {0, 0, 0, 0, 0, 0, 0, 0}; 137 support::ulittle64_t::ref(Buf + 0) = FIEY->RegOrImm; 138 LLVM_DEBUG(dbgs() << "Changing operand to an imm... "); 139 if (!MIB->replaceMemOperandWithImm(Inst, StringRef(Buf, 8), 0)) { 140 LLVM_DEBUG(dbgs() << "FAILED\n"); 141 } else { 142 ++NumLoadsChangedToImm; 143 MIB->removeAnnotation(Inst, "FrameAccessEntry"); 144 LLVM_DEBUG(dbgs() << "Ok\n"); 145 } 146 } 147 LLVM_DEBUG(dbgs() << "Changed to: "); 148 LLVM_DEBUG(Inst.dump()); 149 break; 150 } 151 Prev = &Inst; 152 } 153 } 154 if (Changed) 155 LLVM_DEBUG(dbgs() << "FOP modified \"" << BF.getPrintName() << "\"\n"); 156 157 // TODO: Implement an interface of eraseInstruction that works out the 158 // complete list of elements to remove. 159 for (std::pair<BinaryBasicBlock *, MCInst *> I : ToErase) 160 I.first->eraseInstruction(I.first->findInstruction(I.second)); 161 } 162 163 void FrameOptimizerPass::removeUnusedStores(const FrameAnalysis &FA, 164 BinaryFunction &BF) { 165 StackReachingUses SRU(FA, BF); 166 SRU.run(); 167 168 LLVM_DEBUG(dbgs() << "Performing unused stores removal\n"); 169 std::vector<std::pair<BinaryBasicBlock *, MCInst *>> ToErase; 170 bool Changed = false; 171 for (BinaryBasicBlock &BB : BF) { 172 LLVM_DEBUG(dbgs() << "\tNow at BB " << BB.getName() << "\n"); 173 const MCInst *Prev = nullptr; 174 for (auto I = BB.rbegin(), E = BB.rend(); I != E; ++I) { 175 MCInst &Inst = *I; 176 LLVM_DEBUG({ 177 dbgs() << "\t\tNow at "; 178 Inst.dump(); 179 for (auto I = Prev ? SRU.expr_begin(*Prev) : SRU.expr_begin(BB); 180 I != SRU.expr_end(); ++I) { 181 dbgs() << "\t\t\tReached by: "; 182 (*I)->dump(); 183 } 184 }); 185 ErrorOr<const FrameIndexEntry &> FIEX = FA.getFIEFor(Inst); 186 if (!FIEX) { 187 Prev = &Inst; 188 continue; 189 } 190 if (FIEX->IsLoad || !FIEX->IsSimple || FIEX->StackOffset >= 0) { 191 Prev = &Inst; 192 continue; 193 } 194 195 if (SRU.isStoreUsed(*FIEX, 196 Prev ? SRU.expr_begin(*Prev) : SRU.expr_begin(BB))) { 197 Prev = &Inst; 198 continue; 199 } 200 // TODO: Change push/pops to stack adjustment instruction 201 if (BF.getBinaryContext().MIB->isPush(Inst)) 202 continue; 203 204 ++NumRedundantStores; 205 Changed = true; 206 LLVM_DEBUG(dbgs() << "Unused store instruction: "); 207 LLVM_DEBUG(Inst.dump()); 208 LLVM_DEBUG(dbgs() << "@BB: " << BB.getName() << "\n"); 209 LLVM_DEBUG(dbgs() << "FIE offset = " << FIEX->StackOffset 210 << " size = " << (int)FIEX->Size << "\n"); 211 // Delete it! 212 ToErase.emplace_back(&BB, &Inst); 213 Prev = &Inst; 214 } 215 } 216 217 for (std::pair<BinaryBasicBlock *, MCInst *> I : ToErase) 218 I.first->eraseInstruction(I.first->findInstruction(I.second)); 219 220 if (Changed) 221 LLVM_DEBUG(dbgs() << "FOP modified \"" << BF.getPrintName() << "\"\n"); 222 } 223 224 void FrameOptimizerPass::runOnFunctions(BinaryContext &BC) { 225 if (opts::FrameOptimization == FOP_NONE) 226 return; 227 228 std::unique_ptr<BinaryFunctionCallGraph> CG; 229 std::unique_ptr<FrameAnalysis> FA; 230 std::unique_ptr<RegAnalysis> RA; 231 232 { 233 NamedRegionTimer T1("callgraph", "create call graph", "FOP", 234 "FOP breakdown", opts::TimeOpts); 235 CG = std::make_unique<BinaryFunctionCallGraph>(buildCallGraph(BC)); 236 } 237 238 { 239 NamedRegionTimer T1("frameanalysis", "frame analysis", "FOP", 240 "FOP breakdown", opts::TimeOpts); 241 FA = std::make_unique<FrameAnalysis>(BC, *CG); 242 } 243 244 { 245 NamedRegionTimer T1("reganalysis", "reg analysis", "FOP", "FOP breakdown", 246 opts::TimeOpts); 247 RA = std::make_unique<RegAnalysis>(BC, &BC.getBinaryFunctions(), CG.get()); 248 } 249 250 // Perform caller-saved register optimizations, then callee-saved register 251 // optimizations (shrink wrapping) 252 for (auto &I : BC.getBinaryFunctions()) { 253 if (!FA->hasFrameInfo(I.second)) 254 continue; 255 // Restrict pass execution if user asked to only run on hot functions 256 if (opts::FrameOptimization == FOP_HOT) { 257 if (I.second.getKnownExecutionCount() < BC.getHotThreshold()) 258 continue; 259 LLVM_DEBUG( 260 dbgs() << "Considering " << I.second.getPrintName() 261 << " for frame optimizations because its execution count ( " 262 << I.second.getKnownExecutionCount() 263 << " ) exceeds our hotness threshold ( " 264 << BC.getHotThreshold() << " )\n"); 265 } 266 267 { 268 NamedRegionTimer T1("removeloads", "remove loads", "FOP", "FOP breakdown", 269 opts::TimeOpts); 270 removeUnnecessaryLoads(*RA, *FA, I.second); 271 } 272 273 if (opts::RemoveStores) { 274 NamedRegionTimer T1("removestores", "remove stores", "FOP", 275 "FOP breakdown", opts::TimeOpts); 276 removeUnusedStores(*FA, I.second); 277 } 278 // Don't even start shrink wrapping if no profiling info is available 279 if (I.second.getKnownExecutionCount() == 0) 280 continue; 281 } 282 283 { 284 NamedRegionTimer T1("shrinkwrapping", "shrink wrapping", "FOP", 285 "FOP breakdown", opts::TimeOpts); 286 performShrinkWrapping(*RA, *FA, BC); 287 } 288 289 outs() << "BOLT-INFO: FOP optimized " << NumRedundantLoads 290 << " redundant load(s) and " << NumRedundantStores 291 << " unused store(s)\n"; 292 outs() << "BOLT-INFO: FOP changed " << NumLoadsChangedToReg 293 << " load(s) to use a register instead of a stack access, and " 294 << NumLoadsChangedToImm << " to use an immediate.\n" 295 << "BOLT-INFO: FOP deleted " << NumLoadsDeleted << " load(s) and " 296 << NumRedundantStores << " store(s).\n"; 297 FA->printStats(); 298 ShrinkWrapping::printStats(); 299 } 300 301 void FrameOptimizerPass::performShrinkWrapping(const RegAnalysis &RA, 302 const FrameAnalysis &FA, 303 BinaryContext &BC) { 304 // Initialize necessary annotations to allow safe parallel accesses to 305 // annotation index in MIB 306 BC.MIB->getOrCreateAnnotationIndex(CalleeSavedAnalysis::getSaveTagName()); 307 BC.MIB->getOrCreateAnnotationIndex(CalleeSavedAnalysis::getRestoreTagName()); 308 BC.MIB->getOrCreateAnnotationIndex(StackLayoutModifier::getTodoTagName()); 309 BC.MIB->getOrCreateAnnotationIndex(StackLayoutModifier::getSlotTagName()); 310 BC.MIB->getOrCreateAnnotationIndex( 311 StackLayoutModifier::getOffsetCFIRegTagName()); 312 BC.MIB->getOrCreateAnnotationIndex("ReachingDefs"); 313 BC.MIB->getOrCreateAnnotationIndex("ReachingUses"); 314 BC.MIB->getOrCreateAnnotationIndex("LivenessAnalysis"); 315 BC.MIB->getOrCreateAnnotationIndex("StackReachingUses"); 316 BC.MIB->getOrCreateAnnotationIndex("PostDominatorAnalysis"); 317 BC.MIB->getOrCreateAnnotationIndex("DominatorAnalysis"); 318 BC.MIB->getOrCreateAnnotationIndex("StackPointerTracking"); 319 BC.MIB->getOrCreateAnnotationIndex("StackPointerTrackingForInternalCalls"); 320 BC.MIB->getOrCreateAnnotationIndex("StackAvailableExpressions"); 321 BC.MIB->getOrCreateAnnotationIndex("StackAllocationAnalysis"); 322 BC.MIB->getOrCreateAnnotationIndex("ShrinkWrap-Todo"); 323 BC.MIB->getOrCreateAnnotationIndex("PredictiveStackPointerTracking"); 324 BC.MIB->getOrCreateAnnotationIndex("ReachingInsnsBackward"); 325 BC.MIB->getOrCreateAnnotationIndex("ReachingInsns"); 326 BC.MIB->getOrCreateAnnotationIndex("AccessesDeletedPos"); 327 BC.MIB->getOrCreateAnnotationIndex("DeleteMe"); 328 329 ParallelUtilities::PredicateTy SkipPredicate = [&](const BinaryFunction &BF) { 330 if (!FA.hasFrameInfo(BF)) 331 return true; 332 333 if (opts::FrameOptimization == FOP_HOT && 334 (BF.getKnownExecutionCount() < BC.getHotThreshold())) 335 return true; 336 337 if (BF.getKnownExecutionCount() == 0) 338 return true; 339 340 return false; 341 }; 342 343 ParallelUtilities::WorkFuncWithAllocTy WorkFunction = 344 [&](BinaryFunction &BF, MCPlusBuilder::AllocatorIdTy AllocatorId) { 345 DataflowInfoManager Info(BF, &RA, &FA, AllocatorId); 346 ShrinkWrapping SW(FA, BF, Info, AllocatorId); 347 348 if (SW.perform()) { 349 std::lock_guard<std::mutex> Lock(FuncsChangedMutex); 350 FuncsChanged.insert(&BF); 351 } 352 }; 353 354 ParallelUtilities::runOnEachFunctionWithUniqueAllocId( 355 BC, ParallelUtilities::SchedulingPolicy::SP_INST_QUADRATIC, WorkFunction, 356 SkipPredicate, "shrink-wrapping"); 357 } 358 359 } // namespace bolt 360 } // namespace llvm 361