1 //===- bolt/Passes/FrameOptimizer.cpp -------------------------------------===// 2 // 3 // Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. 4 // See https://llvm.org/LICENSE.txt for license information. 5 // SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception 6 // 7 //===----------------------------------------------------------------------===// 8 // 9 // This file implements the FrameOptimizerPass class. 10 // 11 //===----------------------------------------------------------------------===// 12 13 #include "bolt/Passes/FrameOptimizer.h" 14 #include "bolt/Core/ParallelUtilities.h" 15 #include "bolt/Passes/BinaryFunctionCallGraph.h" 16 #include "bolt/Passes/DataflowInfoManager.h" 17 #include "bolt/Passes/ShrinkWrapping.h" 18 #include "bolt/Passes/StackAvailableExpressions.h" 19 #include "bolt/Passes/StackReachingUses.h" 20 #include "llvm/Support/Timer.h" 21 #include <deque> 22 #include <unordered_map> 23 24 #define DEBUG_TYPE "fop" 25 26 using namespace llvm; 27 28 namespace opts { 29 extern cl::opt<unsigned> Verbosity; 30 extern cl::opt<bool> TimeOpts; 31 extern cl::OptionCategory BoltOptCategory; 32 33 using namespace bolt; 34 35 cl::opt<FrameOptimizationType> 36 FrameOptimization("frame-opt", 37 cl::init(FOP_NONE), 38 cl::desc("optimize stack frame accesses"), 39 cl::values( 40 clEnumValN(FOP_NONE, "none", "do not perform frame optimization"), 41 clEnumValN(FOP_HOT, "hot", "perform FOP on hot functions"), 42 clEnumValN(FOP_ALL, "all", "perform FOP on all functions")), 43 cl::ZeroOrMore, 44 cl::cat(BoltOptCategory)); 45 46 cl::opt<bool> 47 RemoveStores("frame-opt-rm-stores", 48 cl::init(FOP_NONE), 49 cl::desc("apply additional analysis to remove stores (experimental)"), 50 cl::init(false), 51 cl::ZeroOrMore, 52 cl::cat(BoltOptCategory)); 53 54 } // namespace opts 55 56 namespace llvm { 57 namespace bolt { 58 59 void FrameOptimizerPass::removeUnnecessaryLoads(const RegAnalysis &RA, 60 const FrameAnalysis &FA, 61 BinaryFunction &BF) { 62 StackAvailableExpressions SAE(RA, FA, BF); 63 SAE.run(); 64 65 LLVM_DEBUG(dbgs() << "Performing unnecessary loads removal\n"); 66 std::deque<std::pair<BinaryBasicBlock *, MCInst *>> ToErase; 67 bool Changed = false; 68 const auto ExprEnd = SAE.expr_end(); 69 MCPlusBuilder *MIB = BF.getBinaryContext().MIB.get(); 70 for (BinaryBasicBlock &BB : BF) { 71 LLVM_DEBUG(dbgs() << "\tNow at BB " << BB.getName() << "\n"); 72 const MCInst *Prev = nullptr; 73 for (MCInst &Inst : BB) { 74 LLVM_DEBUG({ 75 dbgs() << "\t\tNow at "; 76 Inst.dump(); 77 for (auto I = Prev ? SAE.expr_begin(*Prev) : SAE.expr_begin(BB); 78 I != ExprEnd; ++I) { 79 dbgs() << "\t\t\tReached by: "; 80 (*I)->dump(); 81 } 82 }); 83 // if Inst is a load from stack and the current available expressions show 84 // this value is available in a register or immediate, replace this load 85 // with move from register or from immediate. 86 ErrorOr<const FrameIndexEntry &> FIEX = FA.getFIEFor(Inst); 87 if (!FIEX) { 88 Prev = &Inst; 89 continue; 90 } 91 // FIXME: Change to remove IsSimple == 0. We're being conservative here, 92 // but once replaceMemOperandWithReg is ready, we should feed it with all 93 // sorts of complex instructions. 94 if (FIEX->IsLoad == false || FIEX->IsSimple == false || 95 FIEX->StackOffset >= 0) { 96 Prev = &Inst; 97 continue; 98 } 99 100 for (auto I = Prev ? SAE.expr_begin(*Prev) : SAE.expr_begin(BB); 101 I != ExprEnd; ++I) { 102 const MCInst *AvailableInst = *I; 103 ErrorOr<const FrameIndexEntry &> FIEY = FA.getFIEFor(*AvailableInst); 104 if (!FIEY) 105 continue; 106 assert(FIEY->IsStore && FIEY->IsSimple); 107 if (FIEX->StackOffset != FIEY->StackOffset || FIEX->Size != FIEY->Size) 108 continue; 109 // TODO: Change push/pops to stack adjustment instruction 110 if (MIB->isPop(Inst)) 111 continue; 112 113 ++NumRedundantLoads; 114 Changed = true; 115 LLVM_DEBUG(dbgs() << "Redundant load instruction: "); 116 LLVM_DEBUG(Inst.dump()); 117 LLVM_DEBUG(dbgs() << "Related store instruction: "); 118 LLVM_DEBUG(AvailableInst->dump()); 119 LLVM_DEBUG(dbgs() << "@BB: " << BB.getName() << "\n"); 120 // Replace load 121 if (FIEY->IsStoreFromReg) { 122 if (!MIB->replaceMemOperandWithReg(Inst, FIEY->RegOrImm)) { 123 LLVM_DEBUG(dbgs() << "FAILED to change operand to a reg\n"); 124 break; 125 } 126 ++NumLoadsChangedToReg; 127 MIB->removeAnnotation(Inst, "FrameAccessEntry"); 128 LLVM_DEBUG(dbgs() << "Changed operand to a reg\n"); 129 if (MIB->isRedundantMove(Inst)) { 130 ++NumLoadsDeleted; 131 LLVM_DEBUG(dbgs() << "Created a redundant move\n"); 132 // Delete it! 133 ToErase.push_front(std::make_pair(&BB, &Inst)); 134 } 135 } else { 136 char Buf[8] = {0, 0, 0, 0, 0, 0, 0, 0}; 137 support::ulittle64_t::ref(Buf + 0) = FIEY->RegOrImm; 138 LLVM_DEBUG(dbgs() << "Changing operand to an imm... "); 139 if (!MIB->replaceMemOperandWithImm(Inst, StringRef(Buf, 8), 0)) { 140 LLVM_DEBUG(dbgs() << "FAILED\n"); 141 } else { 142 ++NumLoadsChangedToImm; 143 MIB->removeAnnotation(Inst, "FrameAccessEntry"); 144 LLVM_DEBUG(dbgs() << "Ok\n"); 145 } 146 } 147 LLVM_DEBUG(dbgs() << "Changed to: "); 148 LLVM_DEBUG(Inst.dump()); 149 break; 150 } 151 Prev = &Inst; 152 } 153 } 154 if (Changed) { 155 LLVM_DEBUG(dbgs() << "FOP modified \"" << BF.getPrintName() << "\"\n"); 156 } 157 // TODO: Implement an interface of eraseInstruction that works out the 158 // complete list of elements to remove. 159 for (std::pair<BinaryBasicBlock *, MCInst *> I : ToErase) { 160 I.first->eraseInstruction(I.first->findInstruction(I.second)); 161 } 162 } 163 164 void FrameOptimizerPass::removeUnusedStores(const FrameAnalysis &FA, 165 BinaryFunction &BF) { 166 StackReachingUses SRU(FA, BF); 167 SRU.run(); 168 169 LLVM_DEBUG(dbgs() << "Performing unused stores removal\n"); 170 std::vector<std::pair<BinaryBasicBlock *, MCInst *>> ToErase; 171 bool Changed = false; 172 for (BinaryBasicBlock &BB : BF) { 173 LLVM_DEBUG(dbgs() << "\tNow at BB " << BB.getName() << "\n"); 174 const MCInst *Prev = nullptr; 175 for (auto I = BB.rbegin(), E = BB.rend(); I != E; ++I) { 176 MCInst &Inst = *I; 177 LLVM_DEBUG({ 178 dbgs() << "\t\tNow at "; 179 Inst.dump(); 180 for (auto I = Prev ? SRU.expr_begin(*Prev) : SRU.expr_begin(BB); 181 I != SRU.expr_end(); ++I) { 182 dbgs() << "\t\t\tReached by: "; 183 (*I)->dump(); 184 } 185 }); 186 ErrorOr<const FrameIndexEntry &> FIEX = FA.getFIEFor(Inst); 187 if (!FIEX) { 188 Prev = &Inst; 189 continue; 190 } 191 if (FIEX->IsLoad || !FIEX->IsSimple || FIEX->StackOffset >= 0) { 192 Prev = &Inst; 193 continue; 194 } 195 196 if (SRU.isStoreUsed(*FIEX, 197 Prev ? SRU.expr_begin(*Prev) : SRU.expr_begin(BB))) { 198 Prev = &Inst; 199 continue; 200 } 201 // TODO: Change push/pops to stack adjustment instruction 202 if (BF.getBinaryContext().MIB->isPush(Inst)) 203 continue; 204 205 ++NumRedundantStores; 206 Changed = true; 207 LLVM_DEBUG(dbgs() << "Unused store instruction: "); 208 LLVM_DEBUG(Inst.dump()); 209 LLVM_DEBUG(dbgs() << "@BB: " << BB.getName() << "\n"); 210 LLVM_DEBUG(dbgs() << "FIE offset = " << FIEX->StackOffset 211 << " size = " << (int)FIEX->Size << "\n"); 212 // Delete it! 213 ToErase.emplace_back(&BB, &Inst); 214 Prev = &Inst; 215 } 216 } 217 218 for (std::pair<BinaryBasicBlock *, MCInst *> I : ToErase) { 219 I.first->eraseInstruction(I.first->findInstruction(I.second)); 220 } 221 if (Changed) { 222 LLVM_DEBUG(dbgs() << "FOP modified \"" << BF.getPrintName() << "\"\n"); 223 } 224 } 225 226 void FrameOptimizerPass::runOnFunctions(BinaryContext &BC) { 227 if (opts::FrameOptimization == FOP_NONE) 228 return; 229 230 std::unique_ptr<BinaryFunctionCallGraph> CG; 231 std::unique_ptr<FrameAnalysis> FA; 232 std::unique_ptr<RegAnalysis> RA; 233 234 { 235 NamedRegionTimer T1("callgraph", "create call graph", "FOP", 236 "FOP breakdown", opts::TimeOpts); 237 CG = std::make_unique<BinaryFunctionCallGraph>(buildCallGraph(BC)); 238 } 239 240 { 241 NamedRegionTimer T1("frameanalysis", "frame analysis", "FOP", 242 "FOP breakdown", opts::TimeOpts); 243 FA = std::make_unique<FrameAnalysis>(BC, *CG); 244 } 245 246 { 247 NamedRegionTimer T1("reganalysis", "reg analysis", "FOP", "FOP breakdown", 248 opts::TimeOpts); 249 RA = std::make_unique<RegAnalysis>(BC, &BC.getBinaryFunctions(), CG.get()); 250 } 251 252 // Perform caller-saved register optimizations, then callee-saved register 253 // optimizations (shrink wrapping) 254 for (auto &I : BC.getBinaryFunctions()) { 255 if (!FA->hasFrameInfo(I.second)) 256 continue; 257 // Restrict pass execution if user asked to only run on hot functions 258 if (opts::FrameOptimization == FOP_HOT) { 259 if (I.second.getKnownExecutionCount() < BC.getHotThreshold()) 260 continue; 261 LLVM_DEBUG( 262 dbgs() << "Considering " << I.second.getPrintName() 263 << " for frame optimizations because its execution count ( " 264 << I.second.getKnownExecutionCount() 265 << " ) exceeds our hotness threshold ( " 266 << BC.getHotThreshold() << " )\n"); 267 } 268 269 { 270 NamedRegionTimer T1("removeloads", "remove loads", "FOP", "FOP breakdown", 271 opts::TimeOpts); 272 removeUnnecessaryLoads(*RA, *FA, I.second); 273 } 274 275 if (opts::RemoveStores) { 276 NamedRegionTimer T1("removestores", "remove stores", "FOP", 277 "FOP breakdown", opts::TimeOpts); 278 removeUnusedStores(*FA, I.second); 279 } 280 // Don't even start shrink wrapping if no profiling info is available 281 if (I.second.getKnownExecutionCount() == 0) 282 continue; 283 } 284 285 { 286 NamedRegionTimer T1("shrinkwrapping", "shrink wrapping", "FOP", 287 "FOP breakdown", opts::TimeOpts); 288 performShrinkWrapping(*RA, *FA, BC); 289 } 290 291 outs() << "BOLT-INFO: FOP optimized " << NumRedundantLoads 292 << " redundant load(s) and " << NumRedundantStores 293 << " unused store(s)\n"; 294 outs() << "BOLT-INFO: FOP changed " << NumLoadsChangedToReg 295 << " load(s) to use a register instead of a stack access, and " 296 << NumLoadsChangedToImm << " to use an immediate.\n" 297 << "BOLT-INFO: FOP deleted " << NumLoadsDeleted << " load(s) and " 298 << NumRedundantStores << " store(s).\n"; 299 FA->printStats(); 300 ShrinkWrapping::printStats(); 301 } 302 303 void FrameOptimizerPass::performShrinkWrapping(const RegAnalysis &RA, 304 const FrameAnalysis &FA, 305 BinaryContext &BC) { 306 // Initialize necessary annotations to allow safe parallel accesses to 307 // annotation index in MIB 308 BC.MIB->getOrCreateAnnotationIndex(CalleeSavedAnalysis::getSaveTagName()); 309 BC.MIB->getOrCreateAnnotationIndex(CalleeSavedAnalysis::getRestoreTagName()); 310 BC.MIB->getOrCreateAnnotationIndex(StackLayoutModifier::getTodoTagName()); 311 BC.MIB->getOrCreateAnnotationIndex(StackLayoutModifier::getSlotTagName()); 312 BC.MIB->getOrCreateAnnotationIndex( 313 StackLayoutModifier::getOffsetCFIRegTagName()); 314 BC.MIB->getOrCreateAnnotationIndex("ReachingDefs"); 315 BC.MIB->getOrCreateAnnotationIndex("ReachingUses"); 316 BC.MIB->getOrCreateAnnotationIndex("LivenessAnalysis"); 317 BC.MIB->getOrCreateAnnotationIndex("StackReachingUses"); 318 BC.MIB->getOrCreateAnnotationIndex("PostDominatorAnalysis"); 319 BC.MIB->getOrCreateAnnotationIndex("DominatorAnalysis"); 320 BC.MIB->getOrCreateAnnotationIndex("StackPointerTracking"); 321 BC.MIB->getOrCreateAnnotationIndex("StackPointerTrackingForInternalCalls"); 322 BC.MIB->getOrCreateAnnotationIndex("StackAvailableExpressions"); 323 BC.MIB->getOrCreateAnnotationIndex("StackAllocationAnalysis"); 324 BC.MIB->getOrCreateAnnotationIndex("ShrinkWrap-Todo"); 325 BC.MIB->getOrCreateAnnotationIndex("PredictiveStackPointerTracking"); 326 BC.MIB->getOrCreateAnnotationIndex("ReachingInsnsBackward"); 327 BC.MIB->getOrCreateAnnotationIndex("ReachingInsns"); 328 BC.MIB->getOrCreateAnnotationIndex("AccessesDeletedPos"); 329 BC.MIB->getOrCreateAnnotationIndex("DeleteMe"); 330 331 ParallelUtilities::PredicateTy SkipPredicate = [&](const BinaryFunction &BF) { 332 if (!FA.hasFrameInfo(BF)) 333 return true; 334 335 if (opts::FrameOptimization == FOP_HOT && 336 (BF.getKnownExecutionCount() < BC.getHotThreshold())) 337 return true; 338 339 if (BF.getKnownExecutionCount() == 0) 340 return true; 341 342 return false; 343 }; 344 345 ParallelUtilities::WorkFuncWithAllocTy WorkFunction = 346 [&](BinaryFunction &BF, MCPlusBuilder::AllocatorIdTy AllocatorId) { 347 DataflowInfoManager Info(BF, &RA, &FA, AllocatorId); 348 ShrinkWrapping SW(FA, BF, Info, AllocatorId); 349 350 if (SW.perform()) { 351 std::lock_guard<std::mutex> Lock(FuncsChangedMutex); 352 FuncsChanged.insert(&BF); 353 } 354 }; 355 356 ParallelUtilities::runOnEachFunctionWithUniqueAllocId( 357 BC, ParallelUtilities::SchedulingPolicy::SP_INST_QUADRATIC, WorkFunction, 358 SkipPredicate, "shrink-wrapping"); 359 } 360 361 } // namespace bolt 362 } // namespace llvm 363