1 //===- bolt/Passes/FrameOptimizer.cpp -------------------------------------===// 2 // 3 // Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. 4 // See https://llvm.org/LICENSE.txt for license information. 5 // SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception 6 // 7 //===----------------------------------------------------------------------===// 8 // 9 // This file implements the FrameOptimizerPass class. 10 // 11 //===----------------------------------------------------------------------===// 12 13 #include "bolt/Passes/FrameOptimizer.h" 14 #include "bolt/Core/ParallelUtilities.h" 15 #include "bolt/Passes/BinaryFunctionCallGraph.h" 16 #include "bolt/Passes/DataflowInfoManager.h" 17 #include "bolt/Passes/ShrinkWrapping.h" 18 #include "bolt/Passes/StackAvailableExpressions.h" 19 #include "bolt/Passes/StackReachingUses.h" 20 #include "bolt/Utils/CommandLineOpts.h" 21 #include "llvm/Support/Timer.h" 22 #include <deque> 23 #include <unordered_map> 24 25 #define DEBUG_TYPE "fop" 26 27 using namespace llvm; 28 29 namespace opts { 30 extern cl::opt<unsigned> Verbosity; 31 extern cl::opt<bool> TimeOpts; 32 extern cl::OptionCategory BoltOptCategory; 33 34 using namespace bolt; 35 36 cl::opt<FrameOptimizationType> 37 FrameOptimization("frame-opt", 38 cl::init(FOP_NONE), 39 cl::desc("optimize stack frame accesses"), 40 cl::values( 41 clEnumValN(FOP_NONE, "none", "do not perform frame optimization"), 42 clEnumValN(FOP_HOT, "hot", "perform FOP on hot functions"), 43 clEnumValN(FOP_ALL, "all", "perform FOP on all functions")), 44 cl::ZeroOrMore, 45 cl::cat(BoltOptCategory)); 46 47 cl::opt<bool> RemoveStores( 48 "frame-opt-rm-stores", cl::init(FOP_NONE), 49 cl::desc("apply additional analysis to remove stores (experimental)"), 50 cl::cat(BoltOptCategory)); 51 52 } // namespace opts 53 54 namespace llvm { 55 namespace bolt { 56 57 void FrameOptimizerPass::removeUnnecessaryLoads(const RegAnalysis &RA, 58 const FrameAnalysis &FA, 59 BinaryFunction &BF) { 60 StackAvailableExpressions SAE(RA, FA, BF); 61 SAE.run(); 62 63 LLVM_DEBUG(dbgs() << "Performing unnecessary loads removal\n"); 64 std::deque<std::pair<BinaryBasicBlock *, MCInst *>> ToErase; 65 bool Changed = false; 66 const auto ExprEnd = SAE.expr_end(); 67 MCPlusBuilder *MIB = BF.getBinaryContext().MIB.get(); 68 for (BinaryBasicBlock &BB : BF) { 69 LLVM_DEBUG(dbgs() << "\tNow at BB " << BB.getName() << "\n"); 70 const MCInst *Prev = nullptr; 71 for (MCInst &Inst : BB) { 72 LLVM_DEBUG({ 73 dbgs() << "\t\tNow at "; 74 Inst.dump(); 75 for (auto I = Prev ? SAE.expr_begin(*Prev) : SAE.expr_begin(BB); 76 I != ExprEnd; ++I) { 77 dbgs() << "\t\t\tReached by: "; 78 (*I)->dump(); 79 } 80 }); 81 // if Inst is a load from stack and the current available expressions show 82 // this value is available in a register or immediate, replace this load 83 // with move from register or from immediate. 84 ErrorOr<const FrameIndexEntry &> FIEX = FA.getFIEFor(Inst); 85 if (!FIEX) { 86 Prev = &Inst; 87 continue; 88 } 89 // FIXME: Change to remove IsSimple == 0. We're being conservative here, 90 // but once replaceMemOperandWithReg is ready, we should feed it with all 91 // sorts of complex instructions. 92 if (FIEX->IsLoad == false || FIEX->IsSimple == false || 93 FIEX->StackOffset >= 0) { 94 Prev = &Inst; 95 continue; 96 } 97 98 for (auto I = Prev ? SAE.expr_begin(*Prev) : SAE.expr_begin(BB); 99 I != ExprEnd; ++I) { 100 const MCInst *AvailableInst = *I; 101 ErrorOr<const FrameIndexEntry &> FIEY = FA.getFIEFor(*AvailableInst); 102 if (!FIEY) 103 continue; 104 assert(FIEY->IsStore && FIEY->IsSimple); 105 if (FIEX->StackOffset != FIEY->StackOffset || FIEX->Size != FIEY->Size) 106 continue; 107 // TODO: Change push/pops to stack adjustment instruction 108 if (MIB->isPop(Inst)) 109 continue; 110 111 ++NumRedundantLoads; 112 FreqRedundantLoads += BB.getKnownExecutionCount(); 113 Changed = true; 114 LLVM_DEBUG(dbgs() << "Redundant load instruction: "); 115 LLVM_DEBUG(Inst.dump()); 116 LLVM_DEBUG(dbgs() << "Related store instruction: "); 117 LLVM_DEBUG(AvailableInst->dump()); 118 LLVM_DEBUG(dbgs() << "@BB: " << BB.getName() << "\n"); 119 // Replace load 120 if (FIEY->IsStoreFromReg) { 121 if (!MIB->replaceMemOperandWithReg(Inst, FIEY->RegOrImm)) { 122 LLVM_DEBUG(dbgs() << "FAILED to change operand to a reg\n"); 123 break; 124 } 125 FreqLoadsChangedToReg += BB.getKnownExecutionCount(); 126 MIB->removeAnnotation(Inst, "FrameAccessEntry"); 127 LLVM_DEBUG(dbgs() << "Changed operand to a reg\n"); 128 if (MIB->isRedundantMove(Inst)) { 129 ++NumLoadsDeleted; 130 FreqLoadsDeleted += BB.getKnownExecutionCount(); 131 LLVM_DEBUG(dbgs() << "Created a redundant move\n"); 132 // Delete it! 133 ToErase.push_front(std::make_pair(&BB, &Inst)); 134 } 135 } else { 136 char Buf[8] = {0, 0, 0, 0, 0, 0, 0, 0}; 137 support::ulittle64_t::ref(Buf + 0) = FIEY->RegOrImm; 138 LLVM_DEBUG(dbgs() << "Changing operand to an imm... "); 139 if (!MIB->replaceMemOperandWithImm(Inst, StringRef(Buf, 8), 0)) { 140 LLVM_DEBUG(dbgs() << "FAILED\n"); 141 } else { 142 FreqLoadsChangedToImm += BB.getKnownExecutionCount(); 143 MIB->removeAnnotation(Inst, "FrameAccessEntry"); 144 LLVM_DEBUG(dbgs() << "Ok\n"); 145 } 146 } 147 LLVM_DEBUG(dbgs() << "Changed to: "); 148 LLVM_DEBUG(Inst.dump()); 149 break; 150 } 151 Prev = &Inst; 152 } 153 } 154 if (Changed) 155 LLVM_DEBUG(dbgs() << "FOP modified \"" << BF.getPrintName() << "\"\n"); 156 157 // TODO: Implement an interface of eraseInstruction that works out the 158 // complete list of elements to remove. 159 for (std::pair<BinaryBasicBlock *, MCInst *> I : ToErase) 160 I.first->eraseInstruction(I.first->findInstruction(I.second)); 161 } 162 163 void FrameOptimizerPass::removeUnusedStores(const FrameAnalysis &FA, 164 BinaryFunction &BF) { 165 StackReachingUses SRU(FA, BF); 166 SRU.run(); 167 168 LLVM_DEBUG(dbgs() << "Performing unused stores removal\n"); 169 std::vector<std::pair<BinaryBasicBlock *, MCInst *>> ToErase; 170 bool Changed = false; 171 for (BinaryBasicBlock &BB : BF) { 172 LLVM_DEBUG(dbgs() << "\tNow at BB " << BB.getName() << "\n"); 173 const MCInst *Prev = nullptr; 174 for (auto I = BB.rbegin(), E = BB.rend(); I != E; ++I) { 175 MCInst &Inst = *I; 176 LLVM_DEBUG({ 177 dbgs() << "\t\tNow at "; 178 Inst.dump(); 179 for (auto I = Prev ? SRU.expr_begin(*Prev) : SRU.expr_begin(BB); 180 I != SRU.expr_end(); ++I) { 181 dbgs() << "\t\t\tReached by: "; 182 (*I)->dump(); 183 } 184 }); 185 ErrorOr<const FrameIndexEntry &> FIEX = FA.getFIEFor(Inst); 186 if (!FIEX) { 187 Prev = &Inst; 188 continue; 189 } 190 if (FIEX->IsLoad || !FIEX->IsSimple || FIEX->StackOffset >= 0) { 191 Prev = &Inst; 192 continue; 193 } 194 195 if (SRU.isStoreUsed(*FIEX, 196 Prev ? SRU.expr_begin(*Prev) : SRU.expr_begin(BB))) { 197 Prev = &Inst; 198 continue; 199 } 200 // TODO: Change push/pops to stack adjustment instruction 201 if (BF.getBinaryContext().MIB->isPush(Inst)) 202 continue; 203 204 ++NumRedundantStores; 205 FreqRedundantStores += BB.getKnownExecutionCount(); 206 Changed = true; 207 LLVM_DEBUG(dbgs() << "Unused store instruction: "); 208 LLVM_DEBUG(Inst.dump()); 209 LLVM_DEBUG(dbgs() << "@BB: " << BB.getName() << "\n"); 210 LLVM_DEBUG(dbgs() << "FIE offset = " << FIEX->StackOffset 211 << " size = " << (int)FIEX->Size << "\n"); 212 // Delete it! 213 ToErase.emplace_back(&BB, &Inst); 214 Prev = &Inst; 215 } 216 } 217 218 for (std::pair<BinaryBasicBlock *, MCInst *> I : ToErase) 219 I.first->eraseInstruction(I.first->findInstruction(I.second)); 220 221 if (Changed) 222 LLVM_DEBUG(dbgs() << "FOP modified \"" << BF.getPrintName() << "\"\n"); 223 } 224 225 void FrameOptimizerPass::runOnFunctions(BinaryContext &BC) { 226 if (opts::FrameOptimization == FOP_NONE) 227 return; 228 229 std::unique_ptr<BinaryFunctionCallGraph> CG; 230 std::unique_ptr<FrameAnalysis> FA; 231 std::unique_ptr<RegAnalysis> RA; 232 233 { 234 NamedRegionTimer T1("callgraph", "create call graph", "FOP", 235 "FOP breakdown", opts::TimeOpts); 236 CG = std::make_unique<BinaryFunctionCallGraph>(buildCallGraph(BC)); 237 } 238 239 { 240 NamedRegionTimer T1("frameanalysis", "frame analysis", "FOP", 241 "FOP breakdown", opts::TimeOpts); 242 FA = std::make_unique<FrameAnalysis>(BC, *CG); 243 } 244 245 { 246 NamedRegionTimer T1("reganalysis", "reg analysis", "FOP", "FOP breakdown", 247 opts::TimeOpts); 248 RA = std::make_unique<RegAnalysis>(BC, &BC.getBinaryFunctions(), CG.get()); 249 } 250 251 // Perform caller-saved register optimizations, then callee-saved register 252 // optimizations (shrink wrapping) 253 for (auto &I : BC.getBinaryFunctions()) { 254 if (!FA->hasFrameInfo(I.second)) 255 continue; 256 // Restrict pass execution if user asked to only run on hot functions 257 if (opts::FrameOptimization == FOP_HOT) { 258 if (I.second.getKnownExecutionCount() < BC.getHotThreshold()) 259 continue; 260 LLVM_DEBUG( 261 dbgs() << "Considering " << I.second.getPrintName() 262 << " for frame optimizations because its execution count ( " 263 << I.second.getKnownExecutionCount() 264 << " ) exceeds our hotness threshold ( " 265 << BC.getHotThreshold() << " )\n"); 266 } 267 268 { 269 NamedRegionTimer T1("removeloads", "remove loads", "FOP", "FOP breakdown", 270 opts::TimeOpts); 271 if (!FA->hasStackArithmetic(I.second)) 272 removeUnnecessaryLoads(*RA, *FA, I.second); 273 } 274 275 if (opts::RemoveStores) { 276 NamedRegionTimer T1("removestores", "remove stores", "FOP", 277 "FOP breakdown", opts::TimeOpts); 278 if (!FA->hasStackArithmetic(I.second)) 279 removeUnusedStores(*FA, I.second); 280 } 281 // Don't even start shrink wrapping if no profiling info is available 282 if (I.second.getKnownExecutionCount() == 0) 283 continue; 284 } 285 286 { 287 NamedRegionTimer T1("shrinkwrapping", "shrink wrapping", "FOP", 288 "FOP breakdown", opts::TimeOpts); 289 performShrinkWrapping(*RA, *FA, BC); 290 } 291 292 outs() << "BOLT-INFO: FOP optimized " << NumRedundantLoads 293 << " redundant load(s) and " << NumRedundantStores 294 << " unused store(s)\n"; 295 outs() << "BOLT-INFO: Frequency of redundant loads is " << FreqRedundantLoads 296 << " and frequency of unused stores is " << FreqRedundantStores 297 << "\n"; 298 outs() << "BOLT-INFO: Frequency of loads changed to use a register is " 299 << FreqLoadsChangedToReg 300 << " and frequency of loads changed to use an immediate is " 301 << FreqLoadsChangedToImm << "\n"; 302 outs() << "BOLT-INFO: FOP deleted " << NumLoadsDeleted 303 << " load(s) (dyn count: " << FreqLoadsDeleted << ") and " 304 << NumRedundantStores << " store(s)\n"; 305 FA->printStats(); 306 ShrinkWrapping::printStats(); 307 } 308 309 void FrameOptimizerPass::performShrinkWrapping(const RegAnalysis &RA, 310 const FrameAnalysis &FA, 311 BinaryContext &BC) { 312 // Initialize necessary annotations to allow safe parallel accesses to 313 // annotation index in MIB 314 BC.MIB->getOrCreateAnnotationIndex(CalleeSavedAnalysis::getSaveTagName()); 315 BC.MIB->getOrCreateAnnotationIndex(CalleeSavedAnalysis::getRestoreTagName()); 316 BC.MIB->getOrCreateAnnotationIndex(StackLayoutModifier::getTodoTagName()); 317 BC.MIB->getOrCreateAnnotationIndex(StackLayoutModifier::getSlotTagName()); 318 BC.MIB->getOrCreateAnnotationIndex( 319 StackLayoutModifier::getOffsetCFIRegTagName()); 320 BC.MIB->getOrCreateAnnotationIndex("ReachingDefs"); 321 BC.MIB->getOrCreateAnnotationIndex("ReachingUses"); 322 BC.MIB->getOrCreateAnnotationIndex("LivenessAnalysis"); 323 BC.MIB->getOrCreateAnnotationIndex("StackReachingUses"); 324 BC.MIB->getOrCreateAnnotationIndex("PostDominatorAnalysis"); 325 BC.MIB->getOrCreateAnnotationIndex("DominatorAnalysis"); 326 BC.MIB->getOrCreateAnnotationIndex("StackPointerTracking"); 327 BC.MIB->getOrCreateAnnotationIndex("StackPointerTrackingForInternalCalls"); 328 BC.MIB->getOrCreateAnnotationIndex("StackAvailableExpressions"); 329 BC.MIB->getOrCreateAnnotationIndex("StackAllocationAnalysis"); 330 BC.MIB->getOrCreateAnnotationIndex("ShrinkWrap-Todo"); 331 BC.MIB->getOrCreateAnnotationIndex("PredictiveStackPointerTracking"); 332 BC.MIB->getOrCreateAnnotationIndex("ReachingInsnsBackward"); 333 BC.MIB->getOrCreateAnnotationIndex("ReachingInsns"); 334 BC.MIB->getOrCreateAnnotationIndex("AccessesDeletedPos"); 335 BC.MIB->getOrCreateAnnotationIndex("DeleteMe"); 336 337 std::vector<std::pair<uint64_t, const BinaryFunction *>> Top10Funcs; 338 auto LogFunc = [&](BinaryFunction &BF) { 339 auto Lower = std::lower_bound( 340 Top10Funcs.begin(), Top10Funcs.end(), BF.getKnownExecutionCount(), 341 [](const std::pair<uint64_t, const BinaryFunction *> &Elmt, 342 uint64_t Value) { return Elmt.first > Value; }); 343 if (Lower == Top10Funcs.end() && Top10Funcs.size() >= 10) 344 return; 345 Top10Funcs.insert(Lower, 346 std::make_pair<>(BF.getKnownExecutionCount(), &BF)); 347 if (Top10Funcs.size() > 10) 348 Top10Funcs.resize(10); 349 }; 350 (void)LogFunc; 351 352 ParallelUtilities::PredicateTy SkipPredicate = [&](const BinaryFunction &BF) { 353 if (BF.getFunctionScore() == 0) 354 return true; 355 356 return false; 357 }; 358 359 const bool HotOnly = opts::FrameOptimization == FOP_HOT; 360 361 ParallelUtilities::WorkFuncWithAllocTy WorkFunction = 362 [&](BinaryFunction &BF, MCPlusBuilder::AllocatorIdTy AllocatorId) { 363 DataflowInfoManager Info(BF, &RA, &FA, AllocatorId); 364 ShrinkWrapping SW(FA, BF, Info, AllocatorId); 365 366 if (SW.perform(HotOnly)) { 367 std::lock_guard<std::mutex> Lock(FuncsChangedMutex); 368 FuncsChanged.insert(&BF); 369 LLVM_DEBUG(LogFunc(BF)); 370 } 371 }; 372 373 ParallelUtilities::runOnEachFunctionWithUniqueAllocId( 374 BC, ParallelUtilities::SchedulingPolicy::SP_INST_QUADRATIC, WorkFunction, 375 SkipPredicate, "shrink-wrapping"); 376 377 if (!Top10Funcs.empty()) { 378 outs() << "BOLT-INFO: top 10 functions changed by shrink wrapping:\n"; 379 for (const auto &Elmt : Top10Funcs) 380 outs() << Elmt.first << " : " << Elmt.second->getPrintName() << "\n"; 381 } 382 } 383 384 } // namespace bolt 385 } // namespace llvm 386