//===- bolt/Passes/IndirectCallPromotion.cpp ------------------------------===//
//
// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
// See https://llvm.org/LICENSE.txt for license information.
// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
//
//===----------------------------------------------------------------------===//
//
// This file implements the IndirectCallPromotion class.
//
//===----------------------------------------------------------------------===//

#include "bolt/Passes/IndirectCallPromotion.h"
#include "bolt/Passes/BinaryFunctionCallGraph.h"
#include "bolt/Passes/DataflowInfoManager.h"
#include "bolt/Passes/Inliner.h"
#include "llvm/Support/CommandLine.h"

#define DEBUG_TYPE "ICP"
// Execute the debug statement X only when the user-requested verbosity is at
// least Level.
#define DEBUG_VERBOSE(Level, X)                                                \
  if (opts::Verbosity >= (Level)) {                                            \
    X;                                                                         \
  }

using namespace llvm;
using namespace bolt;

namespace opts {

extern cl::OptionCategory BoltOptCategory;

extern cl::opt<IndirectCallPromotionType> ICP;
extern cl::opt<unsigned> Verbosity;
extern cl::opt<unsigned> ExecutionCountThreshold;

static cl::opt<unsigned> ICPJTRemainingPercentThreshold(
    "icp-jt-remaining-percent-threshold",
    cl::desc("The percentage threshold against remaining unpromoted indirect "
             "call count for the promotion for jump tables"),
    cl::init(30), cl::ZeroOrMore, cl::Hidden, cl::cat(BoltOptCategory));

static cl::opt<unsigned> ICPJTTotalPercentThreshold(
    "icp-jt-total-percent-threshold",
    cl::desc(
        "The percentage threshold against total count for the promotion for "
        "jump tables"),
    cl::init(5), cl::Hidden, cl::cat(BoltOptCategory));

static cl::opt<unsigned> ICPCallsRemainingPercentThreshold(
    "icp-calls-remaining-percent-threshold",
    cl::desc("The percentage threshold against remaining unpromoted indirect "
             "call count for the promotion for calls"),
    cl::init(50), cl::Hidden, cl::cat(BoltOptCategory));

static cl::opt<unsigned> ICPCallsTotalPercentThreshold(
    "icp-calls-total-percent-threshold",
    cl::desc(
        "The percentage threshold against total count for the promotion for "
        "calls"),
    cl::init(30), cl::Hidden, cl::cat(BoltOptCategory));

static cl::opt<unsigned> ICPMispredictThreshold(
    "indirect-call-promotion-mispredict-threshold",
    cl::desc("misprediction threshold for skipping ICP on an "
             "indirect call"),
    cl::init(0), cl::cat(BoltOptCategory));

static cl::opt<bool> ICPUseMispredicts(
    "indirect-call-promotion-use-mispredicts",
    cl::desc("use misprediction frequency for determining whether or not ICP "
             "should be applied at a callsite.  The "
             "-indirect-call-promotion-mispredict-threshold value will be used "
             "by this heuristic"),
    cl::cat(BoltOptCategory));

static cl::opt<unsigned>
    ICPTopN("indirect-call-promotion-topn",
            cl::desc("limit number of targets to consider when doing indirect "
                     "call promotion. 0 = no limit"),
            cl::init(3), cl::cat(BoltOptCategory));

static cl::opt<unsigned> ICPCallsTopN(
    "indirect-call-promotion-calls-topn",
    cl::desc("limit number of targets to consider when doing indirect "
             "call promotion on calls. 0 = no limit"),
    cl::init(0), cl::cat(BoltOptCategory));

static cl::opt<unsigned> ICPJumpTablesTopN(
    "indirect-call-promotion-jump-tables-topn",
    cl::desc("limit number of targets to consider when doing indirect "
             "call promotion on jump tables. 0 = no limit"),
    cl::init(0), cl::cat(BoltOptCategory));

static cl::opt<bool> EliminateLoads(
    "icp-eliminate-loads",
    cl::desc("enable load elimination using memory profiling data when "
             "performing ICP"),
    cl::init(true), cl::cat(BoltOptCategory));

static cl::opt<unsigned> ICPTopCallsites(
    "icp-top-callsites",
    cl::desc("optimize hottest calls until at least this percentage of all "
             "indirect calls frequency is covered. 0 = all callsites"),
    cl::init(99), cl::Hidden, cl::cat(BoltOptCategory));

static cl::list<std::string>
    ICPFuncsList("icp-funcs", cl::CommaSeparated,
                 cl::desc("list of functions to enable ICP for"),
                 cl::value_desc("func1,func2,func3,..."), cl::Hidden,
                 cl::cat(BoltOptCategory));

static cl::opt<bool>
    ICPOldCodeSequence("icp-old-code-sequence",
                       cl::desc("use old code sequence for promoted calls"),
                       cl::Hidden, cl::cat(BoltOptCategory));

static cl::opt<bool> ICPJumpTablesByTarget(
    "icp-jump-tables-targets",
    cl::desc(
        "for jump tables, optimize indirect jmp targets instead of indices"),
    cl::Hidden, cl::cat(BoltOptCategory));

static cl::opt<bool> ICPPeelForInline(
    "icp-inline", cl::desc("only promote call targets eligible for inlining"),
    cl::Hidden, cl::cat(BoltOptCategory));

} // namespace opts

// Sanity-check the profile after ICP has run: every positive branch count
// must connect two blocks that both have a non-zero known execution count.
// Returns false (and warns) if any function's profile became inconsistent.
static bool verifyProfile(std::map<uint64_t, BinaryFunction> &BFs) {
  bool IsValid = true;
  for (auto &BFI : BFs) {
    BinaryFunction &BF = BFI.second;
    if (!BF.isSimple())
      continue;
    for (BinaryBasicBlock *BB : BF.layout()) {
      auto BI = BB->branch_info_begin();
      for (BinaryBasicBlock *SuccBB : BB->successors()) {
        if (BI->Count != BinaryBasicBlock::COUNT_NO_PROFILE && BI->Count > 0) {
          if (BB->getKnownExecutionCount() == 0 ||
              SuccBB->getKnownExecutionCount() == 0) {
            errs() << "BOLT-WARNING: profile verification failed after ICP for "
                      "function "
                   << BF << '\n';
            IsValid = false;
          }
        }
        ++BI;
      }
    }
  }
  return IsValid;
}

namespace llvm {
namespace bolt {

// Build a call-site record from one indirect-call profile entry.  When the
// profile resolved the target to a symbol, prefer the symbolic form and drop
// the raw address.
IndirectCallPromotion::Callsite::Callsite(BinaryFunction &BF,
                                          const IndirectCallProfile &ICP)
    : From(BF.getSymbol()), To(ICP.Offset), Mispreds(ICP.Mispreds),
      Branches(ICP.Count) {
  if (ICP.Symbol) {
    To.Sym = ICP.Symbol;
    To.Addr = 0;
  }
}

// Print a human-readable summary of the promotion decision for one call site:
// per-target counts/mispredict percentages, which of the first N targets will
// be optimized, and any jump table indices attached to each target.
void IndirectCallPromotion::printDecision(
    llvm::raw_ostream
        &OS,
    std::vector<IndirectCallPromotion::Callsite> &Targets, unsigned N) const {
  uint64_t TotalCount = 0;
  uint64_t TotalMispreds = 0;
  for (const Callsite &S : Targets) {
    TotalCount += S.Branches;
    TotalMispreds += S.Mispreds;
  }
  // Avoid division by zero in the percentage computations below.
  if (!TotalCount)
    TotalCount = 1;
  if (!TotalMispreds)
    TotalMispreds = 1;

  OS << "BOLT-INFO: ICP decision for call site with " << Targets.size()
     << " targets, Count = " << TotalCount << ", Mispreds = " << TotalMispreds
     << "\n";

  size_t I = 0;
  for (const Callsite &S : Targets) {
    OS << "Count = " << S.Branches << ", "
       << format("%.1f", (100.0 * S.Branches) / TotalCount) << ", "
       << "Mispreds = " << S.Mispreds << ", "
       << format("%.1f", (100.0 * S.Mispreds) / TotalMispreds);
    if (I < N)
      OS << " * to be optimized *";
    if (!S.JTIndices.empty()) {
      OS << " Indices:";
      for (const uint64_t Idx : S.JTIndices)
        OS << " " << Idx;
    }
    OS << "\n";
    // A target standing in for multiple jump table indices occupies that many
    // slots in the "to be optimized" prefix.
    I += S.JTIndices.empty() ? 1 : S.JTIndices.size();
  }
}

// Get list of targets for a given call sorted by most frequently
// called first.
std::vector<IndirectCallPromotion::Callsite>
IndirectCallPromotion::getCallTargets(BinaryBasicBlock &BB,
                                      const MCInst &Inst) const {
  BinaryFunction &BF = *BB.getFunction();
  const BinaryContext &BC = BF.getBinaryContext();
  std::vector<Callsite> Targets;

  if (const JumpTable *JT = BF.getJumpTable(Inst)) {
    // Don't support PIC jump tables for now
    if (!opts::ICPJumpTablesByTarget && JT->Type == JumpTable::JTT_PIC)
      return Targets;
    const Location From(BF.getSymbol());
    const std::pair<size_t, size_t> Range =
        JT->getEntriesForAddress(BC.MIB->getJumpTable(Inst));
    assert(JT->Counts.empty() || JT->Counts.size() >= Range.second);
    JumpTable::JumpInfo DefaultJI;
    // With no per-entry counts, walk a single zeroed JumpInfo (JIAdj == 0
    // keeps JI pinned to DefaultJI).
    const JumpTable::JumpInfo *JI =
        JT->Counts.empty() ? &DefaultJI : &JT->Counts[Range.first];
    const size_t JIAdj = JT->Counts.empty() ? 0 : 1;
    assert(JT->Type == JumpTable::JTT_PIC ||
           JT->EntrySize == BC.AsmInfo->getCodePointerSize());
    for (size_t I = Range.first; I < Range.second; ++I, JI += JIAdj) {
      MCSymbol *Entry = JT->Entries[I];
      assert(BF.getBasicBlockForLabel(Entry) ||
             Entry == BF.getFunctionEndLabel() ||
             Entry == BF.getFunctionColdEndLabel());
      // Skip end-of-function (unreachable) entries.
      if (Entry == BF.getFunctionEndLabel() ||
          Entry == BF.getFunctionColdEndLabel())
        continue;
      const Location To(Entry);
      const BinaryBasicBlock::BinaryBranchInfo &BI = BB.getBranchInfo(Entry);
      Targets.emplace_back(From, To, BI.MispredictedCount, BI.Count,
                           I - Range.first);
    }

    // Sort by symbol then addr.
    llvm::sort(Targets, [](const Callsite &A, const Callsite &B) {
      if (A.To.Sym && B.To.Sym)
        return A.To.Sym < B.To.Sym;
      else if (A.To.Sym && !B.To.Sym)
        return true;
      else if (!A.To.Sym && B.To.Sym)
        return false;
      else
        return A.To.Addr < B.To.Addr;
    });

    // Targets may contain multiple entries to the same target, but using
    // different indices. Their profile will report the same number of branches
    // for different indices if the target is the same. That's because we don't
    // profile the index value, but only the target via LBR.
    //
    // Coalesce adjacent (post-sort) entries with the same symbol into one
    // record carrying all of their jump table indices (std::unique-style
    // compaction).
    auto First = Targets.begin();
    auto Last = Targets.end();
    auto Result = First;
    while (++First != Last) {
      Callsite &A = *Result;
      const Callsite &B = *First;
      if (A.To.Sym && B.To.Sym && A.To.Sym == B.To.Sym)
        A.JTIndices.insert(A.JTIndices.end(), B.JTIndices.begin(),
                           B.JTIndices.end());
      else
        *(++Result) = *First;
    }
    ++Result;

    LLVM_DEBUG(if (Targets.end() - Result > 0) {
      dbgs() << "BOLT-INFO: ICP: " << (Targets.end() - Result)
             << " duplicate targets removed\n";
    });

    Targets.erase(Result, Targets.end());
  } else {
    // Don't try to optimize PC relative indirect calls.
    if (Inst.getOperand(0).isReg() &&
        Inst.getOperand(0).getReg() == BC.MRI->getProgramCounter())
      return Targets;

    const auto ICSP = BC.MIB->tryGetAnnotationAs<IndirectCallSiteProfile>(
        Inst, "CallProfile");
    if (ICSP) {
      for (const IndirectCallProfile &CSP : ICSP.get()) {
        Callsite Site(BF, CSP);
        if (Site.isValid())
          Targets.emplace_back(std::move(Site));
      }
    }
  }

  // Sort by target count, number of indices in case of jump table, and
  // mispredicts. We prioritize targets with high count, small number of
  // indices and high mispredicts. Break ties by selecting targets with lower
  // addresses.
  llvm::stable_sort(Targets, [](const Callsite &A, const Callsite &B) {
    if (A.Branches != B.Branches)
      return A.Branches > B.Branches;
    if (A.JTIndices.size() != B.JTIndices.size())
      return A.JTIndices.size() < B.JTIndices.size();
    if (A.Mispreds != B.Mispreds)
      return A.Mispreds > B.Mispreds;
    return A.To.Addr < B.To.Addr;
  });

  // Remove non-symbol targets
  llvm::erase_if(Targets, [](const Callsite &CS) { return !CS.To.Sym; });

  LLVM_DEBUG(if (BF.getJumpTable(Inst)) {
    uint64_t TotalCount = 0;
    uint64_t TotalMispreds = 0;
    for (const Callsite &S : Targets) {
      TotalCount += S.Branches;
      TotalMispreds += S.Mispreds;
    }
    if (!TotalCount)
      TotalCount = 1;
    if (!TotalMispreds)
      TotalMispreds = 1;

    dbgs() << "BOLT-INFO: ICP: jump table size = " << Targets.size()
           << ", Count = " << TotalCount << ", Mispreds = " << TotalMispreds
           << "\n";

    size_t I = 0;
    for (const Callsite &S : Targets) {
      dbgs() << "Count[" << I << "] = " << S.Branches << ", "
             << format("%.1f", (100.0 * S.Branches) / TotalCount) << ", "
             << "Mispreds[" << I << "] = " << S.Mispreds << ", "
             << format("%.1f", (100.0 * S.Mispreds) / TotalMispreds) << "\n";
      ++I;
    }
  });

  return Targets;
}

// Use the memory-access profile attached to the jump table load to find the
// hottest (count, index) pairs of the table.  On success, also records the
// index register as a "JTIndexReg" annotation on CallInst and returns the
// load instruction via TargetFetchInst.  Returns an empty result whenever the
// profile is missing, stale, or inconsistent with the table layout.
IndirectCallPromotion::JumpTableInfoType
IndirectCallPromotion::maybeGetHotJumpTableTargets(BinaryBasicBlock &BB,
                                                   MCInst &CallInst,
                                                   MCInst *&TargetFetchInst,
                                                   const JumpTable *JT) const {
  assert(JT && "Can't get jump table addrs for non-jump tables.");

  BinaryFunction &Function = *BB.getFunction();
  BinaryContext &BC = Function.getBinaryContext();

  if (!Function.hasMemoryProfile() || !opts::EliminateLoads)
    return JumpTableInfoType();

  JumpTableInfoType HotTargets;
  MCInst *MemLocInstr;
  MCInst *PCRelBaseOut;
  unsigned BaseReg, IndexReg;
  int64_t DispValue;
  const MCExpr *DispExpr;
  MutableArrayRef<MCInst>
      Insts(&BB.front(), &CallInst);
  const IndirectBranchType Type = BC.MIB->analyzeIndirectBranch(
      CallInst, Insts.begin(), Insts.end(), BC.AsmInfo->getCodePointerSize(),
      MemLocInstr, BaseReg, IndexReg, DispValue, DispExpr, PCRelBaseOut);

  assert(MemLocInstr && "There should always be a load for jump tables");
  if (!MemLocInstr)
    return JumpTableInfoType();

  LLVM_DEBUG({
    dbgs() << "BOLT-INFO: ICP attempting to find memory profiling data for "
           << "jump table in " << Function << " at @ "
           << (&CallInst - &BB.front()) << "\n"
           << "BOLT-INFO: ICP target fetch instructions:\n";
    BC.printInstruction(dbgs(), *MemLocInstr, 0, &Function);
    if (MemLocInstr != &CallInst)
      BC.printInstruction(dbgs(), CallInst, 0, &Function);
  });

  DEBUG_VERBOSE(1, {
    dbgs() << "Jmp info: Type = " << (unsigned)Type << ", "
           << "BaseReg = " << BC.MRI->getName(BaseReg) << ", "
           << "IndexReg = " << BC.MRI->getName(IndexReg) << ", "
           << "DispValue = " << Twine::utohexstr(DispValue) << ", "
           << "DispExpr = " << DispExpr << ", "
           << "MemLocInstr = ";
    BC.printInstruction(dbgs(), *MemLocInstr, 0, &Function);
    dbgs() << "\n";
  });

  ++TotalIndexBasedCandidates;

  auto ErrorOrMemAccesssProfile =
      BC.MIB->tryGetAnnotationAs<MemoryAccessProfile>(*MemLocInstr,
                                                      "MemoryAccessProfile");
  if (!ErrorOrMemAccesssProfile) {
    DEBUG_VERBOSE(1, dbgs()
                         << "BOLT-INFO: ICP no memory profiling data found\n");
    return JumpTableInfoType();
  }
  MemoryAccessProfile &MemAccessProfile = ErrorOrMemAccesssProfile.get();

  // Compute the runtime start address of the jump table array from the
  // load's displacement (either a symbolic expression or a literal value).
  uint64_t ArrayStart;
  if (DispExpr) {
    ErrorOr<uint64_t> DispValueOrError =
        BC.getSymbolValue(*BC.MIB->getTargetSymbol(DispExpr));
    assert(DispValueOrError && "global symbol needs a value");
    ArrayStart = *DispValueOrError;
  } else {
    ArrayStart = static_cast<uint64_t>(DispValue);
  }

  // RIP-relative addressing: rebase against the instruction following the
  // load.
  if (BaseReg == BC.MRI->getProgramCounter())
    ArrayStart += Function.getAddress() + MemAccessProfile.NextInstrOffset;

  // This is a map of [symbol] -> [count, index] and is used to combine indices
  // into the jump table since there may be multiple addresses that all have the
  // same entry.
  std::map<MCSymbol *, std::pair<uint64_t, uint64_t>> HotTargetMap;
  const std::pair<size_t, size_t> Range = JT->getEntriesForAddress(ArrayStart);

  for (const AddressAccess &AccessInfo : MemAccessProfile.AddressAccessInfo) {
    size_t Index;
    // Mem data occasionally includes nullptrs, ignore them.
    if (!AccessInfo.MemoryObject && !AccessInfo.Offset)
      continue;

    if (AccessInfo.Offset % JT->EntrySize != 0) // ignore bogus data
      return JumpTableInfoType();

    if (AccessInfo.MemoryObject) {
      // Deal with bad/stale data
      if (!AccessInfo.MemoryObject->getName().startswith(
              "JUMP_TABLE/" + Function.getOneName().str()))
        return JumpTableInfoType();
      Index =
          (AccessInfo.Offset - (ArrayStart - JT->getAddress())) / JT->EntrySize;
    } else {
      Index = (AccessInfo.Offset - ArrayStart) / JT->EntrySize;
    }

    // If Index is out of range it probably means the memory profiling data is
    // wrong for this instruction, bail out.
    if (Index >= Range.second) {
      LLVM_DEBUG(dbgs() << "BOLT-INFO: Index out of range of " << Range.first
                        << ", " << Range.second << "\n");
      return JumpTableInfoType();
    }

    // Make sure the hot index points at a legal label corresponding to a BB,
    // e.g. not the end of function (unreachable) label.
    if (!Function.getBasicBlockForLabel(JT->Entries[Index + Range.first])) {
      LLVM_DEBUG({
        dbgs() << "BOLT-INFO: hot index " << Index << " pointing at bogus "
               << "label " << JT->Entries[Index + Range.first]->getName()
               << " in jump table:\n";
        JT->print(dbgs());
        dbgs() << "HotTargetMap:\n";
        for (std::pair<MCSymbol *const, std::pair<uint64_t, uint64_t>> &HT :
             HotTargetMap)
          dbgs() << "BOLT-INFO: " << HT.first->getName()
                 << " = (count=" << HT.second.first
                 << ", index=" << HT.second.second << ")\n";
      });
      return JumpTableInfoType();
    }

    // Accumulate counts per symbol; the recorded index is the last one seen
    // for that symbol.
    std::pair<uint64_t, uint64_t> &HotTarget =
        HotTargetMap[JT->Entries[Index + Range.first]];
    HotTarget.first += AccessInfo.Count;
    HotTarget.second = Index;
  }

  llvm::transform(
      HotTargetMap, std::back_inserter(HotTargets),
      [](const std::pair<MCSymbol *, std::pair<uint64_t, uint64_t>> &A) {
        return A.second;
      });

  // Sort with highest counts first.
  llvm::sort(reverse(HotTargets));

  LLVM_DEBUG({
    dbgs() << "BOLT-INFO: ICP jump table hot targets:\n";
    for (const std::pair<uint64_t, uint64_t> &Target : HotTargets)
      dbgs() << "BOLT-INFO: Idx = " << Target.second << ", "
             << "Count = " << Target.first << "\n";
  });

  BC.MIB->getOrCreateAnnotationAs<uint16_t>(CallInst, "JTIndexReg") = IndexReg;

  TargetFetchInst = MemLocInstr;

  return HotTargets;
}

// Resolve the first N call targets to (symbol, jump-table-index) pairs.  For
// jump tables, the memory profile (when available) is used to re-rank targets
// by hot index, and N is updated to reflect how many made the cut.
IndirectCallPromotion::SymTargetsType
IndirectCallPromotion::findCallTargetSymbols(std::vector<Callsite> &Targets,
                                             size_t &N, BinaryBasicBlock &BB,
                                             MCInst &CallInst,
                                             MCInst *&TargetFetchInst) const {
  const JumpTable *JT = BB.getFunction()->getJumpTable(CallInst);
  SymTargetsType SymTargets;

  if (!JT) {
    // Plain indirect call: targets map directly to symbols, index unused (0).
    for (size_t I = 0; I < N; ++I) {
      assert(Targets[I].To.Sym && "All ICP targets must be to known symbols");
      assert(Targets[I].JTIndices.empty() &&
             "Can't have jump table indices for non-jump tables");
      SymTargets.emplace_back(Targets[I].To.Sym, 0);
    }
    return SymTargets;
  }

  // Use memory profile to select hot targets.
  JumpTableInfoType HotTargets =
      maybeGetHotJumpTableTargets(BB, CallInst, TargetFetchInst, JT);

  // Map a jump table index back to the Targets entry that carries it.
  auto findTargetsIndex = [&](uint64_t JTIndex) {
    for (size_t I = 0; I < Targets.size(); ++I)
      if (llvm::is_contained(Targets[I].JTIndices, JTIndex))
        return I;
    LLVM_DEBUG(dbgs() << "BOLT-ERROR: Unable to find target index for hot jump "
                      << " table entry in " << *BB.getFunction() << "\n");
    llvm_unreachable("Hot indices must be referred to by at least one "
                     "callsite");
  };

  if (!HotTargets.empty()) {
    if (opts::Verbosity >= 1)
      for (size_t I = 0; I < HotTargets.size(); ++I)
        outs() << "BOLT-INFO: HotTarget[" << I << "] = (" << HotTargets[I].first
               << ", " << HotTargets[I].second << ")\n";

    // Recompute hottest targets, now discriminating which index is hot
    // NOTE: This is a tradeoff. On one hand, we get index information. On the
    // other hand, info coming from the memory profile is much less accurate
    // than LBRs. So we may actually end up working with more coarse
    // profile granularity in exchange for information about indices.
    std::vector<Callsite> NewTargets;
    std::map<const MCSymbol *, uint32_t> IndicesPerTarget;
    uint64_t TotalMemAccesses = 0;
    for (size_t I = 0; I < HotTargets.size(); ++I) {
      const uint64_t TargetIndex = findTargetsIndex(HotTargets[I].second);
      ++IndicesPerTarget[Targets[TargetIndex].To.Sym];
      TotalMemAccesses += HotTargets[I].first;
    }
    uint64_t RemainingMemAccesses = TotalMemAccesses;
    const size_t TopN =
        opts::ICPJumpTablesTopN ? opts::ICPJumpTablesTopN : opts::ICPTopN;
    size_t I = 0;
    // Promote hot indices front-to-back until one of the percentage or TopN
    // thresholds stops us; I ends up as the new N.
    for (; I < HotTargets.size(); ++I) {
      const uint64_t MemAccesses = HotTargets[I].first;
      if (100 * MemAccesses <
          TotalMemAccesses * opts::ICPJTTotalPercentThreshold)
        break;
      if (100 * MemAccesses <
          RemainingMemAccesses * opts::ICPJTRemainingPercentThreshold)
        break;
      if (TopN && I >= TopN)
        break;
      RemainingMemAccesses -= MemAccesses;

      const uint64_t JTIndex = HotTargets[I].second;
      Callsite &Target = Targets[findTargetsIndex(JTIndex)];

      // Split this single hot index out of Target into its own record.
      NewTargets.push_back(Target);
      std::vector<uint64_t>({JTIndex}).swap(NewTargets.back().JTIndices);
      llvm::erase_value(Target.JTIndices, JTIndex);

      // Keep fixCFG counts sane if more indices use this same target later
      assert(IndicesPerTarget[Target.To.Sym] > 0 && "wrong map");
      NewTargets.back().Branches =
          Target.Branches / IndicesPerTarget[Target.To.Sym];
      NewTargets.back().Mispreds =
          Target.Mispreds / IndicesPerTarget[Target.To.Sym];
      assert(Target.Branches >= NewTargets.back().Branches);
      assert(Target.Mispreds >= NewTargets.back().Mispreds);
      Target.Branches -= NewTargets.back().Branches;
      Target.Mispreds -= NewTargets.back().Mispreds;
    }
    // Promoted records first, then the (count-adjusted) remainder.
    llvm::copy(Targets, std::back_inserter(NewTargets));
    std::swap(NewTargets, Targets);
    N = I;

    if (N == 0 && opts::Verbosity >= 1) {
      outs() << "BOLT-INFO: ICP failed in " << *BB.getFunction() << " in "
             << BB.getName() << ": failed to meet thresholds after memory "
             << "profile data was loaded.\n";
      return SymTargets;
    }
  }

  // Emit one (symbol, index) pair per jump table index of each selected
  // target; I counts emitted pairs, TgtIdx walks the targets.
  for (size_t I = 0, TgtIdx = 0; I < N; ++TgtIdx) {
    Callsite &Target = Targets[TgtIdx];
    assert(Target.To.Sym && "All ICP targets must be to known symbols");
    assert(!Target.JTIndices.empty() && "Jump tables must have indices");
    for (uint64_t Idx : Target.JTIndices) {
      SymTargets.emplace_back(Target.To.Sym, Idx);
      ++I;
    }
  }

  return SymTargets;
}

// Use the memory-access profile of the method-load instruction to find, for
// each promoted target method, the vtable symbol (plus addend) it was fetched
// from.  Returns an empty MethodInfoType if the call cannot be analyzed, the
// profile is missing, any method's vtable cannot be resolved, or the vtable
// register may be clobbered before the call.
IndirectCallPromotion::MethodInfoType IndirectCallPromotion::maybeGetVtableSyms(
    BinaryBasicBlock &BB, MCInst &Inst,
    const SymTargetsType &SymTargets) const {
  BinaryFunction &Function = *BB.getFunction();
  BinaryContext &BC = Function.getBinaryContext();
  std::vector<std::pair<MCSymbol *, uint64_t>> VtableSyms;
  std::vector<MCInst *> MethodFetchInsns;
  unsigned VtableReg, MethodReg;
  uint64_t MethodOffset;

  assert(!Function.getJumpTable(Inst) &&
         "Can't get vtable addrs for jump tables.");

  if (!Function.hasMemoryProfile() || !opts::EliminateLoads)
    return MethodInfoType();

  MutableArrayRef<MCInst> Insts(&BB.front(), &Inst + 1);
  if (!BC.MIB->analyzeVirtualMethodCall(Insts.begin(), Insts.end(),
                                        MethodFetchInsns, VtableReg, MethodReg,
                                        MethodOffset)) {
    DEBUG_VERBOSE(
        1, dbgs() << "BOLT-INFO: ICP unable to analyze method call in "
                  << Function << " at @ " << (&Inst - &BB.front()) << "\n");
    return MethodInfoType();
  }

  ++TotalMethodLoadEliminationCandidates;

  DEBUG_VERBOSE(1, {
    dbgs() << "BOLT-INFO: ICP found virtual method call in " << Function
           << " at @ " << (&Inst - &BB.front()) << "\n";
    dbgs() << "BOLT-INFO: ICP method fetch instructions:\n";
    for (MCInst *Inst : MethodFetchInsns)
      BC.printInstruction(dbgs(), *Inst, 0, &Function);

    if (MethodFetchInsns.back() != &Inst)
      BC.printInstruction(dbgs(), Inst, 0, &Function);
  });

  // Try to get value profiling data for the method load instruction.
  auto ErrorOrMemAccesssProfile =
      BC.MIB->tryGetAnnotationAs<MemoryAccessProfile>(*MethodFetchInsns.back(),
                                                      "MemoryAccessProfile");
  if (!ErrorOrMemAccesssProfile) {
    DEBUG_VERBOSE(1, dbgs()
                         << "BOLT-INFO: ICP no memory profiling data found\n");
    return MethodInfoType();
  }
  MemoryAccessProfile &MemAccessProfile = ErrorOrMemAccesssProfile.get();

  // Find the vtable that each method belongs to.
  std::map<const MCSymbol *, uint64_t> MethodToVtable;

  for (const AddressAccess &AccessInfo : MemAccessProfile.AddressAccessInfo) {
    uint64_t Address = AccessInfo.Offset;
    if (AccessInfo.MemoryObject)
      Address += AccessInfo.MemoryObject->getAddress();

    // Ignore bogus data.
    if (!Address)
      continue;

    // The accessed slot minus the method offset gives the vtable base.
    const uint64_t VtableBase = Address - MethodOffset;

    DEBUG_VERBOSE(1, dbgs() << "BOLT-INFO: ICP vtable = "
                            << Twine::utohexstr(VtableBase) << "+"
                            << MethodOffset << "/" << AccessInfo.Count << "\n");

    if (ErrorOr<uint64_t> MethodAddr = BC.getPointerAtAddress(Address)) {
      BinaryData *MethodBD = BC.getBinaryDataAtAddress(MethodAddr.get());
      if (!MethodBD) // skip unknown methods
        continue;
      MCSymbol *MethodSym = MethodBD->getSymbol();
      MethodToVtable[MethodSym] = VtableBase;
      DEBUG_VERBOSE(1, {
        const BinaryFunction *Method = BC.getFunctionForSymbol(MethodSym);
        dbgs() << "BOLT-INFO: ICP found method = "
               << Twine::utohexstr(MethodAddr.get()) << "/"
               << (Method ? Method->getPrintName() : "") << "\n";
      });
    }
  }

  // Find the vtable for each target symbol.
  for (size_t I = 0; I < SymTargets.size(); ++I) {
    auto Itr = MethodToVtable.find(SymTargets[I].first);
    if (Itr != MethodToVtable.end()) {
      if (BinaryData *BD = BC.getBinaryDataContainingAddress(Itr->second)) {
        const uint64_t Addend = Itr->second - BD->getAddress();
        VtableSyms.emplace_back(BD->getSymbol(), Addend);
        continue;
      }
    }
    // Give up if we can't find the vtable for a method.
    DEBUG_VERBOSE(1, dbgs() << "BOLT-INFO: ICP can't find vtable for "
                            << SymTargets[I].first->getName() << "\n");
    return MethodInfoType();
  }

  // Make sure the vtable reg is not clobbered by the argument passing code
  if (VtableReg != MethodReg) {
    for (MCInst *CurInst = MethodFetchInsns.front(); CurInst < &Inst;
         ++CurInst) {
      const MCInstrDesc &InstrInfo = BC.MII->get(CurInst->getOpcode());
      if (InstrInfo.hasDefOfPhysReg(*CurInst, VtableReg, *BC.MRI))
        return MethodInfoType();
    }
  }

  return MethodInfoType(VtableSyms, MethodFetchInsns);
}

// Replace the indirect call in IndCallBlock with the promoted code sequence
// in ICPcode: the first entry of ICPcode goes into IndCallBlock itself and
// each subsequent (label, instructions) entry becomes a new basic block.
// Returns the new blocks (CFG edges are wired up later by fixCFG).
std::vector<std::unique_ptr<BinaryBasicBlock>>
IndirectCallPromotion::rewriteCall(
    BinaryBasicBlock &IndCallBlock, const MCInst &CallInst,
    MCPlusBuilder::BlocksVectorTy &&ICPcode,
    const std::vector<MCInst *> &MethodFetchInsns) const {
  BinaryFunction &Function = *IndCallBlock.getFunction();
  MCPlusBuilder *MIB = Function.getBinaryContext().MIB.get();

  // Create new basic blocks with correct code in each one first.
  std::vector<std::unique_ptr<BinaryBasicBlock>> NewBBs;
  const bool IsTailCallOrJT =
      (MIB->isTailCall(CallInst) || Function.getJumpTable(CallInst));

  // Move instructions from the tail of the original call block
  // to the merge block.

  // Remember any pseudo instructions following a tail call. These
  // must be preserved and moved to the original block.
  InstructionListType TailInsts;
  const MCInst *TailInst = &CallInst;
  if (IsTailCallOrJT)
    while (TailInst + 1 < &(*IndCallBlock.end()) &&
           MIB->isPseudo(*(TailInst + 1)))
      TailInsts.push_back(*++TailInst);

  InstructionListType MovedInst = IndCallBlock.splitInstructions(&CallInst);
  // Link new BBs to the original input offset of the BB where the indirect
  // call site is, so we can map samples recorded in new BBs back to the
  // original BB seen in the input binary (if using BAT)
  const uint32_t OrigOffset = IndCallBlock.getInputOffset();

  IndCallBlock.eraseInstructions(MethodFetchInsns.begin(),
                                 MethodFetchInsns.end());
  // Append the first promoted code sequence, or substitute it for the last
  // remaining instruction when one is still in place.
  if (IndCallBlock.empty() ||
      (!MethodFetchInsns.empty() && MethodFetchInsns.back() == &CallInst))
    IndCallBlock.addInstructions(ICPcode.front().second.begin(),
                                 ICPcode.front().second.end());
  else
    IndCallBlock.replaceInstruction(std::prev(IndCallBlock.end()),
                                    ICPcode.front().second);
  IndCallBlock.addInstructions(TailInsts.begin(), TailInsts.end());

  // Each remaining (label, insts) pair becomes its own new basic block.
  for (auto Itr = ICPcode.begin() + 1; Itr != ICPcode.end(); ++Itr) {
    MCSymbol *&Sym = Itr->first;
    InstructionListType &Insts = Itr->second;
    assert(Sym);
    std::unique_ptr<BinaryBasicBlock> TBB = Function.createBasicBlock(Sym);
    TBB->setOffset(OrigOffset);
    for (MCInst &Inst : Insts) // sanitize new instructions.
      if (MIB->isCall(Inst))
        MIB->removeAnnotation(Inst, "CallProfile");
    TBB->addInstructions(Insts.begin(), Insts.end());
    NewBBs.emplace_back(std::move(TBB));
  }

  // Move tail of instructions from after the original call to
  // the merge block.
  if (!IsTailCallOrJT)
    NewBBs.back()->addInstructions(MovedInst.begin(), MovedInst.end());

  return NewBBs;
}

// Wire the blocks produced by rewriteCall into the CFG and distribute the
// original block's execution count across them.  Returns the merge block
// (where control re-joins after a promoted non-tail call) or nullptr for
// tail calls and jump tables.
BinaryBasicBlock *
IndirectCallPromotion::fixCFG(BinaryBasicBlock &IndCallBlock,
                              const bool IsTailCall, const bool IsJumpTable,
                              IndirectCallPromotion::BasicBlocksVector &&NewBBs,
                              const std::vector<Callsite> &Targets) const {
  BinaryFunction &Function = *IndCallBlock.getFunction();
  using BinaryBranchInfo = BinaryBasicBlock::BinaryBranchInfo;
  BinaryBasicBlock *MergeBlock = nullptr;

  // Scale indirect call counts to the execution count of the original
  // basic block containing the indirect call.
  uint64_t TotalCount = IndCallBlock.getKnownExecutionCount();
  uint64_t TotalIndirectBranches = 0;
  for (const Callsite &Target : Targets)
    TotalIndirectBranches += Target.Branches;
  if (TotalIndirectBranches == 0)
    TotalIndirectBranches = 1;
  // BBI holds raw per-target counts (split evenly across a target's jump
  // table indices); ScaledBBI holds the same scaled to the block's count.
  BinaryBasicBlock::BranchInfoType BBI;
  BinaryBasicBlock::BranchInfoType ScaledBBI;
  for (const Callsite &Target : Targets) {
    const size_t NumEntries =
        std::max(static_cast<std::size_t>(1UL), Target.JTIndices.size());
    for (size_t I = 0; I < NumEntries; ++I) {
      BBI.push_back(
          BinaryBranchInfo{(Target.Branches + NumEntries - 1) / NumEntries,
                           (Target.Mispreds + NumEntries - 1) / NumEntries});
      ScaledBBI.push_back(
          BinaryBranchInfo{uint64_t(TotalCount * Target.Branches /
                                    (NumEntries * TotalIndirectBranches)),
                           uint64_t(TotalCount * Target.Mispreds /
                                    (NumEntries * TotalIndirectBranches))});
    }
  }

  if (IsJumpTable) {
    // The last new block holds the residual indirect jump; it inherits all
    // of the original block's successors.
    BinaryBasicBlock *NewIndCallBlock = NewBBs.back().get();
    IndCallBlock.moveAllSuccessorsTo(NewIndCallBlock);

    std::vector<MCSymbol *> SymTargets;
    for (const Callsite &Target : Targets) {
      const size_t NumEntries =
          std::max(static_cast<std::size_t>(1UL), Target.JTIndices.size());
      for (size_t I = 0; I < NumEntries; ++I)
        SymTargets.push_back(Target.To.Sym);
    }
    assert(SymTargets.size() > NewBBs.size() - 1 &&
           "There must be a target symbol associated with each new BB.");

    // Chain comparison blocks: each block branches to its promoted target
    // (taken) and falls through to the next comparison block.
    for (uint64_t I = 0; I < NewBBs.size(); ++I) {
      BinaryBasicBlock *SourceBB = I ? NewBBs[I - 1].get() : &IndCallBlock;
      SourceBB->setExecutionCount(TotalCount);

      BinaryBasicBlock *TargetBB =
          Function.getBasicBlockForLabel(SymTargets[I]);
      SourceBB->addSuccessor(TargetBB, ScaledBBI[I]); // taken

      TotalCount -= ScaledBBI[I].Count;
      SourceBB->addSuccessor(NewBBs[I].get(), TotalCount); // fall-through

      // Update branch info for the indirect jump.
      BinaryBasicBlock::BinaryBranchInfo &BranchInfo =
          NewIndCallBlock->getBranchInfo(*TargetBB);
      if (BranchInfo.Count > BBI[I].Count)
        BranchInfo.Count -= BBI[I].Count;
      else
        BranchInfo.Count = 0;

      if (BranchInfo.MispredictedCount > BBI[I].MispredictedCount)
        BranchInfo.MispredictedCount -= BBI[I].MispredictedCount;
      else
        BranchInfo.MispredictedCount = 0;
    }
  } else {
    // Non-jump-table case: new blocks alternate between call blocks (even
    // index) and comparison blocks (odd index), with an optional trailing
    // merge block for non-tail calls.
    assert(NewBBs.size() >= 2);
    assert(NewBBs.size() % 2 == 1 || IndCallBlock.succ_empty());
    assert(NewBBs.size() % 2 == 1 || IsTailCall);

    // Walk the scaled counts, reducing TotalCount to what's left after each
    // promoted target is peeled off.
    auto ScaledBI = ScaledBBI.begin();
    auto updateCurrentBranchInfo = [&] {
      assert(ScaledBI != ScaledBBI.end());
      TotalCount -= ScaledBI->Count;
      ++ScaledBI;
    };

    if (!IsTailCall) {
      MergeBlock = NewBBs.back().get();
      IndCallBlock.moveAllSuccessorsTo(MergeBlock);
    }

    // Fix up successors and execution counts.
    updateCurrentBranchInfo();
    IndCallBlock.addSuccessor(NewBBs[1].get(), TotalCount);
    IndCallBlock.addSuccessor(NewBBs[0].get(), ScaledBBI[0]);

    const size_t Adj = IsTailCall ? 1 : 2;
    for (size_t I = 0; I < NewBBs.size() - Adj; ++I) {
      assert(TotalCount <= IndCallBlock.getExecutionCount() ||
             TotalCount <= uint64_t(TotalIndirectBranches));
      uint64_t ExecCount = ScaledBBI[(I + 1) / 2].Count;
      if (I % 2 == 0) {
        // Call block: jumps to the merge block (if any) after the call.
        if (MergeBlock)
          NewBBs[I]->addSuccessor(MergeBlock, ScaledBBI[(I + 1) / 2].Count);
      } else {
        // Comparison block: next comparison is the fall-through, next call
        // block is the taken edge.
        assert(I + 2 < NewBBs.size());
        updateCurrentBranchInfo();
        NewBBs[I]->addSuccessor(NewBBs[I + 2].get(), TotalCount);
        NewBBs[I]->addSuccessor(NewBBs[I + 1].get(), ScaledBBI[(I + 1) / 2]);
        ExecCount += TotalCount;
      }
      NewBBs[I]->setExecutionCount(ExecCount);
    }

    if (MergeBlock) {
      // Arrange for the MergeBlock to be the fallthrough for the first
      // promoted call block.
      std::unique_ptr<BinaryBasicBlock> MBPtr;
      std::swap(MBPtr, NewBBs.back());
      NewBBs.pop_back();
      NewBBs.emplace(NewBBs.begin() + 1, std::move(MBPtr));
      // TODO: is COUNT_FALLTHROUGH_EDGE the right thing here?
      NewBBs.back()->addSuccessor(MergeBlock, TotalCount); // uncond branch
    }
  }

  // Update the execution count.
  NewBBs.back()->setExecutionCount(TotalCount);

  // Update BB and BB layout.
906 Function.insertBasicBlocks(&IndCallBlock, std::move(NewBBs)); 907 assert(Function.validateCFG()); 908 909 return MergeBlock; 910 } 911 912 size_t IndirectCallPromotion::canPromoteCallsite( 913 const BinaryBasicBlock &BB, const MCInst &Inst, 914 const std::vector<Callsite> &Targets, uint64_t NumCalls) { 915 BinaryFunction *BF = BB.getFunction(); 916 const BinaryContext &BC = BF->getBinaryContext(); 917 918 if (BB.getKnownExecutionCount() < opts::ExecutionCountThreshold) 919 return 0; 920 921 const bool IsJumpTable = BF->getJumpTable(Inst); 922 923 auto computeStats = [&](size_t N) { 924 for (size_t I = 0; I < N; ++I) 925 if (IsJumpTable) 926 TotalNumFrequentJmps += Targets[I].Branches; 927 else 928 TotalNumFrequentCalls += Targets[I].Branches; 929 }; 930 931 // If we have no targets (or no calls), skip this callsite. 932 if (Targets.empty() || !NumCalls) { 933 if (opts::Verbosity >= 1) { 934 const ptrdiff_t InstIdx = &Inst - &(*BB.begin()); 935 outs() << "BOLT-INFO: ICP failed in " << *BF << " @ " << InstIdx << " in " 936 << BB.getName() << ", calls = " << NumCalls 937 << ", targets empty or NumCalls == 0.\n"; 938 } 939 return 0; 940 } 941 942 size_t TopN = opts::ICPTopN; 943 if (IsJumpTable) 944 TopN = opts::ICPJumpTablesTopN ? opts::ICPJumpTablesTopN : TopN; 945 else 946 TopN = opts::ICPCallsTopN ? opts::ICPCallsTopN : TopN; 947 948 const size_t TrialN = TopN ? std::min(TopN, Targets.size()) : Targets.size(); 949 950 if (opts::ICPTopCallsites > 0) { 951 if (!BC.MIB->hasAnnotation(Inst, "DoICP")) 952 return 0; 953 } 954 955 // Pick the top N targets. 956 uint64_t TotalMispredictsTopN = 0; 957 size_t N = 0; 958 959 if (opts::ICPUseMispredicts && 960 (!IsJumpTable || opts::ICPJumpTablesByTarget)) { 961 // Count total number of mispredictions for (at most) the top N targets. 962 // We may choose a smaller N (TrialN vs. N) if the frequency threshold 963 // is exceeded by fewer targets. 
    // Stop once the accumulated mispredict frequency exhausts the threshold.
    double Threshold = double(opts::ICPMispredictThreshold);
    for (size_t I = 0; I < TrialN && Threshold > 0; ++I, ++N) {
      Threshold -= (100.0 * Targets[I].Mispreds) / NumCalls;
      TotalMispredictsTopN += Targets[I].Mispreds;
    }
    computeStats(N);

    // Compute the misprediction frequency of the top N call targets. If this
    // frequency is greater than the threshold, we should try ICP on this
    // callsite.
    const double TopNFrequency = (100.0 * TotalMispredictsTopN) / NumCalls;
    if (TopNFrequency == 0 || TopNFrequency < opts::ICPMispredictThreshold) {
      if (opts::Verbosity >= 1) {
        const ptrdiff_t InstIdx = &Inst - &(*BB.begin());
        outs() << "BOLT-INFO: ICP failed in " << *BF << " @ " << InstIdx
               << " in " << BB.getName() << ", calls = " << NumCalls
               << ", top N mis. frequency " << format("%.1f", TopNFrequency)
               << "% < " << opts::ICPMispredictThreshold << "%\n";
      }
      return 0;
    }
  } else {
    // Frequency-based selection (the default path).
    size_t MaxTargets = 0;

    // Count total number of calls for (at most) the top N targets.
    // We may choose a smaller N (TrialN vs. N) if the frequency threshold
    // is exceeded by fewer targets.
    const unsigned TotalThreshold = IsJumpTable
                                        ? opts::ICPJTTotalPercentThreshold
                                        : opts::ICPCallsTotalPercentThreshold;
    const unsigned RemainingThreshold =
        IsJumpTable ? opts::ICPJTRemainingPercentThreshold
                    : opts::ICPCallsRemainingPercentThreshold;
    uint64_t NumRemainingCalls = NumCalls;
    // Targets arrive hottest-first; stop at the first one below either the
    // total-count or remaining-count percentage threshold, or when adding its
    // jump-table entries would exceed the top-N budget.
    for (size_t I = 0; I < TrialN; ++I, ++MaxTargets) {
      if (100 * Targets[I].Branches < NumCalls * TotalThreshold)
        break;
      if (100 * Targets[I].Branches < NumRemainingCalls * RemainingThreshold)
        break;
      if (N + (Targets[I].JTIndices.empty() ? 1 : Targets[I].JTIndices.size()) >
          TrialN)
        break;
      TotalMispredictsTopN += Targets[I].Mispreds;
      NumRemainingCalls -= Targets[I].Branches;
      N += Targets[I].JTIndices.empty() ? 1 : Targets[I].JTIndices.size();
    }
    computeStats(MaxTargets);

    // Don't check misprediction frequency for jump tables -- we don't really
    // care as long as we are saving loads from the jump table.
    if (!IsJumpTable || opts::ICPJumpTablesByTarget) {
      // Compute the misprediction frequency of the top N call targets. If
      // this frequency is less than the threshold, we should skip ICP at
      // this callsite.
      const double TopNMispredictFrequency =
          (100.0 * TotalMispredictsTopN) / NumCalls;

      if (TopNMispredictFrequency < opts::ICPMispredictThreshold) {
        if (opts::Verbosity >= 1) {
          const ptrdiff_t InstIdx = &Inst - &(*BB.begin());
          outs() << "BOLT-INFO: ICP failed in " << *BF << " @ " << InstIdx
                 << " in " << BB.getName() << ", calls = " << NumCalls
                 << ", top N mispredict frequency "
                 << format("%.1f", TopNMispredictFrequency) << "% < "
                 << opts::ICPMispredictThreshold << "%\n";
        }
        return 0;
      }
    }
  }

  // Filter by inline-ability of target functions, stop at first target that
  // can't be inlined.
1037 if (opts::ICPPeelForInline) { 1038 for (size_t I = 0; I < N; ++I) { 1039 const MCSymbol *TargetSym = Targets[I].To.Sym; 1040 const BinaryFunction *TargetBF = BC.getFunctionForSymbol(TargetSym); 1041 if (!BinaryFunctionPass::shouldOptimize(*TargetBF) || 1042 getInliningInfo(*TargetBF).Type == InliningType::INL_NONE) { 1043 N = I; 1044 break; 1045 } 1046 } 1047 } 1048 1049 // Filter functions that can have ICP applied (for debugging) 1050 if (!opts::ICPFuncsList.empty()) { 1051 for (std::string &Name : opts::ICPFuncsList) 1052 if (BF->hasName(Name)) 1053 return N; 1054 return 0; 1055 } 1056 1057 return N; 1058 } 1059 1060 void IndirectCallPromotion::printCallsiteInfo( 1061 const BinaryBasicBlock &BB, const MCInst &Inst, 1062 const std::vector<Callsite> &Targets, const size_t N, 1063 uint64_t NumCalls) const { 1064 BinaryContext &BC = BB.getFunction()->getBinaryContext(); 1065 const bool IsTailCall = BC.MIB->isTailCall(Inst); 1066 const bool IsJumpTable = BB.getFunction()->getJumpTable(Inst); 1067 const ptrdiff_t InstIdx = &Inst - &(*BB.begin()); 1068 1069 outs() << "BOLT-INFO: ICP candidate branch info: " << *BB.getFunction() 1070 << " @ " << InstIdx << " in " << BB.getName() 1071 << " -> calls = " << NumCalls 1072 << (IsTailCall ? " (tail)" : (IsJumpTable ? " (jump table)" : "")) 1073 << "\n"; 1074 for (size_t I = 0; I < N; I++) { 1075 const double Frequency = 100.0 * Targets[I].Branches / NumCalls; 1076 const double MisFrequency = 100.0 * Targets[I].Mispreds / NumCalls; 1077 outs() << "BOLT-INFO: "; 1078 if (Targets[I].To.Sym) 1079 outs() << Targets[I].To.Sym->getName(); 1080 else 1081 outs() << Targets[I].To.Addr; 1082 outs() << ", calls = " << Targets[I].Branches 1083 << ", mispreds = " << Targets[I].Mispreds 1084 << ", taken freq = " << format("%.1f", Frequency) << "%" 1085 << ", mis. freq = " << format("%.1f", MisFrequency) << "%"; 1086 bool First = true; 1087 for (uint64_t JTIndex : Targets[I].JTIndices) { 1088 outs() << (First ? 
", indices = " : ", ") << JTIndex; 1089 First = false; 1090 } 1091 outs() << "\n"; 1092 } 1093 1094 LLVM_DEBUG({ 1095 dbgs() << "BOLT-INFO: ICP original call instruction:"; 1096 BC.printInstruction(dbgs(), Inst, Targets[0].From.Addr, nullptr, true); 1097 }); 1098 } 1099 1100 void IndirectCallPromotion::runOnFunctions(BinaryContext &BC) { 1101 if (opts::ICP == ICP_NONE) 1102 return; 1103 1104 auto &BFs = BC.getBinaryFunctions(); 1105 1106 const bool OptimizeCalls = (opts::ICP == ICP_CALLS || opts::ICP == ICP_ALL); 1107 const bool OptimizeJumpTables = 1108 (opts::ICP == ICP_JUMP_TABLES || opts::ICP == ICP_ALL); 1109 1110 std::unique_ptr<RegAnalysis> RA; 1111 std::unique_ptr<BinaryFunctionCallGraph> CG; 1112 if (OptimizeJumpTables) { 1113 CG.reset(new BinaryFunctionCallGraph(buildCallGraph(BC))); 1114 RA.reset(new RegAnalysis(BC, &BFs, &*CG)); 1115 } 1116 1117 // If icp-top-callsites is enabled, compute the total number of indirect 1118 // calls and then optimize the hottest callsites that contribute to that 1119 // total. 1120 SetVector<BinaryFunction *> Functions; 1121 if (opts::ICPTopCallsites == 0) { 1122 for (auto &KV : BFs) 1123 Functions.insert(&KV.second); 1124 } else { 1125 using IndirectCallsite = std::tuple<uint64_t, MCInst *, BinaryFunction *>; 1126 std::vector<IndirectCallsite> IndirectCalls; 1127 size_t TotalIndirectCalls = 0; 1128 1129 // Find all the indirect callsites. 
    // Collect every promotable callsite and its total branch count so the
    // hottest ones can be selected below.
    for (auto &BFIt : BFs) {
      BinaryFunction &Function = BFIt.second;

      if (!Function.isSimple() || Function.isIgnored() ||
          !Function.hasProfile())
        continue;

      const bool HasLayout = !Function.layout_empty();

      for (BinaryBasicBlock &BB : Function) {
        // Skip cold blocks of split functions.
        if (HasLayout && Function.isSplit() && BB.isCold())
          continue;

        for (MCInst &Inst : BB) {
          const bool IsJumpTable = Function.getJumpTable(Inst);
          const bool HasIndirectCallProfile =
              BC.MIB->hasAnnotation(Inst, "CallProfile");
          const bool IsDirectCall =
              (BC.MIB->isCall(Inst) && BC.MIB->getTargetSymbol(Inst, 0));

          if (!IsDirectCall &&
              ((HasIndirectCallProfile && !IsJumpTable && OptimizeCalls) ||
               (IsJumpTable && OptimizeJumpTables))) {
            uint64_t NumCalls = 0;
            for (const Callsite &BInfo : getCallTargets(BB, Inst))
              NumCalls += BInfo.Branches;
            IndirectCalls.push_back(
                std::make_tuple(NumCalls, &Inst, &Function));
            TotalIndirectCalls += NumCalls;
          }
        }
      }
    }

    // Sort callsites by execution count.
    llvm::sort(reverse(IndirectCalls));

    // Find callsites that contribute to the top "opts::ICPTopCallsites"%
    // number of calls.
    const float TopPerc = opts::ICPTopCallsites / 100.0f;
    int64_t MaxCalls = TotalIndirectCalls * TopPerc;
    uint64_t LastFreq = std::numeric_limits<uint64_t>::max();
    size_t Num = 0;
    for (const IndirectCallsite &IC : IndirectCalls) {
      const uint64_t CurFreq = std::get<0>(IC);
      // Once we decide to stop, include at least all branches that share the
      // same frequency of the last one to avoid non-deterministic behavior
      // (e.g. turning on/off ICP depending on the order of functions)
      if (MaxCalls <= 0 && CurFreq != LastFreq)
        break;
      MaxCalls -= CurFreq;
      LastFreq = CurFreq;
      // Mark the chosen callsite; canPromoteCallsite() checks "DoICP".
      BC.MIB->addAnnotation(*std::get<1>(IC), "DoICP", true);
      Functions.insert(std::get<2>(IC));
      ++Num;
    }
    outs() << "BOLT-INFO: ICP Total indirect calls = " << TotalIndirectCalls
           << ", " << Num << " callsites cover " << opts::ICPTopCallsites
           << "% of all indirect calls\n";
  }

  // Main promotion loop over the selected functions.
  for (BinaryFunction *FuncPtr : Functions) {
    BinaryFunction &Function = *FuncPtr;

    if (!Function.isSimple() || Function.isIgnored() || !Function.hasProfile())
      continue;

    const bool HasLayout = !Function.layout_empty();

    // Total number of indirect calls issued from the current Function.
    // (a fraction of TotalIndirectCalls)
    uint64_t FuncTotalIndirectCalls = 0;
    uint64_t FuncTotalIndirectJmps = 0;

    std::vector<BinaryBasicBlock *> BBs;
    for (BinaryBasicBlock &BB : Function) {
      // Skip indirect calls in cold blocks.
      if (!HasLayout || !Function.isSplit() || !BB.isCold())
        BBs.push_back(&BB);
    }
    if (BBs.empty())
      continue;

    DataflowInfoManager Info(Function, RA.get(), nullptr);
    // Worklist of blocks; fixCFG() may append newly created merge blocks.
    while (!BBs.empty()) {
      BinaryBasicBlock *BB = BBs.back();
      BBs.pop_back();

      // Index-based iteration: promotion rewrites the block's instructions.
      for (unsigned Idx = 0; Idx < BB->size(); ++Idx) {
        MCInst &Inst = BB->getInstructionAtIndex(Idx);
        const ptrdiff_t InstIdx = &Inst - &(*BB->begin());
        const bool IsTailCall = BC.MIB->isTailCall(Inst);
        const bool HasIndirectCallProfile =
            BC.MIB->hasAnnotation(Inst, "CallProfile");
        const bool IsJumpTable = Function.getJumpTable(Inst);

        // Count every call (direct or not) toward the pass-wide total.
        if (BC.MIB->isCall(Inst))
          TotalCalls += BB->getKnownExecutionCount();

        if (IsJumpTable && !OptimizeJumpTables)
          continue;

        if (!IsJumpTable && (!HasIndirectCallProfile || !OptimizeCalls))
          continue;

        // Ignore direct calls.
        if (BC.MIB->isCall(Inst) && BC.MIB->getTargetSymbol(Inst, 0))
          continue;

        assert((BC.MIB->isCall(Inst) || BC.MIB->isIndirectBranch(Inst)) &&
               "expected a call or an indirect jump instruction");

        if (IsJumpTable)
          ++TotalJumpTableCallsites;
        else
          ++TotalIndirectCallsites;

        std::vector<Callsite> Targets = getCallTargets(*BB, Inst);

        // Compute the total number of calls from this particular callsite.
        uint64_t NumCalls = 0;
        for (const Callsite &BInfo : Targets)
          NumCalls += BInfo.Branches;
        if (!IsJumpTable)
          FuncTotalIndirectCalls += NumCalls;
        else
          FuncTotalIndirectJmps += NumCalls;

        // If FLAGS regs is alive after this jmp site, do not try
        // promoting because we will clobber FLAGS.
        if (IsJumpTable) {
          ErrorOr<const BitVector &> State =
              Info.getLivenessAnalysis().getStateBefore(Inst);
          if (!State || (State && (*State)[BC.MIB->getFlagsReg()])) {
            if (opts::Verbosity >= 1)
              outs() << "BOLT-INFO: ICP failed in " << Function << " @ "
                     << InstIdx << " in " << BB->getName()
                     << ", calls = " << NumCalls
                     << (State ? ", cannot clobber flags reg.\n"
                               : ", no liveness data available.\n");
            continue;
          }
        }

        // Should this callsite be optimized? Return the number of targets
        // to use when promoting this call. A value of zero means to skip
        // this callsite.
        size_t N = canPromoteCallsite(*BB, Inst, Targets, NumCalls);

        // If it is a jump table and it failed to meet our initial threshold,
        // proceed to findCallTargetSymbols -- it may reevaluate N if
        // memory profile is present
        if (!N && !IsJumpTable)
          continue;

        if (opts::Verbosity >= 1)
          printCallsiteInfo(*BB, Inst, Targets, N, NumCalls);

        // Find MCSymbols or absolute addresses for each call target.
        MCInst *TargetFetchInst = nullptr;
        const SymTargetsType SymTargets =
            findCallTargetSymbols(Targets, N, *BB, Inst, TargetFetchInst);

        // findCallTargetSymbols may have changed N if mem profile is available
        // for jump tables
        if (!N)
          continue;

        LLVM_DEBUG(printDecision(dbgs(), Targets, N));

        // If we can't resolve any of the target symbols, punt on this callsite.
        // TODO: can this ever happen?
        if (SymTargets.size() < N) {
          const size_t LastTarget = SymTargets.size();
          if (opts::Verbosity >= 1)
            outs() << "BOLT-INFO: ICP failed in " << Function << " @ "
                   << InstIdx << " in " << BB->getName()
                   << ", calls = " << NumCalls
                   << ", ICP failed to find target symbol for "
                   << Targets[LastTarget].To.Sym->getName() << "\n";
          continue;
        }

        MethodInfoType MethodInfo;

        if (!IsJumpTable) {
          // Try to eliminate the vtable load feeding a virtual call.
          MethodInfo = maybeGetVtableSyms(*BB, Inst, SymTargets);
          TotalMethodLoadsEliminated += MethodInfo.first.empty() ? 0 : 1;
          LLVM_DEBUG(dbgs()
                     << "BOLT-INFO: ICP "
                     << (!MethodInfo.first.empty() ? "found" : "did not find")
                     << " vtables for all methods.\n");
        } else if (TargetFetchInst) {
          ++TotalIndexBasedJumps;
          MethodInfo.second.push_back(TargetFetchInst);
        }

        // Generate new promoted call code for this callsite.
        MCPlusBuilder::BlocksVectorTy ICPcode =
            (IsJumpTable && !opts::ICPJumpTablesByTarget)
                ? BC.MIB->jumpTablePromotion(Inst, SymTargets,
                                             MethodInfo.second, BC.Ctx.get())
                : BC.MIB->indirectCallPromotion(
                      Inst, SymTargets, MethodInfo.first, MethodInfo.second,
                      opts::ICPOldCodeSequence, BC.Ctx.get());

        if (ICPcode.empty()) {
          if (opts::Verbosity >= 1)
            outs() << "BOLT-INFO: ICP failed in " << Function << " @ "
                   << InstIdx << " in " << BB->getName()
                   << ", calls = " << NumCalls
                   << ", unable to generate promoted call code.\n";
          continue;
        }

        LLVM_DEBUG({
          uint64_t Offset = Targets[0].From.Addr;
          dbgs() << "BOLT-INFO: ICP indirect call code:\n";
          for (const auto &entry : ICPcode) {
            const MCSymbol *const &Sym = entry.first;
            const InstructionListType &Insts = entry.second;
            if (Sym)
              dbgs() << Sym->getName() << ":\n";
            Offset = BC.printInstructions(dbgs(), Insts.begin(), Insts.end(),
                                          Offset);
          }
          dbgs() << "---------------------------------------------------\n";
        });

        // Rewrite the CFG with the newly generated ICP code.
        std::vector<std::unique_ptr<BinaryBasicBlock>> NewBBs =
            rewriteCall(*BB, Inst, std::move(ICPcode), MethodInfo.second);

        // Fix the CFG after inserting the new basic blocks.
        BinaryBasicBlock *MergeBlock =
            fixCFG(*BB, IsTailCall, IsJumpTable, std::move(NewBBs), Targets);

        // Since the tail of the original block was split off and it may contain
        // additional indirect calls, we must add the merge block to the set of
        // blocks to process.
1370 if (MergeBlock) 1371 BBs.push_back(MergeBlock); 1372 1373 if (opts::Verbosity >= 1) 1374 outs() << "BOLT-INFO: ICP succeeded in " << Function << " @ " 1375 << InstIdx << " in " << BB->getName() 1376 << " -> calls = " << NumCalls << "\n"; 1377 1378 if (IsJumpTable) 1379 ++TotalOptimizedJumpTableCallsites; 1380 else 1381 ++TotalOptimizedIndirectCallsites; 1382 1383 Modified.insert(&Function); 1384 } 1385 } 1386 TotalIndirectCalls += FuncTotalIndirectCalls; 1387 TotalIndirectJmps += FuncTotalIndirectJmps; 1388 } 1389 1390 outs() << "BOLT-INFO: ICP total indirect callsites with profile = " 1391 << TotalIndirectCallsites << "\n" 1392 << "BOLT-INFO: ICP total jump table callsites = " 1393 << TotalJumpTableCallsites << "\n" 1394 << "BOLT-INFO: ICP total number of calls = " << TotalCalls << "\n" 1395 << "BOLT-INFO: ICP percentage of calls that are indirect = " 1396 << format("%.1f", (100.0 * TotalIndirectCalls) / TotalCalls) << "%\n" 1397 << "BOLT-INFO: ICP percentage of indirect calls that can be " 1398 "optimized = " 1399 << format("%.1f", (100.0 * TotalNumFrequentCalls) / 1400 std::max<size_t>(TotalIndirectCalls, 1)) 1401 << "%\n" 1402 << "BOLT-INFO: ICP percentage of indirect callsites that are " 1403 "optimized = " 1404 << format("%.1f", (100.0 * TotalOptimizedIndirectCallsites) / 1405 std::max<uint64_t>(TotalIndirectCallsites, 1)) 1406 << "%\n" 1407 << "BOLT-INFO: ICP number of method load elimination candidates = " 1408 << TotalMethodLoadEliminationCandidates << "\n" 1409 << "BOLT-INFO: ICP percentage of method calls candidates that have " 1410 "loads eliminated = " 1411 << format("%.1f", (100.0 * TotalMethodLoadsEliminated) / 1412 std::max<uint64_t>( 1413 TotalMethodLoadEliminationCandidates, 1)) 1414 << "%\n" 1415 << "BOLT-INFO: ICP percentage of indirect branches that are " 1416 "optimized = " 1417 << format("%.1f", (100.0 * TotalNumFrequentJmps) / 1418 std::max<uint64_t>(TotalIndirectJmps, 1)) 1419 << "%\n" 1420 << "BOLT-INFO: ICP percentage of jump table 
callsites that are " 1421 << "optimized = " 1422 << format("%.1f", (100.0 * TotalOptimizedJumpTableCallsites) / 1423 std::max<uint64_t>(TotalJumpTableCallsites, 1)) 1424 << "%\n" 1425 << "BOLT-INFO: ICP number of jump table callsites that can use hot " 1426 << "indices = " << TotalIndexBasedCandidates << "\n" 1427 << "BOLT-INFO: ICP percentage of jump table callsites that use hot " 1428 "indices = " 1429 << format("%.1f", (100.0 * TotalIndexBasedJumps) / 1430 std::max<uint64_t>(TotalIndexBasedCandidates, 1)) 1431 << "%\n"; 1432 1433 (void)verifyProfile; 1434 #ifndef NDEBUG 1435 verifyProfile(BFs); 1436 #endif 1437 } 1438 1439 } // namespace bolt 1440 } // namespace llvm 1441