1 //===- SampleProfile.cpp - Incorporate sample profiles into the IR --------===// 2 // 3 // Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. 4 // See https://llvm.org/LICENSE.txt for license information. 5 // SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception 6 // 7 //===----------------------------------------------------------------------===// 8 // 9 // This file implements the SampleProfileLoader transformation. This pass 10 // reads a profile file generated by a sampling profiler (e.g. Linux Perf - 11 // http://perf.wiki.kernel.org/) and generates IR metadata to reflect the 12 // profile information in the given profile. 13 // 14 // This pass generates branch weight annotations on the IR: 15 // 16 // - prof: Represents branch weights. This annotation is added to branches 17 // to indicate the weights of each edge coming out of the branch. 18 // The weight of each edge is the weight of the target block for 19 // that edge. The weight of a block B is computed as the maximum 20 // number of samples found in B. 21 // 22 //===----------------------------------------------------------------------===// 23 24 #include "llvm/Transforms/IPO/SampleProfile.h" 25 #include "llvm/ADT/ArrayRef.h" 26 #include "llvm/ADT/DenseMap.h" 27 #include "llvm/ADT/DenseSet.h" 28 #include "llvm/ADT/None.h" 29 #include "llvm/ADT/SmallPtrSet.h" 30 #include "llvm/ADT/SmallSet.h" 31 #include "llvm/ADT/SmallVector.h" 32 #include "llvm/ADT/StringMap.h" 33 #include "llvm/ADT/StringRef.h" 34 #include "llvm/ADT/Twine.h" 35 #include "llvm/Analysis/AssumptionCache.h" 36 #include "llvm/Analysis/InlineCost.h" 37 #include "llvm/Analysis/LoopInfo.h" 38 #include "llvm/Analysis/OptimizationRemarkEmitter.h" 39 #include "llvm/Analysis/PostDominators.h" 40 #include "llvm/Analysis/ProfileSummaryInfo.h" 41 #include "llvm/Analysis/TargetTransformInfo.h" 42 #include "llvm/IR/BasicBlock.h" 43 #include "llvm/IR/CFG.h" 44 #include "llvm/IR/CallSite.h" 45 #include "llvm/IR/DebugInfoMetadata.h" 46 #include "llvm/IR/DebugLoc.h" 47 #include "llvm/IR/DiagnosticInfo.h" 48 #include "llvm/IR/Dominators.h" 49 #include "llvm/IR/Function.h" 50 #include "llvm/IR/GlobalValue.h" 51 #include "llvm/IR/InstrTypes.h" 52 #include "llvm/IR/Instruction.h" 53 #include "llvm/IR/Instructions.h" 54 #include "llvm/IR/IntrinsicInst.h" 55 #include "llvm/IR/LLVMContext.h" 56 #include "llvm/IR/MDBuilder.h" 57 #include "llvm/IR/Module.h" 58 #include "llvm/IR/PassManager.h" 59 #include "llvm/IR/ValueSymbolTable.h" 60 #include "llvm/Pass.h" 61 #include "llvm/ProfileData/InstrProf.h" 62 #include "llvm/ProfileData/SampleProf.h" 63 #include "llvm/ProfileData/SampleProfReader.h" 64 #include "llvm/Support/Casting.h" 65 #include "llvm/Support/CommandLine.h" 66 #include "llvm/Support/Debug.h" 67 #include "llvm/Support/ErrorHandling.h" 68 #include "llvm/Support/ErrorOr.h" 69 #include "llvm/Support/GenericDomTree.h" 70 #include "llvm/Support/raw_ostream.h" 71 #include "llvm/Transforms/IPO.h" 72 #include "llvm/Transforms/Instrumentation.h" 73 #include "llvm/Transforms/Utils/CallPromotionUtils.h" 74 #include "llvm/Transforms/Utils/Cloning.h" 75 #include "llvm/Transforms/Utils/MisExpect.h" 76 #include <algorithm> 77 #include <cassert> 78 #include <cstdint> 79 #include <functional> 80 #include <limits> 81 #include <map> 82 #include <memory> 83 #include <queue> 84 #include <string> 85 #include <system_error> 86 #include <utility> 87 #include <vector> 88 89 using namespace llvm; 90 using namespace sampleprof; 91 using ProfileCount = Function::ProfileCount; 92 #define DEBUG_TYPE "sample-profile" 93 94 // Command line option to specify the file to read samples from. This is 95 // mainly used for debugging. 96 static cl::opt<std::string> SampleProfileFile( 97 "sample-profile-file", cl::init(""), cl::value_desc("filename"), 98 cl::desc("Profile file loaded by -sample-profile"), cl::Hidden); 99 100 // The named file contains a set of transformations that may have been applied 101 // to the symbol names between the program from which the sample data was 102 // collected and the current program's symbols. 103 static cl::opt<std::string> SampleProfileRemappingFile( 104 "sample-profile-remapping-file", cl::init(""), cl::value_desc("filename"), 105 cl::desc("Profile remapping file loaded by -sample-profile"), cl::Hidden); 106 107 static cl::opt<unsigned> SampleProfileMaxPropagateIterations( 108 "sample-profile-max-propagate-iterations", cl::init(100), 109 cl::desc("Maximum number of iterations to go through when propagating " 110 "sample block/edge weights through the CFG.")); 111 112 static cl::opt<unsigned> SampleProfileRecordCoverage( 113 "sample-profile-check-record-coverage", cl::init(0), cl::value_desc("N"), 114 cl::desc("Emit a warning if less than N% of records in the input profile " 115 "are matched to the IR.")); 116 117 static cl::opt<unsigned> SampleProfileSampleCoverage( 118 "sample-profile-check-sample-coverage", cl::init(0), cl::value_desc("N"), 119 cl::desc("Emit a warning if less than N% of samples in the input profile " 120 "are matched to the IR.")); 121 122 static cl::opt<bool> NoWarnSampleUnused( 123 "no-warn-sample-unused", cl::init(false), cl::Hidden, 124 cl::desc("Use this option to turn off/on warnings about function with " 125 "samples but without debug information to use those samples. ")); 126 127 static cl::opt<bool> ProfileSampleAccurate( 128 "profile-sample-accurate", cl::Hidden, cl::init(false), 129 cl::desc("If the sample profile is accurate, we will mark all un-sampled " 130 "callsite and function as having 0 samples. Otherwise, treat " 131 "un-sampled callsites and functions conservatively as unknown. ")); 132 133 namespace { 134 135 using BlockWeightMap = DenseMap<const BasicBlock *, uint64_t>; 136 using EquivalenceClassMap = DenseMap<const BasicBlock *, const BasicBlock *>; 137 using Edge = std::pair<const BasicBlock *, const BasicBlock *>; 138 using EdgeWeightMap = DenseMap<Edge, uint64_t>; 139 using BlockEdgeMap = 140 DenseMap<const BasicBlock *, SmallVector<const BasicBlock *, 8>>; 141 142 class SampleCoverageTracker { 143 public: 144 SampleCoverageTracker() = default; 145 146 bool markSamplesUsed(const FunctionSamples *FS, uint32_t LineOffset, 147 uint32_t Discriminator, uint64_t Samples); 148 unsigned computeCoverage(unsigned Used, unsigned Total) const; 149 unsigned countUsedRecords(const FunctionSamples *FS, 150 ProfileSummaryInfo *PSI) const; 151 unsigned countBodyRecords(const FunctionSamples *FS, 152 ProfileSummaryInfo *PSI) const; 153 uint64_t getTotalUsedSamples() const { return TotalUsedSamples; } 154 uint64_t countBodySamples(const FunctionSamples *FS, 155 ProfileSummaryInfo *PSI) const; 156 157 void clear() { 158 SampleCoverage.clear(); 159 TotalUsedSamples = 0; 160 } 161 162 private: 163 using BodySampleCoverageMap = std::map<LineLocation, unsigned>; 164 using FunctionSamplesCoverageMap = 165 DenseMap<const FunctionSamples *, BodySampleCoverageMap>; 166 167 /// Coverage map for sampling records. 168 /// 169 /// This map keeps a record of sampling records that have been matched to 170 /// an IR instruction. This is used to detect some form of staleness in 171 /// profiles (see flag -sample-profile-check-coverage). 172 /// 173 /// Each entry in the map corresponds to a FunctionSamples instance. This is 174 /// another map that counts how many times the sample record at the 175 /// given location has been used. 176 FunctionSamplesCoverageMap SampleCoverage; 177 178 /// Number of samples used from the profile. 179 /// 180 /// When a sampling record is used for the first time, the samples from 181 /// that record are added to this accumulator. Coverage is later computed 182 /// based on the total number of samples available in this function and 183 /// its callsites. 184 /// 185 /// Note that this accumulator tracks samples used from a single function 186 /// and all the inlined callsites. Strictly, we should have a map of counters 187 /// keyed by FunctionSamples pointers, but these stats are cleared after 188 /// every function, so we just need to keep a single counter. 189 uint64_t TotalUsedSamples = 0; 190 }; 191 192 class GUIDToFuncNameMapper { 193 public: 194 GUIDToFuncNameMapper(Module &M, SampleProfileReader &Reader, 195 DenseMap<uint64_t, StringRef> &GUIDToFuncNameMap) 196 : CurrentReader(Reader), CurrentModule(M), 197 CurrentGUIDToFuncNameMap(GUIDToFuncNameMap) { 198 if (CurrentReader.getFormat() != SPF_Compact_Binary) 199 return; 200 201 for (const auto &F : CurrentModule) { 202 StringRef OrigName = F.getName(); 203 CurrentGUIDToFuncNameMap.insert( 204 {Function::getGUID(OrigName), OrigName}); 205 206 // Local to global var promotion used by optimization like thinlto 207 // will rename the var and add suffix like ".llvm.xxx" to the 208 // original local name. In sample profile, the suffixes of function 209 // names are all stripped. Since it is possible that the mapper is 210 // built in post-thin-link phase and var promotion has been done, 211 // we need to add the substring of function name without the suffix 212 // into the GUIDToFuncNameMap. 213 StringRef CanonName = FunctionSamples::getCanonicalFnName(F); 214 if (CanonName != OrigName) 215 CurrentGUIDToFuncNameMap.insert( 216 {Function::getGUID(CanonName), CanonName}); 217 } 218 219 // Update GUIDToFuncNameMap for each function including inlinees. 220 SetGUIDToFuncNameMapForAll(&CurrentGUIDToFuncNameMap); 221 } 222 223 ~GUIDToFuncNameMapper() { 224 if (CurrentReader.getFormat() != SPF_Compact_Binary) 225 return; 226 227 CurrentGUIDToFuncNameMap.clear(); 228 229 // Reset GUIDToFuncNameMap for of each function as they're no 230 // longer valid at this point. 231 SetGUIDToFuncNameMapForAll(nullptr); 232 } 233 234 private: 235 void SetGUIDToFuncNameMapForAll(DenseMap<uint64_t, StringRef> *Map) { 236 std::queue<FunctionSamples *> FSToUpdate; 237 for (auto &IFS : CurrentReader.getProfiles()) { 238 FSToUpdate.push(&IFS.second); 239 } 240 241 while (!FSToUpdate.empty()) { 242 FunctionSamples *FS = FSToUpdate.front(); 243 FSToUpdate.pop(); 244 FS->GUIDToFuncNameMap = Map; 245 for (const auto &ICS : FS->getCallsiteSamples()) { 246 const FunctionSamplesMap &FSMap = ICS.second; 247 for (auto &IFS : FSMap) { 248 FunctionSamples &FS = const_cast<FunctionSamples &>(IFS.second); 249 FSToUpdate.push(&FS); 250 } 251 } 252 } 253 } 254 255 SampleProfileReader &CurrentReader; 256 Module &CurrentModule; 257 DenseMap<uint64_t, StringRef> &CurrentGUIDToFuncNameMap; 258 }; 259 260 /// Sample profile pass. 261 /// 262 /// This pass reads profile data from the file specified by 263 /// -sample-profile-file and annotates every affected function with the 264 /// profile information found in that file. 265 class SampleProfileLoader { 266 public: 267 SampleProfileLoader( 268 StringRef Name, StringRef RemapName, bool IsThinLTOPreLink, 269 std::function<AssumptionCache &(Function &)> GetAssumptionCache, 270 std::function<TargetTransformInfo &(Function &)> GetTargetTransformInfo) 271 : GetAC(std::move(GetAssumptionCache)), 272 GetTTI(std::move(GetTargetTransformInfo)), Filename(Name), 273 RemappingFilename(RemapName), IsThinLTOPreLink(IsThinLTOPreLink) {} 274 275 bool doInitialization(Module &M); 276 bool runOnModule(Module &M, ModuleAnalysisManager *AM, 277 ProfileSummaryInfo *_PSI); 278 279 void dump() { Reader->dump(); } 280 281 protected: 282 bool runOnFunction(Function &F, ModuleAnalysisManager *AM); 283 unsigned getFunctionLoc(Function &F); 284 bool emitAnnotations(Function &F); 285 ErrorOr<uint64_t> getInstWeight(const Instruction &I); 286 ErrorOr<uint64_t> getBlockWeight(const BasicBlock *BB); 287 const FunctionSamples *findCalleeFunctionSamples(const Instruction &I) const; 288 std::vector<const FunctionSamples *> 289 findIndirectCallFunctionSamples(const Instruction &I, uint64_t &Sum) const; 290 mutable DenseMap<const DILocation *, const FunctionSamples *> DILocation2SampleMap; 291 const FunctionSamples *findFunctionSamples(const Instruction &I) const; 292 bool inlineCallInstruction(Instruction *I); 293 bool inlineHotFunctions(Function &F, 294 DenseSet<GlobalValue::GUID> &InlinedGUIDs); 295 void printEdgeWeight(raw_ostream &OS, Edge E); 296 void printBlockWeight(raw_ostream &OS, const BasicBlock *BB) const; 297 void printBlockEquivalence(raw_ostream &OS, const BasicBlock *BB); 298 bool computeBlockWeights(Function &F); 299 void findEquivalenceClasses(Function &F); 300 template <bool IsPostDom> 301 void findEquivalencesFor(BasicBlock *BB1, ArrayRef<BasicBlock *> Descendants, 302 DominatorTreeBase<BasicBlock, IsPostDom> *DomTree); 303 304 void propagateWeights(Function &F); 305 uint64_t visitEdge(Edge E, unsigned *NumUnknownEdges, Edge *UnknownEdge); 306 void buildEdges(Function &F); 307 bool propagateThroughEdges(Function &F, bool UpdateBlockCount); 308 void computeDominanceAndLoopInfo(Function &F); 309 void clearFunctionData(); 310 311 /// Map basic blocks to their computed weights. 312 /// 313 /// The weight of a basic block is defined to be the maximum 314 /// of all the instruction weights in that block. 315 BlockWeightMap BlockWeights; 316 317 /// Map edges to their computed weights. 318 /// 319 /// Edge weights are computed by propagating basic block weights in 320 /// SampleProfile::propagateWeights. 321 EdgeWeightMap EdgeWeights; 322 323 /// Set of visited blocks during propagation. 324 SmallPtrSet<const BasicBlock *, 32> VisitedBlocks; 325 326 /// Set of visited edges during propagation. 327 SmallSet<Edge, 32> VisitedEdges; 328 329 /// Equivalence classes for block weights. 330 /// 331 /// Two blocks BB1 and BB2 are in the same equivalence class if they 332 /// dominate and post-dominate each other, and they are in the same loop 333 /// nest. When this happens, the two blocks are guaranteed to execute 334 /// the same number of times. 335 EquivalenceClassMap EquivalenceClass; 336 337 /// Map from function name to Function *. Used to find the function from 338 /// the function name. If the function name contains suffix, additional 339 /// entry is added to map from the stripped name to the function if there 340 /// is one-to-one mapping. 341 StringMap<Function *> SymbolMap; 342 343 /// Dominance, post-dominance and loop information. 344 std::unique_ptr<DominatorTree> DT; 345 std::unique_ptr<PostDominatorTree> PDT; 346 std::unique_ptr<LoopInfo> LI; 347 348 std::function<AssumptionCache &(Function &)> GetAC; 349 std::function<TargetTransformInfo &(Function &)> GetTTI; 350 351 /// Predecessors for each basic block in the CFG. 352 BlockEdgeMap Predecessors; 353 354 /// Successors for each basic block in the CFG. 355 BlockEdgeMap Successors; 356 357 SampleCoverageTracker CoverageTracker; 358 359 /// Profile reader object. 360 std::unique_ptr<SampleProfileReader> Reader; 361 362 /// Samples collected for the body of this function. 363 FunctionSamples *Samples = nullptr; 364 365 /// Name of the profile file to load. 366 std::string Filename; 367 368 /// Name of the profile remapping file to load. 369 std::string RemappingFilename; 370 371 /// Flag indicating whether the profile input loaded successfully. 372 bool ProfileIsValid = false; 373 374 /// Flag indicating if the pass is invoked in ThinLTO compile phase. 375 /// 376 /// In this phase, in annotation, we should not promote indirect calls. 377 /// Instead, we will mark GUIDs that needs to be annotated to the function. 378 bool IsThinLTOPreLink; 379 380 /// Profile Summary Info computed from sample profile. 381 ProfileSummaryInfo *PSI = nullptr; 382 383 /// Profle Symbol list tells whether a function name appears in the binary 384 /// used to generate the current profile. 385 std::unique_ptr<ProfileSymbolList> PSL; 386 387 /// Total number of samples collected in this profile. 388 /// 389 /// This is the sum of all the samples collected in all the functions executed 390 /// at runtime. 391 uint64_t TotalCollectedSamples = 0; 392 393 /// Optimization Remark Emitter used to emit diagnostic remarks. 394 OptimizationRemarkEmitter *ORE = nullptr; 395 396 // Information recorded when we declined to inline a call site 397 // because we have determined it is too cold is accumulated for 398 // each callee function. Initially this is just the entry count. 399 struct NotInlinedProfileInfo { 400 uint64_t entryCount; 401 }; 402 DenseMap<Function *, NotInlinedProfileInfo> notInlinedCallInfo; 403 404 // GUIDToFuncNameMap saves the mapping from GUID to the symbol name, for 405 // all the function symbols defined or declared in current module. 406 DenseMap<uint64_t, StringRef> GUIDToFuncNameMap; 407 }; 408 409 class SampleProfileLoaderLegacyPass : public ModulePass { 410 public: 411 // Class identification, replacement for typeinfo 412 static char ID; 413 414 SampleProfileLoaderLegacyPass(StringRef Name = SampleProfileFile, 415 bool IsThinLTOPreLink = false) 416 : ModulePass(ID), 417 SampleLoader(Name, SampleProfileRemappingFile, IsThinLTOPreLink, 418 [&](Function &F) -> AssumptionCache & { 419 return ACT->getAssumptionCache(F); 420 }, 421 [&](Function &F) -> TargetTransformInfo & { 422 return TTIWP->getTTI(F); 423 }) { 424 initializeSampleProfileLoaderLegacyPassPass( 425 *PassRegistry::getPassRegistry()); 426 } 427 428 void dump() { SampleLoader.dump(); } 429 430 bool doInitialization(Module &M) override { 431 return SampleLoader.doInitialization(M); 432 } 433 434 StringRef getPassName() const override { return "Sample profile pass"; } 435 bool runOnModule(Module &M) override; 436 437 void getAnalysisUsage(AnalysisUsage &AU) const override { 438 AU.addRequired<AssumptionCacheTracker>(); 439 AU.addRequired<TargetTransformInfoWrapperPass>(); 440 AU.addRequired<ProfileSummaryInfoWrapperPass>(); 441 } 442 443 private: 444 SampleProfileLoader SampleLoader; 445 AssumptionCacheTracker *ACT = nullptr; 446 TargetTransformInfoWrapperPass *TTIWP = nullptr; 447 }; 448 449 } // end anonymous namespace 450 451 /// Return true if the given callsite is hot wrt to hot cutoff threshold. 452 /// 453 /// Functions that were inlined in the original binary will be represented 454 /// in the inline stack in the sample profile. If the profile shows that 455 /// the original inline decision was "good" (i.e., the callsite is executed 456 /// frequently), then we will recreate the inline decision and apply the 457 /// profile from the inlined callsite. 458 /// 459 /// To decide whether an inlined callsite is hot, we compare the callsite 460 /// sample count with the hot cutoff computed by ProfileSummaryInfo, it is 461 /// regarded as hot if the count is above the cutoff value. 462 static bool callsiteIsHot(const FunctionSamples *CallsiteFS, 463 ProfileSummaryInfo *PSI) { 464 if (!CallsiteFS) 465 return false; // The callsite was not inlined in the original binary. 466 467 assert(PSI && "PSI is expected to be non null"); 468 uint64_t CallsiteTotalSamples = CallsiteFS->getTotalSamples(); 469 return PSI->isHotCount(CallsiteTotalSamples); 470 } 471 472 /// Mark as used the sample record for the given function samples at 473 /// (LineOffset, Discriminator). 474 /// 475 /// \returns true if this is the first time we mark the given record. 476 bool SampleCoverageTracker::markSamplesUsed(const FunctionSamples *FS, 477 uint32_t LineOffset, 478 uint32_t Discriminator, 479 uint64_t Samples) { 480 LineLocation Loc(LineOffset, Discriminator); 481 unsigned &Count = SampleCoverage[FS][Loc]; 482 bool FirstTime = (++Count == 1); 483 if (FirstTime) 484 TotalUsedSamples += Samples; 485 return FirstTime; 486 } 487 488 /// Return the number of sample records that were applied from this profile. 489 /// 490 /// This count does not include records from cold inlined callsites. 491 unsigned 492 SampleCoverageTracker::countUsedRecords(const FunctionSamples *FS, 493 ProfileSummaryInfo *PSI) const { 494 auto I = SampleCoverage.find(FS); 495 496 // The size of the coverage map for FS represents the number of records 497 // that were marked used at least once. 498 unsigned Count = (I != SampleCoverage.end()) ? I->second.size() : 0; 499 500 // If there are inlined callsites in this function, count the samples found 501 // in the respective bodies. However, do not bother counting callees with 0 502 // total samples, these are callees that were never invoked at runtime. 503 for (const auto &I : FS->getCallsiteSamples()) 504 for (const auto &J : I.second) { 505 const FunctionSamples *CalleeSamples = &J.second; 506 if (callsiteIsHot(CalleeSamples, PSI)) 507 Count += countUsedRecords(CalleeSamples, PSI); 508 } 509 510 return Count; 511 } 512 513 /// Return the number of sample records in the body of this profile. 514 /// 515 /// This count does not include records from cold inlined callsites. 516 unsigned 517 SampleCoverageTracker::countBodyRecords(const FunctionSamples *FS, 518 ProfileSummaryInfo *PSI) const { 519 unsigned Count = FS->getBodySamples().size(); 520 521 // Only count records in hot callsites. 522 for (const auto &I : FS->getCallsiteSamples()) 523 for (const auto &J : I.second) { 524 const FunctionSamples *CalleeSamples = &J.second; 525 if (callsiteIsHot(CalleeSamples, PSI)) 526 Count += countBodyRecords(CalleeSamples, PSI); 527 } 528 529 return Count; 530 } 531 532 /// Return the number of samples collected in the body of this profile. 533 /// 534 /// This count does not include samples from cold inlined callsites. 535 uint64_t 536 SampleCoverageTracker::countBodySamples(const FunctionSamples *FS, 537 ProfileSummaryInfo *PSI) const { 538 uint64_t Total = 0; 539 for (const auto &I : FS->getBodySamples()) 540 Total += I.second.getSamples(); 541 542 // Only count samples in hot callsites. 543 for (const auto &I : FS->getCallsiteSamples()) 544 for (const auto &J : I.second) { 545 const FunctionSamples *CalleeSamples = &J.second; 546 if (callsiteIsHot(CalleeSamples, PSI)) 547 Total += countBodySamples(CalleeSamples, PSI); 548 } 549 550 return Total; 551 } 552 553 /// Return the fraction of sample records used in this profile. 554 /// 555 /// The returned value is an unsigned integer in the range 0-100 indicating 556 /// the percentage of sample records that were used while applying this 557 /// profile to the associated function. 558 unsigned SampleCoverageTracker::computeCoverage(unsigned Used, 559 unsigned Total) const { 560 assert(Used <= Total && 561 "number of used records cannot exceed the total number of records"); 562 return Total > 0 ? Used * 100 / Total : 100; 563 } 564 565 /// Clear all the per-function data used to load samples and propagate weights. 566 void SampleProfileLoader::clearFunctionData() { 567 BlockWeights.clear(); 568 EdgeWeights.clear(); 569 VisitedBlocks.clear(); 570 VisitedEdges.clear(); 571 EquivalenceClass.clear(); 572 DT = nullptr; 573 PDT = nullptr; 574 LI = nullptr; 575 Predecessors.clear(); 576 Successors.clear(); 577 CoverageTracker.clear(); 578 } 579 580 #ifndef NDEBUG 581 /// Print the weight of edge \p E on stream \p OS. 582 /// 583 /// \param OS Stream to emit the output to. 584 /// \param E Edge to print. 585 void SampleProfileLoader::printEdgeWeight(raw_ostream &OS, Edge E) { 586 OS << "weight[" << E.first->getName() << "->" << E.second->getName() 587 << "]: " << EdgeWeights[E] << "\n"; 588 } 589 590 /// Print the equivalence class of block \p BB on stream \p OS. 591 /// 592 /// \param OS Stream to emit the output to. 593 /// \param BB Block to print. 594 void SampleProfileLoader::printBlockEquivalence(raw_ostream &OS, 595 const BasicBlock *BB) { 596 const BasicBlock *Equiv = EquivalenceClass[BB]; 597 OS << "equivalence[" << BB->getName() 598 << "]: " << ((Equiv) ? EquivalenceClass[BB]->getName() : "NONE") << "\n"; 599 } 600 601 /// Print the weight of block \p BB on stream \p OS. 602 /// 603 /// \param OS Stream to emit the output to. 604 /// \param BB Block to print. 605 void SampleProfileLoader::printBlockWeight(raw_ostream &OS, 606 const BasicBlock *BB) const { 607 const auto &I = BlockWeights.find(BB); 608 uint64_t W = (I == BlockWeights.end() ? 0 : I->second); 609 OS << "weight[" << BB->getName() << "]: " << W << "\n"; 610 } 611 #endif 612 613 /// Get the weight for an instruction. 614 /// 615 /// The "weight" of an instruction \p Inst is the number of samples 616 /// collected on that instruction at runtime. To retrieve it, we 617 /// need to compute the line number of \p Inst relative to the start of its 618 /// function. We use HeaderLineno to compute the offset. We then 619 /// look up the samples collected for \p Inst using BodySamples. 620 /// 621 /// \param Inst Instruction to query. 622 /// 623 /// \returns the weight of \p Inst. 624 ErrorOr<uint64_t> SampleProfileLoader::getInstWeight(const Instruction &Inst) { 625 const DebugLoc &DLoc = Inst.getDebugLoc(); 626 if (!DLoc) 627 return std::error_code(); 628 629 const FunctionSamples *FS = findFunctionSamples(Inst); 630 if (!FS) 631 return std::error_code(); 632 633 // Ignore all intrinsics, phinodes and branch instructions. 634 // Branch and phinodes instruction usually contains debug info from sources outside of 635 // the residing basic block, thus we ignore them during annotation. 636 if (isa<BranchInst>(Inst) || isa<IntrinsicInst>(Inst) || isa<PHINode>(Inst)) 637 return std::error_code(); 638 639 // If a direct call/invoke instruction is inlined in profile 640 // (findCalleeFunctionSamples returns non-empty result), but not inlined here, 641 // it means that the inlined callsite has no sample, thus the call 642 // instruction should have 0 count. 643 if ((isa<CallInst>(Inst) || isa<InvokeInst>(Inst)) && 644 !ImmutableCallSite(&Inst).isIndirectCall() && 645 findCalleeFunctionSamples(Inst)) 646 return 0; 647 648 const DILocation *DIL = DLoc; 649 uint32_t LineOffset = FunctionSamples::getOffset(DIL); 650 uint32_t Discriminator = DIL->getBaseDiscriminator(); 651 ErrorOr<uint64_t> R = FS->findSamplesAt(LineOffset, Discriminator); 652 if (R) { 653 bool FirstMark = 654 CoverageTracker.markSamplesUsed(FS, LineOffset, Discriminator, R.get()); 655 if (FirstMark) { 656 ORE->emit([&]() { 657 OptimizationRemarkAnalysis Remark(DEBUG_TYPE, "AppliedSamples", &Inst); 658 Remark << "Applied " << ore::NV("NumSamples", *R); 659 Remark << " samples from profile (offset: "; 660 Remark << ore::NV("LineOffset", LineOffset); 661 if (Discriminator) { 662 Remark << "."; 663 Remark << ore::NV("Discriminator", Discriminator); 664 } 665 Remark << ")"; 666 return Remark; 667 }); 668 } 669 LLVM_DEBUG(dbgs() << " " << DLoc.getLine() << "." 670 << DIL->getBaseDiscriminator() << ":" << Inst 671 << " (line offset: " << LineOffset << "." 672 << DIL->getBaseDiscriminator() << " - weight: " << R.get() 673 << ")\n"); 674 } 675 return R; 676 } 677 678 /// Compute the weight of a basic block. 679 /// 680 /// The weight of basic block \p BB is the maximum weight of all the 681 /// instructions in BB. 682 /// 683 /// \param BB The basic block to query. 684 /// 685 /// \returns the weight for \p BB. 686 ErrorOr<uint64_t> SampleProfileLoader::getBlockWeight(const BasicBlock *BB) { 687 uint64_t Max = 0; 688 bool HasWeight = false; 689 for (auto &I : BB->getInstList()) { 690 const ErrorOr<uint64_t> &R = getInstWeight(I); 691 if (R) { 692 Max = std::max(Max, R.get()); 693 HasWeight = true; 694 } 695 } 696 return HasWeight ? ErrorOr<uint64_t>(Max) : std::error_code(); 697 } 698 699 /// Compute and store the weights of every basic block. 700 /// 701 /// This populates the BlockWeights map by computing 702 /// the weights of every basic block in the CFG. 703 /// 704 /// \param F The function to query. 705 bool SampleProfileLoader::computeBlockWeights(Function &F) { 706 bool Changed = false; 707 LLVM_DEBUG(dbgs() << "Block weights\n"); 708 for (const auto &BB : F) { 709 ErrorOr<uint64_t> Weight = getBlockWeight(&BB); 710 if (Weight) { 711 BlockWeights[&BB] = Weight.get(); 712 VisitedBlocks.insert(&BB); 713 Changed = true; 714 } 715 LLVM_DEBUG(printBlockWeight(dbgs(), &BB)); 716 } 717 718 return Changed; 719 } 720 721 /// Get the FunctionSamples for a call instruction. 722 /// 723 /// The FunctionSamples of a call/invoke instruction \p Inst is the inlined 724 /// instance in which that call instruction is calling to. It contains 725 /// all samples that resides in the inlined instance. We first find the 726 /// inlined instance in which the call instruction is from, then we 727 /// traverse its children to find the callsite with the matching 728 /// location. 729 /// 730 /// \param Inst Call/Invoke instruction to query. 731 /// 732 /// \returns The FunctionSamples pointer to the inlined instance. 733 const FunctionSamples * 734 SampleProfileLoader::findCalleeFunctionSamples(const Instruction &Inst) const { 735 const DILocation *DIL = Inst.getDebugLoc(); 736 if (!DIL) { 737 return nullptr; 738 } 739 740 StringRef CalleeName; 741 if (const CallInst *CI = dyn_cast<CallInst>(&Inst)) 742 if (Function *Callee = CI->getCalledFunction()) 743 CalleeName = Callee->getName(); 744 745 const FunctionSamples *FS = findFunctionSamples(Inst); 746 if (FS == nullptr) 747 return nullptr; 748 749 return FS->findFunctionSamplesAt(LineLocation(FunctionSamples::getOffset(DIL), 750 DIL->getBaseDiscriminator()), 751 CalleeName); 752 } 753 754 /// Returns a vector of FunctionSamples that are the indirect call targets 755 /// of \p Inst. The vector is sorted by the total number of samples. Stores 756 /// the total call count of the indirect call in \p Sum. 757 std::vector<const FunctionSamples *> 758 SampleProfileLoader::findIndirectCallFunctionSamples( 759 const Instruction &Inst, uint64_t &Sum) const { 760 const DILocation *DIL = Inst.getDebugLoc(); 761 std::vector<const FunctionSamples *> R; 762 763 if (!DIL) { 764 return R; 765 } 766 767 const FunctionSamples *FS = findFunctionSamples(Inst); 768 if (FS == nullptr) 769 return R; 770 771 uint32_t LineOffset = FunctionSamples::getOffset(DIL); 772 uint32_t Discriminator = DIL->getBaseDiscriminator(); 773 774 auto T = FS->findCallTargetMapAt(LineOffset, Discriminator); 775 Sum = 0; 776 if (T) 777 for (const auto &T_C : T.get()) 778 Sum += T_C.second; 779 if (const FunctionSamplesMap *M = FS->findFunctionSamplesMapAt(LineLocation( 780 FunctionSamples::getOffset(DIL), DIL->getBaseDiscriminator()))) { 781 if (M->empty()) 782 return R; 783 for (const auto &NameFS : *M) { 784 Sum += NameFS.second.getEntrySamples(); 785 R.push_back(&NameFS.second); 786 } 787 llvm::sort(R, [](const FunctionSamples *L, const FunctionSamples *R) { 788 if (L->getEntrySamples() != R->getEntrySamples()) 789 return L->getEntrySamples() > R->getEntrySamples(); 790 return FunctionSamples::getGUID(L->getName()) < 791 FunctionSamples::getGUID(R->getName()); 792 }); 793 } 794 return R; 795 } 796 797 /// Get the FunctionSamples for an instruction. 798 /// 799 /// The FunctionSamples of an instruction \p Inst is the inlined instance 800 /// in which that instruction is coming from. We traverse the inline stack 801 /// of that instruction, and match it with the tree nodes in the profile. 802 /// 803 /// \param Inst Instruction to query. 804 /// 805 /// \returns the FunctionSamples pointer to the inlined instance. 806 const FunctionSamples * 807 SampleProfileLoader::findFunctionSamples(const Instruction &Inst) const { 808 const DILocation *DIL = Inst.getDebugLoc(); 809 if (!DIL) 810 return Samples; 811 812 auto it = DILocation2SampleMap.try_emplace(DIL,nullptr); 813 if (it.second) 814 it.first->second = Samples->findFunctionSamples(DIL); 815 return it.first->second; 816 } 817 818 bool SampleProfileLoader::inlineCallInstruction(Instruction *I) { 819 assert(isa<CallInst>(I) || isa<InvokeInst>(I)); 820 CallSite CS(I); 821 Function *CalledFunction = CS.getCalledFunction(); 822 assert(CalledFunction); 823 DebugLoc DLoc = I->getDebugLoc(); 824 BasicBlock *BB = I->getParent(); 825 InlineParams Params = getInlineParams(); 826 Params.ComputeFullInlineCost = true; 827 // Checks if there is anything in the reachable portion of the callee at 828 // this callsite that makes this inlining potentially illegal. Need to 829 // set ComputeFullInlineCost, otherwise getInlineCost may return early 830 // when cost exceeds threshold without checking all IRs in the callee. 831 // The acutal cost does not matter because we only checks isNever() to 832 // see if it is legal to inline the callsite. 833 InlineCost Cost = 834 getInlineCost(cast<CallBase>(*I), Params, GetTTI(*CalledFunction), GetAC, 835 None, nullptr, nullptr); 836 if (Cost.isNever()) { 837 ORE->emit(OptimizationRemark(DEBUG_TYPE, "Not inline", DLoc, BB) 838 << "incompatible inlining"); 839 return false; 840 } 841 InlineFunctionInfo IFI(nullptr, &GetAC); 842 if (InlineFunction(CS, IFI)) { 843 // The call to InlineFunction erases I, so we can't pass it here. 844 ORE->emit(OptimizationRemark(DEBUG_TYPE, "HotInline", DLoc, BB) 845 << "inlined hot callee '" << ore::NV("Callee", CalledFunction) 846 << "' into '" << ore::NV("Caller", BB->getParent()) << "'"); 847 return true; 848 } 849 return false; 850 } 851 852 /// Iteratively inline hot callsites of a function. 853 /// 854 /// Iteratively traverse all callsites of the function \p F, and find if 855 /// the corresponding inlined instance exists and is hot in profile. If 856 /// it is hot enough, inline the callsites and adds new callsites of the 857 /// callee into the caller. If the call is an indirect call, first promote 858 /// it to direct call. Each indirect call is limited with a single target. 859 /// 860 /// \param F function to perform iterative inlining. 861 /// \param InlinedGUIDs a set to be updated to include all GUIDs that are 862 /// inlined in the profiled binary. 863 /// 864 /// \returns True if there is any inline happened. 865 bool SampleProfileLoader::inlineHotFunctions( 866 Function &F, DenseSet<GlobalValue::GUID> &InlinedGUIDs) { 867 DenseSet<Instruction *> PromotedInsns; 868 869 DenseMap<Instruction *, const FunctionSamples *> localNotInlinedCallSites; 870 bool Changed = false; 871 while (true) { 872 bool LocalChanged = false; 873 SmallVector<Instruction *, 10> CIS; 874 for (auto &BB : F) { 875 bool Hot = false; 876 SmallVector<Instruction *, 10> Candidates; 877 for (auto &I : BB.getInstList()) { 878 const FunctionSamples *FS = nullptr; 879 if ((isa<CallInst>(I) || isa<InvokeInst>(I)) && 880 !isa<IntrinsicInst>(I) && (FS = findCalleeFunctionSamples(I))) { 881 Candidates.push_back(&I); 882 if (FS->getEntrySamples() > 0) 883 localNotInlinedCallSites.try_emplace(&I, FS); 884 if (callsiteIsHot(FS, PSI)) 885 Hot = true; 886 } 887 } 888 if (Hot) { 889 CIS.insert(CIS.begin(), Candidates.begin(), Candidates.end()); 890 } 891 } 892 for (auto I : CIS) { 893 Function *CalledFunction = CallSite(I).getCalledFunction(); 894 // Do not inline recursive calls. 895 if (CalledFunction == &F) 896 continue; 897 if (CallSite(I).isIndirectCall()) { 898 if (PromotedInsns.count(I)) 899 continue; 900 uint64_t Sum; 901 for (const auto *FS : findIndirectCallFunctionSamples(*I, Sum)) { 902 if (IsThinLTOPreLink) { 903 FS->findInlinedFunctions(InlinedGUIDs, F.getParent(), 904 PSI->getOrCompHotCountThreshold()); 905 continue; 906 } 907 auto CalleeFunctionName = FS->getFuncNameInModule(F.getParent()); 908 // If it is a recursive call, we do not inline it as it could bloat 909 // the code exponentially. There is way to better handle this, e.g. 910 // clone the caller first, and inline the cloned caller if it is 911 // recursive. As llvm does not inline recursive calls, we will 912 // simply ignore it instead of handling it explicitly. 913 if (CalleeFunctionName == F.getName()) 914 continue; 915 916 if (!callsiteIsHot(FS, PSI)) 917 continue; 918 919 const char *Reason = "Callee function not available"; 920 auto R = SymbolMap.find(CalleeFunctionName); 921 if (R != SymbolMap.end() && R->getValue() && 922 !R->getValue()->isDeclaration() && 923 R->getValue()->getSubprogram() && 924 isLegalToPromote(CallSite(I), R->getValue(), &Reason)) { 925 uint64_t C = FS->getEntrySamples(); 926 Instruction *DI = 927 pgo::promoteIndirectCall(I, R->getValue(), C, Sum, false, ORE); 928 Sum -= C; 929 PromotedInsns.insert(I); 930 // If profile mismatches, we should not attempt to inline DI. 931 if ((isa<CallInst>(DI) || isa<InvokeInst>(DI)) && 932 inlineCallInstruction(DI)) { 933 localNotInlinedCallSites.erase(I); 934 LocalChanged = true; 935 } 936 } else { 937 LLVM_DEBUG(dbgs() 938 << "\nFailed to promote indirect call to " 939 << CalleeFunctionName << " because " << Reason << "\n"); 940 } 941 } 942 } else if (CalledFunction && CalledFunction->getSubprogram() && 943 !CalledFunction->isDeclaration()) { 944 if (inlineCallInstruction(I)) { 945 localNotInlinedCallSites.erase(I); 946 LocalChanged = true; 947 } 948 } else if (IsThinLTOPreLink) { 949 findCalleeFunctionSamples(*I)->findInlinedFunctions( 950 InlinedGUIDs, F.getParent(), PSI->getOrCompHotCountThreshold()); 951 } 952 } 953 if (LocalChanged) { 954 Changed = true; 955 } else { 956 break; 957 } 958 } 959 960 // Accumulate not inlined callsite information into notInlinedSamples 961 for (const auto &Pair : localNotInlinedCallSites) { 962 Instruction *I = Pair.getFirst(); 963 Function *Callee = CallSite(I).getCalledFunction(); 964 if (!Callee || Callee->isDeclaration()) 965 continue; 966 const FunctionSamples *FS = Pair.getSecond(); 967 auto pair = 968 notInlinedCallInfo.try_emplace(Callee, NotInlinedProfileInfo{0}); 969 pair.first->second.entryCount += FS->getEntrySamples(); 970 } 971 return Changed; 972 } 973 974 /// Find equivalence classes for the given block. 975 /// 976 /// This finds all the blocks that are guaranteed to execute the same 977 /// number of times as \p BB1. To do this, it traverses all the 978 /// descendants of \p BB1 in the dominator or post-dominator tree. 979 /// 980 /// A block BB2 will be in the same equivalence class as \p BB1 if 981 /// the following holds: 982 /// 983 /// 1- \p BB1 is a descendant of BB2 in the opposite tree. So, if BB2 984 /// is a descendant of \p BB1 in the dominator tree, then BB2 should 985 /// dominate BB1 in the post-dominator tree. 986 /// 987 /// 2- Both BB2 and \p BB1 must be in the same loop. 988 /// 989 /// For every block BB2 that meets those two requirements, we set BB2's 990 /// equivalence class to \p BB1. 991 /// 992 /// \param BB1 Block to check. 993 /// \param Descendants Descendants of \p BB1 in either the dom or pdom tree. 994 /// \param DomTree Opposite dominator tree. If \p Descendants is filled 995 /// with blocks from \p BB1's dominator tree, then 996 /// this is the post-dominator tree, and vice versa. 997 template <bool IsPostDom> 998 void SampleProfileLoader::findEquivalencesFor( 999 BasicBlock *BB1, ArrayRef<BasicBlock *> Descendants, 1000 DominatorTreeBase<BasicBlock, IsPostDom> *DomTree) { 1001 const BasicBlock *EC = EquivalenceClass[BB1]; 1002 uint64_t Weight = BlockWeights[EC]; 1003 for (const auto *BB2 : Descendants) { 1004 bool IsDomParent = DomTree->dominates(BB2, BB1); 1005 bool IsInSameLoop = LI->getLoopFor(BB1) == LI->getLoopFor(BB2); 1006 if (BB1 != BB2 && IsDomParent && IsInSameLoop) { 1007 EquivalenceClass[BB2] = EC; 1008 // If BB2 is visited, then the entire EC should be marked as visited. 1009 if (VisitedBlocks.count(BB2)) { 1010 VisitedBlocks.insert(EC); 1011 } 1012 1013 // If BB2 is heavier than BB1, make BB2 have the same weight 1014 // as BB1. 1015 // 1016 // Note that we don't worry about the opposite situation here 1017 // (when BB2 is lighter than BB1). We will deal with this 1018 // during the propagation phase. Right now, we just want to 1019 // make sure that BB1 has the largest weight of all the 1020 // members of its equivalence set. 1021 Weight = std::max(Weight, BlockWeights[BB2]); 1022 } 1023 } 1024 if (EC == &EC->getParent()->getEntryBlock()) { 1025 BlockWeights[EC] = Samples->getHeadSamples() + 1; 1026 } else { 1027 BlockWeights[EC] = Weight; 1028 } 1029 } 1030 1031 /// Find equivalence classes. 1032 /// 1033 /// Since samples may be missing from blocks, we can fill in the gaps by setting 1034 /// the weights of all the blocks in the same equivalence class to the same 1035 /// weight. To compute the concept of equivalence, we use dominance and loop 1036 /// information. Two blocks B1 and B2 are in the same equivalence class if B1 1037 /// dominates B2, B2 post-dominates B1 and both are in the same loop. 1038 /// 1039 /// \param F The function to query. 1040 void SampleProfileLoader::findEquivalenceClasses(Function &F) { 1041 SmallVector<BasicBlock *, 8> DominatedBBs; 1042 LLVM_DEBUG(dbgs() << "\nBlock equivalence classes\n"); 1043 // Find equivalence sets based on dominance and post-dominance information. 1044 for (auto &BB : F) { 1045 BasicBlock *BB1 = &BB; 1046 1047 // Compute BB1's equivalence class once. 1048 if (EquivalenceClass.count(BB1)) { 1049 LLVM_DEBUG(printBlockEquivalence(dbgs(), BB1)); 1050 continue; 1051 } 1052 1053 // By default, blocks are in their own equivalence class. 1054 EquivalenceClass[BB1] = BB1; 1055 1056 // Traverse all the blocks dominated by BB1. We are looking for 1057 // every basic block BB2 such that: 1058 // 1059 // 1- BB1 dominates BB2. 1060 // 2- BB2 post-dominates BB1. 1061 // 3- BB1 and BB2 are in the same loop nest. 1062 // 1063 // If all those conditions hold, it means that BB2 is executed 1064 // as many times as BB1, so they are placed in the same equivalence 1065 // class by making BB2's equivalence class be BB1. 1066 DominatedBBs.clear(); 1067 DT->getDescendants(BB1, DominatedBBs); 1068 findEquivalencesFor(BB1, DominatedBBs, PDT.get()); 1069 1070 LLVM_DEBUG(printBlockEquivalence(dbgs(), BB1)); 1071 } 1072 1073 // Assign weights to equivalence classes. 1074 // 1075 // All the basic blocks in the same equivalence class will execute 1076 // the same number of times. Since we know that the head block in 1077 // each equivalence class has the largest weight, assign that weight 1078 // to all the blocks in that equivalence class. 1079 LLVM_DEBUG( 1080 dbgs() << "\nAssign the same weight to all blocks in the same class\n"); 1081 for (auto &BI : F) { 1082 const BasicBlock *BB = &BI; 1083 const BasicBlock *EquivBB = EquivalenceClass[BB]; 1084 if (BB != EquivBB) 1085 BlockWeights[BB] = BlockWeights[EquivBB]; 1086 LLVM_DEBUG(printBlockWeight(dbgs(), BB)); 1087 } 1088 } 1089 1090 /// Visit the given edge to decide if it has a valid weight. 1091 /// 1092 /// If \p E has not been visited before, we copy to \p UnknownEdge 1093 /// and increment the count of unknown edges. 1094 /// 1095 /// \param E Edge to visit. 1096 /// \param NumUnknownEdges Current number of unknown edges. 1097 /// \param UnknownEdge Set if E has not been visited before. 1098 /// 1099 /// \returns E's weight, if known. Otherwise, return 0. 1100 uint64_t SampleProfileLoader::visitEdge(Edge E, unsigned *NumUnknownEdges, 1101 Edge *UnknownEdge) { 1102 if (!VisitedEdges.count(E)) { 1103 (*NumUnknownEdges)++; 1104 *UnknownEdge = E; 1105 return 0; 1106 } 1107 1108 return EdgeWeights[E]; 1109 } 1110 1111 /// Propagate weights through incoming/outgoing edges. 1112 /// 1113 /// If the weight of a basic block is known, and there is only one edge 1114 /// with an unknown weight, we can calculate the weight of that edge. 1115 /// 1116 /// Similarly, if all the edges have a known count, we can calculate the 1117 /// count of the basic block, if needed. 1118 /// 1119 /// \param F Function to process. 1120 /// \param UpdateBlockCount Whether we should update basic block counts that 1121 /// has already been annotated. 1122 /// 1123 /// \returns True if new weights were assigned to edges or blocks. 1124 bool SampleProfileLoader::propagateThroughEdges(Function &F, 1125 bool UpdateBlockCount) { 1126 bool Changed = false; 1127 LLVM_DEBUG(dbgs() << "\nPropagation through edges\n"); 1128 for (const auto &BI : F) { 1129 const BasicBlock *BB = &BI; 1130 const BasicBlock *EC = EquivalenceClass[BB]; 1131 1132 // Visit all the predecessor and successor edges to determine 1133 // which ones have a weight assigned already. Note that it doesn't 1134 // matter that we only keep track of a single unknown edge. The 1135 // only case we are interested in handling is when only a single 1136 // edge is unknown (see setEdgeOrBlockWeight). 1137 for (unsigned i = 0; i < 2; i++) { 1138 uint64_t TotalWeight = 0; 1139 unsigned NumUnknownEdges = 0, NumTotalEdges = 0; 1140 Edge UnknownEdge, SelfReferentialEdge, SingleEdge; 1141 1142 if (i == 0) { 1143 // First, visit all predecessor edges. 1144 NumTotalEdges = Predecessors[BB].size(); 1145 for (auto *Pred : Predecessors[BB]) { 1146 Edge E = std::make_pair(Pred, BB); 1147 TotalWeight += visitEdge(E, &NumUnknownEdges, &UnknownEdge); 1148 if (E.first == E.second) 1149 SelfReferentialEdge = E; 1150 } 1151 if (NumTotalEdges == 1) { 1152 SingleEdge = std::make_pair(Predecessors[BB][0], BB); 1153 } 1154 } else { 1155 // On the second round, visit all successor edges. 1156 NumTotalEdges = Successors[BB].size(); 1157 for (auto *Succ : Successors[BB]) { 1158 Edge E = std::make_pair(BB, Succ); 1159 TotalWeight += visitEdge(E, &NumUnknownEdges, &UnknownEdge); 1160 } 1161 if (NumTotalEdges == 1) { 1162 SingleEdge = std::make_pair(BB, Successors[BB][0]); 1163 } 1164 } 1165 1166 // After visiting all the edges, there are three cases that we 1167 // can handle immediately: 1168 // 1169 // - All the edge weights are known (i.e., NumUnknownEdges == 0). 1170 // In this case, we simply check that the sum of all the edges 1171 // is the same as BB's weight. If not, we change BB's weight 1172 // to match. Additionally, if BB had not been visited before, 1173 // we mark it visited. 1174 // 1175 // - Only one edge is unknown and BB has already been visited. 1176 // In this case, we can compute the weight of the edge by 1177 // subtracting the total block weight from all the known 1178 // edge weights. If the edges weight more than BB, then the 1179 // edge of the last remaining edge is set to zero. 1180 // 1181 // - There exists a self-referential edge and the weight of BB is 1182 // known. In this case, this edge can be based on BB's weight. 1183 // We add up all the other known edges and set the weight on 1184 // the self-referential edge as we did in the previous case. 1185 // 1186 // In any other case, we must continue iterating. Eventually, 1187 // all edges will get a weight, or iteration will stop when 1188 // it reaches SampleProfileMaxPropagateIterations. 1189 if (NumUnknownEdges <= 1) { 1190 uint64_t &BBWeight = BlockWeights[EC]; 1191 if (NumUnknownEdges == 0) { 1192 if (!VisitedBlocks.count(EC)) { 1193 // If we already know the weight of all edges, the weight of the 1194 // basic block can be computed. It should be no larger than the sum 1195 // of all edge weights. 1196 if (TotalWeight > BBWeight) { 1197 BBWeight = TotalWeight; 1198 Changed = true; 1199 LLVM_DEBUG(dbgs() << "All edge weights for " << BB->getName() 1200 << " known. Set weight for block: "; 1201 printBlockWeight(dbgs(), BB);); 1202 } 1203 } else if (NumTotalEdges == 1 && 1204 EdgeWeights[SingleEdge] < BlockWeights[EC]) { 1205 // If there is only one edge for the visited basic block, use the 1206 // block weight to adjust edge weight if edge weight is smaller. 1207 EdgeWeights[SingleEdge] = BlockWeights[EC]; 1208 Changed = true; 1209 } 1210 } else if (NumUnknownEdges == 1 && VisitedBlocks.count(EC)) { 1211 // If there is a single unknown edge and the block has been 1212 // visited, then we can compute E's weight. 1213 if (BBWeight >= TotalWeight) 1214 EdgeWeights[UnknownEdge] = BBWeight - TotalWeight; 1215 else 1216 EdgeWeights[UnknownEdge] = 0; 1217 const BasicBlock *OtherEC; 1218 if (i == 0) 1219 OtherEC = EquivalenceClass[UnknownEdge.first]; 1220 else 1221 OtherEC = EquivalenceClass[UnknownEdge.second]; 1222 // Edge weights should never exceed the BB weights it connects. 1223 if (VisitedBlocks.count(OtherEC) && 1224 EdgeWeights[UnknownEdge] > BlockWeights[OtherEC]) 1225 EdgeWeights[UnknownEdge] = BlockWeights[OtherEC]; 1226 VisitedEdges.insert(UnknownEdge); 1227 Changed = true; 1228 LLVM_DEBUG(dbgs() << "Set weight for edge: "; 1229 printEdgeWeight(dbgs(), UnknownEdge)); 1230 } 1231 } else if (VisitedBlocks.count(EC) && BlockWeights[EC] == 0) { 1232 // If a block Weights 0, all its in/out edges should weight 0. 1233 if (i == 0) { 1234 for (auto *Pred : Predecessors[BB]) { 1235 Edge E = std::make_pair(Pred, BB); 1236 EdgeWeights[E] = 0; 1237 VisitedEdges.insert(E); 1238 } 1239 } else { 1240 for (auto *Succ : Successors[BB]) { 1241 Edge E = std::make_pair(BB, Succ); 1242 EdgeWeights[E] = 0; 1243 VisitedEdges.insert(E); 1244 } 1245 } 1246 } else if (SelfReferentialEdge.first && VisitedBlocks.count(EC)) { 1247 uint64_t &BBWeight = BlockWeights[BB]; 1248 // We have a self-referential edge and the weight of BB is known. 1249 if (BBWeight >= TotalWeight) 1250 EdgeWeights[SelfReferentialEdge] = BBWeight - TotalWeight; 1251 else 1252 EdgeWeights[SelfReferentialEdge] = 0; 1253 VisitedEdges.insert(SelfReferentialEdge); 1254 Changed = true; 1255 LLVM_DEBUG(dbgs() << "Set self-referential edge weight to: "; 1256 printEdgeWeight(dbgs(), SelfReferentialEdge)); 1257 } 1258 if (UpdateBlockCount && !VisitedBlocks.count(EC) && TotalWeight > 0) { 1259 BlockWeights[EC] = TotalWeight; 1260 VisitedBlocks.insert(EC); 1261 Changed = true; 1262 } 1263 } 1264 } 1265 1266 return Changed; 1267 } 1268 1269 /// Build in/out edge lists for each basic block in the CFG. 1270 /// 1271 /// We are interested in unique edges. If a block B1 has multiple 1272 /// edges to another block B2, we only add a single B1->B2 edge. 1273 void SampleProfileLoader::buildEdges(Function &F) { 1274 for (auto &BI : F) { 1275 BasicBlock *B1 = &BI; 1276 1277 // Add predecessors for B1. 1278 SmallPtrSet<BasicBlock *, 16> Visited; 1279 if (!Predecessors[B1].empty()) 1280 llvm_unreachable("Found a stale predecessors list in a basic block."); 1281 for (pred_iterator PI = pred_begin(B1), PE = pred_end(B1); PI != PE; ++PI) { 1282 BasicBlock *B2 = *PI; 1283 if (Visited.insert(B2).second) 1284 Predecessors[B1].push_back(B2); 1285 } 1286 1287 // Add successors for B1. 1288 Visited.clear(); 1289 if (!Successors[B1].empty()) 1290 llvm_unreachable("Found a stale successors list in a basic block."); 1291 for (succ_iterator SI = succ_begin(B1), SE = succ_end(B1); SI != SE; ++SI) { 1292 BasicBlock *B2 = *SI; 1293 if (Visited.insert(B2).second) 1294 Successors[B1].push_back(B2); 1295 } 1296 } 1297 } 1298 1299 /// Returns the sorted CallTargetMap \p M by count in descending order. 1300 static SmallVector<InstrProfValueData, 2> GetSortedValueDataFromCallTargets( 1301 const SampleRecord::CallTargetMap & M) { 1302 SmallVector<InstrProfValueData, 2> R; 1303 for (const auto &I : SampleRecord::SortCallTargets(M)) { 1304 R.emplace_back(InstrProfValueData{FunctionSamples::getGUID(I.first), I.second}); 1305 } 1306 return R; 1307 } 1308 1309 /// Propagate weights into edges 1310 /// 1311 /// The following rules are applied to every block BB in the CFG: 1312 /// 1313 /// - If BB has a single predecessor/successor, then the weight 1314 /// of that edge is the weight of the block. 1315 /// 1316 /// - If all incoming or outgoing edges are known except one, and the 1317 /// weight of the block is already known, the weight of the unknown 1318 /// edge will be the weight of the block minus the sum of all the known 1319 /// edges. If the sum of all the known edges is larger than BB's weight, 1320 /// we set the unknown edge weight to zero. 1321 /// 1322 /// - If there is a self-referential edge, and the weight of the block is 1323 /// known, the weight for that edge is set to the weight of the block 1324 /// minus the weight of the other incoming edges to that block (if 1325 /// known). 1326 void SampleProfileLoader::propagateWeights(Function &F) { 1327 bool Changed = true; 1328 unsigned I = 0; 1329 1330 // If BB weight is larger than its corresponding loop's header BB weight, 1331 // use the BB weight to replace the loop header BB weight. 1332 for (auto &BI : F) { 1333 BasicBlock *BB = &BI; 1334 Loop *L = LI->getLoopFor(BB); 1335 if (!L) { 1336 continue; 1337 } 1338 BasicBlock *Header = L->getHeader(); 1339 if (Header && BlockWeights[BB] > BlockWeights[Header]) { 1340 BlockWeights[Header] = BlockWeights[BB]; 1341 } 1342 } 1343 1344 // Before propagation starts, build, for each block, a list of 1345 // unique predecessors and successors. This is necessary to handle 1346 // identical edges in multiway branches. Since we visit all blocks and all 1347 // edges of the CFG, it is cleaner to build these lists once at the start 1348 // of the pass. 1349 buildEdges(F); 1350 1351 // Propagate until we converge or we go past the iteration limit. 1352 while (Changed && I++ < SampleProfileMaxPropagateIterations) { 1353 Changed = propagateThroughEdges(F, false); 1354 } 1355 1356 // The first propagation propagates BB counts from annotated BBs to unknown 1357 // BBs. The 2nd propagation pass resets edges weights, and use all BB weights 1358 // to propagate edge weights. 1359 VisitedEdges.clear(); 1360 Changed = true; 1361 while (Changed && I++ < SampleProfileMaxPropagateIterations) { 1362 Changed = propagateThroughEdges(F, false); 1363 } 1364 1365 // The 3rd propagation pass allows adjust annotated BB weights that are 1366 // obviously wrong. 1367 Changed = true; 1368 while (Changed && I++ < SampleProfileMaxPropagateIterations) { 1369 Changed = propagateThroughEdges(F, true); 1370 } 1371 1372 // Generate MD_prof metadata for every branch instruction using the 1373 // edge weights computed during propagation. 1374 LLVM_DEBUG(dbgs() << "\nPropagation complete. Setting branch weights\n"); 1375 LLVMContext &Ctx = F.getContext(); 1376 MDBuilder MDB(Ctx); 1377 for (auto &BI : F) { 1378 BasicBlock *BB = &BI; 1379 1380 if (BlockWeights[BB]) { 1381 for (auto &I : BB->getInstList()) { 1382 if (!isa<CallInst>(I) && !isa<InvokeInst>(I)) 1383 continue; 1384 CallSite CS(&I); 1385 if (!CS.getCalledFunction()) { 1386 const DebugLoc &DLoc = I.getDebugLoc(); 1387 if (!DLoc) 1388 continue; 1389 const DILocation *DIL = DLoc; 1390 uint32_t LineOffset = FunctionSamples::getOffset(DIL); 1391 uint32_t Discriminator = DIL->getBaseDiscriminator(); 1392 1393 const FunctionSamples *FS = findFunctionSamples(I); 1394 if (!FS) 1395 continue; 1396 auto T = FS->findCallTargetMapAt(LineOffset, Discriminator); 1397 if (!T || T.get().empty()) 1398 continue; 1399 SmallVector<InstrProfValueData, 2> SortedCallTargets = 1400 GetSortedValueDataFromCallTargets(T.get()); 1401 uint64_t Sum; 1402 findIndirectCallFunctionSamples(I, Sum); 1403 annotateValueSite(*I.getParent()->getParent()->getParent(), I, 1404 SortedCallTargets, Sum, IPVK_IndirectCallTarget, 1405 SortedCallTargets.size()); 1406 } else if (!isa<IntrinsicInst>(&I)) { 1407 I.setMetadata(LLVMContext::MD_prof, 1408 MDB.createBranchWeights( 1409 {static_cast<uint32_t>(BlockWeights[BB])})); 1410 } 1411 } 1412 } 1413 Instruction *TI = BB->getTerminator(); 1414 if (TI->getNumSuccessors() == 1) 1415 continue; 1416 if (!isa<BranchInst>(TI) && !isa<SwitchInst>(TI)) 1417 continue; 1418 1419 DebugLoc BranchLoc = TI->getDebugLoc(); 1420 LLVM_DEBUG(dbgs() << "\nGetting weights for branch at line " 1421 << ((BranchLoc) ? Twine(BranchLoc.getLine()) 1422 : Twine("<UNKNOWN LOCATION>")) 1423 << ".\n"); 1424 SmallVector<uint32_t, 4> Weights; 1425 uint32_t MaxWeight = 0; 1426 Instruction *MaxDestInst; 1427 for (unsigned I = 0; I < TI->getNumSuccessors(); ++I) { 1428 BasicBlock *Succ = TI->getSuccessor(I); 1429 Edge E = std::make_pair(BB, Succ); 1430 uint64_t Weight = EdgeWeights[E]; 1431 LLVM_DEBUG(dbgs() << "\t"; printEdgeWeight(dbgs(), E)); 1432 // Use uint32_t saturated arithmetic to adjust the incoming weights, 1433 // if needed. Sample counts in profiles are 64-bit unsigned values, 1434 // but internally branch weights are expressed as 32-bit values. 1435 if (Weight > std::numeric_limits<uint32_t>::max()) { 1436 LLVM_DEBUG(dbgs() << " (saturated due to uint32_t overflow)"); 1437 Weight = std::numeric_limits<uint32_t>::max(); 1438 } 1439 // Weight is added by one to avoid propagation errors introduced by 1440 // 0 weights. 1441 Weights.push_back(static_cast<uint32_t>(Weight + 1)); 1442 if (Weight != 0) { 1443 if (Weight > MaxWeight) { 1444 MaxWeight = Weight; 1445 MaxDestInst = Succ->getFirstNonPHIOrDbgOrLifetime(); 1446 } 1447 } 1448 } 1449 1450 misexpect::verifyMisExpect(TI, Weights, TI->getContext()); 1451 1452 uint64_t TempWeight; 1453 // Only set weights if there is at least one non-zero weight. 1454 // In any other case, let the analyzer set weights. 1455 // Do not set weights if the weights are present. In ThinLTO, the profile 1456 // annotation is done twice. If the first annotation already set the 1457 // weights, the second pass does not need to set it. 1458 if (MaxWeight > 0 && !TI->extractProfTotalWeight(TempWeight)) { 1459 LLVM_DEBUG(dbgs() << "SUCCESS. Found non-zero weights.\n"); 1460 TI->setMetadata(LLVMContext::MD_prof, 1461 MDB.createBranchWeights(Weights)); 1462 ORE->emit([&]() { 1463 return OptimizationRemark(DEBUG_TYPE, "PopularDest", MaxDestInst) 1464 << "most popular destination for conditional branches at " 1465 << ore::NV("CondBranchesLoc", BranchLoc); 1466 }); 1467 } else { 1468 LLVM_DEBUG(dbgs() << "SKIPPED. All branch weights are zero.\n"); 1469 } 1470 } 1471 } 1472 1473 /// Get the line number for the function header. 1474 /// 1475 /// This looks up function \p F in the current compilation unit and 1476 /// retrieves the line number where the function is defined. This is 1477 /// line 0 for all the samples read from the profile file. Every line 1478 /// number is relative to this line. 1479 /// 1480 /// \param F Function object to query. 1481 /// 1482 /// \returns the line number where \p F is defined. If it returns 0, 1483 /// it means that there is no debug information available for \p F. 1484 unsigned SampleProfileLoader::getFunctionLoc(Function &F) { 1485 if (DISubprogram *S = F.getSubprogram()) 1486 return S->getLine(); 1487 1488 if (NoWarnSampleUnused) 1489 return 0; 1490 1491 // If the start of \p F is missing, emit a diagnostic to inform the user 1492 // about the missed opportunity. 1493 F.getContext().diagnose(DiagnosticInfoSampleProfile( 1494 "No debug information found in function " + F.getName() + 1495 ": Function profile not used", 1496 DS_Warning)); 1497 return 0; 1498 } 1499 1500 void SampleProfileLoader::computeDominanceAndLoopInfo(Function &F) { 1501 DT.reset(new DominatorTree); 1502 DT->recalculate(F); 1503 1504 PDT.reset(new PostDominatorTree(F)); 1505 1506 LI.reset(new LoopInfo); 1507 LI->analyze(*DT); 1508 } 1509 1510 /// Generate branch weight metadata for all branches in \p F. 1511 /// 1512 /// Branch weights are computed out of instruction samples using a 1513 /// propagation heuristic. Propagation proceeds in 3 phases: 1514 /// 1515 /// 1- Assignment of block weights. All the basic blocks in the function 1516 /// are initial assigned the same weight as their most frequently 1517 /// executed instruction. 1518 /// 1519 /// 2- Creation of equivalence classes. Since samples may be missing from 1520 /// blocks, we can fill in the gaps by setting the weights of all the 1521 /// blocks in the same equivalence class to the same weight. To compute 1522 /// the concept of equivalence, we use dominance and loop information. 1523 /// Two blocks B1 and B2 are in the same equivalence class if B1 1524 /// dominates B2, B2 post-dominates B1 and both are in the same loop. 1525 /// 1526 /// 3- Propagation of block weights into edges. This uses a simple 1527 /// propagation heuristic. The following rules are applied to every 1528 /// block BB in the CFG: 1529 /// 1530 /// - If BB has a single predecessor/successor, then the weight 1531 /// of that edge is the weight of the block. 1532 /// 1533 /// - If all the edges are known except one, and the weight of the 1534 /// block is already known, the weight of the unknown edge will 1535 /// be the weight of the block minus the sum of all the known 1536 /// edges. If the sum of all the known edges is larger than BB's weight, 1537 /// we set the unknown edge weight to zero. 1538 /// 1539 /// - If there is a self-referential edge, and the weight of the block is 1540 /// known, the weight for that edge is set to the weight of the block 1541 /// minus the weight of the other incoming edges to that block (if 1542 /// known). 1543 /// 1544 /// Since this propagation is not guaranteed to finalize for every CFG, we 1545 /// only allow it to proceed for a limited number of iterations (controlled 1546 /// by -sample-profile-max-propagate-iterations). 1547 /// 1548 /// FIXME: Try to replace this propagation heuristic with a scheme 1549 /// that is guaranteed to finalize. A work-list approach similar to 1550 /// the standard value propagation algorithm used by SSA-CCP might 1551 /// work here. 1552 /// 1553 /// Once all the branch weights are computed, we emit the MD_prof 1554 /// metadata on BB using the computed values for each of its branches. 1555 /// 1556 /// \param F The function to query. 1557 /// 1558 /// \returns true if \p F was modified. Returns false, otherwise. 1559 bool SampleProfileLoader::emitAnnotations(Function &F) { 1560 bool Changed = false; 1561 1562 if (getFunctionLoc(F) == 0) 1563 return false; 1564 1565 LLVM_DEBUG(dbgs() << "Line number for the first instruction in " 1566 << F.getName() << ": " << getFunctionLoc(F) << "\n"); 1567 1568 DenseSet<GlobalValue::GUID> InlinedGUIDs; 1569 Changed |= inlineHotFunctions(F, InlinedGUIDs); 1570 1571 // Compute basic block weights. 1572 Changed |= computeBlockWeights(F); 1573 1574 if (Changed) { 1575 // Add an entry count to the function using the samples gathered at the 1576 // function entry. 1577 // Sets the GUIDs that are inlined in the profiled binary. This is used 1578 // for ThinLink to make correct liveness analysis, and also make the IR 1579 // match the profiled binary before annotation. 1580 F.setEntryCount( 1581 ProfileCount(Samples->getHeadSamples() + 1, Function::PCT_Real), 1582 &InlinedGUIDs); 1583 1584 // Compute dominance and loop info needed for propagation. 1585 computeDominanceAndLoopInfo(F); 1586 1587 // Find equivalence classes. 1588 findEquivalenceClasses(F); 1589 1590 // Propagate weights to all edges. 1591 propagateWeights(F); 1592 } 1593 1594 // If coverage checking was requested, compute it now. 1595 if (SampleProfileRecordCoverage) { 1596 unsigned Used = CoverageTracker.countUsedRecords(Samples, PSI); 1597 unsigned Total = CoverageTracker.countBodyRecords(Samples, PSI); 1598 unsigned Coverage = CoverageTracker.computeCoverage(Used, Total); 1599 if (Coverage < SampleProfileRecordCoverage) { 1600 F.getContext().diagnose(DiagnosticInfoSampleProfile( 1601 F.getSubprogram()->getFilename(), getFunctionLoc(F), 1602 Twine(Used) + " of " + Twine(Total) + " available profile records (" + 1603 Twine(Coverage) + "%) were applied", 1604 DS_Warning)); 1605 } 1606 } 1607 1608 if (SampleProfileSampleCoverage) { 1609 uint64_t Used = CoverageTracker.getTotalUsedSamples(); 1610 uint64_t Total = CoverageTracker.countBodySamples(Samples, PSI); 1611 unsigned Coverage = CoverageTracker.computeCoverage(Used, Total); 1612 if (Coverage < SampleProfileSampleCoverage) { 1613 F.getContext().diagnose(DiagnosticInfoSampleProfile( 1614 F.getSubprogram()->getFilename(), getFunctionLoc(F), 1615 Twine(Used) + " of " + Twine(Total) + " available profile samples (" + 1616 Twine(Coverage) + "%) were applied", 1617 DS_Warning)); 1618 } 1619 } 1620 return Changed; 1621 } 1622 1623 char SampleProfileLoaderLegacyPass::ID = 0; 1624 1625 INITIALIZE_PASS_BEGIN(SampleProfileLoaderLegacyPass, "sample-profile", 1626 "Sample Profile loader", false, false) 1627 INITIALIZE_PASS_DEPENDENCY(AssumptionCacheTracker) 1628 INITIALIZE_PASS_DEPENDENCY(TargetTransformInfoWrapperPass) 1629 INITIALIZE_PASS_DEPENDENCY(ProfileSummaryInfoWrapperPass) 1630 INITIALIZE_PASS_END(SampleProfileLoaderLegacyPass, "sample-profile", 1631 "Sample Profile loader", false, false) 1632 1633 bool SampleProfileLoader::doInitialization(Module &M) { 1634 auto &Ctx = M.getContext(); 1635 auto ReaderOrErr = SampleProfileReader::create(Filename, Ctx); 1636 if (std::error_code EC = ReaderOrErr.getError()) { 1637 std::string Msg = "Could not open profile: " + EC.message(); 1638 Ctx.diagnose(DiagnosticInfoSampleProfile(Filename, Msg)); 1639 return false; 1640 } 1641 Reader = std::move(ReaderOrErr.get()); 1642 Reader->collectFuncsToUse(M); 1643 ProfileIsValid = (Reader->read() == sampleprof_error::success); 1644 PSL = Reader->getProfileSymbolList(); 1645 1646 if (!RemappingFilename.empty()) { 1647 // Apply profile remappings to the loaded profile data if requested. 1648 // For now, we only support remapping symbols encoded using the Itanium 1649 // C++ ABI's name mangling scheme. 1650 ReaderOrErr = SampleProfileReaderItaniumRemapper::create( 1651 RemappingFilename, Ctx, std::move(Reader)); 1652 if (std::error_code EC = ReaderOrErr.getError()) { 1653 std::string Msg = "Could not open profile remapping file: " + EC.message(); 1654 Ctx.diagnose(DiagnosticInfoSampleProfile(Filename, Msg)); 1655 return false; 1656 } 1657 Reader = std::move(ReaderOrErr.get()); 1658 ProfileIsValid = (Reader->read() == sampleprof_error::success); 1659 } 1660 return true; 1661 } 1662 1663 ModulePass *llvm::createSampleProfileLoaderPass() { 1664 return new SampleProfileLoaderLegacyPass(); 1665 } 1666 1667 ModulePass *llvm::createSampleProfileLoaderPass(StringRef Name) { 1668 return new SampleProfileLoaderLegacyPass(Name); 1669 } 1670 1671 bool SampleProfileLoader::runOnModule(Module &M, ModuleAnalysisManager *AM, 1672 ProfileSummaryInfo *_PSI) { 1673 GUIDToFuncNameMapper Mapper(M, *Reader, GUIDToFuncNameMap); 1674 if (!ProfileIsValid) 1675 return false; 1676 1677 PSI = _PSI; 1678 if (M.getProfileSummary(/* IsCS */ false) == nullptr) 1679 M.setProfileSummary(Reader->getSummary().getMD(M.getContext()), 1680 ProfileSummary::PSK_Sample); 1681 1682 // Compute the total number of samples collected in this profile. 1683 for (const auto &I : Reader->getProfiles()) 1684 TotalCollectedSamples += I.second.getTotalSamples(); 1685 1686 // Populate the symbol map. 1687 for (const auto &N_F : M.getValueSymbolTable()) { 1688 StringRef OrigName = N_F.getKey(); 1689 Function *F = dyn_cast<Function>(N_F.getValue()); 1690 if (F == nullptr) 1691 continue; 1692 SymbolMap[OrigName] = F; 1693 auto pos = OrigName.find('.'); 1694 if (pos != StringRef::npos) { 1695 StringRef NewName = OrigName.substr(0, pos); 1696 auto r = SymbolMap.insert(std::make_pair(NewName, F)); 1697 // Failiing to insert means there is already an entry in SymbolMap, 1698 // thus there are multiple functions that are mapped to the same 1699 // stripped name. In this case of name conflicting, set the value 1700 // to nullptr to avoid confusion. 1701 if (!r.second) 1702 r.first->second = nullptr; 1703 } 1704 } 1705 1706 bool retval = false; 1707 for (auto &F : M) 1708 if (!F.isDeclaration()) { 1709 clearFunctionData(); 1710 retval |= runOnFunction(F, AM); 1711 } 1712 1713 // Account for cold calls not inlined.... 1714 for (const std::pair<Function *, NotInlinedProfileInfo> &pair : 1715 notInlinedCallInfo) 1716 updateProfileCallee(pair.first, pair.second.entryCount); 1717 1718 return retval; 1719 } 1720 1721 bool SampleProfileLoaderLegacyPass::runOnModule(Module &M) { 1722 ACT = &getAnalysis<AssumptionCacheTracker>(); 1723 TTIWP = &getAnalysis<TargetTransformInfoWrapperPass>(); 1724 ProfileSummaryInfo *PSI = 1725 &getAnalysis<ProfileSummaryInfoWrapperPass>().getPSI(); 1726 return SampleLoader.runOnModule(M, nullptr, PSI); 1727 } 1728 1729 bool SampleProfileLoader::runOnFunction(Function &F, ModuleAnalysisManager *AM) { 1730 1731 DILocation2SampleMap.clear(); 1732 // By default the entry count is initialized to -1, which will be treated 1733 // conservatively by getEntryCount as the same as unknown (None). This is 1734 // to avoid newly added code to be treated as cold. If we have samples 1735 // this will be overwritten in emitAnnotations. 1736 // 1737 // PSL -- profile symbol list include all the symbols in sampled binary. 1738 // If ProfileSampleAccurate is true or F has profile-sample-accurate 1739 // attribute, and if there is no profile symbol list read in, initialize 1740 // all the function entry counts to 0; if there is profile symbol list, only 1741 // initialize the entry count to 0 when current function is in the list. 1742 uint64_t initialEntryCount = 1743 ((ProfileSampleAccurate || F.hasFnAttribute("profile-sample-accurate")) && 1744 (!PSL || PSL->contains(F.getName()))) 1745 ? 0 1746 : -1; 1747 F.setEntryCount(ProfileCount(initialEntryCount, Function::PCT_Real)); 1748 std::unique_ptr<OptimizationRemarkEmitter> OwnedORE; 1749 if (AM) { 1750 auto &FAM = 1751 AM->getResult<FunctionAnalysisManagerModuleProxy>(*F.getParent()) 1752 .getManager(); 1753 ORE = &FAM.getResult<OptimizationRemarkEmitterAnalysis>(F); 1754 } else { 1755 OwnedORE = std::make_unique<OptimizationRemarkEmitter>(&F); 1756 ORE = OwnedORE.get(); 1757 } 1758 Samples = Reader->getSamplesFor(F); 1759 if (Samples && !Samples->empty()) 1760 return emitAnnotations(F); 1761 return false; 1762 } 1763 1764 PreservedAnalyses SampleProfileLoaderPass::run(Module &M, 1765 ModuleAnalysisManager &AM) { 1766 FunctionAnalysisManager &FAM = 1767 AM.getResult<FunctionAnalysisManagerModuleProxy>(M).getManager(); 1768 1769 auto GetAssumptionCache = [&](Function &F) -> AssumptionCache & { 1770 return FAM.getResult<AssumptionAnalysis>(F); 1771 }; 1772 auto GetTTI = [&](Function &F) -> TargetTransformInfo & { 1773 return FAM.getResult<TargetIRAnalysis>(F); 1774 }; 1775 1776 SampleProfileLoader SampleLoader( 1777 ProfileFileName.empty() ? SampleProfileFile : ProfileFileName, 1778 ProfileRemappingFileName.empty() ? SampleProfileRemappingFile 1779 : ProfileRemappingFileName, 1780 IsThinLTOPreLink, GetAssumptionCache, GetTTI); 1781 1782 SampleLoader.doInitialization(M); 1783 1784 ProfileSummaryInfo *PSI = &AM.getResult<ProfileSummaryAnalysis>(M); 1785 if (!SampleLoader.runOnModule(M, &AM, PSI)) 1786 return PreservedAnalyses::all(); 1787 1788 return PreservedAnalyses::none(); 1789 } 1790