1 //===- SampleProfile.cpp - Incorporate sample profiles into the IR --------===//
2 //
3 // Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
4 // See https://llvm.org/LICENSE.txt for license information.
5 // SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
6 //
7 //===----------------------------------------------------------------------===//
8 //
9 // This file implements the SampleProfileLoader transformation. This pass
10 // reads a profile file generated by a sampling profiler (e.g. Linux Perf -
11 // http://perf.wiki.kernel.org/) and generates IR metadata to reflect the
12 // profile information in the given profile.
13 //
14 // This pass generates branch weight annotations on the IR:
15 //
16 // - prof: Represents branch weights. This annotation is added to branches
17 //      to indicate the weights of each edge coming out of the branch.
18 //      The weight of each edge is the weight of the target block for
19 //      that edge. The weight of a block B is computed as the maximum
20 //      number of samples found in B.
21 //
22 //===----------------------------------------------------------------------===//
23 
24 #include "llvm/Transforms/IPO/SampleProfile.h"
25 #include "llvm/ADT/ArrayRef.h"
26 #include "llvm/ADT/DenseMap.h"
27 #include "llvm/ADT/DenseSet.h"
28 #include "llvm/ADT/None.h"
29 #include "llvm/ADT/SmallPtrSet.h"
30 #include "llvm/ADT/SmallSet.h"
31 #include "llvm/ADT/SmallVector.h"
32 #include "llvm/ADT/StringMap.h"
33 #include "llvm/ADT/StringRef.h"
34 #include "llvm/ADT/Twine.h"
35 #include "llvm/Analysis/AssumptionCache.h"
36 #include "llvm/Analysis/InlineCost.h"
37 #include "llvm/Analysis/LoopInfo.h"
38 #include "llvm/Analysis/OptimizationRemarkEmitter.h"
39 #include "llvm/Analysis/PostDominators.h"
40 #include "llvm/Analysis/ProfileSummaryInfo.h"
41 #include "llvm/Analysis/TargetTransformInfo.h"
42 #include "llvm/IR/BasicBlock.h"
43 #include "llvm/IR/CFG.h"
44 #include "llvm/IR/CallSite.h"
45 #include "llvm/IR/DebugInfoMetadata.h"
46 #include "llvm/IR/DebugLoc.h"
47 #include "llvm/IR/DiagnosticInfo.h"
48 #include "llvm/IR/Dominators.h"
49 #include "llvm/IR/Function.h"
50 #include "llvm/IR/GlobalValue.h"
51 #include "llvm/IR/InstrTypes.h"
52 #include "llvm/IR/Instruction.h"
53 #include "llvm/IR/Instructions.h"
54 #include "llvm/IR/IntrinsicInst.h"
55 #include "llvm/IR/LLVMContext.h"
56 #include "llvm/IR/MDBuilder.h"
57 #include "llvm/IR/Module.h"
58 #include "llvm/IR/PassManager.h"
59 #include "llvm/IR/ValueSymbolTable.h"
60 #include "llvm/Pass.h"
61 #include "llvm/ProfileData/InstrProf.h"
62 #include "llvm/ProfileData/SampleProf.h"
63 #include "llvm/ProfileData/SampleProfReader.h"
64 #include "llvm/Support/Casting.h"
65 #include "llvm/Support/CommandLine.h"
66 #include "llvm/Support/Debug.h"
67 #include "llvm/Support/ErrorHandling.h"
68 #include "llvm/Support/ErrorOr.h"
69 #include "llvm/Support/GenericDomTree.h"
70 #include "llvm/Support/raw_ostream.h"
71 #include "llvm/Transforms/IPO.h"
72 #include "llvm/Transforms/Instrumentation.h"
73 #include "llvm/Transforms/Utils/CallPromotionUtils.h"
74 #include "llvm/Transforms/Utils/Cloning.h"
75 #include "llvm/Transforms/Utils/MisExpect.h"
76 #include <algorithm>
77 #include <cassert>
78 #include <cstdint>
79 #include <functional>
80 #include <limits>
81 #include <map>
82 #include <memory>
83 #include <queue>
84 #include <string>
85 #include <system_error>
86 #include <utility>
87 #include <vector>
88 
89 using namespace llvm;
90 using namespace sampleprof;
91 using ProfileCount = Function::ProfileCount;
92 #define DEBUG_TYPE "sample-profile"
93 
94 // Command line option to specify the file to read samples from. This is
95 // mainly used for debugging.
96 static cl::opt<std::string> SampleProfileFile(
97     "sample-profile-file", cl::init(""), cl::value_desc("filename"),
98     cl::desc("Profile file loaded by -sample-profile"), cl::Hidden);
99 
100 // The named file contains a set of transformations that may have been applied
101 // to the symbol names between the program from which the sample data was
102 // collected and the current program's symbols.
103 static cl::opt<std::string> SampleProfileRemappingFile(
104     "sample-profile-remapping-file", cl::init(""), cl::value_desc("filename"),
105     cl::desc("Profile remapping file loaded by -sample-profile"), cl::Hidden);
106 
107 static cl::opt<unsigned> SampleProfileMaxPropagateIterations(
108     "sample-profile-max-propagate-iterations", cl::init(100),
109     cl::desc("Maximum number of iterations to go through when propagating "
110              "sample block/edge weights through the CFG."));
111 
112 static cl::opt<unsigned> SampleProfileRecordCoverage(
113     "sample-profile-check-record-coverage", cl::init(0), cl::value_desc("N"),
114     cl::desc("Emit a warning if less than N% of records in the input profile "
115              "are matched to the IR."));
116 
117 static cl::opt<unsigned> SampleProfileSampleCoverage(
118     "sample-profile-check-sample-coverage", cl::init(0), cl::value_desc("N"),
119     cl::desc("Emit a warning if less than N% of samples in the input profile "
120              "are matched to the IR."));
121 
122 static cl::opt<bool> NoWarnSampleUnused(
123     "no-warn-sample-unused", cl::init(false), cl::Hidden,
124     cl::desc("Use this option to turn off/on warnings about function with "
125              "samples but without debug information to use those samples. "));
126 
127 static cl::opt<bool> ProfileSampleAccurate(
128     "profile-sample-accurate", cl::Hidden, cl::init(false),
129     cl::desc("If the sample profile is accurate, we will mark all un-sampled "
130              "callsite and function as having 0 samples. Otherwise, treat "
131              "un-sampled callsites and functions conservatively as unknown. "));
132 
133 namespace {
134 
135 using BlockWeightMap = DenseMap<const BasicBlock *, uint64_t>;
136 using EquivalenceClassMap = DenseMap<const BasicBlock *, const BasicBlock *>;
137 using Edge = std::pair<const BasicBlock *, const BasicBlock *>;
138 using EdgeWeightMap = DenseMap<Edge, uint64_t>;
139 using BlockEdgeMap =
140     DenseMap<const BasicBlock *, SmallVector<const BasicBlock *, 8>>;
141 
142 class SampleProfileLoader;
143 
144 class SampleCoverageTracker {
145 public:
146   SampleCoverageTracker(SampleProfileLoader &SPL) : SPLoader(SPL){};
147 
148   bool markSamplesUsed(const FunctionSamples *FS, uint32_t LineOffset,
149                        uint32_t Discriminator, uint64_t Samples);
150   unsigned computeCoverage(unsigned Used, unsigned Total) const;
151   unsigned countUsedRecords(const FunctionSamples *FS,
152                             ProfileSummaryInfo *PSI) const;
153   unsigned countBodyRecords(const FunctionSamples *FS,
154                             ProfileSummaryInfo *PSI) const;
155   uint64_t getTotalUsedSamples() const { return TotalUsedSamples; }
156   uint64_t countBodySamples(const FunctionSamples *FS,
157                             ProfileSummaryInfo *PSI) const;
158 
159   void clear() {
160     SampleCoverage.clear();
161     TotalUsedSamples = 0;
162   }
163 
164 private:
165   using BodySampleCoverageMap = std::map<LineLocation, unsigned>;
166   using FunctionSamplesCoverageMap =
167       DenseMap<const FunctionSamples *, BodySampleCoverageMap>;
168 
169   /// Coverage map for sampling records.
170   ///
171   /// This map keeps a record of sampling records that have been matched to
172   /// an IR instruction. This is used to detect some form of staleness in
173   /// profiles (see flag -sample-profile-check-coverage).
174   ///
175   /// Each entry in the map corresponds to a FunctionSamples instance.  This is
176   /// another map that counts how many times the sample record at the
177   /// given location has been used.
178   FunctionSamplesCoverageMap SampleCoverage;
179 
180   /// Number of samples used from the profile.
181   ///
182   /// When a sampling record is used for the first time, the samples from
183   /// that record are added to this accumulator.  Coverage is later computed
184   /// based on the total number of samples available in this function and
185   /// its callsites.
186   ///
187   /// Note that this accumulator tracks samples used from a single function
188   /// and all the inlined callsites. Strictly, we should have a map of counters
189   /// keyed by FunctionSamples pointers, but these stats are cleared after
190   /// every function, so we just need to keep a single counter.
191   uint64_t TotalUsedSamples = 0;
192 
193   SampleProfileLoader &SPLoader;
194 };
195 
196 class GUIDToFuncNameMapper {
197 public:
198   GUIDToFuncNameMapper(Module &M, SampleProfileReader &Reader,
199                         DenseMap<uint64_t, StringRef> &GUIDToFuncNameMap)
200       : CurrentReader(Reader), CurrentModule(M),
201       CurrentGUIDToFuncNameMap(GUIDToFuncNameMap) {
202     if (CurrentReader.getFormat() != SPF_Compact_Binary)
203       return;
204 
205     for (const auto &F : CurrentModule) {
206       StringRef OrigName = F.getName();
207       CurrentGUIDToFuncNameMap.insert(
208           {Function::getGUID(OrigName), OrigName});
209 
210       // Local to global var promotion used by optimization like thinlto
211       // will rename the var and add suffix like ".llvm.xxx" to the
212       // original local name. In sample profile, the suffixes of function
213       // names are all stripped. Since it is possible that the mapper is
214       // built in post-thin-link phase and var promotion has been done,
215       // we need to add the substring of function name without the suffix
216       // into the GUIDToFuncNameMap.
217       StringRef CanonName = FunctionSamples::getCanonicalFnName(F);
218       if (CanonName != OrigName)
219         CurrentGUIDToFuncNameMap.insert(
220             {Function::getGUID(CanonName), CanonName});
221     }
222 
223     // Update GUIDToFuncNameMap for each function including inlinees.
224     SetGUIDToFuncNameMapForAll(&CurrentGUIDToFuncNameMap);
225   }
226 
227   ~GUIDToFuncNameMapper() {
228     if (CurrentReader.getFormat() != SPF_Compact_Binary)
229       return;
230 
231     CurrentGUIDToFuncNameMap.clear();
232 
233     // Reset GUIDToFuncNameMap for of each function as they're no
234     // longer valid at this point.
235     SetGUIDToFuncNameMapForAll(nullptr);
236   }
237 
238 private:
239   void SetGUIDToFuncNameMapForAll(DenseMap<uint64_t, StringRef> *Map) {
240     std::queue<FunctionSamples *> FSToUpdate;
241     for (auto &IFS : CurrentReader.getProfiles()) {
242       FSToUpdate.push(&IFS.second);
243     }
244 
245     while (!FSToUpdate.empty()) {
246       FunctionSamples *FS = FSToUpdate.front();
247       FSToUpdate.pop();
248       FS->GUIDToFuncNameMap = Map;
249       for (const auto &ICS : FS->getCallsiteSamples()) {
250         const FunctionSamplesMap &FSMap = ICS.second;
251         for (auto &IFS : FSMap) {
252           FunctionSamples &FS = const_cast<FunctionSamples &>(IFS.second);
253           FSToUpdate.push(&FS);
254         }
255       }
256     }
257   }
258 
259   SampleProfileReader &CurrentReader;
260   Module &CurrentModule;
261   DenseMap<uint64_t, StringRef> &CurrentGUIDToFuncNameMap;
262 };
263 
264 /// Sample profile pass.
265 ///
266 /// This pass reads profile data from the file specified by
267 /// -sample-profile-file and annotates every affected function with the
268 /// profile information found in that file.
269 class SampleProfileLoader {
270 public:
271   SampleProfileLoader(
272       StringRef Name, StringRef RemapName, bool IsThinLTOPreLink,
273       std::function<AssumptionCache &(Function &)> GetAssumptionCache,
274       std::function<TargetTransformInfo &(Function &)> GetTargetTransformInfo)
275       : GetAC(std::move(GetAssumptionCache)),
276         GetTTI(std::move(GetTargetTransformInfo)), CoverageTracker(*this),
277         Filename(Name), RemappingFilename(RemapName),
278         IsThinLTOPreLink(IsThinLTOPreLink) {}
279 
280   bool doInitialization(Module &M);
281   bool runOnModule(Module &M, ModuleAnalysisManager *AM,
282                    ProfileSummaryInfo *_PSI);
283 
284   void dump() { Reader->dump(); }
285 
286 protected:
287   friend class SampleCoverageTracker;
288 
289   bool runOnFunction(Function &F, ModuleAnalysisManager *AM);
290   unsigned getFunctionLoc(Function &F);
291   bool emitAnnotations(Function &F);
292   ErrorOr<uint64_t> getInstWeight(const Instruction &I);
293   ErrorOr<uint64_t> getBlockWeight(const BasicBlock *BB);
294   const FunctionSamples *findCalleeFunctionSamples(const Instruction &I) const;
295   std::vector<const FunctionSamples *>
296   findIndirectCallFunctionSamples(const Instruction &I, uint64_t &Sum) const;
297   mutable DenseMap<const DILocation *, const FunctionSamples *> DILocation2SampleMap;
298   const FunctionSamples *findFunctionSamples(const Instruction &I) const;
299   bool inlineCallInstruction(Instruction *I);
300   bool inlineHotFunctions(Function &F,
301                           DenseSet<GlobalValue::GUID> &InlinedGUIDs);
302   void printEdgeWeight(raw_ostream &OS, Edge E);
303   void printBlockWeight(raw_ostream &OS, const BasicBlock *BB) const;
304   void printBlockEquivalence(raw_ostream &OS, const BasicBlock *BB);
305   bool computeBlockWeights(Function &F);
306   void findEquivalenceClasses(Function &F);
307   template <bool IsPostDom>
308   void findEquivalencesFor(BasicBlock *BB1, ArrayRef<BasicBlock *> Descendants,
309                            DominatorTreeBase<BasicBlock, IsPostDom> *DomTree);
310 
311   void propagateWeights(Function &F);
312   uint64_t visitEdge(Edge E, unsigned *NumUnknownEdges, Edge *UnknownEdge);
313   void buildEdges(Function &F);
314   bool propagateThroughEdges(Function &F, bool UpdateBlockCount);
315   void computeDominanceAndLoopInfo(Function &F);
316   void clearFunctionData();
317   bool callsiteIsHot(const FunctionSamples *CallsiteFS,
318                      ProfileSummaryInfo *PSI);
319 
320   /// Map basic blocks to their computed weights.
321   ///
322   /// The weight of a basic block is defined to be the maximum
323   /// of all the instruction weights in that block.
324   BlockWeightMap BlockWeights;
325 
326   /// Map edges to their computed weights.
327   ///
328   /// Edge weights are computed by propagating basic block weights in
329   /// SampleProfile::propagateWeights.
330   EdgeWeightMap EdgeWeights;
331 
332   /// Set of visited blocks during propagation.
333   SmallPtrSet<const BasicBlock *, 32> VisitedBlocks;
334 
335   /// Set of visited edges during propagation.
336   SmallSet<Edge, 32> VisitedEdges;
337 
338   /// Equivalence classes for block weights.
339   ///
340   /// Two blocks BB1 and BB2 are in the same equivalence class if they
341   /// dominate and post-dominate each other, and they are in the same loop
342   /// nest. When this happens, the two blocks are guaranteed to execute
343   /// the same number of times.
344   EquivalenceClassMap EquivalenceClass;
345 
346   /// Map from function name to Function *. Used to find the function from
347   /// the function name. If the function name contains suffix, additional
348   /// entry is added to map from the stripped name to the function if there
349   /// is one-to-one mapping.
350   StringMap<Function *> SymbolMap;
351 
352   /// Dominance, post-dominance and loop information.
353   std::unique_ptr<DominatorTree> DT;
354   std::unique_ptr<PostDominatorTree> PDT;
355   std::unique_ptr<LoopInfo> LI;
356 
357   std::function<AssumptionCache &(Function &)> GetAC;
358   std::function<TargetTransformInfo &(Function &)> GetTTI;
359 
360   /// Predecessors for each basic block in the CFG.
361   BlockEdgeMap Predecessors;
362 
363   /// Successors for each basic block in the CFG.
364   BlockEdgeMap Successors;
365 
366   SampleCoverageTracker CoverageTracker;
367 
368   /// Profile reader object.
369   std::unique_ptr<SampleProfileReader> Reader;
370 
371   /// Samples collected for the body of this function.
372   FunctionSamples *Samples = nullptr;
373 
374   /// Name of the profile file to load.
375   std::string Filename;
376 
377   /// Name of the profile remapping file to load.
378   std::string RemappingFilename;
379 
380   /// Flag indicating whether the profile input loaded successfully.
381   bool ProfileIsValid = false;
382 
383   /// Flag indicating if the pass is invoked in ThinLTO compile phase.
384   ///
385   /// In this phase, in annotation, we should not promote indirect calls.
386   /// Instead, we will mark GUIDs that needs to be annotated to the function.
387   bool IsThinLTOPreLink;
388 
389   /// Profile Summary Info computed from sample profile.
390   ProfileSummaryInfo *PSI = nullptr;
391 
392   /// Profle Symbol list tells whether a function name appears in the binary
393   /// used to generate the current profile.
394   std::unique_ptr<ProfileSymbolList> PSL;
395 
396   /// Total number of samples collected in this profile.
397   ///
398   /// This is the sum of all the samples collected in all the functions executed
399   /// at runtime.
400   uint64_t TotalCollectedSamples = 0;
401 
402   /// Optimization Remark Emitter used to emit diagnostic remarks.
403   OptimizationRemarkEmitter *ORE = nullptr;
404 
405   // Information recorded when we declined to inline a call site
406   // because we have determined it is too cold is accumulated for
407   // each callee function. Initially this is just the entry count.
408   struct NotInlinedProfileInfo {
409     uint64_t entryCount;
410   };
411   DenseMap<Function *, NotInlinedProfileInfo> notInlinedCallInfo;
412 
413   // GUIDToFuncNameMap saves the mapping from GUID to the symbol name, for
414   // all the function symbols defined or declared in current module.
415   DenseMap<uint64_t, StringRef> GUIDToFuncNameMap;
416 
417   // All the Names used in FunctionSamples including outline function
418   // names, inline instance names and call target names.
419   StringSet<> NamesInProfile;
420 
421   // Showing whether ProfileSampleAccurate is enabled for current function.
422   bool ProfSampleAccEnabled = false;
423 };
424 
425 class SampleProfileLoaderLegacyPass : public ModulePass {
426 public:
427   // Class identification, replacement for typeinfo
428   static char ID;
429 
430   SampleProfileLoaderLegacyPass(StringRef Name = SampleProfileFile,
431                                 bool IsThinLTOPreLink = false)
432       : ModulePass(ID),
433         SampleLoader(Name, SampleProfileRemappingFile, IsThinLTOPreLink,
434                      [&](Function &F) -> AssumptionCache & {
435                        return ACT->getAssumptionCache(F);
436                      },
437                      [&](Function &F) -> TargetTransformInfo & {
438                        return TTIWP->getTTI(F);
439                      }) {
440     initializeSampleProfileLoaderLegacyPassPass(
441         *PassRegistry::getPassRegistry());
442   }
443 
444   void dump() { SampleLoader.dump(); }
445 
446   bool doInitialization(Module &M) override {
447     return SampleLoader.doInitialization(M);
448   }
449 
450   StringRef getPassName() const override { return "Sample profile pass"; }
451   bool runOnModule(Module &M) override;
452 
453   void getAnalysisUsage(AnalysisUsage &AU) const override {
454     AU.addRequired<AssumptionCacheTracker>();
455     AU.addRequired<TargetTransformInfoWrapperPass>();
456     AU.addRequired<ProfileSummaryInfoWrapperPass>();
457   }
458 
459 private:
460   SampleProfileLoader SampleLoader;
461   AssumptionCacheTracker *ACT = nullptr;
462   TargetTransformInfoWrapperPass *TTIWP = nullptr;
463 };
464 
465 } // end anonymous namespace
466 
467 /// Return true if the given callsite is hot wrt to hot cutoff threshold.
468 ///
469 /// Functions that were inlined in the original binary will be represented
470 /// in the inline stack in the sample profile. If the profile shows that
471 /// the original inline decision was "good" (i.e., the callsite is executed
472 /// frequently), then we will recreate the inline decision and apply the
473 /// profile from the inlined callsite.
474 ///
475 /// To decide whether an inlined callsite is hot, we compare the callsite
476 /// sample count with the hot cutoff computed by ProfileSummaryInfo, it is
477 /// regarded as hot if the count is above the cutoff value.
478 ///
479 /// When profile-sample-accurate is enabled, functions without profile will
480 /// be regarded as cold and much less inlining will happen in CGSCC inlining
481 /// pass, so we tend to lower the hot criteria here to allow more early
482 /// inlining to happen for warm callsites and it is helpful for performance.
483 bool SampleProfileLoader::callsiteIsHot(const FunctionSamples *CallsiteFS,
484                                         ProfileSummaryInfo *PSI) {
485   if (!CallsiteFS)
486     return false; // The callsite was not inlined in the original binary.
487 
488   assert(PSI && "PSI is expected to be non null");
489   uint64_t CallsiteTotalSamples = CallsiteFS->getTotalSamples();
490   if (ProfSampleAccEnabled)
491     return !PSI->isColdCount(CallsiteTotalSamples);
492   else
493     return PSI->isHotCount(CallsiteTotalSamples);
494 }
495 
496 /// Mark as used the sample record for the given function samples at
497 /// (LineOffset, Discriminator).
498 ///
499 /// \returns true if this is the first time we mark the given record.
500 bool SampleCoverageTracker::markSamplesUsed(const FunctionSamples *FS,
501                                             uint32_t LineOffset,
502                                             uint32_t Discriminator,
503                                             uint64_t Samples) {
504   LineLocation Loc(LineOffset, Discriminator);
505   unsigned &Count = SampleCoverage[FS][Loc];
506   bool FirstTime = (++Count == 1);
507   if (FirstTime)
508     TotalUsedSamples += Samples;
509   return FirstTime;
510 }
511 
512 /// Return the number of sample records that were applied from this profile.
513 ///
514 /// This count does not include records from cold inlined callsites.
515 unsigned
516 SampleCoverageTracker::countUsedRecords(const FunctionSamples *FS,
517                                         ProfileSummaryInfo *PSI) const {
518   auto I = SampleCoverage.find(FS);
519 
520   // The size of the coverage map for FS represents the number of records
521   // that were marked used at least once.
522   unsigned Count = (I != SampleCoverage.end()) ? I->second.size() : 0;
523 
524   // If there are inlined callsites in this function, count the samples found
525   // in the respective bodies. However, do not bother counting callees with 0
526   // total samples, these are callees that were never invoked at runtime.
527   for (const auto &I : FS->getCallsiteSamples())
528     for (const auto &J : I.second) {
529       const FunctionSamples *CalleeSamples = &J.second;
530       if (SPLoader.callsiteIsHot(CalleeSamples, PSI))
531         Count += countUsedRecords(CalleeSamples, PSI);
532     }
533 
534   return Count;
535 }
536 
537 /// Return the number of sample records in the body of this profile.
538 ///
539 /// This count does not include records from cold inlined callsites.
540 unsigned
541 SampleCoverageTracker::countBodyRecords(const FunctionSamples *FS,
542                                         ProfileSummaryInfo *PSI) const {
543   unsigned Count = FS->getBodySamples().size();
544 
545   // Only count records in hot callsites.
546   for (const auto &I : FS->getCallsiteSamples())
547     for (const auto &J : I.second) {
548       const FunctionSamples *CalleeSamples = &J.second;
549       if (SPLoader.callsiteIsHot(CalleeSamples, PSI))
550         Count += countBodyRecords(CalleeSamples, PSI);
551     }
552 
553   return Count;
554 }
555 
556 /// Return the number of samples collected in the body of this profile.
557 ///
558 /// This count does not include samples from cold inlined callsites.
559 uint64_t
560 SampleCoverageTracker::countBodySamples(const FunctionSamples *FS,
561                                         ProfileSummaryInfo *PSI) const {
562   uint64_t Total = 0;
563   for (const auto &I : FS->getBodySamples())
564     Total += I.second.getSamples();
565 
566   // Only count samples in hot callsites.
567   for (const auto &I : FS->getCallsiteSamples())
568     for (const auto &J : I.second) {
569       const FunctionSamples *CalleeSamples = &J.second;
570       if (SPLoader.callsiteIsHot(CalleeSamples, PSI))
571         Total += countBodySamples(CalleeSamples, PSI);
572     }
573 
574   return Total;
575 }
576 
577 /// Return the fraction of sample records used in this profile.
578 ///
579 /// The returned value is an unsigned integer in the range 0-100 indicating
580 /// the percentage of sample records that were used while applying this
581 /// profile to the associated function.
582 unsigned SampleCoverageTracker::computeCoverage(unsigned Used,
583                                                 unsigned Total) const {
584   assert(Used <= Total &&
585          "number of used records cannot exceed the total number of records");
586   return Total > 0 ? Used * 100 / Total : 100;
587 }
588 
589 /// Clear all the per-function data used to load samples and propagate weights.
590 void SampleProfileLoader::clearFunctionData() {
591   BlockWeights.clear();
592   EdgeWeights.clear();
593   VisitedBlocks.clear();
594   VisitedEdges.clear();
595   EquivalenceClass.clear();
596   DT = nullptr;
597   PDT = nullptr;
598   LI = nullptr;
599   Predecessors.clear();
600   Successors.clear();
601   CoverageTracker.clear();
602 }
603 
604 #ifndef NDEBUG
605 /// Print the weight of edge \p E on stream \p OS.
606 ///
607 /// \param OS  Stream to emit the output to.
608 /// \param E  Edge to print.
609 void SampleProfileLoader::printEdgeWeight(raw_ostream &OS, Edge E) {
610   OS << "weight[" << E.first->getName() << "->" << E.second->getName()
611      << "]: " << EdgeWeights[E] << "\n";
612 }
613 
614 /// Print the equivalence class of block \p BB on stream \p OS.
615 ///
616 /// \param OS  Stream to emit the output to.
617 /// \param BB  Block to print.
618 void SampleProfileLoader::printBlockEquivalence(raw_ostream &OS,
619                                                 const BasicBlock *BB) {
620   const BasicBlock *Equiv = EquivalenceClass[BB];
621   OS << "equivalence[" << BB->getName()
622      << "]: " << ((Equiv) ? EquivalenceClass[BB]->getName() : "NONE") << "\n";
623 }
624 
625 /// Print the weight of block \p BB on stream \p OS.
626 ///
627 /// \param OS  Stream to emit the output to.
628 /// \param BB  Block to print.
629 void SampleProfileLoader::printBlockWeight(raw_ostream &OS,
630                                            const BasicBlock *BB) const {
631   const auto &I = BlockWeights.find(BB);
632   uint64_t W = (I == BlockWeights.end() ? 0 : I->second);
633   OS << "weight[" << BB->getName() << "]: " << W << "\n";
634 }
635 #endif
636 
637 /// Get the weight for an instruction.
638 ///
639 /// The "weight" of an instruction \p Inst is the number of samples
640 /// collected on that instruction at runtime. To retrieve it, we
641 /// need to compute the line number of \p Inst relative to the start of its
642 /// function. We use HeaderLineno to compute the offset. We then
643 /// look up the samples collected for \p Inst using BodySamples.
644 ///
645 /// \param Inst Instruction to query.
646 ///
647 /// \returns the weight of \p Inst.
648 ErrorOr<uint64_t> SampleProfileLoader::getInstWeight(const Instruction &Inst) {
649   const DebugLoc &DLoc = Inst.getDebugLoc();
650   if (!DLoc)
651     return std::error_code();
652 
653   const FunctionSamples *FS = findFunctionSamples(Inst);
654   if (!FS)
655     return std::error_code();
656 
657   // Ignore all intrinsics, phinodes and branch instructions.
658   // Branch and phinodes instruction usually contains debug info from sources outside of
659   // the residing basic block, thus we ignore them during annotation.
660   if (isa<BranchInst>(Inst) || isa<IntrinsicInst>(Inst) || isa<PHINode>(Inst))
661     return std::error_code();
662 
663   // If a direct call/invoke instruction is inlined in profile
664   // (findCalleeFunctionSamples returns non-empty result), but not inlined here,
665   // it means that the inlined callsite has no sample, thus the call
666   // instruction should have 0 count.
667   if ((isa<CallInst>(Inst) || isa<InvokeInst>(Inst)) &&
668       !ImmutableCallSite(&Inst).isIndirectCall() &&
669       findCalleeFunctionSamples(Inst))
670     return 0;
671 
672   const DILocation *DIL = DLoc;
673   uint32_t LineOffset = FunctionSamples::getOffset(DIL);
674   uint32_t Discriminator = DIL->getBaseDiscriminator();
675   ErrorOr<uint64_t> R = FS->findSamplesAt(LineOffset, Discriminator);
676   if (R) {
677     bool FirstMark =
678         CoverageTracker.markSamplesUsed(FS, LineOffset, Discriminator, R.get());
679     if (FirstMark) {
680       ORE->emit([&]() {
681         OptimizationRemarkAnalysis Remark(DEBUG_TYPE, "AppliedSamples", &Inst);
682         Remark << "Applied " << ore::NV("NumSamples", *R);
683         Remark << " samples from profile (offset: ";
684         Remark << ore::NV("LineOffset", LineOffset);
685         if (Discriminator) {
686           Remark << ".";
687           Remark << ore::NV("Discriminator", Discriminator);
688         }
689         Remark << ")";
690         return Remark;
691       });
692     }
693     LLVM_DEBUG(dbgs() << "    " << DLoc.getLine() << "."
694                       << DIL->getBaseDiscriminator() << ":" << Inst
695                       << " (line offset: " << LineOffset << "."
696                       << DIL->getBaseDiscriminator() << " - weight: " << R.get()
697                       << ")\n");
698   }
699   return R;
700 }
701 
702 /// Compute the weight of a basic block.
703 ///
704 /// The weight of basic block \p BB is the maximum weight of all the
705 /// instructions in BB.
706 ///
707 /// \param BB The basic block to query.
708 ///
709 /// \returns the weight for \p BB.
710 ErrorOr<uint64_t> SampleProfileLoader::getBlockWeight(const BasicBlock *BB) {
711   uint64_t Max = 0;
712   bool HasWeight = false;
713   for (auto &I : BB->getInstList()) {
714     const ErrorOr<uint64_t> &R = getInstWeight(I);
715     if (R) {
716       Max = std::max(Max, R.get());
717       HasWeight = true;
718     }
719   }
720   return HasWeight ? ErrorOr<uint64_t>(Max) : std::error_code();
721 }
722 
723 /// Compute and store the weights of every basic block.
724 ///
725 /// This populates the BlockWeights map by computing
726 /// the weights of every basic block in the CFG.
727 ///
728 /// \param F The function to query.
729 bool SampleProfileLoader::computeBlockWeights(Function &F) {
730   bool Changed = false;
731   LLVM_DEBUG(dbgs() << "Block weights\n");
732   for (const auto &BB : F) {
733     ErrorOr<uint64_t> Weight = getBlockWeight(&BB);
734     if (Weight) {
735       BlockWeights[&BB] = Weight.get();
736       VisitedBlocks.insert(&BB);
737       Changed = true;
738     }
739     LLVM_DEBUG(printBlockWeight(dbgs(), &BB));
740   }
741 
742   return Changed;
743 }
744 
745 /// Get the FunctionSamples for a call instruction.
746 ///
747 /// The FunctionSamples of a call/invoke instruction \p Inst is the inlined
748 /// instance in which that call instruction is calling to. It contains
749 /// all samples that resides in the inlined instance. We first find the
750 /// inlined instance in which the call instruction is from, then we
751 /// traverse its children to find the callsite with the matching
752 /// location.
753 ///
754 /// \param Inst Call/Invoke instruction to query.
755 ///
756 /// \returns The FunctionSamples pointer to the inlined instance.
757 const FunctionSamples *
758 SampleProfileLoader::findCalleeFunctionSamples(const Instruction &Inst) const {
759   const DILocation *DIL = Inst.getDebugLoc();
760   if (!DIL) {
761     return nullptr;
762   }
763 
764   StringRef CalleeName;
765   if (const CallInst *CI = dyn_cast<CallInst>(&Inst))
766     if (Function *Callee = CI->getCalledFunction())
767       CalleeName = Callee->getName();
768 
769   const FunctionSamples *FS = findFunctionSamples(Inst);
770   if (FS == nullptr)
771     return nullptr;
772 
773   return FS->findFunctionSamplesAt(LineLocation(FunctionSamples::getOffset(DIL),
774                                                 DIL->getBaseDiscriminator()),
775                                    CalleeName);
776 }
777 
778 /// Returns a vector of FunctionSamples that are the indirect call targets
779 /// of \p Inst. The vector is sorted by the total number of samples. Stores
780 /// the total call count of the indirect call in \p Sum.
781 std::vector<const FunctionSamples *>
782 SampleProfileLoader::findIndirectCallFunctionSamples(
783     const Instruction &Inst, uint64_t &Sum) const {
784   const DILocation *DIL = Inst.getDebugLoc();
785   std::vector<const FunctionSamples *> R;
786 
787   if (!DIL) {
788     return R;
789   }
790 
791   const FunctionSamples *FS = findFunctionSamples(Inst);
792   if (FS == nullptr)
793     return R;
794 
795   uint32_t LineOffset = FunctionSamples::getOffset(DIL);
796   uint32_t Discriminator = DIL->getBaseDiscriminator();
797 
798   auto T = FS->findCallTargetMapAt(LineOffset, Discriminator);
799   Sum = 0;
800   if (T)
801     for (const auto &T_C : T.get())
802       Sum += T_C.second;
803   if (const FunctionSamplesMap *M = FS->findFunctionSamplesMapAt(LineLocation(
804           FunctionSamples::getOffset(DIL), DIL->getBaseDiscriminator()))) {
805     if (M->empty())
806       return R;
807     for (const auto &NameFS : *M) {
808       Sum += NameFS.second.getEntrySamples();
809       R.push_back(&NameFS.second);
810     }
811     llvm::sort(R, [](const FunctionSamples *L, const FunctionSamples *R) {
812       if (L->getEntrySamples() != R->getEntrySamples())
813         return L->getEntrySamples() > R->getEntrySamples();
814       return FunctionSamples::getGUID(L->getName()) <
815              FunctionSamples::getGUID(R->getName());
816     });
817   }
818   return R;
819 }
820 
821 /// Get the FunctionSamples for an instruction.
822 ///
823 /// The FunctionSamples of an instruction \p Inst is the inlined instance
824 /// in which that instruction is coming from. We traverse the inline stack
825 /// of that instruction, and match it with the tree nodes in the profile.
826 ///
827 /// \param Inst Instruction to query.
828 ///
829 /// \returns the FunctionSamples pointer to the inlined instance.
830 const FunctionSamples *
831 SampleProfileLoader::findFunctionSamples(const Instruction &Inst) const {
832   const DILocation *DIL = Inst.getDebugLoc();
833   if (!DIL)
834     return Samples;
835 
836   auto it = DILocation2SampleMap.try_emplace(DIL,nullptr);
837   if (it.second)
838     it.first->second = Samples->findFunctionSamples(DIL);
839   return it.first->second;
840 }
841 
842 bool SampleProfileLoader::inlineCallInstruction(Instruction *I) {
843   assert(isa<CallInst>(I) || isa<InvokeInst>(I));
844   CallSite CS(I);
845   Function *CalledFunction = CS.getCalledFunction();
846   assert(CalledFunction);
847   DebugLoc DLoc = I->getDebugLoc();
848   BasicBlock *BB = I->getParent();
849   InlineParams Params = getInlineParams();
850   Params.ComputeFullInlineCost = true;
851   // Checks if there is anything in the reachable portion of the callee at
852   // this callsite that makes this inlining potentially illegal. Need to
853   // set ComputeFullInlineCost, otherwise getInlineCost may return early
854   // when cost exceeds threshold without checking all IRs in the callee.
855   // The acutal cost does not matter because we only checks isNever() to
856   // see if it is legal to inline the callsite.
857   InlineCost Cost =
858       getInlineCost(cast<CallBase>(*I), Params, GetTTI(*CalledFunction), GetAC,
859                     None, nullptr, nullptr);
860   if (Cost.isNever()) {
861     ORE->emit(OptimizationRemark(DEBUG_TYPE, "Not inline", DLoc, BB)
862               << "incompatible inlining");
863     return false;
864   }
865   InlineFunctionInfo IFI(nullptr, &GetAC);
866   if (InlineFunction(CS, IFI)) {
867     // The call to InlineFunction erases I, so we can't pass it here.
868     ORE->emit(OptimizationRemark(DEBUG_TYPE, "HotInline", DLoc, BB)
869               << "inlined hot callee '" << ore::NV("Callee", CalledFunction)
870               << "' into '" << ore::NV("Caller", BB->getParent()) << "'");
871     return true;
872   }
873   return false;
874 }
875 
876 /// Iteratively inline hot callsites of a function.
877 ///
878 /// Iteratively traverse all callsites of the function \p F, and find if
879 /// the corresponding inlined instance exists and is hot in profile. If
880 /// it is hot enough, inline the callsites and adds new callsites of the
881 /// callee into the caller. If the call is an indirect call, first promote
882 /// it to direct call. Each indirect call is limited with a single target.
883 ///
884 /// \param F function to perform iterative inlining.
885 /// \param InlinedGUIDs a set to be updated to include all GUIDs that are
886 ///     inlined in the profiled binary.
887 ///
888 /// \returns True if there is any inline happened.
889 bool SampleProfileLoader::inlineHotFunctions(
890     Function &F, DenseSet<GlobalValue::GUID> &InlinedGUIDs) {
891   DenseSet<Instruction *> PromotedInsns;
892 
893   DenseMap<Instruction *, const FunctionSamples *> localNotInlinedCallSites;
894   bool Changed = false;
895   while (true) {
896     bool LocalChanged = false;
897     SmallVector<Instruction *, 10> CIS;
898     for (auto &BB : F) {
899       bool Hot = false;
900       SmallVector<Instruction *, 10> Candidates;
901       for (auto &I : BB.getInstList()) {
902         const FunctionSamples *FS = nullptr;
903         if ((isa<CallInst>(I) || isa<InvokeInst>(I)) &&
904             !isa<IntrinsicInst>(I) && (FS = findCalleeFunctionSamples(I))) {
905           Candidates.push_back(&I);
906           if (FS->getEntrySamples() > 0)
907             localNotInlinedCallSites.try_emplace(&I, FS);
908           if (callsiteIsHot(FS, PSI))
909             Hot = true;
910         }
911       }
912       if (Hot) {
913         CIS.insert(CIS.begin(), Candidates.begin(), Candidates.end());
914       }
915     }
916     for (auto I : CIS) {
917       Function *CalledFunction = CallSite(I).getCalledFunction();
918       // Do not inline recursive calls.
919       if (CalledFunction == &F)
920         continue;
921       if (CallSite(I).isIndirectCall()) {
922         if (PromotedInsns.count(I))
923           continue;
924         uint64_t Sum;
925         for (const auto *FS : findIndirectCallFunctionSamples(*I, Sum)) {
926           if (IsThinLTOPreLink) {
927             FS->findInlinedFunctions(InlinedGUIDs, F.getParent(),
928                                      PSI->getOrCompHotCountThreshold());
929             continue;
930           }
931           auto CalleeFunctionName = FS->getFuncNameInModule(F.getParent());
932           // If it is a recursive call, we do not inline it as it could bloat
933           // the code exponentially. There is way to better handle this, e.g.
934           // clone the caller first, and inline the cloned caller if it is
935           // recursive. As llvm does not inline recursive calls, we will
936           // simply ignore it instead of handling it explicitly.
937           if (CalleeFunctionName == F.getName())
938             continue;
939 
940           if (!callsiteIsHot(FS, PSI))
941             continue;
942 
943           const char *Reason = "Callee function not available";
944           auto R = SymbolMap.find(CalleeFunctionName);
945           if (R != SymbolMap.end() && R->getValue() &&
946               !R->getValue()->isDeclaration() &&
947               R->getValue()->getSubprogram() &&
948               isLegalToPromote(CallSite(I), R->getValue(), &Reason)) {
949             uint64_t C = FS->getEntrySamples();
950             Instruction *DI =
951                 pgo::promoteIndirectCall(I, R->getValue(), C, Sum, false, ORE);
952             Sum -= C;
953             PromotedInsns.insert(I);
954             // If profile mismatches, we should not attempt to inline DI.
955             if ((isa<CallInst>(DI) || isa<InvokeInst>(DI)) &&
956                 inlineCallInstruction(DI)) {
957               localNotInlinedCallSites.erase(I);
958               LocalChanged = true;
959             }
960           } else {
961             LLVM_DEBUG(dbgs()
962                        << "\nFailed to promote indirect call to "
963                        << CalleeFunctionName << " because " << Reason << "\n");
964           }
965         }
966       } else if (CalledFunction && CalledFunction->getSubprogram() &&
967                  !CalledFunction->isDeclaration()) {
968         if (inlineCallInstruction(I)) {
969           localNotInlinedCallSites.erase(I);
970           LocalChanged = true;
971         }
972       } else if (IsThinLTOPreLink) {
973         findCalleeFunctionSamples(*I)->findInlinedFunctions(
974             InlinedGUIDs, F.getParent(), PSI->getOrCompHotCountThreshold());
975       }
976     }
977     if (LocalChanged) {
978       Changed = true;
979     } else {
980       break;
981     }
982   }
983 
984   // Accumulate not inlined callsite information into notInlinedSamples
985   for (const auto &Pair : localNotInlinedCallSites) {
986     Instruction *I = Pair.getFirst();
987     Function *Callee = CallSite(I).getCalledFunction();
988     if (!Callee || Callee->isDeclaration())
989       continue;
990     const FunctionSamples *FS = Pair.getSecond();
991     auto pair =
992         notInlinedCallInfo.try_emplace(Callee, NotInlinedProfileInfo{0});
993     pair.first->second.entryCount += FS->getEntrySamples();
994   }
995   return Changed;
996 }
997 
998 /// Find equivalence classes for the given block.
999 ///
1000 /// This finds all the blocks that are guaranteed to execute the same
1001 /// number of times as \p BB1. To do this, it traverses all the
1002 /// descendants of \p BB1 in the dominator or post-dominator tree.
1003 ///
1004 /// A block BB2 will be in the same equivalence class as \p BB1 if
1005 /// the following holds:
1006 ///
1007 /// 1- \p BB1 is a descendant of BB2 in the opposite tree. So, if BB2
1008 ///    is a descendant of \p BB1 in the dominator tree, then BB2 should
1009 ///    dominate BB1 in the post-dominator tree.
1010 ///
1011 /// 2- Both BB2 and \p BB1 must be in the same loop.
1012 ///
1013 /// For every block BB2 that meets those two requirements, we set BB2's
1014 /// equivalence class to \p BB1.
1015 ///
1016 /// \param BB1  Block to check.
1017 /// \param Descendants  Descendants of \p BB1 in either the dom or pdom tree.
1018 /// \param DomTree  Opposite dominator tree. If \p Descendants is filled
1019 ///                 with blocks from \p BB1's dominator tree, then
1020 ///                 this is the post-dominator tree, and vice versa.
1021 template <bool IsPostDom>
1022 void SampleProfileLoader::findEquivalencesFor(
1023     BasicBlock *BB1, ArrayRef<BasicBlock *> Descendants,
1024     DominatorTreeBase<BasicBlock, IsPostDom> *DomTree) {
1025   const BasicBlock *EC = EquivalenceClass[BB1];
1026   uint64_t Weight = BlockWeights[EC];
1027   for (const auto *BB2 : Descendants) {
1028     bool IsDomParent = DomTree->dominates(BB2, BB1);
1029     bool IsInSameLoop = LI->getLoopFor(BB1) == LI->getLoopFor(BB2);
1030     if (BB1 != BB2 && IsDomParent && IsInSameLoop) {
1031       EquivalenceClass[BB2] = EC;
1032       // If BB2 is visited, then the entire EC should be marked as visited.
1033       if (VisitedBlocks.count(BB2)) {
1034         VisitedBlocks.insert(EC);
1035       }
1036 
1037       // If BB2 is heavier than BB1, make BB2 have the same weight
1038       // as BB1.
1039       //
1040       // Note that we don't worry about the opposite situation here
1041       // (when BB2 is lighter than BB1). We will deal with this
1042       // during the propagation phase. Right now, we just want to
1043       // make sure that BB1 has the largest weight of all the
1044       // members of its equivalence set.
1045       Weight = std::max(Weight, BlockWeights[BB2]);
1046     }
1047   }
1048   if (EC == &EC->getParent()->getEntryBlock()) {
1049     BlockWeights[EC] = Samples->getHeadSamples() + 1;
1050   } else {
1051     BlockWeights[EC] = Weight;
1052   }
1053 }
1054 
1055 /// Find equivalence classes.
1056 ///
1057 /// Since samples may be missing from blocks, we can fill in the gaps by setting
1058 /// the weights of all the blocks in the same equivalence class to the same
1059 /// weight. To compute the concept of equivalence, we use dominance and loop
1060 /// information. Two blocks B1 and B2 are in the same equivalence class if B1
1061 /// dominates B2, B2 post-dominates B1 and both are in the same loop.
1062 ///
1063 /// \param F The function to query.
1064 void SampleProfileLoader::findEquivalenceClasses(Function &F) {
1065   SmallVector<BasicBlock *, 8> DominatedBBs;
1066   LLVM_DEBUG(dbgs() << "\nBlock equivalence classes\n");
1067   // Find equivalence sets based on dominance and post-dominance information.
1068   for (auto &BB : F) {
1069     BasicBlock *BB1 = &BB;
1070 
1071     // Compute BB1's equivalence class once.
1072     if (EquivalenceClass.count(BB1)) {
1073       LLVM_DEBUG(printBlockEquivalence(dbgs(), BB1));
1074       continue;
1075     }
1076 
1077     // By default, blocks are in their own equivalence class.
1078     EquivalenceClass[BB1] = BB1;
1079 
1080     // Traverse all the blocks dominated by BB1. We are looking for
1081     // every basic block BB2 such that:
1082     //
1083     // 1- BB1 dominates BB2.
1084     // 2- BB2 post-dominates BB1.
1085     // 3- BB1 and BB2 are in the same loop nest.
1086     //
1087     // If all those conditions hold, it means that BB2 is executed
1088     // as many times as BB1, so they are placed in the same equivalence
1089     // class by making BB2's equivalence class be BB1.
1090     DominatedBBs.clear();
1091     DT->getDescendants(BB1, DominatedBBs);
1092     findEquivalencesFor(BB1, DominatedBBs, PDT.get());
1093 
1094     LLVM_DEBUG(printBlockEquivalence(dbgs(), BB1));
1095   }
1096 
1097   // Assign weights to equivalence classes.
1098   //
1099   // All the basic blocks in the same equivalence class will execute
1100   // the same number of times. Since we know that the head block in
1101   // each equivalence class has the largest weight, assign that weight
1102   // to all the blocks in that equivalence class.
1103   LLVM_DEBUG(
1104       dbgs() << "\nAssign the same weight to all blocks in the same class\n");
1105   for (auto &BI : F) {
1106     const BasicBlock *BB = &BI;
1107     const BasicBlock *EquivBB = EquivalenceClass[BB];
1108     if (BB != EquivBB)
1109       BlockWeights[BB] = BlockWeights[EquivBB];
1110     LLVM_DEBUG(printBlockWeight(dbgs(), BB));
1111   }
1112 }
1113 
1114 /// Visit the given edge to decide if it has a valid weight.
1115 ///
1116 /// If \p E has not been visited before, we copy to \p UnknownEdge
1117 /// and increment the count of unknown edges.
1118 ///
1119 /// \param E  Edge to visit.
1120 /// \param NumUnknownEdges  Current number of unknown edges.
1121 /// \param UnknownEdge  Set if E has not been visited before.
1122 ///
1123 /// \returns E's weight, if known. Otherwise, return 0.
1124 uint64_t SampleProfileLoader::visitEdge(Edge E, unsigned *NumUnknownEdges,
1125                                         Edge *UnknownEdge) {
1126   if (!VisitedEdges.count(E)) {
1127     (*NumUnknownEdges)++;
1128     *UnknownEdge = E;
1129     return 0;
1130   }
1131 
1132   return EdgeWeights[E];
1133 }
1134 
1135 /// Propagate weights through incoming/outgoing edges.
1136 ///
1137 /// If the weight of a basic block is known, and there is only one edge
1138 /// with an unknown weight, we can calculate the weight of that edge.
1139 ///
1140 /// Similarly, if all the edges have a known count, we can calculate the
1141 /// count of the basic block, if needed.
1142 ///
1143 /// \param F  Function to process.
1144 /// \param UpdateBlockCount  Whether we should update basic block counts that
1145 ///                          has already been annotated.
1146 ///
1147 /// \returns  True if new weights were assigned to edges or blocks.
1148 bool SampleProfileLoader::propagateThroughEdges(Function &F,
1149                                                 bool UpdateBlockCount) {
1150   bool Changed = false;
1151   LLVM_DEBUG(dbgs() << "\nPropagation through edges\n");
1152   for (const auto &BI : F) {
1153     const BasicBlock *BB = &BI;
1154     const BasicBlock *EC = EquivalenceClass[BB];
1155 
1156     // Visit all the predecessor and successor edges to determine
1157     // which ones have a weight assigned already. Note that it doesn't
1158     // matter that we only keep track of a single unknown edge. The
1159     // only case we are interested in handling is when only a single
1160     // edge is unknown (see setEdgeOrBlockWeight).
1161     for (unsigned i = 0; i < 2; i++) {
1162       uint64_t TotalWeight = 0;
1163       unsigned NumUnknownEdges = 0, NumTotalEdges = 0;
1164       Edge UnknownEdge, SelfReferentialEdge, SingleEdge;
1165 
1166       if (i == 0) {
1167         // First, visit all predecessor edges.
1168         NumTotalEdges = Predecessors[BB].size();
1169         for (auto *Pred : Predecessors[BB]) {
1170           Edge E = std::make_pair(Pred, BB);
1171           TotalWeight += visitEdge(E, &NumUnknownEdges, &UnknownEdge);
1172           if (E.first == E.second)
1173             SelfReferentialEdge = E;
1174         }
1175         if (NumTotalEdges == 1) {
1176           SingleEdge = std::make_pair(Predecessors[BB][0], BB);
1177         }
1178       } else {
1179         // On the second round, visit all successor edges.
1180         NumTotalEdges = Successors[BB].size();
1181         for (auto *Succ : Successors[BB]) {
1182           Edge E = std::make_pair(BB, Succ);
1183           TotalWeight += visitEdge(E, &NumUnknownEdges, &UnknownEdge);
1184         }
1185         if (NumTotalEdges == 1) {
1186           SingleEdge = std::make_pair(BB, Successors[BB][0]);
1187         }
1188       }
1189 
1190       // After visiting all the edges, there are three cases that we
1191       // can handle immediately:
1192       //
1193       // - All the edge weights are known (i.e., NumUnknownEdges == 0).
1194       //   In this case, we simply check that the sum of all the edges
1195       //   is the same as BB's weight. If not, we change BB's weight
1196       //   to match. Additionally, if BB had not been visited before,
1197       //   we mark it visited.
1198       //
1199       // - Only one edge is unknown and BB has already been visited.
1200       //   In this case, we can compute the weight of the edge by
1201       //   subtracting the total block weight from all the known
1202       //   edge weights. If the edges weight more than BB, then the
1203       //   edge of the last remaining edge is set to zero.
1204       //
1205       // - There exists a self-referential edge and the weight of BB is
1206       //   known. In this case, this edge can be based on BB's weight.
1207       //   We add up all the other known edges and set the weight on
1208       //   the self-referential edge as we did in the previous case.
1209       //
1210       // In any other case, we must continue iterating. Eventually,
1211       // all edges will get a weight, or iteration will stop when
1212       // it reaches SampleProfileMaxPropagateIterations.
1213       if (NumUnknownEdges <= 1) {
1214         uint64_t &BBWeight = BlockWeights[EC];
1215         if (NumUnknownEdges == 0) {
1216           if (!VisitedBlocks.count(EC)) {
1217             // If we already know the weight of all edges, the weight of the
1218             // basic block can be computed. It should be no larger than the sum
1219             // of all edge weights.
1220             if (TotalWeight > BBWeight) {
1221               BBWeight = TotalWeight;
1222               Changed = true;
1223               LLVM_DEBUG(dbgs() << "All edge weights for " << BB->getName()
1224                                 << " known. Set weight for block: ";
1225                          printBlockWeight(dbgs(), BB););
1226             }
1227           } else if (NumTotalEdges == 1 &&
1228                      EdgeWeights[SingleEdge] < BlockWeights[EC]) {
1229             // If there is only one edge for the visited basic block, use the
1230             // block weight to adjust edge weight if edge weight is smaller.
1231             EdgeWeights[SingleEdge] = BlockWeights[EC];
1232             Changed = true;
1233           }
1234         } else if (NumUnknownEdges == 1 && VisitedBlocks.count(EC)) {
1235           // If there is a single unknown edge and the block has been
1236           // visited, then we can compute E's weight.
1237           if (BBWeight >= TotalWeight)
1238             EdgeWeights[UnknownEdge] = BBWeight - TotalWeight;
1239           else
1240             EdgeWeights[UnknownEdge] = 0;
1241           const BasicBlock *OtherEC;
1242           if (i == 0)
1243             OtherEC = EquivalenceClass[UnknownEdge.first];
1244           else
1245             OtherEC = EquivalenceClass[UnknownEdge.second];
1246           // Edge weights should never exceed the BB weights it connects.
1247           if (VisitedBlocks.count(OtherEC) &&
1248               EdgeWeights[UnknownEdge] > BlockWeights[OtherEC])
1249             EdgeWeights[UnknownEdge] = BlockWeights[OtherEC];
1250           VisitedEdges.insert(UnknownEdge);
1251           Changed = true;
1252           LLVM_DEBUG(dbgs() << "Set weight for edge: ";
1253                      printEdgeWeight(dbgs(), UnknownEdge));
1254         }
1255       } else if (VisitedBlocks.count(EC) && BlockWeights[EC] == 0) {
1256         // If a block Weights 0, all its in/out edges should weight 0.
1257         if (i == 0) {
1258           for (auto *Pred : Predecessors[BB]) {
1259             Edge E = std::make_pair(Pred, BB);
1260             EdgeWeights[E] = 0;
1261             VisitedEdges.insert(E);
1262           }
1263         } else {
1264           for (auto *Succ : Successors[BB]) {
1265             Edge E = std::make_pair(BB, Succ);
1266             EdgeWeights[E] = 0;
1267             VisitedEdges.insert(E);
1268           }
1269         }
1270       } else if (SelfReferentialEdge.first && VisitedBlocks.count(EC)) {
1271         uint64_t &BBWeight = BlockWeights[BB];
1272         // We have a self-referential edge and the weight of BB is known.
1273         if (BBWeight >= TotalWeight)
1274           EdgeWeights[SelfReferentialEdge] = BBWeight - TotalWeight;
1275         else
1276           EdgeWeights[SelfReferentialEdge] = 0;
1277         VisitedEdges.insert(SelfReferentialEdge);
1278         Changed = true;
1279         LLVM_DEBUG(dbgs() << "Set self-referential edge weight to: ";
1280                    printEdgeWeight(dbgs(), SelfReferentialEdge));
1281       }
1282       if (UpdateBlockCount && !VisitedBlocks.count(EC) && TotalWeight > 0) {
1283         BlockWeights[EC] = TotalWeight;
1284         VisitedBlocks.insert(EC);
1285         Changed = true;
1286       }
1287     }
1288   }
1289 
1290   return Changed;
1291 }
1292 
1293 /// Build in/out edge lists for each basic block in the CFG.
1294 ///
1295 /// We are interested in unique edges. If a block B1 has multiple
1296 /// edges to another block B2, we only add a single B1->B2 edge.
1297 void SampleProfileLoader::buildEdges(Function &F) {
1298   for (auto &BI : F) {
1299     BasicBlock *B1 = &BI;
1300 
1301     // Add predecessors for B1.
1302     SmallPtrSet<BasicBlock *, 16> Visited;
1303     if (!Predecessors[B1].empty())
1304       llvm_unreachable("Found a stale predecessors list in a basic block.");
1305     for (pred_iterator PI = pred_begin(B1), PE = pred_end(B1); PI != PE; ++PI) {
1306       BasicBlock *B2 = *PI;
1307       if (Visited.insert(B2).second)
1308         Predecessors[B1].push_back(B2);
1309     }
1310 
1311     // Add successors for B1.
1312     Visited.clear();
1313     if (!Successors[B1].empty())
1314       llvm_unreachable("Found a stale successors list in a basic block.");
1315     for (succ_iterator SI = succ_begin(B1), SE = succ_end(B1); SI != SE; ++SI) {
1316       BasicBlock *B2 = *SI;
1317       if (Visited.insert(B2).second)
1318         Successors[B1].push_back(B2);
1319     }
1320   }
1321 }
1322 
1323 /// Returns the sorted CallTargetMap \p M by count in descending order.
1324 static SmallVector<InstrProfValueData, 2> GetSortedValueDataFromCallTargets(
1325     const SampleRecord::CallTargetMap & M) {
1326   SmallVector<InstrProfValueData, 2> R;
1327   for (const auto &I : SampleRecord::SortCallTargets(M)) {
1328     R.emplace_back(InstrProfValueData{FunctionSamples::getGUID(I.first), I.second});
1329   }
1330   return R;
1331 }
1332 
1333 /// Propagate weights into edges
1334 ///
1335 /// The following rules are applied to every block BB in the CFG:
1336 ///
1337 /// - If BB has a single predecessor/successor, then the weight
1338 ///   of that edge is the weight of the block.
1339 ///
1340 /// - If all incoming or outgoing edges are known except one, and the
1341 ///   weight of the block is already known, the weight of the unknown
1342 ///   edge will be the weight of the block minus the sum of all the known
1343 ///   edges. If the sum of all the known edges is larger than BB's weight,
1344 ///   we set the unknown edge weight to zero.
1345 ///
1346 /// - If there is a self-referential edge, and the weight of the block is
1347 ///   known, the weight for that edge is set to the weight of the block
1348 ///   minus the weight of the other incoming edges to that block (if
1349 ///   known).
1350 void SampleProfileLoader::propagateWeights(Function &F) {
1351   bool Changed = true;
1352   unsigned I = 0;
1353 
1354   // If BB weight is larger than its corresponding loop's header BB weight,
1355   // use the BB weight to replace the loop header BB weight.
1356   for (auto &BI : F) {
1357     BasicBlock *BB = &BI;
1358     Loop *L = LI->getLoopFor(BB);
1359     if (!L) {
1360       continue;
1361     }
1362     BasicBlock *Header = L->getHeader();
1363     if (Header && BlockWeights[BB] > BlockWeights[Header]) {
1364       BlockWeights[Header] = BlockWeights[BB];
1365     }
1366   }
1367 
1368   // Before propagation starts, build, for each block, a list of
1369   // unique predecessors and successors. This is necessary to handle
1370   // identical edges in multiway branches. Since we visit all blocks and all
1371   // edges of the CFG, it is cleaner to build these lists once at the start
1372   // of the pass.
1373   buildEdges(F);
1374 
1375   // Propagate until we converge or we go past the iteration limit.
1376   while (Changed && I++ < SampleProfileMaxPropagateIterations) {
1377     Changed = propagateThroughEdges(F, false);
1378   }
1379 
1380   // The first propagation propagates BB counts from annotated BBs to unknown
1381   // BBs. The 2nd propagation pass resets edges weights, and use all BB weights
1382   // to propagate edge weights.
1383   VisitedEdges.clear();
1384   Changed = true;
1385   while (Changed && I++ < SampleProfileMaxPropagateIterations) {
1386     Changed = propagateThroughEdges(F, false);
1387   }
1388 
1389   // The 3rd propagation pass allows adjust annotated BB weights that are
1390   // obviously wrong.
1391   Changed = true;
1392   while (Changed && I++ < SampleProfileMaxPropagateIterations) {
1393     Changed = propagateThroughEdges(F, true);
1394   }
1395 
1396   // Generate MD_prof metadata for every branch instruction using the
1397   // edge weights computed during propagation.
1398   LLVM_DEBUG(dbgs() << "\nPropagation complete. Setting branch weights\n");
1399   LLVMContext &Ctx = F.getContext();
1400   MDBuilder MDB(Ctx);
1401   for (auto &BI : F) {
1402     BasicBlock *BB = &BI;
1403 
1404     if (BlockWeights[BB]) {
1405       for (auto &I : BB->getInstList()) {
1406         if (!isa<CallInst>(I) && !isa<InvokeInst>(I))
1407           continue;
1408         CallSite CS(&I);
1409         if (!CS.getCalledFunction()) {
1410           const DebugLoc &DLoc = I.getDebugLoc();
1411           if (!DLoc)
1412             continue;
1413           const DILocation *DIL = DLoc;
1414           uint32_t LineOffset = FunctionSamples::getOffset(DIL);
1415           uint32_t Discriminator = DIL->getBaseDiscriminator();
1416 
1417           const FunctionSamples *FS = findFunctionSamples(I);
1418           if (!FS)
1419             continue;
1420           auto T = FS->findCallTargetMapAt(LineOffset, Discriminator);
1421           if (!T || T.get().empty())
1422             continue;
1423           SmallVector<InstrProfValueData, 2> SortedCallTargets =
1424               GetSortedValueDataFromCallTargets(T.get());
1425           uint64_t Sum;
1426           findIndirectCallFunctionSamples(I, Sum);
1427           annotateValueSite(*I.getParent()->getParent()->getParent(), I,
1428                             SortedCallTargets, Sum, IPVK_IndirectCallTarget,
1429                             SortedCallTargets.size());
1430         } else if (!isa<IntrinsicInst>(&I)) {
1431           I.setMetadata(LLVMContext::MD_prof,
1432                         MDB.createBranchWeights(
1433                             {static_cast<uint32_t>(BlockWeights[BB])}));
1434         }
1435       }
1436     }
1437     Instruction *TI = BB->getTerminator();
1438     if (TI->getNumSuccessors() == 1)
1439       continue;
1440     if (!isa<BranchInst>(TI) && !isa<SwitchInst>(TI))
1441       continue;
1442 
1443     DebugLoc BranchLoc = TI->getDebugLoc();
1444     LLVM_DEBUG(dbgs() << "\nGetting weights for branch at line "
1445                       << ((BranchLoc) ? Twine(BranchLoc.getLine())
1446                                       : Twine("<UNKNOWN LOCATION>"))
1447                       << ".\n");
1448     SmallVector<uint32_t, 4> Weights;
1449     uint32_t MaxWeight = 0;
1450     Instruction *MaxDestInst;
1451     for (unsigned I = 0; I < TI->getNumSuccessors(); ++I) {
1452       BasicBlock *Succ = TI->getSuccessor(I);
1453       Edge E = std::make_pair(BB, Succ);
1454       uint64_t Weight = EdgeWeights[E];
1455       LLVM_DEBUG(dbgs() << "\t"; printEdgeWeight(dbgs(), E));
1456       // Use uint32_t saturated arithmetic to adjust the incoming weights,
1457       // if needed. Sample counts in profiles are 64-bit unsigned values,
1458       // but internally branch weights are expressed as 32-bit values.
1459       if (Weight > std::numeric_limits<uint32_t>::max()) {
1460         LLVM_DEBUG(dbgs() << " (saturated due to uint32_t overflow)");
1461         Weight = std::numeric_limits<uint32_t>::max();
1462       }
1463       // Weight is added by one to avoid propagation errors introduced by
1464       // 0 weights.
1465       Weights.push_back(static_cast<uint32_t>(Weight + 1));
1466       if (Weight != 0) {
1467         if (Weight > MaxWeight) {
1468           MaxWeight = Weight;
1469           MaxDestInst = Succ->getFirstNonPHIOrDbgOrLifetime();
1470         }
1471       }
1472     }
1473 
1474     misexpect::verifyMisExpect(TI, Weights, TI->getContext());
1475 
1476     uint64_t TempWeight;
1477     // Only set weights if there is at least one non-zero weight.
1478     // In any other case, let the analyzer set weights.
1479     // Do not set weights if the weights are present. In ThinLTO, the profile
1480     // annotation is done twice. If the first annotation already set the
1481     // weights, the second pass does not need to set it.
1482     if (MaxWeight > 0 && !TI->extractProfTotalWeight(TempWeight)) {
1483       LLVM_DEBUG(dbgs() << "SUCCESS. Found non-zero weights.\n");
1484       TI->setMetadata(LLVMContext::MD_prof,
1485                       MDB.createBranchWeights(Weights));
1486       ORE->emit([&]() {
1487         return OptimizationRemark(DEBUG_TYPE, "PopularDest", MaxDestInst)
1488                << "most popular destination for conditional branches at "
1489                << ore::NV("CondBranchesLoc", BranchLoc);
1490       });
1491     } else {
1492       LLVM_DEBUG(dbgs() << "SKIPPED. All branch weights are zero.\n");
1493     }
1494   }
1495 }
1496 
1497 /// Get the line number for the function header.
1498 ///
1499 /// This looks up function \p F in the current compilation unit and
1500 /// retrieves the line number where the function is defined. This is
1501 /// line 0 for all the samples read from the profile file. Every line
1502 /// number is relative to this line.
1503 ///
1504 /// \param F  Function object to query.
1505 ///
1506 /// \returns the line number where \p F is defined. If it returns 0,
1507 ///          it means that there is no debug information available for \p F.
1508 unsigned SampleProfileLoader::getFunctionLoc(Function &F) {
1509   if (DISubprogram *S = F.getSubprogram())
1510     return S->getLine();
1511 
1512   if (NoWarnSampleUnused)
1513     return 0;
1514 
1515   // If the start of \p F is missing, emit a diagnostic to inform the user
1516   // about the missed opportunity.
1517   F.getContext().diagnose(DiagnosticInfoSampleProfile(
1518       "No debug information found in function " + F.getName() +
1519           ": Function profile not used",
1520       DS_Warning));
1521   return 0;
1522 }
1523 
1524 void SampleProfileLoader::computeDominanceAndLoopInfo(Function &F) {
1525   DT.reset(new DominatorTree);
1526   DT->recalculate(F);
1527 
1528   PDT.reset(new PostDominatorTree(F));
1529 
1530   LI.reset(new LoopInfo);
1531   LI->analyze(*DT);
1532 }
1533 
1534 /// Generate branch weight metadata for all branches in \p F.
1535 ///
1536 /// Branch weights are computed out of instruction samples using a
1537 /// propagation heuristic. Propagation proceeds in 3 phases:
1538 ///
1539 /// 1- Assignment of block weights. All the basic blocks in the function
1540 ///    are initial assigned the same weight as their most frequently
1541 ///    executed instruction.
1542 ///
1543 /// 2- Creation of equivalence classes. Since samples may be missing from
1544 ///    blocks, we can fill in the gaps by setting the weights of all the
1545 ///    blocks in the same equivalence class to the same weight. To compute
1546 ///    the concept of equivalence, we use dominance and loop information.
1547 ///    Two blocks B1 and B2 are in the same equivalence class if B1
1548 ///    dominates B2, B2 post-dominates B1 and both are in the same loop.
1549 ///
1550 /// 3- Propagation of block weights into edges. This uses a simple
1551 ///    propagation heuristic. The following rules are applied to every
1552 ///    block BB in the CFG:
1553 ///
1554 ///    - If BB has a single predecessor/successor, then the weight
1555 ///      of that edge is the weight of the block.
1556 ///
1557 ///    - If all the edges are known except one, and the weight of the
1558 ///      block is already known, the weight of the unknown edge will
1559 ///      be the weight of the block minus the sum of all the known
1560 ///      edges. If the sum of all the known edges is larger than BB's weight,
1561 ///      we set the unknown edge weight to zero.
1562 ///
1563 ///    - If there is a self-referential edge, and the weight of the block is
1564 ///      known, the weight for that edge is set to the weight of the block
1565 ///      minus the weight of the other incoming edges to that block (if
1566 ///      known).
1567 ///
1568 /// Since this propagation is not guaranteed to finalize for every CFG, we
1569 /// only allow it to proceed for a limited number of iterations (controlled
1570 /// by -sample-profile-max-propagate-iterations).
1571 ///
1572 /// FIXME: Try to replace this propagation heuristic with a scheme
1573 /// that is guaranteed to finalize. A work-list approach similar to
1574 /// the standard value propagation algorithm used by SSA-CCP might
1575 /// work here.
1576 ///
1577 /// Once all the branch weights are computed, we emit the MD_prof
1578 /// metadata on BB using the computed values for each of its branches.
1579 ///
1580 /// \param F The function to query.
1581 ///
1582 /// \returns true if \p F was modified. Returns false, otherwise.
1583 bool SampleProfileLoader::emitAnnotations(Function &F) {
1584   bool Changed = false;
1585 
1586   if (getFunctionLoc(F) == 0)
1587     return false;
1588 
1589   LLVM_DEBUG(dbgs() << "Line number for the first instruction in "
1590                     << F.getName() << ": " << getFunctionLoc(F) << "\n");
1591 
1592   DenseSet<GlobalValue::GUID> InlinedGUIDs;
1593   Changed |= inlineHotFunctions(F, InlinedGUIDs);
1594 
1595   // Compute basic block weights.
1596   Changed |= computeBlockWeights(F);
1597 
1598   if (Changed) {
1599     // Add an entry count to the function using the samples gathered at the
1600     // function entry.
1601     // Sets the GUIDs that are inlined in the profiled binary. This is used
1602     // for ThinLink to make correct liveness analysis, and also make the IR
1603     // match the profiled binary before annotation.
1604     F.setEntryCount(
1605         ProfileCount(Samples->getHeadSamples() + 1, Function::PCT_Real),
1606         &InlinedGUIDs);
1607 
1608     // Compute dominance and loop info needed for propagation.
1609     computeDominanceAndLoopInfo(F);
1610 
1611     // Find equivalence classes.
1612     findEquivalenceClasses(F);
1613 
1614     // Propagate weights to all edges.
1615     propagateWeights(F);
1616   }
1617 
1618   // If coverage checking was requested, compute it now.
1619   if (SampleProfileRecordCoverage) {
1620     unsigned Used = CoverageTracker.countUsedRecords(Samples, PSI);
1621     unsigned Total = CoverageTracker.countBodyRecords(Samples, PSI);
1622     unsigned Coverage = CoverageTracker.computeCoverage(Used, Total);
1623     if (Coverage < SampleProfileRecordCoverage) {
1624       F.getContext().diagnose(DiagnosticInfoSampleProfile(
1625           F.getSubprogram()->getFilename(), getFunctionLoc(F),
1626           Twine(Used) + " of " + Twine(Total) + " available profile records (" +
1627               Twine(Coverage) + "%) were applied",
1628           DS_Warning));
1629     }
1630   }
1631 
1632   if (SampleProfileSampleCoverage) {
1633     uint64_t Used = CoverageTracker.getTotalUsedSamples();
1634     uint64_t Total = CoverageTracker.countBodySamples(Samples, PSI);
1635     unsigned Coverage = CoverageTracker.computeCoverage(Used, Total);
1636     if (Coverage < SampleProfileSampleCoverage) {
1637       F.getContext().diagnose(DiagnosticInfoSampleProfile(
1638           F.getSubprogram()->getFilename(), getFunctionLoc(F),
1639           Twine(Used) + " of " + Twine(Total) + " available profile samples (" +
1640               Twine(Coverage) + "%) were applied",
1641           DS_Warning));
1642     }
1643   }
1644   return Changed;
1645 }
1646 
1647 char SampleProfileLoaderLegacyPass::ID = 0;
1648 
1649 INITIALIZE_PASS_BEGIN(SampleProfileLoaderLegacyPass, "sample-profile",
1650                       "Sample Profile loader", false, false)
1651 INITIALIZE_PASS_DEPENDENCY(AssumptionCacheTracker)
1652 INITIALIZE_PASS_DEPENDENCY(TargetTransformInfoWrapperPass)
1653 INITIALIZE_PASS_DEPENDENCY(ProfileSummaryInfoWrapperPass)
1654 INITIALIZE_PASS_END(SampleProfileLoaderLegacyPass, "sample-profile",
1655                     "Sample Profile loader", false, false)
1656 
1657 bool SampleProfileLoader::doInitialization(Module &M) {
1658   auto &Ctx = M.getContext();
1659   auto ReaderOrErr = SampleProfileReader::create(Filename, Ctx);
1660   if (std::error_code EC = ReaderOrErr.getError()) {
1661     std::string Msg = "Could not open profile: " + EC.message();
1662     Ctx.diagnose(DiagnosticInfoSampleProfile(Filename, Msg));
1663     return false;
1664   }
1665   Reader = std::move(ReaderOrErr.get());
1666   Reader->collectFuncsToUse(M);
1667   ProfileIsValid = (Reader->read() == sampleprof_error::success);
1668   PSL = Reader->getProfileSymbolList();
1669 
1670   if (ProfileSampleAccurate) {
1671     NamesInProfile.clear();
1672     if (auto NameTable = Reader->getNameTable())
1673       NamesInProfile.insert(NameTable->begin(), NameTable->end());
1674   }
1675 
1676   if (!RemappingFilename.empty()) {
1677     // Apply profile remappings to the loaded profile data if requested.
1678     // For now, we only support remapping symbols encoded using the Itanium
1679     // C++ ABI's name mangling scheme.
1680     ReaderOrErr = SampleProfileReaderItaniumRemapper::create(
1681         RemappingFilename, Ctx, std::move(Reader));
1682     if (std::error_code EC = ReaderOrErr.getError()) {
1683       std::string Msg = "Could not open profile remapping file: " + EC.message();
1684       Ctx.diagnose(DiagnosticInfoSampleProfile(Filename, Msg));
1685       return false;
1686     }
1687     Reader = std::move(ReaderOrErr.get());
1688     ProfileIsValid = (Reader->read() == sampleprof_error::success);
1689   }
1690   return true;
1691 }
1692 
1693 ModulePass *llvm::createSampleProfileLoaderPass() {
1694   return new SampleProfileLoaderLegacyPass();
1695 }
1696 
1697 ModulePass *llvm::createSampleProfileLoaderPass(StringRef Name) {
1698   return new SampleProfileLoaderLegacyPass(Name);
1699 }
1700 
1701 bool SampleProfileLoader::runOnModule(Module &M, ModuleAnalysisManager *AM,
1702                                       ProfileSummaryInfo *_PSI) {
1703   GUIDToFuncNameMapper Mapper(M, *Reader, GUIDToFuncNameMap);
1704   if (!ProfileIsValid)
1705     return false;
1706 
1707   PSI = _PSI;
1708   if (M.getProfileSummary(/* IsCS */ false) == nullptr)
1709     M.setProfileSummary(Reader->getSummary().getMD(M.getContext()),
1710                         ProfileSummary::PSK_Sample);
1711 
1712   // Compute the total number of samples collected in this profile.
1713   for (const auto &I : Reader->getProfiles())
1714     TotalCollectedSamples += I.second.getTotalSamples();
1715 
1716   // Populate the symbol map.
1717   for (const auto &N_F : M.getValueSymbolTable()) {
1718     StringRef OrigName = N_F.getKey();
1719     Function *F = dyn_cast<Function>(N_F.getValue());
1720     if (F == nullptr)
1721       continue;
1722     SymbolMap[OrigName] = F;
1723     auto pos = OrigName.find('.');
1724     if (pos != StringRef::npos) {
1725       StringRef NewName = OrigName.substr(0, pos);
1726       auto r = SymbolMap.insert(std::make_pair(NewName, F));
1727       // Failiing to insert means there is already an entry in SymbolMap,
1728       // thus there are multiple functions that are mapped to the same
1729       // stripped name. In this case of name conflicting, set the value
1730       // to nullptr to avoid confusion.
1731       if (!r.second)
1732         r.first->second = nullptr;
1733     }
1734   }
1735 
1736   bool retval = false;
1737   for (auto &F : M)
1738     if (!F.isDeclaration()) {
1739       clearFunctionData();
1740       retval |= runOnFunction(F, AM);
1741     }
1742 
1743   // Account for cold calls not inlined....
1744   for (const std::pair<Function *, NotInlinedProfileInfo> &pair :
1745        notInlinedCallInfo)
1746     updateProfileCallee(pair.first, pair.second.entryCount);
1747 
1748   return retval;
1749 }
1750 
1751 bool SampleProfileLoaderLegacyPass::runOnModule(Module &M) {
1752   ACT = &getAnalysis<AssumptionCacheTracker>();
1753   TTIWP = &getAnalysis<TargetTransformInfoWrapperPass>();
1754   ProfileSummaryInfo *PSI =
1755       &getAnalysis<ProfileSummaryInfoWrapperPass>().getPSI();
1756   return SampleLoader.runOnModule(M, nullptr, PSI);
1757 }
1758 
1759 bool SampleProfileLoader::runOnFunction(Function &F, ModuleAnalysisManager *AM) {
1760 
1761   DILocation2SampleMap.clear();
1762   // By default the entry count is initialized to -1, which will be treated
1763   // conservatively by getEntryCount as the same as unknown (None). This is
1764   // to avoid newly added code to be treated as cold. If we have samples
1765   // this will be overwritten in emitAnnotations.
1766   uint64_t initialEntryCount = -1;
1767 
1768   ProfSampleAccEnabled =
1769       ProfileSampleAccurate || F.hasFnAttribute("profile-sample-accurate");
1770   if (ProfSampleAccEnabled) {
1771     // PSL -- profile symbol list include all the symbols in sampled binary.
1772     // It is used to prevent new functions to be treated as cold.
1773     if (PSL) {
1774       // Profile symbol list is available, initialize the entry count to 0
1775       // only for functions in the list.
1776       if (PSL->contains(F.getName()))
1777         initialEntryCount = 0;
1778 
1779       // When ProfileSampleAccurate is true, function without sample will be
1780       // regarded as cold. To minimize the potential negative performance
1781       // impact it could have, we want to be a little conservative here
1782       // saying if a function shows up in the profile, no matter as outline
1783       // function, inline instance or call targets, treat the function as not
1784       // being cold. This will handle the cases such as most callsites of a
1785       // function are inlined in sampled binary but not inlined in current
1786       // build (because of source code drift, imprecise debug information, or
1787       // the callsites are all cold individually but not cold
1788       // accumulatively...), so the outline function showing up as cold in
1789       // sampled binary will actually not be cold after current build.
1790       StringRef CanonName = FunctionSamples::getCanonicalFnName(F);
1791       if (NamesInProfile.count(CanonName))
1792         initialEntryCount = -1;
1793     } else {
1794       // If there is no profile symbol list available, initialize all the
1795       // function entry counts to 0. It means all the functions without
1796       // profile will be regarded as cold.
1797       initialEntryCount = 0;
1798     }
1799   }
1800 
1801   F.setEntryCount(ProfileCount(initialEntryCount, Function::PCT_Real));
1802   std::unique_ptr<OptimizationRemarkEmitter> OwnedORE;
1803   if (AM) {
1804     auto &FAM =
1805         AM->getResult<FunctionAnalysisManagerModuleProxy>(*F.getParent())
1806             .getManager();
1807     ORE = &FAM.getResult<OptimizationRemarkEmitterAnalysis>(F);
1808   } else {
1809     OwnedORE = std::make_unique<OptimizationRemarkEmitter>(&F);
1810     ORE = OwnedORE.get();
1811   }
1812   Samples = Reader->getSamplesFor(F);
1813   if (Samples && !Samples->empty())
1814     return emitAnnotations(F);
1815   return false;
1816 }
1817 
1818 PreservedAnalyses SampleProfileLoaderPass::run(Module &M,
1819                                                ModuleAnalysisManager &AM) {
1820   FunctionAnalysisManager &FAM =
1821       AM.getResult<FunctionAnalysisManagerModuleProxy>(M).getManager();
1822 
1823   auto GetAssumptionCache = [&](Function &F) -> AssumptionCache & {
1824     return FAM.getResult<AssumptionAnalysis>(F);
1825   };
1826   auto GetTTI = [&](Function &F) -> TargetTransformInfo & {
1827     return FAM.getResult<TargetIRAnalysis>(F);
1828   };
1829 
1830   SampleProfileLoader SampleLoader(
1831       ProfileFileName.empty() ? SampleProfileFile : ProfileFileName,
1832       ProfileRemappingFileName.empty() ? SampleProfileRemappingFile
1833                                        : ProfileRemappingFileName,
1834       IsThinLTOPreLink, GetAssumptionCache, GetTTI);
1835 
1836   SampleLoader.doInitialization(M);
1837 
1838   ProfileSummaryInfo *PSI = &AM.getResult<ProfileSummaryAnalysis>(M);
1839   if (!SampleLoader.runOnModule(M, &AM, PSI))
1840     return PreservedAnalyses::all();
1841 
1842   return PreservedAnalyses::none();
1843 }
1844