1 //===- SampleProfile.cpp - Incorporate sample profiles into the IR --------===//
2 //
3 //                      The LLVM Compiler Infrastructure
4 //
5 // This file is distributed under the University of Illinois Open Source
6 // License. See LICENSE.TXT for details.
7 //
8 //===----------------------------------------------------------------------===//
9 //
10 // This file implements the SampleProfileLoader transformation. This pass
11 // reads a profile file generated by a sampling profiler (e.g. Linux Perf -
12 // http://perf.wiki.kernel.org/) and generates IR metadata to reflect the
13 // profile information in the given profile.
14 //
15 // This pass generates branch weight annotations on the IR:
16 //
17 // - prof: Represents branch weights. This annotation is added to branches
18 //      to indicate the weights of each edge coming out of the branch.
19 //      The weight of each edge is the weight of the target block for
20 //      that edge. The weight of a block B is computed as the maximum
21 //      number of samples found in B.
22 //
23 //===----------------------------------------------------------------------===//
24 
25 #include "llvm/Transforms/SampleProfile.h"
26 #include "llvm/ADT/DenseMap.h"
27 #include "llvm/ADT/SmallPtrSet.h"
28 #include "llvm/ADT/SmallSet.h"
29 #include "llvm/ADT/StringRef.h"
30 #include "llvm/Analysis/AssumptionCache.h"
31 #include "llvm/Analysis/LoopInfo.h"
32 #include "llvm/Analysis/OptimizationDiagnosticInfo.h"
33 #include "llvm/Analysis/PostDominators.h"
34 #include "llvm/IR/Constants.h"
35 #include "llvm/IR/DebugInfo.h"
36 #include "llvm/IR/DiagnosticInfo.h"
37 #include "llvm/IR/Dominators.h"
38 #include "llvm/IR/Function.h"
39 #include "llvm/IR/GlobalValue.h"
40 #include "llvm/IR/InstIterator.h"
41 #include "llvm/IR/Instructions.h"
42 #include "llvm/IR/IntrinsicInst.h"
43 #include "llvm/IR/LLVMContext.h"
44 #include "llvm/IR/MDBuilder.h"
45 #include "llvm/IR/Metadata.h"
46 #include "llvm/IR/Module.h"
47 #include "llvm/IR/ValueSymbolTable.h"
48 #include "llvm/Pass.h"
49 #include "llvm/ProfileData/InstrProf.h"
50 #include "llvm/ProfileData/SampleProfReader.h"
51 #include "llvm/Support/CommandLine.h"
52 #include "llvm/Support/Debug.h"
53 #include "llvm/Support/ErrorOr.h"
54 #include "llvm/Support/Format.h"
55 #include "llvm/Support/raw_ostream.h"
56 #include "llvm/Transforms/IPO.h"
57 #include "llvm/Transforms/Instrumentation.h"
58 #include "llvm/Transforms/Utils/Cloning.h"
59 #include <cctype>
60 
61 using namespace llvm;
62 using namespace sampleprof;
63 
64 #define DEBUG_TYPE "sample-profile"
65 
66 // Command line option to specify the file to read samples from. This is
67 // mainly used for debugging.
68 static cl::opt<std::string> SampleProfileFile(
69     "sample-profile-file", cl::init(""), cl::value_desc("filename"),
70     cl::desc("Profile file loaded by -sample-profile"), cl::Hidden);
71 static cl::opt<unsigned> SampleProfileMaxPropagateIterations(
72     "sample-profile-max-propagate-iterations", cl::init(100),
73     cl::desc("Maximum number of iterations to go through when propagating "
74              "sample block/edge weights through the CFG."));
75 static cl::opt<unsigned> SampleProfileRecordCoverage(
76     "sample-profile-check-record-coverage", cl::init(0), cl::value_desc("N"),
77     cl::desc("Emit a warning if less than N% of records in the input profile "
78              "are matched to the IR."));
79 static cl::opt<unsigned> SampleProfileSampleCoverage(
80     "sample-profile-check-sample-coverage", cl::init(0), cl::value_desc("N"),
81     cl::desc("Emit a warning if less than N% of samples in the input profile "
82              "are matched to the IR."));
83 static cl::opt<double> SampleProfileHotThreshold(
84     "sample-profile-inline-hot-threshold", cl::init(0.1), cl::value_desc("N"),
85     cl::desc("Inlined functions that account for more than N% of all samples "
86              "collected in the parent function, will be inlined again."));
87 
88 namespace {
89 typedef DenseMap<const BasicBlock *, uint64_t> BlockWeightMap;
90 typedef DenseMap<const BasicBlock *, const BasicBlock *> EquivalenceClassMap;
91 typedef std::pair<const BasicBlock *, const BasicBlock *> Edge;
92 typedef DenseMap<Edge, uint64_t> EdgeWeightMap;
93 typedef DenseMap<const BasicBlock *, SmallVector<const BasicBlock *, 8>>
94     BlockEdgeMap;
95 
96 class SampleCoverageTracker {
97 public:
98   SampleCoverageTracker() : SampleCoverage(), TotalUsedSamples(0) {}
99 
100   bool markSamplesUsed(const FunctionSamples *FS, uint32_t LineOffset,
101                        uint32_t Discriminator, uint64_t Samples);
102   unsigned computeCoverage(unsigned Used, unsigned Total) const;
103   unsigned countUsedRecords(const FunctionSamples *FS) const;
104   unsigned countBodyRecords(const FunctionSamples *FS) const;
105   uint64_t getTotalUsedSamples() const { return TotalUsedSamples; }
106   uint64_t countBodySamples(const FunctionSamples *FS) const;
107   void clear() {
108     SampleCoverage.clear();
109     TotalUsedSamples = 0;
110   }
111 
112 private:
113   typedef std::map<LineLocation, unsigned> BodySampleCoverageMap;
114   typedef DenseMap<const FunctionSamples *, BodySampleCoverageMap>
115       FunctionSamplesCoverageMap;
116 
117   /// Coverage map for sampling records.
118   ///
119   /// This map keeps a record of sampling records that have been matched to
120   /// an IR instruction. This is used to detect some form of staleness in
121   /// profiles (see flag -sample-profile-check-coverage).
122   ///
123   /// Each entry in the map corresponds to a FunctionSamples instance.  This is
124   /// another map that counts how many times the sample record at the
125   /// given location has been used.
126   FunctionSamplesCoverageMap SampleCoverage;
127 
128   /// Number of samples used from the profile.
129   ///
130   /// When a sampling record is used for the first time, the samples from
131   /// that record are added to this accumulator.  Coverage is later computed
132   /// based on the total number of samples available in this function and
133   /// its callsites.
134   ///
135   /// Note that this accumulator tracks samples used from a single function
136   /// and all the inlined callsites. Strictly, we should have a map of counters
137   /// keyed by FunctionSamples pointers, but these stats are cleared after
138   /// every function, so we just need to keep a single counter.
139   uint64_t TotalUsedSamples;
140 };
141 
142 /// \brief Sample profile pass.
143 ///
144 /// This pass reads profile data from the file specified by
145 /// -sample-profile-file and annotates every affected function with the
146 /// profile information found in that file.
147 class SampleProfileLoader {
148 public:
149   SampleProfileLoader(StringRef Name = SampleProfileFile)
150       : DT(nullptr), PDT(nullptr), LI(nullptr), ACT(nullptr), Reader(),
151         Samples(nullptr), Filename(Name), ProfileIsValid(false),
152         TotalCollectedSamples(0), ORE(nullptr) {}
153 
154   bool doInitialization(Module &M);
155   bool runOnModule(Module &M, ModuleAnalysisManager *AM);
156   void setACT(AssumptionCacheTracker *A) { ACT = A; }
157 
158   void dump() { Reader->dump(); }
159 
160 protected:
161   bool runOnFunction(Function &F, ModuleAnalysisManager *AM);
162   unsigned getFunctionLoc(Function &F);
163   bool emitAnnotations(Function &F);
164   ErrorOr<uint64_t> getInstWeight(const Instruction &I);
165   ErrorOr<uint64_t> getBlockWeight(const BasicBlock *BB);
166   const FunctionSamples *findCalleeFunctionSamples(const Instruction &I) const;
167   std::vector<const FunctionSamples *>
168   findIndirectCallFunctionSamples(const Instruction &I) const;
169   const FunctionSamples *findFunctionSamples(const Instruction &I) const;
170   bool inlineHotFunctions(Function &F,
171                           DenseSet<GlobalValue::GUID> &ImportGUIDs);
172   void printEdgeWeight(raw_ostream &OS, Edge E);
173   void printBlockWeight(raw_ostream &OS, const BasicBlock *BB) const;
174   void printBlockEquivalence(raw_ostream &OS, const BasicBlock *BB);
175   bool computeBlockWeights(Function &F);
176   void findEquivalenceClasses(Function &F);
177   template <bool IsPostDom>
178   void findEquivalencesFor(BasicBlock *BB1, ArrayRef<BasicBlock *> Descendants,
179                            DominatorTreeBase<BasicBlock, IsPostDom> *DomTree);
180 
181   void propagateWeights(Function &F);
182   uint64_t visitEdge(Edge E, unsigned *NumUnknownEdges, Edge *UnknownEdge);
183   void buildEdges(Function &F);
184   bool propagateThroughEdges(Function &F, bool UpdateBlockCount);
185   void computeDominanceAndLoopInfo(Function &F);
186   unsigned getOffset(const DILocation *DIL) const;
187   void clearFunctionData();
188 
189   /// \brief Map basic blocks to their computed weights.
190   ///
191   /// The weight of a basic block is defined to be the maximum
192   /// of all the instruction weights in that block.
193   BlockWeightMap BlockWeights;
194 
195   /// \brief Map edges to their computed weights.
196   ///
197   /// Edge weights are computed by propagating basic block weights in
198   /// SampleProfile::propagateWeights.
199   EdgeWeightMap EdgeWeights;
200 
201   /// \brief Set of visited blocks during propagation.
202   SmallPtrSet<const BasicBlock *, 32> VisitedBlocks;
203 
204   /// \brief Set of visited edges during propagation.
205   SmallSet<Edge, 32> VisitedEdges;
206 
207   /// \brief Equivalence classes for block weights.
208   ///
209   /// Two blocks BB1 and BB2 are in the same equivalence class if they
210   /// dominate and post-dominate each other, and they are in the same loop
211   /// nest. When this happens, the two blocks are guaranteed to execute
212   /// the same number of times.
213   EquivalenceClassMap EquivalenceClass;
214 
215   /// Map from function name to Function *. Used to find the function from
216   /// the function name. If the function name contains suffix, additional
217   /// entry is added to map from the stripped name to the function if there
218   /// is one-to-one mapping.
219   StringMap<Function *> SymbolMap;
220 
221   /// \brief Dominance, post-dominance and loop information.
222   std::unique_ptr<DominatorTree> DT;
223   std::unique_ptr<PostDomTreeBase<BasicBlock>> PDT;
224   std::unique_ptr<LoopInfo> LI;
225 
226   AssumptionCacheTracker *ACT;
227 
228   /// \brief Predecessors for each basic block in the CFG.
229   BlockEdgeMap Predecessors;
230 
231   /// \brief Successors for each basic block in the CFG.
232   BlockEdgeMap Successors;
233 
234   SampleCoverageTracker CoverageTracker;
235 
236   /// \brief Profile reader object.
237   std::unique_ptr<SampleProfileReader> Reader;
238 
239   /// \brief Samples collected for the body of this function.
240   FunctionSamples *Samples;
241 
242   /// \brief Name of the profile file to load.
243   std::string Filename;
244 
245   /// \brief Flag indicating whether the profile input loaded successfully.
246   bool ProfileIsValid;
247 
248   /// \brief Total number of samples collected in this profile.
249   ///
250   /// This is the sum of all the samples collected in all the functions executed
251   /// at runtime.
252   uint64_t TotalCollectedSamples;
253 
254   /// \brief Optimization Remark Emitter used to emit diagnostic remarks.
255   OptimizationRemarkEmitter *ORE;
256 };
257 
258 class SampleProfileLoaderLegacyPass : public ModulePass {
259 public:
260   // Class identification, replacement for typeinfo
261   static char ID;
262 
263   SampleProfileLoaderLegacyPass(StringRef Name = SampleProfileFile)
264       : ModulePass(ID), SampleLoader(Name) {
265     initializeSampleProfileLoaderLegacyPassPass(
266         *PassRegistry::getPassRegistry());
267   }
268 
269   void dump() { SampleLoader.dump(); }
270 
271   bool doInitialization(Module &M) override {
272     return SampleLoader.doInitialization(M);
273   }
274   StringRef getPassName() const override { return "Sample profile pass"; }
275   bool runOnModule(Module &M) override;
276 
277   void getAnalysisUsage(AnalysisUsage &AU) const override {
278     AU.addRequired<AssumptionCacheTracker>();
279   }
280 
281 private:
282   SampleProfileLoader SampleLoader;
283 };
284 
285 /// Return true if the given callsite is hot wrt to its caller.
286 ///
287 /// Functions that were inlined in the original binary will be represented
288 /// in the inline stack in the sample profile. If the profile shows that
289 /// the original inline decision was "good" (i.e., the callsite is executed
290 /// frequently), then we will recreate the inline decision and apply the
291 /// profile from the inlined callsite.
292 ///
293 /// To decide whether an inlined callsite is hot, we compute the fraction
294 /// of samples used by the callsite with respect to the total number of samples
295 /// collected in the caller.
296 ///
297 /// If that fraction is larger than the default given by
298 /// SampleProfileHotThreshold, the callsite will be inlined again.
299 bool callsiteIsHot(const FunctionSamples *CallerFS,
300                    const FunctionSamples *CallsiteFS) {
301   if (!CallsiteFS)
302     return false; // The callsite was not inlined in the original binary.
303 
304   uint64_t ParentTotalSamples = CallerFS->getTotalSamples();
305   if (ParentTotalSamples == 0)
306     return false; // Avoid division by zero.
307 
308   uint64_t CallsiteTotalSamples = CallsiteFS->getTotalSamples();
309   if (CallsiteTotalSamples == 0)
310     return false; // Callsite is trivially cold.
311 
312   double PercentSamples =
313       (double)CallsiteTotalSamples / (double)ParentTotalSamples * 100.0;
314   return PercentSamples >= SampleProfileHotThreshold;
315 }
316 }
317 
318 /// Mark as used the sample record for the given function samples at
319 /// (LineOffset, Discriminator).
320 ///
321 /// \returns true if this is the first time we mark the given record.
322 bool SampleCoverageTracker::markSamplesUsed(const FunctionSamples *FS,
323                                             uint32_t LineOffset,
324                                             uint32_t Discriminator,
325                                             uint64_t Samples) {
326   LineLocation Loc(LineOffset, Discriminator);
327   unsigned &Count = SampleCoverage[FS][Loc];
328   bool FirstTime = (++Count == 1);
329   if (FirstTime)
330     TotalUsedSamples += Samples;
331   return FirstTime;
332 }
333 
334 /// Return the number of sample records that were applied from this profile.
335 ///
336 /// This count does not include records from cold inlined callsites.
337 unsigned
338 SampleCoverageTracker::countUsedRecords(const FunctionSamples *FS) const {
339   auto I = SampleCoverage.find(FS);
340 
341   // The size of the coverage map for FS represents the number of records
342   // that were marked used at least once.
343   unsigned Count = (I != SampleCoverage.end()) ? I->second.size() : 0;
344 
345   // If there are inlined callsites in this function, count the samples found
346   // in the respective bodies. However, do not bother counting callees with 0
347   // total samples, these are callees that were never invoked at runtime.
348   for (const auto &I : FS->getCallsiteSamples())
349     for (const auto &J : I.second) {
350       const FunctionSamples *CalleeSamples = &J.second;
351       if (callsiteIsHot(FS, CalleeSamples))
352         Count += countUsedRecords(CalleeSamples);
353     }
354 
355   return Count;
356 }
357 
358 /// Return the number of sample records in the body of this profile.
359 ///
360 /// This count does not include records from cold inlined callsites.
361 unsigned
362 SampleCoverageTracker::countBodyRecords(const FunctionSamples *FS) const {
363   unsigned Count = FS->getBodySamples().size();
364 
365   // Only count records in hot callsites.
366   for (const auto &I : FS->getCallsiteSamples())
367     for (const auto &J : I.second) {
368       const FunctionSamples *CalleeSamples = &J.second;
369       if (callsiteIsHot(FS, CalleeSamples))
370         Count += countBodyRecords(CalleeSamples);
371     }
372 
373   return Count;
374 }
375 
376 /// Return the number of samples collected in the body of this profile.
377 ///
378 /// This count does not include samples from cold inlined callsites.
379 uint64_t
380 SampleCoverageTracker::countBodySamples(const FunctionSamples *FS) const {
381   uint64_t Total = 0;
382   for (const auto &I : FS->getBodySamples())
383     Total += I.second.getSamples();
384 
385   // Only count samples in hot callsites.
386   for (const auto &I : FS->getCallsiteSamples())
387     for (const auto &J : I.second) {
388       const FunctionSamples *CalleeSamples = &J.second;
389       if (callsiteIsHot(FS, CalleeSamples))
390         Total += countBodySamples(CalleeSamples);
391     }
392 
393   return Total;
394 }
395 
396 /// Return the fraction of sample records used in this profile.
397 ///
398 /// The returned value is an unsigned integer in the range 0-100 indicating
399 /// the percentage of sample records that were used while applying this
400 /// profile to the associated function.
401 unsigned SampleCoverageTracker::computeCoverage(unsigned Used,
402                                                 unsigned Total) const {
403   assert(Used <= Total &&
404          "number of used records cannot exceed the total number of records");
405   return Total > 0 ? Used * 100 / Total : 100;
406 }
407 
408 /// Clear all the per-function data used to load samples and propagate weights.
409 void SampleProfileLoader::clearFunctionData() {
410   BlockWeights.clear();
411   EdgeWeights.clear();
412   VisitedBlocks.clear();
413   VisitedEdges.clear();
414   EquivalenceClass.clear();
415   DT = nullptr;
416   PDT = nullptr;
417   LI = nullptr;
418   Predecessors.clear();
419   Successors.clear();
420   CoverageTracker.clear();
421 }
422 
423 /// Returns the line offset to the start line of the subprogram.
424 /// We assume that a single function will not exceed 65535 LOC.
425 unsigned SampleProfileLoader::getOffset(const DILocation *DIL) const {
426   return (DIL->getLine() - DIL->getScope()->getSubprogram()->getLine()) &
427          0xffff;
428 }
429 
430 #ifndef NDEBUG
431 /// \brief Print the weight of edge \p E on stream \p OS.
432 ///
433 /// \param OS  Stream to emit the output to.
434 /// \param E  Edge to print.
435 void SampleProfileLoader::printEdgeWeight(raw_ostream &OS, Edge E) {
436   OS << "weight[" << E.first->getName() << "->" << E.second->getName()
437      << "]: " << EdgeWeights[E] << "\n";
438 }
439 
440 /// \brief Print the equivalence class of block \p BB on stream \p OS.
441 ///
442 /// \param OS  Stream to emit the output to.
443 /// \param BB  Block to print.
444 void SampleProfileLoader::printBlockEquivalence(raw_ostream &OS,
445                                                 const BasicBlock *BB) {
446   const BasicBlock *Equiv = EquivalenceClass[BB];
447   OS << "equivalence[" << BB->getName()
448      << "]: " << ((Equiv) ? EquivalenceClass[BB]->getName() : "NONE") << "\n";
449 }
450 
451 /// \brief Print the weight of block \p BB on stream \p OS.
452 ///
453 /// \param OS  Stream to emit the output to.
454 /// \param BB  Block to print.
455 void SampleProfileLoader::printBlockWeight(raw_ostream &OS,
456                                            const BasicBlock *BB) const {
457   const auto &I = BlockWeights.find(BB);
458   uint64_t W = (I == BlockWeights.end() ? 0 : I->second);
459   OS << "weight[" << BB->getName() << "]: " << W << "\n";
460 }
461 #endif
462 
463 /// \brief Get the weight for an instruction.
464 ///
465 /// The "weight" of an instruction \p Inst is the number of samples
466 /// collected on that instruction at runtime. To retrieve it, we
467 /// need to compute the line number of \p Inst relative to the start of its
468 /// function. We use HeaderLineno to compute the offset. We then
469 /// look up the samples collected for \p Inst using BodySamples.
470 ///
471 /// \param Inst Instruction to query.
472 ///
473 /// \returns the weight of \p Inst.
474 ErrorOr<uint64_t> SampleProfileLoader::getInstWeight(const Instruction &Inst) {
475   const DebugLoc &DLoc = Inst.getDebugLoc();
476   if (!DLoc)
477     return std::error_code();
478 
479   const FunctionSamples *FS = findFunctionSamples(Inst);
480   if (!FS)
481     return std::error_code();
482 
483   // Ignore all intrinsics and branch instructions.
484   // Branch instruction usually contains debug info from sources outside of
485   // the residing basic block, thus we ignore them during annotation.
486   if (isa<BranchInst>(Inst) || isa<IntrinsicInst>(Inst))
487     return std::error_code();
488 
489   // If a call/invoke instruction is inlined in profile, but not inlined here,
490   // it means that the inlined callsite has no sample, thus the call
491   // instruction should have 0 count.
492   if ((isa<CallInst>(Inst) || isa<InvokeInst>(Inst)) &&
493       findCalleeFunctionSamples(Inst))
494     return 0;
495 
496   const DILocation *DIL = DLoc;
497   uint32_t LineOffset = getOffset(DIL);
498   uint32_t Discriminator = DIL->getBaseDiscriminator();
499   ErrorOr<uint64_t> R = FS->findSamplesAt(LineOffset, Discriminator);
500   if (R) {
501     bool FirstMark =
502         CoverageTracker.markSamplesUsed(FS, LineOffset, Discriminator, R.get());
503     if (FirstMark) {
504       if (Discriminator)
505         ORE->emit(OptimizationRemarkAnalysis(DEBUG_TYPE, "AppliedSamples", &Inst)
506                   << "Applied " << ore::NV("NumSamples", *R)
507                   << " samples from profile (offset: "
508                   << ore::NV("LineOffset", LineOffset) << "."
509                   << ore::NV("Discriminator", Discriminator) << ")");
510       else
511         ORE->emit(OptimizationRemarkAnalysis(DEBUG_TYPE, "AppliedSamples", &Inst)
512                   << "Applied " << ore::NV("NumSamples", *R)
513                   << " samples from profile (offset: "
514                   << ore::NV("LineOffset", LineOffset) << ")");
515     }
516     DEBUG(dbgs() << "    " << DLoc.getLine() << "."
517                  << DIL->getBaseDiscriminator() << ":" << Inst
518                  << " (line offset: " << LineOffset << "."
519                  << DIL->getBaseDiscriminator() << " - weight: " << R.get()
520                  << ")\n");
521   }
522   return R;
523 }
524 
525 /// \brief Compute the weight of a basic block.
526 ///
527 /// The weight of basic block \p BB is the maximum weight of all the
528 /// instructions in BB.
529 ///
530 /// \param BB The basic block to query.
531 ///
532 /// \returns the weight for \p BB.
533 ErrorOr<uint64_t> SampleProfileLoader::getBlockWeight(const BasicBlock *BB) {
534   uint64_t Max = 0;
535   bool HasWeight = false;
536   for (auto &I : BB->getInstList()) {
537     const ErrorOr<uint64_t> &R = getInstWeight(I);
538     if (R) {
539       Max = std::max(Max, R.get());
540       HasWeight = true;
541     }
542   }
543   return HasWeight ? ErrorOr<uint64_t>(Max) : std::error_code();
544 }
545 
546 /// \brief Compute and store the weights of every basic block.
547 ///
548 /// This populates the BlockWeights map by computing
549 /// the weights of every basic block in the CFG.
550 ///
551 /// \param F The function to query.
552 bool SampleProfileLoader::computeBlockWeights(Function &F) {
553   bool Changed = false;
554   DEBUG(dbgs() << "Block weights\n");
555   for (const auto &BB : F) {
556     ErrorOr<uint64_t> Weight = getBlockWeight(&BB);
557     if (Weight) {
558       BlockWeights[&BB] = Weight.get();
559       VisitedBlocks.insert(&BB);
560       Changed = true;
561     }
562     DEBUG(printBlockWeight(dbgs(), &BB));
563   }
564 
565   return Changed;
566 }
567 
568 /// \brief Get the FunctionSamples for a call instruction.
569 ///
570 /// The FunctionSamples of a call/invoke instruction \p Inst is the inlined
571 /// instance in which that call instruction is calling to. It contains
572 /// all samples that resides in the inlined instance. We first find the
573 /// inlined instance in which the call instruction is from, then we
574 /// traverse its children to find the callsite with the matching
575 /// location.
576 ///
577 /// \param Inst Call/Invoke instruction to query.
578 ///
579 /// \returns The FunctionSamples pointer to the inlined instance.
580 const FunctionSamples *
581 SampleProfileLoader::findCalleeFunctionSamples(const Instruction &Inst) const {
582   const DILocation *DIL = Inst.getDebugLoc();
583   if (!DIL) {
584     return nullptr;
585   }
586 
587   StringRef CalleeName;
588   if (const CallInst *CI = dyn_cast<CallInst>(&Inst))
589     if (Function *Callee = CI->getCalledFunction())
590       CalleeName = Callee->getName();
591 
592   const FunctionSamples *FS = findFunctionSamples(Inst);
593   if (FS == nullptr)
594     return nullptr;
595 
596   return FS->findFunctionSamplesAt(
597       LineLocation(getOffset(DIL), DIL->getBaseDiscriminator()), CalleeName);
598 }
599 
600 /// Returns a vector of FunctionSamples that are the indirect call targets
601 /// of \p Inst. The vector is sorted by the total number of samples.
602 std::vector<const FunctionSamples *>
603 SampleProfileLoader::findIndirectCallFunctionSamples(
604     const Instruction &Inst) const {
605   const DILocation *DIL = Inst.getDebugLoc();
606   std::vector<const FunctionSamples *> R;
607 
608   if (!DIL) {
609     return R;
610   }
611 
612   const FunctionSamples *FS = findFunctionSamples(Inst);
613   if (FS == nullptr)
614     return R;
615 
616   if (const FunctionSamplesMap *M = FS->findFunctionSamplesMapAt(
617           LineLocation(getOffset(DIL), DIL->getBaseDiscriminator()))) {
618     if (M->size() == 0)
619       return R;
620     for (const auto &NameFS : *M) {
621       R.push_back(&NameFS.second);
622     }
623     std::sort(R.begin(), R.end(),
624               [](const FunctionSamples *L, const FunctionSamples *R) {
625                 return L->getTotalSamples() > R->getTotalSamples();
626               });
627   }
628   return R;
629 }
630 
631 /// \brief Get the FunctionSamples for an instruction.
632 ///
633 /// The FunctionSamples of an instruction \p Inst is the inlined instance
634 /// in which that instruction is coming from. We traverse the inline stack
635 /// of that instruction, and match it with the tree nodes in the profile.
636 ///
637 /// \param Inst Instruction to query.
638 ///
639 /// \returns the FunctionSamples pointer to the inlined instance.
640 const FunctionSamples *
641 SampleProfileLoader::findFunctionSamples(const Instruction &Inst) const {
642   SmallVector<std::pair<LineLocation, StringRef>, 10> S;
643   const DILocation *DIL = Inst.getDebugLoc();
644   if (!DIL)
645     return Samples;
646 
647   const DILocation *PrevDIL = DIL;
648   for (DIL = DIL->getInlinedAt(); DIL; DIL = DIL->getInlinedAt()) {
649     S.push_back(std::make_pair(
650         LineLocation(getOffset(DIL), DIL->getBaseDiscriminator()),
651         PrevDIL->getScope()->getSubprogram()->getLinkageName()));
652     PrevDIL = DIL;
653   }
654   if (S.size() == 0)
655     return Samples;
656   const FunctionSamples *FS = Samples;
657   for (int i = S.size() - 1; i >= 0 && FS != nullptr; i--) {
658     FS = FS->findFunctionSamplesAt(S[i].first, S[i].second);
659   }
660   return FS;
661 }
662 
663 /// \brief Iteratively inline hot callsites of a function.
664 ///
665 /// Iteratively traverse all callsites of the function \p F, and find if
666 /// the corresponding inlined instance exists and is hot in profile. If
667 /// it is hot enough, inline the callsites and adds new callsites of the
668 /// callee into the caller. If the call is an indirect call, first promote
669 /// it to direct call. Each indirect call is limited with a single target.
670 ///
671 /// \param F function to perform iterative inlining.
672 /// \param ImportGUIDs a set to be updated to include all GUIDs that come
673 ///     from a different module but inlined in the profiled binary.
674 ///
675 /// \returns True if there is any inline happened.
676 bool SampleProfileLoader::inlineHotFunctions(
677     Function &F, DenseSet<GlobalValue::GUID> &ImportGUIDs) {
678   DenseSet<Instruction *> PromotedInsns;
679   bool Changed = false;
680   std::function<AssumptionCache &(Function &)> GetAssumptionCache = [&](
681       Function &F) -> AssumptionCache & { return ACT->getAssumptionCache(F); };
682   while (true) {
683     bool LocalChanged = false;
684     SmallVector<Instruction *, 10> CIS;
685     for (auto &BB : F) {
686       bool Hot = false;
687       SmallVector<Instruction *, 10> Candidates;
688       for (auto &I : BB.getInstList()) {
689         const FunctionSamples *FS = nullptr;
690         if ((isa<CallInst>(I) || isa<InvokeInst>(I)) &&
691             !isa<IntrinsicInst>(I) && (FS = findCalleeFunctionSamples(I))) {
692           Candidates.push_back(&I);
693           if (callsiteIsHot(Samples, FS))
694             Hot = true;
695         }
696       }
697       if (Hot) {
698         CIS.insert(CIS.begin(), Candidates.begin(), Candidates.end());
699       }
700     }
701     for (auto I : CIS) {
702       InlineFunctionInfo IFI(nullptr, ACT ? &GetAssumptionCache : nullptr);
703       Function *CalledFunction = CallSite(I).getCalledFunction();
704       // Do not inline recursive calls.
705       if (CalledFunction == &F)
706         continue;
707       Instruction *DI = I;
708       if (!CalledFunction && !PromotedInsns.count(I) &&
709           CallSite(I).isIndirectCall())
710         for (const auto *FS : findIndirectCallFunctionSamples(*I)) {
711           auto CalleeFunctionName = FS->getName();
712           // If it is a recursive call, we do not inline it as it could bloat
713           // the code exponentially. There is way to better handle this, e.g.
714           // clone the caller first, and inline the cloned caller if it is
715           // recursive. As llvm does not inline recursive calls, we will simply
716           // ignore it instead of handling it explicitly.
717           if (CalleeFunctionName == F.getName())
718             continue;
719           const char *Reason = "Callee function not available";
720           auto R = SymbolMap.find(CalleeFunctionName);
721           if (R == SymbolMap.end())
722             continue;
723           CalledFunction = R->getValue();
724           if (CalledFunction && isLegalToPromote(I, CalledFunction, &Reason)) {
725             // The indirect target was promoted and inlined in the profile, as a
726             // result, we do not have profile info for the branch probability.
727             // We set the probability to 80% taken to indicate that the static
728             // call is likely taken.
729             DI = dyn_cast<Instruction>(
730                 promoteIndirectCall(I, CalledFunction, 80, 100, false, ORE)
731                     ->stripPointerCasts());
732             PromotedInsns.insert(I);
733           } else {
734             DEBUG(dbgs() << "\nFailed to promote indirect call to "
735                          << CalleeFunctionName << " because " << Reason
736                          << "\n");
737             continue;
738           }
739         }
740       if (!CalledFunction || !CalledFunction->getSubprogram()) {
741         findCalleeFunctionSamples(*I)->findImportedFunctions(
742             ImportGUIDs, F.getParent(),
743             Samples->getTotalSamples() * SampleProfileHotThreshold / 100);
744         continue;
745       }
746       DebugLoc DLoc = I->getDebugLoc();
747       BasicBlock *BB = I->getParent();
748       if (InlineFunction(CallSite(DI), IFI)) {
749         LocalChanged = true;
750         // The call to InlineFunction erases DI, so we can't pass it here.
751         ORE->emit(OptimizationRemark(DEBUG_TYPE, "HotInline", DLoc, BB)
752                   << "inlined hot callee '"
753                   << ore::NV("Callee", CalledFunction) << "' into '"
754                   << ore::NV("Caller", &F) << "'");
755       }
756     }
757     if (LocalChanged) {
758       Changed = true;
759     } else {
760       break;
761     }
762   }
763   return Changed;
764 }
765 
766 /// \brief Find equivalence classes for the given block.
767 ///
768 /// This finds all the blocks that are guaranteed to execute the same
769 /// number of times as \p BB1. To do this, it traverses all the
770 /// descendants of \p BB1 in the dominator or post-dominator tree.
771 ///
772 /// A block BB2 will be in the same equivalence class as \p BB1 if
773 /// the following holds:
774 ///
775 /// 1- \p BB1 is a descendant of BB2 in the opposite tree. So, if BB2
776 ///    is a descendant of \p BB1 in the dominator tree, then BB2 should
777 ///    dominate BB1 in the post-dominator tree.
778 ///
779 /// 2- Both BB2 and \p BB1 must be in the same loop.
780 ///
781 /// For every block BB2 that meets those two requirements, we set BB2's
782 /// equivalence class to \p BB1.
783 ///
784 /// \param BB1  Block to check.
785 /// \param Descendants  Descendants of \p BB1 in either the dom or pdom tree.
786 /// \param DomTree  Opposite dominator tree. If \p Descendants is filled
787 ///                 with blocks from \p BB1's dominator tree, then
788 ///                 this is the post-dominator tree, and vice versa.
789 template <bool IsPostDom>
790 void SampleProfileLoader::findEquivalencesFor(
791     BasicBlock *BB1, ArrayRef<BasicBlock *> Descendants,
792     DominatorTreeBase<BasicBlock, IsPostDom> *DomTree) {
793   const BasicBlock *EC = EquivalenceClass[BB1];
794   uint64_t Weight = BlockWeights[EC];
795   for (const auto *BB2 : Descendants) {
796     bool IsDomParent = DomTree->dominates(BB2, BB1);
797     bool IsInSameLoop = LI->getLoopFor(BB1) == LI->getLoopFor(BB2);
798     if (BB1 != BB2 && IsDomParent && IsInSameLoop) {
799       EquivalenceClass[BB2] = EC;
800       // If BB2 is visited, then the entire EC should be marked as visited.
801       if (VisitedBlocks.count(BB2)) {
802         VisitedBlocks.insert(EC);
803       }
804 
805       // If BB2 is heavier than BB1, make BB2 have the same weight
806       // as BB1.
807       //
808       // Note that we don't worry about the opposite situation here
809       // (when BB2 is lighter than BB1). We will deal with this
810       // during the propagation phase. Right now, we just want to
811       // make sure that BB1 has the largest weight of all the
812       // members of its equivalence set.
813       Weight = std::max(Weight, BlockWeights[BB2]);
814     }
815   }
816   if (EC == &EC->getParent()->getEntryBlock()) {
817     BlockWeights[EC] = Samples->getHeadSamples() + 1;
818   } else {
819     BlockWeights[EC] = Weight;
820   }
821 }
822 
823 /// \brief Find equivalence classes.
824 ///
825 /// Since samples may be missing from blocks, we can fill in the gaps by setting
826 /// the weights of all the blocks in the same equivalence class to the same
827 /// weight. To compute the concept of equivalence, we use dominance and loop
828 /// information. Two blocks B1 and B2 are in the same equivalence class if B1
829 /// dominates B2, B2 post-dominates B1 and both are in the same loop.
830 ///
831 /// \param F The function to query.
832 void SampleProfileLoader::findEquivalenceClasses(Function &F) {
833   SmallVector<BasicBlock *, 8> DominatedBBs;
834   DEBUG(dbgs() << "\nBlock equivalence classes\n");
835   // Find equivalence sets based on dominance and post-dominance information.
836   for (auto &BB : F) {
837     BasicBlock *BB1 = &BB;
838 
839     // Compute BB1's equivalence class once.
840     if (EquivalenceClass.count(BB1)) {
841       DEBUG(printBlockEquivalence(dbgs(), BB1));
842       continue;
843     }
844 
845     // By default, blocks are in their own equivalence class.
846     EquivalenceClass[BB1] = BB1;
847 
848     // Traverse all the blocks dominated by BB1. We are looking for
849     // every basic block BB2 such that:
850     //
851     // 1- BB1 dominates BB2.
852     // 2- BB2 post-dominates BB1.
853     // 3- BB1 and BB2 are in the same loop nest.
854     //
855     // If all those conditions hold, it means that BB2 is executed
856     // as many times as BB1, so they are placed in the same equivalence
857     // class by making BB2's equivalence class be BB1.
858     DominatedBBs.clear();
859     DT->getDescendants(BB1, DominatedBBs);
860     findEquivalencesFor(BB1, DominatedBBs, PDT.get());
861 
862     DEBUG(printBlockEquivalence(dbgs(), BB1));
863   }
864 
865   // Assign weights to equivalence classes.
866   //
867   // All the basic blocks in the same equivalence class will execute
868   // the same number of times. Since we know that the head block in
869   // each equivalence class has the largest weight, assign that weight
870   // to all the blocks in that equivalence class.
871   DEBUG(dbgs() << "\nAssign the same weight to all blocks in the same class\n");
872   for (auto &BI : F) {
873     const BasicBlock *BB = &BI;
874     const BasicBlock *EquivBB = EquivalenceClass[BB];
875     if (BB != EquivBB)
876       BlockWeights[BB] = BlockWeights[EquivBB];
877     DEBUG(printBlockWeight(dbgs(), BB));
878   }
879 }
880 
881 /// \brief Visit the given edge to decide if it has a valid weight.
882 ///
883 /// If \p E has not been visited before, we copy to \p UnknownEdge
884 /// and increment the count of unknown edges.
885 ///
886 /// \param E  Edge to visit.
887 /// \param NumUnknownEdges  Current number of unknown edges.
888 /// \param UnknownEdge  Set if E has not been visited before.
889 ///
890 /// \returns E's weight, if known. Otherwise, return 0.
891 uint64_t SampleProfileLoader::visitEdge(Edge E, unsigned *NumUnknownEdges,
892                                         Edge *UnknownEdge) {
893   if (!VisitedEdges.count(E)) {
894     (*NumUnknownEdges)++;
895     *UnknownEdge = E;
896     return 0;
897   }
898 
899   return EdgeWeights[E];
900 }
901 
902 /// \brief Propagate weights through incoming/outgoing edges.
903 ///
904 /// If the weight of a basic block is known, and there is only one edge
905 /// with an unknown weight, we can calculate the weight of that edge.
906 ///
907 /// Similarly, if all the edges have a known count, we can calculate the
908 /// count of the basic block, if needed.
909 ///
910 /// \param F  Function to process.
911 /// \param UpdateBlockCount  Whether we should update basic block counts that
912 ///                          has already been annotated.
913 ///
914 /// \returns  True if new weights were assigned to edges or blocks.
915 bool SampleProfileLoader::propagateThroughEdges(Function &F,
916                                                 bool UpdateBlockCount) {
917   bool Changed = false;
918   DEBUG(dbgs() << "\nPropagation through edges\n");
919   for (const auto &BI : F) {
920     const BasicBlock *BB = &BI;
921     const BasicBlock *EC = EquivalenceClass[BB];
922 
923     // Visit all the predecessor and successor edges to determine
924     // which ones have a weight assigned already. Note that it doesn't
925     // matter that we only keep track of a single unknown edge. The
926     // only case we are interested in handling is when only a single
927     // edge is unknown (see setEdgeOrBlockWeight).
928     for (unsigned i = 0; i < 2; i++) {
929       uint64_t TotalWeight = 0;
930       unsigned NumUnknownEdges = 0, NumTotalEdges = 0;
931       Edge UnknownEdge, SelfReferentialEdge, SingleEdge;
932 
933       if (i == 0) {
934         // First, visit all predecessor edges.
935         NumTotalEdges = Predecessors[BB].size();
936         for (auto *Pred : Predecessors[BB]) {
937           Edge E = std::make_pair(Pred, BB);
938           TotalWeight += visitEdge(E, &NumUnknownEdges, &UnknownEdge);
939           if (E.first == E.second)
940             SelfReferentialEdge = E;
941         }
942         if (NumTotalEdges == 1) {
943           SingleEdge = std::make_pair(Predecessors[BB][0], BB);
944         }
945       } else {
946         // On the second round, visit all successor edges.
947         NumTotalEdges = Successors[BB].size();
948         for (auto *Succ : Successors[BB]) {
949           Edge E = std::make_pair(BB, Succ);
950           TotalWeight += visitEdge(E, &NumUnknownEdges, &UnknownEdge);
951         }
952         if (NumTotalEdges == 1) {
953           SingleEdge = std::make_pair(BB, Successors[BB][0]);
954         }
955       }
956 
957       // After visiting all the edges, there are three cases that we
958       // can handle immediately:
959       //
960       // - All the edge weights are known (i.e., NumUnknownEdges == 0).
961       //   In this case, we simply check that the sum of all the edges
962       //   is the same as BB's weight. If not, we change BB's weight
963       //   to match. Additionally, if BB had not been visited before,
964       //   we mark it visited.
965       //
966       // - Only one edge is unknown and BB has already been visited.
967       //   In this case, we can compute the weight of the edge by
968       //   subtracting the total block weight from all the known
969       //   edge weights. If the edges weight more than BB, then the
970       //   edge of the last remaining edge is set to zero.
971       //
972       // - There exists a self-referential edge and the weight of BB is
973       //   known. In this case, this edge can be based on BB's weight.
974       //   We add up all the other known edges and set the weight on
975       //   the self-referential edge as we did in the previous case.
976       //
977       // In any other case, we must continue iterating. Eventually,
978       // all edges will get a weight, or iteration will stop when
979       // it reaches SampleProfileMaxPropagateIterations.
980       if (NumUnknownEdges <= 1) {
981         uint64_t &BBWeight = BlockWeights[EC];
982         if (NumUnknownEdges == 0) {
983           if (!VisitedBlocks.count(EC)) {
984             // If we already know the weight of all edges, the weight of the
985             // basic block can be computed. It should be no larger than the sum
986             // of all edge weights.
987             if (TotalWeight > BBWeight) {
988               BBWeight = TotalWeight;
989               Changed = true;
990               DEBUG(dbgs() << "All edge weights for " << BB->getName()
991                            << " known. Set weight for block: ";
992                     printBlockWeight(dbgs(), BB););
993             }
994           } else if (NumTotalEdges == 1 &&
995                      EdgeWeights[SingleEdge] < BlockWeights[EC]) {
996             // If there is only one edge for the visited basic block, use the
997             // block weight to adjust edge weight if edge weight is smaller.
998             EdgeWeights[SingleEdge] = BlockWeights[EC];
999             Changed = true;
1000           }
1001         } else if (NumUnknownEdges == 1 && VisitedBlocks.count(EC)) {
1002           // If there is a single unknown edge and the block has been
1003           // visited, then we can compute E's weight.
1004           if (BBWeight >= TotalWeight)
1005             EdgeWeights[UnknownEdge] = BBWeight - TotalWeight;
1006           else
1007             EdgeWeights[UnknownEdge] = 0;
1008           const BasicBlock *OtherEC;
1009           if (i == 0)
1010             OtherEC = EquivalenceClass[UnknownEdge.first];
1011           else
1012             OtherEC = EquivalenceClass[UnknownEdge.second];
1013           // Edge weights should never exceed the BB weights it connects.
1014           if (VisitedBlocks.count(OtherEC) &&
1015               EdgeWeights[UnknownEdge] > BlockWeights[OtherEC])
1016             EdgeWeights[UnknownEdge] = BlockWeights[OtherEC];
1017           VisitedEdges.insert(UnknownEdge);
1018           Changed = true;
1019           DEBUG(dbgs() << "Set weight for edge: ";
1020                 printEdgeWeight(dbgs(), UnknownEdge));
1021         }
1022       } else if (VisitedBlocks.count(EC) && BlockWeights[EC] == 0) {
1023         // If a block Weights 0, all its in/out edges should weight 0.
1024         if (i == 0) {
1025           for (auto *Pred : Predecessors[BB]) {
1026             Edge E = std::make_pair(Pred, BB);
1027             EdgeWeights[E] = 0;
1028             VisitedEdges.insert(E);
1029           }
1030         } else {
1031           for (auto *Succ : Successors[BB]) {
1032             Edge E = std::make_pair(BB, Succ);
1033             EdgeWeights[E] = 0;
1034             VisitedEdges.insert(E);
1035           }
1036         }
1037       } else if (SelfReferentialEdge.first && VisitedBlocks.count(EC)) {
1038         uint64_t &BBWeight = BlockWeights[BB];
1039         // We have a self-referential edge and the weight of BB is known.
1040         if (BBWeight >= TotalWeight)
1041           EdgeWeights[SelfReferentialEdge] = BBWeight - TotalWeight;
1042         else
1043           EdgeWeights[SelfReferentialEdge] = 0;
1044         VisitedEdges.insert(SelfReferentialEdge);
1045         Changed = true;
1046         DEBUG(dbgs() << "Set self-referential edge weight to: ";
1047               printEdgeWeight(dbgs(), SelfReferentialEdge));
1048       }
1049       if (UpdateBlockCount && !VisitedBlocks.count(EC) && TotalWeight > 0) {
1050         BlockWeights[EC] = TotalWeight;
1051         VisitedBlocks.insert(EC);
1052         Changed = true;
1053       }
1054     }
1055   }
1056 
1057   return Changed;
1058 }
1059 
1060 /// \brief Build in/out edge lists for each basic block in the CFG.
1061 ///
1062 /// We are interested in unique edges. If a block B1 has multiple
1063 /// edges to another block B2, we only add a single B1->B2 edge.
1064 void SampleProfileLoader::buildEdges(Function &F) {
1065   for (auto &BI : F) {
1066     BasicBlock *B1 = &BI;
1067 
1068     // Add predecessors for B1.
1069     SmallPtrSet<BasicBlock *, 16> Visited;
1070     if (!Predecessors[B1].empty())
1071       llvm_unreachable("Found a stale predecessors list in a basic block.");
1072     for (pred_iterator PI = pred_begin(B1), PE = pred_end(B1); PI != PE; ++PI) {
1073       BasicBlock *B2 = *PI;
1074       if (Visited.insert(B2).second)
1075         Predecessors[B1].push_back(B2);
1076     }
1077 
1078     // Add successors for B1.
1079     Visited.clear();
1080     if (!Successors[B1].empty())
1081       llvm_unreachable("Found a stale successors list in a basic block.");
1082     for (succ_iterator SI = succ_begin(B1), SE = succ_end(B1); SI != SE; ++SI) {
1083       BasicBlock *B2 = *SI;
1084       if (Visited.insert(B2).second)
1085         Successors[B1].push_back(B2);
1086     }
1087   }
1088 }
1089 
1090 /// Sorts the CallTargetMap \p M by count in descending order and stores the
1091 /// sorted result in \p Sorted. Returns the total counts.
1092 static uint64_t SortCallTargets(SmallVector<InstrProfValueData, 2> &Sorted,
1093                                 const SampleRecord::CallTargetMap &M) {
1094   Sorted.clear();
1095   uint64_t Sum = 0;
1096   for (auto I = M.begin(); I != M.end(); ++I) {
1097     Sum += I->getValue();
1098     Sorted.push_back({Function::getGUID(I->getKey()), I->getValue()});
1099   }
1100   std::sort(Sorted.begin(), Sorted.end(),
1101             [](const InstrProfValueData &L, const InstrProfValueData &R) {
1102               if (L.Count == R.Count)
1103                 return L.Value > R.Value;
1104               else
1105                 return L.Count > R.Count;
1106             });
1107   return Sum;
1108 }
1109 
1110 /// \brief Propagate weights into edges
1111 ///
1112 /// The following rules are applied to every block BB in the CFG:
1113 ///
1114 /// - If BB has a single predecessor/successor, then the weight
1115 ///   of that edge is the weight of the block.
1116 ///
1117 /// - If all incoming or outgoing edges are known except one, and the
1118 ///   weight of the block is already known, the weight of the unknown
1119 ///   edge will be the weight of the block minus the sum of all the known
1120 ///   edges. If the sum of all the known edges is larger than BB's weight,
1121 ///   we set the unknown edge weight to zero.
1122 ///
1123 /// - If there is a self-referential edge, and the weight of the block is
1124 ///   known, the weight for that edge is set to the weight of the block
1125 ///   minus the weight of the other incoming edges to that block (if
1126 ///   known).
1127 void SampleProfileLoader::propagateWeights(Function &F) {
1128   bool Changed = true;
1129   unsigned I = 0;
1130 
1131   // If BB weight is larger than its corresponding loop's header BB weight,
1132   // use the BB weight to replace the loop header BB weight.
1133   for (auto &BI : F) {
1134     BasicBlock *BB = &BI;
1135     Loop *L = LI->getLoopFor(BB);
1136     if (!L) {
1137       continue;
1138     }
1139     BasicBlock *Header = L->getHeader();
1140     if (Header && BlockWeights[BB] > BlockWeights[Header]) {
1141       BlockWeights[Header] = BlockWeights[BB];
1142     }
1143   }
1144 
1145   // Before propagation starts, build, for each block, a list of
1146   // unique predecessors and successors. This is necessary to handle
1147   // identical edges in multiway branches. Since we visit all blocks and all
1148   // edges of the CFG, it is cleaner to build these lists once at the start
1149   // of the pass.
1150   buildEdges(F);
1151 
1152   // Propagate until we converge or we go past the iteration limit.
1153   while (Changed && I++ < SampleProfileMaxPropagateIterations) {
1154     Changed = propagateThroughEdges(F, false);
1155   }
1156 
1157   // The first propagation propagates BB counts from annotated BBs to unknown
1158   // BBs. The 2nd propagation pass resets edges weights, and use all BB weights
1159   // to propagate edge weights.
1160   VisitedEdges.clear();
1161   Changed = true;
1162   while (Changed && I++ < SampleProfileMaxPropagateIterations) {
1163     Changed = propagateThroughEdges(F, false);
1164   }
1165 
1166   // The 3rd propagation pass allows adjust annotated BB weights that are
1167   // obviously wrong.
1168   Changed = true;
1169   while (Changed && I++ < SampleProfileMaxPropagateIterations) {
1170     Changed = propagateThroughEdges(F, true);
1171   }
1172 
1173   // Generate MD_prof metadata for every branch instruction using the
1174   // edge weights computed during propagation.
1175   DEBUG(dbgs() << "\nPropagation complete. Setting branch weights\n");
1176   LLVMContext &Ctx = F.getContext();
1177   MDBuilder MDB(Ctx);
1178   for (auto &BI : F) {
1179     BasicBlock *BB = &BI;
1180 
1181     if (BlockWeights[BB]) {
1182       for (auto &I : BB->getInstList()) {
1183         if (!isa<CallInst>(I) && !isa<InvokeInst>(I))
1184           continue;
1185         CallSite CS(&I);
1186         if (!CS.getCalledFunction()) {
1187           const DebugLoc &DLoc = I.getDebugLoc();
1188           if (!DLoc)
1189             continue;
1190           const DILocation *DIL = DLoc;
1191           uint32_t LineOffset = getOffset(DIL);
1192           uint32_t Discriminator = DIL->getBaseDiscriminator();
1193 
1194           const FunctionSamples *FS = findFunctionSamples(I);
1195           if (!FS)
1196             continue;
1197           auto T = FS->findCallTargetMapAt(LineOffset, Discriminator);
1198           if (!T || T.get().size() == 0)
1199             continue;
1200           SmallVector<InstrProfValueData, 2> SortedCallTargets;
1201           uint64_t Sum = SortCallTargets(SortedCallTargets, T.get());
1202           annotateValueSite(*I.getParent()->getParent()->getParent(), I,
1203                             SortedCallTargets, Sum, IPVK_IndirectCallTarget,
1204                             SortedCallTargets.size());
1205         } else if (!dyn_cast<IntrinsicInst>(&I)) {
1206           SmallVector<uint32_t, 1> Weights;
1207           Weights.push_back(BlockWeights[BB]);
1208           I.setMetadata(LLVMContext::MD_prof, MDB.createBranchWeights(Weights));
1209         }
1210       }
1211     }
1212     TerminatorInst *TI = BB->getTerminator();
1213     if (TI->getNumSuccessors() == 1)
1214       continue;
1215     if (!isa<BranchInst>(TI) && !isa<SwitchInst>(TI))
1216       continue;
1217 
1218     DebugLoc BranchLoc = TI->getDebugLoc();
1219     DEBUG(dbgs() << "\nGetting weights for branch at line "
1220                  << ((BranchLoc) ? Twine(BranchLoc.getLine())
1221                                  : Twine("<UNKNOWN LOCATION>"))
1222                  << ".\n");
1223     SmallVector<uint32_t, 4> Weights;
1224     uint32_t MaxWeight = 0;
1225     Instruction *MaxDestInst;
1226     for (unsigned I = 0; I < TI->getNumSuccessors(); ++I) {
1227       BasicBlock *Succ = TI->getSuccessor(I);
1228       Edge E = std::make_pair(BB, Succ);
1229       uint64_t Weight = EdgeWeights[E];
1230       DEBUG(dbgs() << "\t"; printEdgeWeight(dbgs(), E));
1231       // Use uint32_t saturated arithmetic to adjust the incoming weights,
1232       // if needed. Sample counts in profiles are 64-bit unsigned values,
1233       // but internally branch weights are expressed as 32-bit values.
1234       if (Weight > std::numeric_limits<uint32_t>::max()) {
1235         DEBUG(dbgs() << " (saturated due to uint32_t overflow)");
1236         Weight = std::numeric_limits<uint32_t>::max();
1237       }
1238       // Weight is added by one to avoid propagation errors introduced by
1239       // 0 weights.
1240       Weights.push_back(static_cast<uint32_t>(Weight + 1));
1241       if (Weight != 0) {
1242         if (Weight > MaxWeight) {
1243           MaxWeight = Weight;
1244           MaxDestInst = Succ->getFirstNonPHIOrDbgOrLifetime();
1245         }
1246       }
1247     }
1248 
1249     uint64_t TempWeight;
1250     // Only set weights if there is at least one non-zero weight.
1251     // In any other case, let the analyzer set weights.
1252     // Do not set weights if the weights are present. In ThinLTO, the profile
1253     // annotation is done twice. If the first annotation already set the
1254     // weights, the second pass does not need to set it.
1255     if (MaxWeight > 0 && !TI->extractProfTotalWeight(TempWeight)) {
1256       DEBUG(dbgs() << "SUCCESS. Found non-zero weights.\n");
1257       TI->setMetadata(llvm::LLVMContext::MD_prof,
1258                       MDB.createBranchWeights(Weights));
1259       ORE->emit(OptimizationRemark(DEBUG_TYPE, "PopularDest", MaxDestInst)
1260                 << "most popular destination for conditional branches at "
1261                 << ore::NV("CondBranchesLoc", BranchLoc));
1262     } else {
1263       DEBUG(dbgs() << "SKIPPED. All branch weights are zero.\n");
1264     }
1265   }
1266 }
1267 
1268 /// \brief Get the line number for the function header.
1269 ///
1270 /// This looks up function \p F in the current compilation unit and
1271 /// retrieves the line number where the function is defined. This is
1272 /// line 0 for all the samples read from the profile file. Every line
1273 /// number is relative to this line.
1274 ///
1275 /// \param F  Function object to query.
1276 ///
1277 /// \returns the line number where \p F is defined. If it returns 0,
1278 ///          it means that there is no debug information available for \p F.
1279 unsigned SampleProfileLoader::getFunctionLoc(Function &F) {
1280   if (DISubprogram *S = F.getSubprogram())
1281     return S->getLine();
1282 
1283   // If the start of \p F is missing, emit a diagnostic to inform the user
1284   // about the missed opportunity.
1285   F.getContext().diagnose(DiagnosticInfoSampleProfile(
1286       "No debug information found in function " + F.getName() +
1287           ": Function profile not used",
1288       DS_Warning));
1289   return 0;
1290 }
1291 
1292 void SampleProfileLoader::computeDominanceAndLoopInfo(Function &F) {
1293   DT.reset(new DominatorTree);
1294   DT->recalculate(F);
1295 
1296   PDT.reset(new PostDomTreeBase<BasicBlock>());
1297   PDT->recalculate(F);
1298 
1299   LI.reset(new LoopInfo);
1300   LI->analyze(*DT);
1301 }
1302 
1303 /// \brief Generate branch weight metadata for all branches in \p F.
1304 ///
1305 /// Branch weights are computed out of instruction samples using a
1306 /// propagation heuristic. Propagation proceeds in 3 phases:
1307 ///
1308 /// 1- Assignment of block weights. All the basic blocks in the function
1309 ///    are initial assigned the same weight as their most frequently
1310 ///    executed instruction.
1311 ///
1312 /// 2- Creation of equivalence classes. Since samples may be missing from
1313 ///    blocks, we can fill in the gaps by setting the weights of all the
1314 ///    blocks in the same equivalence class to the same weight. To compute
1315 ///    the concept of equivalence, we use dominance and loop information.
1316 ///    Two blocks B1 and B2 are in the same equivalence class if B1
1317 ///    dominates B2, B2 post-dominates B1 and both are in the same loop.
1318 ///
1319 /// 3- Propagation of block weights into edges. This uses a simple
1320 ///    propagation heuristic. The following rules are applied to every
1321 ///    block BB in the CFG:
1322 ///
1323 ///    - If BB has a single predecessor/successor, then the weight
1324 ///      of that edge is the weight of the block.
1325 ///
1326 ///    - If all the edges are known except one, and the weight of the
1327 ///      block is already known, the weight of the unknown edge will
1328 ///      be the weight of the block minus the sum of all the known
1329 ///      edges. If the sum of all the known edges is larger than BB's weight,
1330 ///      we set the unknown edge weight to zero.
1331 ///
1332 ///    - If there is a self-referential edge, and the weight of the block is
1333 ///      known, the weight for that edge is set to the weight of the block
1334 ///      minus the weight of the other incoming edges to that block (if
1335 ///      known).
1336 ///
1337 /// Since this propagation is not guaranteed to finalize for every CFG, we
1338 /// only allow it to proceed for a limited number of iterations (controlled
1339 /// by -sample-profile-max-propagate-iterations).
1340 ///
1341 /// FIXME: Try to replace this propagation heuristic with a scheme
1342 /// that is guaranteed to finalize. A work-list approach similar to
1343 /// the standard value propagation algorithm used by SSA-CCP might
1344 /// work here.
1345 ///
1346 /// Once all the branch weights are computed, we emit the MD_prof
1347 /// metadata on BB using the computed values for each of its branches.
1348 ///
1349 /// \param F The function to query.
1350 ///
1351 /// \returns true if \p F was modified. Returns false, otherwise.
1352 bool SampleProfileLoader::emitAnnotations(Function &F) {
1353   bool Changed = false;
1354 
1355   if (getFunctionLoc(F) == 0)
1356     return false;
1357 
1358   DEBUG(dbgs() << "Line number for the first instruction in " << F.getName()
1359                << ": " << getFunctionLoc(F) << "\n");
1360 
1361   DenseSet<GlobalValue::GUID> ImportGUIDs;
1362   Changed |= inlineHotFunctions(F, ImportGUIDs);
1363 
1364   // Compute basic block weights.
1365   Changed |= computeBlockWeights(F);
1366 
1367   if (Changed) {
1368     // Add an entry count to the function using the samples gathered at the
1369     // function entry. Also sets the GUIDs that comes from a different
1370     // module but inlined in the profiled binary. This is aiming at making
1371     // the IR match the profiled binary before annotation.
1372     F.setEntryCount(Samples->getHeadSamples() + 1, &ImportGUIDs);
1373 
1374     // Compute dominance and loop info needed for propagation.
1375     computeDominanceAndLoopInfo(F);
1376 
1377     // Find equivalence classes.
1378     findEquivalenceClasses(F);
1379 
1380     // Propagate weights to all edges.
1381     propagateWeights(F);
1382   }
1383 
1384   // If coverage checking was requested, compute it now.
1385   if (SampleProfileRecordCoverage) {
1386     unsigned Used = CoverageTracker.countUsedRecords(Samples);
1387     unsigned Total = CoverageTracker.countBodyRecords(Samples);
1388     unsigned Coverage = CoverageTracker.computeCoverage(Used, Total);
1389     if (Coverage < SampleProfileRecordCoverage) {
1390       F.getContext().diagnose(DiagnosticInfoSampleProfile(
1391           F.getSubprogram()->getFilename(), getFunctionLoc(F),
1392           Twine(Used) + " of " + Twine(Total) + " available profile records (" +
1393               Twine(Coverage) + "%) were applied",
1394           DS_Warning));
1395     }
1396   }
1397 
1398   if (SampleProfileSampleCoverage) {
1399     uint64_t Used = CoverageTracker.getTotalUsedSamples();
1400     uint64_t Total = CoverageTracker.countBodySamples(Samples);
1401     unsigned Coverage = CoverageTracker.computeCoverage(Used, Total);
1402     if (Coverage < SampleProfileSampleCoverage) {
1403       F.getContext().diagnose(DiagnosticInfoSampleProfile(
1404           F.getSubprogram()->getFilename(), getFunctionLoc(F),
1405           Twine(Used) + " of " + Twine(Total) + " available profile samples (" +
1406               Twine(Coverage) + "%) were applied",
1407           DS_Warning));
1408     }
1409   }
1410   return Changed;
1411 }
1412 
1413 char SampleProfileLoaderLegacyPass::ID = 0;
1414 INITIALIZE_PASS_BEGIN(SampleProfileLoaderLegacyPass, "sample-profile",
1415                       "Sample Profile loader", false, false)
1416 INITIALIZE_PASS_DEPENDENCY(AssumptionCacheTracker)
1417 INITIALIZE_PASS_END(SampleProfileLoaderLegacyPass, "sample-profile",
1418                     "Sample Profile loader", false, false)
1419 
1420 bool SampleProfileLoader::doInitialization(Module &M) {
1421   auto &Ctx = M.getContext();
1422   auto ReaderOrErr = SampleProfileReader::create(Filename, Ctx);
1423   if (std::error_code EC = ReaderOrErr.getError()) {
1424     std::string Msg = "Could not open profile: " + EC.message();
1425     Ctx.diagnose(DiagnosticInfoSampleProfile(Filename, Msg));
1426     return false;
1427   }
1428   Reader = std::move(ReaderOrErr.get());
1429   ProfileIsValid = (Reader->read() == sampleprof_error::success);
1430   return true;
1431 }
1432 
1433 ModulePass *llvm::createSampleProfileLoaderPass() {
1434   return new SampleProfileLoaderLegacyPass(SampleProfileFile);
1435 }
1436 
1437 ModulePass *llvm::createSampleProfileLoaderPass(StringRef Name) {
1438   return new SampleProfileLoaderLegacyPass(Name);
1439 }
1440 
1441 bool SampleProfileLoader::runOnModule(Module &M, ModuleAnalysisManager *AM) {
1442   if (!ProfileIsValid)
1443     return false;
1444 
1445   // Compute the total number of samples collected in this profile.
1446   for (const auto &I : Reader->getProfiles())
1447     TotalCollectedSamples += I.second.getTotalSamples();
1448 
1449   // Populate the symbol map.
1450   for (const auto &N_F : M.getValueSymbolTable()) {
1451     std::string OrigName = N_F.getKey();
1452     Function *F = dyn_cast<Function>(N_F.getValue());
1453     if (F == nullptr)
1454       continue;
1455     SymbolMap[OrigName] = F;
1456     auto pos = OrigName.find('.');
1457     if (pos != std::string::npos) {
1458       std::string NewName = OrigName.substr(0, pos);
1459       auto r = SymbolMap.insert(std::make_pair(NewName, F));
1460       // Failiing to insert means there is already an entry in SymbolMap,
1461       // thus there are multiple functions that are mapped to the same
1462       // stripped name. In this case of name conflicting, set the value
1463       // to nullptr to avoid confusion.
1464       if (!r.second)
1465         r.first->second = nullptr;
1466     }
1467   }
1468 
1469   bool retval = false;
1470   for (auto &F : M)
1471     if (!F.isDeclaration()) {
1472       clearFunctionData();
1473       retval |= runOnFunction(F, AM);
1474     }
1475   if (M.getProfileSummary() == nullptr)
1476     M.setProfileSummary(Reader->getSummary().getMD(M.getContext()));
1477   return retval;
1478 }
1479 
1480 bool SampleProfileLoaderLegacyPass::runOnModule(Module &M) {
1481   // FIXME: pass in AssumptionCache correctly for the new pass manager.
1482   SampleLoader.setACT(&getAnalysis<AssumptionCacheTracker>());
1483   return SampleLoader.runOnModule(M, nullptr);
1484 }
1485 
1486 bool SampleProfileLoader::runOnFunction(Function &F, ModuleAnalysisManager *AM) {
1487   F.setEntryCount(0);
1488   std::unique_ptr<OptimizationRemarkEmitter> OwnedORE;
1489   if (AM) {
1490     auto &FAM =
1491         AM->getResult<FunctionAnalysisManagerModuleProxy>(*F.getParent())
1492             .getManager();
1493     ORE = &FAM.getResult<OptimizationRemarkEmitterAnalysis>(F);
1494   } else {
1495     OwnedORE = make_unique<OptimizationRemarkEmitter>(&F);
1496     ORE = OwnedORE.get();
1497   }
1498   Samples = Reader->getSamplesFor(F);
1499   if (Samples && !Samples->empty())
1500     return emitAnnotations(F);
1501   return false;
1502 }
1503 
1504 PreservedAnalyses SampleProfileLoaderPass::run(Module &M,
1505                                                ModuleAnalysisManager &AM) {
1506 
1507   SampleProfileLoader SampleLoader(
1508       ProfileFileName.empty() ? SampleProfileFile : ProfileFileName);
1509 
1510   SampleLoader.doInitialization(M);
1511 
1512   if (!SampleLoader.runOnModule(M, &AM))
1513     return PreservedAnalyses::all();
1514 
1515   return PreservedAnalyses::none();
1516 }
1517