1 //===-- ProfileGenerator.cpp - Profile Generator  ---------------*- C++ -*-===//
2 //
3 // Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
4 // See https://llvm.org/LICENSE.txt for license information.
5 // SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
6 //
7 //===----------------------------------------------------------------------===//
8 #include "ProfileGenerator.h"
9 #include "ErrorHandling.h"
10 #include "ProfiledBinary.h"
11 #include "llvm/DebugInfo/Symbolize/SymbolizableModule.h"
12 #include "llvm/ProfileData/ProfileCommon.h"
13 #include <algorithm>
14 #include <float.h>
15 #include <unordered_set>
16 
17 cl::opt<std::string> OutputFilename("output", cl::value_desc("output"),
18                                     cl::Required,
19                                     cl::desc("Output profile file"));
20 static cl::alias OutputA("o", cl::desc("Alias for --output"),
21                          cl::aliasopt(OutputFilename));
22 
23 static cl::opt<SampleProfileFormat> OutputFormat(
24     "format", cl::desc("Format of output profile"), cl::init(SPF_Ext_Binary),
25     cl::values(
26         clEnumValN(SPF_Binary, "binary", "Binary encoding (default)"),
27         clEnumValN(SPF_Compact_Binary, "compbinary", "Compact binary encoding"),
28         clEnumValN(SPF_Ext_Binary, "extbinary", "Extensible binary encoding"),
29         clEnumValN(SPF_Text, "text", "Text encoding"),
30         clEnumValN(SPF_GCC, "gcc",
31                    "GCC encoding (only meaningful for -sample)")));
32 
33 cl::opt<bool> UseMD5(
34     "use-md5", cl::init(false), cl::Hidden,
35     cl::desc("Use md5 to represent function names in the output profile (only "
36              "meaningful for -extbinary)"));
37 
38 static cl::opt<bool> PopulateProfileSymbolList(
39     "populate-profile-symbol-list", cl::init(false), cl::Hidden,
40     cl::desc("Populate profile symbol list (only meaningful for -extbinary)"));
41 
42 static cl::opt<bool> FillZeroForAllFuncs(
43     "fill-zero-for-all-funcs", cl::init(false), cl::Hidden,
44     cl::desc("Attribute all functions' range with zero count "
45              "even it's not hit by any samples."));
46 
47 static cl::opt<int32_t, true> RecursionCompression(
48     "compress-recursion",
49     cl::desc("Compressing recursion by deduplicating adjacent frame "
50              "sequences up to the specified size. -1 means no size limit."),
51     cl::Hidden,
52     cl::location(llvm::sampleprof::CSProfileGenerator::MaxCompressionSize));
53 
54 static cl::opt<bool>
55     TrimColdProfile("trim-cold-profile", cl::init(false), cl::ZeroOrMore,
56                     cl::desc("If the total count of the profile is smaller "
57                              "than threshold, it will be trimmed."));
58 
59 static cl::opt<bool> CSProfMergeColdContext(
60     "csprof-merge-cold-context", cl::init(true), cl::ZeroOrMore,
61     cl::desc("If the total count of context profile is smaller than "
62              "the threshold, it will be merged into context-less base "
63              "profile."));
64 
65 static cl::opt<uint32_t> CSProfMaxColdContextDepth(
66     "csprof-max-cold-context-depth", cl::init(1), cl::ZeroOrMore,
67     cl::desc("Keep the last K contexts while merging cold profile. 1 means the "
68              "context-less base profile"));
69 
70 static cl::opt<int, true> CSProfMaxContextDepth(
71     "csprof-max-context-depth", cl::ZeroOrMore,
72     cl::desc("Keep the last K contexts while merging profile. -1 means no "
73              "depth limit."),
74     cl::location(llvm::sampleprof::CSProfileGenerator::MaxContextDepth));
75 
76 static cl::opt<double> HotFunctionDensityThreshold(
77     "hot-function-density-threshold", llvm::cl::init(1000),
78     llvm::cl::desc(
79         "specify density threshold for hot functions (default: 1000)"),
80     llvm::cl::Optional);
81 static cl::opt<bool> ShowDensity("show-density", llvm::cl::init(false),
82                                  llvm::cl::desc("show profile density details"),
83                                  llvm::cl::Optional);
84 
85 static cl::opt<bool> UpdateTotalSamples(
86     "update-total-samples", llvm::cl::init(false),
87     llvm::cl::desc(
88         "Update total samples by accumulating all its body samples."),
89     llvm::cl::Optional);
90 
91 extern cl::opt<int> ProfileSummaryCutoffHot;
92 
93 static cl::opt<bool> GenCSNestedProfile(
94     "gen-cs-nested-profile", cl::Hidden, cl::init(true),
95     cl::desc("Generate nested function profiles for CSSPGO"));
96 
97 using namespace llvm;
98 using namespace sampleprof;
99 
100 namespace llvm {
101 namespace sampleprof {
102 
103 // Initialize the MaxCompressionSize to -1 which means no size limit
104 int32_t CSProfileGenerator::MaxCompressionSize = -1;
105 
106 int CSProfileGenerator::MaxContextDepth = -1;
107 
108 bool ProfileGeneratorBase::UseFSDiscriminator = false;
109 
110 std::unique_ptr<ProfileGeneratorBase>
111 ProfileGeneratorBase::create(ProfiledBinary *Binary,
112                              const ContextSampleCounterMap &SampleCounters,
113                              bool ProfileIsCSFlat) {
114   std::unique_ptr<ProfileGeneratorBase> Generator;
115   if (ProfileIsCSFlat) {
116     if (Binary->useFSDiscriminator())
117       exitWithError("FS discriminator is not supported in CS profile.");
118     Generator.reset(new CSProfileGenerator(Binary, SampleCounters));
119   } else {
120     Generator.reset(new ProfileGenerator(Binary, SampleCounters));
121   }
122   ProfileGeneratorBase::UseFSDiscriminator = Binary->useFSDiscriminator();
123   FunctionSamples::ProfileIsFS = Binary->useFSDiscriminator();
124 
125   return Generator;
126 }
127 
128 void ProfileGeneratorBase::write(std::unique_ptr<SampleProfileWriter> Writer,
129                                  SampleProfileMap &ProfileMap) {
130   // Populate profile symbol list if extended binary format is used.
131   ProfileSymbolList SymbolList;
132 
133   if (PopulateProfileSymbolList && OutputFormat == SPF_Ext_Binary) {
134     Binary->populateSymbolListFromDWARF(SymbolList);
135     Writer->setProfileSymbolList(&SymbolList);
136   }
137 
138   if (std::error_code EC = Writer->write(ProfileMap))
139     exitWithError(std::move(EC));
140 }
141 
142 void ProfileGeneratorBase::write() {
143   auto WriterOrErr = SampleProfileWriter::create(OutputFilename, OutputFormat);
144   if (std::error_code EC = WriterOrErr.getError())
145     exitWithError(EC, OutputFilename);
146 
147   if (UseMD5) {
148     if (OutputFormat != SPF_Ext_Binary)
149       WithColor::warning() << "-use-md5 is ignored. Specify "
150                               "--format=extbinary to enable it\n";
151     else
152       WriterOrErr.get()->setUseMD5();
153   }
154 
155   write(std::move(WriterOrErr.get()), ProfileMap);
156 }
157 
158 void ProfileGeneratorBase::showDensitySuggestion(double Density) {
159   if (Density == 0.0)
160     WithColor::warning() << "The --profile-summary-cutoff-hot option may be "
161                             "set too low. Please check your command.\n";
162   else if (Density < HotFunctionDensityThreshold)
163     WithColor::warning()
164         << "AutoFDO is estimated to optimize better with "
165         << format("%.1f", HotFunctionDensityThreshold / Density)
166         << "x more samples. Please consider increasing sampling rate or "
167            "profiling for longer duration to get more samples.\n";
168 
169   if (ShowDensity)
170     outs() << "Minimum profile density for hot functions with top "
171            << format("%.2f",
172                      static_cast<double>(ProfileSummaryCutoffHot.getValue()) /
173                          10000)
174            << "% total samples: " << format("%.1f", Density) << "\n";
175 }
176 
177 double ProfileGeneratorBase::calculateDensity(const SampleProfileMap &Profiles,
178                                               uint64_t HotCntThreshold) {
179   double Density = DBL_MAX;
180   std::vector<const FunctionSamples *> HotFuncs;
181   for (auto &I : Profiles) {
182     auto &FuncSamples = I.second;
183     if (FuncSamples.getTotalSamples() < HotCntThreshold)
184       continue;
185     HotFuncs.emplace_back(&FuncSamples);
186   }
187 
188   for (auto *FuncSamples : HotFuncs) {
189     auto *Func = Binary->getBinaryFunction(FuncSamples->getName());
190     if (!Func)
191       continue;
192     uint64_t FuncSize = Func->getFuncSize();
193     if (FuncSize == 0)
194       continue;
195     Density =
196         std::min(Density, static_cast<double>(FuncSamples->getTotalSamples()) /
197                               FuncSize);
198   }
199 
200   return Density == DBL_MAX ? 0.0 : Density;
201 }
202 
203 void ProfileGeneratorBase::findDisjointRanges(RangeSample &DisjointRanges,
204                                               const RangeSample &Ranges) {
205 
206   /*
207   Regions may overlap with each other. Using the boundary info, find all
208   disjoint ranges and their sample count. BoundaryPoint contains the count
209   multiple samples begin/end at this points.
210 
211   |<--100-->|           Sample1
212   |<------200------>|   Sample2
213   A         B       C
214 
215   In the example above,
216   Sample1 begins at A, ends at B, its value is 100.
217   Sample2 beings at A, ends at C, its value is 200.
218   For A, BeginCount is the sum of sample begins at A, which is 300 and no
219   samples ends at A, so EndCount is 0.
220   Then boundary points A, B, and C with begin/end counts are:
221   A: (300, 0)
222   B: (0, 100)
223   C: (0, 200)
224   */
225   struct BoundaryPoint {
226     // Sum of sample counts beginning at this point
227     uint64_t BeginCount = UINT64_MAX;
228     // Sum of sample counts ending at this point
229     uint64_t EndCount = UINT64_MAX;
230     // Is the begin point of a zero range.
231     bool IsZeroRangeBegin = false;
232     // Is the end point of a zero range.
233     bool IsZeroRangeEnd = false;
234 
235     void addBeginCount(uint64_t Count) {
236       if (BeginCount == UINT64_MAX)
237         BeginCount = 0;
238       BeginCount += Count;
239     }
240 
241     void addEndCount(uint64_t Count) {
242       if (EndCount == UINT64_MAX)
243         EndCount = 0;
244       EndCount += Count;
245     }
246   };
247 
248   /*
249   For the above example. With boundary points, follwing logic finds two
250   disjoint region of
251 
252   [A,B]:   300
253   [B+1,C]: 200
254 
255   If there is a boundary point that both begin and end, the point itself
256   becomes a separate disjoint region. For example, if we have original
257   ranges of
258 
259   |<--- 100 --->|
260                 |<--- 200 --->|
261   A             B             C
262 
263   there are three boundary points with their begin/end counts of
264 
265   A: (100, 0)
266   B: (200, 100)
267   C: (0, 200)
268 
269   the disjoint ranges would be
270 
271   [A, B-1]: 100
272   [B, B]:   300
273   [B+1, C]: 200.
274 
275   Example for zero value range:
276 
277     |<--- 100 --->|
278                        |<--- 200 --->|
279   |<---------------  0 ----------------->|
280   A  B            C    D             E   F
281 
282   [A, B-1]  : 0
283   [B, C]    : 100
284   [C+1, D-1]: 0
285   [D, E]    : 200
286   [E+1, F]  : 0
287   */
288   std::map<uint64_t, BoundaryPoint> Boundaries;
289 
290   for (const auto &Item : Ranges) {
291     assert(Item.first.first <= Item.first.second &&
292            "Invalid instruction range");
293     auto &BeginPoint = Boundaries[Item.first.first];
294     auto &EndPoint = Boundaries[Item.first.second];
295     uint64_t Count = Item.second;
296 
297     BeginPoint.addBeginCount(Count);
298     EndPoint.addEndCount(Count);
299     if (Count == 0) {
300       BeginPoint.IsZeroRangeBegin = true;
301       EndPoint.IsZeroRangeEnd = true;
302     }
303   }
304 
305   // Use UINT64_MAX to indicate there is no existing range between BeginAddress
306   // and the next valid address
307   uint64_t BeginAddress = UINT64_MAX;
308   int ZeroRangeDepth = 0;
309   uint64_t Count = 0;
310   for (const auto &Item : Boundaries) {
311     uint64_t Address = Item.first;
312     const BoundaryPoint &Point = Item.second;
313     if (Point.BeginCount != UINT64_MAX) {
314       if (BeginAddress != UINT64_MAX)
315         DisjointRanges[{BeginAddress, Address - 1}] = Count;
316       Count += Point.BeginCount;
317       BeginAddress = Address;
318       ZeroRangeDepth += Point.IsZeroRangeBegin;
319     }
320     if (Point.EndCount != UINT64_MAX) {
321       assert((BeginAddress != UINT64_MAX) &&
322              "First boundary point cannot be 'end' point");
323       DisjointRanges[{BeginAddress, Address}] = Count;
324       assert(Count >= Point.EndCount && "Mismatched live ranges");
325       Count -= Point.EndCount;
326       BeginAddress = Address + 1;
327       ZeroRangeDepth -= Point.IsZeroRangeEnd;
328       // If the remaining count is zero and it's no longer in a zero range, this
329       // means we consume all the ranges before, thus mark BeginAddress as
330       // UINT64_MAX. e.g. supposing we have two non-overlapping ranges:
331       //  [<---- 10 ---->]
332       //                       [<---- 20 ---->]
333       //   A             B     C              D
334       // The BeginAddress(B+1) will reset to invalid(UINT64_MAX), so we won't
335       // have the [B+1, C-1] zero range.
336       if (Count == 0 && ZeroRangeDepth == 0)
337         BeginAddress = UINT64_MAX;
338     }
339   }
340 }
341 
342 void ProfileGeneratorBase::updateBodySamplesforFunctionProfile(
343     FunctionSamples &FunctionProfile, const SampleContextFrame &LeafLoc,
344     uint64_t Count) {
345   // Use the maximum count of samples with same line location
346   uint32_t Discriminator = getBaseDiscriminator(LeafLoc.Location.Discriminator);
347 
348   // Use duplication factor to compensated for loop unroll/vectorization.
349   // Note that this is only needed when we're taking MAX of the counts at
350   // the location instead of SUM.
351   Count *= getDuplicationFactor(LeafLoc.Location.Discriminator);
352 
353   ErrorOr<uint64_t> R =
354       FunctionProfile.findSamplesAt(LeafLoc.Location.LineOffset, Discriminator);
355 
356   uint64_t PreviousCount = R ? R.get() : 0;
357   if (PreviousCount <= Count) {
358     FunctionProfile.addBodySamples(LeafLoc.Location.LineOffset, Discriminator,
359                                    Count - PreviousCount);
360   }
361 }
362 
363 void ProfileGeneratorBase::updateTotalSamples() {
364   if (!UpdateTotalSamples)
365     return;
366 
367   for (auto &Item : ProfileMap) {
368     FunctionSamples &FunctionProfile = Item.second;
369     FunctionProfile.updateTotalSamples();
370   }
371 }
372 
373 void ProfileGeneratorBase::collectProfiledFunctions() {
374   std::unordered_set<const BinaryFunction *> ProfiledFunctions;
375   // Go through all the stacks, ranges and branches in sample counters, use the
376   // start of the range to look up the function it belongs and record the
377   // function.
378   for (const auto &CI : SampleCounters) {
379     if (const auto *CtxKey = dyn_cast<AddrBasedCtxKey>(CI.first.getPtr())) {
380       for (auto Addr : CtxKey->Context) {
381         if (FuncRange *FRange = Binary->findFuncRangeForOffset(
382                 Binary->virtualAddrToOffset(Addr)))
383           ProfiledFunctions.insert(FRange->Func);
384       }
385     }
386 
387     for (auto Item : CI.second.RangeCounter) {
388       uint64_t StartOffset = Item.first.first;
389       if (FuncRange *FRange = Binary->findFuncRangeForOffset(StartOffset))
390         ProfiledFunctions.insert(FRange->Func);
391     }
392 
393     for (auto Item : CI.second.BranchCounter) {
394       uint64_t SourceOffset = Item.first.first;
395       uint64_t TargetOffset = Item.first.first;
396       if (FuncRange *FRange = Binary->findFuncRangeForOffset(SourceOffset))
397         ProfiledFunctions.insert(FRange->Func);
398       if (FuncRange *FRange = Binary->findFuncRangeForOffset(TargetOffset))
399         ProfiledFunctions.insert(FRange->Func);
400     }
401   }
402 
403   Binary->setProfiledFunctions(ProfiledFunctions);
404 }
405 
406 FunctionSamples &
407 ProfileGenerator::getTopLevelFunctionProfile(StringRef FuncName) {
408   SampleContext Context(FuncName);
409   auto Ret = ProfileMap.emplace(Context, FunctionSamples());
410   if (Ret.second) {
411     FunctionSamples &FProfile = Ret.first->second;
412     FProfile.setContext(Context);
413   }
414   return Ret.first->second;
415 }
416 
417 void ProfileGenerator::generateProfile() {
418   collectProfiledFunctions();
419   if (Binary->usePseudoProbes()) {
420     generateProbeBasedProfile();
421   } else {
422     generateLineNumBasedProfile();
423   }
424   postProcessProfiles();
425 }
426 
427 void ProfileGenerator::postProcessProfiles() {
428   computeSummaryAndThreshold();
429   trimColdProfiles(ProfileMap, ColdCountThreshold);
430   calculateAndShowDensity(ProfileMap);
431 }
432 
433 void ProfileGenerator::trimColdProfiles(const SampleProfileMap &Profiles,
434                                         uint64_t ColdCntThreshold) {
435   if (!TrimColdProfile)
436     return;
437 
438   // Move cold profiles into a tmp container.
439   std::vector<SampleContext> ColdProfiles;
440   for (const auto &I : ProfileMap) {
441     if (I.second.getTotalSamples() < ColdCntThreshold)
442       ColdProfiles.emplace_back(I.first);
443   }
444 
445   // Remove the cold profile from ProfileMap.
446   for (const auto &I : ColdProfiles)
447     ProfileMap.erase(I);
448 }
449 
450 void ProfileGenerator::generateLineNumBasedProfile() {
451   assert(SampleCounters.size() == 1 &&
452          "Must have one entry for profile generation.");
453   const SampleCounter &SC = SampleCounters.begin()->second;
454   // Fill in function body samples
455   populateBodySamplesForAllFunctions(SC.RangeCounter);
456   // Fill in boundary sample counts as well as call site samples for calls
457   populateBoundarySamplesForAllFunctions(SC.BranchCounter);
458 
459   updateTotalSamples();
460 }
461 
462 void ProfileGenerator::generateProbeBasedProfile() {
463   assert(SampleCounters.size() == 1 &&
464          "Must have one entry for profile generation.");
465   Binary->decodePseudoProbe();
466   // Enable pseudo probe functionalities in SampleProf
467   FunctionSamples::ProfileIsProbeBased = true;
468   const SampleCounter &SC = SampleCounters.begin()->second;
469   // Fill in function body samples
470   populateBodySamplesWithProbesForAllFunctions(SC.RangeCounter);
471   // Fill in boundary sample counts as well as call site samples for calls
472   populateBoundarySamplesWithProbesForAllFunctions(SC.BranchCounter);
473 
474   updateTotalSamples();
475 }
476 
477 void ProfileGenerator::populateBodySamplesWithProbesForAllFunctions(
478     const RangeSample &RangeCounter) {
479   ProbeCounterMap ProbeCounter;
480   // preprocessRangeCounter returns disjoint ranges, so no longer to redo it
481   // inside extractProbesFromRange.
482   extractProbesFromRange(preprocessRangeCounter(RangeCounter), ProbeCounter,
483                          false);
484 
485   for (const auto &PI : ProbeCounter) {
486     const MCDecodedPseudoProbe *Probe = PI.first;
487     uint64_t Count = PI.second;
488     SampleContextFrameVector FrameVec;
489     Binary->getInlineContextForProbe(Probe, FrameVec, true);
490     FunctionSamples &FunctionProfile =
491         getLeafProfileAndAddTotalSamples(FrameVec, Count);
492     FunctionProfile.addBodySamplesForProbe(Probe->getIndex(), Count);
493     if (Probe->isEntry())
494       FunctionProfile.addHeadSamples(Count);
495   }
496 }
497 
498 void ProfileGenerator::populateBoundarySamplesWithProbesForAllFunctions(
499     const BranchSample &BranchCounters) {
500   for (const auto &Entry : BranchCounters) {
501     uint64_t SourceOffset = Entry.first.first;
502     uint64_t TargetOffset = Entry.first.second;
503     uint64_t Count = Entry.second;
504     assert(Count != 0 && "Unexpected zero weight branch");
505 
506     StringRef CalleeName = getCalleeNameForOffset(TargetOffset);
507     if (CalleeName.size() == 0)
508       continue;
509 
510     uint64_t SourceAddress = Binary->offsetToVirtualAddr(SourceOffset);
511     const MCDecodedPseudoProbe *CallProbe =
512         Binary->getCallProbeForAddr(SourceAddress);
513     if (CallProbe == nullptr)
514       continue;
515 
516     // Record called target sample and its count.
517     SampleContextFrameVector FrameVec;
518     Binary->getInlineContextForProbe(CallProbe, FrameVec, true);
519 
520     if (!FrameVec.empty()) {
521       FunctionSamples &FunctionProfile =
522           getLeafProfileAndAddTotalSamples(FrameVec, 0);
523       FunctionProfile.addCalledTargetSamples(
524           FrameVec.back().Location.LineOffset, 0, CalleeName, Count);
525     }
526   }
527 }
528 
529 FunctionSamples &ProfileGenerator::getLeafProfileAndAddTotalSamples(
530     const SampleContextFrameVector &FrameVec, uint64_t Count) {
531   // Get top level profile
532   FunctionSamples *FunctionProfile =
533       &getTopLevelFunctionProfile(FrameVec[0].FuncName);
534   FunctionProfile->addTotalSamples(Count);
535   if (Binary->usePseudoProbes()) {
536     const auto *FuncDesc = Binary->getFuncDescForGUID(
537         Function::getGUID(FunctionProfile->getName()));
538     FunctionProfile->setFunctionHash(FuncDesc->FuncHash);
539   }
540 
541   for (size_t I = 1; I < FrameVec.size(); I++) {
542     LineLocation Callsite(
543         FrameVec[I - 1].Location.LineOffset,
544         getBaseDiscriminator(FrameVec[I - 1].Location.Discriminator));
545     FunctionSamplesMap &SamplesMap =
546         FunctionProfile->functionSamplesAt(Callsite);
547     auto Ret =
548         SamplesMap.emplace(FrameVec[I].FuncName.str(), FunctionSamples());
549     if (Ret.second) {
550       SampleContext Context(FrameVec[I].FuncName);
551       Ret.first->second.setContext(Context);
552     }
553     FunctionProfile = &Ret.first->second;
554     FunctionProfile->addTotalSamples(Count);
555     if (Binary->usePseudoProbes()) {
556       const auto *FuncDesc = Binary->getFuncDescForGUID(
557           Function::getGUID(FunctionProfile->getName()));
558       FunctionProfile->setFunctionHash(FuncDesc->FuncHash);
559     }
560   }
561 
562   return *FunctionProfile;
563 }
564 
565 RangeSample
566 ProfileGenerator::preprocessRangeCounter(const RangeSample &RangeCounter) {
567   RangeSample Ranges(RangeCounter.begin(), RangeCounter.end());
568   if (FillZeroForAllFuncs) {
569     for (auto &FuncI : Binary->getAllBinaryFunctions()) {
570       for (auto &R : FuncI.second.Ranges) {
571         Ranges[{R.first, R.second - 1}] += 0;
572       }
573     }
574   } else {
575     // For each range, we search for all ranges of the function it belongs to
576     // and initialize it with zero count, so it remains zero if doesn't hit any
577     // samples. This is to be consistent with compiler that interpret zero count
578     // as unexecuted(cold).
579     for (const auto &I : RangeCounter) {
580       uint64_t StartOffset = I.first.first;
581       for (const auto &Range : Binary->getRangesForOffset(StartOffset))
582         Ranges[{Range.first, Range.second - 1}] += 0;
583     }
584   }
585   RangeSample DisjointRanges;
586   findDisjointRanges(DisjointRanges, Ranges);
587   return DisjointRanges;
588 }
589 
590 void ProfileGenerator::populateBodySamplesForAllFunctions(
591     const RangeSample &RangeCounter) {
592   for (const auto &Range : preprocessRangeCounter(RangeCounter)) {
593     uint64_t RangeBegin = Binary->offsetToVirtualAddr(Range.first.first);
594     uint64_t RangeEnd = Binary->offsetToVirtualAddr(Range.first.second);
595     uint64_t Count = Range.second;
596 
597     InstructionPointer IP(Binary, RangeBegin, true);
598     // Disjoint ranges may have range in the middle of two instr,
599     // e.g. If Instr1 at Addr1, and Instr2 at Addr2, disjoint range
600     // can be Addr1+1 to Addr2-1. We should ignore such range.
601     if (IP.Address > RangeEnd)
602       continue;
603 
604     do {
605       uint64_t Offset = Binary->virtualAddrToOffset(IP.Address);
606       const SampleContextFrameVector &FrameVec =
607           Binary->getFrameLocationStack(Offset);
608       if (!FrameVec.empty()) {
609         // FIXME: As accumulating total count per instruction caused some
610         // regression, we changed to accumulate total count per byte as a
611         // workaround. Tuning hotness threshold on the compiler side might be
612         // necessary in the future.
613         FunctionSamples &FunctionProfile = getLeafProfileAndAddTotalSamples(
614             FrameVec, Count * Binary->getInstSize(Offset));
615         updateBodySamplesforFunctionProfile(FunctionProfile, FrameVec.back(),
616                                             Count);
617       }
618     } while (IP.advance() && IP.Address <= RangeEnd);
619   }
620 }
621 
622 StringRef ProfileGeneratorBase::getCalleeNameForOffset(uint64_t TargetOffset) {
623   // Get the function range by branch target if it's a call branch.
624   auto *FRange = Binary->findFuncRangeForStartOffset(TargetOffset);
625 
626   // We won't accumulate sample count for a range whose start is not the real
627   // function entry such as outlined function or inner labels.
628   if (!FRange || !FRange->IsFuncEntry)
629     return StringRef();
630 
631   return FunctionSamples::getCanonicalFnName(FRange->getFuncName());
632 }
633 
634 void ProfileGenerator::populateBoundarySamplesForAllFunctions(
635     const BranchSample &BranchCounters) {
636   for (const auto &Entry : BranchCounters) {
637     uint64_t SourceOffset = Entry.first.first;
638     uint64_t TargetOffset = Entry.first.second;
639     uint64_t Count = Entry.second;
640     assert(Count != 0 && "Unexpected zero weight branch");
641 
642     StringRef CalleeName = getCalleeNameForOffset(TargetOffset);
643     if (CalleeName.size() == 0)
644       continue;
645     // Record called target sample and its count.
646     const SampleContextFrameVector &FrameVec =
647         Binary->getFrameLocationStack(SourceOffset);
648     if (!FrameVec.empty()) {
649       FunctionSamples &FunctionProfile =
650           getLeafProfileAndAddTotalSamples(FrameVec, 0);
651       FunctionProfile.addCalledTargetSamples(
652           FrameVec.back().Location.LineOffset,
653           getBaseDiscriminator(FrameVec.back().Location.Discriminator),
654           CalleeName, Count);
655     }
656     // Add head samples for callee.
657     FunctionSamples &CalleeProfile = getTopLevelFunctionProfile(CalleeName);
658     CalleeProfile.addHeadSamples(Count);
659   }
660 }
661 
662 void ProfileGeneratorBase::calculateAndShowDensity(
663     const SampleProfileMap &Profiles) {
664   double Density = calculateDensity(Profiles, HotCountThreshold);
665   showDensitySuggestion(Density);
666 }
667 
668 FunctionSamples &CSProfileGenerator::getFunctionProfileForContext(
669     const SampleContextFrameVector &Context, bool WasLeafInlined) {
670   auto I = ProfileMap.find(SampleContext(Context));
671   if (I == ProfileMap.end()) {
672     // Save the new context for future references.
673     SampleContextFrames NewContext = *Contexts.insert(Context).first;
674     SampleContext FContext(NewContext, RawContext);
675     auto Ret = ProfileMap.emplace(FContext, FunctionSamples());
676     if (WasLeafInlined)
677       FContext.setAttribute(ContextWasInlined);
678     FunctionSamples &FProfile = Ret.first->second;
679     FProfile.setContext(FContext);
680     return Ret.first->second;
681   }
682   return I->second;
683 }
684 
685 void CSProfileGenerator::generateProfile() {
686   FunctionSamples::ProfileIsCSFlat = true;
687 
688   collectProfiledFunctions();
689 
690   if (Binary->usePseudoProbes()) {
691     generateProbeBasedProfile();
692   } else {
693     generateLineNumBasedProfile();
694   }
695 
696   if (Binary->getTrackFuncContextSize())
697     computeSizeForProfiledFunctions();
698 
699   postProcessProfiles();
700 }
701 
702 void CSProfileGenerator::computeSizeForProfiledFunctions() {
703   std::unordered_set<const BinaryFunction *> ProfiledFunctions;
704   for (auto *Func : Binary->getProfiledFunctions())
705     Binary->computeInlinedContextSizeForFunc(Func);
706 
707   // Flush the symbolizer to save memory.
708   Binary->flushSymbolizer();
709 }
710 
711 void CSProfileGenerator::generateLineNumBasedProfile() {
712   for (const auto &CI : SampleCounters) {
713     const auto *CtxKey = cast<StringBasedCtxKey>(CI.first.getPtr());
714 
715     // Get or create function profile for the range
716     FunctionSamples &FunctionProfile =
717         getFunctionProfileForContext(CtxKey->Context, CtxKey->WasLeafInlined);
718 
719     // Fill in function body samples
720     populateBodySamplesForFunction(FunctionProfile, CI.second.RangeCounter);
721     // Fill in boundary sample counts as well as call site samples for calls
722     populateBoundarySamplesForFunction(CtxKey->Context, FunctionProfile,
723                                        CI.second.BranchCounter);
724   }
725   // Fill in call site value sample for inlined calls and also use context to
726   // infer missing samples. Since we don't have call count for inlined
727   // functions, we estimate it from inlinee's profile using the entry of the
728   // body sample.
729   populateInferredFunctionSamples();
730 
731   updateTotalSamples();
732 }
733 
734 void CSProfileGenerator::populateBodySamplesForFunction(
735     FunctionSamples &FunctionProfile, const RangeSample &RangeCounter) {
736   // Compute disjoint ranges first, so we can use MAX
737   // for calculating count for each location.
738   RangeSample Ranges;
739   findDisjointRanges(Ranges, RangeCounter);
740   for (const auto &Range : Ranges) {
741     uint64_t RangeBegin = Binary->offsetToVirtualAddr(Range.first.first);
742     uint64_t RangeEnd = Binary->offsetToVirtualAddr(Range.first.second);
743     uint64_t Count = Range.second;
744     // Disjoint ranges have introduce zero-filled gap that
745     // doesn't belong to current context, filter them out.
746     if (Count == 0)
747       continue;
748 
749     InstructionPointer IP(Binary, RangeBegin, true);
750     // Disjoint ranges may have range in the middle of two instr,
751     // e.g. If Instr1 at Addr1, and Instr2 at Addr2, disjoint range
752     // can be Addr1+1 to Addr2-1. We should ignore such range.
753     if (IP.Address > RangeEnd)
754       continue;
755 
756     do {
757       uint64_t Offset = Binary->virtualAddrToOffset(IP.Address);
758       auto LeafLoc = Binary->getInlineLeafFrameLoc(Offset);
759       if (LeafLoc.hasValue()) {
760         // Recording body sample for this specific context
761         updateBodySamplesforFunctionProfile(FunctionProfile, *LeafLoc, Count);
762         FunctionProfile.addTotalSamples(Count);
763       }
764     } while (IP.advance() && IP.Address <= RangeEnd);
765   }
766 }
767 
768 void CSProfileGenerator::populateBoundarySamplesForFunction(
769     SampleContextFrames ContextId, FunctionSamples &FunctionProfile,
770     const BranchSample &BranchCounters) {
771 
772   for (const auto &Entry : BranchCounters) {
773     uint64_t SourceOffset = Entry.first.first;
774     uint64_t TargetOffset = Entry.first.second;
775     uint64_t Count = Entry.second;
776     assert(Count != 0 && "Unexpected zero weight branch");
777 
778     StringRef CalleeName = getCalleeNameForOffset(TargetOffset);
779     if (CalleeName.size() == 0)
780       continue;
781 
782     // Record called target sample and its count
783     auto LeafLoc = Binary->getInlineLeafFrameLoc(SourceOffset);
784     if (!LeafLoc.hasValue())
785       continue;
786     FunctionProfile.addCalledTargetSamples(
787         LeafLoc->Location.LineOffset,
788         getBaseDiscriminator(LeafLoc->Location.Discriminator), CalleeName,
789         Count);
790 
791     // Record head sample for called target(callee)
792     SampleContextFrameVector CalleeCtx(ContextId.begin(), ContextId.end());
793     assert(CalleeCtx.back().FuncName == LeafLoc->FuncName &&
794            "Leaf function name doesn't match");
795     CalleeCtx.back() = *LeafLoc;
796     CalleeCtx.emplace_back(CalleeName, LineLocation(0, 0));
797     FunctionSamples &CalleeProfile = getFunctionProfileForContext(CalleeCtx);
798     CalleeProfile.addHeadSamples(Count);
799   }
800 }
801 
802 static SampleContextFrame
803 getCallerContext(SampleContextFrames CalleeContext,
804                  SampleContextFrameVector &CallerContext) {
805   assert(CalleeContext.size() > 1 && "Unexpected empty context");
806   CalleeContext = CalleeContext.drop_back();
807   CallerContext.assign(CalleeContext.begin(), CalleeContext.end());
808   SampleContextFrame CallerFrame = CallerContext.back();
809   CallerContext.back().Location = LineLocation(0, 0);
810   return CallerFrame;
811 }
812 
813 void CSProfileGenerator::populateInferredFunctionSamples() {
814   for (const auto &Item : ProfileMap) {
815     const auto &CalleeContext = Item.first;
816     const FunctionSamples &CalleeProfile = Item.second;
817 
818     // If we already have head sample counts, we must have value profile
819     // for call sites added already. Skip to avoid double counting.
820     if (CalleeProfile.getHeadSamples())
821       continue;
822     // If we don't have context, nothing to do for caller's call site.
823     // This could happen for entry point function.
824     if (CalleeContext.isBaseContext())
825       continue;
826 
827     // Infer Caller's frame loc and context ID through string splitting
828     SampleContextFrameVector CallerContextId;
829     SampleContextFrame &&CallerLeafFrameLoc =
830         getCallerContext(CalleeContext.getContextFrames(), CallerContextId);
831     SampleContextFrames CallerContext(CallerContextId);
832 
833     // It's possible that we haven't seen any sample directly in the caller,
834     // in which case CallerProfile will not exist. But we can't modify
835     // ProfileMap while iterating it.
836     // TODO: created function profile for those callers too
837     if (ProfileMap.find(CallerContext) == ProfileMap.end())
838       continue;
839     FunctionSamples &CallerProfile = ProfileMap[CallerContext];
840 
841     // Since we don't have call count for inlined functions, we
842     // estimate it from inlinee's profile using entry body sample.
843     uint64_t EstimatedCallCount = CalleeProfile.getEntrySamples();
844     // If we don't have samples with location, use 1 to indicate live.
845     if (!EstimatedCallCount && !CalleeProfile.getBodySamples().size())
846       EstimatedCallCount = 1;
847     CallerProfile.addCalledTargetSamples(
848         CallerLeafFrameLoc.Location.LineOffset,
849         CallerLeafFrameLoc.Location.Discriminator,
850         CalleeProfile.getContext().getName(), EstimatedCallCount);
851     CallerProfile.addBodySamples(CallerLeafFrameLoc.Location.LineOffset,
852                                  CallerLeafFrameLoc.Location.Discriminator,
853                                  EstimatedCallCount);
854     CallerProfile.addTotalSamples(EstimatedCallCount);
855   }
856 }
857 
858 void CSProfileGenerator::postProcessProfiles() {
859   // Compute hot/cold threshold based on profile. This will be used for cold
860   // context profile merging/trimming.
861   computeSummaryAndThreshold();
862 
863   // Run global pre-inliner to adjust/merge context profile based on estimated
864   // inline decisions.
865   if (EnableCSPreInliner) {
866     CSPreInliner(ProfileMap, *Binary, HotCountThreshold, ColdCountThreshold)
867         .run();
868     // Turn off the profile merger by default unless it is explicitly enabled.
869     if (!CSProfMergeColdContext.getNumOccurrences())
870       CSProfMergeColdContext = false;
871   }
872 
873   // Trim and merge cold context profile using cold threshold above.
874   if (TrimColdProfile || CSProfMergeColdContext) {
875     SampleContextTrimmer(ProfileMap)
876         .trimAndMergeColdContextProfiles(
877             HotCountThreshold, TrimColdProfile, CSProfMergeColdContext,
878             CSProfMaxColdContextDepth, EnableCSPreInliner);
879   }
880 
881   // Merge function samples of CS profile to calculate profile density.
882   sampleprof::SampleProfileMap ContextLessProfiles;
883   for (const auto &I : ProfileMap) {
884     ContextLessProfiles[I.second.getName()].merge(I.second);
885   }
886 
887   calculateAndShowDensity(ContextLessProfiles);
888   if (GenCSNestedProfile) {
889     CSProfileConverter CSConverter(ProfileMap);
890     CSConverter.convertProfiles();
891     FunctionSamples::ProfileIsCSFlat = false;
892     FunctionSamples::ProfileIsCSNested = EnableCSPreInliner;
893   }
894 }
895 
896 void ProfileGeneratorBase::computeSummaryAndThreshold() {
897   SampleProfileSummaryBuilder Builder(ProfileSummaryBuilder::DefaultCutoffs);
898   auto Summary = Builder.computeSummaryForProfiles(ProfileMap);
899   HotCountThreshold = ProfileSummaryBuilder::getHotCountThreshold(
900       (Summary->getDetailedSummary()));
901   ColdCountThreshold = ProfileSummaryBuilder::getColdCountThreshold(
902       (Summary->getDetailedSummary()));
903 }
904 
905 void ProfileGeneratorBase::extractProbesFromRange(
906     const RangeSample &RangeCounter, ProbeCounterMap &ProbeCounter,
907     bool FindDisjointRanges) {
908   const RangeSample *PRanges = &RangeCounter;
909   RangeSample Ranges;
910   if (FindDisjointRanges) {
911     findDisjointRanges(Ranges, RangeCounter);
912     PRanges = &Ranges;
913   }
914 
915   for (const auto &Range : *PRanges) {
916     uint64_t RangeBegin = Binary->offsetToVirtualAddr(Range.first.first);
917     uint64_t RangeEnd = Binary->offsetToVirtualAddr(Range.first.second);
918     uint64_t Count = Range.second;
919 
920     InstructionPointer IP(Binary, RangeBegin, true);
921     // Disjoint ranges may have range in the middle of two instr,
922     // e.g. If Instr1 at Addr1, and Instr2 at Addr2, disjoint range
923     // can be Addr1+1 to Addr2-1. We should ignore such range.
924     if (IP.Address > RangeEnd)
925       continue;
926 
927     do {
928       const AddressProbesMap &Address2ProbesMap =
929           Binary->getAddress2ProbesMap();
930       auto It = Address2ProbesMap.find(IP.Address);
931       if (It != Address2ProbesMap.end()) {
932         for (const auto &Probe : It->second) {
933           ProbeCounter[&Probe] += Count;
934         }
935       }
936     } while (IP.advance() && IP.Address <= RangeEnd);
937   }
938 }
939 
940 static void
941 extractPrefixContextStack(SampleContextFrameVector &ContextStack,
942                           const SmallVectorImpl<uint64_t> &Addresses,
943                           ProfiledBinary *Binary) {
944   SmallVector<const MCDecodedPseudoProbe *, 16> Probes;
945   for (auto Addr : reverse(Addresses)) {
946     const MCDecodedPseudoProbe *CallProbe = Binary->getCallProbeForAddr(Addr);
947     // These could be the cases when a probe is not found at a calliste. Cutting
948     // off the context from here since the inliner will not know how to consume
949     // a context with unknown callsites.
950     // 1. for functions that are not sampled when
951     // --decode-probe-for-profiled-functions-only is on.
952     // 2. for a merged callsite. Callsite merging may cause the loss of original
953     // probe IDs.
954     // 3. for an external callsite.
955     if (!CallProbe)
956       break;
957     Probes.push_back(CallProbe);
958   }
959 
960   std::reverse(Probes.begin(), Probes.end());
961 
962   // Extract context stack for reusing, leaf context stack will be added
963   // compressed while looking up function profile.
964   for (const auto *P : Probes) {
965     Binary->getInlineContextForProbe(P, ContextStack, true);
966   }
967 }
968 
969 void CSProfileGenerator::generateProbeBasedProfile() {
970   Binary->decodePseudoProbe();
971   // Enable pseudo probe functionalities in SampleProf
972   FunctionSamples::ProfileIsProbeBased = true;
973   for (const auto &CI : SampleCounters) {
974     const AddrBasedCtxKey *CtxKey =
975         dyn_cast<AddrBasedCtxKey>(CI.first.getPtr());
976     SampleContextFrameVector ContextStack;
977     extractPrefixContextStack(ContextStack, CtxKey->Context, Binary);
978     // Fill in function body samples from probes, also infer caller's samples
979     // from callee's probe
980     populateBodySamplesWithProbes(CI.second.RangeCounter, ContextStack);
981     // Fill in boundary samples for a call probe
982     populateBoundarySamplesWithProbes(CI.second.BranchCounter, ContextStack);
983   }
984 }
985 
986 void CSProfileGenerator::populateBodySamplesWithProbes(
987     const RangeSample &RangeCounter, SampleContextFrames ContextStack) {
988   ProbeCounterMap ProbeCounter;
989   // Extract the top frame probes by looking up each address among the range in
990   // the Address2ProbeMap
991   extractProbesFromRange(RangeCounter, ProbeCounter);
992   std::unordered_map<MCDecodedPseudoProbeInlineTree *,
993                      std::unordered_set<FunctionSamples *>>
994       FrameSamples;
995   for (const auto &PI : ProbeCounter) {
996     const MCDecodedPseudoProbe *Probe = PI.first;
997     uint64_t Count = PI.second;
998     // Disjoint ranges have introduce zero-filled gap that
999     // doesn't belong to current context, filter them out.
1000     if (!Probe->isBlock() || Count == 0)
1001       continue;
1002     FunctionSamples &FunctionProfile =
1003         getFunctionProfileForLeafProbe(ContextStack, Probe);
1004     // Record the current frame and FunctionProfile whenever samples are
1005     // collected for non-danglie probes. This is for reporting all of the
1006     // zero count probes of the frame later.
1007     FrameSamples[Probe->getInlineTreeNode()].insert(&FunctionProfile);
1008     FunctionProfile.addBodySamplesForProbe(Probe->getIndex(), Count);
1009     FunctionProfile.addTotalSamples(Count);
1010     if (Probe->isEntry()) {
1011       FunctionProfile.addHeadSamples(Count);
1012       // Look up for the caller's function profile
1013       const auto *InlinerDesc = Binary->getInlinerDescForProbe(Probe);
1014       SampleContextFrames CalleeContextId =
1015           FunctionProfile.getContext().getContextFrames();
1016       if (InlinerDesc != nullptr && CalleeContextId.size() > 1) {
1017         // Since the context id will be compressed, we have to use callee's
1018         // context id to infer caller's context id to ensure they share the
1019         // same context prefix.
1020         SampleContextFrameVector CallerContextId;
1021         SampleContextFrame &&CallerLeafFrameLoc =
1022             getCallerContext(CalleeContextId, CallerContextId);
1023         uint64_t CallerIndex = CallerLeafFrameLoc.Location.LineOffset;
1024         assert(CallerIndex &&
1025                "Inferred caller's location index shouldn't be zero!");
1026         FunctionSamples &CallerProfile =
1027             getFunctionProfileForContext(CallerContextId);
1028         CallerProfile.setFunctionHash(InlinerDesc->FuncHash);
1029         CallerProfile.addBodySamples(CallerIndex, 0, Count);
1030         CallerProfile.addTotalSamples(Count);
1031         CallerProfile.addCalledTargetSamples(
1032             CallerIndex, 0, FunctionProfile.getContext().getName(), Count);
1033       }
1034     }
1035   }
1036 
1037   // Assign zero count for remaining probes without sample hits to
1038   // differentiate from probes optimized away, of which the counts are unknown
1039   // and will be inferred by the compiler.
1040   for (auto &I : FrameSamples) {
1041     for (auto *FunctionProfile : I.second) {
1042       for (auto *Probe : I.first->getProbes()) {
1043         FunctionProfile->addBodySamplesForProbe(Probe->getIndex(), 0);
1044       }
1045     }
1046   }
1047 }
1048 
1049 void CSProfileGenerator::populateBoundarySamplesWithProbes(
1050     const BranchSample &BranchCounter, SampleContextFrames ContextStack) {
1051   for (const auto &BI : BranchCounter) {
1052     uint64_t SourceOffset = BI.first.first;
1053     uint64_t TargetOffset = BI.first.second;
1054     uint64_t Count = BI.second;
1055     uint64_t SourceAddress = Binary->offsetToVirtualAddr(SourceOffset);
1056     const MCDecodedPseudoProbe *CallProbe =
1057         Binary->getCallProbeForAddr(SourceAddress);
1058     if (CallProbe == nullptr)
1059       continue;
1060     FunctionSamples &FunctionProfile =
1061         getFunctionProfileForLeafProbe(ContextStack, CallProbe);
1062     FunctionProfile.addBodySamples(CallProbe->getIndex(), 0, Count);
1063     FunctionProfile.addTotalSamples(Count);
1064     StringRef CalleeName = getCalleeNameForOffset(TargetOffset);
1065     if (CalleeName.size() == 0)
1066       continue;
1067     FunctionProfile.addCalledTargetSamples(CallProbe->getIndex(), 0, CalleeName,
1068                                            Count);
1069   }
1070 }
1071 
1072 FunctionSamples &CSProfileGenerator::getFunctionProfileForLeafProbe(
1073     SampleContextFrames ContextStack, const MCDecodedPseudoProbe *LeafProbe) {
1074 
1075   // Explicitly copy the context for appending the leaf context
1076   SampleContextFrameVector NewContextStack(ContextStack.begin(),
1077                                            ContextStack.end());
1078   Binary->getInlineContextForProbe(LeafProbe, NewContextStack, true);
1079   // For leaf inlined context with the top frame, we should strip off the top
1080   // frame's probe id, like:
1081   // Inlined stack: [foo:1, bar:2], the ContextId will be "foo:1 @ bar"
1082   auto LeafFrame = NewContextStack.back();
1083   LeafFrame.Location = LineLocation(0, 0);
1084   NewContextStack.pop_back();
1085   // Compress the context string except for the leaf frame
1086   CSProfileGenerator::compressRecursionContext(NewContextStack);
1087   CSProfileGenerator::trimContext(NewContextStack);
1088   NewContextStack.push_back(LeafFrame);
1089 
1090   const auto *FuncDesc = Binary->getFuncDescForGUID(LeafProbe->getGuid());
1091   bool WasLeafInlined = LeafProbe->getInlineTreeNode()->hasInlineSite();
1092   FunctionSamples &FunctionProile =
1093       getFunctionProfileForContext(NewContextStack, WasLeafInlined);
1094   FunctionProile.setFunctionHash(FuncDesc->FuncHash);
1095   return FunctionProile;
1096 }
1097 
1098 } // end namespace sampleprof
1099 } // end namespace llvm
1100