1 //===-- ProfileGenerator.cpp - Profile Generator ---------------*- C++ -*-===// 2 // 3 // Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. 4 // See https://llvm.org/LICENSE.txt for license information. 5 // SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception 6 // 7 //===----------------------------------------------------------------------===// 8 9 #include "ProfileGenerator.h" 10 #include "ErrorHandling.h" 11 #include "ProfiledBinary.h" 12 #include "llvm/ProfileData/ProfileCommon.h" 13 #include <float.h> 14 #include <unordered_set> 15 16 cl::opt<std::string> OutputFilename("output", cl::value_desc("output"), 17 cl::Required, 18 cl::desc("Output profile file")); 19 static cl::alias OutputA("o", cl::desc("Alias for --output"), 20 cl::aliasopt(OutputFilename)); 21 22 static cl::opt<SampleProfileFormat> OutputFormat( 23 "format", cl::desc("Format of output profile"), cl::init(SPF_Ext_Binary), 24 cl::values( 25 clEnumValN(SPF_Binary, "binary", "Binary encoding (default)"), 26 clEnumValN(SPF_Compact_Binary, "compbinary", "Compact binary encoding"), 27 clEnumValN(SPF_Ext_Binary, "extbinary", "Extensible binary encoding"), 28 clEnumValN(SPF_Text, "text", "Text encoding"), 29 clEnumValN(SPF_GCC, "gcc", 30 "GCC encoding (only meaningful for -sample)"))); 31 32 cl::opt<bool> UseMD5( 33 "use-md5", cl::init(false), cl::Hidden, 34 cl::desc("Use md5 to represent function names in the output profile (only " 35 "meaningful for -extbinary)")); 36 37 static cl::opt<bool> PopulateProfileSymbolList( 38 "populate-profile-symbol-list", cl::init(false), cl::Hidden, 39 cl::desc("Populate profile symbol list (only meaningful for -extbinary)")); 40 41 static cl::opt<bool> FillZeroForAllFuncs( 42 "fill-zero-for-all-funcs", cl::init(false), cl::Hidden, 43 cl::desc("Attribute all functions' range with zero count " 44 "even it's not hit by any samples.")); 45 46 static cl::opt<int32_t, true> RecursionCompression( 47 "compress-recursion", 48 cl::desc("Compressing recursion by deduplicating adjacent frame " 49 "sequences up to the specified size. -1 means no size limit."), 50 cl::Hidden, 51 cl::location(llvm::sampleprof::CSProfileGenerator::MaxCompressionSize)); 52 53 static cl::opt<bool> 54 TrimColdProfile("trim-cold-profile", cl::init(false), cl::ZeroOrMore, 55 cl::desc("If the total count of the profile is smaller " 56 "than threshold, it will be trimmed.")); 57 58 static cl::opt<bool> CSProfMergeColdContext( 59 "csprof-merge-cold-context", cl::init(true), cl::ZeroOrMore, 60 cl::desc("If the total count of context profile is smaller than " 61 "the threshold, it will be merged into context-less base " 62 "profile.")); 63 64 static cl::opt<uint32_t> CSProfMaxColdContextDepth( 65 "csprof-max-cold-context-depth", cl::init(1), cl::ZeroOrMore, 66 cl::desc("Keep the last K contexts while merging cold profile. 1 means the " 67 "context-less base profile")); 68 69 static cl::opt<int, true> CSProfMaxContextDepth( 70 "csprof-max-context-depth", cl::ZeroOrMore, 71 cl::desc("Keep the last K contexts while merging profile. -1 means no " 72 "depth limit."), 73 cl::location(llvm::sampleprof::CSProfileGenerator::MaxContextDepth)); 74 75 static cl::opt<double> HotFunctionDensityThreshold( 76 "hot-function-density-threshold", llvm::cl::init(1000), 77 llvm::cl::desc( 78 "specify density threshold for hot functions (default: 1000)"), 79 llvm::cl::Optional); 80 static cl::opt<bool> ShowDensity("show-density", llvm::cl::init(false), 81 llvm::cl::desc("show profile density details"), 82 llvm::cl::Optional); 83 84 static cl::opt<bool> UpdateTotalSamples( 85 "update-total-samples", llvm::cl::init(false), 86 llvm::cl::desc( 87 "Update total samples by accumulating all its body samples."), 88 llvm::cl::Optional); 89 90 extern cl::opt<int> ProfileSummaryCutoffHot; 91 92 static cl::opt<bool> GenCSNestedProfile( 93 "gen-cs-nested-profile", cl::Hidden, cl::init(false), 94 cl::desc("Generate nested function profiles for CSSPGO")); 95 96 using namespace llvm; 97 using namespace sampleprof; 98 99 namespace llvm { 100 namespace sampleprof { 101 102 // Initialize the MaxCompressionSize to -1 which means no size limit 103 int32_t CSProfileGenerator::MaxCompressionSize = -1; 104 105 int CSProfileGenerator::MaxContextDepth = -1; 106 107 bool ProfileGeneratorBase::UseFSDiscriminator = false; 108 109 std::unique_ptr<ProfileGeneratorBase> 110 ProfileGeneratorBase::create(ProfiledBinary *Binary, 111 const ContextSampleCounterMap &SampleCounters, 112 bool ProfileIsCSFlat) { 113 std::unique_ptr<ProfileGeneratorBase> Generator; 114 if (ProfileIsCSFlat) { 115 if (Binary->useFSDiscriminator()) 116 exitWithError("FS discriminator is not supported in CS profile."); 117 Generator.reset(new CSProfileGenerator(Binary, SampleCounters)); 118 } else { 119 Generator.reset(new ProfileGenerator(Binary, SampleCounters)); 120 } 121 ProfileGeneratorBase::UseFSDiscriminator = Binary->useFSDiscriminator(); 122 FunctionSamples::ProfileIsFS = Binary->useFSDiscriminator(); 123 124 return Generator; 125 } 126 127 void ProfileGeneratorBase::write(std::unique_ptr<SampleProfileWriter> Writer, 128 SampleProfileMap &ProfileMap) { 129 // Populate profile symbol list if extended binary format is used. 130 ProfileSymbolList SymbolList; 131 132 if (PopulateProfileSymbolList && OutputFormat == SPF_Ext_Binary) { 133 Binary->populateSymbolListFromDWARF(SymbolList); 134 Writer->setProfileSymbolList(&SymbolList); 135 } 136 137 if (std::error_code EC = Writer->write(ProfileMap)) 138 exitWithError(std::move(EC)); 139 } 140 141 void ProfileGeneratorBase::write() { 142 auto WriterOrErr = SampleProfileWriter::create(OutputFilename, OutputFormat); 143 if (std::error_code EC = WriterOrErr.getError()) 144 exitWithError(EC, OutputFilename); 145 146 if (UseMD5) { 147 if (OutputFormat != SPF_Ext_Binary) 148 WithColor::warning() << "-use-md5 is ignored. Specify " 149 "--format=extbinary to enable it\n"; 150 else 151 WriterOrErr.get()->setUseMD5(); 152 } 153 154 write(std::move(WriterOrErr.get()), ProfileMap); 155 } 156 157 void ProfileGeneratorBase::showDensitySuggestion(double Density) { 158 if (Density == 0.0) 159 WithColor::warning() << "The --profile-summary-cutoff-hot option may be " 160 "set too low. Please check your command.\n"; 161 else if (Density < HotFunctionDensityThreshold) 162 WithColor::warning() 163 << "AutoFDO is estimated to optimize better with " 164 << format("%.1f", HotFunctionDensityThreshold / Density) 165 << "x more samples. Please consider increasing sampling rate or " 166 "profiling for longer duration to get more samples.\n"; 167 168 if (ShowDensity) 169 outs() << "Minimum profile density for hot functions with top " 170 << format("%.2f", 171 static_cast<double>(ProfileSummaryCutoffHot.getValue()) / 172 10000) 173 << "% total samples: " << format("%.1f", Density) << "\n"; 174 } 175 176 double ProfileGeneratorBase::calculateDensity(const SampleProfileMap &Profiles, 177 uint64_t HotCntThreshold) { 178 double Density = DBL_MAX; 179 std::vector<const FunctionSamples *> HotFuncs; 180 for (auto &I : Profiles) { 181 auto &FuncSamples = I.second; 182 if (FuncSamples.getTotalSamples() < HotCntThreshold) 183 continue; 184 HotFuncs.emplace_back(&FuncSamples); 185 } 186 187 for (auto *FuncSamples : HotFuncs) { 188 auto *Func = Binary->getBinaryFunction(FuncSamples->getName()); 189 if (!Func) 190 continue; 191 uint64_t FuncSize = Func->getFuncSize(); 192 if (FuncSize == 0) 193 continue; 194 Density = 195 std::min(Density, static_cast<double>(FuncSamples->getTotalSamples()) / 196 FuncSize); 197 } 198 199 return Density == DBL_MAX ? 0.0 : Density; 200 } 201 202 void ProfileGeneratorBase::findDisjointRanges(RangeSample &DisjointRanges, 203 const RangeSample &Ranges) { 204 205 /* 206 Regions may overlap with each other. Using the boundary info, find all 207 disjoint ranges and their sample count. BoundaryPoint contains the count 208 multiple samples begin/end at this points. 209 210 |<--100-->| Sample1 211 |<------200------>| Sample2 212 A B C 213 214 In the example above, 215 Sample1 begins at A, ends at B, its value is 100. 216 Sample2 beings at A, ends at C, its value is 200. 217 For A, BeginCount is the sum of sample begins at A, which is 300 and no 218 samples ends at A, so EndCount is 0. 219 Then boundary points A, B, and C with begin/end counts are: 220 A: (300, 0) 221 B: (0, 100) 222 C: (0, 200) 223 */ 224 struct BoundaryPoint { 225 // Sum of sample counts beginning at this point 226 uint64_t BeginCount = UINT64_MAX; 227 // Sum of sample counts ending at this point 228 uint64_t EndCount = UINT64_MAX; 229 // Is the begin point of a zero range. 230 bool IsZeroRangeBegin = false; 231 // Is the end point of a zero range. 232 bool IsZeroRangeEnd = false; 233 234 void addBeginCount(uint64_t Count) { 235 if (BeginCount == UINT64_MAX) 236 BeginCount = 0; 237 BeginCount += Count; 238 } 239 240 void addEndCount(uint64_t Count) { 241 if (EndCount == UINT64_MAX) 242 EndCount = 0; 243 EndCount += Count; 244 } 245 }; 246 247 /* 248 For the above example. With boundary points, follwing logic finds two 249 disjoint region of 250 251 [A,B]: 300 252 [B+1,C]: 200 253 254 If there is a boundary point that both begin and end, the point itself 255 becomes a separate disjoint region. For example, if we have original 256 ranges of 257 258 |<--- 100 --->| 259 |<--- 200 --->| 260 A B C 261 262 there are three boundary points with their begin/end counts of 263 264 A: (100, 0) 265 B: (200, 100) 266 C: (0, 200) 267 268 the disjoint ranges would be 269 270 [A, B-1]: 100 271 [B, B]: 300 272 [B+1, C]: 200. 273 274 Example for zero value range: 275 276 |<--- 100 --->| 277 |<--- 200 --->| 278 |<--------------- 0 ----------------->| 279 A B C D E F 280 281 [A, B-1] : 0 282 [B, C] : 100 283 [C+1, D-1]: 0 284 [D, E] : 200 285 [E+1, F] : 0 286 */ 287 std::map<uint64_t, BoundaryPoint> Boundaries; 288 289 for (const auto &Item : Ranges) { 290 assert(Item.first.first <= Item.first.second && 291 "Invalid instruction range"); 292 auto &BeginPoint = Boundaries[Item.first.first]; 293 auto &EndPoint = Boundaries[Item.first.second]; 294 uint64_t Count = Item.second; 295 296 BeginPoint.addBeginCount(Count); 297 EndPoint.addEndCount(Count); 298 if (Count == 0) { 299 BeginPoint.IsZeroRangeBegin = true; 300 EndPoint.IsZeroRangeEnd = true; 301 } 302 } 303 304 // Use UINT64_MAX to indicate there is no existing range between BeginAddress 305 // and the next valid address 306 uint64_t BeginAddress = UINT64_MAX; 307 int ZeroRangeDepth = 0; 308 uint64_t Count = 0; 309 for (const auto &Item : Boundaries) { 310 uint64_t Address = Item.first; 311 const BoundaryPoint &Point = Item.second; 312 if (Point.BeginCount != UINT64_MAX) { 313 if (BeginAddress != UINT64_MAX) 314 DisjointRanges[{BeginAddress, Address - 1}] = Count; 315 Count += Point.BeginCount; 316 BeginAddress = Address; 317 ZeroRangeDepth += Point.IsZeroRangeBegin; 318 } 319 if (Point.EndCount != UINT64_MAX) { 320 assert((BeginAddress != UINT64_MAX) && 321 "First boundary point cannot be 'end' point"); 322 DisjointRanges[{BeginAddress, Address}] = Count; 323 assert(Count >= Point.EndCount && "Mismatched live ranges"); 324 Count -= Point.EndCount; 325 BeginAddress = Address + 1; 326 ZeroRangeDepth -= Point.IsZeroRangeEnd; 327 // If the remaining count is zero and it's no longer in a zero range, this 328 // means we consume all the ranges before, thus mark BeginAddress as 329 // UINT64_MAX. e.g. supposing we have two non-overlapping ranges: 330 // [<---- 10 ---->] 331 // [<---- 20 ---->] 332 // A B C D 333 // The BeginAddress(B+1) will reset to invalid(UINT64_MAX), so we won't 334 // have the [B+1, C-1] zero range. 335 if (Count == 0 && ZeroRangeDepth == 0) 336 BeginAddress = UINT64_MAX; 337 } 338 } 339 } 340 341 void ProfileGeneratorBase::updateBodySamplesforFunctionProfile( 342 FunctionSamples &FunctionProfile, const SampleContextFrame &LeafLoc, 343 uint64_t Count) { 344 // Use the maximum count of samples with same line location 345 uint32_t Discriminator = getBaseDiscriminator(LeafLoc.Location.Discriminator); 346 347 // Use duplication factor to compensated for loop unroll/vectorization. 348 // Note that this is only needed when we're taking MAX of the counts at 349 // the location instead of SUM. 350 Count *= getDuplicationFactor(LeafLoc.Location.Discriminator); 351 352 ErrorOr<uint64_t> R = 353 FunctionProfile.findSamplesAt(LeafLoc.Location.LineOffset, Discriminator); 354 355 uint64_t PreviousCount = R ? R.get() : 0; 356 if (PreviousCount <= Count) { 357 FunctionProfile.addBodySamples(LeafLoc.Location.LineOffset, Discriminator, 358 Count - PreviousCount); 359 } 360 } 361 362 void ProfileGeneratorBase::updateTotalSamples() { 363 if (!UpdateTotalSamples) 364 return; 365 366 for (auto &Item : ProfileMap) { 367 FunctionSamples &FunctionProfile = Item.second; 368 FunctionProfile.updateTotalSamples(); 369 } 370 } 371 372 FunctionSamples & 373 ProfileGenerator::getTopLevelFunctionProfile(StringRef FuncName) { 374 SampleContext Context(FuncName); 375 auto Ret = ProfileMap.emplace(Context, FunctionSamples()); 376 if (Ret.second) { 377 FunctionSamples &FProfile = Ret.first->second; 378 FProfile.setContext(Context); 379 } 380 return Ret.first->second; 381 } 382 383 void ProfileGenerator::generateProfile() { 384 if (Binary->usePseudoProbes()) { 385 // TODO: Support probe based profile generation 386 exitWithError("Probe based profile generation not supported for AutoFDO, " 387 "consider dropping `--ignore-stack-samples` or adding `--use-dwarf-correlation`."); 388 } else { 389 generateLineNumBasedProfile(); 390 } 391 postProcessProfiles(); 392 } 393 394 void ProfileGenerator::postProcessProfiles() { 395 computeSummaryAndThreshold(); 396 trimColdProfiles(ProfileMap, ColdCountThreshold); 397 calculateAndShowDensity(ProfileMap); 398 } 399 400 void ProfileGenerator::trimColdProfiles(const SampleProfileMap &Profiles, 401 uint64_t ColdCntThreshold) { 402 if (!TrimColdProfile) 403 return; 404 405 // Move cold profiles into a tmp container. 406 std::vector<SampleContext> ColdProfiles; 407 for (const auto &I : ProfileMap) { 408 if (I.second.getTotalSamples() < ColdCntThreshold) 409 ColdProfiles.emplace_back(I.first); 410 } 411 412 // Remove the cold profile from ProfileMap. 413 for (const auto &I : ColdProfiles) 414 ProfileMap.erase(I); 415 } 416 417 void ProfileGenerator::generateLineNumBasedProfile() { 418 assert(SampleCounters.size() == 1 && 419 "Must have one entry for profile generation."); 420 const SampleCounter &SC = SampleCounters.begin()->second; 421 // Fill in function body samples 422 populateBodySamplesForAllFunctions(SC.RangeCounter); 423 // Fill in boundary sample counts as well as call site samples for calls 424 populateBoundarySamplesForAllFunctions(SC.BranchCounter); 425 426 updateTotalSamples(); 427 } 428 429 FunctionSamples &ProfileGenerator::getLeafProfileAndAddTotalSamples( 430 const SampleContextFrameVector &FrameVec, uint64_t Count) { 431 // Get top level profile 432 FunctionSamples *FunctionProfile = 433 &getTopLevelFunctionProfile(FrameVec[0].FuncName); 434 FunctionProfile->addTotalSamples(Count); 435 436 for (size_t I = 1; I < FrameVec.size(); I++) { 437 LineLocation Callsite( 438 FrameVec[I - 1].Location.LineOffset, 439 getBaseDiscriminator(FrameVec[I - 1].Location.Discriminator)); 440 FunctionSamplesMap &SamplesMap = 441 FunctionProfile->functionSamplesAt(Callsite); 442 auto Ret = 443 SamplesMap.emplace(FrameVec[I].FuncName.str(), FunctionSamples()); 444 if (Ret.second) { 445 SampleContext Context(FrameVec[I].FuncName); 446 Ret.first->second.setContext(Context); 447 } 448 FunctionProfile = &Ret.first->second; 449 FunctionProfile->addTotalSamples(Count); 450 } 451 452 return *FunctionProfile; 453 } 454 455 RangeSample 456 ProfileGenerator::preprocessRangeCounter(const RangeSample &RangeCounter) { 457 RangeSample Ranges(RangeCounter.begin(), RangeCounter.end()); 458 if (FillZeroForAllFuncs) { 459 for (auto &FuncI : Binary->getAllBinaryFunctions()) { 460 for (auto &R : FuncI.second.Ranges) { 461 Ranges[{R.first, R.second - 1}] += 0; 462 } 463 } 464 } else { 465 // For each range, we search for all ranges of the function it belongs to 466 // and initialize it with zero count, so it remains zero if doesn't hit any 467 // samples. This is to be consistent with compiler that interpret zero count 468 // as unexecuted(cold). 469 for (const auto &I : RangeCounter) { 470 uint64_t StartOffset = I.first.first; 471 for (const auto &Range : Binary->getRangesForOffset(StartOffset)) 472 Ranges[{Range.first, Range.second - 1}] += 0; 473 } 474 } 475 RangeSample DisjointRanges; 476 findDisjointRanges(DisjointRanges, Ranges); 477 return DisjointRanges; 478 } 479 480 void ProfileGenerator::populateBodySamplesForAllFunctions( 481 const RangeSample &RangeCounter) { 482 for (const auto &Range : preprocessRangeCounter(RangeCounter)) { 483 uint64_t RangeBegin = Binary->offsetToVirtualAddr(Range.first.first); 484 uint64_t RangeEnd = Binary->offsetToVirtualAddr(Range.first.second); 485 uint64_t Count = Range.second; 486 487 InstructionPointer IP(Binary, RangeBegin, true); 488 // Disjoint ranges may have range in the middle of two instr, 489 // e.g. If Instr1 at Addr1, and Instr2 at Addr2, disjoint range 490 // can be Addr1+1 to Addr2-1. We should ignore such range. 491 if (IP.Address > RangeEnd) 492 continue; 493 494 do { 495 uint64_t Offset = Binary->virtualAddrToOffset(IP.Address); 496 const SampleContextFrameVector &FrameVec = 497 Binary->getFrameLocationStack(Offset); 498 if (!FrameVec.empty()) { 499 // FIXME: As accumulating total count per instruction caused some 500 // regression, we changed to accumulate total count per byte as a 501 // workaround. Tuning hotness threshold on the compiler side might be 502 // necessary in the future. 503 FunctionSamples &FunctionProfile = getLeafProfileAndAddTotalSamples( 504 FrameVec, Count * Binary->getInstSize(Offset)); 505 updateBodySamplesforFunctionProfile(FunctionProfile, FrameVec.back(), 506 Count); 507 } 508 } while (IP.advance() && IP.Address <= RangeEnd); 509 } 510 } 511 512 StringRef ProfileGeneratorBase::getCalleeNameForOffset(uint64_t TargetOffset) { 513 // Get the function range by branch target if it's a call branch. 514 auto *FRange = Binary->findFuncRangeForStartOffset(TargetOffset); 515 516 // We won't accumulate sample count for a range whose start is not the real 517 // function entry such as outlined function or inner labels. 518 if (!FRange || !FRange->IsFuncEntry) 519 return StringRef(); 520 521 return FunctionSamples::getCanonicalFnName(FRange->getFuncName()); 522 } 523 524 void ProfileGenerator::populateBoundarySamplesForAllFunctions( 525 const BranchSample &BranchCounters) { 526 for (const auto &Entry : BranchCounters) { 527 uint64_t SourceOffset = Entry.first.first; 528 uint64_t TargetOffset = Entry.first.second; 529 uint64_t Count = Entry.second; 530 assert(Count != 0 && "Unexpected zero weight branch"); 531 532 StringRef CalleeName = getCalleeNameForOffset(TargetOffset); 533 if (CalleeName.size() == 0) 534 continue; 535 // Record called target sample and its count. 536 const SampleContextFrameVector &FrameVec = 537 Binary->getFrameLocationStack(SourceOffset); 538 if (!FrameVec.empty()) { 539 FunctionSamples &FunctionProfile = 540 getLeafProfileAndAddTotalSamples(FrameVec, 0); 541 FunctionProfile.addCalledTargetSamples( 542 FrameVec.back().Location.LineOffset, 543 getBaseDiscriminator(FrameVec.back().Location.Discriminator), 544 CalleeName, Count); 545 } 546 // Add head samples for callee. 547 FunctionSamples &CalleeProfile = getTopLevelFunctionProfile(CalleeName); 548 CalleeProfile.addHeadSamples(Count); 549 } 550 } 551 552 void ProfileGeneratorBase::calculateAndShowDensity( 553 const SampleProfileMap &Profiles) { 554 double Density = calculateDensity(Profiles, HotCountThreshold); 555 showDensitySuggestion(Density); 556 } 557 558 FunctionSamples &CSProfileGenerator::getFunctionProfileForContext( 559 const SampleContextFrameVector &Context, bool WasLeafInlined) { 560 auto I = ProfileMap.find(SampleContext(Context)); 561 if (I == ProfileMap.end()) { 562 // Save the new context for future references. 563 SampleContextFrames NewContext = *Contexts.insert(Context).first; 564 SampleContext FContext(NewContext, RawContext); 565 auto Ret = ProfileMap.emplace(FContext, FunctionSamples()); 566 if (WasLeafInlined) 567 FContext.setAttribute(ContextWasInlined); 568 FunctionSamples &FProfile = Ret.first->second; 569 FProfile.setContext(FContext); 570 return Ret.first->second; 571 } 572 return I->second; 573 } 574 575 void CSProfileGenerator::generateProfile() { 576 FunctionSamples::ProfileIsCSFlat = true; 577 578 if (Binary->getTrackFuncContextSize()) 579 computeSizeForProfiledFunctions(); 580 581 if (Binary->usePseudoProbes()) { 582 // Enable pseudo probe functionalities in SampleProf 583 FunctionSamples::ProfileIsProbeBased = true; 584 generateProbeBasedProfile(); 585 } else { 586 generateLineNumBasedProfile(); 587 } 588 postProcessProfiles(); 589 } 590 591 void CSProfileGenerator::computeSizeForProfiledFunctions() { 592 // Hash map to deduplicate the function range and the item is a pair of 593 // function start and end offset. 594 std::unordered_map<uint64_t, uint64_t> AggregatedRanges; 595 // Go through all the ranges in the CS counters, use the start of the range to 596 // look up the function it belongs and record the function range. 597 for (const auto &CI : SampleCounters) { 598 for (const auto &Item : CI.second.RangeCounter) { 599 // FIXME: Filter the bogus crossing function range. 600 uint64_t StartOffset = Item.first.first; 601 // Note that a function can be spilt into multiple ranges, so get all 602 // ranges of the function. 603 for (const auto &Range : Binary->getRangesForOffset(StartOffset)) 604 AggregatedRanges[Range.first] = Range.second; 605 } 606 } 607 608 for (const auto &I : AggregatedRanges) { 609 uint64_t StartOffset = I.first; 610 uint64_t EndOffset = I.second; 611 Binary->computeInlinedContextSizeForRange(StartOffset, EndOffset); 612 } 613 } 614 615 void CSProfileGenerator::generateLineNumBasedProfile() { 616 for (const auto &CI : SampleCounters) { 617 const auto *CtxKey = cast<StringBasedCtxKey>(CI.first.getPtr()); 618 619 // Get or create function profile for the range 620 FunctionSamples &FunctionProfile = 621 getFunctionProfileForContext(CtxKey->Context, CtxKey->WasLeafInlined); 622 623 // Fill in function body samples 624 populateBodySamplesForFunction(FunctionProfile, CI.second.RangeCounter); 625 // Fill in boundary sample counts as well as call site samples for calls 626 populateBoundarySamplesForFunction(CtxKey->Context, FunctionProfile, 627 CI.second.BranchCounter); 628 } 629 // Fill in call site value sample for inlined calls and also use context to 630 // infer missing samples. Since we don't have call count for inlined 631 // functions, we estimate it from inlinee's profile using the entry of the 632 // body sample. 633 populateInferredFunctionSamples(); 634 635 updateTotalSamples(); 636 } 637 638 void CSProfileGenerator::populateBodySamplesForFunction( 639 FunctionSamples &FunctionProfile, const RangeSample &RangeCounter) { 640 // Compute disjoint ranges first, so we can use MAX 641 // for calculating count for each location. 642 RangeSample Ranges; 643 findDisjointRanges(Ranges, RangeCounter); 644 for (const auto &Range : Ranges) { 645 uint64_t RangeBegin = Binary->offsetToVirtualAddr(Range.first.first); 646 uint64_t RangeEnd = Binary->offsetToVirtualAddr(Range.first.second); 647 uint64_t Count = Range.second; 648 // Disjoint ranges have introduce zero-filled gap that 649 // doesn't belong to current context, filter them out. 650 if (Count == 0) 651 continue; 652 653 InstructionPointer IP(Binary, RangeBegin, true); 654 // Disjoint ranges may have range in the middle of two instr, 655 // e.g. If Instr1 at Addr1, and Instr2 at Addr2, disjoint range 656 // can be Addr1+1 to Addr2-1. We should ignore such range. 657 if (IP.Address > RangeEnd) 658 continue; 659 660 do { 661 uint64_t Offset = Binary->virtualAddrToOffset(IP.Address); 662 auto LeafLoc = Binary->getInlineLeafFrameLoc(Offset); 663 if (LeafLoc.hasValue()) { 664 // Recording body sample for this specific context 665 updateBodySamplesforFunctionProfile(FunctionProfile, *LeafLoc, Count); 666 FunctionProfile.addTotalSamples(Count); 667 } 668 } while (IP.advance() && IP.Address <= RangeEnd); 669 } 670 } 671 672 void CSProfileGenerator::populateBoundarySamplesForFunction( 673 SampleContextFrames ContextId, FunctionSamples &FunctionProfile, 674 const BranchSample &BranchCounters) { 675 676 for (const auto &Entry : BranchCounters) { 677 uint64_t SourceOffset = Entry.first.first; 678 uint64_t TargetOffset = Entry.first.second; 679 uint64_t Count = Entry.second; 680 assert(Count != 0 && "Unexpected zero weight branch"); 681 682 StringRef CalleeName = getCalleeNameForOffset(TargetOffset); 683 if (CalleeName.size() == 0) 684 continue; 685 686 // Record called target sample and its count 687 auto LeafLoc = Binary->getInlineLeafFrameLoc(SourceOffset); 688 if (!LeafLoc.hasValue()) 689 continue; 690 FunctionProfile.addCalledTargetSamples( 691 LeafLoc->Location.LineOffset, 692 getBaseDiscriminator(LeafLoc->Location.Discriminator), CalleeName, 693 Count); 694 695 // Record head sample for called target(callee) 696 SampleContextFrameVector CalleeCtx(ContextId.begin(), ContextId.end()); 697 assert(CalleeCtx.back().FuncName == LeafLoc->FuncName && 698 "Leaf function name doesn't match"); 699 CalleeCtx.back() = *LeafLoc; 700 CalleeCtx.emplace_back(CalleeName, LineLocation(0, 0)); 701 FunctionSamples &CalleeProfile = getFunctionProfileForContext(CalleeCtx); 702 CalleeProfile.addHeadSamples(Count); 703 } 704 } 705 706 static SampleContextFrame 707 getCallerContext(SampleContextFrames CalleeContext, 708 SampleContextFrameVector &CallerContext) { 709 assert(CalleeContext.size() > 1 && "Unexpected empty context"); 710 CalleeContext = CalleeContext.drop_back(); 711 CallerContext.assign(CalleeContext.begin(), CalleeContext.end()); 712 SampleContextFrame CallerFrame = CallerContext.back(); 713 CallerContext.back().Location = LineLocation(0, 0); 714 return CallerFrame; 715 } 716 717 void CSProfileGenerator::populateInferredFunctionSamples() { 718 for (const auto &Item : ProfileMap) { 719 const auto &CalleeContext = Item.first; 720 const FunctionSamples &CalleeProfile = Item.second; 721 722 // If we already have head sample counts, we must have value profile 723 // for call sites added already. Skip to avoid double counting. 724 if (CalleeProfile.getHeadSamples()) 725 continue; 726 // If we don't have context, nothing to do for caller's call site. 727 // This could happen for entry point function. 728 if (CalleeContext.isBaseContext()) 729 continue; 730 731 // Infer Caller's frame loc and context ID through string splitting 732 SampleContextFrameVector CallerContextId; 733 SampleContextFrame &&CallerLeafFrameLoc = 734 getCallerContext(CalleeContext.getContextFrames(), CallerContextId); 735 SampleContextFrames CallerContext(CallerContextId); 736 737 // It's possible that we haven't seen any sample directly in the caller, 738 // in which case CallerProfile will not exist. But we can't modify 739 // ProfileMap while iterating it. 740 // TODO: created function profile for those callers too 741 if (ProfileMap.find(CallerContext) == ProfileMap.end()) 742 continue; 743 FunctionSamples &CallerProfile = ProfileMap[CallerContext]; 744 745 // Since we don't have call count for inlined functions, we 746 // estimate it from inlinee's profile using entry body sample. 747 uint64_t EstimatedCallCount = CalleeProfile.getEntrySamples(); 748 // If we don't have samples with location, use 1 to indicate live. 749 if (!EstimatedCallCount && !CalleeProfile.getBodySamples().size()) 750 EstimatedCallCount = 1; 751 CallerProfile.addCalledTargetSamples( 752 CallerLeafFrameLoc.Location.LineOffset, 753 CallerLeafFrameLoc.Location.Discriminator, 754 CalleeProfile.getContext().getName(), EstimatedCallCount); 755 CallerProfile.addBodySamples(CallerLeafFrameLoc.Location.LineOffset, 756 CallerLeafFrameLoc.Location.Discriminator, 757 EstimatedCallCount); 758 CallerProfile.addTotalSamples(EstimatedCallCount); 759 } 760 } 761 762 void CSProfileGenerator::postProcessProfiles() { 763 // Compute hot/cold threshold based on profile. This will be used for cold 764 // context profile merging/trimming. 765 computeSummaryAndThreshold(); 766 767 // Run global pre-inliner to adjust/merge context profile based on estimated 768 // inline decisions. 769 if (EnableCSPreInliner) { 770 CSPreInliner(ProfileMap, *Binary, HotCountThreshold, ColdCountThreshold) 771 .run(); 772 // Turn off the profile merger by default unless it is explicitly enabled. 773 if (!CSProfMergeColdContext.getNumOccurrences()) 774 CSProfMergeColdContext = false; 775 } 776 777 // Trim and merge cold context profile using cold threshold above. 778 if (TrimColdProfile || CSProfMergeColdContext) { 779 SampleContextTrimmer(ProfileMap) 780 .trimAndMergeColdContextProfiles( 781 HotCountThreshold, TrimColdProfile, CSProfMergeColdContext, 782 CSProfMaxColdContextDepth, EnableCSPreInliner); 783 } 784 785 // Merge function samples of CS profile to calculate profile density. 786 sampleprof::SampleProfileMap ContextLessProfiles; 787 for (const auto &I : ProfileMap) { 788 ContextLessProfiles[I.second.getName()].merge(I.second); 789 } 790 791 calculateAndShowDensity(ContextLessProfiles); 792 if (GenCSNestedProfile) { 793 CSProfileConverter CSConverter(ProfileMap); 794 CSConverter.convertProfiles(); 795 FunctionSamples::ProfileIsCSFlat = false; 796 FunctionSamples::ProfileIsCSNested = EnableCSPreInliner; 797 } 798 } 799 800 void ProfileGeneratorBase::computeSummaryAndThreshold() { 801 SampleProfileSummaryBuilder Builder(ProfileSummaryBuilder::DefaultCutoffs); 802 auto Summary = Builder.computeSummaryForProfiles(ProfileMap); 803 HotCountThreshold = ProfileSummaryBuilder::getHotCountThreshold( 804 (Summary->getDetailedSummary())); 805 ColdCountThreshold = ProfileSummaryBuilder::getColdCountThreshold( 806 (Summary->getDetailedSummary())); 807 } 808 809 // Helper function to extract context prefix string stack 810 // Extract context stack for reusing, leaf context stack will 811 // be added compressed while looking up function profile 812 static void extractPrefixContextStack( 813 SampleContextFrameVector &ContextStack, 814 const SmallVectorImpl<const MCDecodedPseudoProbe *> &Probes, 815 ProfiledBinary *Binary) { 816 for (const auto *P : Probes) { 817 Binary->getInlineContextForProbe(P, ContextStack, true); 818 } 819 } 820 821 void CSProfileGenerator::generateProbeBasedProfile() { 822 for (const auto &CI : SampleCounters) { 823 const ProbeBasedCtxKey *CtxKey = 824 dyn_cast<ProbeBasedCtxKey>(CI.first.getPtr()); 825 SampleContextFrameVector ContextStack; 826 extractPrefixContextStack(ContextStack, CtxKey->Probes, Binary); 827 // Fill in function body samples from probes, also infer caller's samples 828 // from callee's probe 829 populateBodySamplesWithProbes(CI.second.RangeCounter, ContextStack); 830 // Fill in boundary samples for a call probe 831 populateBoundarySamplesWithProbes(CI.second.BranchCounter, ContextStack); 832 } 833 } 834 835 void CSProfileGenerator::extractProbesFromRange(const RangeSample &RangeCounter, 836 ProbeCounterMap &ProbeCounter) { 837 RangeSample Ranges; 838 findDisjointRanges(Ranges, RangeCounter); 839 for (const auto &Range : Ranges) { 840 uint64_t RangeBegin = Binary->offsetToVirtualAddr(Range.first.first); 841 uint64_t RangeEnd = Binary->offsetToVirtualAddr(Range.first.second); 842 uint64_t Count = Range.second; 843 // Disjoint ranges have introduce zero-filled gap that 844 // doesn't belong to current context, filter them out. 845 if (Count == 0) 846 continue; 847 848 InstructionPointer IP(Binary, RangeBegin, true); 849 // Disjoint ranges may have range in the middle of two instr, 850 // e.g. If Instr1 at Addr1, and Instr2 at Addr2, disjoint range 851 // can be Addr1+1 to Addr2-1. We should ignore such range. 852 if (IP.Address > RangeEnd) 853 continue; 854 855 do { 856 const AddressProbesMap &Address2ProbesMap = 857 Binary->getAddress2ProbesMap(); 858 auto It = Address2ProbesMap.find(IP.Address); 859 if (It != Address2ProbesMap.end()) { 860 for (const auto &Probe : It->second) { 861 if (!Probe.isBlock()) 862 continue; 863 ProbeCounter[&Probe] += Count; 864 } 865 } 866 } while (IP.advance() && IP.Address <= RangeEnd); 867 } 868 } 869 870 void CSProfileGenerator::populateBodySamplesWithProbes( 871 const RangeSample &RangeCounter, SampleContextFrames ContextStack) { 872 ProbeCounterMap ProbeCounter; 873 // Extract the top frame probes by looking up each address among the range in 874 // the Address2ProbeMap 875 extractProbesFromRange(RangeCounter, ProbeCounter); 876 std::unordered_map<MCDecodedPseudoProbeInlineTree *, 877 std::unordered_set<FunctionSamples *>> 878 FrameSamples; 879 for (const auto &PI : ProbeCounter) { 880 const MCDecodedPseudoProbe *Probe = PI.first; 881 uint64_t Count = PI.second; 882 FunctionSamples &FunctionProfile = 883 getFunctionProfileForLeafProbe(ContextStack, Probe); 884 // Record the current frame and FunctionProfile whenever samples are 885 // collected for non-danglie probes. This is for reporting all of the 886 // zero count probes of the frame later. 887 FrameSamples[Probe->getInlineTreeNode()].insert(&FunctionProfile); 888 FunctionProfile.addBodySamplesForProbe(Probe->getIndex(), Count); 889 FunctionProfile.addTotalSamples(Count); 890 if (Probe->isEntry()) { 891 FunctionProfile.addHeadSamples(Count); 892 // Look up for the caller's function profile 893 const auto *InlinerDesc = Binary->getInlinerDescForProbe(Probe); 894 SampleContextFrames CalleeContextId = 895 FunctionProfile.getContext().getContextFrames(); 896 if (InlinerDesc != nullptr && CalleeContextId.size() > 1) { 897 // Since the context id will be compressed, we have to use callee's 898 // context id to infer caller's context id to ensure they share the 899 // same context prefix. 900 SampleContextFrameVector CallerContextId; 901 SampleContextFrame &&CallerLeafFrameLoc = 902 getCallerContext(CalleeContextId, CallerContextId); 903 uint64_t CallerIndex = CallerLeafFrameLoc.Location.LineOffset; 904 assert(CallerIndex && 905 "Inferred caller's location index shouldn't be zero!"); 906 FunctionSamples &CallerProfile = 907 getFunctionProfileForContext(CallerContextId); 908 CallerProfile.setFunctionHash(InlinerDesc->FuncHash); 909 CallerProfile.addBodySamples(CallerIndex, 0, Count); 910 CallerProfile.addTotalSamples(Count); 911 CallerProfile.addCalledTargetSamples( 912 CallerIndex, 0, FunctionProfile.getContext().getName(), Count); 913 } 914 } 915 } 916 917 // Assign zero count for remaining probes without sample hits to 918 // differentiate from probes optimized away, of which the counts are unknown 919 // and will be inferred by the compiler. 920 for (auto &I : FrameSamples) { 921 for (auto *FunctionProfile : I.second) { 922 for (auto *Probe : I.first->getProbes()) { 923 FunctionProfile->addBodySamplesForProbe(Probe->getIndex(), 0); 924 } 925 } 926 } 927 } 928 929 void CSProfileGenerator::populateBoundarySamplesWithProbes( 930 const BranchSample &BranchCounter, SampleContextFrames ContextStack) { 931 for (const auto &BI : BranchCounter) { 932 uint64_t SourceOffset = BI.first.first; 933 uint64_t TargetOffset = BI.first.second; 934 uint64_t Count = BI.second; 935 uint64_t SourceAddress = Binary->offsetToVirtualAddr(SourceOffset); 936 const MCDecodedPseudoProbe *CallProbe = 937 Binary->getCallProbeForAddr(SourceAddress); 938 if (CallProbe == nullptr) 939 continue; 940 FunctionSamples &FunctionProfile = 941 getFunctionProfileForLeafProbe(ContextStack, CallProbe); 942 FunctionProfile.addBodySamples(CallProbe->getIndex(), 0, Count); 943 FunctionProfile.addTotalSamples(Count); 944 StringRef CalleeName = getCalleeNameForOffset(TargetOffset); 945 if (CalleeName.size() == 0) 946 continue; 947 FunctionProfile.addCalledTargetSamples(CallProbe->getIndex(), 0, CalleeName, 948 Count); 949 } 950 } 951 952 FunctionSamples &CSProfileGenerator::getFunctionProfileForLeafProbe( 953 SampleContextFrames ContextStack, const MCDecodedPseudoProbe *LeafProbe) { 954 955 // Explicitly copy the context for appending the leaf context 956 SampleContextFrameVector NewContextStack(ContextStack.begin(), 957 ContextStack.end()); 958 Binary->getInlineContextForProbe(LeafProbe, NewContextStack, true); 959 // For leaf inlined context with the top frame, we should strip off the top 960 // frame's probe id, like: 961 // Inlined stack: [foo:1, bar:2], the ContextId will be "foo:1 @ bar" 962 auto LeafFrame = NewContextStack.back(); 963 LeafFrame.Location = LineLocation(0, 0); 964 NewContextStack.pop_back(); 965 // Compress the context string except for the leaf frame 966 CSProfileGenerator::compressRecursionContext(NewContextStack); 967 CSProfileGenerator::trimContext(NewContextStack); 968 NewContextStack.push_back(LeafFrame); 969 970 const auto *FuncDesc = Binary->getFuncDescForGUID(LeafProbe->getGuid()); 971 bool WasLeafInlined = LeafProbe->getInlineTreeNode()->hasInlineSite(); 972 FunctionSamples &FunctionProile = 973 getFunctionProfileForContext(NewContextStack, WasLeafInlined); 974 FunctionProile.setFunctionHash(FuncDesc->FuncHash); 975 return FunctionProile; 976 } 977 978 } // end namespace sampleprof 979 } // end namespace llvm 980