1 //===-- ProfileGenerator.cpp - Profile Generator ---------------*- C++ -*-===// 2 // 3 // Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. 4 // See https://llvm.org/LICENSE.txt for license information. 5 // SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception 6 // 7 //===----------------------------------------------------------------------===// 8 #include "ProfileGenerator.h" 9 #include "ErrorHandling.h" 10 #include "ProfiledBinary.h" 11 #include "llvm/DebugInfo/Symbolize/SymbolizableModule.h" 12 #include "llvm/ProfileData/ProfileCommon.h" 13 #include <algorithm> 14 #include <float.h> 15 #include <unordered_set> 16 17 cl::opt<std::string> OutputFilename("output", cl::value_desc("output"), 18 cl::Required, 19 cl::desc("Output profile file")); 20 static cl::alias OutputA("o", cl::desc("Alias for --output"), 21 cl::aliasopt(OutputFilename)); 22 23 static cl::opt<SampleProfileFormat> OutputFormat( 24 "format", cl::desc("Format of output profile"), cl::init(SPF_Ext_Binary), 25 cl::values( 26 clEnumValN(SPF_Binary, "binary", "Binary encoding (default)"), 27 clEnumValN(SPF_Compact_Binary, "compbinary", "Compact binary encoding"), 28 clEnumValN(SPF_Ext_Binary, "extbinary", "Extensible binary encoding"), 29 clEnumValN(SPF_Text, "text", "Text encoding"), 30 clEnumValN(SPF_GCC, "gcc", 31 "GCC encoding (only meaningful for -sample)"))); 32 33 cl::opt<bool> UseMD5( 34 "use-md5", cl::init(false), cl::Hidden, 35 cl::desc("Use md5 to represent function names in the output profile (only " 36 "meaningful for -extbinary)")); 37 38 static cl::opt<bool> PopulateProfileSymbolList( 39 "populate-profile-symbol-list", cl::init(false), cl::Hidden, 40 cl::desc("Populate profile symbol list (only meaningful for -extbinary)")); 41 42 static cl::opt<bool> FillZeroForAllFuncs( 43 "fill-zero-for-all-funcs", cl::init(false), cl::Hidden, 44 cl::desc("Attribute all functions' range with zero count " 45 "even it's not hit by any samples.")); 46 47 static cl::opt<int32_t, true> RecursionCompression( 48 "compress-recursion", 49 cl::desc("Compressing recursion by deduplicating adjacent frame " 50 "sequences up to the specified size. -1 means no size limit."), 51 cl::Hidden, 52 cl::location(llvm::sampleprof::CSProfileGenerator::MaxCompressionSize)); 53 54 static cl::opt<bool> 55 TrimColdProfile("trim-cold-profile", cl::init(false), cl::ZeroOrMore, 56 cl::desc("If the total count of the profile is smaller " 57 "than threshold, it will be trimmed.")); 58 59 static cl::opt<bool> CSProfMergeColdContext( 60 "csprof-merge-cold-context", cl::init(true), cl::ZeroOrMore, 61 cl::desc("If the total count of context profile is smaller than " 62 "the threshold, it will be merged into context-less base " 63 "profile.")); 64 65 static cl::opt<uint32_t> CSProfMaxColdContextDepth( 66 "csprof-max-cold-context-depth", cl::init(1), cl::ZeroOrMore, 67 cl::desc("Keep the last K contexts while merging cold profile. 1 means the " 68 "context-less base profile")); 69 70 static cl::opt<int, true> CSProfMaxContextDepth( 71 "csprof-max-context-depth", cl::ZeroOrMore, 72 cl::desc("Keep the last K contexts while merging profile. -1 means no " 73 "depth limit."), 74 cl::location(llvm::sampleprof::CSProfileGenerator::MaxContextDepth)); 75 76 static cl::opt<double> HotFunctionDensityThreshold( 77 "hot-function-density-threshold", llvm::cl::init(1000), 78 llvm::cl::desc( 79 "specify density threshold for hot functions (default: 1000)"), 80 llvm::cl::Optional); 81 static cl::opt<bool> ShowDensity("show-density", llvm::cl::init(false), 82 llvm::cl::desc("show profile density details"), 83 llvm::cl::Optional); 84 85 static cl::opt<bool> UpdateTotalSamples( 86 "update-total-samples", llvm::cl::init(false), 87 llvm::cl::desc( 88 "Update total samples by accumulating all its body samples."), 89 llvm::cl::Optional); 90 91 extern cl::opt<int> ProfileSummaryCutoffHot; 92 93 static cl::opt<bool> GenCSNestedProfile( 94 "gen-cs-nested-profile", cl::Hidden, cl::init(true), 95 cl::desc("Generate nested function profiles for CSSPGO")); 96 97 using namespace llvm; 98 using namespace sampleprof; 99 100 namespace llvm { 101 namespace sampleprof { 102 103 // Initialize the MaxCompressionSize to -1 which means no size limit 104 int32_t CSProfileGenerator::MaxCompressionSize = -1; 105 106 int CSProfileGenerator::MaxContextDepth = -1; 107 108 bool ProfileGeneratorBase::UseFSDiscriminator = false; 109 110 std::unique_ptr<ProfileGeneratorBase> 111 ProfileGeneratorBase::create(ProfiledBinary *Binary, 112 const ContextSampleCounterMap &SampleCounters, 113 bool ProfileIsCSFlat) { 114 std::unique_ptr<ProfileGeneratorBase> Generator; 115 if (ProfileIsCSFlat) { 116 if (Binary->useFSDiscriminator()) 117 exitWithError("FS discriminator is not supported in CS profile."); 118 Generator.reset(new CSProfileGenerator(Binary, SampleCounters)); 119 } else { 120 Generator.reset(new ProfileGenerator(Binary, SampleCounters)); 121 } 122 ProfileGeneratorBase::UseFSDiscriminator = Binary->useFSDiscriminator(); 123 FunctionSamples::ProfileIsFS = Binary->useFSDiscriminator(); 124 125 return Generator; 126 } 127 128 void ProfileGeneratorBase::write(std::unique_ptr<SampleProfileWriter> Writer, 129 SampleProfileMap &ProfileMap) { 130 // Populate profile symbol list if extended binary format is used. 131 ProfileSymbolList SymbolList; 132 133 if (PopulateProfileSymbolList && OutputFormat == SPF_Ext_Binary) { 134 Binary->populateSymbolListFromDWARF(SymbolList); 135 Writer->setProfileSymbolList(&SymbolList); 136 } 137 138 if (std::error_code EC = Writer->write(ProfileMap)) 139 exitWithError(std::move(EC)); 140 } 141 142 void ProfileGeneratorBase::write() { 143 auto WriterOrErr = SampleProfileWriter::create(OutputFilename, OutputFormat); 144 if (std::error_code EC = WriterOrErr.getError()) 145 exitWithError(EC, OutputFilename); 146 147 if (UseMD5) { 148 if (OutputFormat != SPF_Ext_Binary) 149 WithColor::warning() << "-use-md5 is ignored. Specify " 150 "--format=extbinary to enable it\n"; 151 else 152 WriterOrErr.get()->setUseMD5(); 153 } 154 155 write(std::move(WriterOrErr.get()), ProfileMap); 156 } 157 158 void ProfileGeneratorBase::showDensitySuggestion(double Density) { 159 if (Density == 0.0) 160 WithColor::warning() << "The --profile-summary-cutoff-hot option may be " 161 "set too low. Please check your command.\n"; 162 else if (Density < HotFunctionDensityThreshold) 163 WithColor::warning() 164 << "AutoFDO is estimated to optimize better with " 165 << format("%.1f", HotFunctionDensityThreshold / Density) 166 << "x more samples. Please consider increasing sampling rate or " 167 "profiling for longer duration to get more samples.\n"; 168 169 if (ShowDensity) 170 outs() << "Minimum profile density for hot functions with top " 171 << format("%.2f", 172 static_cast<double>(ProfileSummaryCutoffHot.getValue()) / 173 10000) 174 << "% total samples: " << format("%.1f", Density) << "\n"; 175 } 176 177 double ProfileGeneratorBase::calculateDensity(const SampleProfileMap &Profiles, 178 uint64_t HotCntThreshold) { 179 double Density = DBL_MAX; 180 std::vector<const FunctionSamples *> HotFuncs; 181 for (auto &I : Profiles) { 182 auto &FuncSamples = I.second; 183 if (FuncSamples.getTotalSamples() < HotCntThreshold) 184 continue; 185 HotFuncs.emplace_back(&FuncSamples); 186 } 187 188 for (auto *FuncSamples : HotFuncs) { 189 auto *Func = Binary->getBinaryFunction(FuncSamples->getName()); 190 if (!Func) 191 continue; 192 uint64_t FuncSize = Func->getFuncSize(); 193 if (FuncSize == 0) 194 continue; 195 Density = 196 std::min(Density, static_cast<double>(FuncSamples->getTotalSamples()) / 197 FuncSize); 198 } 199 200 return Density == DBL_MAX ? 0.0 : Density; 201 } 202 203 void ProfileGeneratorBase::findDisjointRanges(RangeSample &DisjointRanges, 204 const RangeSample &Ranges) { 205 206 /* 207 Regions may overlap with each other. Using the boundary info, find all 208 disjoint ranges and their sample count. BoundaryPoint contains the count 209 multiple samples begin/end at this points. 210 211 |<--100-->| Sample1 212 |<------200------>| Sample2 213 A B C 214 215 In the example above, 216 Sample1 begins at A, ends at B, its value is 100. 217 Sample2 beings at A, ends at C, its value is 200. 218 For A, BeginCount is the sum of sample begins at A, which is 300 and no 219 samples ends at A, so EndCount is 0. 220 Then boundary points A, B, and C with begin/end counts are: 221 A: (300, 0) 222 B: (0, 100) 223 C: (0, 200) 224 */ 225 struct BoundaryPoint { 226 // Sum of sample counts beginning at this point 227 uint64_t BeginCount = UINT64_MAX; 228 // Sum of sample counts ending at this point 229 uint64_t EndCount = UINT64_MAX; 230 // Is the begin point of a zero range. 231 bool IsZeroRangeBegin = false; 232 // Is the end point of a zero range. 233 bool IsZeroRangeEnd = false; 234 235 void addBeginCount(uint64_t Count) { 236 if (BeginCount == UINT64_MAX) 237 BeginCount = 0; 238 BeginCount += Count; 239 } 240 241 void addEndCount(uint64_t Count) { 242 if (EndCount == UINT64_MAX) 243 EndCount = 0; 244 EndCount += Count; 245 } 246 }; 247 248 /* 249 For the above example. With boundary points, follwing logic finds two 250 disjoint region of 251 252 [A,B]: 300 253 [B+1,C]: 200 254 255 If there is a boundary point that both begin and end, the point itself 256 becomes a separate disjoint region. For example, if we have original 257 ranges of 258 259 |<--- 100 --->| 260 |<--- 200 --->| 261 A B C 262 263 there are three boundary points with their begin/end counts of 264 265 A: (100, 0) 266 B: (200, 100) 267 C: (0, 200) 268 269 the disjoint ranges would be 270 271 [A, B-1]: 100 272 [B, B]: 300 273 [B+1, C]: 200. 274 275 Example for zero value range: 276 277 |<--- 100 --->| 278 |<--- 200 --->| 279 |<--------------- 0 ----------------->| 280 A B C D E F 281 282 [A, B-1] : 0 283 [B, C] : 100 284 [C+1, D-1]: 0 285 [D, E] : 200 286 [E+1, F] : 0 287 */ 288 std::map<uint64_t, BoundaryPoint> Boundaries; 289 290 for (const auto &Item : Ranges) { 291 assert(Item.first.first <= Item.first.second && 292 "Invalid instruction range"); 293 auto &BeginPoint = Boundaries[Item.first.first]; 294 auto &EndPoint = Boundaries[Item.first.second]; 295 uint64_t Count = Item.second; 296 297 BeginPoint.addBeginCount(Count); 298 EndPoint.addEndCount(Count); 299 if (Count == 0) { 300 BeginPoint.IsZeroRangeBegin = true; 301 EndPoint.IsZeroRangeEnd = true; 302 } 303 } 304 305 // Use UINT64_MAX to indicate there is no existing range between BeginAddress 306 // and the next valid address 307 uint64_t BeginAddress = UINT64_MAX; 308 int ZeroRangeDepth = 0; 309 uint64_t Count = 0; 310 for (const auto &Item : Boundaries) { 311 uint64_t Address = Item.first; 312 const BoundaryPoint &Point = Item.second; 313 if (Point.BeginCount != UINT64_MAX) { 314 if (BeginAddress != UINT64_MAX) 315 DisjointRanges[{BeginAddress, Address - 1}] = Count; 316 Count += Point.BeginCount; 317 BeginAddress = Address; 318 ZeroRangeDepth += Point.IsZeroRangeBegin; 319 } 320 if (Point.EndCount != UINT64_MAX) { 321 assert((BeginAddress != UINT64_MAX) && 322 "First boundary point cannot be 'end' point"); 323 DisjointRanges[{BeginAddress, Address}] = Count; 324 assert(Count >= Point.EndCount && "Mismatched live ranges"); 325 Count -= Point.EndCount; 326 BeginAddress = Address + 1; 327 ZeroRangeDepth -= Point.IsZeroRangeEnd; 328 // If the remaining count is zero and it's no longer in a zero range, this 329 // means we consume all the ranges before, thus mark BeginAddress as 330 // UINT64_MAX. e.g. supposing we have two non-overlapping ranges: 331 // [<---- 10 ---->] 332 // [<---- 20 ---->] 333 // A B C D 334 // The BeginAddress(B+1) will reset to invalid(UINT64_MAX), so we won't 335 // have the [B+1, C-1] zero range. 336 if (Count == 0 && ZeroRangeDepth == 0) 337 BeginAddress = UINT64_MAX; 338 } 339 } 340 } 341 342 void ProfileGeneratorBase::updateBodySamplesforFunctionProfile( 343 FunctionSamples &FunctionProfile, const SampleContextFrame &LeafLoc, 344 uint64_t Count) { 345 // Use the maximum count of samples with same line location 346 uint32_t Discriminator = getBaseDiscriminator(LeafLoc.Location.Discriminator); 347 348 // Use duplication factor to compensated for loop unroll/vectorization. 349 // Note that this is only needed when we're taking MAX of the counts at 350 // the location instead of SUM. 351 Count *= getDuplicationFactor(LeafLoc.Location.Discriminator); 352 353 ErrorOr<uint64_t> R = 354 FunctionProfile.findSamplesAt(LeafLoc.Location.LineOffset, Discriminator); 355 356 uint64_t PreviousCount = R ? R.get() : 0; 357 if (PreviousCount <= Count) { 358 FunctionProfile.addBodySamples(LeafLoc.Location.LineOffset, Discriminator, 359 Count - PreviousCount); 360 } 361 } 362 363 void ProfileGeneratorBase::updateTotalSamples() { 364 if (!UpdateTotalSamples) 365 return; 366 367 for (auto &Item : ProfileMap) { 368 FunctionSamples &FunctionProfile = Item.second; 369 FunctionProfile.updateTotalSamples(); 370 } 371 } 372 373 void ProfileGeneratorBase::collectProfiledFunctions() { 374 std::unordered_set<const BinaryFunction *> ProfiledFunctions; 375 // Go through all the stacks, ranges and branches in sample counters, use the 376 // start of the range to look up the function it belongs and record the 377 // function. 378 for (const auto &CI : SampleCounters) { 379 if (const auto *CtxKey = dyn_cast<AddrBasedCtxKey>(CI.first.getPtr())) { 380 for (auto Addr : CtxKey->Context) { 381 if (FuncRange *FRange = Binary->findFuncRangeForOffset( 382 Binary->virtualAddrToOffset(Addr))) 383 ProfiledFunctions.insert(FRange->Func); 384 } 385 } 386 387 for (auto Item : CI.second.RangeCounter) { 388 uint64_t StartOffset = Item.first.first; 389 if (FuncRange *FRange = Binary->findFuncRangeForOffset(StartOffset)) 390 ProfiledFunctions.insert(FRange->Func); 391 } 392 393 for (auto Item : CI.second.BranchCounter) { 394 uint64_t SourceOffset = Item.first.first; 395 uint64_t TargetOffset = Item.first.first; 396 if (FuncRange *FRange = Binary->findFuncRangeForOffset(SourceOffset)) 397 ProfiledFunctions.insert(FRange->Func); 398 if (FuncRange *FRange = Binary->findFuncRangeForOffset(TargetOffset)) 399 ProfiledFunctions.insert(FRange->Func); 400 } 401 } 402 403 Binary->setProfiledFunctions(ProfiledFunctions); 404 } 405 406 FunctionSamples & 407 ProfileGenerator::getTopLevelFunctionProfile(StringRef FuncName) { 408 SampleContext Context(FuncName); 409 auto Ret = ProfileMap.emplace(Context, FunctionSamples()); 410 if (Ret.second) { 411 FunctionSamples &FProfile = Ret.first->second; 412 FProfile.setContext(Context); 413 } 414 return Ret.first->second; 415 } 416 417 void ProfileGenerator::generateProfile() { 418 collectProfiledFunctions(); 419 if (Binary->usePseudoProbes()) { 420 generateProbeBasedProfile(); 421 } else { 422 generateLineNumBasedProfile(); 423 } 424 postProcessProfiles(); 425 } 426 427 void ProfileGenerator::postProcessProfiles() { 428 computeSummaryAndThreshold(); 429 trimColdProfiles(ProfileMap, ColdCountThreshold); 430 calculateAndShowDensity(ProfileMap); 431 } 432 433 void ProfileGenerator::trimColdProfiles(const SampleProfileMap &Profiles, 434 uint64_t ColdCntThreshold) { 435 if (!TrimColdProfile) 436 return; 437 438 // Move cold profiles into a tmp container. 439 std::vector<SampleContext> ColdProfiles; 440 for (const auto &I : ProfileMap) { 441 if (I.second.getTotalSamples() < ColdCntThreshold) 442 ColdProfiles.emplace_back(I.first); 443 } 444 445 // Remove the cold profile from ProfileMap. 446 for (const auto &I : ColdProfiles) 447 ProfileMap.erase(I); 448 } 449 450 void ProfileGenerator::generateLineNumBasedProfile() { 451 assert(SampleCounters.size() == 1 && 452 "Must have one entry for profile generation."); 453 const SampleCounter &SC = SampleCounters.begin()->second; 454 // Fill in function body samples 455 populateBodySamplesForAllFunctions(SC.RangeCounter); 456 // Fill in boundary sample counts as well as call site samples for calls 457 populateBoundarySamplesForAllFunctions(SC.BranchCounter); 458 459 updateTotalSamples(); 460 } 461 462 void ProfileGenerator::generateProbeBasedProfile() { 463 assert(SampleCounters.size() == 1 && 464 "Must have one entry for profile generation."); 465 Binary->decodePseudoProbe(); 466 // Enable pseudo probe functionalities in SampleProf 467 FunctionSamples::ProfileIsProbeBased = true; 468 const SampleCounter &SC = SampleCounters.begin()->second; 469 // Fill in function body samples 470 populateBodySamplesWithProbesForAllFunctions(SC.RangeCounter); 471 // Fill in boundary sample counts as well as call site samples for calls 472 populateBoundarySamplesWithProbesForAllFunctions(SC.BranchCounter); 473 474 updateTotalSamples(); 475 } 476 477 void ProfileGenerator::populateBodySamplesWithProbesForAllFunctions( 478 const RangeSample &RangeCounter) { 479 ProbeCounterMap ProbeCounter; 480 // preprocessRangeCounter returns disjoint ranges, so no longer to redo it 481 // inside extractProbesFromRange. 482 extractProbesFromRange(preprocessRangeCounter(RangeCounter), ProbeCounter, 483 false); 484 485 for (const auto &PI : ProbeCounter) { 486 const MCDecodedPseudoProbe *Probe = PI.first; 487 uint64_t Count = PI.second; 488 SampleContextFrameVector FrameVec; 489 Binary->getInlineContextForProbe(Probe, FrameVec, true); 490 FunctionSamples &FunctionProfile = 491 getLeafProfileAndAddTotalSamples(FrameVec, Count); 492 FunctionProfile.addBodySamplesForProbe(Probe->getIndex(), Count); 493 if (Probe->isEntry()) 494 FunctionProfile.addHeadSamples(Count); 495 } 496 } 497 498 void ProfileGenerator::populateBoundarySamplesWithProbesForAllFunctions( 499 const BranchSample &BranchCounters) { 500 for (const auto &Entry : BranchCounters) { 501 uint64_t SourceOffset = Entry.first.first; 502 uint64_t TargetOffset = Entry.first.second; 503 uint64_t Count = Entry.second; 504 assert(Count != 0 && "Unexpected zero weight branch"); 505 506 StringRef CalleeName = getCalleeNameForOffset(TargetOffset); 507 if (CalleeName.size() == 0) 508 continue; 509 510 uint64_t SourceAddress = Binary->offsetToVirtualAddr(SourceOffset); 511 const MCDecodedPseudoProbe *CallProbe = 512 Binary->getCallProbeForAddr(SourceAddress); 513 if (CallProbe == nullptr) 514 continue; 515 516 // Record called target sample and its count. 517 SampleContextFrameVector FrameVec; 518 Binary->getInlineContextForProbe(CallProbe, FrameVec, true); 519 520 if (!FrameVec.empty()) { 521 FunctionSamples &FunctionProfile = 522 getLeafProfileAndAddTotalSamples(FrameVec, 0); 523 FunctionProfile.addCalledTargetSamples( 524 FrameVec.back().Location.LineOffset, 0, CalleeName, Count); 525 } 526 } 527 } 528 529 FunctionSamples &ProfileGenerator::getLeafProfileAndAddTotalSamples( 530 const SampleContextFrameVector &FrameVec, uint64_t Count) { 531 // Get top level profile 532 FunctionSamples *FunctionProfile = 533 &getTopLevelFunctionProfile(FrameVec[0].FuncName); 534 FunctionProfile->addTotalSamples(Count); 535 if (Binary->usePseudoProbes()) { 536 const auto *FuncDesc = Binary->getFuncDescForGUID( 537 Function::getGUID(FunctionProfile->getName())); 538 FunctionProfile->setFunctionHash(FuncDesc->FuncHash); 539 } 540 541 for (size_t I = 1; I < FrameVec.size(); I++) { 542 LineLocation Callsite( 543 FrameVec[I - 1].Location.LineOffset, 544 getBaseDiscriminator(FrameVec[I - 1].Location.Discriminator)); 545 FunctionSamplesMap &SamplesMap = 546 FunctionProfile->functionSamplesAt(Callsite); 547 auto Ret = 548 SamplesMap.emplace(FrameVec[I].FuncName.str(), FunctionSamples()); 549 if (Ret.second) { 550 SampleContext Context(FrameVec[I].FuncName); 551 Ret.first->second.setContext(Context); 552 } 553 FunctionProfile = &Ret.first->second; 554 FunctionProfile->addTotalSamples(Count); 555 if (Binary->usePseudoProbes()) { 556 const auto *FuncDesc = Binary->getFuncDescForGUID( 557 Function::getGUID(FunctionProfile->getName())); 558 FunctionProfile->setFunctionHash(FuncDesc->FuncHash); 559 } 560 } 561 562 return *FunctionProfile; 563 } 564 565 RangeSample 566 ProfileGenerator::preprocessRangeCounter(const RangeSample &RangeCounter) { 567 RangeSample Ranges(RangeCounter.begin(), RangeCounter.end()); 568 if (FillZeroForAllFuncs) { 569 for (auto &FuncI : Binary->getAllBinaryFunctions()) { 570 for (auto &R : FuncI.second.Ranges) { 571 Ranges[{R.first, R.second - 1}] += 0; 572 } 573 } 574 } else { 575 // For each range, we search for all ranges of the function it belongs to 576 // and initialize it with zero count, so it remains zero if doesn't hit any 577 // samples. This is to be consistent with compiler that interpret zero count 578 // as unexecuted(cold). 579 for (const auto &I : RangeCounter) { 580 uint64_t StartOffset = I.first.first; 581 for (const auto &Range : Binary->getRangesForOffset(StartOffset)) 582 Ranges[{Range.first, Range.second - 1}] += 0; 583 } 584 } 585 RangeSample DisjointRanges; 586 findDisjointRanges(DisjointRanges, Ranges); 587 return DisjointRanges; 588 } 589 590 void ProfileGenerator::populateBodySamplesForAllFunctions( 591 const RangeSample &RangeCounter) { 592 for (const auto &Range : preprocessRangeCounter(RangeCounter)) { 593 uint64_t RangeBegin = Binary->offsetToVirtualAddr(Range.first.first); 594 uint64_t RangeEnd = Binary->offsetToVirtualAddr(Range.first.second); 595 uint64_t Count = Range.second; 596 597 InstructionPointer IP(Binary, RangeBegin, true); 598 // Disjoint ranges may have range in the middle of two instr, 599 // e.g. If Instr1 at Addr1, and Instr2 at Addr2, disjoint range 600 // can be Addr1+1 to Addr2-1. We should ignore such range. 601 if (IP.Address > RangeEnd) 602 continue; 603 604 do { 605 uint64_t Offset = Binary->virtualAddrToOffset(IP.Address); 606 const SampleContextFrameVector &FrameVec = 607 Binary->getFrameLocationStack(Offset); 608 if (!FrameVec.empty()) { 609 // FIXME: As accumulating total count per instruction caused some 610 // regression, we changed to accumulate total count per byte as a 611 // workaround. Tuning hotness threshold on the compiler side might be 612 // necessary in the future. 613 FunctionSamples &FunctionProfile = getLeafProfileAndAddTotalSamples( 614 FrameVec, Count * Binary->getInstSize(Offset)); 615 updateBodySamplesforFunctionProfile(FunctionProfile, FrameVec.back(), 616 Count); 617 } 618 } while (IP.advance() && IP.Address <= RangeEnd); 619 } 620 } 621 622 StringRef ProfileGeneratorBase::getCalleeNameForOffset(uint64_t TargetOffset) { 623 // Get the function range by branch target if it's a call branch. 624 auto *FRange = Binary->findFuncRangeForStartOffset(TargetOffset); 625 626 // We won't accumulate sample count for a range whose start is not the real 627 // function entry such as outlined function or inner labels. 628 if (!FRange || !FRange->IsFuncEntry) 629 return StringRef(); 630 631 return FunctionSamples::getCanonicalFnName(FRange->getFuncName()); 632 } 633 634 void ProfileGenerator::populateBoundarySamplesForAllFunctions( 635 const BranchSample &BranchCounters) { 636 for (const auto &Entry : BranchCounters) { 637 uint64_t SourceOffset = Entry.first.first; 638 uint64_t TargetOffset = Entry.first.second; 639 uint64_t Count = Entry.second; 640 assert(Count != 0 && "Unexpected zero weight branch"); 641 642 StringRef CalleeName = getCalleeNameForOffset(TargetOffset); 643 if (CalleeName.size() == 0) 644 continue; 645 // Record called target sample and its count. 646 const SampleContextFrameVector &FrameVec = 647 Binary->getFrameLocationStack(SourceOffset); 648 if (!FrameVec.empty()) { 649 FunctionSamples &FunctionProfile = 650 getLeafProfileAndAddTotalSamples(FrameVec, 0); 651 FunctionProfile.addCalledTargetSamples( 652 FrameVec.back().Location.LineOffset, 653 getBaseDiscriminator(FrameVec.back().Location.Discriminator), 654 CalleeName, Count); 655 } 656 // Add head samples for callee. 657 FunctionSamples &CalleeProfile = getTopLevelFunctionProfile(CalleeName); 658 CalleeProfile.addHeadSamples(Count); 659 } 660 } 661 662 void ProfileGeneratorBase::calculateAndShowDensity( 663 const SampleProfileMap &Profiles) { 664 double Density = calculateDensity(Profiles, HotCountThreshold); 665 showDensitySuggestion(Density); 666 } 667 668 FunctionSamples &CSProfileGenerator::getFunctionProfileForContext( 669 const SampleContextFrameVector &Context, bool WasLeafInlined) { 670 auto I = ProfileMap.find(SampleContext(Context)); 671 if (I == ProfileMap.end()) { 672 // Save the new context for future references. 673 SampleContextFrames NewContext = *Contexts.insert(Context).first; 674 SampleContext FContext(NewContext, RawContext); 675 auto Ret = ProfileMap.emplace(FContext, FunctionSamples()); 676 if (WasLeafInlined) 677 FContext.setAttribute(ContextWasInlined); 678 FunctionSamples &FProfile = Ret.first->second; 679 FProfile.setContext(FContext); 680 return Ret.first->second; 681 } 682 return I->second; 683 } 684 685 void CSProfileGenerator::generateProfile() { 686 FunctionSamples::ProfileIsCSFlat = true; 687 688 collectProfiledFunctions(); 689 690 if (Binary->usePseudoProbes()) { 691 generateProbeBasedProfile(); 692 } else { 693 generateLineNumBasedProfile(); 694 } 695 696 if (Binary->getTrackFuncContextSize()) 697 computeSizeForProfiledFunctions(); 698 699 postProcessProfiles(); 700 } 701 702 void CSProfileGenerator::computeSizeForProfiledFunctions() { 703 std::unordered_set<const BinaryFunction *> ProfiledFunctions; 704 for (auto *Func : Binary->getProfiledFunctions()) 705 Binary->computeInlinedContextSizeForFunc(Func); 706 707 // Flush the symbolizer to save memory. 708 Binary->flushSymbolizer(); 709 } 710 711 void CSProfileGenerator::generateLineNumBasedProfile() { 712 for (const auto &CI : SampleCounters) { 713 const auto *CtxKey = cast<StringBasedCtxKey>(CI.first.getPtr()); 714 715 // Get or create function profile for the range 716 FunctionSamples &FunctionProfile = 717 getFunctionProfileForContext(CtxKey->Context, CtxKey->WasLeafInlined); 718 719 // Fill in function body samples 720 populateBodySamplesForFunction(FunctionProfile, CI.second.RangeCounter); 721 // Fill in boundary sample counts as well as call site samples for calls 722 populateBoundarySamplesForFunction(CtxKey->Context, FunctionProfile, 723 CI.second.BranchCounter); 724 } 725 // Fill in call site value sample for inlined calls and also use context to 726 // infer missing samples. Since we don't have call count for inlined 727 // functions, we estimate it from inlinee's profile using the entry of the 728 // body sample. 729 populateInferredFunctionSamples(); 730 731 updateTotalSamples(); 732 } 733 734 void CSProfileGenerator::populateBodySamplesForFunction( 735 FunctionSamples &FunctionProfile, const RangeSample &RangeCounter) { 736 // Compute disjoint ranges first, so we can use MAX 737 // for calculating count for each location. 738 RangeSample Ranges; 739 findDisjointRanges(Ranges, RangeCounter); 740 for (const auto &Range : Ranges) { 741 uint64_t RangeBegin = Binary->offsetToVirtualAddr(Range.first.first); 742 uint64_t RangeEnd = Binary->offsetToVirtualAddr(Range.first.second); 743 uint64_t Count = Range.second; 744 // Disjoint ranges have introduce zero-filled gap that 745 // doesn't belong to current context, filter them out. 746 if (Count == 0) 747 continue; 748 749 InstructionPointer IP(Binary, RangeBegin, true); 750 // Disjoint ranges may have range in the middle of two instr, 751 // e.g. If Instr1 at Addr1, and Instr2 at Addr2, disjoint range 752 // can be Addr1+1 to Addr2-1. We should ignore such range. 753 if (IP.Address > RangeEnd) 754 continue; 755 756 do { 757 uint64_t Offset = Binary->virtualAddrToOffset(IP.Address); 758 auto LeafLoc = Binary->getInlineLeafFrameLoc(Offset); 759 if (LeafLoc.hasValue()) { 760 // Recording body sample for this specific context 761 updateBodySamplesforFunctionProfile(FunctionProfile, *LeafLoc, Count); 762 FunctionProfile.addTotalSamples(Count); 763 } 764 } while (IP.advance() && IP.Address <= RangeEnd); 765 } 766 } 767 768 void CSProfileGenerator::populateBoundarySamplesForFunction( 769 SampleContextFrames ContextId, FunctionSamples &FunctionProfile, 770 const BranchSample &BranchCounters) { 771 772 for (const auto &Entry : BranchCounters) { 773 uint64_t SourceOffset = Entry.first.first; 774 uint64_t TargetOffset = Entry.first.second; 775 uint64_t Count = Entry.second; 776 assert(Count != 0 && "Unexpected zero weight branch"); 777 778 StringRef CalleeName = getCalleeNameForOffset(TargetOffset); 779 if (CalleeName.size() == 0) 780 continue; 781 782 // Record called target sample and its count 783 auto LeafLoc = Binary->getInlineLeafFrameLoc(SourceOffset); 784 if (!LeafLoc.hasValue()) 785 continue; 786 FunctionProfile.addCalledTargetSamples( 787 LeafLoc->Location.LineOffset, 788 getBaseDiscriminator(LeafLoc->Location.Discriminator), CalleeName, 789 Count); 790 791 // Record head sample for called target(callee) 792 SampleContextFrameVector CalleeCtx(ContextId.begin(), ContextId.end()); 793 assert(CalleeCtx.back().FuncName == LeafLoc->FuncName && 794 "Leaf function name doesn't match"); 795 CalleeCtx.back() = *LeafLoc; 796 CalleeCtx.emplace_back(CalleeName, LineLocation(0, 0)); 797 FunctionSamples &CalleeProfile = getFunctionProfileForContext(CalleeCtx); 798 CalleeProfile.addHeadSamples(Count); 799 } 800 } 801 802 static SampleContextFrame 803 getCallerContext(SampleContextFrames CalleeContext, 804 SampleContextFrameVector &CallerContext) { 805 assert(CalleeContext.size() > 1 && "Unexpected empty context"); 806 CalleeContext = CalleeContext.drop_back(); 807 CallerContext.assign(CalleeContext.begin(), CalleeContext.end()); 808 SampleContextFrame CallerFrame = CallerContext.back(); 809 CallerContext.back().Location = LineLocation(0, 0); 810 return CallerFrame; 811 } 812 813 void CSProfileGenerator::populateInferredFunctionSamples() { 814 for (const auto &Item : ProfileMap) { 815 const auto &CalleeContext = Item.first; 816 const FunctionSamples &CalleeProfile = Item.second; 817 818 // If we already have head sample counts, we must have value profile 819 // for call sites added already. Skip to avoid double counting. 820 if (CalleeProfile.getHeadSamples()) 821 continue; 822 // If we don't have context, nothing to do for caller's call site. 823 // This could happen for entry point function. 824 if (CalleeContext.isBaseContext()) 825 continue; 826 827 // Infer Caller's frame loc and context ID through string splitting 828 SampleContextFrameVector CallerContextId; 829 SampleContextFrame &&CallerLeafFrameLoc = 830 getCallerContext(CalleeContext.getContextFrames(), CallerContextId); 831 SampleContextFrames CallerContext(CallerContextId); 832 833 // It's possible that we haven't seen any sample directly in the caller, 834 // in which case CallerProfile will not exist. But we can't modify 835 // ProfileMap while iterating it. 836 // TODO: created function profile for those callers too 837 if (ProfileMap.find(CallerContext) == ProfileMap.end()) 838 continue; 839 FunctionSamples &CallerProfile = ProfileMap[CallerContext]; 840 841 // Since we don't have call count for inlined functions, we 842 // estimate it from inlinee's profile using entry body sample. 843 uint64_t EstimatedCallCount = CalleeProfile.getEntrySamples(); 844 // If we don't have samples with location, use 1 to indicate live. 845 if (!EstimatedCallCount && !CalleeProfile.getBodySamples().size()) 846 EstimatedCallCount = 1; 847 CallerProfile.addCalledTargetSamples( 848 CallerLeafFrameLoc.Location.LineOffset, 849 CallerLeafFrameLoc.Location.Discriminator, 850 CalleeProfile.getContext().getName(), EstimatedCallCount); 851 CallerProfile.addBodySamples(CallerLeafFrameLoc.Location.LineOffset, 852 CallerLeafFrameLoc.Location.Discriminator, 853 EstimatedCallCount); 854 CallerProfile.addTotalSamples(EstimatedCallCount); 855 } 856 } 857 858 void CSProfileGenerator::postProcessProfiles() { 859 // Compute hot/cold threshold based on profile. This will be used for cold 860 // context profile merging/trimming. 861 computeSummaryAndThreshold(); 862 863 // Run global pre-inliner to adjust/merge context profile based on estimated 864 // inline decisions. 865 if (EnableCSPreInliner) { 866 CSPreInliner(ProfileMap, *Binary, HotCountThreshold, ColdCountThreshold) 867 .run(); 868 // Turn off the profile merger by default unless it is explicitly enabled. 869 if (!CSProfMergeColdContext.getNumOccurrences()) 870 CSProfMergeColdContext = false; 871 } 872 873 // Trim and merge cold context profile using cold threshold above. 874 if (TrimColdProfile || CSProfMergeColdContext) { 875 SampleContextTrimmer(ProfileMap) 876 .trimAndMergeColdContextProfiles( 877 HotCountThreshold, TrimColdProfile, CSProfMergeColdContext, 878 CSProfMaxColdContextDepth, EnableCSPreInliner); 879 } 880 881 // Merge function samples of CS profile to calculate profile density. 882 sampleprof::SampleProfileMap ContextLessProfiles; 883 for (const auto &I : ProfileMap) { 884 ContextLessProfiles[I.second.getName()].merge(I.second); 885 } 886 887 calculateAndShowDensity(ContextLessProfiles); 888 if (GenCSNestedProfile) { 889 CSProfileConverter CSConverter(ProfileMap); 890 CSConverter.convertProfiles(); 891 FunctionSamples::ProfileIsCSFlat = false; 892 FunctionSamples::ProfileIsCSNested = EnableCSPreInliner; 893 } 894 } 895 896 void ProfileGeneratorBase::computeSummaryAndThreshold() { 897 SampleProfileSummaryBuilder Builder(ProfileSummaryBuilder::DefaultCutoffs); 898 auto Summary = Builder.computeSummaryForProfiles(ProfileMap); 899 HotCountThreshold = ProfileSummaryBuilder::getHotCountThreshold( 900 (Summary->getDetailedSummary())); 901 ColdCountThreshold = ProfileSummaryBuilder::getColdCountThreshold( 902 (Summary->getDetailedSummary())); 903 } 904 905 void ProfileGeneratorBase::extractProbesFromRange( 906 const RangeSample &RangeCounter, ProbeCounterMap &ProbeCounter, 907 bool FindDisjointRanges) { 908 const RangeSample *PRanges = &RangeCounter; 909 RangeSample Ranges; 910 if (FindDisjointRanges) { 911 findDisjointRanges(Ranges, RangeCounter); 912 PRanges = &Ranges; 913 } 914 915 for (const auto &Range : *PRanges) { 916 uint64_t RangeBegin = Binary->offsetToVirtualAddr(Range.first.first); 917 uint64_t RangeEnd = Binary->offsetToVirtualAddr(Range.first.second); 918 uint64_t Count = Range.second; 919 920 InstructionPointer IP(Binary, RangeBegin, true); 921 // Disjoint ranges may have range in the middle of two instr, 922 // e.g. If Instr1 at Addr1, and Instr2 at Addr2, disjoint range 923 // can be Addr1+1 to Addr2-1. We should ignore such range. 924 if (IP.Address > RangeEnd) 925 continue; 926 927 do { 928 const AddressProbesMap &Address2ProbesMap = 929 Binary->getAddress2ProbesMap(); 930 auto It = Address2ProbesMap.find(IP.Address); 931 if (It != Address2ProbesMap.end()) { 932 for (const auto &Probe : It->second) { 933 ProbeCounter[&Probe] += Count; 934 } 935 } 936 } while (IP.advance() && IP.Address <= RangeEnd); 937 } 938 } 939 940 static void 941 extractPrefixContextStack(SampleContextFrameVector &ContextStack, 942 const SmallVectorImpl<uint64_t> &Addresses, 943 ProfiledBinary *Binary) { 944 SmallVector<const MCDecodedPseudoProbe *, 16> Probes; 945 for (auto Addr : reverse(Addresses)) { 946 const MCDecodedPseudoProbe *CallProbe = Binary->getCallProbeForAddr(Addr); 947 // These could be the cases when a probe is not found at a calliste. Cutting 948 // off the context from here since the inliner will not know how to consume 949 // a context with unknown callsites. 950 // 1. for functions that are not sampled when 951 // --decode-probe-for-profiled-functions-only is on. 952 // 2. for a merged callsite. Callsite merging may cause the loss of original 953 // probe IDs. 954 // 3. for an external callsite. 955 if (!CallProbe) 956 break; 957 Probes.push_back(CallProbe); 958 } 959 960 std::reverse(Probes.begin(), Probes.end()); 961 962 // Extract context stack for reusing, leaf context stack will be added 963 // compressed while looking up function profile. 964 for (const auto *P : Probes) { 965 Binary->getInlineContextForProbe(P, ContextStack, true); 966 } 967 } 968 969 void CSProfileGenerator::generateProbeBasedProfile() { 970 Binary->decodePseudoProbe(); 971 // Enable pseudo probe functionalities in SampleProf 972 FunctionSamples::ProfileIsProbeBased = true; 973 for (const auto &CI : SampleCounters) { 974 const AddrBasedCtxKey *CtxKey = 975 dyn_cast<AddrBasedCtxKey>(CI.first.getPtr()); 976 SampleContextFrameVector ContextStack; 977 extractPrefixContextStack(ContextStack, CtxKey->Context, Binary); 978 // Fill in function body samples from probes, also infer caller's samples 979 // from callee's probe 980 populateBodySamplesWithProbes(CI.second.RangeCounter, ContextStack); 981 // Fill in boundary samples for a call probe 982 populateBoundarySamplesWithProbes(CI.second.BranchCounter, ContextStack); 983 } 984 } 985 986 void CSProfileGenerator::populateBodySamplesWithProbes( 987 const RangeSample &RangeCounter, SampleContextFrames ContextStack) { 988 ProbeCounterMap ProbeCounter; 989 // Extract the top frame probes by looking up each address among the range in 990 // the Address2ProbeMap 991 extractProbesFromRange(RangeCounter, ProbeCounter); 992 std::unordered_map<MCDecodedPseudoProbeInlineTree *, 993 std::unordered_set<FunctionSamples *>> 994 FrameSamples; 995 for (const auto &PI : ProbeCounter) { 996 const MCDecodedPseudoProbe *Probe = PI.first; 997 uint64_t Count = PI.second; 998 // Disjoint ranges have introduce zero-filled gap that 999 // doesn't belong to current context, filter them out. 1000 if (!Probe->isBlock() || Count == 0) 1001 continue; 1002 FunctionSamples &FunctionProfile = 1003 getFunctionProfileForLeafProbe(ContextStack, Probe); 1004 // Record the current frame and FunctionProfile whenever samples are 1005 // collected for non-danglie probes. This is for reporting all of the 1006 // zero count probes of the frame later. 1007 FrameSamples[Probe->getInlineTreeNode()].insert(&FunctionProfile); 1008 FunctionProfile.addBodySamplesForProbe(Probe->getIndex(), Count); 1009 FunctionProfile.addTotalSamples(Count); 1010 if (Probe->isEntry()) { 1011 FunctionProfile.addHeadSamples(Count); 1012 // Look up for the caller's function profile 1013 const auto *InlinerDesc = Binary->getInlinerDescForProbe(Probe); 1014 SampleContextFrames CalleeContextId = 1015 FunctionProfile.getContext().getContextFrames(); 1016 if (InlinerDesc != nullptr && CalleeContextId.size() > 1) { 1017 // Since the context id will be compressed, we have to use callee's 1018 // context id to infer caller's context id to ensure they share the 1019 // same context prefix. 1020 SampleContextFrameVector CallerContextId; 1021 SampleContextFrame &&CallerLeafFrameLoc = 1022 getCallerContext(CalleeContextId, CallerContextId); 1023 uint64_t CallerIndex = CallerLeafFrameLoc.Location.LineOffset; 1024 assert(CallerIndex && 1025 "Inferred caller's location index shouldn't be zero!"); 1026 FunctionSamples &CallerProfile = 1027 getFunctionProfileForContext(CallerContextId); 1028 CallerProfile.setFunctionHash(InlinerDesc->FuncHash); 1029 CallerProfile.addBodySamples(CallerIndex, 0, Count); 1030 CallerProfile.addTotalSamples(Count); 1031 CallerProfile.addCalledTargetSamples( 1032 CallerIndex, 0, FunctionProfile.getContext().getName(), Count); 1033 } 1034 } 1035 } 1036 1037 // Assign zero count for remaining probes without sample hits to 1038 // differentiate from probes optimized away, of which the counts are unknown 1039 // and will be inferred by the compiler. 1040 for (auto &I : FrameSamples) { 1041 for (auto *FunctionProfile : I.second) { 1042 for (auto *Probe : I.first->getProbes()) { 1043 FunctionProfile->addBodySamplesForProbe(Probe->getIndex(), 0); 1044 } 1045 } 1046 } 1047 } 1048 1049 void CSProfileGenerator::populateBoundarySamplesWithProbes( 1050 const BranchSample &BranchCounter, SampleContextFrames ContextStack) { 1051 for (const auto &BI : BranchCounter) { 1052 uint64_t SourceOffset = BI.first.first; 1053 uint64_t TargetOffset = BI.first.second; 1054 uint64_t Count = BI.second; 1055 uint64_t SourceAddress = Binary->offsetToVirtualAddr(SourceOffset); 1056 const MCDecodedPseudoProbe *CallProbe = 1057 Binary->getCallProbeForAddr(SourceAddress); 1058 if (CallProbe == nullptr) 1059 continue; 1060 FunctionSamples &FunctionProfile = 1061 getFunctionProfileForLeafProbe(ContextStack, CallProbe); 1062 FunctionProfile.addBodySamples(CallProbe->getIndex(), 0, Count); 1063 FunctionProfile.addTotalSamples(Count); 1064 StringRef CalleeName = getCalleeNameForOffset(TargetOffset); 1065 if (CalleeName.size() == 0) 1066 continue; 1067 FunctionProfile.addCalledTargetSamples(CallProbe->getIndex(), 0, CalleeName, 1068 Count); 1069 } 1070 } 1071 1072 FunctionSamples &CSProfileGenerator::getFunctionProfileForLeafProbe( 1073 SampleContextFrames ContextStack, const MCDecodedPseudoProbe *LeafProbe) { 1074 1075 // Explicitly copy the context for appending the leaf context 1076 SampleContextFrameVector NewContextStack(ContextStack.begin(), 1077 ContextStack.end()); 1078 Binary->getInlineContextForProbe(LeafProbe, NewContextStack, true); 1079 // For leaf inlined context with the top frame, we should strip off the top 1080 // frame's probe id, like: 1081 // Inlined stack: [foo:1, bar:2], the ContextId will be "foo:1 @ bar" 1082 auto LeafFrame = NewContextStack.back(); 1083 LeafFrame.Location = LineLocation(0, 0); 1084 NewContextStack.pop_back(); 1085 // Compress the context string except for the leaf frame 1086 CSProfileGenerator::compressRecursionContext(NewContextStack); 1087 CSProfileGenerator::trimContext(NewContextStack); 1088 NewContextStack.push_back(LeafFrame); 1089 1090 const auto *FuncDesc = Binary->getFuncDescForGUID(LeafProbe->getGuid()); 1091 bool WasLeafInlined = LeafProbe->getInlineTreeNode()->hasInlineSite(); 1092 FunctionSamples &FunctionProile = 1093 getFunctionProfileForContext(NewContextStack, WasLeafInlined); 1094 FunctionProile.setFunctionHash(FuncDesc->FuncHash); 1095 return FunctionProile; 1096 } 1097 1098 } // end namespace sampleprof 1099 } // end namespace llvm 1100