1 //===-- CSPreInliner.cpp - Profile guided preinliner -------------- C++ -*-===//
2 //
3 // Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
4 // See https://llvm.org/LICENSE.txt for license information.
5 // SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
6 //
7 //===----------------------------------------------------------------------===//
8 
9 #include "CSPreInliner.h"
10 #include "ProfiledBinary.h"
11 #include "llvm/ADT/SCCIterator.h"
12 #include "llvm/ADT/Statistic.h"
13 #include <cstdint>
14 #include <queue>
15 
16 #define DEBUG_TYPE "cs-preinliner"
17 
18 using namespace llvm;
19 using namespace sampleprof;
20 
21 STATISTIC(PreInlNumCSInlined,
22           "Number of functions inlined with context sensitive profile");
23 STATISTIC(PreInlNumCSNotInlined,
24           "Number of functions not inlined with context sensitive profile");
25 STATISTIC(PreInlNumCSInlinedHitMinLimit,
26           "Number of functions with FDO inline stopped due to min size limit");
27 STATISTIC(PreInlNumCSInlinedHitMaxLimit,
28           "Number of functions with FDO inline stopped due to max size limit");
29 STATISTIC(
30     PreInlNumCSInlinedHitGrowthLimit,
31     "Number of functions with FDO inline stopped due to growth size limit");
32 
33 // The switches specify inline thresholds used in SampleProfileLoader inlining.
34 // TODO: the actual threshold to be tuned here because the size here is based
35 // on machine code not LLVM IR.
36 extern cl::opt<int> SampleHotCallSiteThreshold;
37 extern cl::opt<int> SampleColdCallSiteThreshold;
38 extern cl::opt<int> ProfileInlineGrowthLimit;
39 extern cl::opt<int> ProfileInlineLimitMin;
40 extern cl::opt<int> ProfileInlineLimitMax;
41 extern cl::opt<bool> SortProfiledSCC;
42 
43 cl::opt<bool> EnableCSPreInliner(
44     "csspgo-preinliner", cl::Hidden, cl::init(true),
45     cl::desc("Run a global pre-inliner to merge context profile based on "
46              "estimated global top-down inline decisions"));
47 
48 cl::opt<bool> UseContextCostForPreInliner(
49     "use-context-cost-for-preinliner", cl::Hidden, cl::init(true),
50     cl::desc("Use context-sensitive byte size cost for preinliner decisions"));
51 
52 static cl::opt<bool> SamplePreInlineReplay(
53     "csspgo-replay-preinline", cl::Hidden, cl::init(false),
54     cl::desc(
55         "Replay previous inlining and adjust context profile accordingly"));
56 
57 CSPreInliner::CSPreInliner(SampleProfileMap &Profiles, ProfiledBinary &Binary,
58                            uint64_t HotThreshold, uint64_t ColdThreshold)
59     : UseContextCost(UseContextCostForPreInliner),
60       // TODO: Pass in a guid-to-name map in order for
61       // ContextTracker.getFuncNameFor to work, if `Profiles` can have md5 codes
62       // as their profile context.
63       ContextTracker(Profiles, nullptr), ProfileMap(Profiles), Binary(Binary),
64       HotCountThreshold(HotThreshold), ColdCountThreshold(ColdThreshold) {
65   // Set default preinliner hot/cold call site threshold tuned with CSSPGO.
66   // for good performance with reasonable profile size.
67   if (!SampleHotCallSiteThreshold.getNumOccurrences())
68     SampleHotCallSiteThreshold = 1500;
69   if (!SampleColdCallSiteThreshold.getNumOccurrences())
70     SampleColdCallSiteThreshold = 0;
71 }
72 
73 std::vector<StringRef> CSPreInliner::buildTopDownOrder() {
74   std::vector<StringRef> Order;
75   ProfiledCallGraph ProfiledCG(ContextTracker);
76 
77   // Now that we have a profiled call graph, construct top-down order
78   // by building up SCC and reversing SCC order.
79   scc_iterator<ProfiledCallGraph *> I = scc_begin(&ProfiledCG);
80   while (!I.isAtEnd()) {
81     auto Range = *I;
82     if (SortProfiledSCC) {
83       // Sort nodes in one SCC based on callsite hotness.
84       scc_member_iterator<ProfiledCallGraph *> SI(*I);
85       Range = *SI;
86     }
87     for (auto *Node : Range) {
88       if (Node != ProfiledCG.getEntryNode())
89         Order.push_back(Node->Name);
90     }
91     ++I;
92   }
93   std::reverse(Order.begin(), Order.end());
94 
95   return Order;
96 }
97 
98 bool CSPreInliner::getInlineCandidates(ProfiledCandidateQueue &CQueue,
99                                        const FunctionSamples *CallerSamples) {
100   assert(CallerSamples && "Expect non-null caller samples");
101 
102   // Ideally we want to consider everything a function calls, but as far as
103   // context profile is concerned, only those frames that are children of
104   // current one in the trie is relavent. So we walk the trie instead of call
105   // targets from function profile.
106   ContextTrieNode *CallerNode =
107       ContextTracker.getContextFor(CallerSamples->getContext());
108 
109   bool HasNewCandidate = false;
110   for (auto &Child : CallerNode->getAllChildContext()) {
111     ContextTrieNode *CalleeNode = &Child.second;
112     FunctionSamples *CalleeSamples = CalleeNode->getFunctionSamples();
113     if (!CalleeSamples)
114       continue;
115 
116     // Call site count is more reliable, so we look up the corresponding call
117     // target profile in caller's context profile to retrieve call site count.
118     uint64_t CalleeEntryCount = CalleeSamples->getEntrySamples();
119     uint64_t CallsiteCount = 0;
120     LineLocation Callsite = CalleeNode->getCallSiteLoc();
121     if (auto CallTargets = CallerSamples->findCallTargetMapAt(Callsite)) {
122       SampleRecord::CallTargetMap &TargetCounts = CallTargets.get();
123       auto It = TargetCounts.find(CalleeSamples->getName());
124       if (It != TargetCounts.end())
125         CallsiteCount = It->second;
126     }
127 
128     // TODO: call site and callee entry count should be mostly consistent, add
129     // check for that.
130     HasNewCandidate = true;
131     uint32_t CalleeSize = getFuncSize(*CalleeSamples);
132     CQueue.emplace(CalleeSamples, std::max(CallsiteCount, CalleeEntryCount),
133                    CalleeSize);
134   }
135 
136   return HasNewCandidate;
137 }
138 
139 uint32_t CSPreInliner::getFuncSize(const FunctionSamples &FSamples) {
140   if (UseContextCost) {
141     return Binary.getFuncSizeForContext(FSamples.getContext());
142   }
143 
144   return FSamples.getBodySamples().size();
145 }
146 
147 bool CSPreInliner::shouldInline(ProfiledInlineCandidate &Candidate) {
148   // If replay inline is requested, simply follow the inline decision of the
149   // profiled binary.
150   if (SamplePreInlineReplay)
151     return Candidate.CalleeSamples->getContext().hasAttribute(
152         ContextWasInlined);
153 
154   // Adjust threshold based on call site hotness, only do this for callsite
155   // prioritized inliner because otherwise cost-benefit check is done earlier.
156   unsigned int SampleThreshold = SampleColdCallSiteThreshold;
157   if (Candidate.CallsiteCount > HotCountThreshold)
158     SampleThreshold = SampleHotCallSiteThreshold;
159 
160   // TODO: for small cold functions, we may inlined them and we need to keep
161   // context profile accordingly.
162   if (Candidate.CallsiteCount < ColdCountThreshold)
163     SampleThreshold = SampleColdCallSiteThreshold;
164 
165   return (Candidate.SizeCost < SampleThreshold);
166 }
167 
168 void CSPreInliner::processFunction(const StringRef Name) {
169   FunctionSamples *FSamples = ContextTracker.getBaseSamplesFor(Name);
170   if (!FSamples)
171     return;
172 
173   unsigned FuncSize = getFuncSize(*FSamples);
174   unsigned FuncFinalSize = FuncSize;
175   unsigned SizeLimit = FuncSize * ProfileInlineGrowthLimit;
176   SizeLimit = std::min(SizeLimit, (unsigned)ProfileInlineLimitMax);
177   SizeLimit = std::max(SizeLimit, (unsigned)ProfileInlineLimitMin);
178 
179   LLVM_DEBUG(dbgs() << "Process " << Name
180                     << " for context-sensitive pre-inlining (pre-inline size: "
181                     << FuncSize << ", size limit: " << SizeLimit << ")\n");
182 
183   ProfiledCandidateQueue CQueue;
184   getInlineCandidates(CQueue, FSamples);
185 
186   while (!CQueue.empty() && FuncFinalSize < SizeLimit) {
187     ProfiledInlineCandidate Candidate = CQueue.top();
188     CQueue.pop();
189     bool ShouldInline = false;
190     if ((ShouldInline = shouldInline(Candidate))) {
191       // We mark context as inlined as the corresponding context profile
192       // won't be merged into that function's base profile.
193       ++PreInlNumCSInlined;
194       ContextTracker.markContextSamplesInlined(Candidate.CalleeSamples);
195       Candidate.CalleeSamples->getContext().setAttribute(
196           ContextShouldBeInlined);
197       FuncFinalSize += Candidate.SizeCost;
198       getInlineCandidates(CQueue, Candidate.CalleeSamples);
199     } else {
200       ++PreInlNumCSNotInlined;
201     }
202     LLVM_DEBUG(dbgs() << (ShouldInline ? "  Inlined" : "  Outlined")
203                       << " context profile for: "
204                       << Candidate.CalleeSamples->getContext().toString()
205                       << " (callee size: " << Candidate.SizeCost
206                       << ", call count:" << Candidate.CallsiteCount << ")\n");
207   }
208 
209   if (!CQueue.empty()) {
210     if (SizeLimit == (unsigned)ProfileInlineLimitMax)
211       ++PreInlNumCSInlinedHitMaxLimit;
212     else if (SizeLimit == (unsigned)ProfileInlineLimitMin)
213       ++PreInlNumCSInlinedHitMinLimit;
214     else
215       ++PreInlNumCSInlinedHitGrowthLimit;
216   }
217 
218   LLVM_DEBUG({
219     if (!CQueue.empty())
220       dbgs() << "  Inline candidates ignored due to size limit (inliner "
221                 "original size: "
222              << FuncSize << ", inliner final size: " << FuncFinalSize
223              << ", size limit: " << SizeLimit << ")\n";
224 
225     while (!CQueue.empty()) {
226       ProfiledInlineCandidate Candidate = CQueue.top();
227       CQueue.pop();
228       bool WasInlined =
229           Candidate.CalleeSamples->getContext().hasAttribute(ContextWasInlined);
230       dbgs() << "    " << Candidate.CalleeSamples->getContext().toString()
231              << " (candidate size:" << Candidate.SizeCost
232              << ", call count: " << Candidate.CallsiteCount << ", previously "
233              << (WasInlined ? "inlined)\n" : "not inlined)\n");
234     }
235   });
236 }
237 
238 void CSPreInliner::run() {
239 #ifndef NDEBUG
240   auto printProfileNames = [](SampleProfileMap &Profiles, bool IsInput) {
241     dbgs() << (IsInput ? "Input" : "Output") << " context-sensitive profiles ("
242            << Profiles.size() << " total):\n";
243     for (auto &It : Profiles) {
244       const FunctionSamples &Samples = It.second;
245       dbgs() << "  [" << Samples.getContext().toString() << "] "
246              << Samples.getTotalSamples() << ":" << Samples.getHeadSamples()
247              << "\n";
248     }
249   };
250 #endif
251 
252   LLVM_DEBUG(printProfileNames(ProfileMap, true));
253 
254   // Execute global pre-inliner to estimate a global top-down inline
255   // decision and merge profiles accordingly. This helps with profile
256   // merge for ThinLTO otherwise we won't be able to merge profiles back
257   // to base profile across module/thin-backend boundaries.
258   // It also helps better compress context profile to control profile
259   // size, as we now only need context profile for functions going to
260   // be inlined.
261   for (StringRef FuncName : buildTopDownOrder()) {
262     processFunction(FuncName);
263   }
264 
265   // Not inlined context profiles are merged into its base, so we can
266   // trim out such profiles from the output.
267   std::vector<SampleContext> ProfilesToBeRemoved;
268   for (auto &It : ProfileMap) {
269     SampleContext &Context = It.second.getContext();
270     if (!Context.isBaseContext() && !Context.hasState(InlinedContext)) {
271       assert(Context.hasState(MergedContext) &&
272              "Not inlined context profile should be merged already");
273       ProfilesToBeRemoved.push_back(It.first);
274     }
275   }
276 
277   for (auto &ContextName : ProfilesToBeRemoved) {
278     ProfileMap.erase(ContextName);
279   }
280 
281   // Make sure ProfileMap's key is consistent with FunctionSamples' name.
282   SampleContextTrimmer(ProfileMap).canonicalizeContextProfiles();
283 
284   LLVM_DEBUG(printProfileNames(ProfileMap, false));
285 }
286