1 //===-- CSPreInliner.cpp - Profile guided preinliner -------------- C++ -*-===// 2 // 3 // Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. 4 // See https://llvm.org/LICENSE.txt for license information. 5 // SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception 6 // 7 //===----------------------------------------------------------------------===// 8 9 #include "CSPreInliner.h" 10 #include "ProfiledBinary.h" 11 #include "llvm/ADT/SCCIterator.h" 12 #include "llvm/ADT/Statistic.h" 13 #include <cstdint> 14 #include <queue> 15 16 #define DEBUG_TYPE "cs-preinliner" 17 18 using namespace llvm; 19 using namespace sampleprof; 20 21 STATISTIC(PreInlNumCSInlined, 22 "Number of functions inlined with context sensitive profile"); 23 STATISTIC(PreInlNumCSNotInlined, 24 "Number of functions not inlined with context sensitive profile"); 25 STATISTIC(PreInlNumCSInlinedHitMinLimit, 26 "Number of functions with FDO inline stopped due to min size limit"); 27 STATISTIC(PreInlNumCSInlinedHitMaxLimit, 28 "Number of functions with FDO inline stopped due to max size limit"); 29 STATISTIC( 30 PreInlNumCSInlinedHitGrowthLimit, 31 "Number of functions with FDO inline stopped due to growth size limit"); 32 33 // The switches specify inline thresholds used in SampleProfileLoader inlining. 34 // TODO: the actual threshold to be tuned here because the size here is based 35 // on machine code not LLVM IR. 36 extern cl::opt<int> SampleHotCallSiteThreshold; 37 extern cl::opt<int> SampleColdCallSiteThreshold; 38 extern cl::opt<int> ProfileInlineGrowthLimit; 39 extern cl::opt<int> ProfileInlineLimitMin; 40 extern cl::opt<int> ProfileInlineLimitMax; 41 extern cl::opt<bool> SortProfiledSCC; 42 43 cl::opt<bool> EnableCSPreInliner( 44 "csspgo-preinliner", cl::Hidden, cl::init(true), 45 cl::desc("Run a global pre-inliner to merge context profile based on " 46 "estimated global top-down inline decisions")); 47 48 cl::opt<bool> UseContextCostForPreInliner( 49 "use-context-cost-for-preinliner", cl::Hidden, cl::init(true), 50 cl::desc("Use context-sensitive byte size cost for preinliner decisions")); 51 52 static cl::opt<bool> SamplePreInlineReplay( 53 "csspgo-replay-preinline", cl::Hidden, cl::init(false), 54 cl::desc( 55 "Replay previous inlining and adjust context profile accordingly")); 56 57 CSPreInliner::CSPreInliner(SampleProfileMap &Profiles, ProfiledBinary &Binary, 58 uint64_t HotThreshold, uint64_t ColdThreshold) 59 : UseContextCost(UseContextCostForPreInliner), 60 // TODO: Pass in a guid-to-name map in order for 61 // ContextTracker.getFuncNameFor to work, if `Profiles` can have md5 codes 62 // as their profile context. 63 ContextTracker(Profiles, nullptr), ProfileMap(Profiles), Binary(Binary), 64 HotCountThreshold(HotThreshold), ColdCountThreshold(ColdThreshold) { 65 // Set default preinliner hot/cold call site threshold tuned with CSSPGO. 66 // for good performance with reasonable profile size. 67 if (!SampleHotCallSiteThreshold.getNumOccurrences()) 68 SampleHotCallSiteThreshold = 1500; 69 if (!SampleColdCallSiteThreshold.getNumOccurrences()) 70 SampleColdCallSiteThreshold = 0; 71 } 72 73 std::vector<StringRef> CSPreInliner::buildTopDownOrder() { 74 std::vector<StringRef> Order; 75 ProfiledCallGraph ProfiledCG(ContextTracker); 76 77 // Now that we have a profiled call graph, construct top-down order 78 // by building up SCC and reversing SCC order. 79 scc_iterator<ProfiledCallGraph *> I = scc_begin(&ProfiledCG); 80 while (!I.isAtEnd()) { 81 auto Range = *I; 82 if (SortProfiledSCC) { 83 // Sort nodes in one SCC based on callsite hotness. 84 scc_member_iterator<ProfiledCallGraph *> SI(*I); 85 Range = *SI; 86 } 87 for (auto *Node : Range) { 88 if (Node != ProfiledCG.getEntryNode()) 89 Order.push_back(Node->Name); 90 } 91 ++I; 92 } 93 std::reverse(Order.begin(), Order.end()); 94 95 return Order; 96 } 97 98 bool CSPreInliner::getInlineCandidates(ProfiledCandidateQueue &CQueue, 99 const FunctionSamples *CallerSamples) { 100 assert(CallerSamples && "Expect non-null caller samples"); 101 102 // Ideally we want to consider everything a function calls, but as far as 103 // context profile is concerned, only those frames that are children of 104 // current one in the trie is relavent. So we walk the trie instead of call 105 // targets from function profile. 106 ContextTrieNode *CallerNode = 107 ContextTracker.getContextFor(CallerSamples->getContext()); 108 109 bool HasNewCandidate = false; 110 for (auto &Child : CallerNode->getAllChildContext()) { 111 ContextTrieNode *CalleeNode = &Child.second; 112 FunctionSamples *CalleeSamples = CalleeNode->getFunctionSamples(); 113 if (!CalleeSamples) 114 continue; 115 116 // Call site count is more reliable, so we look up the corresponding call 117 // target profile in caller's context profile to retrieve call site count. 118 uint64_t CalleeEntryCount = CalleeSamples->getEntrySamples(); 119 uint64_t CallsiteCount = 0; 120 LineLocation Callsite = CalleeNode->getCallSiteLoc(); 121 if (auto CallTargets = CallerSamples->findCallTargetMapAt(Callsite)) { 122 SampleRecord::CallTargetMap &TargetCounts = CallTargets.get(); 123 auto It = TargetCounts.find(CalleeSamples->getName()); 124 if (It != TargetCounts.end()) 125 CallsiteCount = It->second; 126 } 127 128 // TODO: call site and callee entry count should be mostly consistent, add 129 // check for that. 130 HasNewCandidate = true; 131 uint32_t CalleeSize = getFuncSize(*CalleeSamples); 132 CQueue.emplace(CalleeSamples, std::max(CallsiteCount, CalleeEntryCount), 133 CalleeSize); 134 } 135 136 return HasNewCandidate; 137 } 138 139 uint32_t CSPreInliner::getFuncSize(const FunctionSamples &FSamples) { 140 if (UseContextCost) { 141 return Binary.getFuncSizeForContext(FSamples.getContext()); 142 } 143 144 return FSamples.getBodySamples().size(); 145 } 146 147 bool CSPreInliner::shouldInline(ProfiledInlineCandidate &Candidate) { 148 // If replay inline is requested, simply follow the inline decision of the 149 // profiled binary. 150 if (SamplePreInlineReplay) 151 return Candidate.CalleeSamples->getContext().hasAttribute( 152 ContextWasInlined); 153 154 // Adjust threshold based on call site hotness, only do this for callsite 155 // prioritized inliner because otherwise cost-benefit check is done earlier. 156 unsigned int SampleThreshold = SampleColdCallSiteThreshold; 157 if (Candidate.CallsiteCount > HotCountThreshold) 158 SampleThreshold = SampleHotCallSiteThreshold; 159 160 // TODO: for small cold functions, we may inlined them and we need to keep 161 // context profile accordingly. 162 if (Candidate.CallsiteCount < ColdCountThreshold) 163 SampleThreshold = SampleColdCallSiteThreshold; 164 165 return (Candidate.SizeCost < SampleThreshold); 166 } 167 168 void CSPreInliner::processFunction(const StringRef Name) { 169 FunctionSamples *FSamples = ContextTracker.getBaseSamplesFor(Name); 170 if (!FSamples) 171 return; 172 173 unsigned FuncSize = getFuncSize(*FSamples); 174 unsigned FuncFinalSize = FuncSize; 175 unsigned SizeLimit = FuncSize * ProfileInlineGrowthLimit; 176 SizeLimit = std::min(SizeLimit, (unsigned)ProfileInlineLimitMax); 177 SizeLimit = std::max(SizeLimit, (unsigned)ProfileInlineLimitMin); 178 179 LLVM_DEBUG(dbgs() << "Process " << Name 180 << " for context-sensitive pre-inlining (pre-inline size: " 181 << FuncSize << ", size limit: " << SizeLimit << ")\n"); 182 183 ProfiledCandidateQueue CQueue; 184 getInlineCandidates(CQueue, FSamples); 185 186 while (!CQueue.empty() && FuncFinalSize < SizeLimit) { 187 ProfiledInlineCandidate Candidate = CQueue.top(); 188 CQueue.pop(); 189 bool ShouldInline = false; 190 if ((ShouldInline = shouldInline(Candidate))) { 191 // We mark context as inlined as the corresponding context profile 192 // won't be merged into that function's base profile. 193 ++PreInlNumCSInlined; 194 ContextTracker.markContextSamplesInlined(Candidate.CalleeSamples); 195 Candidate.CalleeSamples->getContext().setAttribute( 196 ContextShouldBeInlined); 197 FuncFinalSize += Candidate.SizeCost; 198 getInlineCandidates(CQueue, Candidate.CalleeSamples); 199 } else { 200 ++PreInlNumCSNotInlined; 201 } 202 LLVM_DEBUG(dbgs() << (ShouldInline ? " Inlined" : " Outlined") 203 << " context profile for: " 204 << Candidate.CalleeSamples->getContext().toString() 205 << " (callee size: " << Candidate.SizeCost 206 << ", call count:" << Candidate.CallsiteCount << ")\n"); 207 } 208 209 if (!CQueue.empty()) { 210 if (SizeLimit == (unsigned)ProfileInlineLimitMax) 211 ++PreInlNumCSInlinedHitMaxLimit; 212 else if (SizeLimit == (unsigned)ProfileInlineLimitMin) 213 ++PreInlNumCSInlinedHitMinLimit; 214 else 215 ++PreInlNumCSInlinedHitGrowthLimit; 216 } 217 218 LLVM_DEBUG({ 219 if (!CQueue.empty()) 220 dbgs() << " Inline candidates ignored due to size limit (inliner " 221 "original size: " 222 << FuncSize << ", inliner final size: " << FuncFinalSize 223 << ", size limit: " << SizeLimit << ")\n"; 224 225 while (!CQueue.empty()) { 226 ProfiledInlineCandidate Candidate = CQueue.top(); 227 CQueue.pop(); 228 bool WasInlined = 229 Candidate.CalleeSamples->getContext().hasAttribute(ContextWasInlined); 230 dbgs() << " " << Candidate.CalleeSamples->getContext().toString() 231 << " (candidate size:" << Candidate.SizeCost 232 << ", call count: " << Candidate.CallsiteCount << ", previously " 233 << (WasInlined ? "inlined)\n" : "not inlined)\n"); 234 } 235 }); 236 } 237 238 void CSPreInliner::run() { 239 #ifndef NDEBUG 240 auto printProfileNames = [](SampleProfileMap &Profiles, bool IsInput) { 241 dbgs() << (IsInput ? "Input" : "Output") << " context-sensitive profiles (" 242 << Profiles.size() << " total):\n"; 243 for (auto &It : Profiles) { 244 const FunctionSamples &Samples = It.second; 245 dbgs() << " [" << Samples.getContext().toString() << "] " 246 << Samples.getTotalSamples() << ":" << Samples.getHeadSamples() 247 << "\n"; 248 } 249 }; 250 #endif 251 252 LLVM_DEBUG(printProfileNames(ProfileMap, true)); 253 254 // Execute global pre-inliner to estimate a global top-down inline 255 // decision and merge profiles accordingly. This helps with profile 256 // merge for ThinLTO otherwise we won't be able to merge profiles back 257 // to base profile across module/thin-backend boundaries. 258 // It also helps better compress context profile to control profile 259 // size, as we now only need context profile for functions going to 260 // be inlined. 261 for (StringRef FuncName : buildTopDownOrder()) { 262 processFunction(FuncName); 263 } 264 265 // Not inlined context profiles are merged into its base, so we can 266 // trim out such profiles from the output. 267 std::vector<SampleContext> ProfilesToBeRemoved; 268 for (auto &It : ProfileMap) { 269 SampleContext &Context = It.second.getContext(); 270 if (!Context.isBaseContext() && !Context.hasState(InlinedContext)) { 271 assert(Context.hasState(MergedContext) && 272 "Not inlined context profile should be merged already"); 273 ProfilesToBeRemoved.push_back(It.first); 274 } 275 } 276 277 for (auto &ContextName : ProfilesToBeRemoved) { 278 ProfileMap.erase(ContextName); 279 } 280 281 // Make sure ProfileMap's key is consistent with FunctionSamples' name. 282 SampleContextTrimmer(ProfileMap).canonicalizeContextProfiles(); 283 284 LLVM_DEBUG(printProfileNames(ProfileMap, false)); 285 } 286