//===- PartialInlining.cpp - Inline parts of functions --------------------===//
//
//                     The LLVM Compiler Infrastructure
//
// This file is distributed under the University of Illinois Open Source
// License. See LICENSE.TXT for details.
//
//===----------------------------------------------------------------------===//
//
// This pass performs partial inlining, typically by inlining an if statement
// that surrounds the body of the function.
//
//===----------------------------------------------------------------------===//

#include "llvm/Transforms/IPO/PartialInlining.h"
#include "llvm/ADT/DenseMap.h"
#include "llvm/ADT/DenseSet.h"
#include "llvm/ADT/None.h"
#include "llvm/ADT/Optional.h"
#include "llvm/ADT/STLExtras.h"
#include "llvm/ADT/SmallVector.h"
#include "llvm/ADT/Statistic.h"
#include "llvm/Analysis/BlockFrequencyInfo.h"
#include "llvm/Analysis/BranchProbabilityInfo.h"
#include "llvm/Analysis/InlineCost.h"
#include "llvm/Analysis/LoopInfo.h"
#include "llvm/Analysis/OptimizationRemarkEmitter.h"
#include "llvm/Analysis/ProfileSummaryInfo.h"
#include "llvm/Analysis/TargetTransformInfo.h"
#include "llvm/IR/Attributes.h"
#include "llvm/IR/BasicBlock.h"
#include "llvm/IR/CFG.h"
#include "llvm/IR/CallSite.h"
#include "llvm/IR/DebugLoc.h"
#include "llvm/IR/DiagnosticInfo.h"
#include "llvm/IR/Dominators.h"
#include "llvm/IR/Function.h"
#include "llvm/IR/InstrTypes.h"
#include "llvm/IR/Instruction.h"
#include "llvm/IR/Instructions.h"
#include "llvm/IR/IntrinsicInst.h"
#include "llvm/IR/Intrinsics.h"
#include "llvm/IR/Module.h"
#include "llvm/IR/User.h"
#include "llvm/Pass.h"
#include "llvm/Support/BlockFrequency.h"
#include "llvm/Support/BranchProbability.h"
#include "llvm/Support/Casting.h"
#include "llvm/Support/CommandLine.h"
#include "llvm/Support/ErrorHandling.h"
#include "llvm/Transforms/IPO.h"
#include "llvm/Transforms/Utils/Cloning.h"
#include "llvm/Transforms/Utils/CodeExtractor.h"
#include "llvm/Transforms/Utils/ValueMapper.h"
#include <algorithm>
#include <cassert>
#include <cstdint>
#include <functional>
#include <iterator>
#include <memory>
#include <tuple>
#include <vector>

using namespace llvm;

#define DEBUG_TYPE "partial-inlining"

STATISTIC(NumPartialInlined,
          "Number of callsites functions were partially inlined into.");

// Command line option to disable partial-inlining. The default is false:
static cl::opt<bool>
    DisablePartialInlining("disable-partial-inlining", cl::init(false),
                           cl::Hidden, cl::desc("Disable partial inlining"));

// This is an option used by testing:
static cl::opt<bool> SkipCostAnalysis("skip-partial-inlining-cost-analysis",
                                      cl::init(false), cl::ZeroOrMore,
                                      cl::ReallyHidden,
                                      cl::desc("Skip Cost Analysis"));

static cl::opt<unsigned> MaxNumInlineBlocks(
    "max-num-inline-blocks", cl::init(5), cl::Hidden,
    cl::desc("Max number of blocks to be partially inlined"));

// Command line option to set the maximum number of partial inlining allowed
// for the module. The default value of -1 means no limit.
static cl::opt<int> MaxNumPartialInlining(
    "max-partial-inlining", cl::init(-1), cl::Hidden, cl::ZeroOrMore,
    cl::desc("Max number of partial inlining. The default is unlimited"));

// Used only when PGO or user annotated branch data is absent. It is
// the lower bound used to weight the outline region. If BFI
// produces a larger value, the BFI value will be used.
static cl::opt<int>
    OutlineRegionFreqPercent("outline-region-freq-percent", cl::init(75),
                             cl::Hidden, cl::ZeroOrMore,
                             cl::desc("Relative frequency of outline region to "
                                      "the entry block"));

static cl::opt<unsigned> ExtraOutliningPenalty(
    "partial-inlining-extra-penalty", cl::init(0), cl::Hidden,
    cl::desc("A debug option to add additional penalty to the computed one."));
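// As a usage sketch (illustrative, not from the original source): with the
// flag definitions above, the pass can be exercised in isolation through the
// legacy pass manager, e.g.
//   opt -partial-inliner -max-num-inline-blocks=3 -S input.ll
// The testing-only flags (e.g. -skip-partial-inlining-cost-analysis) are
// hidden and only show up under -help-hidden.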
namespace {

struct FunctionOutliningInfo {
  FunctionOutliningInfo() = default;

  // Returns the number of blocks to be inlined, including all blocks
  // in Entries and one return block.
  unsigned GetNumInlinedBlocks() const { return Entries.size() + 1; }

  // A set of blocks, including the function entry, that guard
  // the region to be outlined.
  SmallVector<BasicBlock *, 4> Entries;

  // The return block that is not included in the outlined region.
  BasicBlock *ReturnBlock = nullptr;

  // The dominating block of the region to be outlined.
  BasicBlock *NonReturnBlock = nullptr;

  // The set of blocks in Entries that are predecessors to ReturnBlock.
  SmallVector<BasicBlock *, 4> ReturnBlockPreds;
};
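// For illustration only (block names are made up): for a function of the
// canonical shape
//
//   entry:                                       ; Entries[0]
//     br i1 %cond, label %if.then, label %return
//   if.then:                                     ; NonReturnBlock
//     ...                                        ; region to be outlined
//     br label %return
//   return:                                      ; ReturnBlock
//     ret ...
//
// the Entries blocks and ReturnBlock remain in the caller after partial
// inlining, while the region dominated by NonReturnBlock is extracted into
// a separate function.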
struct PartialInlinerImpl {
  PartialInlinerImpl(
      std::function<AssumptionCache &(Function &)> *GetAC,
      std::function<TargetTransformInfo &(Function &)> *GTTI,
      Optional<function_ref<BlockFrequencyInfo &(Function &)>> GBFI,
      ProfileSummaryInfo *ProfSI)
      : GetAssumptionCache(GetAC), GetTTI(GTTI), GetBFI(GBFI), PSI(ProfSI) {}

  bool run(Module &M);
  Function *unswitchFunction(Function *F);

  // This class speculatively clones the function to be partially inlined.
  // At the end of partial inlining, the remaining callsites to the cloned
  // function that are not partially inlined will be fixed up to reference
  // the original function, and the cloned function will be erased.
  struct FunctionCloner {
    FunctionCloner(Function *F, FunctionOutliningInfo *OI);
    ~FunctionCloner();

    // Prepare for function outlining: make sure there is only
    // one incoming edge from the extracted/outlined region to
    // the return block.
    void NormalizeReturnBlock();

    // Do function outlining.
    // NOTE: For vararg functions that do the vararg handling in the outlined
    //       function, we temporarily generate IR that does not properly
    //       forward varargs to the outlined function. Calling InlineFunction
    //       will update calls to the outlined functions to properly forward
    //       the varargs.
    Function *doFunctionOutlining();

    Function *OrigFunc = nullptr;
    Function *ClonedFunc = nullptr;
    Function *OutlinedFunc = nullptr;
    BasicBlock *OutliningCallBB = nullptr;

    // ClonedFunc is inlined in one of its callers after function
    // outlining.
    bool IsFunctionInlined = false;

    // The cost of the region to be outlined.
    int OutlinedRegionCost = 0;

    std::unique_ptr<FunctionOutliningInfo> ClonedOI = nullptr;
    std::unique_ptr<BlockFrequencyInfo> ClonedFuncBFI = nullptr;
  };

private:
  int NumPartialInlining = 0;
  std::function<AssumptionCache &(Function &)> *GetAssumptionCache;
  std::function<TargetTransformInfo &(Function &)> *GetTTI;
  Optional<function_ref<BlockFrequencyInfo &(Function &)>> GetBFI;
  ProfileSummaryInfo *PSI;

  // Return the frequency of the OutliningCallBB relative to F's entry point.
  // The result is no larger than 1 and is represented as a BranchProbability.
  // (Note that the outlined region's 'head' block can only have incoming
  // edges from the guarding entry blocks.)
  BranchProbability getOutliningCallBBRelativeFreq(FunctionCloner &Cloner);

  // Return true if the callee of CS should be partially inlined with
  // profit.
  bool shouldPartialInline(CallSite CS, FunctionCloner &Cloner,
                           BlockFrequency WeightedOutliningRcost,
                           OptimizationRemarkEmitter &ORE);

  // Try to inline DuplicateFunction (cloned from F, with a call to
  // the OutlinedFunction) into its callers. Return true
  // if there is any successful inlining.
  bool tryPartialInline(FunctionCloner &Cloner);

  // Compute the mapping from a use site of DuplicateFunction to the enclosing
  // BB's profile count.
  void computeCallsiteToProfCountMap(Function *DuplicateFunction,
                                     DenseMap<User *, uint64_t> &SiteCountMap);

  bool IsLimitReached() {
    return (MaxNumPartialInlining != -1 &&
            NumPartialInlining >= MaxNumPartialInlining);
  }

  static CallSite getCallSite(User *U) {
    CallSite CS;
    if (CallInst *CI = dyn_cast<CallInst>(U))
      CS = CallSite(CI);
    else if (InvokeInst *II = dyn_cast<InvokeInst>(U))
      CS = CallSite(II);
    else
      llvm_unreachable("All uses must be calls");
    return CS;
  }

  static CallSite getOneCallSiteTo(Function *F) {
    User *User = *F->user_begin();
    return getCallSite(User);
  }

  std::tuple<DebugLoc, BasicBlock *> getOneDebugLoc(Function *F) {
    CallSite CS = getOneCallSiteTo(F);
    DebugLoc DLoc = CS.getInstruction()->getDebugLoc();
    BasicBlock *Block = CS.getParent();
    return std::make_tuple(DLoc, Block);
  }

  // Returns the costs associated with function outlining:
  // - The first value is the estimated size of the new call sequence in
  //   basic block Cloner.OutliningCallBB;
  // - The second value is the non-weighted runtime cost for making the call
  //   to the outlined function, including the additional setup cost in the
  //   outlined function itself.
  std::tuple<int, int> computeOutliningCosts(FunctionCloner &Cloner);

  // Compute the 'InlineCost' of block BB. InlineCost is a proxy used to
  // approximate both the size and runtime cost of the BB (note that the
  // current inline cost analysis does not draw a clear distinction either).
  static int computeBBInlineCost(BasicBlock *BB);

  std::unique_ptr<FunctionOutliningInfo> computeOutliningInfo(Function *F);
};

struct PartialInlinerLegacyPass : public ModulePass {
  static char ID; // Pass identification, replacement for typeid

  PartialInlinerLegacyPass() : ModulePass(ID) {
    initializePartialInlinerLegacyPassPass(*PassRegistry::getPassRegistry());
  }

  void getAnalysisUsage(AnalysisUsage &AU) const override {
    AU.addRequired<AssumptionCacheTracker>();
    AU.addRequired<ProfileSummaryInfoWrapperPass>();
    AU.addRequired<TargetTransformInfoWrapperPass>();
  }

  bool runOnModule(Module &M) override {
    if (skipModule(M))
      return false;

    AssumptionCacheTracker *ACT = &getAnalysis<AssumptionCacheTracker>();
    TargetTransformInfoWrapperPass *TTIWP =
        &getAnalysis<TargetTransformInfoWrapperPass>();
    ProfileSummaryInfo *PSI =
        getAnalysis<ProfileSummaryInfoWrapperPass>().getPSI();

    std::function<AssumptionCache &(Function &)> GetAssumptionCache =
        [&ACT](Function &F) -> AssumptionCache & {
      return ACT->getAssumptionCache(F);
    };

    std::function<TargetTransformInfo &(Function &)> GetTTI =
        [&TTIWP](Function &F) -> TargetTransformInfo & {
      return TTIWP->getTTI(F);
    };

    return PartialInlinerImpl(&GetAssumptionCache, &GetTTI, None, PSI).run(M);
  }
};

} // end anonymous namespace
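// Illustrative sketch (not from the original source) of the two CFG shapes
// computeOutliningInfo below recognizes while walking the entry blocks:
//
//  (a) entry guards the region directly:    (b) triangle; keep walking:
//
//        CurrEntry                                CurrEntry
//        /       \                                /       \
//  ReturnBlock   NonReturnBlock             OtherSucc --> CommSucc
//
// In shape (a) the candidate is found and the walk stops; in shape (b)
// CurrEntry is added to Entries and the walk continues from OtherSucc.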
std::unique_ptr<FunctionOutliningInfo>
PartialInlinerImpl::computeOutliningInfo(Function *F) {
  BasicBlock *EntryBlock = &F->front();
  BranchInst *BR = dyn_cast<BranchInst>(EntryBlock->getTerminator());
  if (!BR || BR->isUnconditional())
    return std::unique_ptr<FunctionOutliningInfo>();

  // Returns true if Succ is BB's successor.
  auto IsSuccessor = [](BasicBlock *Succ, BasicBlock *BB) {
    return is_contained(successors(BB), Succ);
  };

  auto SuccSize = [](BasicBlock *BB) {
    return std::distance(succ_begin(BB), succ_end(BB));
  };

  auto IsReturnBlock = [](BasicBlock *BB) {
    TerminatorInst *TI = BB->getTerminator();
    return isa<ReturnInst>(TI);
  };

  auto GetReturnBlock = [&](BasicBlock *Succ1, BasicBlock *Succ2) {
    if (IsReturnBlock(Succ1))
      return std::make_tuple(Succ1, Succ2);
    if (IsReturnBlock(Succ2))
      return std::make_tuple(Succ2, Succ1);

    return std::make_tuple<BasicBlock *, BasicBlock *>(nullptr, nullptr);
  };

  // Detect a triangular shape:
  auto GetCommonSucc = [&](BasicBlock *Succ1, BasicBlock *Succ2) {
    if (IsSuccessor(Succ1, Succ2))
      return std::make_tuple(Succ1, Succ2);
    if (IsSuccessor(Succ2, Succ1))
      return std::make_tuple(Succ2, Succ1);

    return std::make_tuple<BasicBlock *, BasicBlock *>(nullptr, nullptr);
  };

  std::unique_ptr<FunctionOutliningInfo> OutliningInfo =
      llvm::make_unique<FunctionOutliningInfo>();

  BasicBlock *CurrEntry = EntryBlock;
  bool CandidateFound = false;
  do {
    // The number of blocks to be inlined has already reached
    // the limit. When MaxNumInlineBlocks is set to 0 or 1, this
    // disables partial inlining for the function.
    if (OutliningInfo->GetNumInlinedBlocks() >= MaxNumInlineBlocks)
      break;

    if (SuccSize(CurrEntry) != 2)
      break;

    BasicBlock *Succ1 = *succ_begin(CurrEntry);
    BasicBlock *Succ2 = *(succ_begin(CurrEntry) + 1);

    BasicBlock *ReturnBlock, *NonReturnBlock;
    std::tie(ReturnBlock, NonReturnBlock) = GetReturnBlock(Succ1, Succ2);

    if (ReturnBlock) {
      OutliningInfo->Entries.push_back(CurrEntry);
      OutliningInfo->ReturnBlock = ReturnBlock;
      OutliningInfo->NonReturnBlock = NonReturnBlock;
      CandidateFound = true;
      break;
    }

    BasicBlock *CommSucc;
    BasicBlock *OtherSucc;
    std::tie(CommSucc, OtherSucc) = GetCommonSucc(Succ1, Succ2);

    if (!CommSucc)
      break;

    OutliningInfo->Entries.push_back(CurrEntry);
    CurrEntry = OtherSucc;
  } while (true);

  if (!CandidateFound)
    return std::unique_ptr<FunctionOutliningInfo>();

  // Do a sanity check on the entries: there should not
  // be any successors (not in the entry set) other than
  // {ReturnBlock, NonReturnBlock}.
  assert(OutliningInfo->Entries[0] == &F->front() &&
         "Function Entry must be the first in Entries vector");
  DenseSet<BasicBlock *> Entries;
  for (BasicBlock *E : OutliningInfo->Entries)
    Entries.insert(E);

  // Returns true if BB has a predecessor that is not
  // in the Entries set.
  auto HasNonEntryPred = [Entries](BasicBlock *BB) {
    for (auto Pred : predecessors(BB)) {
      if (!Entries.count(Pred))
        return true;
    }
    return false;
  };
  auto CheckAndNormalizeCandidate =
      [Entries, HasNonEntryPred](FunctionOutliningInfo *OutliningInfo) {
        for (BasicBlock *E : OutliningInfo->Entries) {
          for (auto Succ : successors(E)) {
            if (Entries.count(Succ))
              continue;
            if (Succ == OutliningInfo->ReturnBlock)
              OutliningInfo->ReturnBlockPreds.push_back(E);
            else if (Succ != OutliningInfo->NonReturnBlock)
              return false;
          }
          // There should not be any outside incoming edges either:
          if (HasNonEntryPred(E))
            return false;
        }
        return true;
      };

  if (!CheckAndNormalizeCandidate(OutliningInfo.get()))
    return std::unique_ptr<FunctionOutliningInfo>();

  // Now further grow the candidate's inlining region by
  // peeling off dominating blocks from the outlining region:
  while (OutliningInfo->GetNumInlinedBlocks() < MaxNumInlineBlocks) {
    BasicBlock *Cand = OutliningInfo->NonReturnBlock;
    if (SuccSize(Cand) != 2)
      break;

    if (HasNonEntryPred(Cand))
      break;

    BasicBlock *Succ1 = *succ_begin(Cand);
    BasicBlock *Succ2 = *(succ_begin(Cand) + 1);

    BasicBlock *ReturnBlock, *NonReturnBlock;
    std::tie(ReturnBlock, NonReturnBlock) = GetReturnBlock(Succ1, Succ2);
    if (!ReturnBlock || ReturnBlock != OutliningInfo->ReturnBlock)
      break;

    if (NonReturnBlock->getSinglePredecessor() != Cand)
      break;

    // Now grow and update OutliningInfo:
    OutliningInfo->Entries.push_back(Cand);
    OutliningInfo->NonReturnBlock = NonReturnBlock;
    OutliningInfo->ReturnBlockPreds.push_back(Cand);
    Entries.insert(Cand);
  }

  return OutliningInfo;
}

// Check if there is PGO data or user annotated branch data:
static bool hasProfileData(Function *F, FunctionOutliningInfo *OI) {
  if (F->getEntryCount())
    return true;
  // Now check if any of the entry blocks has MD_prof data:
  for (auto *E : OI->Entries) {
    BranchInst *BR = dyn_cast<BranchInst>(E->getTerminator());
    if (!BR || BR->isUnconditional())
      continue;
    // Use distinct names so the Function parameter F is not shadowed.
    uint64_t TrueWeight, FalseWeight;
    if (BR->extractProfMetadata(TrueWeight, FalseWeight))
      return true;
  }
  return false;
}
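// A worked example for the clamping below (numbers are illustrative): with
// no profile data, if static prediction estimates the outlined region at
// 50% of the entry frequency and -outline-region-freq-percent is left at
// its default of 75, the frequency used is max(50%, 75%) = 75%, which makes
// the cost of the outlined call harder to underestimate. A 30% estimate
// falls below the 45% threshold and is returned unchanged.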
BranchProbability
PartialInlinerImpl::getOutliningCallBBRelativeFreq(FunctionCloner &Cloner) {
  auto EntryFreq =
      Cloner.ClonedFuncBFI->getBlockFreq(&Cloner.ClonedFunc->getEntryBlock());
  auto OutliningCallFreq =
      Cloner.ClonedFuncBFI->getBlockFreq(Cloner.OutliningCallBB);

  auto OutlineRegionRelFreq =
      BranchProbability::getBranchProbability(OutliningCallFreq.getFrequency(),
                                              EntryFreq.getFrequency());

  if (hasProfileData(Cloner.OrigFunc, Cloner.ClonedOI.get()))
    return OutlineRegionRelFreq;

  // When profile data is not available, we need to be conservative in
  // estimating the overall savings. Static branch prediction can usually
  // guess the branch direction right (taken/non-taken), but the guessed
  // branch probability is usually not biased enough. When the
  // outlined region is predicted to be likely, its probability needs
  // to be made higher (more biased) to not under-estimate the cost of
  // function outlining. On the other hand, if the outlined region
  // is predicted to be less likely, the predicted probability is usually
  // higher than the actual. For instance, the actual probability of the
  // less likely target is only 5%, but the guessed probability can be
  // 40%. In the latter case, there is no need for further adjustment.
  // FIXME: add an option for this.
  if (OutlineRegionRelFreq < BranchProbability(45, 100))
    return OutlineRegionRelFreq;

  OutlineRegionRelFreq = std::max(
      OutlineRegionRelFreq, BranchProbability(OutlineRegionFreqPercent, 100));

  return OutlineRegionRelFreq;
}
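// The profitability test below boils down to one comparison (a sketch, using
// the names defined in this file): partial inlining of a callsite pays off
// only when
//
//   NormWeightedSavings >= WeightedOutliningRcost
//
// where NormWeightedSavings is the callsite cost that inlining eliminates,
// and WeightedOutliningRcost is the runtime overhead of the outlined call
// scaled by the outline region's relative frequency computed above.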
bool PartialInlinerImpl::shouldPartialInline(
    CallSite CS, FunctionCloner &Cloner, BlockFrequency WeightedOutliningRcost,
    OptimizationRemarkEmitter &ORE) {
  using namespace ore;

  if (SkipCostAnalysis)
    return true;

  Instruction *Call = CS.getInstruction();
  Function *Callee = CS.getCalledFunction();
  assert(Callee == Cloner.ClonedFunc);

  Function *Caller = CS.getCaller();
  auto &CalleeTTI = (*GetTTI)(*Callee);
  InlineCost IC = getInlineCost(CS, getInlineParams(), CalleeTTI,
                                *GetAssumptionCache, GetBFI, PSI, &ORE);

  if (IC.isAlways()) {
    ORE.emit([&]() {
      return OptimizationRemarkAnalysis(DEBUG_TYPE, "AlwaysInline", Call)
             << NV("Callee", Cloner.OrigFunc)
             << " should always be fully inlined, not partially";
    });
    return false;
  }

  if (IC.isNever()) {
    ORE.emit([&]() {
      return OptimizationRemarkMissed(DEBUG_TYPE, "NeverInline", Call)
             << NV("Callee", Cloner.OrigFunc) << " not partially inlined into "
             << NV("Caller", Caller)
             << " because it should never be inlined (cost=never)";
    });
    return false;
  }

  if (!IC) {
    ORE.emit([&]() {
      return OptimizationRemarkAnalysis(DEBUG_TYPE, "TooCostly", Call)
             << NV("Callee", Cloner.OrigFunc) << " not partially inlined into "
             << NV("Caller", Caller) << " because too costly to inline (cost="
             << NV("Cost", IC.getCost()) << ", threshold="
             << NV("Threshold", IC.getCostDelta() + IC.getCost()) << ")";
    });
    return false;
  }
  const DataLayout &DL = Caller->getParent()->getDataLayout();

  // The savings of eliminating the call:
  int NonWeightedSavings = getCallsiteCost(CS, DL);
  BlockFrequency NormWeightedSavings(NonWeightedSavings);

  // If the weighted savings is smaller than the weighted cost, return false.
  if (NormWeightedSavings < WeightedOutliningRcost) {
    ORE.emit([&]() {
      return OptimizationRemarkAnalysis(DEBUG_TYPE, "OutliningCallcostTooHigh",
                                        Call)
             << NV("Callee", Cloner.OrigFunc) << " not partially inlined into "
             << NV("Caller", Caller)
             << " because the runtime overhead (overhead="
             << NV("Overhead", (unsigned)WeightedOutliningRcost.getFrequency())
             << ", savings="
             << NV("Savings", (unsigned)NormWeightedSavings.getFrequency())
             << ")"
             << " of making the outlined call is too high";
    });

    return false;
  }

  ORE.emit([&]() {
    return OptimizationRemarkAnalysis(DEBUG_TYPE, "CanBePartiallyInlined", Call)
           << NV("Callee", Cloner.OrigFunc) << " can be partially inlined into "
           << NV("Caller", Caller) << " with cost=" << NV("Cost", IC.getCost())
           << " (threshold="
           << NV("Threshold", IC.getCostDelta() + IC.getCost()) << ")";
  });
  return true;
}
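// An illustrative tally for the cost model below (assuming the default
// InlineConstants::InstrCost of 5): a block containing a bitcast (free), a
// getelementptr with all-zero indices (free), two adds (2 * 5), and a switch
// with three cases ((3 + 1) * 5) would get an InlineCost of 30, plus
// getCallsiteCost(...) for any call or invoke it contains.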
// TODO: Ideally we should share Inliner's InlineCost Analysis code.
// For now use a simplified version. The returned 'InlineCost' will be used
// to estimate the size cost as well as runtime cost of the BB.
int PartialInlinerImpl::computeBBInlineCost(BasicBlock *BB) {
  int InlineCost = 0;
  const DataLayout &DL = BB->getParent()->getParent()->getDataLayout();
  for (BasicBlock::iterator I = BB->begin(), E = BB->end(); I != E; ++I) {
    if (isa<DbgInfoIntrinsic>(I))
      continue;

    switch (I->getOpcode()) {
    case Instruction::BitCast:
    case Instruction::PtrToInt:
    case Instruction::IntToPtr:
    case Instruction::Alloca:
      continue;
    case Instruction::GetElementPtr:
      if (cast<GetElementPtrInst>(I)->hasAllZeroIndices())
        continue;
      break;
    default:
      break;
    }

    IntrinsicInst *IntrInst = dyn_cast<IntrinsicInst>(I);
    if (IntrInst) {
      if (IntrInst->getIntrinsicID() == Intrinsic::lifetime_start ||
          IntrInst->getIntrinsicID() == Intrinsic::lifetime_end)
        continue;
    }

    if (CallInst *CI = dyn_cast<CallInst>(I)) {
      InlineCost += getCallsiteCost(CallSite(CI), DL);
      continue;
    }

    if (InvokeInst *II = dyn_cast<InvokeInst>(I)) {
      InlineCost += getCallsiteCost(CallSite(II), DL);
      continue;
    }

    if (SwitchInst *SI = dyn_cast<SwitchInst>(I)) {
      InlineCost += (SI->getNumCases() + 1) * InlineConstants::InstrCost;
      continue;
    }
    InlineCost += InlineConstants::InstrCost;
  }
  return InlineCost;
}
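// A worked example for the cost computation below (illustrative numbers):
// if the call sequence in OutliningCallBB costs 25, the extracted function
// body costs 120 after the adjustment for the extractor's extra branches,
// and the original outlined region cost was 100, the returned values are
// SizeCost = 25 and OutliningRuntimeOverhead = 25 + (120 - 100) + 0 = 45
// (with -partial-inlining-extra-penalty left at its default of 0).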
std::tuple<int, int>
PartialInlinerImpl::computeOutliningCosts(FunctionCloner &Cloner) {
  // First compute the cost of the call sequence to the outlined function
  // 'OutlinedFunction' in BB 'OutliningCallBB':
  int OutliningFuncCallCost = computeBBInlineCost(Cloner.OutliningCallBB);

  // Now compute the cost of the extracted/outlined function itself:
  int OutlinedFunctionCost = 0;
  for (BasicBlock &BB : *Cloner.OutlinedFunc) {
    OutlinedFunctionCost += computeBBInlineCost(&BB);
  }

  assert(OutlinedFunctionCost >= Cloner.OutlinedRegionCost &&
         "Outlined function cost should be no less than the outlined region");
  // The code extractor introduces a new root and exit stub blocks with
  // additional unconditional branches. Those branches will be eliminated
  // later with bb layout. The cost should be adjusted accordingly:
  OutlinedFunctionCost -= 2 * InlineConstants::InstrCost;

  int OutliningRuntimeOverhead =
      OutliningFuncCallCost +
      (OutlinedFunctionCost - Cloner.OutlinedRegionCost) +
      ExtraOutliningPenalty;

  return std::make_tuple(OutliningFuncCallCost, OutliningRuntimeOverhead);
}

// Create the callsite to profile count map, which is
// used to update the original function's entry count
// after the function is partially inlined into the callsite.
void PartialInlinerImpl::computeCallsiteToProfCountMap(
    Function *DuplicateFunction,
    DenseMap<User *, uint64_t> &CallSiteToProfCountMap) {
  std::vector<User *> Users(DuplicateFunction->user_begin(),
                            DuplicateFunction->user_end());
  Function *CurrentCaller = nullptr;
  std::unique_ptr<BlockFrequencyInfo> TempBFI;
  BlockFrequencyInfo *CurrentCallerBFI = nullptr;

  auto ComputeCurrBFI = [&, this](Function *Caller) {
    // For the old pass manager:
    if (!GetBFI) {
      DominatorTree DT(*Caller);
      LoopInfo LI(DT);
      BranchProbabilityInfo BPI(*Caller, LI);
      TempBFI.reset(new BlockFrequencyInfo(*Caller, BPI, LI));
      CurrentCallerBFI = TempBFI.get();
    } else {
      // New pass manager:
      CurrentCallerBFI = &(*GetBFI)(*Caller);
    }
  };

  for (User *User : Users) {
    CallSite CS = getCallSite(User);
    Function *Caller = CS.getCaller();
    if (CurrentCaller != Caller) {
      CurrentCaller = Caller;
      ComputeCurrBFI(Caller);
    } else {
      assert(CurrentCallerBFI && "CallerBFI is not set");
    }
    BasicBlock *CallBB = CS.getInstruction()->getParent();
    auto Count = CurrentCallerBFI->getBlockProfileCount(CallBB);
    if (Count)
      CallSiteToProfCountMap[User] = *Count;
    else
      CallSiteToProfCountMap[User] = 0;
  }
}

PartialInlinerImpl::FunctionCloner::FunctionCloner(Function *F,
                                                   FunctionOutliningInfo *OI)
    : OrigFunc(F) {
  ClonedOI = llvm::make_unique<FunctionOutliningInfo>();

  // Clone the function, so that we can hack away on it.
  ValueToValueMapTy VMap;
  ClonedFunc = CloneFunction(F, VMap);

  ClonedOI->ReturnBlock = cast<BasicBlock>(VMap[OI->ReturnBlock]);
  ClonedOI->NonReturnBlock = cast<BasicBlock>(VMap[OI->NonReturnBlock]);
  for (BasicBlock *BB : OI->Entries) {
    ClonedOI->Entries.push_back(cast<BasicBlock>(VMap[BB]));
  }
  for (BasicBlock *E : OI->ReturnBlockPreds) {
    BasicBlock *NewE = cast<BasicBlock>(VMap[E]);
    ClonedOI->ReturnBlockPreds.push_back(NewE);
  }
  // Go ahead and update all uses to the duplicate, so that we can just
  // use the inliner functionality when we're done hacking.
  F->replaceAllUsesWith(ClonedFunc);
}
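// Illustrative before/after sketch (made-up values) for the PHI splitting
// performed by NormalizeReturnBlock. With one guard entry %entry and two
// extracted predecessors %a and %b, a return-block PHI such as
//
//   %r = phi i32 [ 0, %entry ], [ %x, %a ], [ %y, %b ]
//
// is split into a PHI that stays in the extracted region and a new PHI in
// the split-off return block, with one incoming value per guard entry plus
// one for the extracted region:
//
//   preturn:                            ; extracted
//     %r.old = phi i32 [ %x, %a ], [ %y, %b ]
//     br label %return
//   return:
//     %r = phi i32 [ %r.old, %preturn ], [ 0, %entry ]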
void PartialInlinerImpl::FunctionCloner::NormalizeReturnBlock() {
  // PHI nodes are always grouped at the top of a block, so it suffices to
  // check whether the first instruction is a PHI.
  auto getFirstPHI = [](BasicBlock *BB) {
    return dyn_cast<PHINode>(&BB->front());
  };

  // Special hackery is needed with PHI nodes that have inputs from more than
  // one extracted block. For simplicity, just split the PHIs into a two-level
  // sequence of PHIs, some of which will go in the extracted region, and some
  // of which will go outside.
  BasicBlock *PreReturn = ClonedOI->ReturnBlock;
  // Only split the block when necessary:
  PHINode *FirstPhi = getFirstPHI(PreReturn);
  unsigned NumPredsFromEntries = ClonedOI->ReturnBlockPreds.size();

  if (!FirstPhi || FirstPhi->getNumIncomingValues() <= NumPredsFromEntries + 1)
    return;

  auto IsTrivialPhi = [](PHINode *PN) -> Value * {
    Value *CommonValue = PN->getIncomingValue(0);
    if (all_of(PN->incoming_values(),
               [&](Value *V) { return V == CommonValue; }))
      return CommonValue;
    return nullptr;
  };

  ClonedOI->ReturnBlock = ClonedOI->ReturnBlock->splitBasicBlock(
      ClonedOI->ReturnBlock->getFirstNonPHI()->getIterator());
  BasicBlock::iterator I = PreReturn->begin();
  Instruction *Ins = &ClonedOI->ReturnBlock->front();
  SmallVector<Instruction *, 4> DeadPhis;
  while (I != PreReturn->end()) {
    PHINode *OldPhi = dyn_cast<PHINode>(I);
    if (!OldPhi)
      break;

    PHINode *RetPhi =
        PHINode::Create(OldPhi->getType(), NumPredsFromEntries + 1, "", Ins);
    OldPhi->replaceAllUsesWith(RetPhi);
    Ins = ClonedOI->ReturnBlock->getFirstNonPHI();

    RetPhi->addIncoming(&*I, PreReturn);
    for (BasicBlock *E : ClonedOI->ReturnBlockPreds) {
      RetPhi->addIncoming(OldPhi->getIncomingValueForBlock(E), E);
      OldPhi->removeIncomingValue(E);
    }

    // After splitting the incoming values, the old phi may become trivial.
    // Keeping the trivial phi would introduce a definition inside the
    // outlined region that is live-out, causing unnecessary overhead
    // (loads, stores, argument passing, etc.).
    if (auto *OldPhiVal = IsTrivialPhi(OldPhi)) {
      OldPhi->replaceAllUsesWith(OldPhiVal);
      DeadPhis.push_back(OldPhi);
    }
    ++I;
  }
  for (auto *DP : DeadPhis)
    DP->eraseFromParent();

  for (auto E : ClonedOI->ReturnBlockPreds) {
    E->getTerminator()->replaceUsesOfWith(PreReturn, ClonedOI->ReturnBlock);
  }
}
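// Illustrative sketch (block and function names are made up; the actual
// names come from CodeExtractor) of what doFunctionOutlining produces for
// the canonical shape: the cloned function is reduced to its guard blocks
// plus a block containing a single call to the new outlined function:
//
//   define internal void @foo.outlined(...) { ... }  ; extracted region
//
//   ; inside the cloned function:
//   entry:
//     br i1 %cond, label %codeRepl, label %return
//   codeRepl:                                        ; OutliningCallBB
//     call void @foo.outlined(...)
//     br label %return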
Function *PartialInlinerImpl::FunctionCloner::doFunctionOutlining() {
  // Returns true if the block is to be partially inlined into the caller
  // (i.e. not to be extracted into the out-of-line function).
  auto ToBeInlined = [&, this](BasicBlock *BB) {
    return BB == ClonedOI->ReturnBlock || is_contained(ClonedOI->Entries, BB);
  };

  // Gather up the blocks that we're going to extract.
  std::vector<BasicBlock *> ToExtract;
  ToExtract.push_back(ClonedOI->NonReturnBlock);
  OutlinedRegionCost +=
      PartialInlinerImpl::computeBBInlineCost(ClonedOI->NonReturnBlock);
  for (BasicBlock &BB : *ClonedFunc)
    if (!ToBeInlined(&BB) && &BB != ClonedOI->NonReturnBlock) {
      ToExtract.push_back(&BB);
      // FIXME: the code extractor may hoist/sink more code
      // into the outlined function which may make the outlining
      // overhead (the difference of the outlined function cost
      // and OutliningRegionCost) look larger.
      OutlinedRegionCost += computeBBInlineCost(&BB);
    }

  // The CodeExtractor needs a dominator tree.
  DominatorTree DT;
  DT.recalculate(*ClonedFunc);

  // Manually calculate a BlockFrequencyInfo and BranchProbabilityInfo.
  LoopInfo LI(DT);
  BranchProbabilityInfo BPI(*ClonedFunc, LI);
  ClonedFuncBFI.reset(new BlockFrequencyInfo(*ClonedFunc, BPI, LI));

  // Extract the body of the if.
  OutlinedFunc = CodeExtractor(ToExtract, &DT, /*AggregateArgs*/ false,
                               ClonedFuncBFI.get(), &BPI,
                               /* AllowVarargs */ true)
                     .extractCodeRegion();

  if (OutlinedFunc) {
    OutliningCallBB = PartialInlinerImpl::getOneCallSiteTo(OutlinedFunc)
                          .getInstruction()
                          ->getParent();
    assert(OutliningCallBB->getParent() == ClonedFunc);
  }

  return OutlinedFunc;
}

PartialInlinerImpl::FunctionCloner::~FunctionCloner() {
  // Ditch the duplicate, since we're done with it, and rewrite all remaining
  // users (function pointers, etc.) back to the original function.
  ClonedFunc->replaceAllUsesWith(OrigFunc);
  ClonedFunc->eraseFromParent();
  if (!IsFunctionInlined) {
    // Remove the function that was speculatively created if there is no
    // reference.
    if (OutlinedFunc)
      OutlinedFunc->eraseFromParent();
  }
}

Function *PartialInlinerImpl::unswitchFunction(Function *F) {
  if (F->hasAddressTaken())
    return nullptr;

  // Let the inliner handle it.
  if (F->hasFnAttribute(Attribute::AlwaysInline))
    return nullptr;

  if (F->hasFnAttribute(Attribute::NoInline))
    return nullptr;

  if (PSI->isFunctionEntryCold(F))
    return nullptr;

  if (F->user_begin() == F->user_end())
    return nullptr;

  std::unique_ptr<FunctionOutliningInfo> OI = computeOutliningInfo(F);

  if (!OI)
    return nullptr;

  FunctionCloner Cloner(F, OI.get());
  Cloner.NormalizeReturnBlock();
  Function *OutlinedFunction = Cloner.doFunctionOutlining();

  bool AnyInline = tryPartialInline(Cloner);

  if (AnyInline)
    return OutlinedFunction;

  return nullptr;
}
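// A small example of the entry count bookkeeping done below (numbers are
// illustrative): if the callee's profiled entry count is 1000 and the
// partially inlined callsites account for 300 of those entries, the
// remaining 700 is written back as the callee's entry count, since those
// calls now go through the inlined copies instead.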
bool PartialInlinerImpl::tryPartialInline(FunctionCloner &Cloner) {
  int NonWeightedRcost;
  int SizeCost;

  if (Cloner.OutlinedFunc == nullptr)
    return false;

  std::tie(SizeCost, NonWeightedRcost) = computeOutliningCosts(Cloner);

  auto RelativeToEntryFreq = getOutliningCallBBRelativeFreq(Cloner);
  auto WeightedRcost = BlockFrequency(NonWeightedRcost) * RelativeToEntryFreq;

  // If the call sequence to the outlined function is larger than the
  // original outlined region itself, outlining does not increase the chances
  // of inlining the function (the inliner uses the size increase to
  // model the cost of inlining a callee).
  if (!SkipCostAnalysis && Cloner.OutlinedRegionCost < SizeCost) {
    OptimizationRemarkEmitter ORE(Cloner.OrigFunc);
    DebugLoc DLoc;
    BasicBlock *Block;
    std::tie(DLoc, Block) = getOneDebugLoc(Cloner.ClonedFunc);
    ORE.emit([&]() {
      return OptimizationRemarkAnalysis(DEBUG_TYPE, "OutlineRegionTooSmall",
                                        DLoc, Block)
             << ore::NV("Function", Cloner.OrigFunc)
             << " not partially inlined into callers (Original Size = "
             << ore::NV("OutlinedRegionOriginalSize", Cloner.OutlinedRegionCost)
             << ", Size of call sequence to outlined function = "
             << ore::NV("NewSize", SizeCost) << ")";
    });
    return false;
  }

  assert(Cloner.OrigFunc->user_begin() == Cloner.OrigFunc->user_end() &&
         "F's users should all be replaced!");

  std::vector<User *> Users(Cloner.ClonedFunc->user_begin(),
                            Cloner.ClonedFunc->user_end());

  DenseMap<User *, uint64_t> CallSiteToProfCountMap;
  if (Cloner.OrigFunc->getEntryCount())
    computeCallsiteToProfCountMap(Cloner.ClonedFunc, CallSiteToProfCountMap);

  auto CalleeEntryCount = Cloner.OrigFunc->getEntryCount();
  uint64_t CalleeEntryCountV = (CalleeEntryCount ? *CalleeEntryCount : 0);

  bool AnyInline = false;
  for (User *User : Users) {
    CallSite CS = getCallSite(User);

    if (IsLimitReached())
      continue;

    OptimizationRemarkEmitter ORE(CS.getCaller());

    if (!shouldPartialInline(CS, Cloner, WeightedRcost, ORE))
      continue;

    // Construct the remark before doing the inlining, as after successful
    // inlining the callsite is removed.
    OptimizationRemark OR(DEBUG_TYPE, "PartiallyInlined", CS.getInstruction());
    OR << ore::NV("Callee", Cloner.OrigFunc) << " partially inlined into "
       << ore::NV("Caller", CS.getCaller());

    InlineFunctionInfo IFI(nullptr, GetAssumptionCache, PSI);
    if (!InlineFunction(CS, IFI, nullptr, true, Cloner.OutlinedFunc))
      continue;

    ORE.emit(OR);

    // Now update the entry count:
    if (CalleeEntryCountV && CallSiteToProfCountMap.count(User)) {
      uint64_t CallSiteCount = CallSiteToProfCountMap[User];
      CalleeEntryCountV -= std::min(CalleeEntryCountV, CallSiteCount);
    }

    AnyInline = true;
    NumPartialInlining++;
    // Update the stats:
    NumPartialInlined++;
  }

  if (AnyInline) {
    Cloner.IsFunctionInlined = true;
    if (CalleeEntryCount)
      Cloner.OrigFunc->setEntryCount(CalleeEntryCountV);
  }

  return AnyInline;
}

bool PartialInlinerImpl::run(Module &M) {
  if (DisablePartialInlining)
    return false;

  std::vector<Function *> Worklist;
  Worklist.reserve(M.size());
  for (Function &F : M)
    if (!F.use_empty() && !F.isDeclaration())
      Worklist.push_back(&F);

  bool Changed = false;
  while (!Worklist.empty()) {
    Function *CurrFunc = Worklist.back();
    Worklist.pop_back();

    if (CurrFunc->use_empty())
      continue;

    bool Recursive = false;
    for (User *U : CurrFunc->users())
      if (Instruction *I = dyn_cast<Instruction>(U))
        if (I->getParent()->getParent() == CurrFunc) {
          Recursive = true;
          break;
        }
    if (Recursive)
      continue;

    if (Function *NewFunc = unswitchFunction(CurrFunc)) {
      Worklist.push_back(NewFunc);
      Changed = true;
    }
  }

  return Changed;
}

char PartialInlinerLegacyPass::ID = 0;

INITIALIZE_PASS_BEGIN(PartialInlinerLegacyPass, "partial-inliner",
                      "Partial Inliner", false, false)
INITIALIZE_PASS_DEPENDENCY(AssumptionCacheTracker)
INITIALIZE_PASS_DEPENDENCY(ProfileSummaryInfoWrapperPass)
INITIALIZE_PASS_DEPENDENCY(TargetTransformInfoWrapperPass)
INITIALIZE_PASS_END(PartialInlinerLegacyPass, "partial-inliner",
                    "Partial Inliner", false, false)

ModulePass *llvm::createPartialInliningPass() {
  return new PartialInlinerLegacyPass();
}
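// Note (not from the original source): with the new pass manager the pass is
// registered separately (in PassRegistry.def) and is typically invoked via
// the -passes pipeline syntax, e.g.
//   opt -passes=partial-inliner -S input.ll
// assuming the registered pass name matches the "partial-inliner" string
// used above.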
PreservedAnalyses PartialInlinerPass::run(Module &M,
                                          ModuleAnalysisManager &AM) {
  auto &FAM = AM.getResult<FunctionAnalysisManagerModuleProxy>(M).getManager();

  std::function<AssumptionCache &(Function &)> GetAssumptionCache =
      [&FAM](Function &F) -> AssumptionCache & {
    return FAM.getResult<AssumptionAnalysis>(F);
  };

  std::function<BlockFrequencyInfo &(Function &)> GetBFI =
      [&FAM](Function &F) -> BlockFrequencyInfo & {
    return FAM.getResult<BlockFrequencyAnalysis>(F);
  };

  std::function<TargetTransformInfo &(Function &)> GetTTI =
      [&FAM](Function &F) -> TargetTransformInfo & {
    return FAM.getResult<TargetIRAnalysis>(F);
  };

  ProfileSummaryInfo *PSI = &AM.getResult<ProfileSummaryAnalysis>(M);

  if (PartialInlinerImpl(&GetAssumptionCache, &GetTTI, {GetBFI}, PSI).run(M))
    return PreservedAnalyses::none();
  return PreservedAnalyses::all();
}