1 //===- HotColdSplitting.cpp -- Outline Cold Regions -------------*- C++ -*-===// 2 // 3 // The LLVM Compiler Infrastructure 4 // 5 // This file is distributed under the University of Illinois Open Source 6 // License. See LICENSE.TXT for details. 7 // 8 //===----------------------------------------------------------------------===// 9 // 10 // Outline cold regions to a separate function. 11 // TODO: Update BFI and BPI 12 // TODO: Add all the outlined functions to a separate section. 13 // 14 //===----------------------------------------------------------------------===// 15 16 #include "llvm/ADT/SmallVector.h" 17 #include "llvm/ADT/Statistic.h" 18 #include "llvm/Analysis/AliasAnalysis.h" 19 #include "llvm/Analysis/BlockFrequencyInfo.h" 20 #include "llvm/Analysis/BranchProbabilityInfo.h" 21 #include "llvm/Analysis/CFG.h" 22 #include "llvm/Analysis/OptimizationRemarkEmitter.h" 23 #include "llvm/Analysis/PostDominators.h" 24 #include "llvm/Analysis/ProfileSummaryInfo.h" 25 #include "llvm/Analysis/TargetTransformInfo.h" 26 #include "llvm/IR/BasicBlock.h" 27 #include "llvm/IR/CFG.h" 28 #include "llvm/IR/DataLayout.h" 29 #include "llvm/IR/DiagnosticInfo.h" 30 #include "llvm/IR/Dominators.h" 31 #include "llvm/IR/Function.h" 32 #include "llvm/IR/Instruction.h" 33 #include "llvm/IR/Instructions.h" 34 #include "llvm/IR/IntrinsicInst.h" 35 #include "llvm/IR/Metadata.h" 36 #include "llvm/IR/Module.h" 37 #include "llvm/IR/PassManager.h" 38 #include "llvm/IR/Type.h" 39 #include "llvm/IR/Use.h" 40 #include "llvm/IR/User.h" 41 #include "llvm/IR/Value.h" 42 #include "llvm/Pass.h" 43 #include "llvm/Support/BlockFrequency.h" 44 #include "llvm/Support/BranchProbability.h" 45 #include "llvm/Support/Debug.h" 46 #include "llvm/Support/raw_ostream.h" 47 #include "llvm/Transforms/IPO.h" 48 #include "llvm/Transforms/IPO/HotColdSplitting.h" 49 #include "llvm/Transforms/Scalar.h" 50 #include "llvm/Transforms/Utils/BasicBlockUtils.h" 51 #include "llvm/Transforms/Utils/Cloning.h" 52 #include "llvm/Transforms/Utils/CodeExtractor.h" 53 #include "llvm/Transforms/Utils/Local.h" 54 #include "llvm/Transforms/Utils/SSAUpdater.h" 55 #include "llvm/Transforms/Utils/ValueMapper.h" 56 #include <algorithm> 57 #include <cassert> 58 59 #define DEBUG_TYPE "hotcoldsplit" 60 61 STATISTIC(NumColdRegionsFound, "Number of cold regions found."); 62 STATISTIC(NumColdRegionsOutlined, "Number of cold regions outlined."); 63 64 using namespace llvm; 65 66 static cl::opt<bool> EnableStaticAnalyis("hot-cold-static-analysis", 67 cl::init(true), cl::Hidden); 68 69 static cl::opt<int> 70 MinOutliningThreshold("min-outlining-thresh", cl::init(3), cl::Hidden, 71 cl::desc("Code size threshold for outlining within a " 72 "single BB (as a multiple of TCC_Basic)")); 73 74 namespace { 75 76 struct PostDomTree : PostDomTreeBase<BasicBlock> { 77 PostDomTree(Function &F) { recalculate(F); } 78 }; 79 80 /// A sequence of basic blocks. 81 /// 82 /// A 0-sized SmallVector is slightly cheaper to move than a std::vector. 83 using BlockSequence = SmallVector<BasicBlock *, 0>; 84 85 // Same as blockEndsInUnreachable in CodeGen/BranchFolding.cpp. Do not modify 86 // this function unless you modify the MBB version as well. 87 // 88 /// A no successor, non-return block probably ends in unreachable and is cold. 89 /// Also consider a block that ends in an indirect branch to be a return block, 90 /// since many targets use plain indirect branches to return. 91 bool blockEndsInUnreachable(const BasicBlock &BB) { 92 if (!succ_empty(&BB)) 93 return false; 94 if (BB.empty()) 95 return true; 96 const Instruction *I = BB.getTerminator(); 97 return !(isa<ReturnInst>(I) || isa<IndirectBrInst>(I)); 98 } 99 100 static bool exceptionHandlingFunctions(const CallInst *CI) { 101 auto F = CI->getCalledFunction(); 102 if (!F) 103 return false; 104 auto FName = F->getName(); 105 return FName == "__cxa_begin_catch" || 106 FName == "__cxa_free_exception" || 107 FName == "__cxa_allocate_exception" || 108 FName == "__cxa_begin_catch" || 109 FName == "__cxa_end_catch"; 110 } 111 112 static bool unlikelyExecuted(const BasicBlock &BB) { 113 if (blockEndsInUnreachable(BB)) 114 return true; 115 // Exception handling blocks are unlikely executed. 116 if (BB.isEHPad()) 117 return true; 118 for (const Instruction &I : BB) 119 if (const CallInst *CI = dyn_cast<CallInst>(&I)) { 120 // The block is cold if it calls functions tagged as cold or noreturn. 121 if (CI->hasFnAttr(Attribute::Cold) || 122 CI->hasFnAttr(Attribute::NoReturn) || 123 exceptionHandlingFunctions(CI)) 124 return true; 125 126 // Assume that inline assembly is hot code. 127 if (isa<InlineAsm>(CI->getCalledValue())) 128 return false; 129 } 130 return false; 131 } 132 133 /// Check whether it's safe to outline \p BB. 134 static bool mayExtractBlock(const BasicBlock &BB) { 135 return !BB.hasAddressTaken(); 136 } 137 138 /// Check whether \p BB is profitable to outline (i.e. its code size cost meets 139 /// the threshold set in \p MinOutliningThreshold). 140 static bool isProfitableToOutline(const BasicBlock &BB, 141 TargetTransformInfo &TTI) { 142 int Cost = 0; 143 for (const Instruction &I : BB) { 144 if (isa<DbgInfoIntrinsic>(&I) || &I == BB.getTerminator()) 145 continue; 146 147 Cost += TTI.getInstructionCost(&I, TargetTransformInfo::TCK_CodeSize); 148 149 if (Cost >= (MinOutliningThreshold * TargetTransformInfo::TCC_Basic)) 150 return true; 151 } 152 return false; 153 } 154 155 /// Identify the maximal region of cold blocks which includes \p SinkBB. 156 /// 157 /// Include all blocks post-dominated by \p SinkBB, \p SinkBB itself, and all 158 /// blocks dominated by \p SinkBB. Exclude all other blocks, and blocks which 159 /// cannot be outlined. 160 /// 161 /// Return an empty sequence if the cold region is too small to outline, or if 162 /// the cold region has no warm predecessors. 163 static BlockSequence findMaximalColdRegion(BasicBlock &SinkBB, 164 TargetTransformInfo &TTI, 165 DominatorTree &DT, 166 PostDomTree &PDT) { 167 // The maximal cold region. 168 BlockSequence ColdRegion = {}; 169 170 // The ancestor farthest-away from SinkBB, and also post-dominated by it. 171 BasicBlock *MaxAncestor = &SinkBB; 172 unsigned MaxAncestorHeight = 0; 173 174 // Visit SinkBB's ancestors using inverse DFS. 175 auto PredIt = ++idf_begin(&SinkBB); 176 auto PredEnd = idf_end(&SinkBB); 177 while (PredIt != PredEnd) { 178 BasicBlock &PredBB = **PredIt; 179 bool SinkPostDom = PDT.dominates(&SinkBB, &PredBB); 180 181 // If SinkBB does not post-dominate a predecessor, do not mark the 182 // predecessor (or any of its predecessors) cold. 183 if (!SinkPostDom || !mayExtractBlock(PredBB)) { 184 PredIt.skipChildren(); 185 continue; 186 } 187 188 // Keep track of the post-dominated ancestor farthest away from the sink. 189 unsigned AncestorHeight = PredIt.getPathLength(); 190 if (AncestorHeight > MaxAncestorHeight) { 191 MaxAncestor = &PredBB; 192 MaxAncestorHeight = AncestorHeight; 193 } 194 195 ColdRegion.push_back(&PredBB); 196 ++PredIt; 197 } 198 199 // CodeExtractor requires that all blocks to be extracted must be dominated 200 // by the first block to be extracted. 201 // 202 // To avoid spurious or repeated outlining, require that the max ancestor 203 // has a predecessor. By construction this predecessor is not in the cold 204 // region, i.e. its existence implies we don't outline the whole function. 205 // 206 // TODO: If MaxAncestor has no predecessors, we may be able to outline the 207 // second largest cold region that has a predecessor. 208 if (pred_empty(MaxAncestor) || 209 MaxAncestor->getSinglePredecessor() == MaxAncestor) 210 return {}; 211 212 // Filter out predecessors not dominated by the max ancestor. 213 // 214 // TODO: Blocks not dominated by the max ancestor could be extracted as 215 // other cold regions. Marking outlined calls as noreturn when appropriate 216 // and outlining more than once per function could achieve most of the win. 217 auto EraseIt = remove_if(ColdRegion, [&](BasicBlock *PredBB) { 218 return PredBB != MaxAncestor && !DT.dominates(MaxAncestor, PredBB); 219 }); 220 ColdRegion.erase(EraseIt, ColdRegion.end()); 221 222 // Add SinkBB to the cold region. 223 ColdRegion.push_back(&SinkBB); 224 225 // Ensure that the first extracted block is the max ancestor. 226 if (ColdRegion[0] != MaxAncestor) { 227 auto AncestorIt = find(ColdRegion, MaxAncestor); 228 *AncestorIt = ColdRegion[0]; 229 ColdRegion[0] = MaxAncestor; 230 } 231 232 // Find all successors of SinkBB dominated by SinkBB using DFS. 233 auto SuccIt = ++df_begin(&SinkBB); 234 auto SuccEnd = df_end(&SinkBB); 235 while (SuccIt != SuccEnd) { 236 BasicBlock &SuccBB = **SuccIt; 237 bool SinkDom = DT.dominates(&SinkBB, &SuccBB); 238 239 // If SinkBB does not dominate a successor, do not mark the successor (or 240 // any of its successors) cold. 241 if (!SinkDom || !mayExtractBlock(SuccBB)) { 242 SuccIt.skipChildren(); 243 continue; 244 } 245 246 ColdRegion.push_back(&SuccBB); 247 ++SuccIt; 248 } 249 250 if (ColdRegion.size() == 1 && !isProfitableToOutline(*ColdRegion[0], TTI)) 251 return {}; 252 253 return ColdRegion; 254 } 255 256 /// Get the largest cold region in \p F. 257 static BlockSequence getLargestColdRegion(Function &F, ProfileSummaryInfo &PSI, 258 BlockFrequencyInfo *BFI, 259 TargetTransformInfo &TTI, 260 DominatorTree &DT, PostDomTree &PDT) { 261 // Keep track of the largest cold region. 262 BlockSequence LargestColdRegion = {}; 263 264 for (BasicBlock &BB : F) { 265 // Identify cold blocks. 266 if (!mayExtractBlock(BB)) 267 continue; 268 bool Cold = 269 PSI.isColdBlock(&BB, BFI) || (EnableStaticAnalyis && unlikelyExecuted(BB)); 270 if (!Cold) 271 continue; 272 273 LLVM_DEBUG({ 274 dbgs() << "Found cold block:\n"; 275 BB.dump(); 276 }); 277 278 // Find a maximal cold region we can outline. 279 BlockSequence ColdRegion = findMaximalColdRegion(BB, TTI, DT, PDT); 280 if (ColdRegion.empty()) { 281 LLVM_DEBUG(dbgs() << " Skipping (block not profitable to extract)\n"); 282 continue; 283 } 284 285 ++NumColdRegionsFound; 286 287 LLVM_DEBUG({ 288 llvm::dbgs() << "Identified cold region with " << ColdRegion.size() 289 << " blocks:\n"; 290 for (BasicBlock *BB : ColdRegion) 291 BB->dump(); 292 }); 293 294 // TODO: Outline more than one region. 295 if (ColdRegion.size() > LargestColdRegion.size()) 296 LargestColdRegion = std::move(ColdRegion); 297 } 298 299 return LargestColdRegion; 300 } 301 302 class HotColdSplitting { 303 public: 304 HotColdSplitting(ProfileSummaryInfo *ProfSI, 305 function_ref<BlockFrequencyInfo *(Function &)> GBFI, 306 function_ref<TargetTransformInfo &(Function &)> GTTI, 307 std::function<OptimizationRemarkEmitter &(Function &)> *GORE) 308 : PSI(ProfSI), GetBFI(GBFI), GetTTI(GTTI), GetORE(GORE) {} 309 bool run(Module &M); 310 311 private: 312 bool shouldOutlineFrom(const Function &F) const; 313 Function *extractColdRegion(const BlockSequence &Region, DominatorTree &DT, 314 BlockFrequencyInfo *BFI, TargetTransformInfo &TTI, 315 OptimizationRemarkEmitter &ORE, unsigned Count); 316 SmallPtrSet<const Function *, 2> OutlinedFunctions; 317 ProfileSummaryInfo *PSI; 318 function_ref<BlockFrequencyInfo *(Function &)> GetBFI; 319 function_ref<TargetTransformInfo &(Function &)> GetTTI; 320 std::function<OptimizationRemarkEmitter &(Function &)> *GetORE; 321 }; 322 323 class HotColdSplittingLegacyPass : public ModulePass { 324 public: 325 static char ID; 326 HotColdSplittingLegacyPass() : ModulePass(ID) { 327 initializeHotColdSplittingLegacyPassPass(*PassRegistry::getPassRegistry()); 328 } 329 330 void getAnalysisUsage(AnalysisUsage &AU) const override { 331 AU.addRequired<AssumptionCacheTracker>(); 332 AU.addRequired<BlockFrequencyInfoWrapperPass>(); 333 AU.addRequired<ProfileSummaryInfoWrapperPass>(); 334 AU.addRequired<TargetTransformInfoWrapperPass>(); 335 } 336 337 bool runOnModule(Module &M) override; 338 }; 339 340 } // end anonymous namespace 341 342 // Returns false if the function should not be considered for hot-cold split 343 // optimization. 344 bool HotColdSplitting::shouldOutlineFrom(const Function &F) const { 345 // Do not try to outline again from an already outlined cold function. 346 if (OutlinedFunctions.count(&F)) 347 return false; 348 349 if (F.size() <= 2) 350 return false; 351 352 // TODO: Consider only skipping functions marked `optnone` or `cold`. 353 354 if (F.hasAddressTaken()) 355 return false; 356 357 if (F.hasFnAttribute(Attribute::AlwaysInline)) 358 return false; 359 360 if (F.hasFnAttribute(Attribute::NoInline)) 361 return false; 362 363 if (F.getCallingConv() == CallingConv::Cold) 364 return false; 365 366 if (PSI->isFunctionEntryCold(&F)) 367 return false; 368 return true; 369 } 370 371 Function *HotColdSplitting::extractColdRegion(const BlockSequence &Region, 372 DominatorTree &DT, 373 BlockFrequencyInfo *BFI, 374 TargetTransformInfo &TTI, 375 OptimizationRemarkEmitter &ORE, 376 unsigned Count) { 377 assert(!Region.empty()); 378 LLVM_DEBUG(for (auto *BB : Region) 379 llvm::dbgs() << "\nExtracting: " << *BB;); 380 381 // TODO: Pass BFI and BPI to update profile information. 382 CodeExtractor CE(Region, &DT, /* AggregateArgs */ false, /* BFI */ nullptr, 383 /* BPI */ nullptr, /* AllowVarArgs */ false, 384 /* AllowAlloca */ false, 385 /* Suffix */ "cold." + std::to_string(Count)); 386 387 SetVector<Value *> Inputs, Outputs, Sinks; 388 CE.findInputsOutputs(Inputs, Outputs, Sinks); 389 390 // Do not extract regions that have live exit variables. 391 if (Outputs.size() > 0) { 392 LLVM_DEBUG(llvm::dbgs() << "Not outlining; live outputs\n"); 393 return nullptr; 394 } 395 396 // TODO: Run MergeBasicBlockIntoOnlyPred on the outlined function. 397 Function *OrigF = Region[0]->getParent(); 398 if (Function *OutF = CE.extractCodeRegion()) { 399 User *U = *OutF->user_begin(); 400 CallInst *CI = cast<CallInst>(U); 401 CallSite CS(CI); 402 NumColdRegionsOutlined++; 403 if (TTI.useColdCCForColdCall(*OutF)) { 404 OutF->setCallingConv(CallingConv::Cold); 405 CS.setCallingConv(CallingConv::Cold); 406 } 407 CI->setIsNoInline(); 408 409 // Try to make the outlined code as small as possible on the assumption 410 // that it's cold. 411 assert(!OutF->hasFnAttribute(Attribute::OptimizeNone) && 412 "An outlined function should never be marked optnone"); 413 OutF->addFnAttr(Attribute::MinSize); 414 415 LLVM_DEBUG(llvm::dbgs() << "Outlined Region: " << *OutF); 416 ORE.emit([&]() { 417 return OptimizationRemark(DEBUG_TYPE, "HotColdSplit", 418 &*Region[0]->begin()) 419 << ore::NV("Original", OrigF) << " split cold code into " 420 << ore::NV("Split", OutF); 421 }); 422 return OutF; 423 } 424 425 ORE.emit([&]() { 426 return OptimizationRemarkMissed(DEBUG_TYPE, "ExtractFailed", 427 &*Region[0]->begin()) 428 << "Failed to extract region at block " 429 << ore::NV("Block", Region.front()); 430 }); 431 return nullptr; 432 } 433 434 bool HotColdSplitting::run(Module &M) { 435 bool Changed = false; 436 for (auto &F : M) { 437 if (!shouldOutlineFrom(F)) { 438 LLVM_DEBUG(llvm::dbgs() << "Not outlining in " << F.getName() << "\n"); 439 continue; 440 } 441 442 LLVM_DEBUG(llvm::dbgs() << "Outlining in " << F.getName() << "\n"); 443 DominatorTree DT(F); 444 PostDomTree PDT(F); 445 PDT.recalculate(F); 446 BlockFrequencyInfo *BFI = GetBFI(F); 447 TargetTransformInfo &TTI = GetTTI(F); 448 449 BlockSequence ColdRegion = getLargestColdRegion(F, *PSI, BFI, TTI, DT, PDT); 450 if (ColdRegion.empty()) 451 continue; 452 453 OptimizationRemarkEmitter &ORE = (*GetORE)(F); 454 Function *Outlined = 455 extractColdRegion(ColdRegion, DT, BFI, TTI, ORE, /*Count=*/1); 456 if (Outlined) { 457 OutlinedFunctions.insert(Outlined); 458 Changed = true; 459 } 460 } 461 return Changed; 462 } 463 464 bool HotColdSplittingLegacyPass::runOnModule(Module &M) { 465 if (skipModule(M)) 466 return false; 467 ProfileSummaryInfo *PSI = 468 &getAnalysis<ProfileSummaryInfoWrapperPass>().getPSI(); 469 auto GTTI = [this](Function &F) -> TargetTransformInfo & { 470 return this->getAnalysis<TargetTransformInfoWrapperPass>().getTTI(F); 471 }; 472 auto GBFI = [this](Function &F) { 473 return &this->getAnalysis<BlockFrequencyInfoWrapperPass>(F).getBFI(); 474 }; 475 std::unique_ptr<OptimizationRemarkEmitter> ORE; 476 std::function<OptimizationRemarkEmitter &(Function &)> GetORE = 477 [&ORE](Function &F) -> OptimizationRemarkEmitter & { 478 ORE.reset(new OptimizationRemarkEmitter(&F)); 479 return *ORE.get(); 480 }; 481 482 return HotColdSplitting(PSI, GBFI, GTTI, &GetORE).run(M); 483 } 484 485 PreservedAnalyses 486 HotColdSplittingPass::run(Module &M, ModuleAnalysisManager &AM) { 487 auto &FAM = AM.getResult<FunctionAnalysisManagerModuleProxy>(M).getManager(); 488 489 std::function<AssumptionCache &(Function &)> GetAssumptionCache = 490 [&FAM](Function &F) -> AssumptionCache & { 491 return FAM.getResult<AssumptionAnalysis>(F); 492 }; 493 494 auto GBFI = [&FAM](Function &F) { 495 return &FAM.getResult<BlockFrequencyAnalysis>(F); 496 }; 497 498 std::function<TargetTransformInfo &(Function &)> GTTI = 499 [&FAM](Function &F) -> TargetTransformInfo & { 500 return FAM.getResult<TargetIRAnalysis>(F); 501 }; 502 503 std::unique_ptr<OptimizationRemarkEmitter> ORE; 504 std::function<OptimizationRemarkEmitter &(Function &)> GetORE = 505 [&ORE](Function &F) -> OptimizationRemarkEmitter & { 506 ORE.reset(new OptimizationRemarkEmitter(&F)); 507 return *ORE.get(); 508 }; 509 510 ProfileSummaryInfo *PSI = &AM.getResult<ProfileSummaryAnalysis>(M); 511 512 if (HotColdSplitting(PSI, GBFI, GTTI, &GetORE).run(M)) 513 return PreservedAnalyses::none(); 514 return PreservedAnalyses::all(); 515 } 516 517 char HotColdSplittingLegacyPass::ID = 0; 518 INITIALIZE_PASS_BEGIN(HotColdSplittingLegacyPass, "hotcoldsplit", 519 "Hot Cold Splitting", false, false) 520 INITIALIZE_PASS_DEPENDENCY(ProfileSummaryInfoWrapperPass) 521 INITIALIZE_PASS_DEPENDENCY(BlockFrequencyInfoWrapperPass) 522 INITIALIZE_PASS_END(HotColdSplittingLegacyPass, "hotcoldsplit", 523 "Hot Cold Splitting", false, false) 524 525 ModulePass *llvm::createHotColdSplittingPass() { 526 return new HotColdSplittingLegacyPass(); 527 } 528