1 //===- HotColdSplitting.cpp -- Outline Cold Regions -------------*- C++ -*-===// 2 // 3 // The LLVM Compiler Infrastructure 4 // 5 // This file is distributed under the University of Illinois Open Source 6 // License. See LICENSE.TXT for details. 7 // 8 //===----------------------------------------------------------------------===// 9 // 10 // Outline cold regions to a separate function. 11 // TODO: Update BFI and BPI 12 // TODO: Add all the outlined functions to a separate section. 13 // 14 //===----------------------------------------------------------------------===// 15 16 #include "llvm/ADT/SmallVector.h" 17 #include "llvm/ADT/Statistic.h" 18 #include "llvm/Analysis/AliasAnalysis.h" 19 #include "llvm/Analysis/BlockFrequencyInfo.h" 20 #include "llvm/Analysis/BranchProbabilityInfo.h" 21 #include "llvm/Analysis/CFG.h" 22 #include "llvm/Analysis/OptimizationRemarkEmitter.h" 23 #include "llvm/Analysis/PostDominators.h" 24 #include "llvm/Analysis/ProfileSummaryInfo.h" 25 #include "llvm/Analysis/TargetTransformInfo.h" 26 #include "llvm/IR/BasicBlock.h" 27 #include "llvm/IR/CFG.h" 28 #include "llvm/IR/DataLayout.h" 29 #include "llvm/IR/DiagnosticInfo.h" 30 #include "llvm/IR/Dominators.h" 31 #include "llvm/IR/Function.h" 32 #include "llvm/IR/Instruction.h" 33 #include "llvm/IR/Instructions.h" 34 #include "llvm/IR/IntrinsicInst.h" 35 #include "llvm/IR/Metadata.h" 36 #include "llvm/IR/Module.h" 37 #include "llvm/IR/PassManager.h" 38 #include "llvm/IR/Type.h" 39 #include "llvm/IR/Use.h" 40 #include "llvm/IR/User.h" 41 #include "llvm/IR/Value.h" 42 #include "llvm/Pass.h" 43 #include "llvm/Support/BlockFrequency.h" 44 #include "llvm/Support/BranchProbability.h" 45 #include "llvm/Support/Debug.h" 46 #include "llvm/Support/raw_ostream.h" 47 #include "llvm/Transforms/IPO.h" 48 #include "llvm/Transforms/IPO/HotColdSplitting.h" 49 #include "llvm/Transforms/Scalar.h" 50 #include "llvm/Transforms/Utils/BasicBlockUtils.h" 51 #include "llvm/Transforms/Utils/Cloning.h" 52 #include "llvm/Transforms/Utils/CodeExtractor.h" 53 #include "llvm/Transforms/Utils/Local.h" 54 #include "llvm/Transforms/Utils/SSAUpdater.h" 55 #include "llvm/Transforms/Utils/ValueMapper.h" 56 #include <algorithm> 57 #include <cassert> 58 59 #define DEBUG_TYPE "hotcoldsplit" 60 61 STATISTIC(NumColdRegionsFound, "Number of cold regions found."); 62 STATISTIC(NumColdRegionsOutlined, "Number of cold regions outlined."); 63 64 using namespace llvm; 65 66 static cl::opt<bool> EnableStaticAnalyis("hot-cold-static-analysis", 67 cl::init(true), cl::Hidden); 68 69 static cl::opt<unsigned> MinOutliningInstCount( 70 "min-outlining-inst-count", cl::init(3), cl::Hidden, 71 cl::desc("Minimum number of instructions needed for a single-block region " 72 "to be an outlining candidate")); 73 74 namespace { 75 76 struct PostDomTree : PostDomTreeBase<BasicBlock> { 77 PostDomTree(Function &F) { recalculate(F); } 78 }; 79 80 /// A sequence of basic blocks. 81 /// 82 /// A 0-sized SmallVector is slightly cheaper to move than a std::vector. 83 using BlockSequence = SmallVector<BasicBlock *, 0>; 84 85 // Same as blockEndsInUnreachable in CodeGen/BranchFolding.cpp. Do not modify 86 // this function unless you modify the MBB version as well. 87 // 88 /// A no successor, non-return block probably ends in unreachable and is cold. 89 /// Also consider a block that ends in an indirect branch to be a return block, 90 /// since many targets use plain indirect branches to return. 91 bool blockEndsInUnreachable(const BasicBlock &BB) { 92 if (!succ_empty(&BB)) 93 return false; 94 if (BB.empty()) 95 return true; 96 const Instruction *I = BB.getTerminator(); 97 return !(isa<ReturnInst>(I) || isa<IndirectBrInst>(I)); 98 } 99 100 static bool exceptionHandlingFunctions(const CallInst *CI) { 101 auto F = CI->getCalledFunction(); 102 if (!F) 103 return false; 104 auto FName = F->getName(); 105 return FName == "__cxa_begin_catch" || 106 FName == "__cxa_free_exception" || 107 FName == "__cxa_allocate_exception" || 108 FName == "__cxa_begin_catch" || 109 FName == "__cxa_end_catch"; 110 } 111 112 static bool unlikelyExecuted(const BasicBlock &BB) { 113 if (blockEndsInUnreachable(BB)) 114 return true; 115 // Exception handling blocks are unlikely executed. 116 if (BB.isEHPad()) 117 return true; 118 for (const Instruction &I : BB) 119 if (const CallInst *CI = dyn_cast<CallInst>(&I)) { 120 // The block is cold if it calls functions tagged as cold or noreturn. 121 if (CI->hasFnAttr(Attribute::Cold) || 122 CI->hasFnAttr(Attribute::NoReturn) || 123 exceptionHandlingFunctions(CI)) 124 return true; 125 126 // Assume that inline assembly is hot code. 127 if (isa<InlineAsm>(CI->getCalledValue())) 128 return false; 129 } 130 return false; 131 } 132 133 /// Check whether it's safe to outline \p BB. 134 static bool mayExtractBlock(const BasicBlock &BB) { 135 return !BB.hasAddressTaken(); 136 } 137 138 /// Check whether \p BB has at least \p Min non-debug, non-terminator 139 /// instructions. 140 static bool hasMinimumInstCount(const BasicBlock &BB, unsigned Min) { 141 unsigned Count = 0; 142 for (const Instruction &I : BB) { 143 if (isa<DbgInfoIntrinsic>(&I) || &I == BB.getTerminator()) 144 continue; 145 if (++Count >= Min) 146 return true; 147 } 148 return false; 149 } 150 151 /// Identify the maximal region of cold blocks which includes \p SinkBB. 152 /// 153 /// Include all blocks post-dominated by \p SinkBB, \p SinkBB itself, and all 154 /// blocks dominated by \p SinkBB. Exclude all other blocks, and blocks which 155 /// cannot be outlined. 156 /// 157 /// Return an empty sequence if the cold region is too small to outline, or if 158 /// the cold region has no warm predecessors. 159 static BlockSequence 160 findMaximalColdRegion(BasicBlock &SinkBB, DominatorTree &DT, PostDomTree &PDT) { 161 // The maximal cold region. 162 BlockSequence ColdRegion = {}; 163 164 // The ancestor farthest-away from SinkBB, and also post-dominated by it. 165 BasicBlock *MaxAncestor = &SinkBB; 166 unsigned MaxAncestorHeight = 0; 167 168 // Visit SinkBB's ancestors using inverse DFS. 169 auto PredIt = ++idf_begin(&SinkBB); 170 auto PredEnd = idf_end(&SinkBB); 171 while (PredIt != PredEnd) { 172 BasicBlock &PredBB = **PredIt; 173 bool SinkPostDom = PDT.dominates(&SinkBB, &PredBB); 174 175 // If SinkBB does not post-dominate a predecessor, do not mark the 176 // predecessor (or any of its predecessors) cold. 177 if (!SinkPostDom || !mayExtractBlock(PredBB)) { 178 PredIt.skipChildren(); 179 continue; 180 } 181 182 // Keep track of the post-dominated ancestor farthest away from the sink. 183 unsigned AncestorHeight = PredIt.getPathLength(); 184 if (AncestorHeight > MaxAncestorHeight) { 185 MaxAncestor = &PredBB; 186 MaxAncestorHeight = AncestorHeight; 187 } 188 189 ColdRegion.push_back(&PredBB); 190 ++PredIt; 191 } 192 193 // CodeExtractor requires that all blocks to be extracted must be dominated 194 // by the first block to be extracted. 195 // 196 // To avoid spurious or repeated outlining, require that the max ancestor 197 // has a predecessor. By construction this predecessor is not in the cold 198 // region, i.e. its existence implies we don't outline the whole function. 199 // 200 // TODO: If MaxAncestor has no predecessors, we may be able to outline the 201 // second largest cold region that has a predecessor. 202 if (pred_empty(MaxAncestor) || 203 MaxAncestor->getSinglePredecessor() == MaxAncestor) 204 return {}; 205 206 // Filter out predecessors not dominated by the max ancestor. 207 // 208 // TODO: Blocks not dominated by the max ancestor could be extracted as 209 // other cold regions. Marking outlined calls as noreturn when appropriate 210 // and outlining more than once per function could achieve most of the win. 211 auto EraseIt = remove_if(ColdRegion, [&](BasicBlock *PredBB) { 212 return PredBB != MaxAncestor && !DT.dominates(MaxAncestor, PredBB); 213 }); 214 ColdRegion.erase(EraseIt, ColdRegion.end()); 215 216 // Add SinkBB to the cold region. 217 ColdRegion.push_back(&SinkBB); 218 219 // Ensure that the first extracted block is the max ancestor. 220 if (ColdRegion[0] != MaxAncestor) { 221 auto AncestorIt = find(ColdRegion, MaxAncestor); 222 *AncestorIt = ColdRegion[0]; 223 ColdRegion[0] = MaxAncestor; 224 } 225 226 // Find all successors of SinkBB dominated by SinkBB using DFS. 227 auto SuccIt = ++df_begin(&SinkBB); 228 auto SuccEnd = df_end(&SinkBB); 229 while (SuccIt != SuccEnd) { 230 BasicBlock &SuccBB = **SuccIt; 231 bool SinkDom = DT.dominates(&SinkBB, &SuccBB); 232 233 // If SinkBB does not dominate a successor, do not mark the successor (or 234 // any of its successors) cold. 235 if (!SinkDom || !mayExtractBlock(SuccBB)) { 236 SuccIt.skipChildren(); 237 continue; 238 } 239 240 ColdRegion.push_back(&SuccBB); 241 ++SuccIt; 242 } 243 244 if (ColdRegion.size() == 1 && 245 !hasMinimumInstCount(*ColdRegion[0], MinOutliningInstCount)) 246 return {}; 247 248 return ColdRegion; 249 } 250 251 /// Get the largest cold region in \p F. 252 static BlockSequence getLargestColdRegion(Function &F, ProfileSummaryInfo &PSI, 253 BlockFrequencyInfo *BFI, 254 DominatorTree &DT, PostDomTree &PDT) { 255 // Keep track of the largest cold region. 256 BlockSequence LargestColdRegion = {}; 257 258 for (BasicBlock &BB : F) { 259 // Identify cold blocks. 260 if (!mayExtractBlock(BB)) 261 continue; 262 bool Cold = 263 PSI.isColdBB(&BB, BFI) || (EnableStaticAnalyis && unlikelyExecuted(BB)); 264 if (!Cold) 265 continue; 266 267 LLVM_DEBUG({ 268 dbgs() << "Found cold block:\n"; 269 BB.dump(); 270 }); 271 272 // Find a maximal cold region we can outline. 273 BlockSequence ColdRegion = findMaximalColdRegion(BB, DT, PDT); 274 if (ColdRegion.empty()) { 275 LLVM_DEBUG(dbgs() << " Skipping (block not profitable to extract)\n"); 276 continue; 277 } 278 279 ++NumColdRegionsFound; 280 281 LLVM_DEBUG({ 282 llvm::dbgs() << "Identified cold region with " << ColdRegion.size() 283 << " blocks:\n"; 284 for (BasicBlock *BB : ColdRegion) 285 BB->dump(); 286 }); 287 288 // TODO: Outline more than one region. 289 if (ColdRegion.size() > LargestColdRegion.size()) 290 LargestColdRegion = std::move(ColdRegion); 291 } 292 293 return LargestColdRegion; 294 } 295 296 class HotColdSplitting { 297 public: 298 HotColdSplitting(ProfileSummaryInfo *ProfSI, 299 function_ref<BlockFrequencyInfo *(Function &)> GBFI, 300 function_ref<TargetTransformInfo &(Function &)> GTTI, 301 std::function<OptimizationRemarkEmitter &(Function &)> *GORE) 302 : PSI(ProfSI), GetBFI(GBFI), GetTTI(GTTI), GetORE(GORE) {} 303 bool run(Module &M); 304 305 private: 306 bool shouldOutlineFrom(const Function &F) const; 307 Function *extractColdRegion(const BlockSequence &Region, DominatorTree &DT, 308 BlockFrequencyInfo *BFI, 309 OptimizationRemarkEmitter &ORE, unsigned Count); 310 SmallPtrSet<const Function *, 2> OutlinedFunctions; 311 ProfileSummaryInfo *PSI; 312 function_ref<BlockFrequencyInfo *(Function &)> GetBFI; 313 function_ref<TargetTransformInfo &(Function &)> GetTTI; 314 std::function<OptimizationRemarkEmitter &(Function &)> *GetORE; 315 }; 316 317 class HotColdSplittingLegacyPass : public ModulePass { 318 public: 319 static char ID; 320 HotColdSplittingLegacyPass() : ModulePass(ID) { 321 initializeHotColdSplittingLegacyPassPass(*PassRegistry::getPassRegistry()); 322 } 323 324 void getAnalysisUsage(AnalysisUsage &AU) const override { 325 AU.addRequired<AssumptionCacheTracker>(); 326 AU.addRequired<BlockFrequencyInfoWrapperPass>(); 327 AU.addRequired<ProfileSummaryInfoWrapperPass>(); 328 AU.addRequired<TargetTransformInfoWrapperPass>(); 329 } 330 331 bool runOnModule(Module &M) override; 332 }; 333 334 } // end anonymous namespace 335 336 // Returns false if the function should not be considered for hot-cold split 337 // optimization. 338 bool HotColdSplitting::shouldOutlineFrom(const Function &F) const { 339 // Do not try to outline again from an already outlined cold function. 340 if (OutlinedFunctions.count(&F)) 341 return false; 342 343 if (F.size() <= 2) 344 return false; 345 346 // TODO: Consider only skipping functions marked `optnone` or `cold`. 347 348 if (F.hasAddressTaken()) 349 return false; 350 351 if (F.hasFnAttribute(Attribute::AlwaysInline)) 352 return false; 353 354 if (F.hasFnAttribute(Attribute::NoInline)) 355 return false; 356 357 if (F.getCallingConv() == CallingConv::Cold) 358 return false; 359 360 if (PSI->isFunctionEntryCold(&F)) 361 return false; 362 return true; 363 } 364 365 Function *HotColdSplitting::extractColdRegion(const BlockSequence &Region, 366 DominatorTree &DT, 367 BlockFrequencyInfo *BFI, 368 OptimizationRemarkEmitter &ORE, 369 unsigned Count) { 370 assert(!Region.empty()); 371 LLVM_DEBUG(for (auto *BB : Region) 372 llvm::dbgs() << "\nExtracting: " << *BB;); 373 374 // TODO: Pass BFI and BPI to update profile information. 375 CodeExtractor CE(Region, &DT, /* AggregateArgs */ false, /* BFI */ nullptr, 376 /* BPI */ nullptr, /* AllowVarArgs */ false, 377 /* AllowAlloca */ false, 378 /* Suffix */ "cold." + std::to_string(Count)); 379 380 SetVector<Value *> Inputs, Outputs, Sinks; 381 CE.findInputsOutputs(Inputs, Outputs, Sinks); 382 383 // Do not extract regions that have live exit variables. 384 if (Outputs.size() > 0) { 385 LLVM_DEBUG(llvm::dbgs() << "Not outlining; live outputs\n"); 386 return nullptr; 387 } 388 389 // TODO: Run MergeBasicBlockIntoOnlyPred on the outlined function. 390 Function *OrigF = Region[0]->getParent(); 391 if (Function *OutF = CE.extractCodeRegion()) { 392 User *U = *OutF->user_begin(); 393 CallInst *CI = cast<CallInst>(U); 394 CallSite CS(CI); 395 NumColdRegionsOutlined++; 396 if (GetTTI(*OutF).useColdCCForColdCall(*OutF)) { 397 OutF->setCallingConv(CallingConv::Cold); 398 CS.setCallingConv(CallingConv::Cold); 399 } 400 CI->setIsNoInline(); 401 402 // Try to make the outlined code as small as possible on the assumption 403 // that it's cold. 404 assert(!OutF->hasFnAttribute(Attribute::OptimizeNone) && 405 "An outlined function should never be marked optnone"); 406 OutF->addFnAttr(Attribute::MinSize); 407 408 LLVM_DEBUG(llvm::dbgs() << "Outlined Region: " << *OutF); 409 ORE.emit([&]() { 410 return OptimizationRemark(DEBUG_TYPE, "HotColdSplit", 411 &*Region[0]->begin()) 412 << ore::NV("Original", OrigF) << " split cold code into " 413 << ore::NV("Split", OutF); 414 }); 415 return OutF; 416 } 417 418 ORE.emit([&]() { 419 return OptimizationRemarkMissed(DEBUG_TYPE, "ExtractFailed", 420 &*Region[0]->begin()) 421 << "Failed to extract region at block " 422 << ore::NV("Block", Region.front()); 423 }); 424 return nullptr; 425 } 426 427 bool HotColdSplitting::run(Module &M) { 428 bool Changed = false; 429 for (auto &F : M) { 430 if (!shouldOutlineFrom(F)) { 431 LLVM_DEBUG(llvm::dbgs() << "Not outlining in " << F.getName() << "\n"); 432 continue; 433 } 434 435 LLVM_DEBUG(llvm::dbgs() << "Outlining in " << F.getName() << "\n"); 436 DominatorTree DT(F); 437 PostDomTree PDT(F); 438 PDT.recalculate(F); 439 BlockFrequencyInfo *BFI = GetBFI(F); 440 441 BlockSequence ColdRegion = getLargestColdRegion(F, *PSI, BFI, DT, PDT); 442 if (ColdRegion.empty()) 443 continue; 444 445 OptimizationRemarkEmitter &ORE = (*GetORE)(F); 446 Function *Outlined = 447 extractColdRegion(ColdRegion, DT, BFI, ORE, /*Count=*/1); 448 if (Outlined) { 449 OutlinedFunctions.insert(Outlined); 450 Changed = true; 451 } 452 } 453 return Changed; 454 } 455 456 bool HotColdSplittingLegacyPass::runOnModule(Module &M) { 457 if (skipModule(M)) 458 return false; 459 ProfileSummaryInfo *PSI = 460 getAnalysis<ProfileSummaryInfoWrapperPass>().getPSI(); 461 auto GTTI = [this](Function &F) -> TargetTransformInfo & { 462 return this->getAnalysis<TargetTransformInfoWrapperPass>().getTTI(F); 463 }; 464 auto GBFI = [this](Function &F) { 465 return &this->getAnalysis<BlockFrequencyInfoWrapperPass>(F).getBFI(); 466 }; 467 std::unique_ptr<OptimizationRemarkEmitter> ORE; 468 std::function<OptimizationRemarkEmitter &(Function &)> GetORE = 469 [&ORE](Function &F) -> OptimizationRemarkEmitter & { 470 ORE.reset(new OptimizationRemarkEmitter(&F)); 471 return *ORE.get(); 472 }; 473 474 return HotColdSplitting(PSI, GBFI, GTTI, &GetORE).run(M); 475 } 476 477 PreservedAnalyses 478 HotColdSplittingPass::run(Module &M, ModuleAnalysisManager &AM) { 479 auto &FAM = AM.getResult<FunctionAnalysisManagerModuleProxy>(M).getManager(); 480 481 std::function<AssumptionCache &(Function &)> GetAssumptionCache = 482 [&FAM](Function &F) -> AssumptionCache & { 483 return FAM.getResult<AssumptionAnalysis>(F); 484 }; 485 486 auto GBFI = [&FAM](Function &F) { 487 return &FAM.getResult<BlockFrequencyAnalysis>(F); 488 }; 489 490 std::function<TargetTransformInfo &(Function &)> GTTI = 491 [&FAM](Function &F) -> TargetTransformInfo & { 492 return FAM.getResult<TargetIRAnalysis>(F); 493 }; 494 495 std::unique_ptr<OptimizationRemarkEmitter> ORE; 496 std::function<OptimizationRemarkEmitter &(Function &)> GetORE = 497 [&ORE](Function &F) -> OptimizationRemarkEmitter & { 498 ORE.reset(new OptimizationRemarkEmitter(&F)); 499 return *ORE.get(); 500 }; 501 502 ProfileSummaryInfo *PSI = &AM.getResult<ProfileSummaryAnalysis>(M); 503 504 if (HotColdSplitting(PSI, GBFI, GTTI, &GetORE).run(M)) 505 return PreservedAnalyses::none(); 506 return PreservedAnalyses::all(); 507 } 508 509 char HotColdSplittingLegacyPass::ID = 0; 510 INITIALIZE_PASS_BEGIN(HotColdSplittingLegacyPass, "hotcoldsplit", 511 "Hot Cold Splitting", false, false) 512 INITIALIZE_PASS_DEPENDENCY(ProfileSummaryInfoWrapperPass) 513 INITIALIZE_PASS_DEPENDENCY(BlockFrequencyInfoWrapperPass) 514 INITIALIZE_PASS_END(HotColdSplittingLegacyPass, "hotcoldsplit", 515 "Hot Cold Splitting", false, false) 516 517 ModulePass *llvm::createHotColdSplittingPass() { 518 return new HotColdSplittingLegacyPass(); 519 } 520