1 //===-- MachineBlockPlacement.cpp - Basic Block Code Layout optimization --===// 2 // 3 // The LLVM Compiler Infrastructure 4 // 5 // This file is distributed under the University of Illinois Open Source 6 // License. See LICENSE.TXT for details. 7 // 8 //===----------------------------------------------------------------------===// 9 // 10 // This file implements basic block placement transformations using the CFG 11 // structure and branch probability estimates. 12 // 13 // The pass strives to preserve the structure of the CFG (that is, retain 14 // a topological ordering of basic blocks) in the absence of a *strong* signal 15 // to the contrary from probabilities. However, within the CFG structure, it 16 // attempts to choose an ordering which favors placing more likely sequences of 17 // blocks adjacent to each other. 18 // 19 // The algorithm works from the inner-most loop within a function outward, and 20 // at each stage walks through the basic blocks, trying to coalesce them into 21 // sequential chains where allowed by the CFG (or demanded by heavy 22 // probabilities). Finally, it walks the blocks in topological order, and the 23 // first time it reaches a chain of basic blocks, it schedules them in the 24 // function in-order. 25 // 26 //===----------------------------------------------------------------------===// 27 28 #include "llvm/CodeGen/Passes.h" 29 #include "llvm/ADT/DenseMap.h" 30 #include "llvm/ADT/SmallPtrSet.h" 31 #include "llvm/ADT/SmallVector.h" 32 #include "llvm/ADT/Statistic.h" 33 #include "llvm/CodeGen/MachineBasicBlock.h" 34 #include "llvm/CodeGen/MachineBlockFrequencyInfo.h" 35 #include "llvm/CodeGen/MachineBranchProbabilityInfo.h" 36 #include "llvm/CodeGen/MachineDominators.h" 37 #include "llvm/CodeGen/MachineFunction.h" 38 #include "llvm/CodeGen/MachineFunctionPass.h" 39 #include "llvm/CodeGen/MachineLoopInfo.h" 40 #include "llvm/CodeGen/MachineModuleInfo.h" 41 #include "llvm/Support/Allocator.h" 42 #include "llvm/Support/CommandLine.h" 43 #include "llvm/Support/Debug.h" 44 #include "llvm/Support/raw_ostream.h" 45 #include "llvm/Target/TargetInstrInfo.h" 46 #include "llvm/Target/TargetLowering.h" 47 #include "llvm/Target/TargetSubtargetInfo.h" 48 #include <algorithm> 49 using namespace llvm; 50 51 #define DEBUG_TYPE "block-placement" 52 53 STATISTIC(NumCondBranches, "Number of conditional branches"); 54 STATISTIC(NumUncondBranches, "Number of unconditional branches"); 55 STATISTIC(CondBranchTakenFreq, 56 "Potential frequency of taking conditional branches"); 57 STATISTIC(UncondBranchTakenFreq, 58 "Potential frequency of taking unconditional branches"); 59 60 static cl::opt<unsigned> AlignAllBlock("align-all-blocks", 61 cl::desc("Force the alignment of all " 62 "blocks in the function."), 63 cl::init(0), cl::Hidden); 64 65 static cl::opt<unsigned> AlignAllNonFallThruBlocks( 66 "align-all-nofallthru-blocks", 67 cl::desc("Force the alignment of all " 68 "blocks that have no fall-through predecessors (i.e. don't add " 69 "nops that are executed)."), 70 cl::init(0), cl::Hidden); 71 72 // FIXME: Find a good default for this flag and remove the flag. 73 static cl::opt<unsigned> ExitBlockBias( 74 "block-placement-exit-block-bias", 75 cl::desc("Block frequency percentage a loop exit block needs " 76 "over the original exit to be considered the new exit."), 77 cl::init(0), cl::Hidden); 78 79 static cl::opt<bool> OutlineOptionalBranches( 80 "outline-optional-branches", 81 cl::desc("Put completely optional branches, i.e. branches with a common " 82 "post dominator, out of line."), 83 cl::init(false), cl::Hidden); 84 85 static cl::opt<unsigned> OutlineOptionalThreshold( 86 "outline-optional-threshold", 87 cl::desc("Don't outline optional branches that are a single block with an " 88 "instruction count below this threshold"), 89 cl::init(4), cl::Hidden); 90 91 static cl::opt<unsigned> LoopToColdBlockRatio( 92 "loop-to-cold-block-ratio", 93 cl::desc("Outline loop blocks from loop chain if (frequency of loop) / " 94 "(frequency of block) is greater than this ratio"), 95 cl::init(5), cl::Hidden); 96 97 static cl::opt<bool> 98 PreciseRotationCost("precise-rotation-cost", 99 cl::desc("Model the cost of loop rotation more " 100 "precisely by using profile data."), 101 cl::init(false), cl::Hidden); 102 103 static cl::opt<unsigned> MisfetchCost( 104 "misfetch-cost", 105 cl::desc("Cost that models the probablistic risk of an instruction " 106 "misfetch due to a jump comparing to falling through, whose cost " 107 "is zero."), 108 cl::init(1), cl::Hidden); 109 110 static cl::opt<unsigned> JumpInstCost("jump-inst-cost", 111 cl::desc("Cost of jump instructions."), 112 cl::init(1), cl::Hidden); 113 114 namespace { 115 class BlockChain; 116 /// \brief Type for our function-wide basic block -> block chain mapping. 117 typedef DenseMap<MachineBasicBlock *, BlockChain *> BlockToChainMapType; 118 } 119 120 namespace { 121 /// \brief A chain of blocks which will be laid out contiguously. 122 /// 123 /// This is the datastructure representing a chain of consecutive blocks that 124 /// are profitable to layout together in order to maximize fallthrough 125 /// probabilities and code locality. We also can use a block chain to represent 126 /// a sequence of basic blocks which have some external (correctness) 127 /// requirement for sequential layout. 128 /// 129 /// Chains can be built around a single basic block and can be merged to grow 130 /// them. They participate in a block-to-chain mapping, which is updated 131 /// automatically as chains are merged together. 132 class BlockChain { 133 /// \brief The sequence of blocks belonging to this chain. 134 /// 135 /// This is the sequence of blocks for a particular chain. These will be laid 136 /// out in-order within the function. 137 SmallVector<MachineBasicBlock *, 4> Blocks; 138 139 /// \brief A handle to the function-wide basic block to block chain mapping. 140 /// 141 /// This is retained in each block chain to simplify the computation of child 142 /// block chains for SCC-formation and iteration. We store the edges to child 143 /// basic blocks, and map them back to their associated chains using this 144 /// structure. 145 BlockToChainMapType &BlockToChain; 146 147 public: 148 /// \brief Construct a new BlockChain. 149 /// 150 /// This builds a new block chain representing a single basic block in the 151 /// function. It also registers itself as the chain that block participates 152 /// in with the BlockToChain mapping. 153 BlockChain(BlockToChainMapType &BlockToChain, MachineBasicBlock *BB) 154 : Blocks(1, BB), BlockToChain(BlockToChain), LoopPredecessors(0) { 155 assert(BB && "Cannot create a chain with a null basic block"); 156 BlockToChain[BB] = this; 157 } 158 159 /// \brief Iterator over blocks within the chain. 160 typedef SmallVectorImpl<MachineBasicBlock *>::iterator iterator; 161 162 /// \brief Beginning of blocks within the chain. 163 iterator begin() { return Blocks.begin(); } 164 165 /// \brief End of blocks within the chain. 166 iterator end() { return Blocks.end(); } 167 168 /// \brief Merge a block chain into this one. 169 /// 170 /// This routine merges a block chain into this one. It takes care of forming 171 /// a contiguous sequence of basic blocks, updating the edge list, and 172 /// updating the block -> chain mapping. It does not free or tear down the 173 /// old chain, but the old chain's block list is no longer valid. 174 void merge(MachineBasicBlock *BB, BlockChain *Chain) { 175 assert(BB); 176 assert(!Blocks.empty()); 177 178 // Fast path in case we don't have a chain already. 179 if (!Chain) { 180 assert(!BlockToChain[BB]); 181 Blocks.push_back(BB); 182 BlockToChain[BB] = this; 183 return; 184 } 185 186 assert(BB == *Chain->begin()); 187 assert(Chain->begin() != Chain->end()); 188 189 // Update the incoming blocks to point to this chain, and add them to the 190 // chain structure. 191 for (MachineBasicBlock *ChainBB : *Chain) { 192 Blocks.push_back(ChainBB); 193 assert(BlockToChain[ChainBB] == Chain && "Incoming blocks not in chain"); 194 BlockToChain[ChainBB] = this; 195 } 196 } 197 198 #ifndef NDEBUG 199 /// \brief Dump the blocks in this chain. 200 LLVM_DUMP_METHOD void dump() { 201 for (MachineBasicBlock *MBB : *this) 202 MBB->dump(); 203 } 204 #endif // NDEBUG 205 206 /// \brief Count of predecessors within the loop currently being processed. 207 /// 208 /// This count is updated at each loop we process to represent the number of 209 /// in-loop predecessors of this chain. 210 unsigned LoopPredecessors; 211 }; 212 } 213 214 namespace { 215 class MachineBlockPlacement : public MachineFunctionPass { 216 /// \brief A typedef for a block filter set. 217 typedef SmallPtrSet<MachineBasicBlock *, 16> BlockFilterSet; 218 219 /// \brief A handle to the branch probability pass. 220 const MachineBranchProbabilityInfo *MBPI; 221 222 /// \brief A handle to the function-wide block frequency pass. 223 const MachineBlockFrequencyInfo *MBFI; 224 225 /// \brief A handle to the loop info. 226 const MachineLoopInfo *MLI; 227 228 /// \brief A handle to the target's instruction info. 229 const TargetInstrInfo *TII; 230 231 /// \brief A handle to the target's lowering info. 232 const TargetLoweringBase *TLI; 233 234 /// \brief A handle to the post dominator tree. 235 MachineDominatorTree *MDT; 236 237 /// \brief A set of blocks that are unavoidably execute, i.e. they dominate 238 /// all terminators of the MachineFunction. 239 SmallPtrSet<MachineBasicBlock *, 4> UnavoidableBlocks; 240 241 /// \brief Allocator and owner of BlockChain structures. 242 /// 243 /// We build BlockChains lazily while processing the loop structure of 244 /// a function. To reduce malloc traffic, we allocate them using this 245 /// slab-like allocator, and destroy them after the pass completes. An 246 /// important guarantee is that this allocator produces stable pointers to 247 /// the chains. 248 SpecificBumpPtrAllocator<BlockChain> ChainAllocator; 249 250 /// \brief Function wide BasicBlock to BlockChain mapping. 251 /// 252 /// This mapping allows efficiently moving from any given basic block to the 253 /// BlockChain it participates in, if any. We use it to, among other things, 254 /// allow implicitly defining edges between chains as the existing edges 255 /// between basic blocks. 256 DenseMap<MachineBasicBlock *, BlockChain *> BlockToChain; 257 258 void markChainSuccessors(BlockChain &Chain, MachineBasicBlock *LoopHeaderBB, 259 SmallVectorImpl<MachineBasicBlock *> &BlockWorkList, 260 const BlockFilterSet *BlockFilter = nullptr); 261 MachineBasicBlock *selectBestSuccessor(MachineBasicBlock *BB, 262 BlockChain &Chain, 263 const BlockFilterSet *BlockFilter); 264 MachineBasicBlock * 265 selectBestCandidateBlock(BlockChain &Chain, 266 SmallVectorImpl<MachineBasicBlock *> &WorkList, 267 const BlockFilterSet *BlockFilter); 268 MachineBasicBlock * 269 getFirstUnplacedBlock(MachineFunction &F, const BlockChain &PlacedChain, 270 MachineFunction::iterator &PrevUnplacedBlockIt, 271 const BlockFilterSet *BlockFilter); 272 void buildChain(MachineBasicBlock *BB, BlockChain &Chain, 273 SmallVectorImpl<MachineBasicBlock *> &BlockWorkList, 274 const BlockFilterSet *BlockFilter = nullptr); 275 MachineBasicBlock *findBestLoopTop(MachineLoop &L, 276 const BlockFilterSet &LoopBlockSet); 277 MachineBasicBlock *findBestLoopExit(MachineFunction &F, MachineLoop &L, 278 const BlockFilterSet &LoopBlockSet); 279 BlockFilterSet collectLoopBlockSet(MachineFunction &F, MachineLoop &L); 280 void buildLoopChains(MachineFunction &F, MachineLoop &L); 281 void rotateLoop(BlockChain &LoopChain, MachineBasicBlock *ExitingBB, 282 const BlockFilterSet &LoopBlockSet); 283 void rotateLoopWithProfile(BlockChain &LoopChain, MachineLoop &L, 284 const BlockFilterSet &LoopBlockSet); 285 void buildCFGChains(MachineFunction &F); 286 287 public: 288 static char ID; // Pass identification, replacement for typeid 289 MachineBlockPlacement() : MachineFunctionPass(ID) { 290 initializeMachineBlockPlacementPass(*PassRegistry::getPassRegistry()); 291 } 292 293 bool runOnMachineFunction(MachineFunction &F) override; 294 295 void getAnalysisUsage(AnalysisUsage &AU) const override { 296 AU.addRequired<MachineBranchProbabilityInfo>(); 297 AU.addRequired<MachineBlockFrequencyInfo>(); 298 AU.addRequired<MachineDominatorTree>(); 299 AU.addRequired<MachineLoopInfo>(); 300 MachineFunctionPass::getAnalysisUsage(AU); 301 } 302 }; 303 } 304 305 char MachineBlockPlacement::ID = 0; 306 char &llvm::MachineBlockPlacementID = MachineBlockPlacement::ID; 307 INITIALIZE_PASS_BEGIN(MachineBlockPlacement, "block-placement", 308 "Branch Probability Basic Block Placement", false, false) 309 INITIALIZE_PASS_DEPENDENCY(MachineBranchProbabilityInfo) 310 INITIALIZE_PASS_DEPENDENCY(MachineBlockFrequencyInfo) 311 INITIALIZE_PASS_DEPENDENCY(MachineDominatorTree) 312 INITIALIZE_PASS_DEPENDENCY(MachineLoopInfo) 313 INITIALIZE_PASS_END(MachineBlockPlacement, "block-placement", 314 "Branch Probability Basic Block Placement", false, false) 315 316 #ifndef NDEBUG 317 /// \brief Helper to print the name of a MBB. 318 /// 319 /// Only used by debug logging. 320 static std::string getBlockName(MachineBasicBlock *BB) { 321 std::string Result; 322 raw_string_ostream OS(Result); 323 OS << "BB#" << BB->getNumber(); 324 OS << " (derived from LLVM BB '" << BB->getName() << "')"; 325 OS.flush(); 326 return Result; 327 } 328 329 /// \brief Helper to print the number of a MBB. 330 /// 331 /// Only used by debug logging. 332 static std::string getBlockNum(MachineBasicBlock *BB) { 333 std::string Result; 334 raw_string_ostream OS(Result); 335 OS << "BB#" << BB->getNumber(); 336 OS.flush(); 337 return Result; 338 } 339 #endif 340 341 /// \brief Mark a chain's successors as having one fewer preds. 342 /// 343 /// When a chain is being merged into the "placed" chain, this routine will 344 /// quickly walk the successors of each block in the chain and mark them as 345 /// having one fewer active predecessor. It also adds any successors of this 346 /// chain which reach the zero-predecessor state to the worklist passed in. 347 void MachineBlockPlacement::markChainSuccessors( 348 BlockChain &Chain, MachineBasicBlock *LoopHeaderBB, 349 SmallVectorImpl<MachineBasicBlock *> &BlockWorkList, 350 const BlockFilterSet *BlockFilter) { 351 // Walk all the blocks in this chain, marking their successors as having 352 // a predecessor placed. 353 for (MachineBasicBlock *MBB : Chain) { 354 // Add any successors for which this is the only un-placed in-loop 355 // predecessor to the worklist as a viable candidate for CFG-neutral 356 // placement. No subsequent placement of this block will violate the CFG 357 // shape, so we get to use heuristics to choose a favorable placement. 358 for (MachineBasicBlock *Succ : MBB->successors()) { 359 if (BlockFilter && !BlockFilter->count(Succ)) 360 continue; 361 BlockChain &SuccChain = *BlockToChain[Succ]; 362 // Disregard edges within a fixed chain, or edges to the loop header. 363 if (&Chain == &SuccChain || Succ == LoopHeaderBB) 364 continue; 365 366 // This is a cross-chain edge that is within the loop, so decrement the 367 // loop predecessor count of the destination chain. 368 if (SuccChain.LoopPredecessors > 0 && --SuccChain.LoopPredecessors == 0) 369 BlockWorkList.push_back(*SuccChain.begin()); 370 } 371 } 372 } 373 374 /// \brief Select the best successor for a block. 375 /// 376 /// This looks across all successors of a particular block and attempts to 377 /// select the "best" one to be the layout successor. It only considers direct 378 /// successors which also pass the block filter. It will attempt to avoid 379 /// breaking CFG structure, but cave and break such structures in the case of 380 /// very hot successor edges. 381 /// 382 /// \returns The best successor block found, or null if none are viable. 383 MachineBasicBlock * 384 MachineBlockPlacement::selectBestSuccessor(MachineBasicBlock *BB, 385 BlockChain &Chain, 386 const BlockFilterSet *BlockFilter) { 387 const BranchProbability HotProb(4, 5); // 80% 388 389 MachineBasicBlock *BestSucc = nullptr; 390 auto BestProb = BranchProbability::getZero(); 391 392 // Adjust edge probabilities by excluding edges pointing to blocks that is 393 // either not in BlockFilter or is already in the current chain. Consider the 394 // following CFG: 395 // 396 // --->A 397 // | / \ 398 // | B C 399 // | \ / \ 400 // ----D E 401 // 402 // Assume A->C is very hot (>90%), and C->D has a 50% probability, then after 403 // A->C is chosen as a fall-through, D won't be selected as a successor of C 404 // due to CFG constraint (the probability of C->D is not greater than 405 // HotProb). If we exclude E that is not in BlockFilter when calculating the 406 // probability of C->D, D will be selected and we will get A C D B as the 407 // layout of this loop. 408 auto AdjustedSumProb = BranchProbability::getOne(); 409 SmallVector<MachineBasicBlock *, 4> Successors; 410 for (MachineBasicBlock *Succ : BB->successors()) { 411 bool SkipSucc = false; 412 if (BlockFilter && !BlockFilter->count(Succ)) { 413 SkipSucc = true; 414 } else { 415 BlockChain *SuccChain = BlockToChain[Succ]; 416 if (SuccChain == &Chain) { 417 DEBUG(dbgs() << " " << getBlockName(Succ) 418 << " -> Already merged!\n"); 419 SkipSucc = true; 420 } else if (Succ != *SuccChain->begin()) { 421 DEBUG(dbgs() << " " << getBlockName(Succ) << " -> Mid chain!\n"); 422 continue; 423 } 424 } 425 if (SkipSucc) 426 AdjustedSumProb -= MBPI->getEdgeProbability(BB, Succ); 427 else 428 Successors.push_back(Succ); 429 } 430 431 DEBUG(dbgs() << "Attempting merge from: " << getBlockName(BB) << "\n"); 432 for (MachineBasicBlock *Succ : Successors) { 433 BranchProbability SuccProb; 434 uint32_t SuccProbN = MBPI->getEdgeProbability(BB, Succ).getNumerator(); 435 uint32_t SuccProbD = AdjustedSumProb.getNumerator(); 436 if (SuccProbN >= SuccProbD) 437 SuccProb = BranchProbability::getOne(); 438 else 439 SuccProb = BranchProbability(SuccProbN, SuccProbD); 440 441 // If we outline optional branches, look whether Succ is unavoidable, i.e. 442 // dominates all terminators of the MachineFunction. If it does, other 443 // successors must be optional. Don't do this for cold branches. 444 if (OutlineOptionalBranches && SuccProb > HotProb.getCompl() && 445 UnavoidableBlocks.count(Succ) > 0) { 446 auto HasShortOptionalBranch = [&]() { 447 for (MachineBasicBlock *Pred : Succ->predecessors()) { 448 // Check whether there is an unplaced optional branch. 449 if (Pred == Succ || (BlockFilter && !BlockFilter->count(Pred)) || 450 BlockToChain[Pred] == &Chain) 451 continue; 452 // Check whether the optional branch has exactly one BB. 453 if (Pred->pred_size() > 1 || *Pred->pred_begin() != BB) 454 continue; 455 // Check whether the optional branch is small. 456 if (Pred->size() < OutlineOptionalThreshold) 457 return true; 458 } 459 return false; 460 }; 461 if (!HasShortOptionalBranch()) 462 return Succ; 463 } 464 465 // Only consider successors which are either "hot", or wouldn't violate 466 // any CFG constraints. 467 BlockChain &SuccChain = *BlockToChain[Succ]; 468 if (SuccChain.LoopPredecessors != 0) { 469 if (SuccProb < HotProb) { 470 DEBUG(dbgs() << " " << getBlockName(Succ) << " -> " << SuccProb 471 << " (prob) (CFG conflict)\n"); 472 continue; 473 } 474 475 // Make sure that a hot successor doesn't have a globally more 476 // important predecessor. 477 auto RealSuccProb = MBPI->getEdgeProbability(BB, Succ); 478 BlockFrequency CandidateEdgeFreq = 479 MBFI->getBlockFreq(BB) * RealSuccProb * HotProb.getCompl(); 480 bool BadCFGConflict = false; 481 for (MachineBasicBlock *Pred : Succ->predecessors()) { 482 if (Pred == Succ || (BlockFilter && !BlockFilter->count(Pred)) || 483 BlockToChain[Pred] == &Chain) 484 continue; 485 BlockFrequency PredEdgeFreq = 486 MBFI->getBlockFreq(Pred) * MBPI->getEdgeProbability(Pred, Succ); 487 if (PredEdgeFreq >= CandidateEdgeFreq) { 488 BadCFGConflict = true; 489 break; 490 } 491 } 492 if (BadCFGConflict) { 493 DEBUG(dbgs() << " " << getBlockName(Succ) << " -> " << SuccProb 494 << " (prob) (non-cold CFG conflict)\n"); 495 continue; 496 } 497 } 498 499 DEBUG(dbgs() << " " << getBlockName(Succ) << " -> " << SuccProb 500 << " (prob)" 501 << (SuccChain.LoopPredecessors != 0 ? " (CFG break)" : "") 502 << "\n"); 503 if (BestSucc && BestProb >= SuccProb) 504 continue; 505 BestSucc = Succ; 506 BestProb = SuccProb; 507 } 508 return BestSucc; 509 } 510 511 /// \brief Select the best block from a worklist. 512 /// 513 /// This looks through the provided worklist as a list of candidate basic 514 /// blocks and select the most profitable one to place. The definition of 515 /// profitable only really makes sense in the context of a loop. This returns 516 /// the most frequently visited block in the worklist, which in the case of 517 /// a loop, is the one most desirable to be physically close to the rest of the 518 /// loop body in order to improve icache behavior. 519 /// 520 /// \returns The best block found, or null if none are viable. 521 MachineBasicBlock *MachineBlockPlacement::selectBestCandidateBlock( 522 BlockChain &Chain, SmallVectorImpl<MachineBasicBlock *> &WorkList, 523 const BlockFilterSet *BlockFilter) { 524 // Once we need to walk the worklist looking for a candidate, cleanup the 525 // worklist of already placed entries. 526 // FIXME: If this shows up on profiles, it could be folded (at the cost of 527 // some code complexity) into the loop below. 528 WorkList.erase(std::remove_if(WorkList.begin(), WorkList.end(), 529 [&](MachineBasicBlock *BB) { 530 return BlockToChain.lookup(BB) == &Chain; 531 }), 532 WorkList.end()); 533 534 MachineBasicBlock *BestBlock = nullptr; 535 BlockFrequency BestFreq; 536 for (MachineBasicBlock *MBB : WorkList) { 537 BlockChain &SuccChain = *BlockToChain[MBB]; 538 if (&SuccChain == &Chain) { 539 DEBUG(dbgs() << " " << getBlockName(MBB) << " -> Already merged!\n"); 540 continue; 541 } 542 assert(SuccChain.LoopPredecessors == 0 && "Found CFG-violating block"); 543 544 BlockFrequency CandidateFreq = MBFI->getBlockFreq(MBB); 545 DEBUG(dbgs() << " " << getBlockName(MBB) << " -> "; 546 MBFI->printBlockFreq(dbgs(), CandidateFreq) << " (freq)\n"); 547 if (BestBlock && BestFreq >= CandidateFreq) 548 continue; 549 BestBlock = MBB; 550 BestFreq = CandidateFreq; 551 } 552 return BestBlock; 553 } 554 555 /// \brief Retrieve the first unplaced basic block. 556 /// 557 /// This routine is called when we are unable to use the CFG to walk through 558 /// all of the basic blocks and form a chain due to unnatural loops in the CFG. 559 /// We walk through the function's blocks in order, starting from the 560 /// LastUnplacedBlockIt. We update this iterator on each call to avoid 561 /// re-scanning the entire sequence on repeated calls to this routine. 562 MachineBasicBlock *MachineBlockPlacement::getFirstUnplacedBlock( 563 MachineFunction &F, const BlockChain &PlacedChain, 564 MachineFunction::iterator &PrevUnplacedBlockIt, 565 const BlockFilterSet *BlockFilter) { 566 for (MachineFunction::iterator I = PrevUnplacedBlockIt, E = F.end(); I != E; 567 ++I) { 568 if (BlockFilter && !BlockFilter->count(&*I)) 569 continue; 570 if (BlockToChain[&*I] != &PlacedChain) { 571 PrevUnplacedBlockIt = I; 572 // Now select the head of the chain to which the unplaced block belongs 573 // as the block to place. This will force the entire chain to be placed, 574 // and satisfies the requirements of merging chains. 575 return *BlockToChain[&*I]->begin(); 576 } 577 } 578 return nullptr; 579 } 580 581 void MachineBlockPlacement::buildChain( 582 MachineBasicBlock *BB, BlockChain &Chain, 583 SmallVectorImpl<MachineBasicBlock *> &BlockWorkList, 584 const BlockFilterSet *BlockFilter) { 585 assert(BB); 586 assert(BlockToChain[BB] == &Chain); 587 MachineFunction &F = *BB->getParent(); 588 MachineFunction::iterator PrevUnplacedBlockIt = F.begin(); 589 590 MachineBasicBlock *LoopHeaderBB = BB; 591 markChainSuccessors(Chain, LoopHeaderBB, BlockWorkList, BlockFilter); 592 BB = *std::prev(Chain.end()); 593 for (;;) { 594 assert(BB); 595 assert(BlockToChain[BB] == &Chain); 596 assert(*std::prev(Chain.end()) == BB); 597 598 // Look for the best viable successor if there is one to place immediately 599 // after this block. 600 MachineBasicBlock *BestSucc = selectBestSuccessor(BB, Chain, BlockFilter); 601 602 // If an immediate successor isn't available, look for the best viable 603 // block among those we've identified as not violating the loop's CFG at 604 // this point. This won't be a fallthrough, but it will increase locality. 605 if (!BestSucc) 606 BestSucc = selectBestCandidateBlock(Chain, BlockWorkList, BlockFilter); 607 608 if (!BestSucc) { 609 BestSucc = 610 getFirstUnplacedBlock(F, Chain, PrevUnplacedBlockIt, BlockFilter); 611 if (!BestSucc) 612 break; 613 614 DEBUG(dbgs() << "Unnatural loop CFG detected, forcibly merging the " 615 "layout successor until the CFG reduces\n"); 616 } 617 618 // Place this block, updating the datastructures to reflect its placement. 619 BlockChain &SuccChain = *BlockToChain[BestSucc]; 620 // Zero out LoopPredecessors for the successor we're about to merge in case 621 // we selected a successor that didn't fit naturally into the CFG. 622 SuccChain.LoopPredecessors = 0; 623 DEBUG(dbgs() << "Merging from " << getBlockNum(BB) << " to " 624 << getBlockNum(BestSucc) << "\n"); 625 markChainSuccessors(SuccChain, LoopHeaderBB, BlockWorkList, BlockFilter); 626 Chain.merge(BestSucc, &SuccChain); 627 BB = *std::prev(Chain.end()); 628 } 629 630 DEBUG(dbgs() << "Finished forming chain for header block " 631 << getBlockNum(*Chain.begin()) << "\n"); 632 } 633 634 /// \brief Find the best loop top block for layout. 635 /// 636 /// Look for a block which is strictly better than the loop header for laying 637 /// out at the top of the loop. This looks for one and only one pattern: 638 /// a latch block with no conditional exit. This block will cause a conditional 639 /// jump around it or will be the bottom of the loop if we lay it out in place, 640 /// but if it it doesn't end up at the bottom of the loop for any reason, 641 /// rotation alone won't fix it. Because such a block will always result in an 642 /// unconditional jump (for the backedge) rotating it in front of the loop 643 /// header is always profitable. 644 MachineBasicBlock * 645 MachineBlockPlacement::findBestLoopTop(MachineLoop &L, 646 const BlockFilterSet &LoopBlockSet) { 647 // Check that the header hasn't been fused with a preheader block due to 648 // crazy branches. If it has, we need to start with the header at the top to 649 // prevent pulling the preheader into the loop body. 650 BlockChain &HeaderChain = *BlockToChain[L.getHeader()]; 651 if (!LoopBlockSet.count(*HeaderChain.begin())) 652 return L.getHeader(); 653 654 DEBUG(dbgs() << "Finding best loop top for: " << getBlockName(L.getHeader()) 655 << "\n"); 656 657 BlockFrequency BestPredFreq; 658 MachineBasicBlock *BestPred = nullptr; 659 for (MachineBasicBlock *Pred : L.getHeader()->predecessors()) { 660 if (!LoopBlockSet.count(Pred)) 661 continue; 662 DEBUG(dbgs() << " header pred: " << getBlockName(Pred) << ", " 663 << Pred->succ_size() << " successors, "; 664 MBFI->printBlockFreq(dbgs(), Pred) << " freq\n"); 665 if (Pred->succ_size() > 1) 666 continue; 667 668 BlockFrequency PredFreq = MBFI->getBlockFreq(Pred); 669 if (!BestPred || PredFreq > BestPredFreq || 670 (!(PredFreq < BestPredFreq) && 671 Pred->isLayoutSuccessor(L.getHeader()))) { 672 BestPred = Pred; 673 BestPredFreq = PredFreq; 674 } 675 } 676 677 // If no direct predecessor is fine, just use the loop header. 678 if (!BestPred) 679 return L.getHeader(); 680 681 // Walk backwards through any straight line of predecessors. 682 while (BestPred->pred_size() == 1 && 683 (*BestPred->pred_begin())->succ_size() == 1 && 684 *BestPred->pred_begin() != L.getHeader()) 685 BestPred = *BestPred->pred_begin(); 686 687 DEBUG(dbgs() << " final top: " << getBlockName(BestPred) << "\n"); 688 return BestPred; 689 } 690 691 /// \brief Find the best loop exiting block for layout. 692 /// 693 /// This routine implements the logic to analyze the loop looking for the best 694 /// block to layout at the top of the loop. Typically this is done to maximize 695 /// fallthrough opportunities. 696 MachineBasicBlock * 697 MachineBlockPlacement::findBestLoopExit(MachineFunction &F, MachineLoop &L, 698 const BlockFilterSet &LoopBlockSet) { 699 // We don't want to layout the loop linearly in all cases. If the loop header 700 // is just a normal basic block in the loop, we want to look for what block 701 // within the loop is the best one to layout at the top. However, if the loop 702 // header has be pre-merged into a chain due to predecessors not having 703 // analyzable branches, *and* the predecessor it is merged with is *not* part 704 // of the loop, rotating the header into the middle of the loop will create 705 // a non-contiguous range of blocks which is Very Bad. So start with the 706 // header and only rotate if safe. 707 BlockChain &HeaderChain = *BlockToChain[L.getHeader()]; 708 if (!LoopBlockSet.count(*HeaderChain.begin())) 709 return nullptr; 710 711 BlockFrequency BestExitEdgeFreq; 712 unsigned BestExitLoopDepth = 0; 713 MachineBasicBlock *ExitingBB = nullptr; 714 // If there are exits to outer loops, loop rotation can severely limit 715 // fallthrough opportunites unless it selects such an exit. Keep a set of 716 // blocks where rotating to exit with that block will reach an outer loop. 717 SmallPtrSet<MachineBasicBlock *, 4> BlocksExitingToOuterLoop; 718 719 DEBUG(dbgs() << "Finding best loop exit for: " << getBlockName(L.getHeader()) 720 << "\n"); 721 for (MachineBasicBlock *MBB : L.getBlocks()) { 722 BlockChain &Chain = *BlockToChain[MBB]; 723 // Ensure that this block is at the end of a chain; otherwise it could be 724 // mid-way through an inner loop or a successor of an unanalyzable branch. 725 if (MBB != *std::prev(Chain.end())) 726 continue; 727 728 // Now walk the successors. We need to establish whether this has a viable 729 // exiting successor and whether it has a viable non-exiting successor. 730 // We store the old exiting state and restore it if a viable looping 731 // successor isn't found. 732 MachineBasicBlock *OldExitingBB = ExitingBB; 733 BlockFrequency OldBestExitEdgeFreq = BestExitEdgeFreq; 734 bool HasLoopingSucc = false; 735 for (MachineBasicBlock *Succ : MBB->successors()) { 736 if (Succ->isEHPad()) 737 continue; 738 if (Succ == MBB) 739 continue; 740 BlockChain &SuccChain = *BlockToChain[Succ]; 741 // Don't split chains, either this chain or the successor's chain. 742 if (&Chain == &SuccChain) { 743 DEBUG(dbgs() << " exiting: " << getBlockName(MBB) << " -> " 744 << getBlockName(Succ) << " (chain conflict)\n"); 745 continue; 746 } 747 748 auto SuccProb = MBPI->getEdgeProbability(MBB, Succ); 749 if (LoopBlockSet.count(Succ)) { 750 DEBUG(dbgs() << " looping: " << getBlockName(MBB) << " -> " 751 << getBlockName(Succ) << " (" << SuccProb << ")\n"); 752 HasLoopingSucc = true; 753 continue; 754 } 755 756 unsigned SuccLoopDepth = 0; 757 if (MachineLoop *ExitLoop = MLI->getLoopFor(Succ)) { 758 SuccLoopDepth = ExitLoop->getLoopDepth(); 759 if (ExitLoop->contains(&L)) 760 BlocksExitingToOuterLoop.insert(MBB); 761 } 762 763 BlockFrequency ExitEdgeFreq = MBFI->getBlockFreq(MBB) * SuccProb; 764 DEBUG(dbgs() << " exiting: " << getBlockName(MBB) << " -> " 765 << getBlockName(Succ) << " [L:" << SuccLoopDepth << "] ("; 766 MBFI->printBlockFreq(dbgs(), ExitEdgeFreq) << ")\n"); 767 // Note that we bias this toward an existing layout successor to retain 768 // incoming order in the absence of better information. The exit must have 769 // a frequency higher than the current exit before we consider breaking 770 // the layout. 771 BranchProbability Bias(100 - ExitBlockBias, 100); 772 if (!ExitingBB || SuccLoopDepth > BestExitLoopDepth || 773 ExitEdgeFreq > BestExitEdgeFreq || 774 (MBB->isLayoutSuccessor(Succ) && 775 !(ExitEdgeFreq < BestExitEdgeFreq * Bias))) { 776 BestExitEdgeFreq = ExitEdgeFreq; 777 ExitingBB = MBB; 778 } 779 } 780 781 if (!HasLoopingSucc) { 782 // Restore the old exiting state, no viable looping successor was found. 783 ExitingBB = OldExitingBB; 784 BestExitEdgeFreq = OldBestExitEdgeFreq; 785 continue; 786 } 787 } 788 // Without a candidate exiting block or with only a single block in the 789 // loop, just use the loop header to layout the loop. 790 if (!ExitingBB || L.getNumBlocks() == 1) 791 return nullptr; 792 793 // Also, if we have exit blocks which lead to outer loops but didn't select 794 // one of them as the exiting block we are rotating toward, disable loop 795 // rotation altogether. 796 if (!BlocksExitingToOuterLoop.empty() && 797 !BlocksExitingToOuterLoop.count(ExitingBB)) 798 return nullptr; 799 800 DEBUG(dbgs() << " Best exiting block: " << getBlockName(ExitingBB) << "\n"); 801 return ExitingBB; 802 } 803 804 /// \brief Attempt to rotate an exiting block to the bottom of the loop. 805 /// 806 /// Once we have built a chain, try to rotate it to line up the hot exit block 807 /// with fallthrough out of the loop if doing so doesn't introduce unnecessary 808 /// branches. For example, if the loop has fallthrough into its header and out 809 /// of its bottom already, don't rotate it. 810 void MachineBlockPlacement::rotateLoop(BlockChain &LoopChain, 811 MachineBasicBlock *ExitingBB, 812 const BlockFilterSet &LoopBlockSet) { 813 if (!ExitingBB) 814 return; 815 816 MachineBasicBlock *Top = *LoopChain.begin(); 817 bool ViableTopFallthrough = false; 818 for (MachineBasicBlock *Pred : Top->predecessors()) { 819 BlockChain *PredChain = BlockToChain[Pred]; 820 if (!LoopBlockSet.count(Pred) && 821 (!PredChain || Pred == *std::prev(PredChain->end()))) { 822 ViableTopFallthrough = true; 823 break; 824 } 825 } 826 827 // If the header has viable fallthrough, check whether the current loop 828 // bottom is a viable exiting block. If so, bail out as rotating will 829 // introduce an unnecessary branch. 830 if (ViableTopFallthrough) { 831 MachineBasicBlock *Bottom = *std::prev(LoopChain.end()); 832 for (MachineBasicBlock *Succ : Bottom->successors()) { 833 BlockChain *SuccChain = BlockToChain[Succ]; 834 if (!LoopBlockSet.count(Succ) && 835 (!SuccChain || Succ == *SuccChain->begin())) 836 return; 837 } 838 } 839 840 BlockChain::iterator ExitIt = 841 std::find(LoopChain.begin(), LoopChain.end(), ExitingBB); 842 if (ExitIt == LoopChain.end()) 843 return; 844 845 std::rotate(LoopChain.begin(), std::next(ExitIt), LoopChain.end()); 846 } 847 848 /// \brief Attempt to rotate a loop based on profile data to reduce branch cost. 849 /// 850 /// With profile data, we can determine the cost in terms of missed fall through 851 /// opportunities when rotating a loop chain and select the best rotation. 852 /// Basically, there are three kinds of cost to consider for each rotation: 853 /// 1. The possibly missed fall through edge (if it exists) from BB out of 854 /// the loop to the loop header. 855 /// 2. The possibly missed fall through edges (if they exist) from the loop 856 /// exits to BB out of the loop. 857 /// 3. The missed fall through edge (if it exists) from the last BB to the 858 /// first BB in the loop chain. 859 /// Therefore, the cost for a given rotation is the sum of costs listed above. 860 /// We select the best rotation with the smallest cost. 861 void MachineBlockPlacement::rotateLoopWithProfile( 862 BlockChain &LoopChain, MachineLoop &L, const BlockFilterSet &LoopBlockSet) { 863 auto HeaderBB = L.getHeader(); 864 auto HeaderIter = std::find(LoopChain.begin(), LoopChain.end(), HeaderBB); 865 auto RotationPos = LoopChain.end(); 866 867 BlockFrequency SmallestRotationCost = BlockFrequency::getMaxFrequency(); 868 869 // A utility lambda that scales up a block frequency by dividing it by a 870 // branch probability which is the reciprocal of the scale. 871 auto ScaleBlockFrequency = [](BlockFrequency Freq, 872 unsigned Scale) -> BlockFrequency { 873 if (Scale == 0) 874 return 0; 875 // Use operator / between BlockFrequency and BranchProbability to implement 876 // saturating multiplication. 877 return Freq / BranchProbability(1, Scale); 878 }; 879 880 // Compute the cost of the missed fall-through edge to the loop header if the 881 // chain head is not the loop header. As we only consider natural loops with 882 // single header, this computation can be done only once. 883 BlockFrequency HeaderFallThroughCost(0); 884 for (auto *Pred : HeaderBB->predecessors()) { 885 BlockChain *PredChain = BlockToChain[Pred]; 886 if (!LoopBlockSet.count(Pred) && 887 (!PredChain || Pred == *std::prev(PredChain->end()))) { 888 auto EdgeFreq = 889 MBFI->getBlockFreq(Pred) * MBPI->getEdgeProbability(Pred, HeaderBB); 890 auto FallThruCost = ScaleBlockFrequency(EdgeFreq, MisfetchCost); 891 // If the predecessor has only an unconditional jump to the header, we 892 // need to consider the cost of this jump. 893 if (Pred->succ_size() == 1) 894 FallThruCost += ScaleBlockFrequency(EdgeFreq, JumpInstCost); 895 HeaderFallThroughCost = std::max(HeaderFallThroughCost, FallThruCost); 896 } 897 } 898 899 // Here we collect all exit blocks in the loop, and for each exit we find out 900 // its hottest exit edge. For each loop rotation, we define the loop exit cost 901 // as the sum of frequencies of exit edges we collect here, excluding the exit 902 // edge from the tail of the loop chain. 903 SmallVector<std::pair<MachineBasicBlock *, BlockFrequency>, 4> ExitsWithFreq; 904 for (auto BB : LoopChain) { 905 auto LargestExitEdgeProb = BranchProbability::getZero(); 906 for (auto *Succ : BB->successors()) { 907 BlockChain *SuccChain = BlockToChain[Succ]; 908 if (!LoopBlockSet.count(Succ) && 909 (!SuccChain || Succ == *SuccChain->begin())) { 910 auto SuccProb = MBPI->getEdgeProbability(BB, Succ); 911 LargestExitEdgeProb = std::max(LargestExitEdgeProb, SuccProb); 912 } 913 } 914 if (LargestExitEdgeProb > BranchProbability::getZero()) { 915 auto ExitFreq = MBFI->getBlockFreq(BB) * LargestExitEdgeProb; 916 ExitsWithFreq.emplace_back(BB, ExitFreq); 917 } 918 } 919 920 // In this loop we iterate every block in the loop chain and calculate the 921 // cost assuming the block is the head of the loop chain. When the loop ends, 922 // we should have found the best candidate as the loop chain's head. 923 for (auto Iter = LoopChain.begin(), TailIter = std::prev(LoopChain.end()), 924 EndIter = LoopChain.end(); 925 Iter != EndIter; Iter++, TailIter++) { 926 // TailIter is used to track the tail of the loop chain if the block we are 927 // checking (pointed by Iter) is the head of the chain. 928 if (TailIter == LoopChain.end()) 929 TailIter = LoopChain.begin(); 930 931 auto TailBB = *TailIter; 932 933 // Calculate the cost by putting this BB to the top. 934 BlockFrequency Cost = 0; 935 936 // If the current BB is the loop header, we need to take into account the 937 // cost of the missed fall through edge from outside of the loop to the 938 // header. 939 if (Iter != HeaderIter) 940 Cost += HeaderFallThroughCost; 941 942 // Collect the loop exit cost by summing up frequencies of all exit edges 943 // except the one from the chain tail. 944 for (auto &ExitWithFreq : ExitsWithFreq) 945 if (TailBB != ExitWithFreq.first) 946 Cost += ExitWithFreq.second; 947 948 // The cost of breaking the once fall-through edge from the tail to the top 949 // of the loop chain. Here we need to consider three cases: 950 // 1. If the tail node has only one successor, then we will get an 951 // additional jmp instruction. So the cost here is (MisfetchCost + 952 // JumpInstCost) * tail node frequency. 953 // 2. If the tail node has two successors, then we may still get an 954 // additional jmp instruction if the layout successor after the loop 955 // chain is not its CFG successor. Note that the more frequently executed 956 // jmp instruction will be put ahead of the other one. Assume the 957 // frequency of those two branches are x and y, where x is the frequency 958 // of the edge to the chain head, then the cost will be 959 // (x * MisfetechCost + min(x, y) * JumpInstCost) * tail node frequency. 960 // 3. If the tail node has more than two successors (this rarely happens), 961 // we won't consider any additional cost. 962 if (TailBB->isSuccessor(*Iter)) { 963 auto TailBBFreq = MBFI->getBlockFreq(TailBB); 964 if (TailBB->succ_size() == 1) 965 Cost += ScaleBlockFrequency(TailBBFreq.getFrequency(), 966 MisfetchCost + JumpInstCost); 967 else if (TailBB->succ_size() == 2) { 968 auto TailToHeadProb = MBPI->getEdgeProbability(TailBB, *Iter); 969 auto TailToHeadFreq = TailBBFreq * TailToHeadProb; 970 auto ColderEdgeFreq = TailToHeadProb > BranchProbability(1, 2) 971 ? TailBBFreq * TailToHeadProb.getCompl() 972 : TailToHeadFreq; 973 Cost += ScaleBlockFrequency(TailToHeadFreq, MisfetchCost) + 974 ScaleBlockFrequency(ColderEdgeFreq, JumpInstCost); 975 } 976 } 977 978 DEBUG(dbgs() << "The cost of loop rotation by making " << getBlockNum(*Iter) 979 << " to the top: " << Cost.getFrequency() << "\n"); 980 981 if (Cost < SmallestRotationCost) { 982 SmallestRotationCost = Cost; 983 RotationPos = Iter; 984 } 985 } 986 987 if (RotationPos != LoopChain.end()) { 988 DEBUG(dbgs() << "Rotate loop by making " << getBlockNum(*RotationPos) 989 << " to the top\n"); 990 std::rotate(LoopChain.begin(), RotationPos, LoopChain.end()); 991 } 992 } 993 994 /// \brief Collect blocks in the given loop that are to be placed. 995 /// 996 /// When profile data is available, exclude cold blocks from the returned set; 997 /// otherwise, collect all blocks in the loop. 998 MachineBlockPlacement::BlockFilterSet 999 MachineBlockPlacement::collectLoopBlockSet(MachineFunction &F, MachineLoop &L) { 1000 BlockFilterSet LoopBlockSet; 1001 1002 // Filter cold blocks off from LoopBlockSet when profile data is available. 1003 // Collect the sum of frequencies of incoming edges to the loop header from 1004 // outside. If we treat the loop as a super block, this is the frequency of 1005 // the loop. Then for each block in the loop, we calculate the ratio between 1006 // its frequency and the frequency of the loop block. When it is too small, 1007 // don't add it to the loop chain. If there are outer loops, then this block 1008 // will be merged into the first outer loop chain for which this block is not 1009 // cold anymore. This needs precise profile data and we only do this when 1010 // profile data is available. 1011 if (F.getFunction()->getEntryCount()) { 1012 BlockFrequency LoopFreq(0); 1013 for (auto LoopPred : L.getHeader()->predecessors()) 1014 if (!L.contains(LoopPred)) 1015 LoopFreq += MBFI->getBlockFreq(LoopPred) * 1016 MBPI->getEdgeProbability(LoopPred, L.getHeader()); 1017 1018 for (MachineBasicBlock *LoopBB : L.getBlocks()) { 1019 auto Freq = MBFI->getBlockFreq(LoopBB).getFrequency(); 1020 if (Freq == 0 || LoopFreq.getFrequency() / Freq > LoopToColdBlockRatio) 1021 continue; 1022 LoopBlockSet.insert(LoopBB); 1023 } 1024 } else 1025 LoopBlockSet.insert(L.block_begin(), L.block_end()); 1026 1027 return LoopBlockSet; 1028 } 1029 1030 /// \brief Forms basic block chains from the natural loop structures. 1031 /// 1032 /// These chains are designed to preserve the existing *structure* of the code 1033 /// as much as possible. We can then stitch the chains together in a way which 1034 /// both preserves the topological structure and minimizes taken conditional 1035 /// branches. 1036 void MachineBlockPlacement::buildLoopChains(MachineFunction &F, 1037 MachineLoop &L) { 1038 // First recurse through any nested loops, building chains for those inner 1039 // loops. 1040 for (MachineLoop *InnerLoop : L) 1041 buildLoopChains(F, *InnerLoop); 1042 1043 SmallVector<MachineBasicBlock *, 16> BlockWorkList; 1044 BlockFilterSet LoopBlockSet = collectLoopBlockSet(F, L); 1045 1046 // Check if we have profile data for this function. If yes, we will rotate 1047 // this loop by modeling costs more precisely which requires the profile data 1048 // for better layout. 1049 bool RotateLoopWithProfile = 1050 PreciseRotationCost && F.getFunction()->getEntryCount(); 1051 1052 // First check to see if there is an obviously preferable top block for the 1053 // loop. This will default to the header, but may end up as one of the 1054 // predecessors to the header if there is one which will result in strictly 1055 // fewer branches in the loop body. 1056 // When we use profile data to rotate the loop, this is unnecessary. 1057 MachineBasicBlock *LoopTop = 1058 RotateLoopWithProfile ? L.getHeader() : findBestLoopTop(L, LoopBlockSet); 1059 1060 // If we selected just the header for the loop top, look for a potentially 1061 // profitable exit block in the event that rotating the loop can eliminate 1062 // branches by placing an exit edge at the bottom. 1063 MachineBasicBlock *ExitingBB = nullptr; 1064 if (!RotateLoopWithProfile && LoopTop == L.getHeader()) 1065 ExitingBB = findBestLoopExit(F, L, LoopBlockSet); 1066 1067 BlockChain &LoopChain = *BlockToChain[LoopTop]; 1068 1069 // FIXME: This is a really lame way of walking the chains in the loop: we 1070 // walk the blocks, and use a set to prevent visiting a particular chain 1071 // twice. 1072 SmallPtrSet<BlockChain *, 4> UpdatedPreds; 1073 assert(LoopChain.LoopPredecessors == 0); 1074 UpdatedPreds.insert(&LoopChain); 1075 1076 for (MachineBasicBlock *LoopBB : LoopBlockSet) { 1077 BlockChain &Chain = *BlockToChain[LoopBB]; 1078 if (!UpdatedPreds.insert(&Chain).second) 1079 continue; 1080 1081 assert(Chain.LoopPredecessors == 0); 1082 for (MachineBasicBlock *ChainBB : Chain) { 1083 assert(BlockToChain[ChainBB] == &Chain); 1084 for (MachineBasicBlock *Pred : ChainBB->predecessors()) { 1085 if (BlockToChain[Pred] == &Chain || !LoopBlockSet.count(Pred)) 1086 continue; 1087 ++Chain.LoopPredecessors; 1088 } 1089 } 1090 1091 if (Chain.LoopPredecessors == 0) 1092 BlockWorkList.push_back(*Chain.begin()); 1093 } 1094 1095 buildChain(LoopTop, LoopChain, BlockWorkList, &LoopBlockSet); 1096 1097 if (RotateLoopWithProfile) 1098 rotateLoopWithProfile(LoopChain, L, LoopBlockSet); 1099 else 1100 rotateLoop(LoopChain, ExitingBB, LoopBlockSet); 1101 1102 DEBUG({ 1103 // Crash at the end so we get all of the debugging output first. 1104 bool BadLoop = false; 1105 if (LoopChain.LoopPredecessors) { 1106 BadLoop = true; 1107 dbgs() << "Loop chain contains a block without its preds placed!\n" 1108 << " Loop header: " << getBlockName(*L.block_begin()) << "\n" 1109 << " Chain header: " << getBlockName(*LoopChain.begin()) << "\n"; 1110 } 1111 for (MachineBasicBlock *ChainBB : LoopChain) { 1112 dbgs() << " ... " << getBlockName(ChainBB) << "\n"; 1113 if (!LoopBlockSet.erase(ChainBB)) { 1114 // We don't mark the loop as bad here because there are real situations 1115 // where this can occur. For example, with an unanalyzable fallthrough 1116 // from a loop block to a non-loop block or vice versa. 1117 dbgs() << "Loop chain contains a block not contained by the loop!\n" 1118 << " Loop header: " << getBlockName(*L.block_begin()) << "\n" 1119 << " Chain header: " << getBlockName(*LoopChain.begin()) << "\n" 1120 << " Bad block: " << getBlockName(ChainBB) << "\n"; 1121 } 1122 } 1123 1124 if (!LoopBlockSet.empty()) { 1125 BadLoop = true; 1126 for (MachineBasicBlock *LoopBB : LoopBlockSet) 1127 dbgs() << "Loop contains blocks never placed into a chain!\n" 1128 << " Loop header: " << getBlockName(*L.block_begin()) << "\n" 1129 << " Chain header: " << getBlockName(*LoopChain.begin()) << "\n" 1130 << " Bad block: " << getBlockName(LoopBB) << "\n"; 1131 } 1132 assert(!BadLoop && "Detected problems with the placement of this loop."); 1133 }); 1134 } 1135 1136 void MachineBlockPlacement::buildCFGChains(MachineFunction &F) { 1137 // Ensure that every BB in the function has an associated chain to simplify 1138 // the assumptions of the remaining algorithm. 1139 SmallVector<MachineOperand, 4> Cond; // For AnalyzeBranch. 1140 for (MachineFunction::iterator FI = F.begin(), FE = F.end(); FI != FE; ++FI) { 1141 MachineBasicBlock *BB = &*FI; 1142 BlockChain *Chain = 1143 new (ChainAllocator.Allocate()) BlockChain(BlockToChain, BB); 1144 // Also, merge any blocks which we cannot reason about and must preserve 1145 // the exact fallthrough behavior for. 1146 for (;;) { 1147 Cond.clear(); 1148 MachineBasicBlock *TBB = nullptr, *FBB = nullptr; // For AnalyzeBranch. 1149 if (!TII->AnalyzeBranch(*BB, TBB, FBB, Cond) || !FI->canFallThrough()) 1150 break; 1151 1152 MachineFunction::iterator NextFI = std::next(FI); 1153 MachineBasicBlock *NextBB = &*NextFI; 1154 // Ensure that the layout successor is a viable block, as we know that 1155 // fallthrough is a possibility. 1156 assert(NextFI != FE && "Can't fallthrough past the last block."); 1157 DEBUG(dbgs() << "Pre-merging due to unanalyzable fallthrough: " 1158 << getBlockName(BB) << " -> " << getBlockName(NextBB) 1159 << "\n"); 1160 Chain->merge(NextBB, nullptr); 1161 FI = NextFI; 1162 BB = NextBB; 1163 } 1164 } 1165 1166 if (OutlineOptionalBranches) { 1167 // Find the nearest common dominator of all of F's terminators. 1168 MachineBasicBlock *Terminator = nullptr; 1169 for (MachineBasicBlock &MBB : F) { 1170 if (MBB.succ_size() == 0) { 1171 if (Terminator == nullptr) 1172 Terminator = &MBB; 1173 else 1174 Terminator = MDT->findNearestCommonDominator(Terminator, &MBB); 1175 } 1176 } 1177 1178 // MBBs dominating this common dominator are unavoidable. 1179 UnavoidableBlocks.clear(); 1180 for (MachineBasicBlock &MBB : F) { 1181 if (MDT->dominates(&MBB, Terminator)) { 1182 UnavoidableBlocks.insert(&MBB); 1183 } 1184 } 1185 } 1186 1187 // Build any loop-based chains. 1188 for (MachineLoop *L : *MLI) 1189 buildLoopChains(F, *L); 1190 1191 SmallVector<MachineBasicBlock *, 16> BlockWorkList; 1192 1193 SmallPtrSet<BlockChain *, 4> UpdatedPreds; 1194 for (MachineBasicBlock &MBB : F) { 1195 BlockChain &Chain = *BlockToChain[&MBB]; 1196 if (!UpdatedPreds.insert(&Chain).second) 1197 continue; 1198 1199 assert(Chain.LoopPredecessors == 0); 1200 for (MachineBasicBlock *ChainBB : Chain) { 1201 assert(BlockToChain[ChainBB] == &Chain); 1202 for (MachineBasicBlock *Pred : ChainBB->predecessors()) { 1203 if (BlockToChain[Pred] == &Chain) 1204 continue; 1205 ++Chain.LoopPredecessors; 1206 } 1207 } 1208 1209 if (Chain.LoopPredecessors == 0) 1210 BlockWorkList.push_back(*Chain.begin()); 1211 } 1212 1213 BlockChain &FunctionChain = *BlockToChain[&F.front()]; 1214 buildChain(&F.front(), FunctionChain, BlockWorkList); 1215 1216 #ifndef NDEBUG 1217 typedef SmallPtrSet<MachineBasicBlock *, 16> FunctionBlockSetType; 1218 #endif 1219 DEBUG({ 1220 // Crash at the end so we get all of the debugging output first. 1221 bool BadFunc = false; 1222 FunctionBlockSetType FunctionBlockSet; 1223 for (MachineBasicBlock &MBB : F) 1224 FunctionBlockSet.insert(&MBB); 1225 1226 for (MachineBasicBlock *ChainBB : FunctionChain) 1227 if (!FunctionBlockSet.erase(ChainBB)) { 1228 BadFunc = true; 1229 dbgs() << "Function chain contains a block not in the function!\n" 1230 << " Bad block: " << getBlockName(ChainBB) << "\n"; 1231 } 1232 1233 if (!FunctionBlockSet.empty()) { 1234 BadFunc = true; 1235 for (MachineBasicBlock *RemainingBB : FunctionBlockSet) 1236 dbgs() << "Function contains blocks never placed into a chain!\n" 1237 << " Bad block: " << getBlockName(RemainingBB) << "\n"; 1238 } 1239 assert(!BadFunc && "Detected problems with the block placement."); 1240 }); 1241 1242 // Splice the blocks into place. 1243 MachineFunction::iterator InsertPos = F.begin(); 1244 for (MachineBasicBlock *ChainBB : FunctionChain) { 1245 DEBUG(dbgs() << (ChainBB == *FunctionChain.begin() ? "Placing chain " 1246 : " ... ") 1247 << getBlockName(ChainBB) << "\n"); 1248 if (InsertPos != MachineFunction::iterator(ChainBB)) 1249 F.splice(InsertPos, ChainBB); 1250 else 1251 ++InsertPos; 1252 1253 // Update the terminator of the previous block. 1254 if (ChainBB == *FunctionChain.begin()) 1255 continue; 1256 MachineBasicBlock *PrevBB = &*std::prev(MachineFunction::iterator(ChainBB)); 1257 1258 // FIXME: It would be awesome of updateTerminator would just return rather 1259 // than assert when the branch cannot be analyzed in order to remove this 1260 // boiler plate. 1261 Cond.clear(); 1262 MachineBasicBlock *TBB = nullptr, *FBB = nullptr; // For AnalyzeBranch. 1263 if (!TII->AnalyzeBranch(*PrevBB, TBB, FBB, Cond)) { 1264 // The "PrevBB" is not yet updated to reflect current code layout, so, 1265 // o. it may fall-through to a block without explict "goto" instruction 1266 // before layout, and no longer fall-through it after layout; or 1267 // o. just opposite. 1268 // 1269 // AnalyzeBranch() may return erroneous value for FBB when these two 1270 // situations take place. For the first scenario FBB is mistakenly set 1271 // NULL; for the 2nd scenario, the FBB, which is expected to be NULL, 1272 // is mistakenly pointing to "*BI". 1273 // 1274 bool needUpdateBr = true; 1275 if (!Cond.empty() && (!FBB || FBB == ChainBB)) { 1276 PrevBB->updateTerminator(); 1277 needUpdateBr = false; 1278 Cond.clear(); 1279 TBB = FBB = nullptr; 1280 if (TII->AnalyzeBranch(*PrevBB, TBB, FBB, Cond)) { 1281 // FIXME: This should never take place. 1282 TBB = FBB = nullptr; 1283 } 1284 } 1285 1286 // If PrevBB has a two-way branch, try to re-order the branches 1287 // such that we branch to the successor with higher probability first. 1288 if (TBB && !Cond.empty() && FBB && 1289 MBPI->getEdgeProbability(PrevBB, FBB) > 1290 MBPI->getEdgeProbability(PrevBB, TBB) && 1291 !TII->ReverseBranchCondition(Cond)) { 1292 DEBUG(dbgs() << "Reverse order of the two branches: " 1293 << getBlockName(PrevBB) << "\n"); 1294 DEBUG(dbgs() << " Edge probability: " 1295 << MBPI->getEdgeProbability(PrevBB, FBB) << " vs " 1296 << MBPI->getEdgeProbability(PrevBB, TBB) << "\n"); 1297 DebugLoc dl; // FIXME: this is nowhere 1298 TII->RemoveBranch(*PrevBB); 1299 TII->InsertBranch(*PrevBB, FBB, TBB, Cond, dl); 1300 needUpdateBr = true; 1301 } 1302 if (needUpdateBr) 1303 PrevBB->updateTerminator(); 1304 } 1305 } 1306 1307 // Fixup the last block. 1308 Cond.clear(); 1309 MachineBasicBlock *TBB = nullptr, *FBB = nullptr; // For AnalyzeBranch. 1310 if (!TII->AnalyzeBranch(F.back(), TBB, FBB, Cond)) 1311 F.back().updateTerminator(); 1312 1313 // Walk through the backedges of the function now that we have fully laid out 1314 // the basic blocks and align the destination of each backedge. We don't rely 1315 // exclusively on the loop info here so that we can align backedges in 1316 // unnatural CFGs and backedges that were introduced purely because of the 1317 // loop rotations done during this layout pass. 1318 // FIXME: Use Function::optForSize(). 1319 if (F.getFunction()->hasFnAttribute(Attribute::OptimizeForSize)) 1320 return; 1321 if (FunctionChain.begin() == FunctionChain.end()) 1322 return; // Empty chain. 1323 1324 const BranchProbability ColdProb(1, 5); // 20% 1325 BlockFrequency EntryFreq = MBFI->getBlockFreq(&F.front()); 1326 BlockFrequency WeightedEntryFreq = EntryFreq * ColdProb; 1327 for (MachineBasicBlock *ChainBB : FunctionChain) { 1328 if (ChainBB == *FunctionChain.begin()) 1329 continue; 1330 1331 // Don't align non-looping basic blocks. These are unlikely to execute 1332 // enough times to matter in practice. Note that we'll still handle 1333 // unnatural CFGs inside of a natural outer loop (the common case) and 1334 // rotated loops. 1335 MachineLoop *L = MLI->getLoopFor(ChainBB); 1336 if (!L) 1337 continue; 1338 1339 unsigned Align = TLI->getPrefLoopAlignment(L); 1340 if (!Align) 1341 continue; // Don't care about loop alignment. 1342 1343 // If the block is cold relative to the function entry don't waste space 1344 // aligning it. 1345 BlockFrequency Freq = MBFI->getBlockFreq(ChainBB); 1346 if (Freq < WeightedEntryFreq) 1347 continue; 1348 1349 // If the block is cold relative to its loop header, don't align it 1350 // regardless of what edges into the block exist. 1351 MachineBasicBlock *LoopHeader = L->getHeader(); 1352 BlockFrequency LoopHeaderFreq = MBFI->getBlockFreq(LoopHeader); 1353 if (Freq < (LoopHeaderFreq * ColdProb)) 1354 continue; 1355 1356 // Check for the existence of a non-layout predecessor which would benefit 1357 // from aligning this block. 1358 MachineBasicBlock *LayoutPred = 1359 &*std::prev(MachineFunction::iterator(ChainBB)); 1360 1361 // Force alignment if all the predecessors are jumps. We already checked 1362 // that the block isn't cold above. 1363 if (!LayoutPred->isSuccessor(ChainBB)) { 1364 ChainBB->setAlignment(Align); 1365 continue; 1366 } 1367 1368 // Align this block if the layout predecessor's edge into this block is 1369 // cold relative to the block. When this is true, other predecessors make up 1370 // all of the hot entries into the block and thus alignment is likely to be 1371 // important. 1372 BranchProbability LayoutProb = 1373 MBPI->getEdgeProbability(LayoutPred, ChainBB); 1374 BlockFrequency LayoutEdgeFreq = MBFI->getBlockFreq(LayoutPred) * LayoutProb; 1375 if (LayoutEdgeFreq <= (Freq * ColdProb)) 1376 ChainBB->setAlignment(Align); 1377 } 1378 } 1379 1380 bool MachineBlockPlacement::runOnMachineFunction(MachineFunction &F) { 1381 // Check for single-block functions and skip them. 1382 if (std::next(F.begin()) == F.end()) 1383 return false; 1384 1385 if (skipOptnoneFunction(*F.getFunction())) 1386 return false; 1387 1388 MBPI = &getAnalysis<MachineBranchProbabilityInfo>(); 1389 MBFI = &getAnalysis<MachineBlockFrequencyInfo>(); 1390 MLI = &getAnalysis<MachineLoopInfo>(); 1391 TII = F.getSubtarget().getInstrInfo(); 1392 TLI = F.getSubtarget().getTargetLowering(); 1393 MDT = &getAnalysis<MachineDominatorTree>(); 1394 assert(BlockToChain.empty()); 1395 1396 buildCFGChains(F); 1397 1398 BlockToChain.clear(); 1399 ChainAllocator.DestroyAll(); 1400 1401 if (AlignAllBlock) 1402 // Align all of the blocks in the function to a specific alignment. 1403 for (MachineBasicBlock &MBB : F) 1404 MBB.setAlignment(AlignAllBlock); 1405 else if (AlignAllNonFallThruBlocks) { 1406 // Align all of the blocks that have no fall-through predecessors to a 1407 // specific alignment. 1408 for (auto MBI = std::next(F.begin()), MBE = F.end(); MBI != MBE; ++MBI) { 1409 auto LayoutPred = std::prev(MBI); 1410 if (!LayoutPred->isSuccessor(&*MBI)) 1411 MBI->setAlignment(AlignAllNonFallThruBlocks); 1412 } 1413 } 1414 1415 // We always return true as we have no way to track whether the final order 1416 // differs from the original order. 1417 return true; 1418 } 1419 1420 namespace { 1421 /// \brief A pass to compute block placement statistics. 1422 /// 1423 /// A separate pass to compute interesting statistics for evaluating block 1424 /// placement. This is separate from the actual placement pass so that they can 1425 /// be computed in the absence of any placement transformations or when using 1426 /// alternative placement strategies. 1427 class MachineBlockPlacementStats : public MachineFunctionPass { 1428 /// \brief A handle to the branch probability pass. 1429 const MachineBranchProbabilityInfo *MBPI; 1430 1431 /// \brief A handle to the function-wide block frequency pass. 1432 const MachineBlockFrequencyInfo *MBFI; 1433 1434 public: 1435 static char ID; // Pass identification, replacement for typeid 1436 MachineBlockPlacementStats() : MachineFunctionPass(ID) { 1437 initializeMachineBlockPlacementStatsPass(*PassRegistry::getPassRegistry()); 1438 } 1439 1440 bool runOnMachineFunction(MachineFunction &F) override; 1441 1442 void getAnalysisUsage(AnalysisUsage &AU) const override { 1443 AU.addRequired<MachineBranchProbabilityInfo>(); 1444 AU.addRequired<MachineBlockFrequencyInfo>(); 1445 AU.setPreservesAll(); 1446 MachineFunctionPass::getAnalysisUsage(AU); 1447 } 1448 }; 1449 } 1450 1451 char MachineBlockPlacementStats::ID = 0; 1452 char &llvm::MachineBlockPlacementStatsID = MachineBlockPlacementStats::ID; 1453 INITIALIZE_PASS_BEGIN(MachineBlockPlacementStats, "block-placement-stats", 1454 "Basic Block Placement Stats", false, false) 1455 INITIALIZE_PASS_DEPENDENCY(MachineBranchProbabilityInfo) 1456 INITIALIZE_PASS_DEPENDENCY(MachineBlockFrequencyInfo) 1457 INITIALIZE_PASS_END(MachineBlockPlacementStats, "block-placement-stats", 1458 "Basic Block Placement Stats", false, false) 1459 1460 bool MachineBlockPlacementStats::runOnMachineFunction(MachineFunction &F) { 1461 // Check for single-block functions and skip them. 1462 if (std::next(F.begin()) == F.end()) 1463 return false; 1464 1465 MBPI = &getAnalysis<MachineBranchProbabilityInfo>(); 1466 MBFI = &getAnalysis<MachineBlockFrequencyInfo>(); 1467 1468 for (MachineBasicBlock &MBB : F) { 1469 BlockFrequency BlockFreq = MBFI->getBlockFreq(&MBB); 1470 Statistic &NumBranches = 1471 (MBB.succ_size() > 1) ? NumCondBranches : NumUncondBranches; 1472 Statistic &BranchTakenFreq = 1473 (MBB.succ_size() > 1) ? CondBranchTakenFreq : UncondBranchTakenFreq; 1474 for (MachineBasicBlock *Succ : MBB.successors()) { 1475 // Skip if this successor is a fallthrough. 1476 if (MBB.isLayoutSuccessor(Succ)) 1477 continue; 1478 1479 BlockFrequency EdgeFreq = 1480 BlockFreq * MBPI->getEdgeProbability(&MBB, Succ); 1481 ++NumBranches; 1482 BranchTakenFreq += EdgeFreq.getFrequency(); 1483 } 1484 } 1485 1486 return false; 1487 } 1488