//===- BlockFrequencyInfoImpl.cpp - Block Frequency Info Implementation ---===//
2 //
3 //                     The LLVM Compiler Infrastructure
4 //
5 // This file is distributed under the University of Illinois Open Source
6 // License. See LICENSE.TXT for details.
7 //
8 //===----------------------------------------------------------------------===//
9 //
10 // Loops should be simplified before this analysis.
11 //
12 //===----------------------------------------------------------------------===//
13 
14 #include "llvm/Analysis/BlockFrequencyInfoImpl.h"
15 #include "llvm/ADT/SCCIterator.h"
16 #include "llvm/Support/raw_ostream.h"
17 #include <numeric>
18 
19 using namespace llvm;
20 using namespace llvm::bfi_detail;
21 
22 #define DEBUG_TYPE "block-freq"
23 
24 ScaledNumber<uint64_t> BlockMass::toScaled() const {
25   if (isFull())
26     return ScaledNumber<uint64_t>(1, 0);
27   return ScaledNumber<uint64_t>(getMass() + 1, -64);
28 }
29 
/// Print this mass to the debug stream.
LLVM_DUMP_METHOD void BlockMass::dump() const { print(dbgs()); }
31 
/// Convert a value in [0, 16) to its lowercase hexadecimal digit.
///
/// \param N the nibble value; must be in [0, 16).
/// \returns '0'-'9' for 0-9, 'a'-'f' for 10-15.
static char getHexDigit(int N) {
  // Check both bounds: the original only asserted the upper bound, so a
  // negative N would silently index out of range.
  assert(N >= 0 && "negative hex digit");
  assert(N < 16 && "hex digit out of range");
  return "0123456789abcdef"[N];
}
38 
39 raw_ostream &BlockMass::print(raw_ostream &OS) const {
40   for (int Digits = 0; Digits < 16; ++Digits)
41     OS << getHexDigit(Mass >> (60 - Digits * 4) & 0xf);
42   return OS;
43 }
44 
namespace {

// Local shorthand for the nested implementation types used throughout this
// file.
typedef BlockFrequencyInfoImplBase::BlockNode BlockNode;
typedef BlockFrequencyInfoImplBase::Distribution Distribution;
typedef BlockFrequencyInfoImplBase::Distribution::WeightList WeightList;
typedef BlockFrequencyInfoImplBase::Scaled64 Scaled64;
typedef BlockFrequencyInfoImplBase::LoopData LoopData;
typedef BlockFrequencyInfoImplBase::Weight Weight;
typedef BlockFrequencyInfoImplBase::FrequencyData FrequencyData;

/// \brief Dithering mass distributer.
///
/// This class splits up a single mass into portions by weight, dithering to
/// spread out error.  No mass is lost.  The dithering precision depends on the
/// precision of the product of \a BlockMass and \a BranchProbability.
///
/// The distribution algorithm follows.
///
///  1. Initialize by saving the sum of the weights in \a RemWeight and the
///     mass to distribute in \a RemMass.
///
///  2. For each portion:
///
///      1. Construct a branch probability, P, as the portion's weight divided
///         by the current value of \a RemWeight.
///      2. Calculate the portion's mass as \a RemMass times P.
///      3. Update \a RemWeight and \a RemMass at each portion by subtracting
///         the current portion's weight and mass.
struct DitheringDistributer {
  uint32_t RemWeight; ///< Weight not yet distributed.
  BlockMass RemMass;  ///< Mass not yet distributed.

  /// Normalizes \p Dist and seeds the remaining weight/mass.
  DitheringDistributer(Distribution &Dist, const BlockMass &Mass);

  /// Take the next portion's mass, proportional to \p Weight over the
  /// remaining weight.
  BlockMass takeMass(uint32_t Weight);
};

} // end anonymous namespace
83 
DitheringDistributer::DitheringDistributer(Distribution &Dist,
                                           const BlockMass &Mass) {
  // Normalize first: takeMass() builds a BranchProbability from RemWeight, so
  // the total must already fit in 32 bits before it is cached here.
  Dist.normalize();
  RemWeight = Dist.Total;
  RemMass = Mass;
}
90 
91 BlockMass DitheringDistributer::takeMass(uint32_t Weight) {
92   assert(Weight && "invalid weight");
93   assert(Weight <= RemWeight);
94   BlockMass Mass = RemMass * BranchProbability(Weight, RemWeight);
95 
96   // Decrement totals (dither).
97   RemWeight -= Weight;
98   RemMass -= Mass;
99   return Mass;
100 }
101 
102 void Distribution::add(const BlockNode &Node, uint64_t Amount,
103                        Weight::DistType Type) {
104   assert(Amount && "invalid weight of 0");
105   uint64_t NewTotal = Total + Amount;
106 
107   // Check for overflow.  It should be impossible to overflow twice.
108   bool IsOverflow = NewTotal < Total;
109   assert(!(DidOverflow && IsOverflow) && "unexpected repeated overflow");
110   DidOverflow |= IsOverflow;
111 
112   // Update the total.
113   Total = NewTotal;
114 
115   // Save the weight.
116   Weights.push_back(Weight(Type, Node, Amount));
117 }
118 
119 static void combineWeight(Weight &W, const Weight &OtherW) {
120   assert(OtherW.TargetNode.isValid());
121   if (!W.Amount) {
122     W = OtherW;
123     return;
124   }
125   assert(W.Type == OtherW.Type);
126   assert(W.TargetNode == OtherW.TargetNode);
127   assert(OtherW.Amount && "Expected non-zero weight");
128   if (W.Amount > W.Amount + OtherW.Amount)
129     // Saturate on overflow.
130     W.Amount = UINT64_MAX;
131   else
132     W.Amount += OtherW.Amount;
133 }
134 
/// Merge duplicate-target weights by sorting them adjacent and combining
/// each run in place.
static void combineWeightsBySorting(WeightList &Weights) {
  // Sort so edges to the same node are adjacent.
  std::sort(Weights.begin(), Weights.end(),
            [](const Weight &L,
               const Weight &R) { return L.TargetNode < R.TargetNode; });

  // Combine adjacent edges.  O tracks the output (write) position; I marks
  // the first weight of each run of equal targets; L walks the rest of the
  // run being merged into *O.
  WeightList::iterator O = Weights.begin();
  for (WeightList::const_iterator I = O, L = O, E = Weights.end(); I != E;
       ++O, (I = L)) {
    *O = *I;

    // Find the adjacent weights to the same node.
    for (++L; L != E && I->TargetNode == L->TargetNode; ++L)
      combineWeight(*O, *L);
  }

  // Erase extra entries.
  Weights.erase(O, Weights.end());
}
155 
156 static void combineWeightsByHashing(WeightList &Weights) {
157   // Collect weights into a DenseMap.
158   typedef DenseMap<BlockNode::IndexType, Weight> HashTable;
159   HashTable Combined(NextPowerOf2(2 * Weights.size()));
160   for (const Weight &W : Weights)
161     combineWeight(Combined[W.TargetNode.Index], W);
162 
163   // Check whether anything changed.
164   if (Weights.size() == Combined.size())
165     return;
166 
167   // Fill in the new weights.
168   Weights.clear();
169   Weights.reserve(Combined.size());
170   for (const auto &I : Combined)
171     Weights.push_back(I.second);
172 }
173 
174 static void combineWeights(WeightList &Weights) {
175   // Use a hash table for many successors to keep this linear.
176   if (Weights.size() > 128) {
177     combineWeightsByHashing(Weights);
178     return;
179   }
180 
181   combineWeightsBySorting(Weights);
182 }
183 
/// Shift \p N right by \p Shift bits, rounding to nearest (ties round up).
static uint64_t shiftRightAndRound(uint64_t N, int Shift) {
  assert(Shift >= 0);
  assert(Shift < 64);
  if (Shift == 0)
    return N;

  // Round using the highest bit that gets shifted out.
  uint64_t Truncated = N >> Shift;
  uint64_t RoundBit = (N >> (Shift - 1)) & UINT64_C(1);
  return Truncated + RoundBit;
}
191 
/// Normalize the distribution: merge duplicate edges and scale the weights
/// so that the total fits in 32 bits.
void Distribution::normalize() {
  // Early exit for termination nodes.
  if (Weights.empty())
    return;

  // Only bother if there are multiple successors.
  if (Weights.size() > 1)
    combineWeights(Weights);

  // Early exit when combined into a single successor.
  if (Weights.size() == 1) {
    Total = 1;
    Weights.front().Amount = 1;
    return;
  }

  // Determine how much to shift right so that the total fits into 32-bits.
  //
  // If we shift at all, shift by 1 extra.  Otherwise, the lower limit of 1
  // for each weight can cause a 32-bit overflow.
  int Shift = 0;
  if (DidOverflow)
    Shift = 33;
  else if (Total > UINT32_MAX)
    // 64 - countLeadingZeros(Total) is Total's bit width; shifting by that
    // minus 31 leaves at most 31 significant bits (the extra bit above).
    Shift = 33 - countLeadingZeros(Total);

  // Early exit if nothing needs to be scaled.
  if (!Shift) {
    // If we didn't overflow then combineWeights() shouldn't have changed the
    // sum of the weights, but let's double-check.
    assert(Total == std::accumulate(Weights.begin(), Weights.end(), UINT64_C(0),
                                    [](uint64_t Sum, const Weight &W) {
                      return Sum + W.Amount;
                    }) &&
           "Expected total to be correct");
    return;
  }

  // Recompute the total through accumulation (rather than shifting it) so that
  // it's accurate after shifting and any changes combineWeights() made above.
  Total = 0;

  // Sum the weights to each node and shift right if necessary.
  for (Weight &W : Weights) {
    // Scale down below UINT32_MAX.  Since Shift is larger than necessary, we
    // can round here without concern about overflow.
    assert(W.TargetNode.isValid());
    W.Amount = std::max(UINT64_C(1), shiftRightAndRound(W.Amount, Shift));
    assert(W.Amount <= UINT32_MAX);

    // Update the total.
    Total += W.Amount;
  }
  assert(Total <= UINT32_MAX);
}
247 
/// Reset all working state, releasing heap storage.
void BlockFrequencyInfoImplBase::clear() {
  // Swap with a default-constructed std::vector, since std::vector<>::clear()
  // does not actually clear heap storage.
  std::vector<FrequencyData>().swap(Freqs);
  std::vector<WorkingData>().swap(Working);
  Loops.clear();
}
255 
256 /// \brief Clear all memory not needed downstream.
257 ///
258 /// Releases all memory not used downstream.  In particular, saves Freqs.
259 static void cleanup(BlockFrequencyInfoImplBase &BFI) {
260   std::vector<FrequencyData> SavedFreqs(std::move(BFI.Freqs));
261   BFI.clear();
262   BFI.Freqs = std::move(SavedFreqs);
263 }
264 
/// \brief Add an edge from \p Pred to \p Succ to \p Dist.
///
/// Classifies the edge relative to \p OuterLoop as a backedge (target is the
/// loop header), an exit (target is outside the loop), or a local edge, and
/// records it in \p Dist accordingly.  A weight of zero is bumped to one so
/// that no edge is dropped.
///
/// \returns false on unhandleable irreducible control flow (a backwards edge
/// whose target is not a loop header); true otherwise.
bool BlockFrequencyInfoImplBase::addToDist(Distribution &Dist,
                                           const LoopData *OuterLoop,
                                           const BlockNode &Pred,
                                           const BlockNode &Succ,
                                           uint64_t Weight) {
  // Treat a zero weight as one so the edge still receives mass.
  if (!Weight)
    Weight = 1;

  auto isLoopHeader = [&OuterLoop](const BlockNode &Node) {
    return OuterLoop && OuterLoop->isHeader(Node);
  };

  // NOTE(review): getResolvedNode() appears to map Succ through packaged
  // loops to its representative at this packaging level -- confirm against
  // WorkingData in the header.
  BlockNode Resolved = Working[Succ.Index].getResolvedNode();

#ifndef NDEBUG
  // Debug-only helper for tracing how each successor edge was classified.
  auto debugSuccessor = [&](const char *Type) {
    dbgs() << "  =>"
           << " [" << Type << "] weight = " << Weight;
    if (!isLoopHeader(Resolved))
      dbgs() << ", succ = " << getBlockName(Succ);
    if (Resolved != Succ)
      dbgs() << ", resolved = " << getBlockName(Resolved);
    dbgs() << "\n";
  };
  (void)debugSuccessor;
#endif

  // An edge to OuterLoop's header is a backedge.
  if (isLoopHeader(Resolved)) {
    DEBUG(debugSuccessor("backedge"));
    Dist.addBackedge(Resolved, Weight);
    return true;
  }

  // A target in a different loop means this edge exits OuterLoop.
  if (Working[Resolved.Index].getContainingLoop() != OuterLoop) {
    DEBUG(debugSuccessor("  exit  "));
    Dist.addExit(Resolved, Weight);
    return true;
  }

  // Resolved precedes Pred in the block ordering: a backwards edge that does
  // not target OuterLoop's primary header.
  if (Resolved < Pred) {
    if (!isLoopHeader(Pred)) {
      // If OuterLoop is an irreducible loop, we can't actually handle this.
      assert((!OuterLoop || !OuterLoop->isIrreducible()) &&
             "unhandled irreducible control flow");

      // Irreducible backedge.  Abort.
      DEBUG(debugSuccessor("abort!!!"));
      return false;
    }

    // If "Pred" is a loop header, then this isn't really a backedge; rather,
    // OuterLoop must be irreducible.  These false backedges can come only from
    // secondary loop headers.
    assert(OuterLoop && OuterLoop->isIrreducible() && !isLoopHeader(Resolved) &&
           "unhandled irreducible control flow");
  }

  DEBUG(debugSuccessor(" local  "));
  Dist.addLocal(Resolved, Weight);
  return true;
}
326 
327 bool BlockFrequencyInfoImplBase::addLoopSuccessorsToDist(
328     const LoopData *OuterLoop, LoopData &Loop, Distribution &Dist) {
329   // Copy the exit map into Dist.
330   for (const auto &I : Loop.Exits)
331     if (!addToDist(Dist, OuterLoop, Loop.getHeader(), I.first,
332                    I.second.getMass()))
333       // Irreducible backedge.
334       return false;
335 
336   return true;
337 }
338 
339 /// \brief Compute the loop scale for a loop.
340 void BlockFrequencyInfoImplBase::computeLoopScale(LoopData &Loop) {
341   // Compute loop scale.
342   DEBUG(dbgs() << "compute-loop-scale: " << getLoopName(Loop) << "\n");
343 
344   // Infinite loops need special handling. If we give the back edge an infinite
345   // mass, they may saturate all the other scales in the function down to 1,
346   // making all the other region temperatures look exactly the same. Choose an
347   // arbitrary scale to avoid these issues.
348   //
349   // FIXME: An alternate way would be to select a symbolic scale which is later
350   // replaced to be the maximum of all computed scales plus 1. This would
351   // appropriately describe the loop as having a large scale, without skewing
352   // the final frequency computation.
353   const Scaled64 InifiniteLoopScale(1, 12);
354 
355   // LoopScale == 1 / ExitMass
356   // ExitMass == HeadMass - BackedgeMass
357   BlockMass TotalBackedgeMass;
358   for (auto &Mass : Loop.BackedgeMass)
359     TotalBackedgeMass += Mass;
360   BlockMass ExitMass = BlockMass::getFull() - TotalBackedgeMass;
361 
362   // Block scale stores the inverse of the scale. If this is an infinite loop,
363   // its exit mass will be zero. In this case, use an arbitrary scale for the
364   // loop scale.
365   Loop.Scale =
366       ExitMass.isEmpty() ? InifiniteLoopScale : ExitMass.toScaled().inverse();
367 
368   DEBUG(dbgs() << " - exit-mass = " << ExitMass << " (" << BlockMass::getFull()
369                << " - " << TotalBackedgeMass << ")\n"
370                << " - scale = " << Loop.Scale << "\n");
371 }
372 
373 /// \brief Package up a loop.
374 void BlockFrequencyInfoImplBase::packageLoop(LoopData &Loop) {
375   DEBUG(dbgs() << "packaging-loop: " << getLoopName(Loop) << "\n");
376 
377   // Clear the subloop exits to prevent quadratic memory usage.
378   for (const BlockNode &M : Loop.Nodes) {
379     if (auto *Loop = Working[M.Index].getPackagedLoop())
380       Loop->Exits.clear();
381     DEBUG(dbgs() << " - node: " << getBlockName(M.Index) << "\n");
382   }
383   Loop.IsPackaged = true;
384 }
385 
#ifndef NDEBUG
/// Debug-print a single mass assignment: the mass \p M given to target \p T,
/// the remaining mass in the distributer \p D, and an optional tag \p Desc.
static void debugAssign(const BlockFrequencyInfoImplBase &BFI,
                        const DitheringDistributer &D, const BlockNode &T,
                        const BlockMass &M, const char *Desc) {
  dbgs() << "  => assign " << M << " (" << D.RemMass << ")";
  if (Desc)
    dbgs() << " [" << Desc << "]";
  if (T.isValid())
    dbgs() << " to " << BFI.getBlockName(T);
  dbgs() << "\n";
}
#endif
398 
/// Distribute the working mass of \p Source to its successors as described by
/// \p Dist: local edges add to the target's working mass, backedges add to
/// \p OuterLoop's backedge mass, and exits are appended to \p OuterLoop's
/// exit list.
void BlockFrequencyInfoImplBase::distributeMass(const BlockNode &Source,
                                                LoopData *OuterLoop,
                                                Distribution &Dist) {
  BlockMass Mass = Working[Source.Index].getMass();
  DEBUG(dbgs() << "  => mass:  " << Mass << "\n");

  // Distribute mass to successors as laid out in Dist.
  DitheringDistributer D(Dist, Mass);

  for (const Weight &W : Dist.Weights) {
    // Check for a local edge (non-backedge and non-exit).
    BlockMass Taken = D.takeMass(W.Amount);
    if (W.Type == Weight::Local) {
      Working[W.TargetNode.Index].getMass() += Taken;
      DEBUG(debugAssign(*this, D, W.TargetNode, Taken, nullptr));
      continue;
    }

    // Backedges and exits only make sense if we're processing a loop.
    assert(OuterLoop && "backedge or exit outside of loop");

    // Check for a backedge.
    if (W.Type == Weight::Backedge) {
      OuterLoop->BackedgeMass[OuterLoop->getHeaderIndex(W.TargetNode)] += Taken;
      DEBUG(debugAssign(*this, D, W.TargetNode, Taken, "back"));
      continue;
    }

    // This must be an exit.
    assert(W.Type == Weight::Exit);
    OuterLoop->Exits.push_back(std::make_pair(W.TargetNode, Taken));
    DEBUG(debugAssign(*this, D, W.TargetNode, Taken, "exit"));
  }
}
433 
/// Convert the scaled (floating-point) frequencies in \p BFI to 64-bit
/// integers, choosing a scaling factor from the observed \p Min and \p Max.
static void convertFloatingToInteger(BlockFrequencyInfoImplBase &BFI,
                                     const Scaled64 &Min, const Scaled64 &Max) {
  // Scale the Factor to a size that creates integers.  Ideally, integers would
  // be scaled so that Max == UINT64_MAX so that they can be best
  // differentiated.  However, in the presence of large frequency values, small
  // frequencies are scaled down to 1, making it impossible to differentiate
  // small, unequal numbers. When the spread between Min and Max frequencies
  // fits well within MaxBits, we make the scale be at least 8.
  const unsigned MaxBits = 64;
  const unsigned SpreadBits = (Max / Min).lg();
  Scaled64 ScalingFactor;
  if (SpreadBits <= MaxBits - 3) {
    // If the values are small enough, make the scaling factor at least 8 to
    // allow distinguishing small values.
    ScalingFactor = Min.inverse();
    ScalingFactor <<= 3;
  } else {
    // If the values need more than MaxBits to be represented, saturate small
    // frequency values down to 1 by using a scaling factor that benefits large
    // frequency values.
    ScalingFactor = Scaled64(1, MaxBits) / Max;
  }

  // Translate the floats to integers.  Every integer is clamped to at least 1
  // so that no block ends up with a zero frequency.
  DEBUG(dbgs() << "float-to-int: min = " << Min << ", max = " << Max
               << ", factor = " << ScalingFactor << "\n");
  for (size_t Index = 0; Index < BFI.Freqs.size(); ++Index) {
    Scaled64 Scaled = BFI.Freqs[Index].Scaled * ScalingFactor;
    BFI.Freqs[Index].Integer = std::max(UINT64_C(1), Scaled.toInt<uint64_t>());
    DEBUG(dbgs() << " - " << BFI.getBlockName(Index) << ": float = "
                 << BFI.Freqs[Index].Scaled << ", scaled = " << Scaled
                 << ", int = " << BFI.Freqs[Index].Integer << "\n");
  }
}
468 
/// \brief Unwrap a loop package.
///
/// Visits all the members of a loop, adjusting their BlockData according to
/// the loop's pseudo-node.
static void unwrapLoop(BlockFrequencyInfoImplBase &BFI, LoopData &Loop) {
  DEBUG(dbgs() << "unwrap-loop-package: " << BFI.getLoopName(Loop)
               << ": mass = " << Loop.Mass << ", scale = " << Loop.Scale
               << "\n");
  // Fold the loop's own mass into its scale to form the combined scale.
  Loop.Scale *= Loop.Mass.toScaled();
  Loop.IsPackaged = false;
  DEBUG(dbgs() << "  => combined-scale = " << Loop.Scale << "\n");

  // Propagate the head scale through the loop.  Since members are visited in
  // RPO, the head scale will be updated by the loop scale first, and then the
  // final head scale will be used for updated the rest of the members.
  for (const BlockNode &N : Loop.Nodes) {
    // A member that is itself a packaged loop carries its frequency in the
    // sub-loop's Scale; a plain block carries it in Freqs.
    const auto &Working = BFI.Working[N.Index];
    Scaled64 &F = Working.isAPackage() ? Working.getPackagedLoop()->Scale
                                       : BFI.Freqs[N.Index].Scaled;
    Scaled64 New = Loop.Scale * F;
    DEBUG(dbgs() << " - " << BFI.getBlockName(N) << ": " << F << " => " << New
                 << "\n");
    F = New;
  }
}
494 
495 void BlockFrequencyInfoImplBase::unwrapLoops() {
496   // Set initial frequencies from loop-local masses.
497   for (size_t Index = 0; Index < Working.size(); ++Index)
498     Freqs[Index].Scaled = Working[Index].Mass.toScaled();
499 
500   for (LoopData &Loop : Loops)
501     unwrapLoop(*this, Loop);
502 }
503 
504 void BlockFrequencyInfoImplBase::finalizeMetrics() {
505   // Unwrap loop packages in reverse post-order, tracking min and max
506   // frequencies.
507   auto Min = Scaled64::getLargest();
508   auto Max = Scaled64::getZero();
509   for (size_t Index = 0; Index < Working.size(); ++Index) {
510     // Update min/max scale.
511     Min = std::min(Min, Freqs[Index].Scaled);
512     Max = std::max(Max, Freqs[Index].Scaled);
513   }
514 
515   // Convert to integers.
516   convertFloatingToInteger(*this, Min, Max);
517 
518   // Clean up data structures.
519   cleanup(*this);
520 
521   // Print out the final stats.
522   DEBUG(dump());
523 }
524 
525 BlockFrequency
526 BlockFrequencyInfoImplBase::getBlockFreq(const BlockNode &Node) const {
527   if (!Node.isValid())
528     return 0;
529   return Freqs[Node.Index].Integer;
530 }
531 
532 Scaled64
533 BlockFrequencyInfoImplBase::getFloatingBlockFreq(const BlockNode &Node) const {
534   if (!Node.isValid())
535     return Scaled64::getZero();
536   return Freqs[Node.Index].Scaled;
537 }
538 
/// Overwrite the integer frequency of \p Node with \p Freq.
void BlockFrequencyInfoImplBase::setBlockFreq(const BlockNode &Node,
                                              uint64_t Freq) {
  assert(Node.isValid() && "Expected valid node");
  assert(Node.Index < Freqs.size() && "Expected legal index");
  Freqs[Node.Index].Integer = Freq;
}
545 
546 std::string
547 BlockFrequencyInfoImplBase::getBlockName(const BlockNode &Node) const {
548   return std::string();
549 }
550 
551 std::string
552 BlockFrequencyInfoImplBase::getLoopName(const LoopData &Loop) const {
553   return getBlockName(Loop.getHeader()) + (Loop.isIrreducible() ? "**" : "*");
554 }
555 
/// Print the scaled (floating-point) frequency of \p Node to \p OS.
raw_ostream &
BlockFrequencyInfoImplBase::printBlockFreq(raw_ostream &OS,
                                           const BlockNode &Node) const {
  return OS << getFloatingBlockFreq(Node);
}
561 
562 raw_ostream &
563 BlockFrequencyInfoImplBase::printBlockFreq(raw_ostream &OS,
564                                            const BlockFrequency &Freq) const {
565   Scaled64 Block(Freq.getFrequency(), 0);
566   Scaled64 Entry(getEntryFreq(), 0);
567 
568   return OS << Block / Entry;
569 }
570 
571 void IrreducibleGraph::addNodesInLoop(const BFIBase::LoopData &OuterLoop) {
572   Start = OuterLoop.getHeader();
573   Nodes.reserve(OuterLoop.Nodes.size());
574   for (auto N : OuterLoop.Nodes)
575     addNode(N);
576   indexNodes();
577 }
578 
579 void IrreducibleGraph::addNodesInFunction() {
580   Start = 0;
581   for (uint32_t Index = 0; Index < BFI.Working.size(); ++Index)
582     if (!BFI.Working[Index].isPackaged())
583       addNode(Index);
584   indexNodes();
585 }
586 
587 void IrreducibleGraph::indexNodes() {
588   for (auto &I : Nodes)
589     Lookup[I.Node.Index] = &I;
590 }
591 
592 void IrreducibleGraph::addEdge(IrrNode &Irr, const BlockNode &Succ,
593                                const BFIBase::LoopData *OuterLoop) {
594   if (OuterLoop && OuterLoop->isHeader(Succ))
595     return;
596   auto L = Lookup.find(Succ.Index);
597   if (L == Lookup.end())
598     return;
599   IrrNode &SuccIrr = *L->second;
600   Irr.Edges.push_back(&SuccIrr);
601   SuccIrr.Edges.push_front(&Irr);
602   ++SuccIrr.NumIn;
603 }
604 
namespace llvm {
// Specialize GraphTraits so generic graph algorithms (in particular the
// scc_iterator used by analyzeIrreducible) can traverse an IrreducibleGraph.
template <> struct GraphTraits<IrreducibleGraph> {
  typedef bfi_detail::IrreducibleGraph GraphT;

  typedef const GraphT::IrrNode NodeType;
  typedef GraphT::IrrNode::iterator ChildIteratorType;

  static const NodeType *getEntryNode(const GraphT &G) {
    return G.StartIrr;
  }
  static ChildIteratorType child_begin(NodeType *N) { return N->succ_begin(); }
  static ChildIteratorType child_end(NodeType *N) { return N->succ_end(); }
};
} // end namespace llvm
619 
/// \brief Find extra irreducible headers.
///
/// Find entry blocks and other blocks with backedges, which exist when \c G
/// contains irreducible sub-SCCs.
static void findIrreducibleHeaders(
    const BlockFrequencyInfoImplBase &BFI,
    const IrreducibleGraph &G,
    const std::vector<const IrreducibleGraph::IrrNode *> &SCC,
    LoopData::NodeList &Headers, LoopData::NodeList &Others) {
  // Map from nodes in the SCC to whether it's an entry block.
  SmallDenseMap<const IrreducibleGraph::IrrNode *, bool, 8> InSCC;

  // InSCC also acts the set of nodes in the graph.  Seed it.
  for (const auto *I : SCC)
    InSCC[I] = false;

  // Pass 1: any node with a predecessor outside the SCC is an entry block,
  // and therefore a header.
  for (auto I = InSCC.begin(), E = InSCC.end(); I != E; ++I) {
    auto &Irr = *I->first;
    for (const auto *P : make_range(Irr.pred_begin(), Irr.pred_end())) {
      if (InSCC.count(P))
        continue;

      // This is an entry block.
      I->second = true;
      Headers.push_back(Irr.Node);
      DEBUG(dbgs() << "  => entry = " << BFI.getBlockName(Irr.Node) << "\n");
      break;
    }
  }
  assert(Headers.size() >= 2 &&
         "Expected irreducible CFG; -loop-info is likely invalid");
  if (Headers.size() == InSCC.size()) {
    // Every block is a header.
    std::sort(Headers.begin(), Headers.end());
    return;
  }

  // Pass 2: look for extra headers from irreducible sub-SCCs -- non-entry
  // nodes that still receive a backedge from inside the SCC.
  for (const auto &I : InSCC) {
    // Entry blocks are already headers.
    if (I.second)
      continue;

    auto &Irr = *I.first;
    for (const auto *P : make_range(Irr.pred_begin(), Irr.pred_end())) {
      // Skip forward edges.
      if (P->Node < Irr.Node)
        continue;

      // Skip predecessors from entry blocks.  These can have inverted
      // ordering.
      if (InSCC.lookup(P))
        continue;

      // Store the extra header.
      Headers.push_back(Irr.Node);
      DEBUG(dbgs() << "  => extra = " << BFI.getBlockName(Irr.Node) << "\n");
      break;
    }
    if (Headers.back() == Irr.Node)
      // Added this as a header.
      continue;

    // This is not a header.
    Others.push_back(Irr.Node);
    DEBUG(dbgs() << "  => other = " << BFI.getBlockName(Irr.Node) << "\n");
  }
  // LoopData expects its node lists sorted.
  std::sort(Headers.begin(), Headers.end());
  std::sort(Others.begin(), Others.end());
}
690 
691 static void createIrreducibleLoop(
692     BlockFrequencyInfoImplBase &BFI, const IrreducibleGraph &G,
693     LoopData *OuterLoop, std::list<LoopData>::iterator Insert,
694     const std::vector<const IrreducibleGraph::IrrNode *> &SCC) {
695   // Translate the SCC into RPO.
696   DEBUG(dbgs() << " - found-scc\n");
697 
698   LoopData::NodeList Headers;
699   LoopData::NodeList Others;
700   findIrreducibleHeaders(BFI, G, SCC, Headers, Others);
701 
702   auto Loop = BFI.Loops.emplace(Insert, OuterLoop, Headers.begin(),
703                                 Headers.end(), Others.begin(), Others.end());
704 
705   // Update loop hierarchy.
706   for (const auto &N : Loop->Nodes)
707     if (BFI.Working[N.Index].isLoopHeader())
708       BFI.Working[N.Index].Loop->Parent = &*Loop;
709     else
710       BFI.Working[N.Index].Loop = &*Loop;
711 }
712 
/// Find non-trivial SCCs in \p G, package each as an irreducible loop before
/// \p Insert, and return the range of newly created loops.
iterator_range<std::list<LoopData>::iterator>
BlockFrequencyInfoImplBase::analyzeIrreducible(
    const IrreducibleGraph &G, LoopData *OuterLoop,
    std::list<LoopData>::iterator Insert) {
  assert((OuterLoop == nullptr) == (Insert == Loops.begin()));
  // Remember the position just before Insert; std::list iterators remain
  // valid across the emplacements below, so this marks where the new loops
  // begin.
  auto Prev = OuterLoop ? std::prev(Insert) : Loops.end();

  for (auto I = scc_begin(G); !I.isAtEnd(); ++I) {
    // Single-node SCCs are not irreducible loops; skip them.
    if (I->size() < 2)
      continue;

    // Translate the SCC into RPO.
    createIrreducibleLoop(*this, G, OuterLoop, Insert, *I);
  }

  if (OuterLoop)
    return make_range(std::next(Prev), Insert);
  return make_range(Loops.begin(), Insert);
}
732 
/// Reset \p OuterLoop's exits and backedge masses, and drop members that have
/// been packaged into newly created sub-loops.
void
BlockFrequencyInfoImplBase::updateLoopWithIrreducible(LoopData &OuterLoop) {
  OuterLoop.Exits.clear();
  for (auto &Mass : OuterLoop.BackedgeMass)
    Mass = BlockMass::getEmpty();
  // Compact Nodes in place, starting after the first entry (which is kept
  // unconditionally -- presumably the loop's own header; confirm against
  // LoopData's layout) and keeping only unpackaged members.
  auto O = OuterLoop.Nodes.begin() + 1;
  for (auto I = O, E = OuterLoop.Nodes.end(); I != E; ++I)
    if (!Working[I->Index].isPackaged())
      *O++ = *I;
  OuterLoop.Nodes.erase(O, OuterLoop.Nodes.end());
}
744 
/// Distribute a full mass among the headers of irreducible loop \p Loop, in
/// proportion to the mass flowing into each header along backedges.
void BlockFrequencyInfoImplBase::adjustLoopHeaderMass(LoopData &Loop) {
  assert(Loop.isIrreducible() && "this only makes sense on irreducible loops");

  // Since the loop has more than one header block, the mass flowing back into
  // each header will be different. Adjust the mass in each header loop to
  // reflect the masses flowing through back edges.
  //
  // To do this, we distribute the initial mass using the backedge masses
  // as weights for the distribution.
  BlockMass LoopMass = BlockMass::getFull();
  Distribution Dist;

  DEBUG(dbgs() << "adjust-loop-header-mass:\n");
  for (uint32_t H = 0; H < Loop.NumHeaders; ++H) {
    auto &HeaderNode = Loop.Nodes[H];
    auto &BackedgeMass = Loop.BackedgeMass[Loop.getHeaderIndex(HeaderNode)];
    DEBUG(dbgs() << " - Add back edge mass for node "
                 << getBlockName(HeaderNode) << ": " << BackedgeMass << "\n");
    if (BackedgeMass.getMass() > 0)
      Dist.addLocal(HeaderNode, BackedgeMass.getMass());
    else
      DEBUG(dbgs() << "   Nothing added. Back edge mass is zero\n");
  }

  DitheringDistributer D(Dist, LoopMass);

  // Overwrite (not accumulate into) each header's working mass with its
  // proportional share of the full loop mass.
  DEBUG(dbgs() << " Distribute loop mass " << LoopMass
               << " to headers using above weights\n");
  for (const Weight &W : Dist.Weights) {
    BlockMass Taken = D.takeMass(W.Amount);
    assert(W.Type == Weight::Local && "all weights should be local");
    Working[W.TargetNode.Index].getMass() = Taken;
    DEBUG(debugAssign(*this, D, W.TargetNode, Taken, nullptr));
  }
}
780