//===- SampleProfileInference.cpp - Adjust sample profiles in the IR ------===//
//
// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
// See https://llvm.org/LICENSE.txt for license information.
// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
//
//===----------------------------------------------------------------------===//
//
// This file implements a profile inference algorithm. Given incomplete and
// possibly imprecise block counts, the algorithm reconstructs realistic block
// and edge counts that satisfy flow conservation rules, while minimally
// modifying the input block counts.
//
//===----------------------------------------------------------------------===//

#include "llvm/Transforms/Utils/SampleProfileInference.h"
#include "llvm/Support/Debug.h"
#include <queue>
#include <set>

using namespace llvm;
#define DEBUG_TYPE "sample-profile-inference"

namespace {

/// A value indicating an infinite flow/capacity/weight of a block/edge.
/// Not using numeric_limits<int64_t>::max(), as the values can be summed up
/// during the execution.
static constexpr int64_t INF = ((int64_t)1) << 50;

/// The minimum-cost maximum flow algorithm.
///
/// The algorithm finds the maximum flow of minimum cost on a given (directed)
/// network using a modified version of the classical Moore-Bellman-Ford
/// approach. The algorithm applies a number of augmentation iterations in which
/// flow is sent along paths of positive capacity from the source to the sink.
/// The worst-case time complexity of the implementation is O(v(f)*m*n), where
/// m is the number of edges, n is the number of vertices, and v(f) is the
/// value of the maximum flow. However, the observed running time on typical
/// instances is sub-quadratic, that is, o(n^2).
///
/// The input is a set of edges with specified costs and capacities, and a pair
/// of nodes (source and sink). The output is the flow along each edge of the
/// minimum total cost respecting the given edge capacities.
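///
/// A minimal usage sketch; the node indices, capacities, and costs below are
/// illustrative only and do not come from a real function:
/// \code
///   MinCostMaxFlow Network;
///   // A network with four nodes; node 0 is the source, node 3 is the sink.
///   Network.initialize(/*NodeCount=*/4, /*SourceNode=*/0, /*SinkNode=*/3);
///   Network.addEdge(0, 1, /*Capacity=*/10, /*Cost=*/1);
///   Network.addEdge(0, 2, /*Capacity=*/10, /*Cost=*/2);
///   Network.addEdge(1, 3, /*Cost=*/0); // infinite capacity
///   Network.addEdge(2, 3, /*Cost=*/0); // infinite capacity
///   Network.run();
///   int64_t Flow = Network.getFlow(0, 1); // flow sent along edge (0, 1)
/// \endcode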
class MinCostMaxFlow {
public:
  // Initialize the algorithm's data structures for a network of a given size.
  void initialize(uint64_t NodeCount, uint64_t SourceNode, uint64_t SinkNode) {
    Source = SourceNode;
    Target = SinkNode;

    Nodes = std::vector<Node>(NodeCount);
    Edges = std::vector<std::vector<Edge>>(NodeCount, std::vector<Edge>());
  }

  // Run the algorithm.
  int64_t run() {
    // Iteratively find augmenting paths and update the flow along them
    size_t AugmentationIters = 0;
    while (findAugmentingPath()) {
      augmentFlowAlongPath();
      AugmentationIters++;
    }

    // Compute the total flow and its cost
    int64_t TotalCost = 0;
    int64_t TotalFlow = 0;
    for (uint64_t Src = 0; Src < Nodes.size(); Src++) {
      for (auto &Edge : Edges[Src]) {
        if (Edge.Flow > 0) {
          TotalCost += Edge.Cost * Edge.Flow;
          if (Src == Source)
            TotalFlow += Edge.Flow;
        }
      }
    }
    LLVM_DEBUG(dbgs() << "Completed profi after " << AugmentationIters
                      << " iterations with " << TotalFlow << " total flow"
                      << " of " << TotalCost << " cost\n");
    return TotalCost;
  }

  /// Add an edge to the network with a specified capacity and a cost.
  /// Multiple edges between a pair of nodes are allowed, but self-edges
  /// are not supported.
  void addEdge(uint64_t Src, uint64_t Dst, int64_t Capacity, int64_t Cost) {
    assert(Capacity > 0 && "adding an edge of zero capacity");
    assert(Src != Dst && "self-edges are not supported");

    Edge SrcEdge;
    SrcEdge.Dst = Dst;
    SrcEdge.Cost = Cost;
    SrcEdge.Capacity = Capacity;
    SrcEdge.Flow = 0;
    SrcEdge.RevEdgeIndex = Edges[Dst].size();

    Edge DstEdge;
    DstEdge.Dst = Src;
    DstEdge.Cost = -Cost;
    DstEdge.Capacity = 0;
    DstEdge.Flow = 0;
    DstEdge.RevEdgeIndex = Edges[Src].size();

    Edges[Src].push_back(SrcEdge);
    Edges[Dst].push_back(DstEdge);
  }

  /// Add an edge of infinite capacity and a given cost to the network.
  void addEdge(uint64_t Src, uint64_t Dst, int64_t Cost) {
    addEdge(Src, Dst, INF, Cost);
  }

  /// Get the flow leaving a given source node.
  /// Returns a list of pairs (target node, amount of flow to the target).
  const std::vector<std::pair<uint64_t, int64_t>> getFlow(uint64_t Src) const {
    std::vector<std::pair<uint64_t, int64_t>> Flow;
    for (auto &Edge : Edges[Src]) {
      if (Edge.Flow > 0)
        Flow.push_back(std::make_pair(Edge.Dst, Edge.Flow));
    }
    return Flow;
  }

  /// Get the total flow between a pair of nodes.
  int64_t getFlow(uint64_t Src, uint64_t Dst) const {
    int64_t Flow = 0;
    for (auto &Edge : Edges[Src]) {
      if (Edge.Dst == Dst) {
        Flow += Edge.Flow;
      }
    }
    return Flow;
  }

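  /// Note: the constants below are relative penalties; a larger value makes
  /// the corresponding adjustment less desirable. In particular,
  /// AuxCostUnlikely is much larger than the block-adjustment costs, so the
  /// inference strongly avoids sending flow along unlikely jumps.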
  /// A cost of increasing a block's count by one.
  static constexpr int64_t AuxCostInc = 10;
  /// A cost of decreasing a block's count by one.
  static constexpr int64_t AuxCostDec = 20;
  /// A cost of increasing the count of a zero-weight block by one.
  static constexpr int64_t AuxCostIncZero = 11;
  /// A cost of increasing the entry block's count by one.
  static constexpr int64_t AuxCostIncEntry = 40;
  /// A cost of decreasing the entry block's count by one.
  static constexpr int64_t AuxCostDecEntry = 10;
  /// A cost of taking an unlikely jump.
  static constexpr int64_t AuxCostUnlikely = ((int64_t)1) << 20;

private:
  /// Check for existence of an augmenting path with a positive capacity.
  bool findAugmentingPath() {
    // Initialize data structures
    for (auto &Node : Nodes) {
      Node.Distance = INF;
      Node.ParentNode = uint64_t(-1);
      Node.ParentEdgeIndex = uint64_t(-1);
      Node.Taken = false;
    }

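    // Run a queue-based Bellman-Ford relaxation (in the spirit of SPFA) to
    // find the cheapest path of positive residual capacity from Source to
    // Target.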
    std::queue<uint64_t> Queue;
    Queue.push(Source);
    Nodes[Source].Distance = 0;
    Nodes[Source].Taken = true;
    while (!Queue.empty()) {
      uint64_t Src = Queue.front();
      Queue.pop();
      Nodes[Src].Taken = false;
      // Although the residual network contains edges with negative costs
      // (in particular, backward edges), it can be shown that there are no
      // negative-weight cycles and the following two invariants are maintained:
      // (i) Dist[Source, V] >= 0 and (ii) Dist[V, Target] >= 0 for all nodes V,
      // where Dist is the length of the shortest path between two nodes. This
      // allows us to prune the search space of the path-finding algorithm using
      // the following early-stop criteria:
      // -- If we find a path with zero-distance from Source to Target, stop the
      //    search, as the path is the shortest since Dist[Source, Target] >= 0;
      // -- If we have Dist[Source, V] > Dist[Source, Target], then do not
      //    process node V, as it is guaranteed _not_ to be on a shortest path
      //    from Source to Target; it follows from the inequalities
      //    Dist[Source, Target] >= Dist[Source, V] + Dist[V, Target]
      //                         >= Dist[Source, V]
      if (Nodes[Target].Distance == 0)
        break;
      if (Nodes[Src].Distance > Nodes[Target].Distance)
        continue;

      // Process adjacent edges
      for (uint64_t EdgeIdx = 0; EdgeIdx < Edges[Src].size(); EdgeIdx++) {
        auto &Edge = Edges[Src][EdgeIdx];
        if (Edge.Flow < Edge.Capacity) {
          uint64_t Dst = Edge.Dst;
          int64_t NewDistance = Nodes[Src].Distance + Edge.Cost;
          if (Nodes[Dst].Distance > NewDistance) {
            // Update the distance and the parent node/edge
            Nodes[Dst].Distance = NewDistance;
            Nodes[Dst].ParentNode = Src;
            Nodes[Dst].ParentEdgeIndex = EdgeIdx;
            // Add the node to the queue, if it is not there yet
            if (!Nodes[Dst].Taken) {
              Queue.push(Dst);
              Nodes[Dst].Taken = true;
            }
          }
        }
      }
    }

    return Nodes[Target].Distance != INF;
  }

  /// Update the current flow along the augmenting path.
  void augmentFlowAlongPath() {
    // Find path capacity
    int64_t PathCapacity = INF;
    uint64_t Now = Target;
    while (Now != Source) {
      uint64_t Pred = Nodes[Now].ParentNode;
      auto &Edge = Edges[Pred][Nodes[Now].ParentEdgeIndex];
      PathCapacity = std::min(PathCapacity, Edge.Capacity - Edge.Flow);
      Now = Pred;
    }

    assert(PathCapacity > 0 && "found incorrect augmenting path");

    // Update the flow along the path
    Now = Target;
    while (Now != Source) {
      uint64_t Pred = Nodes[Now].ParentNode;
      auto &Edge = Edges[Pred][Nodes[Now].ParentEdgeIndex];
      auto &RevEdge = Edges[Now][Edge.RevEdgeIndex];

      Edge.Flow += PathCapacity;
      RevEdge.Flow -= PathCapacity;

      Now = Pred;
    }
  }

  /// A node in a flow network.
  struct Node {
    /// The cost of the cheapest path from the source to the current node.
    int64_t Distance;
    /// The node preceding the current one in the path.
    uint64_t ParentNode;
    /// The index of the edge between ParentNode and the current node.
    uint64_t ParentEdgeIndex;
    /// An indicator of whether the current node is in the queue.
    bool Taken;
  };
  /// An edge in a flow network.
  struct Edge {
    /// The cost of the edge.
    int64_t Cost;
    /// The capacity of the edge.
    int64_t Capacity;
    /// The current flow on the edge.
    int64_t Flow;
    /// The destination node of the edge.
    uint64_t Dst;
    /// The index of the reverse edge between Dst and the current node.
    uint64_t RevEdgeIndex;
  };

  /// The set of network nodes.
  std::vector<Node> Nodes;
  /// The set of network edges.
  std::vector<std::vector<Edge>> Edges;
  /// Source node of the flow.
  uint64_t Source;
  /// Target (sink) node of the flow.
  uint64_t Target;
};

/// Initialize the flow network for a given function.
///
/// Every block is split into three nodes that are responsible for (i) an
/// incoming flow, (ii) an outgoing flow, and (iii) penalizing an increase or
/// reduction of the block weight.
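///
/// For a block B with nodes Bin, Bout, and Baux, the edges created below are:
///   - S1 -> Bout and Bin -> T1 of capacity Weight(B), injecting the observed
///     count of B into the network (only when Weight(B) > 0);
///   - S -> Bin for the entry block and Bout -> T for exit blocks;
///   - Bin -> Baux -> Bout, penalizing an increase of the block count;
///   - Bout -> Baux -> Bin, penalizing a decrease of the block count (only
///     when Weight(B) > 0).
/// Jumps are modeled by edges Bout(Src) -> Bin(Dst), and an edge T -> S
/// closes the circulation.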
void initializeNetwork(MinCostMaxFlow &Network, FlowFunction &Func) {
  uint64_t NumBlocks = Func.Blocks.size();
  assert(NumBlocks > 1 && "Too few blocks in a function");
  LLVM_DEBUG(dbgs() << "Initializing profi for " << NumBlocks << " blocks\n");

  // Pre-process data: make sure the entry weight is at least 1
  if (Func.Blocks[Func.Entry].Weight == 0) {
    Func.Blocks[Func.Entry].Weight = 1;
  }
  // Introduce dummy source/sink pairs to allow flow circulation.
  // The nodes corresponding to blocks of Func have indices in the range
  // [0..3 * NumBlocks); the dummy nodes are indexed by the next four values.
  uint64_t S = 3 * NumBlocks;
  uint64_t T = S + 1;
  uint64_t S1 = S + 2;
  uint64_t T1 = S + 3;

  Network.initialize(3 * NumBlocks + 4, S1, T1);

  // Create three nodes for every block of the function
  for (uint64_t B = 0; B < NumBlocks; B++) {
    auto &Block = Func.Blocks[B];
    assert((!Block.UnknownWeight || Block.Weight == 0 || Block.isEntry()) &&
           "non-zero weight of a block w/o weight except for an entry");

    // Split every block into three nodes: incoming, outgoing, and auxiliary
    uint64_t Bin = 3 * B;
    uint64_t Bout = 3 * B + 1;
    uint64_t Baux = 3 * B + 2;
    if (Block.Weight > 0) {
      Network.addEdge(S1, Bout, Block.Weight, 0);
      Network.addEdge(Bin, T1, Block.Weight, 0);
    }

    // Edges from S and to T
    assert((!Block.isEntry() || !Block.isExit()) &&
           "a block cannot be an entry and an exit");
    if (Block.isEntry()) {
      Network.addEdge(S, Bin, 0);
    } else if (Block.isExit()) {
      Network.addEdge(Bout, T, 0);
    }

    // An auxiliary node to allow increase/reduction of block counts:
    // We assume that decreasing block counts is more expensive than increasing,
    // and thus set separate costs here. In the future we may want to tune
    // the relative costs so as to maximize the quality of generated profiles.
    int64_t AuxCostInc = MinCostMaxFlow::AuxCostInc;
    int64_t AuxCostDec = MinCostMaxFlow::AuxCostDec;
    if (Block.UnknownWeight) {
      // Do not penalize changing weights of blocks w/o known profile count
      AuxCostInc = 0;
      AuxCostDec = 0;
    } else {
      // Increasing the count for "cold" blocks with zero initial count is more
      // expensive than for "hot" ones
      if (Block.Weight == 0) {
        AuxCostInc = MinCostMaxFlow::AuxCostIncZero;
      }
      // Modifying the count of the entry block is expensive
      if (Block.isEntry()) {
        AuxCostInc = MinCostMaxFlow::AuxCostIncEntry;
        AuxCostDec = MinCostMaxFlow::AuxCostDecEntry;
      }
    }
    // For blocks with self-edges, do not penalize a reduction of the count,
    // as all of the increase can be attributed to the self-edge
    if (Block.HasSelfEdge) {
      AuxCostDec = 0;
    }

    Network.addEdge(Bin, Baux, AuxCostInc);
    Network.addEdge(Baux, Bout, AuxCostInc);
    if (Block.Weight > 0) {
      Network.addEdge(Bout, Baux, AuxCostDec);
      Network.addEdge(Baux, Bin, AuxCostDec);
    }
  }

  // Create edges for every jump
  for (auto &Jump : Func.Jumps) {
    uint64_t Src = Jump.Source;
    uint64_t Dst = Jump.Target;
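    // Self-edges are not added to the network; the count of a self-edge is
    // recovered later in extractWeights from the flow passing through the
    // block's auxiliary node.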
    if (Src != Dst) {
      uint64_t SrcOut = 3 * Src + 1;
      uint64_t DstIn = 3 * Dst;
      uint64_t Cost = Jump.IsUnlikely ? MinCostMaxFlow::AuxCostUnlikely : 0;
      Network.addEdge(SrcOut, DstIn, Cost);
    }
  }

  // Make sure we have a valid flow circulation
  Network.addEdge(T, S, 0);
}

/// Extract resulting block and edge counts from the flow network.
void extractWeights(MinCostMaxFlow &Network, FlowFunction &Func) {
  uint64_t NumBlocks = Func.Blocks.size();

  // Extract resulting block counts
  for (uint64_t Src = 0; Src < NumBlocks; Src++) {
    auto &Block = Func.Blocks[Src];
    uint64_t SrcOut = 3 * Src + 1;
    int64_t Flow = 0;
    for (auto &Adj : Network.getFlow(SrcOut)) {
      uint64_t DstIn = Adj.first;
      int64_t DstFlow = Adj.second;
      bool IsAuxNode = (DstIn < 3 * NumBlocks && DstIn % 3 == 2);
      if (!IsAuxNode || Block.HasSelfEdge) {
        Flow += DstFlow;
      }
    }
    Block.Flow = Flow;
    assert(Flow >= 0 && "negative block flow");
  }

  // Extract resulting jump counts
  for (auto &Jump : Func.Jumps) {
    uint64_t Src = Jump.Source;
    uint64_t Dst = Jump.Target;
    int64_t Flow = 0;
    if (Src != Dst) {
      uint64_t SrcOut = 3 * Src + 1;
      uint64_t DstIn = 3 * Dst;
      Flow = Network.getFlow(SrcOut, DstIn);
    } else {
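      // For a self-edge, the jump count corresponds to the flow sent from the
      // block's outgoing node back to its auxiliary node.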
      uint64_t SrcOut = 3 * Src + 1;
      uint64_t SrcAux = 3 * Src + 2;
      int64_t AuxFlow = Network.getFlow(SrcOut, SrcAux);
      if (AuxFlow > 0)
        Flow = AuxFlow;
    }
    Jump.Flow = Flow;
    assert(Flow >= 0 && "negative jump flow");
  }
}

#ifndef NDEBUG
/// Verify that the computed flow values satisfy flow conservation rules.
void verifyWeights(const FlowFunction &Func) {
  const uint64_t NumBlocks = Func.Blocks.size();
  auto InFlow = std::vector<uint64_t>(NumBlocks, 0);
  auto OutFlow = std::vector<uint64_t>(NumBlocks, 0);
  for (auto &Jump : Func.Jumps) {
    InFlow[Jump.Target] += Jump.Flow;
    OutFlow[Jump.Source] += Jump.Flow;
  }

  uint64_t TotalInFlow = 0;
  uint64_t TotalOutFlow = 0;
  for (uint64_t I = 0; I < NumBlocks; I++) {
    auto &Block = Func.Blocks[I];
    if (Block.isEntry()) {
      TotalInFlow += Block.Flow;
      assert(Block.Flow == OutFlow[I] && "incorrectly computed control flow");
    } else if (Block.isExit()) {
      TotalOutFlow += Block.Flow;
      assert(Block.Flow == InFlow[I] && "incorrectly computed control flow");
    } else {
      assert(Block.Flow == OutFlow[I] && "incorrectly computed control flow");
      assert(Block.Flow == InFlow[I] && "incorrectly computed control flow");
    }
  }
  assert(TotalInFlow == TotalOutFlow && "incorrectly computed control flow");
}
#endif

} // end of anonymous namespace

/// Apply the profile inference algorithm for a given flow function.
void llvm::applyFlowInference(FlowFunction &Func) {
  // Create and apply an inference network model
  auto InferenceNetwork = MinCostMaxFlow();
  initializeNetwork(InferenceNetwork, Func);
  InferenceNetwork.run();

  // Extract flow values for every block and every edge
  extractWeights(InferenceNetwork, Func);

#ifndef NDEBUG
  // Verify the result
  verifyWeights(Func);
#endif
}