1 //===- SampleProfileInference.cpp - Adjust sample profiles in the IR ------===// 2 // 3 // Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. 4 // See https://llvm.org/LICENSE.txt for license information. 5 // SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception 6 // 7 //===----------------------------------------------------------------------===// 8 // 9 // This file implements a profile inference algorithm. Given an incomplete and 10 // possibly imprecise block counts, the algorithm reconstructs realistic block 11 // and edge counts that satisfy flow conservation rules, while minimally modify 12 // input block counts. 13 // 14 //===----------------------------------------------------------------------===// 15 16 #include "llvm/Transforms/Utils/SampleProfileInference.h" 17 #include "llvm/Support/Debug.h" 18 #include <queue> 19 #include <set> 20 21 using namespace llvm; 22 #define DEBUG_TYPE "sample-profile-inference" 23 24 namespace { 25 26 /// A value indicating an infinite flow/capacity/weight of a block/edge. 27 /// Not using numeric_limits<int64_t>::max(), as the values can be summed up 28 /// during the execution. 29 static constexpr int64_t INF = ((int64_t)1) << 50; 30 31 /// The minimum-cost maximum flow algorithm. 32 /// 33 /// The algorithm finds the maximum flow of minimum cost on a given (directed) 34 /// network using a modified version of the classical Moore-Bellman-Ford 35 /// approach. The algorithm applies a number of augmentation iterations in which 36 /// flow is sent along paths of positive capacity from the source to the sink. 37 /// The worst-case time complexity of the implementation is O(v(f)*m*n), where 38 /// where m is the number of edges, n is the number of vertices, and v(f) is the 39 /// value of the maximum flow. However, the observed running time on typical 40 /// instances is sub-quadratic, that is, o(n^2). 41 /// 42 /// The input is a set of edges with specified costs and capacities, and a pair 43 /// of nodes (source and sink). The output is the flow along each edge of the 44 /// minimum total cost respecting the given edge capacities. 45 class MinCostMaxFlow { 46 public: 47 // Initialize algorithm's data structures for a network of a given size. 48 void initialize(uint64_t NodeCount, uint64_t SourceNode, uint64_t SinkNode) { 49 Source = SourceNode; 50 Target = SinkNode; 51 52 Nodes = std::vector<Node>(NodeCount); 53 Edges = std::vector<std::vector<Edge>>(NodeCount, std::vector<Edge>()); 54 } 55 56 // Run the algorithm. 57 int64_t run() { 58 // Find an augmenting path and update the flow along the path 59 size_t AugmentationIters = 0; 60 while (findAugmentingPath()) { 61 augmentFlowAlongPath(); 62 AugmentationIters++; 63 } 64 65 // Compute the total flow and its cost 66 int64_t TotalCost = 0; 67 int64_t TotalFlow = 0; 68 for (uint64_t Src = 0; Src < Nodes.size(); Src++) { 69 for (auto &Edge : Edges[Src]) { 70 if (Edge.Flow > 0) { 71 TotalCost += Edge.Cost * Edge.Flow; 72 if (Src == Source) 73 TotalFlow += Edge.Flow; 74 } 75 } 76 } 77 LLVM_DEBUG(dbgs() << "Completed profi after " << AugmentationIters 78 << " iterations with " << TotalFlow << " total flow" 79 << " of " << TotalCost << " cost\n"); 80 return TotalCost; 81 } 82 83 /// Adding an edge to the network with a specified capacity and a cost. 84 /// Multiple edges between a pair of nodes are allowed but self-edges 85 /// are not supported. 86 void addEdge(uint64_t Src, uint64_t Dst, int64_t Capacity, int64_t Cost) { 87 assert(Capacity > 0 && "adding an edge of zero capacity"); 88 assert(Src != Dst && "loop edge are not supported"); 89 90 Edge SrcEdge; 91 SrcEdge.Dst = Dst; 92 SrcEdge.Cost = Cost; 93 SrcEdge.Capacity = Capacity; 94 SrcEdge.Flow = 0; 95 SrcEdge.RevEdgeIndex = Edges[Dst].size(); 96 97 Edge DstEdge; 98 DstEdge.Dst = Src; 99 DstEdge.Cost = -Cost; 100 DstEdge.Capacity = 0; 101 DstEdge.Flow = 0; 102 DstEdge.RevEdgeIndex = Edges[Src].size(); 103 104 Edges[Src].push_back(SrcEdge); 105 Edges[Dst].push_back(DstEdge); 106 } 107 108 /// Adding an edge to the network of infinite capacity and a given cost. 109 void addEdge(uint64_t Src, uint64_t Dst, int64_t Cost) { 110 addEdge(Src, Dst, INF, Cost); 111 } 112 113 /// Get the total flow from a given source node. 114 /// Returns a list of pairs (target node, amount of flow to the target). 115 const std::vector<std::pair<uint64_t, int64_t>> getFlow(uint64_t Src) const { 116 std::vector<std::pair<uint64_t, int64_t>> Flow; 117 for (auto &Edge : Edges[Src]) { 118 if (Edge.Flow > 0) 119 Flow.push_back(std::make_pair(Edge.Dst, Edge.Flow)); 120 } 121 return Flow; 122 } 123 124 /// Get the total flow between a pair of nodes. 125 int64_t getFlow(uint64_t Src, uint64_t Dst) const { 126 int64_t Flow = 0; 127 for (auto &Edge : Edges[Src]) { 128 if (Edge.Dst == Dst) { 129 Flow += Edge.Flow; 130 } 131 } 132 return Flow; 133 } 134 135 /// A cost of increasing a block's count by one. 136 static constexpr int64_t AuxCostInc = 10; 137 /// A cost of decreasing a block's count by one. 138 static constexpr int64_t AuxCostDec = 20; 139 /// A cost of increasing a count of zero-weight block by one. 140 static constexpr int64_t AuxCostIncZero = 11; 141 /// A cost of increasing the entry block's count by one. 142 static constexpr int64_t AuxCostIncEntry = 40; 143 /// A cost of decreasing the entry block's count by one. 144 static constexpr int64_t AuxCostDecEntry = 10; 145 /// A cost of taking an unlikely jump. 146 static constexpr int64_t AuxCostUnlikely = ((int64_t)1) << 20; 147 148 private: 149 /// Check for existence of an augmenting path with a positive capacity. 150 bool findAugmentingPath() { 151 // Initialize data structures 152 for (auto &Node : Nodes) { 153 Node.Distance = INF; 154 Node.ParentNode = uint64_t(-1); 155 Node.ParentEdgeIndex = uint64_t(-1); 156 Node.Taken = false; 157 } 158 159 std::queue<uint64_t> Queue; 160 Queue.push(Source); 161 Nodes[Source].Distance = 0; 162 Nodes[Source].Taken = true; 163 while (!Queue.empty()) { 164 uint64_t Src = Queue.front(); 165 Queue.pop(); 166 Nodes[Src].Taken = false; 167 // Although the residual network contains edges with negative costs 168 // (in particular, backward edges), it can be shown that there are no 169 // negative-weight cycles and the following two invariants are maintained: 170 // (i) Dist[Source, V] >= 0 and (ii) Dist[V, Target] >= 0 for all nodes V, 171 // where Dist is the length of the shortest path between two nodes. This 172 // allows to prune the search-space of the path-finding algorithm using 173 // the following early-stop criteria: 174 // -- If we find a path with zero-distance from Source to Target, stop the 175 // search, as the path is the shortest since Dist[Source, Target] >= 0; 176 // -- If we have Dist[Source, V] > Dist[Source, Target], then do not 177 // process node V, as it is guaranteed _not_ to be on a shortest path 178 // from Source to Target; it follows from inequalities 179 // Dist[Source, Target] >= Dist[Source, V] + Dist[V, Target] 180 // >= Dist[Source, V] 181 if (Nodes[Target].Distance == 0) 182 break; 183 if (Nodes[Src].Distance > Nodes[Target].Distance) 184 continue; 185 186 // Process adjacent edges 187 for (uint64_t EdgeIdx = 0; EdgeIdx < Edges[Src].size(); EdgeIdx++) { 188 auto &Edge = Edges[Src][EdgeIdx]; 189 if (Edge.Flow < Edge.Capacity) { 190 uint64_t Dst = Edge.Dst; 191 int64_t NewDistance = Nodes[Src].Distance + Edge.Cost; 192 if (Nodes[Dst].Distance > NewDistance) { 193 // Update the distance and the parent node/edge 194 Nodes[Dst].Distance = NewDistance; 195 Nodes[Dst].ParentNode = Src; 196 Nodes[Dst].ParentEdgeIndex = EdgeIdx; 197 // Add the node to the queue, if it is not there yet 198 if (!Nodes[Dst].Taken) { 199 Queue.push(Dst); 200 Nodes[Dst].Taken = true; 201 } 202 } 203 } 204 } 205 } 206 207 return Nodes[Target].Distance != INF; 208 } 209 210 /// Update the current flow along the augmenting path. 211 void augmentFlowAlongPath() { 212 // Find path capacity 213 int64_t PathCapacity = INF; 214 uint64_t Now = Target; 215 while (Now != Source) { 216 uint64_t Pred = Nodes[Now].ParentNode; 217 auto &Edge = Edges[Pred][Nodes[Now].ParentEdgeIndex]; 218 PathCapacity = std::min(PathCapacity, Edge.Capacity - Edge.Flow); 219 Now = Pred; 220 } 221 222 assert(PathCapacity > 0 && "found incorrect augmenting path"); 223 224 // Update the flow along the path 225 Now = Target; 226 while (Now != Source) { 227 uint64_t Pred = Nodes[Now].ParentNode; 228 auto &Edge = Edges[Pred][Nodes[Now].ParentEdgeIndex]; 229 auto &RevEdge = Edges[Now][Edge.RevEdgeIndex]; 230 231 Edge.Flow += PathCapacity; 232 RevEdge.Flow -= PathCapacity; 233 234 Now = Pred; 235 } 236 } 237 238 /// An node in a flow network. 239 struct Node { 240 /// The cost of the cheapest path from the source to the current node. 241 int64_t Distance; 242 /// The node preceding the current one in the path. 243 uint64_t ParentNode; 244 /// The index of the edge between ParentNode and the current node. 245 uint64_t ParentEdgeIndex; 246 /// An indicator of whether the current node is in a queue. 247 bool Taken; 248 }; 249 /// An edge in a flow network. 250 struct Edge { 251 /// The cost of the edge. 252 int64_t Cost; 253 /// The capacity of the edge. 254 int64_t Capacity; 255 /// The current flow on the edge. 256 int64_t Flow; 257 /// The destination node of the edge. 258 uint64_t Dst; 259 /// The index of the reverse edge between Dst and the current node. 260 uint64_t RevEdgeIndex; 261 }; 262 263 /// The set of network nodes. 264 std::vector<Node> Nodes; 265 /// The set of network edges. 266 std::vector<std::vector<Edge>> Edges; 267 /// Source node of the flow. 268 uint64_t Source; 269 /// Target (sink) node of the flow. 270 uint64_t Target; 271 }; 272 273 /// Initializing flow network for a given function. 274 /// 275 /// Every block is split into three nodes that are responsible for (i) an 276 /// incoming flow, (ii) an outgoing flow, and (iii) penalizing an increase or 277 /// reduction of the block weight. 278 void initializeNetwork(MinCostMaxFlow &Network, FlowFunction &Func) { 279 uint64_t NumBlocks = Func.Blocks.size(); 280 assert(NumBlocks > 1 && "Too few blocks in a function"); 281 LLVM_DEBUG(dbgs() << "Initializing profi for " << NumBlocks << " blocks\n"); 282 283 // Pre-process data: make sure the entry weight is at least 1 284 if (Func.Blocks[Func.Entry].Weight == 0) { 285 Func.Blocks[Func.Entry].Weight = 1; 286 } 287 // Introducing dummy source/sink pairs to allow flow circulation. 288 // The nodes corresponding to blocks of Func have indicies in the range 289 // [0..3 * NumBlocks); the dummy nodes are indexed by the next four values. 290 uint64_t S = 3 * NumBlocks; 291 uint64_t T = S + 1; 292 uint64_t S1 = S + 2; 293 uint64_t T1 = S + 3; 294 295 Network.initialize(3 * NumBlocks + 4, S1, T1); 296 297 // Create three nodes for every block of the function 298 for (uint64_t B = 0; B < NumBlocks; B++) { 299 auto &Block = Func.Blocks[B]; 300 assert((!Block.UnknownWeight || Block.Weight == 0 || Block.isEntry()) && 301 "non-zero weight of a block w/o weight except for an entry"); 302 303 // Split every block into two nodes 304 uint64_t Bin = 3 * B; 305 uint64_t Bout = 3 * B + 1; 306 uint64_t Baux = 3 * B + 2; 307 if (Block.Weight > 0) { 308 Network.addEdge(S1, Bout, Block.Weight, 0); 309 Network.addEdge(Bin, T1, Block.Weight, 0); 310 } 311 312 // Edges from S and to T 313 assert((!Block.isEntry() || !Block.isExit()) && 314 "a block cannot be an entry and an exit"); 315 if (Block.isEntry()) { 316 Network.addEdge(S, Bin, 0); 317 } else if (Block.isExit()) { 318 Network.addEdge(Bout, T, 0); 319 } 320 321 // An auxiliary node to allow increase/reduction of block counts: 322 // We assume that decreasing block counts is more expensive than increasing, 323 // and thus, setting separate costs here. In the future we may want to tune 324 // the relative costs so as to maximize the quality of generated profiles. 325 int64_t AuxCostInc = MinCostMaxFlow::AuxCostInc; 326 int64_t AuxCostDec = MinCostMaxFlow::AuxCostDec; 327 if (Block.UnknownWeight) { 328 // Do not penalize changing weights of blocks w/o known profile count 329 AuxCostInc = 0; 330 AuxCostDec = 0; 331 } else { 332 // Increasing the count for "cold" blocks with zero initial count is more 333 // expensive than for "hot" ones 334 if (Block.Weight == 0) { 335 AuxCostInc = MinCostMaxFlow::AuxCostIncZero; 336 } 337 // Modifying the count of the entry block is expensive 338 if (Block.isEntry()) { 339 AuxCostInc = MinCostMaxFlow::AuxCostIncEntry; 340 AuxCostDec = MinCostMaxFlow::AuxCostDecEntry; 341 } 342 } 343 // For blocks with self-edges, do not penalize a reduction of the count, 344 // as all of the increase can be attributed to the self-edge 345 if (Block.HasSelfEdge) { 346 AuxCostDec = 0; 347 } 348 349 Network.addEdge(Bin, Baux, AuxCostInc); 350 Network.addEdge(Baux, Bout, AuxCostInc); 351 if (Block.Weight > 0) { 352 Network.addEdge(Bout, Baux, AuxCostDec); 353 Network.addEdge(Baux, Bin, AuxCostDec); 354 } 355 } 356 357 // Creating edges for every jump 358 for (auto &Jump : Func.Jumps) { 359 uint64_t Src = Jump.Source; 360 uint64_t Dst = Jump.Target; 361 if (Src != Dst) { 362 uint64_t SrcOut = 3 * Src + 1; 363 uint64_t DstIn = 3 * Dst; 364 uint64_t Cost = Jump.IsUnlikely ? MinCostMaxFlow::AuxCostUnlikely : 0; 365 Network.addEdge(SrcOut, DstIn, Cost); 366 } 367 } 368 369 // Make sure we have a valid flow circulation 370 Network.addEdge(T, S, 0); 371 } 372 373 /// Extract resulting block and edge counts from the flow network. 374 void extractWeights(MinCostMaxFlow &Network, FlowFunction &Func) { 375 uint64_t NumBlocks = Func.Blocks.size(); 376 377 // Extract resulting block counts 378 for (uint64_t Src = 0; Src < NumBlocks; Src++) { 379 auto &Block = Func.Blocks[Src]; 380 uint64_t SrcOut = 3 * Src + 1; 381 int64_t Flow = 0; 382 for (auto &Adj : Network.getFlow(SrcOut)) { 383 uint64_t DstIn = Adj.first; 384 int64_t DstFlow = Adj.second; 385 bool IsAuxNode = (DstIn < 3 * NumBlocks && DstIn % 3 == 2); 386 if (!IsAuxNode || Block.HasSelfEdge) { 387 Flow += DstFlow; 388 } 389 } 390 Block.Flow = Flow; 391 assert(Flow >= 0 && "negative block flow"); 392 } 393 394 // Extract resulting jump counts 395 for (auto &Jump : Func.Jumps) { 396 uint64_t Src = Jump.Source; 397 uint64_t Dst = Jump.Target; 398 int64_t Flow = 0; 399 if (Src != Dst) { 400 uint64_t SrcOut = 3 * Src + 1; 401 uint64_t DstIn = 3 * Dst; 402 Flow = Network.getFlow(SrcOut, DstIn); 403 } else { 404 uint64_t SrcOut = 3 * Src + 1; 405 uint64_t SrcAux = 3 * Src + 2; 406 int64_t AuxFlow = Network.getFlow(SrcOut, SrcAux); 407 if (AuxFlow > 0) 408 Flow = AuxFlow; 409 } 410 Jump.Flow = Flow; 411 assert(Flow >= 0 && "negative jump flow"); 412 } 413 } 414 415 #ifndef NDEBUG 416 /// Verify that the computed flow values satisfy flow conservation rules 417 void verifyWeights(const FlowFunction &Func) { 418 const uint64_t NumBlocks = Func.Blocks.size(); 419 auto InFlow = std::vector<uint64_t>(NumBlocks, 0); 420 auto OutFlow = std::vector<uint64_t>(NumBlocks, 0); 421 for (auto &Jump : Func.Jumps) { 422 InFlow[Jump.Target] += Jump.Flow; 423 OutFlow[Jump.Source] += Jump.Flow; 424 } 425 426 uint64_t TotalInFlow = 0; 427 uint64_t TotalOutFlow = 0; 428 for (uint64_t I = 0; I < NumBlocks; I++) { 429 auto &Block = Func.Blocks[I]; 430 if (Block.isEntry()) { 431 TotalInFlow += Block.Flow; 432 assert(Block.Flow == OutFlow[I] && "incorrectly computed control flow"); 433 } else if (Block.isExit()) { 434 TotalOutFlow += Block.Flow; 435 assert(Block.Flow == InFlow[I] && "incorrectly computed control flow"); 436 } else { 437 assert(Block.Flow == OutFlow[I] && "incorrectly computed control flow"); 438 assert(Block.Flow == InFlow[I] && "incorrectly computed control flow"); 439 } 440 } 441 assert(TotalInFlow == TotalOutFlow && "incorrectly computed control flow"); 442 } 443 #endif 444 445 } // end of anonymous namespace 446 447 /// Apply the profile inference algorithm for a given flow function 448 void llvm::applyFlowInference(FlowFunction &Func) { 449 // Create and apply an inference network model 450 auto InferenceNetwork = MinCostMaxFlow(); 451 initializeNetwork(InferenceNetwork, Func); 452 InferenceNetwork.run(); 453 454 // Extract flow values for every block and every edge 455 extractWeights(InferenceNetwork, Func); 456 457 #ifndef NDEBUG 458 // Verify the result 459 verifyWeights(Func); 460 #endif 461 } 462