#!/usr/bin/env python3
"""Plot graphs from block cache trace analyzer result csv files.

Reads the per-experiment csv files produced by the block cache trace
analyzer and saves the plotted graphs as pdf files.
"""
import csv
import math
import os
import random
import sys

import matplotlib

matplotlib.use("Agg")
import matplotlib.backends.backend_pdf
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import seaborn as sns


# Make sure a legend has the same color across all generated graphs.
def get_cmap(n, name="hsv"):
    """Returns a function that maps each index in 0, 1, ..., n-1 to a distinct
    RGB color; the keyword argument name must be a standard mpl colormap name."""
    return plt.cm.get_cmap(name, n)


# Module-level color state shared by all plotting functions so that a given
# label keeps the same color in every generated graph.
color_index = 0
bar_color_maps = {}
colors = []
n_colors = 360
linear_colors = get_cmap(n_colors)
for i in range(n_colors):
    colors.append(linear_colors(i))
# Shuffle the colors so that adjacent bars in a graph are obvious to differentiate.
random.shuffle(colors)


def num_to_gb(n):
    """Format a byte count (int, float, or numeric string) as a GB string.

    Exact multiples of 1 GB are printed without decimals ("2", not "2.0");
    everything else keeps two decimal points.
    """
    one_gb = 1024 * 1024 * 1024
    # Convert once up front so numeric strings are accepted everywhere below
    # (the original divided the raw argument, which raised TypeError for str).
    n = float(n)
    if n % one_gb == 0:
        return "{}".format(int(n / one_gb))
    # Keep two decimal points.
    return "{0:.2f}".format(n / one_gb)


def plot_miss_stats_graphs(
    csv_result_dir, output_result_dir, file_prefix, file_suffix, ylabel, pdf_file_name
):
    """Plot miss-ratio curves (y vs. cache capacity) from matching csv files.

    Every file in csv_result_dir matching file_prefix/file_suffix contributes
    rows of the form cache_name,num_shard_bits,ghost_capacity,capacity,value.
    One line per cache configuration is drawn on a log-2 capacity axis and the
    figure is saved to output_result_dir/<pdf_file_name>.pdf.
    """
    miss_ratios = {}
    for file in os.listdir(csv_result_dir):
        if not file.startswith(file_prefix):
            continue
        if not file.endswith(file_suffix):
            continue
        print("Processing file {}/{}".format(csv_result_dir, file))
        mrc_file_path = csv_result_dir + "/" + file
        with open(mrc_file_path, "r") as csvfile:
            rows = csv.reader(csvfile, delimiter=",")
            for row in rows:
                cache_name = row[0]
                num_shard_bits = int(row[1])
                ghost_capacity = int(row[2])
                capacity = int(row[3])
                miss_ratio = float(row[4])
                config = "{}-{}-{}".format(cache_name, num_shard_bits, ghost_capacity)
                if config not in miss_ratios:
                    miss_ratios[config] = {}
                    miss_ratios[config]["x"] = []
                    miss_ratios[config]["y"] = []
                miss_ratios[config]["x"].append(capacity)
                miss_ratios[config]["y"].append(miss_ratio)
    if not miss_ratios:
        # Nothing matched: avoid saving an empty figure and a NameError on
        # 'file' when the directory is empty.
        return
    fig = plt.figure()
    for config in miss_ratios:
        plt.plot(miss_ratios[config]["x"], miss_ratios[config]["y"], label=config)
    plt.xlabel("Cache capacity")
    plt.ylabel(ylabel)
    plt.xscale("log", basex=2)
    plt.ylim(ymin=0)
    # The title shows the last processed file's name.
    plt.title("{}".format(file))
    plt.legend()
    fig.savefig(
        output_result_dir + "/{}.pdf".format(pdf_file_name), bbox_inches="tight"
    )
    # Release the figure so repeated calls don't accumulate open figures.
    plt.close(fig)


def plot_miss_stats_diff_lru_graphs(
    csv_result_dir, output_result_dir, file_prefix, file_suffix, ylabel, pdf_file_name
):
    """Plot each configuration's miss-stat delta against the lru-0-0 baseline.

    Same input format as plot_miss_stats_graphs; for every capacity present in
    the baseline "lru-0-0" curve, the difference config_y - lru_y is plotted.
    Does nothing when no lru-0-0 baseline is found.
    """
    miss_ratios = {}
    for file in os.listdir(csv_result_dir):
        if not file.startswith(file_prefix):
            continue
        if not file.endswith(file_suffix):
            continue
        print("Processing file {}/{}".format(csv_result_dir, file))
        mrc_file_path = csv_result_dir + "/" + file
        with open(mrc_file_path, "r") as csvfile:
            rows = csv.reader(csvfile, delimiter=",")
            for row in rows:
                cache_name = row[0]
                num_shard_bits = int(row[1])
                ghost_capacity = int(row[2])
                capacity = int(row[3])
                miss_ratio = float(row[4])
                config = "{}-{}-{}".format(cache_name, num_shard_bits, ghost_capacity)
                if config not in miss_ratios:
                    miss_ratios[config] = {}
                    miss_ratios[config]["x"] = []
                    miss_ratios[config]["y"] = []
                miss_ratios[config]["x"].append(capacity)
                miss_ratios[config]["y"].append(miss_ratio)
    if "lru-0-0" not in miss_ratios:
        return
    fig = plt.figure()
    for config in miss_ratios:
        diffs = [0] * len(miss_ratios["lru-0-0"]["x"])
        for i in range(len(miss_ratios["lru-0-0"]["x"])):
            # Match capacities between the baseline and this config; missing
            # capacities keep a 0 diff.
            for j in range(len(miss_ratios[config]["x"])):
                if miss_ratios["lru-0-0"]["x"][i] == miss_ratios[config]["x"][j]:
                    diffs[i] = (
                        miss_ratios[config]["y"][j] - miss_ratios["lru-0-0"]["y"][i]
                    )
                    break
        plt.plot(miss_ratios["lru-0-0"]["x"], diffs, label=config)
    plt.xlabel("Cache capacity")
    plt.ylabel(ylabel)
    plt.xscale("log", basex=2)
    plt.title("{}".format(file))
    plt.legend()
    fig.savefig(
        output_result_dir + "/{}.pdf".format(pdf_file_name), bbox_inches="tight"
    )
    plt.close(fig)


def sanitize(label):
    """Strip leading underscores from a label and collapse uint64_max."""
    # matplotlib cannot plot legends that is prefixed with "_"
    # so we need to remove them here.
    data = label.lstrip("_")
    # The value of uint64_max in c++.
    if "18446744073709551615" in data:
        return "max"
    return data


# Read the csv file vertically, i.e., group the data by columns.
def read_data_for_plot_vertical(csvfile):
    """Read a csv file vertically, i.e., group the data by columns.

    Row 0 is the header: columns 1..N become labels. In every following row,
    column 0 is an x-axis value and columns 1..N are data points for the
    corresponding label. Returns (x, labels, label_stats) where label_stats
    maps label index -> list of float values of that column.
    """
    x = []
    labels = []
    label_stats = {}
    csv_rows = csv.reader(csvfile, delimiter=",")
    data_rows = []
    for row in csv_rows:
        data_rows.append(row)
    # header
    for i in range(1, len(data_rows[0])):
        labels.append(sanitize(data_rows[0][i]))
        label_stats[i - 1] = []
    for i in range(1, len(data_rows)):
        for j in range(len(data_rows[i])):
            if j == 0:
                x.append(sanitize(data_rows[i][j]))
                continue
            label_stats[j - 1].append(float(data_rows[i][j]))
    return x, labels, label_stats


# Read the csv file horizontally, i.e., group the data by rows.
def read_data_for_plot_horizontal(csvfile):
    """Read a csv file horizontally, i.e., group the data by rows.

    Row 0 holds the x-axis values (column 0 ignored); in every following row,
    column 0 is the label and columns 1..N are that label's data points.
    Returns (x, labels, label_stats) where label_stats maps label index ->
    list of float values of that row.
    """
    x = []
    labels = []
    label_stats = {}
    csv_rows = csv.reader(csvfile, delimiter=",")
    data_rows = []
    for row in csv_rows:
        data_rows.append(row)
    # header
    for i in range(1, len(data_rows)):
        labels.append(sanitize(data_rows[i][0]))
        label_stats[i - 1] = []
    for i in range(1, len(data_rows[0])):
        x.append(sanitize(data_rows[0][i]))
    for i in range(1, len(data_rows)):
        for j in range(len(data_rows[i])):
            if j == 0:
                # label
                continue
            label_stats[i - 1].append(float(data_rows[i][j]))
    return x, labels, label_stats


def read_data_for_plot(csvfile, vertical):
    """Dispatch to the vertical or horizontal csv reader."""
    if vertical:
        return read_data_for_plot_vertical(csvfile)
    return read_data_for_plot_horizontal(csvfile)


def plot_line_charts(
    csv_result_dir,
    output_result_dir,
    filename_prefix,
    filename_suffix,
    pdf_name,
    xlabel,
    ylabel,
    title,
    vertical,
    legend,
):
    """Plot one line chart per matching csv file into a single pdf.

    Files in csv_result_dir matching filename_prefix/filename_suffix are
    parsed (by column or by row depending on 'vertical'); each label is drawn
    as one line with a globally consistent color, and the last data point of
    every series is dropped (it may cover a partial time window).
    """
    global color_index, bar_color_maps, colors
    pdf = matplotlib.backends.backend_pdf.PdfPages(output_result_dir + "/" + pdf_name)
    for file in os.listdir(csv_result_dir):
        if not file.endswith(filename_suffix):
            continue
        if not file.startswith(filename_prefix):
            continue
        print("Processing file {}/{}".format(csv_result_dir, file))
        with open(csv_result_dir + "/" + file, "r") as csvfile:
            x, labels, label_stats = read_data_for_plot(csvfile, vertical)
            if len(x) == 0 or len(labels) == 0:
                continue
            # plot figure
            fig = plt.figure()
            for label_index in label_stats:
                # Assign a unique color to this label.
                if labels[label_index] not in bar_color_maps:
                    bar_color_maps[labels[label_index]] = colors[color_index]
                    color_index += 1
                plt.plot(
                    [int(x[i]) for i in range(len(x) - 1)],
                    label_stats[label_index][:-1],
                    label=labels[label_index],
                    color=bar_color_maps[labels[label_index]],
                )

            # Translate time unit into x labels.
            if "_60" in file:
                plt.xlabel("{} (Minute)".format(xlabel))
            if "_3600" in file:
                plt.xlabel("{} (Hour)".format(xlabel))
            plt.ylabel(ylabel)
            plt.title("{} {}".format(title, file))
            if legend:
                plt.legend()
            pdf.savefig(fig)
            # Release the figure so long runs don't accumulate open figures.
            plt.close(fig)
    pdf.close()


def plot_stacked_bar_charts(
    csv_result_dir,
    output_result_dir,
    filename_suffix,
    pdf_name,
    xlabel,
    ylabel,
    title,
    vertical,
    x_prefix,
):
    """Plot one stacked bar chart per matching csv file into a single pdf.

    Each label is one layer of the stack; every bar layer keeps a globally
    consistent color per label across all generated graphs. x_prefix is
    prepended to every x tick label (e.g. "< " for bucketed data).
    """
    global color_index, bar_color_maps, colors
    pdf = matplotlib.backends.backend_pdf.PdfPages(
        "{}/{}".format(output_result_dir, pdf_name)
    )
    for file in os.listdir(csv_result_dir):
        if not file.endswith(filename_suffix):
            continue
        with open(csv_result_dir + "/" + file, "r") as csvfile:
            print("Processing file {}/{}".format(csv_result_dir, file))
            x, labels, label_stats = read_data_for_plot(csvfile, vertical)
            if len(x) == 0 or len(label_stats) == 0:
                continue
            # Plot figure
            fig = plt.figure()
            ind = np.arange(len(x))  # the x locations for the groups
            width = 0.5  # the width of the bars: can also be len(x) sequence
            bars = []
            bottom_bars = []
            for _i in label_stats[0]:
                bottom_bars.append(0)
            for i in range(0, len(label_stats)):
                # Assign a unique color to this label.
                if labels[i] not in bar_color_maps:
                    bar_color_maps[labels[i]] = colors[color_index]
                    color_index += 1
                p = plt.bar(
                    ind,
                    label_stats[i],
                    width,
                    bottom=bottom_bars,
                    color=bar_color_maps[labels[i]],
                )
                bars.append(p[0])
                # Raise the floor so the next label stacks on top of this one.
                for j in range(len(label_stats[i])):
                    bottom_bars[j] += label_stats[i][j]
            plt.xlabel(xlabel)
            plt.ylabel(ylabel)
            plt.xticks(
                ind, [x_prefix + x[i] for i in range(len(x))], rotation=20, fontsize=8
            )
            plt.legend(bars, labels)
            plt.title("{} filename:{}".format(title, file))
            pdf.savefig(fig)
            plt.close(fig)
    pdf.close()


def plot_heatmap(csv_result_dir, output_result_dir, filename_suffix, pdf_name, title):
    """Plot one annotated heatmap per matching label,corr,value csv file."""
    pdf = matplotlib.backends.backend_pdf.PdfPages(
        "{}/{}".format(output_result_dir, pdf_name)
    )
    for file in os.listdir(csv_result_dir):
        if not file.endswith(filename_suffix):
            continue
        csv_file_name = "{}/{}".format(csv_result_dir, file)
        print("Processing file {}/{}".format(csv_result_dir, file))
        corr_table = pd.read_csv(csv_file_name)
        # Keyword arguments: positional pivot() args were removed in pandas
        # 2.0, and the keyword form works on older pandas as well.
        corr_table = corr_table.pivot(index="label", columns="corr", values="value")
        fig = plt.figure()
        sns.heatmap(corr_table, annot=True, linewidths=0.5, fmt=".2")
        plt.title("{} filename:{}".format(title, file))
        pdf.savefig(fig)
        plt.close(fig)
    pdf.close()


def plot_timeline(csv_result_dir, output_result_dir):
    """Plot access timelines grouped by label."""
    plot_line_charts(
        csv_result_dir,
        output_result_dir,
        filename_prefix="",
        filename_suffix="access_timeline",
        pdf_name="access_time.pdf",
        xlabel="Time",
        ylabel="Throughput",
        title="Access timeline with group by label",
        vertical=False,
        legend=True,
    )


def convert_to_0_if_nan(n):
    """Map NaN to 0.0 (correlations on constant columns come back NaN)."""
    if math.isnan(n):
        return 0.0
    return n


# (name, column_a, column_b): each tuple yields one Spearman-correlation row
# per correlation_input file in plot_correlation below.
_CORRELATION_PAIRS = [
    ("LA+A", "num_accesses_since_last_access", "num_accesses_till_next_access"),
    ("PA+A", "num_past_accesses", "num_accesses_till_next_access"),
    ("LT+A", "elapsed_time_since_last_access", "num_accesses_till_next_access"),
    ("LA+T", "num_accesses_since_last_access", "elapsed_time_till_next_access"),
    ("LT+T", "elapsed_time_since_last_access", "elapsed_time_till_next_access"),
    ("PA+T", "num_past_accesses", "elapsed_time_till_next_access"),
]


def plot_correlation(csv_result_dir, output_result_dir):
    """Aggregate *_correlation_input csv files and plot them as heatmaps.

    For every input file, the Spearman correlation of each column pair in
    _CORRELATION_PAIRS is appended as one label,corr,value row to the
    per-label-type *_correlation_output file, which plot_heatmap then renders.
    """
    # Processing the correlation input first.
    label_str_file = {}
    for file in os.listdir(csv_result_dir):
        if not file.endswith("correlation_input"):
            continue
        csv_file_name = "{}/{}".format(csv_result_dir, file)
        print("Processing file {}/{}".format(csv_result_dir, file))
        corr_table = pd.read_csv(csv_file_name)
        # File name layout: <label_str>_<label>_correlation_input.
        label_str = file.split("_")[0]
        label = file[len(label_str) + 1 :]
        label = label[: len(label) - len("_correlation_input")]

        output_file = "{}/{}_correlation_output".format(csv_result_dir, label_str)
        if output_file not in label_str_file:
            # First row for this label type: open the file and write a header.
            f = open(output_file, "w+")
            label_str_file[output_file] = f
            f.write("label,corr,value\n")
        f = label_str_file[output_file]
        for corr_name, col_a, col_b in _CORRELATION_PAIRS:
            f.write(
                "{},{},{}\n".format(
                    label,
                    corr_name,
                    convert_to_0_if_nan(
                        corr_table[col_a].corr(corr_table[col_b], method="spearman")
                    ),
                )
            )
    for label_str in label_str_file:
        label_str_file[label_str].close()

    plot_heatmap(
        csv_result_dir,
        output_result_dir,
        "correlation_output",
        "correlation.pdf",
        "Correlation",
    )


def plot_reuse_graphs(csv_result_dir, output_result_dir):
    """Plot the reuse-interval/lifetime stacked bars and reuse timeline."""
    plot_stacked_bar_charts(
        csv_result_dir,
        output_result_dir,
        filename_suffix="avg_reuse_interval_naccesses",
        pdf_name="avg_reuse_interval_naccesses.pdf",
        xlabel="",
        ylabel="Percentage of accesses",
        title="Average reuse interval",
        vertical=True,
        x_prefix="< ",
    )
    plot_stacked_bar_charts(
        csv_result_dir,
        output_result_dir,
        filename_suffix="avg_reuse_interval",
        pdf_name="avg_reuse_interval.pdf",
        xlabel="",
        ylabel="Percentage of blocks",
        title="Average reuse interval",
        vertical=True,
        x_prefix="< ",
    )
    plot_stacked_bar_charts(
        csv_result_dir,
        output_result_dir,
        filename_suffix="access_reuse_interval",
        pdf_name="reuse_interval.pdf",
        xlabel="Seconds",
        ylabel="Percentage of accesses",
        title="Reuse interval",
        vertical=True,
        x_prefix="< ",
    )
    plot_stacked_bar_charts(
        csv_result_dir,
        output_result_dir,
        filename_suffix="reuse_lifetime",
        pdf_name="reuse_lifetime.pdf",
        xlabel="Seconds",
        ylabel="Percentage of blocks",
        title="Reuse lifetime",
        vertical=True,
        x_prefix="< ",
    )
    plot_line_charts(
        csv_result_dir,
        output_result_dir,
        filename_prefix="",
        filename_suffix="reuse_blocks_timeline",
        pdf_name="reuse_blocks_timeline.pdf",
        xlabel="",
        ylabel="Percentage of blocks",
        title="Reuse blocks timeline",
        vertical=False,
        legend=False,
    )


def plot_percentage_access_summary(csv_result_dir, output_result_dir):
    """Plot the percentage-of-accesses/blocks summary stacked bars."""
    plot_stacked_bar_charts(
        csv_result_dir,
        output_result_dir,
        filename_suffix="percentage_of_accesses_summary",
        pdf_name="percentage_access.pdf",
        xlabel="",
        ylabel="Percentage of accesses",
        title="",
        vertical=True,
        x_prefix="",
    )
    plot_stacked_bar_charts(
        csv_result_dir,
        output_result_dir,
        filename_suffix="percent_ref_keys",
        pdf_name="percent_ref_keys.pdf",
        xlabel="",
        ylabel="Percentage of blocks",
        title="",
        vertical=True,
        x_prefix="",
    )
    plot_stacked_bar_charts(
        csv_result_dir,
        output_result_dir,
        filename_suffix="percent_data_size_on_ref_keys",
        pdf_name="percent_data_size_on_ref_keys.pdf",
        xlabel="",
        ylabel="Percentage of blocks",
        title="",
        vertical=True,
        x_prefix="",
    )
    plot_stacked_bar_charts(
        csv_result_dir,
        output_result_dir,
        filename_suffix="percent_accesses_on_ref_keys",
        pdf_name="percent_accesses_on_ref_keys.pdf",
        xlabel="",
        ylabel="Percentage of blocks",
        title="",
        vertical=True,
        x_prefix="",
    )


def plot_access_count_summary(csv_result_dir, output_result_dir):
    """Plot the access-count summary bars and the skewness line chart."""
    plot_stacked_bar_charts(
        csv_result_dir,
        output_result_dir,
        filename_suffix="access_count_summary",
        pdf_name="access_count_summary.pdf",
        xlabel="Access count",
        ylabel="Percentage of blocks",
        title="",
        vertical=True,
        x_prefix="< ",
    )
    plot_line_charts(
        csv_result_dir,
        output_result_dir,
        filename_prefix="",
        filename_suffix="skewness",
        pdf_name="skew.pdf",
        xlabel="",
        ylabel="Percentage of accesses",
        title="Skewness",
        vertical=True,
        legend=False,
    )


def plot_miss_ratio_timeline(csv_result_dir, output_result_dir):
    """Plot the hourly miss/miss-ratio and policy timelines."""
    plot_line_charts(
        csv_result_dir,
        output_result_dir,
        filename_prefix="",
        filename_suffix="3600_miss_ratio_timeline",
        pdf_name="miss_ratio_timeline.pdf",
        xlabel="Time",
        ylabel="Miss Ratio (%)",
        title="Miss ratio timeline",
        vertical=False,
        legend=True,
    )
    # Note: the original issued this exact miss-timeline call twice in a row,
    # regenerating and overwriting the same pdf; the duplicate is removed.
    plot_line_charts(
        csv_result_dir,
        output_result_dir,
        filename_prefix="",
        filename_suffix="3600_miss_timeline",
        pdf_name="miss_timeline.pdf",
        xlabel="Time",
        ylabel="# of misses ",
        title="Miss timeline",
        vertical=False,
        legend=True,
    )
    plot_line_charts(
        csv_result_dir,
        output_result_dir,
        filename_prefix="",
        filename_suffix="3600_policy_timeline",
        pdf_name="policy_timeline.pdf",
        xlabel="Time",
        ylabel="# of times a policy is selected ",
        title="Policy timeline",
        vertical=False,
        legend=True,
    )
    plot_line_charts(
        csv_result_dir,
        output_result_dir,
        filename_prefix="",
        filename_suffix="3600_policy_ratio_timeline",
        pdf_name="policy_ratio_timeline.pdf",
        xlabel="Time",
        ylabel="Percentage of times a policy is selected ",
        title="Policy timeline",
        vertical=False,
        legend=True,
    )


if __name__ == "__main__":
    if len(sys.argv) < 3:
        print(
            "Must provide two arguments: \n"
            "1) The directory that saves a list of "
            "directories which contain block cache trace analyzer result files. \n"
            "2) the directory to save plotted graphs. \n"
        )
        sys.exit(1)
    csv_result_dir = sys.argv[1]
    output_result_dir = sys.argv[2]
    print(
        "Processing directory {} and save graphs to {}.".format(
            csv_result_dir, output_result_dir
        )
    )
    # One sub-directory per experiment; graphs are written to a mirrored
    # sub-directory under output_result_dir.
    for csv_relative_dir in os.listdir(csv_result_dir):
        csv_abs_dir = csv_result_dir + "/" + csv_relative_dir
        result_dir = output_result_dir + "/" + csv_relative_dir
        if not os.path.isdir(csv_abs_dir):
            print("{} is not a directory".format(csv_abs_dir))
            continue
        print("Processing experiment dir: {}".format(csv_relative_dir))
        if not os.path.exists(result_dir):
            os.makedirs(result_dir)
        plot_access_count_summary(csv_abs_dir, result_dir)
        plot_timeline(csv_abs_dir, result_dir)
        # Bug fix: use this experiment's directories, consistent with every
        # other call in this loop (the original passed the top-level dirs).
        plot_miss_ratio_timeline(csv_abs_dir, result_dir)
        plot_correlation(csv_abs_dir, result_dir)
        plot_reuse_graphs(csv_abs_dir, result_dir)
        plot_percentage_access_summary(csv_abs_dir, result_dir)
        plot_miss_stats_graphs(
            csv_abs_dir,
            result_dir,
            file_prefix="",
            file_suffix="mrc",
            ylabel="Miss ratio (%)",
            pdf_file_name="mrc",
        )
        plot_miss_stats_diff_lru_graphs(
            csv_abs_dir,
            result_dir,
            file_prefix="",
            file_suffix="mrc",
            ylabel="Miss ratio (%)",
            pdf_file_name="mrc_diff_lru",
        )
        # The following stats are only available in pysim.
        for time_unit in ["1", "60", "3600"]:
            plot_miss_stats_graphs(
                csv_abs_dir,
                result_dir,
                file_prefix="ml_{}_".format(time_unit),
                file_suffix="p95mb",
                ylabel="p95 number of byte miss per {} seconds".format(time_unit),
                pdf_file_name="p95mb_per{}_seconds".format(time_unit),
            )
            plot_miss_stats_graphs(
                csv_abs_dir,
                result_dir,
                file_prefix="ml_{}_".format(time_unit),
                file_suffix="avgmb",
                ylabel="Average number of byte miss per {} seconds".format(time_unit),
                pdf_file_name="avgmb_per{}_seconds".format(time_unit),
            )
            plot_miss_stats_diff_lru_graphs(
                csv_abs_dir,
                result_dir,
                file_prefix="ml_{}_".format(time_unit),
                file_suffix="p95mb",
                ylabel="p95 number of byte miss per {} seconds".format(time_unit),
                pdf_file_name="p95mb_per{}_seconds_diff_lru".format(time_unit),
            )
            plot_miss_stats_diff_lru_graphs(
                csv_abs_dir,
                result_dir,
                file_prefix="ml_{}_".format(time_unit),
                file_suffix="avgmb",
                ylabel="Average number of byte miss per {} seconds".format(time_unit),
                pdf_file_name="avgmb_per{}_seconds_diff_lru".format(time_unit),
            )