1#!/usr/bin/env python3
2import csv
3import math
4import os
5import random
6import sys
7
8import matplotlib
9matplotlib.use("Agg")
10import matplotlib.backends.backend_pdf
11import matplotlib.pyplot as plt
12import numpy as np
13import pandas as pd
14import seaborn as sns
15
16
# Make sure a legend has the same color across all generated graphs.
def get_cmap(n, name="hsv"):
    """Returns a function that maps each index in 0, 1, ..., n-1 to a distinct
    RGB color; the keyword argument name must be a standard mpl colormap name."""
    try:
        # matplotlib >= 3.6: the colormap registry is the supported API;
        # plt.cm.get_cmap was deprecated in 3.7 and removed in 3.9.
        return matplotlib.colormaps[name].resampled(n)
    except AttributeError:
        # Older matplotlib without the registry/resampled API.
        return plt.cm.get_cmap(name, n)
22
23
# Module-level color state: every legend label is assigned a color once and
# keeps it across all generated graphs.
color_index = 0  # next unused slot in `colors`
bar_color_maps = {}  # label -> color, shared by all chart helpers
n_colors = 360
linear_colors = get_cmap(n_colors)
colors = [linear_colors(i) for i in range(n_colors)]
# Shuffle the colors so that adjacent bars in a graph are obvious to differentiate.
random.shuffle(colors)
33
34
def num_to_gb(n):
    """Format a byte count as a gigabyte string.

    Accepts ints, floats, or numeric strings. Exact multiples of 1 GiB are
    rendered without a fractional part (e.g. "2"); everything else keeps
    two decimal places (e.g. "1.50").
    """
    one_gb = 1024 * 1024 * 1024
    # Convert once so numeric strings work in both branches (the original
    # divided the raw argument, which raised TypeError for str input and
    # printed a trailing ".0" for exact multiples under Python 3).
    value = float(n)
    if value % one_gb == 0:
        return "{}".format(int(value // one_gb))
    # Keep two decimal points.
    return "{0:.2f}".format(value / one_gb)
41
42
def plot_miss_stats_graphs(
    csv_result_dir, output_result_dir, file_prefix, file_suffix, ylabel, pdf_file_name
):
    """Plot miss-stat curves (value vs. cache capacity) from csv files.

    Scans csv_result_dir for files matching file_prefix/file_suffix. Each
    csv row is expected as:
        cache_name,num_shard_bits,ghost_capacity,capacity,value
    Rows are grouped into one curve per configuration
    "{cache_name}-{num_shard_bits}-{ghost_capacity}" and plotted with a
    log-2 capacity axis. The figure is written to
    {output_result_dir}/{pdf_file_name}.pdf.

    NOTE(review): every matching file saves to the same pdf path, so the
    last figure (which accumulates curves from all files seen so far)
    wins -- presumably one matching file per directory; confirm with callers.
    """
    miss_ratios = {}
    for file in os.listdir(csv_result_dir):
        if not file.startswith(file_prefix):
            continue
        if not file.endswith(file_suffix):
            continue
        print("Processing file {}/{}".format(csv_result_dir, file))
        mrc_file_path = csv_result_dir + "/" + file
        with open(mrc_file_path, "r") as csvfile:
            rows = csv.reader(csvfile, delimiter=",")
            for row in rows:
                cache_name = row[0]
                num_shard_bits = int(row[1])
                ghost_capacity = int(row[2])
                capacity = int(row[3])
                miss_ratio = float(row[4])
                config = "{}-{}-{}".format(cache_name, num_shard_bits, ghost_capacity)
                if config not in miss_ratios:
                    miss_ratios[config] = {"x": [], "y": []}
                miss_ratios[config]["x"].append(capacity)
                miss_ratios[config]["y"].append(miss_ratio)
        fig = plt.figure()
        for config in miss_ratios:
            plt.plot(
                miss_ratios[config]["x"], miss_ratios[config]["y"], label=config
            )
        plt.xlabel("Cache capacity")
        plt.ylabel(ylabel)
        # "basex"/"ymin" were removed in matplotlib 3.5; "base"/"bottom"
        # are the modern keyword names (available since 3.3/3.0).
        plt.xscale("log", base=2)
        plt.ylim(bottom=0)
        plt.title("{}".format(file))
        plt.legend()
        fig.savefig(
            output_result_dir + "/{}.pdf".format(pdf_file_name), bbox_inches="tight"
        )
        # Close the figure to avoid accumulating open figures across files.
        plt.close(fig)
83
84
def plot_miss_stats_diff_lru_graphs(
    csv_result_dir, output_result_dir, file_prefix, file_suffix, ylabel, pdf_file_name
):
    """Plot each cache configuration's miss stat as a delta against LRU.

    Aggregates rows from all matching csv files (same row format as
    plot_miss_stats_graphs), then for every configuration plots
    value(config) - value(lru-0-0) at each capacity the "lru-0-0"
    baseline was measured at. Does nothing if no baseline data exists.
    The figure is saved to {output_result_dir}/{pdf_file_name}.pdf.
    """
    miss_ratios = {}
    for file in os.listdir(csv_result_dir):
        if not file.startswith(file_prefix):
            continue
        if not file.endswith(file_suffix):
            continue
        print("Processing file {}/{}".format(csv_result_dir, file))
        mrc_file_path = csv_result_dir + "/" + file
        with open(mrc_file_path, "r") as csvfile:
            rows = csv.reader(csvfile, delimiter=",")
            for row in rows:
                cache_name = row[0]
                num_shard_bits = int(row[1])
                ghost_capacity = int(row[2])
                capacity = int(row[3])
                miss_ratio = float(row[4])
                config = "{}-{}-{}".format(cache_name, num_shard_bits, ghost_capacity)
                if config not in miss_ratios:
                    miss_ratios[config] = {"x": [], "y": []}
                miss_ratios[config]["x"].append(capacity)
                miss_ratios[config]["y"].append(miss_ratio)
    lru_key = "lru-0-0"
    if lru_key not in miss_ratios:
        return
    lru = miss_ratios[lru_key]
    fig = plt.figure()
    for config in miss_ratios:
        # Map capacity -> value for this config, keeping the FIRST value
        # when a capacity repeats (matches the original first-match scan,
        # but O(n) instead of O(n^2)).
        lookup = {}
        for cap, val in zip(miss_ratios[config]["x"], miss_ratios[config]["y"]):
            lookup.setdefault(cap, val)
        # A capacity the config never measured plots as 0 difference.
        diffs = [
            lookup[cap] - lru["y"][i] if cap in lookup else 0
            for i, cap in enumerate(lru["x"])
        ]
        plt.plot(lru["x"], diffs, label=config)
    plt.xlabel("Cache capacity")
    plt.ylabel(ylabel)
    # "basex" was removed in matplotlib 3.5; "base" exists since 3.3.
    plt.xscale("log", base=2)
    # NOTE(review): "file" is the last directory entry seen by the loop
    # above -- preserved from the original, though an aggregate title may
    # be clearer.
    plt.title("{}".format(file))
    plt.legend()
    fig.savefig(
        output_result_dir + "/{}.pdf".format(pdf_file_name), bbox_inches="tight"
    )
    # Close the figure so repeated calls do not accumulate open figures.
    plt.close(fig)
132
133
def sanitize(label):
    """Strip leading underscores from a csv label for use as a legend entry.

    matplotlib silently drops legend entries whose label starts with "_",
    so the underscore prefix must be removed. A label containing C++'s
    uint64_t max (used upstream as an "unlimited" sentinel) is rendered
    as "max".
    """
    # lstrip removes the whole run of leading underscores, replacing the
    # original manual counting loop.
    data = label.lstrip("_")
    # The value of uint64_max in c++.
    if "18446744073709551615" in data:
        return "max"
    return data
148
149
# Read the csv file vertically, i.e., group the data by columns.
def read_data_for_plot_vertical(csvfile):
    """Parse a csv where each column (after the first) is one data series.

    The header row supplies the series labels; the first cell of every
    data row is an x-axis value. Returns (x, labels, label_stats) where
    label_stats maps a 0-based column index (excluding the x column) to
    that column's float values.
    """
    data_rows = list(csv.reader(csvfile, delimiter=","))
    header = data_rows[0]
    labels = [sanitize(name) for name in header[1:]]
    label_stats = {i: [] for i in range(len(header) - 1)}
    x = []
    for row in data_rows[1:]:
        x.append(sanitize(row[0]))
        for col, value in enumerate(row[1:]):
            label_stats[col].append(float(value))
    return x, labels, label_stats
170
171
# Read the csv file horizontally, i.e., group the data by rows.
def read_data_for_plot_horizontal(csvfile):
    """Parse a csv where each row (after the header) is one data series.

    The first cell of each data row is its label; the header row (after
    its first cell) supplies the x-axis values. Returns
    (x, labels, label_stats) where label_stats maps a 0-based row index
    (excluding the header) to that row's float values.
    """
    data_rows = list(csv.reader(csvfile, delimiter=","))
    x = [sanitize(value) for value in data_rows[0][1:]]
    labels = [sanitize(row[0]) for row in data_rows[1:]]
    label_stats = {
        i: [float(value) for value in row[1:]]
        for i, row in enumerate(data_rows[1:])
    }
    return x, labels, label_stats
194
195
def read_data_for_plot(csvfile, vertical):
    """Dispatch to the vertical or horizontal csv reader."""
    reader = read_data_for_plot_vertical if vertical else read_data_for_plot_horizontal
    return reader(csvfile)
200
201
def plot_line_charts(
    csv_result_dir,
    output_result_dir,
    filename_prefix,
    filename_suffix,
    pdf_name,
    xlabel,
    ylabel,
    title,
    vertical,
    legend,
):
    """Plot one line-chart page per matching csv file into a single pdf.

    Each file in csv_result_dir matching filename_prefix/filename_suffix
    is parsed via read_data_for_plot and drawn as one figure (one line per
    label) appended to {output_result_dir}/{pdf_name}. Labels keep a
    stable color across all charts via the module-level bar_color_maps.
    """
    global color_index, bar_color_maps, colors
    pdf = matplotlib.backends.backend_pdf.PdfPages(output_result_dir + "/" + pdf_name)
    for file in os.listdir(csv_result_dir):
        if not file.endswith(filename_suffix):
            continue
        if not file.startswith(filename_prefix):
            continue
        print("Processing file {}/{}".format(csv_result_dir, file))
        with open(csv_result_dir + "/" + file, "r") as csvfile:
            x, labels, label_stats = read_data_for_plot(csvfile, vertical)
            if len(x) == 0 or len(labels) == 0:
                continue
            # plot figure
            fig = plt.figure()
            for label_index in label_stats:
                # Assign a unique color to this label.
                if labels[label_index] not in bar_color_maps:
                    bar_color_maps[labels[label_index]] = colors[color_index]
                    color_index += 1
                # The last data point is dropped; presumably it covers a
                # partial time window -- TODO confirm with the csv producer.
                plt.plot(
                    [int(x[i]) for i in range(len(x) - 1)],
                    label_stats[label_index][:-1],
                    label=labels[label_index],
                    color=bar_color_maps[labels[label_index]],
                )

            # Translate time unit into x labels.
            if "_60" in file:
                plt.xlabel("{} (Minute)".format(xlabel))
            if "_3600" in file:
                plt.xlabel("{} (Hour)".format(xlabel))
            plt.ylabel(ylabel)
            plt.title("{} {}".format(title, file))
            if legend:
                plt.legend()
            pdf.savefig(fig)
            # Close the figure; otherwise every processed file leaks an
            # open figure and matplotlib warns after 20 of them.
            plt.close(fig)
    pdf.close()
251
252
def plot_stacked_bar_charts(
    csv_result_dir,
    output_result_dir,
    filename_suffix,
    pdf_name,
    xlabel,
    ylabel,
    title,
    vertical,
    x_prefix,
):
    """Plot one stacked-bar page per matching csv file into a single pdf.

    Each file in csv_result_dir ending in filename_suffix is parsed via
    read_data_for_plot; every label becomes one layer of the stack, drawn
    bottom-up in csv order. x_prefix is prepended to every x tick label
    (e.g. "< " for bucketed histograms). Pages are appended to
    {output_result_dir}/{pdf_name}. Labels keep a stable color across all
    charts via the module-level bar_color_maps.
    """
    global color_index, bar_color_maps, colors
    pdf = matplotlib.backends.backend_pdf.PdfPages(
        "{}/{}".format(output_result_dir, pdf_name)
    )
    for file in os.listdir(csv_result_dir):
        if not file.endswith(filename_suffix):
            continue
        with open(csv_result_dir + "/" + file, "r") as csvfile:
            print("Processing file {}/{}".format(csv_result_dir, file))
            x, labels, label_stats = read_data_for_plot(csvfile, vertical)
            if len(x) == 0 or len(label_stats) == 0:
                continue
            # Plot figure
            fig = plt.figure()
            ind = np.arange(len(x))  # the x locations for the groups
            width = 0.5  # the width of the bars: can also be len(x) sequence
            bars = []
            # Running height of the stack beneath the layer being drawn.
            bottom_bars = [0] * len(label_stats[0])
            for i in range(len(label_stats)):
                # Assign a unique color to this label.
                if labels[i] not in bar_color_maps:
                    bar_color_maps[labels[i]] = colors[color_index]
                    color_index += 1
                p = plt.bar(
                    ind,
                    label_stats[i],
                    width,
                    bottom=bottom_bars,
                    color=bar_color_maps[labels[i]],
                )
                bars.append(p[0])
                for j in range(len(label_stats[i])):
                    bottom_bars[j] += label_stats[i][j]
            plt.xlabel(xlabel)
            plt.ylabel(ylabel)
            plt.xticks(
                ind, [x_prefix + x[i] for i in range(len(x))], rotation=20, fontsize=8
            )
            plt.legend(bars, labels)
            plt.title("{} filename:{}".format(title, file))
            pdf.savefig(fig)
            # Close the figure to avoid leaking open figures across files.
            plt.close(fig)
    pdf.close()
308
309
def plot_heatmap(csv_result_dir, output_result_dir, filename_suffix, pdf_name, title):
    """Draw one annotated heatmap per matching csv file into a single pdf.

    Each csv must have columns label, corr, value (as written by
    plot_correlation); it is pivoted to a label x corr matrix and rendered
    with seaborn into {output_result_dir}/{pdf_name}.
    """
    pdf = matplotlib.backends.backend_pdf.PdfPages(
        "{}/{}".format(output_result_dir, pdf_name)
    )
    for file in os.listdir(csv_result_dir):
        if not file.endswith(filename_suffix):
            continue
        csv_file_name = "{}/{}".format(csv_result_dir, file)
        print("Processing file {}/{}".format(csv_result_dir, file))
        corr_table = pd.read_csv(csv_file_name)
        # Positional DataFrame.pivot arguments were removed in pandas 2.0;
        # keyword arguments work on all supported versions.
        corr_table = corr_table.pivot(index="label", columns="corr", values="value")
        fig = plt.figure()
        sns.heatmap(corr_table, annot=True, linewidths=0.5, fmt=".2")
        plt.title("{} filename:{}".format(title, file))
        pdf.savefig(fig)
        # Close the figure to avoid accumulating open figures across files.
        plt.close(fig)
    pdf.close()
326
327
def plot_timeline(csv_result_dir, output_result_dir):
    """Plot the access timeline (throughput over time, one line per label)."""
    opts = {
        "filename_prefix": "",
        "filename_suffix": "access_timeline",
        "pdf_name": "access_time.pdf",
        "xlabel": "Time",
        "ylabel": "Throughput",
        "title": "Access timeline with group by label",
        "vertical": False,
        "legend": True,
    }
    plot_line_charts(csv_result_dir, output_result_dir, **opts)
341
342
def convert_to_0_if_nan(n):
    """Return 0.0 in place of NaN; pass every other value through unchanged."""
    return 0.0 if math.isnan(n) else n
347
348
def plot_correlation(csv_result_dir, output_result_dir):
    """Compute Spearman correlations from *_correlation_input csv files and
    render them as heatmaps.

    Input files are named "{label_str}_{label}_correlation_input". For each
    input, six Spearman correlations are computed between access-history
    columns and next-access columns; NaN correlations are written as 0.
    Rows ("label,corr,value") are appended to a per-label_str output file
    "{csv_result_dir}/{label_str}_correlation_output" (shared across
    inputs), which plot_heatmap then renders into correlation.pdf.
    """
    # (short tag, first column, second column) for every reported pair.
    # This table replaces six copy-pasted write blocks.
    correlation_pairs = [
        ("LA+A", "num_accesses_since_last_access", "num_accesses_till_next_access"),
        ("PA+A", "num_past_accesses", "num_accesses_till_next_access"),
        ("LT+A", "elapsed_time_since_last_access", "num_accesses_till_next_access"),
        ("LA+T", "num_accesses_since_last_access", "elapsed_time_till_next_access"),
        ("LT+T", "elapsed_time_since_last_access", "elapsed_time_till_next_access"),
        ("PA+T", "num_past_accesses", "elapsed_time_till_next_access"),
    ]
    # Processing the correlation input first.
    label_str_file = {}
    for file in os.listdir(csv_result_dir):
        if not file.endswith("correlation_input"):
            continue
        csv_file_name = "{}/{}".format(csv_result_dir, file)
        print("Processing file {}/{}".format(csv_result_dir, file))
        corr_table = pd.read_csv(csv_file_name)
        # File name layout: "{label_str}_{label}_correlation_input".
        label_str = file.split("_")[0]
        label = file[len(label_str) + 1 :]
        label = label[: len(label) - len("_correlation_input")]

        output_file = "{}/{}_correlation_output".format(csv_result_dir, label_str)
        if output_file not in label_str_file:
            # One shared output file per label_str; closed at the end.
            f = open(output_file, "w+")
            label_str_file[output_file] = f
            f.write("label,corr,value\n")
        f = label_str_file[output_file]
        for tag, col_a, col_b in correlation_pairs:
            value = convert_to_0_if_nan(
                corr_table[col_a].corr(corr_table[col_b], method="spearman")
            )
            f.write("{},{},{}\n".format(label, tag, value))
    for label_str in label_str_file:
        label_str_file[label_str].close()

    plot_heatmap(
        csv_result_dir,
        output_result_dir,
        "correlation_output",
        "correlation.pdf",
        "Correlation",
    )
444
445
def plot_reuse_graphs(csv_result_dir, output_result_dir):
    """Plot reuse-interval/lifetime distributions and the reuse timeline."""
    # (filename_suffix, pdf_name, xlabel, ylabel, title) for each stacked
    # bar chart; all of them use vertical=True and an "< " x prefix.
    stacked_charts = [
        (
            "avg_reuse_interval_naccesses",
            "avg_reuse_interval_naccesses.pdf",
            "",
            "Percentage of accesses",
            "Average reuse interval",
        ),
        (
            "avg_reuse_interval",
            "avg_reuse_interval.pdf",
            "",
            "Percentage of blocks",
            "Average reuse interval",
        ),
        (
            "access_reuse_interval",
            "reuse_interval.pdf",
            "Seconds",
            "Percentage of accesses",
            "Reuse interval",
        ),
        (
            "reuse_lifetime",
            "reuse_lifetime.pdf",
            "Seconds",
            "Percentage of blocks",
            "Reuse lifetime",
        ),
    ]
    for suffix, pdf_name, xlabel, ylabel, title in stacked_charts:
        plot_stacked_bar_charts(
            csv_result_dir,
            output_result_dir,
            filename_suffix=suffix,
            pdf_name=pdf_name,
            xlabel=xlabel,
            ylabel=ylabel,
            title=title,
            vertical=True,
            x_prefix="< ",
        )
    plot_line_charts(
        csv_result_dir,
        output_result_dir,
        filename_prefix="",
        filename_suffix="reuse_blocks_timeline",
        pdf_name="reuse_blocks_timeline.pdf",
        xlabel="",
        ylabel="Percentage of blocks",
        title="Reuse blocks timeline",
        vertical=False,
        legend=False,
    )
503
504
def plot_percentage_access_summary(csv_result_dir, output_result_dir):
    """Plot access/reference-key percentage summaries as stacked bars."""
    # (filename_suffix, pdf_name, ylabel); every chart shares empty
    # xlabel/title/x_prefix and vertical=True.
    charts = [
        (
            "percentage_of_accesses_summary",
            "percentage_access.pdf",
            "Percentage of accesses",
        ),
        ("percent_ref_keys", "percent_ref_keys.pdf", "Percentage of blocks"),
        (
            "percent_data_size_on_ref_keys",
            "percent_data_size_on_ref_keys.pdf",
            "Percentage of blocks",
        ),
        (
            "percent_accesses_on_ref_keys",
            "percent_accesses_on_ref_keys.pdf",
            "Percentage of blocks",
        ),
    ]
    for suffix, pdf_name, ylabel in charts:
        plot_stacked_bar_charts(
            csv_result_dir,
            output_result_dir,
            filename_suffix=suffix,
            pdf_name=pdf_name,
            xlabel="",
            ylabel=ylabel,
            title="",
            vertical=True,
            x_prefix="",
        )
550
551
def plot_access_count_summary(csv_result_dir, output_result_dir):
    """Plot the access-count distribution and the access skewness charts."""
    bar_opts = {
        "filename_suffix": "access_count_summary",
        "pdf_name": "access_count_summary.pdf",
        "xlabel": "Access count",
        "ylabel": "Percentage of blocks",
        "title": "",
        "vertical": True,
        "x_prefix": "< ",
    }
    plot_stacked_bar_charts(csv_result_dir, output_result_dir, **bar_opts)
    line_opts = {
        "filename_prefix": "",
        "filename_suffix": "skewness",
        "pdf_name": "skew.pdf",
        "xlabel": "",
        "ylabel": "Percentage of accesses",
        "title": "Skewness",
        "vertical": True,
        "legend": False,
    }
    plot_line_charts(csv_result_dir, output_result_dir, **line_opts)
576
577
def plot_miss_ratio_timeline(csv_result_dir, output_result_dir):
    """Plot hourly miss-ratio, miss-count, and policy-selection timelines.

    One pdf per chart; every chart is a line chart over time with a legend.
    The original emitted the "3600_miss_timeline" chart twice back to back
    (writing the same pdf twice); the duplicate call is removed here.
    """
    # (filename_suffix, pdf_name, ylabel, title) for each timeline chart.
    charts = [
        (
            "3600_miss_ratio_timeline",
            "miss_ratio_timeline.pdf",
            "Miss Ratio (%)",
            "Miss ratio timeline",
        ),
        ("3600_miss_timeline", "miss_timeline.pdf", "# of misses ", "Miss timeline"),
        (
            "3600_policy_timeline",
            "policy_timeline.pdf",
            "# of times a policy is selected ",
            "Policy timeline",
        ),
        (
            "3600_policy_ratio_timeline",
            "policy_ratio_timeline.pdf",
            "Percentage of times a policy is selected ",
            "Policy timeline",
        ),
    ]
    for suffix, pdf_name, ylabel, title in charts:
        plot_line_charts(
            csv_result_dir,
            output_result_dir,
            filename_prefix="",
            filename_suffix=suffix,
            pdf_name=pdf_name,
            xlabel="Time",
            ylabel=ylabel,
            title=title,
            vertical=False,
            legend=True,
        )
639
640
if __name__ == "__main__":
    # Entry point: walk every experiment subdirectory of argv[1] and write
    # one directory of graphs per experiment under argv[2].
    if len(sys.argv) < 3:
        print(
            "Must provide two arguments: \n"
            "1) The directory that saves a list of "
            "directories which contain block cache trace analyzer result files. \n"
            "2) the directory to save plotted graphs. \n"
        )
        # sys.exit is always available; the builtin exit() is injected by
        # the site module and may be missing (e.g. under python -S).
        sys.exit(1)
    csv_result_dir = sys.argv[1]
    output_result_dir = sys.argv[2]
    print(
        "Processing directory {} and save graphs to {}.".format(
            csv_result_dir, output_result_dir
        )
    )
    for csv_relative_dir in os.listdir(csv_result_dir):
        csv_abs_dir = csv_result_dir + "/" + csv_relative_dir
        result_dir = output_result_dir + "/" + csv_relative_dir
        if not os.path.isdir(csv_abs_dir):
            print("{} is not a directory".format(csv_abs_dir))
            continue
        print("Processing experiment dir: {}".format(csv_relative_dir))
        if not os.path.exists(result_dir):
            os.makedirs(result_dir)
        plot_access_count_summary(csv_abs_dir, result_dir)
        plot_timeline(csv_abs_dir, result_dir)
        # Bug fix: plot this experiment's timelines from its own directory.
        # The original passed the top-level csv_result_dir/output_result_dir
        # (which contain only subdirectories, so no csv files matched).
        plot_miss_ratio_timeline(csv_abs_dir, result_dir)
        plot_correlation(csv_abs_dir, result_dir)
        plot_reuse_graphs(csv_abs_dir, result_dir)
        plot_percentage_access_summary(csv_abs_dir, result_dir)
        plot_miss_stats_graphs(
            csv_abs_dir,
            result_dir,
            file_prefix="",
            file_suffix="mrc",
            ylabel="Miss ratio (%)",
            pdf_file_name="mrc",
        )
        plot_miss_stats_diff_lru_graphs(
            csv_abs_dir,
            result_dir,
            file_prefix="",
            file_suffix="mrc",
            ylabel="Miss ratio (%)",
            pdf_file_name="mrc_diff_lru",
        )
        # The following stats are only available in pysim.
        for time_unit in ["1", "60", "3600"]:
            plot_miss_stats_graphs(
                csv_abs_dir,
                result_dir,
                file_prefix="ml_{}_".format(time_unit),
                file_suffix="p95mb",
                ylabel="p95 number of byte miss per {} seconds".format(time_unit),
                pdf_file_name="p95mb_per{}_seconds".format(time_unit),
            )
            plot_miss_stats_graphs(
                csv_abs_dir,
                result_dir,
                file_prefix="ml_{}_".format(time_unit),
                file_suffix="avgmb",
                ylabel="Average number of byte miss per {} seconds".format(time_unit),
                pdf_file_name="avgmb_per{}_seconds".format(time_unit),
            )
            plot_miss_stats_diff_lru_graphs(
                csv_abs_dir,
                result_dir,
                file_prefix="ml_{}_".format(time_unit),
                file_suffix="p95mb",
                ylabel="p95 number of byte miss per {} seconds".format(time_unit),
                pdf_file_name="p95mb_per{}_seconds_diff_lru".format(time_unit),
            )
            plot_miss_stats_diff_lru_graphs(
                csv_abs_dir,
                result_dir,
                file_prefix="ml_{}_".format(time_unit),
                file_suffix="avgmb",
                ylabel="Average number of byte miss per {} seconds".format(time_unit),
                pdf_file_name="avgmb_per{}_seconds_diff_lru".format(time_unit),
            )
722