stream-join/plot_workers.py at master · ouyangyuchen/stream-join · GitHub

1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
import pandas as pd
import matplotlib.pyplot as plt
import argparse
import sys


def _average_successful_throughput(df, csv_filepath):
    """
    Reduces raw benchmark rows to mean throughput per configuration.

    Keeps only rows whose Status is "SUCCESS", coerces Workers and
    Throughput_tuples_s to numbers (dropping rows where that fails), and
    averages throughput per (JoinerType, IndexType, Workers).

    Exits with status 1 if a required column is missing, and with status 0
    if there is nothing left to plot.

    Args:
        df (pandas.DataFrame): The raw benchmark data.
        csv_filepath (str): The input path, used only in error messages.

    Returns:
        pandas.DataFrame: One row per (JoinerType, IndexType, Workers) with
        the mean Throughput_tuples_s.
    """
    required_columns = (
        "Status",
        "JoinerType",
        "IndexType",
        "Workers",
        "Throughput_tuples_s",
    )
    missing = [col for col in required_columns if col not in df.columns]
    if missing:
        # Fail with a readable message instead of a KeyError traceback.
        print(
            f"Error: Input file '{csv_filepath}' is missing required "
            f"column(s): {', '.join(missing)}",
            file=sys.stderr,
        )
        sys.exit(1)

    df_success = df[df["Status"] == "SUCCESS"].copy()
    print(f"Found {len(df_success)} successful runs.")

    if df_success.empty:
        print(
            "No successful runs found in the data. Cannot generate plot.",
            file=sys.stderr,
        )
        sys.exit(0)

    numeric_columns = ["Workers", "Throughput_tuples_s"]
    for column in numeric_columns:
        df_success[column] = pd.to_numeric(df_success[column], errors="coerce")
    df_success = df_success.dropna(subset=numeric_columns)

    df_avg = df_success.groupby(["JoinerType", "IndexType", "Workers"], as_index=False)[
        "Throughput_tuples_s"
    ].mean()

    # Coercion/dropna (or missing group keys) may have removed every row;
    # re-check here so the plotting code never receives an empty frame.
    if df_avg.empty:
        print(
            "No successful runs with valid numeric Workers and "
            "Throughput_tuples_s values. Cannot generate plot.",
            file=sys.stderr,
        )
        sys.exit(0)

    print("Calculated average throughputs.")
    return df_avg


def plot_benchmark_results(csv_filepath, output_filepath):
    """
    Reads benchmark data from a CSV and plots average throughput vs. workers.

    One line is drawn per (JoinerType, IndexType) pair: the color encodes the
    joiner and the line style encodes the index. The figure is saved to
    output_filepath and then shown.

    The process exits with status 1 when the CSV cannot be read or lacks a
    required column, and with status 0 when it contains no plottable
    successful runs.

    Args:
        csv_filepath (str): The path to the input CSV file.
        output_filepath (str): The path to save the output plot image.
    """
    try:
        # Read the CSV file into a pandas DataFrame
        df = pd.read_csv(csv_filepath)
        print(f"Successfully loaded {csv_filepath}. Found {len(df)} rows.")
    except FileNotFoundError:
        print(f"Error: Input file not found at '{csv_filepath}'", file=sys.stderr)
        sys.exit(1)
    except pd.errors.EmptyDataError:
        print(f"Error: Input file '{csv_filepath}' is empty.", file=sys.stderr)
        sys.exit(1)
    except Exception as e:
        print(f"Error reading CSV file '{csv_filepath}': {e}", file=sys.stderr)
        sys.exit(1)

    # --- Data Preparation ---
    df_avg = _average_successful_throughput(df, csv_filepath)

    # --- Plotting Setup ---
    joiners = sorted(df_avg["JoinerType"].unique())
    indexes = sorted(df_avg["IndexType"].unique())
    # pyplot.get_cmap is used because matplotlib.cm.get_cmap (plt.cm.get_cmap)
    # was removed in matplotlib 3.9.
    colors = plt.get_cmap("tab10", len(joiners))
    color_map = {joiner: colors(i) for i, joiner in enumerate(joiners)}
    style_map = {
        "bplustree": "-.",
        "alex": "--",
        "pgm": ":",
        "list": "-",
    }
    # Index types without a fixed style cycle through the default styles.
    default_styles = ["-", "--", ":", "-."]
    for i, idx in enumerate(indexes):
        if idx not in style_map:
            style_map[idx] = default_styles[i % len(default_styles)]

    # --- Create the Plot ---
    # The seaborn styles were renamed to "seaborn-v0_8-*" in matplotlib 3.6;
    # fall back to the legacy name, or keep the default style if neither exists.
    for style_name in ("seaborn-v0_8-whitegrid", "seaborn-whitegrid"):
        if style_name in plt.style.available:
            plt.style.use(style_name)
            break
    fig, ax = plt.subplots(figsize=(12, 8))

    for (joiner, index), group in df_avg.groupby(["JoinerType", "IndexType"]):
        group = group.sort_values("Workers")
        ax.plot(
            group["Workers"],
            group["Throughput_tuples_s"],
            label=f"{joiner} / {index}",
            color=color_map.get(joiner),
            linestyle=style_map.get(index),
            marker="o",
            markersize=6,
            linewidth=1.5,
        )

    # --- Customize the Plot ---
    ax.set_title("Average Throughput vs. Number of Workers", fontsize=16)
    ax.set_xlabel("Number of Workers", fontsize=12)
    ax.set_ylabel("Average Throughput (tuples/s)", fontsize=12)
    worker_ticks = sorted(df_avg["Workers"].unique())
    ax.set_xticks(worker_ticks)
    ax.set_xticklabels([int(w) for w in worker_ticks])
    # The legend sits outside the axes, to the right of the plot area.
    ax.legend(
        title="Joiner / Index",
        bbox_to_anchor=(1.04, 1),
        loc="upper left",
        borderaxespad=0.0,
    )
    # Reserve the right 20% of the figure for the external legend.
    fig.tight_layout(rect=[0, 0, 0.8, 1])
    ax.grid(True, which="both", linestyle="--", linewidth=0.5, alpha=0.7)
    ax.set_ylim(bottom=0)

    # --- Show and Save ---
    try:
        # Use the provided output_filepath
        plt.savefig(output_filepath, bbox_inches="tight")
        print(f"Plot saved to '{output_filepath}'")
    except Exception as e:
        # Best effort: a failed save is reported but the plot is still shown.
        print(f"Error saving plot to '{output_filepath}': {e}", file=sys.stderr)

    plt.show()


def main():
    """Command-line entry point: read the arguments, then draw the plot."""
    summary = (
        "Plot benchmark results from a CSV file. "
        "Shows Average Throughput vs. Workers, styled by Joiner and Index type."
    )
    default_output = "throughput_vs_workers.png"

    cli = argparse.ArgumentParser(description=summary)

    # Required positional: where the benchmark results live.
    cli.add_argument(
        "csv_file",
        help="Path to the input CSV file generated by the benchmark script.",
    )

    # Optional: destination image, exposed to the code as `output_file`.
    output_options = {
        "dest": "output_file",
        "default": default_output,
        "help": "Path to save the output plot image (default: throughput_vs_workers.png).",
    }
    cli.add_argument("-o", "--output", **output_options)

    options = cli.parse_args()
    plot_benchmark_results(options.csv_file, options.output_file)


# Run the command-line interface only when this file is executed as a script,
# not when it is imported as a module.
if __name__ == "__main__":
    main()