# Source: anemoi-dataloader-microbenchmark/plot.py (branch: main), repository cbovalo/anemoi-dataloader-microbenchmark on GitHub.
import matplotlib.pyplot as plt
import pandas as pd
import seaborn as sns
import os
import itertools
import textwrap

def get_varying_columns(grouped, columns):
    """Return the columns of *grouped* that hold more than one distinct value.

    Args:
        grouped: Table-like object where ``grouped[name].unique()`` yields the
            distinct values of column ``name`` (e.g. a pandas DataFrame).
        columns: Column names to inspect, in the order they should be reported.

    Returns:
        A list of ``(column_name, [distinct values])`` tuples, one per column
        with at least two distinct values. Columns with a single value are
        left out.
    """
    distinct_per_column = ((name, grouped[name].unique()) for name in columns)
    return [
        (name, list(distinct))
        for name, distinct in distinct_per_column
        if len(distinct) > 1
    ]

def generate_config_strings(varying_values):
    """Build one multi-line label per combination of the varying settings.

    Args:
        varying_values: Iterable of ``(key, values)`` pairs, e.g. the output of
            ``get_varying_columns``. ``values`` is the list of distinct values
            seen for ``key``.

    Returns:
        A list of strings, one per element of the Cartesian product of all
        ``values`` lists (last key varying fastest). Each string joins its
        settings with newlines as ``key=value``; values of the ``"res-ds"`` key
        are emitted as-is, without the ``key=`` prefix.

        When *varying_values* is empty the result is ``[""]``: there is exactly
        one (unlabelled) configuration. Previously this case raised
        ``ValueError: not enough values to unpack (expected 2, got 0)``.
    """
    pairs = list(varying_values)
    keys = [key for key, _ in pairs]
    value_lists = [values for _, values in pairs]

    def format_setting(key, val):
        # 'res-ds' values are already self-describing, so no 'key=' prefix.
        if key == "res-ds":
            return val
        return f"{key}={str(val)}"

    # itertools.product() of zero iterables yields a single empty combination,
    # which becomes the single empty label for the "nothing varies" case.
    return [
        "\n".join(format_setting(key, val) for key, val in zip(keys, combo))
        for combo in itertools.product(*value_lists)
    ]

def add_ds_index(varying_cols, dataset_df):
    """Annotate the current figure with a key explaining the 'ds<ID>' labels.

    Does nothing unless ``"res-ds"`` is one of the varying columns. Otherwise
    it writes one ``<res>-ds<ID>=<dataset>`` line per label near the bottom of
    the current pyplot figure and enlarges the bottom margin to make room.

    Args:
        varying_cols: Sequence of ``(column_name, [distinct values])`` entries,
            as returned by ``get_varying_columns``.
        dataset_df: DataFrame with ``datasetID`` and ``dataset`` columns used
            to translate an ID back to the dataset it stands for. Only read
            when ``"res-ds"`` varies.

    Returns:
        None.
    """
    ds_labels = next((col[1] for col in varying_cols if col[0] == "res-ds"), None)
    if ds_labels is None:
        return

    index_lines = []
    for entry in ds_labels:
        # Labels have the form '<res>-ds<ID>'. Split on the *last* '-ds' so a
        # resolution string that happens to contain 'ds' cannot corrupt the ID.
        dataset_id = int(entry.rsplit("-ds", 1)[-1])
        matches = dataset_df.loc[dataset_df["datasetID"] == dataset_id, "dataset"]
        index_lines.append(f"{entry}={matches.iloc[0]}")

    # Wrap very long dataset names so a single entry cannot run off the figure.
    text_str = "\n".join(textwrap.fill(str(line), width=200) for line in index_lines)

    # Place the key at the bottom centre of the figure.
    plt.annotate(
        text_str,
        xy=(0.5, 0.05),
        xycoords='figure fraction',
        ha="center",
        fontsize=8,
        va="bottom",
        bbox={"facecolor": "white", "alpha": 0.5, "pad": 5},
    )

    # Reserve space below the axes so the key is not drawn over the plot.
    plt.subplots_adjust(bottom=0.3)

# TODO: Replace res-ds with a global ds.
# NOTE(review): an unexplained note "size of the input batch in MB" was left here; confirm what it refers to.
def _config_labels(grouped, varying_names):
    """Return one x-axis label per row of *grouped*, built from *varying_names*."""

    def describe(key, val):
        # 'res-ds' values are already self-describing, so no 'key=' prefix.
        if key == "res-ds":
            return val
        return f"{key}={str(val)}"

    value_columns = [grouped[name].tolist() for name in varying_names]
    # With no varying column every row gets an empty label, so a single
    # configuration still produces a (single, unlabelled) bar.
    return [
        "\n".join(describe(key, column[row]) for key, column in zip(varying_names, value_columns))
        for row in range(len(grouped))
    ]


def plot_anemoi_dataloader_benchmark(csv, show_plot=True, outdir="out", outname="out", header="", plot_iter_per_s=False):
    """Plot per-process dataloader throughput for each benchmarked configuration.

    Reads the benchmark CSV, averages all measurements that share the same
    configuration, and saves a bar chart with one bar per configuration to
    ``{outdir}/{outname}.png``. Bars are labelled only with the settings that
    differ between configurations.

    Args:
        csv: Path to the results CSV. It must contain the columns ``dataset``,
            ``res``, ``rollout``, ``batch_size``, ``num_workers``,
            ``prefetch_factor``, ``pin_memory``, ``num_procs`` and
            ``proc-throughput(byte/s)``; ``input_batch_per_proc(bytes)`` is
            also needed when *plot_iter_per_s* is true.
        show_plot: Accepted for interface compatibility; currently not used
            (the figure is saved but never shown, and is left open).
        outdir: Directory the PNG is written to; created if missing.
        outname: File name of the PNG, without extension.
        header: Optional text appended to the plot title.
        plot_iter_per_s: If true, plot throughput divided by the per-process
            input batch size (iterations/s) instead of MB/s.

    Returns:
        None.
    """
    config_columns = ["res-ds", "rollout", "batch_size", "num_workers", "prefetch_factor", "pin_memory", "num_procs"]

    df = pd.read_csv(csv)

    # Changing the resolution also changes the dataset, so the two are fused
    # into a single 'res-ds' setting. Dataset paths are too long to plot, so
    # each one is replaced by a numeric ID; dataset_df keeps the ID -> path
    # lookup for the key drawn under the plot.
    df['datasetID'] = pd.factorize(df['dataset'])[0]
    dataset_df = df.filter(["datasetID", "dataset"], axis=1).drop_duplicates()
    df['res-ds'] = df.apply(lambda row: f"{row.res}-ds{row.datasetID}", axis=1)
    df = df.drop(['dataset', 'datasetID', 'res'], axis=1)

    # Average repeated measurements of the same configuration.
    grouped = df.groupby(config_columns, as_index=False).mean()
    varying_cols = get_varying_columns(grouped, config_columns)

    # Labels are derived from the rows that are actually plotted, so their
    # number and order always match the bars, even when the sweep does not
    # cover every combination of the varying settings or nothing varies.
    x = _config_labels(grouped, [col[0] for col in varying_cols])

    if plot_iter_per_s:
        y = grouped["proc-throughput(byte/s)"] / grouped['input_batch_per_proc(bytes)']
        y_label = "Throughput (iter/s)"
        title = "Per-GPU Throughput upper-bound from filesystem"
    else:
        y = grouped["proc-throughput(byte/s)"] / 1024 / 1024
        y_label = "Throughput (MB/s)"
        title = "Dataloader throughput per 'GPU'"
    if header != "":
        title = f"{title} - {header}"

    plt.figure(figsize=(8, 5))
    plt.bar(x, y, color="royalblue")

    # Add a key at the bottom of the plot explaining what 'ds*' means.
    add_ds_index(varying_cols=varying_cols, dataset_df=dataset_df)

    plt.xlabel("Config")
    plt.ylabel(y_label)
    plt.title(title)

    # savefig does not create missing directories.
    if outdir:
        os.makedirs(outdir, exist_ok=True)
    print(f"Saving plot to {outdir}/{outname}.png")
    plt.savefig(f"{outdir}/{outname}.png")

#plot_anemoi_dataloader_benchmark("dev.csv")

def plot_mem_monitor(csv, show_plot=True, outdir="out", filename_prefix=""):
    """Plot memory usage over time from a memory-monitor CSV.

    One line is drawn per ``pname``/``pid`` pair. Rows whose process name
    starts with ``"system"`` contribute three lines (``used_mem``,
    ``dl_used_mem`` and ``total_mem``); every other process contributes its
    ``rss``. The figure is saved to ``{outdir}/{name}.png``, where ``name`` is
    the CSV's base name, optionally prefixed with ``filename_prefix``.

    Args:
        csv: Path to the CSV. It must contain the columns ``pname``, ``pid``,
            ``time``, ``rss``, ``used_mem``, ``dl_used_mem`` and ``total_mem``.
            Memory values are divided by 1024**3 for plotting; ``time`` is
            presumably in seconds (the axis is labelled so; confirm with the
            producer of the file).
        show_plot: If true, call ``plt.show()`` after saving. If false, the
            figure is closed after saving so repeated calls do not accumulate
            open figures.
        outdir: Directory the PNG is written to; created if missing.
        filename_prefix: Optional prefix for the output file name and title.

    Returns:
        None.
    """
    filename = os.path.basename(csv)
    if filename_prefix != "":
        filename = f"{filename_prefix}-{filename}"

    df = pd.read_csv(csv)
    df["pname-pid"] = df["pname"] + "-" + df["pid"].astype(str)
    # Time is plotted relative to the first sample in the file.
    df["elapsed-time"] = df["time"] - df["time"].iloc[0]

    # One colour per process.
    unique_processes = df["pname-pid"].unique()
    palette = sns.color_palette("tab10", len(unique_processes))
    color_map = dict(zip(unique_processes, palette))

    fig = plt.figure(figsize=(12, 6))
    GB = 1024 * 1024 * 1024
    # Only RSS is plotted per process: the original author noted that
    # rss/pss/uss were all the same and nothing was shared.
    dataloader_labelled = False
    for process in unique_processes:
        subset = df[df["pname-pid"] == process]
        elapsed = subset["elapsed-time"]
        color = color_map[process]
        if process.startswith("system"):
            plt.plot(elapsed, subset["used_mem"] / GB, label="used_mem", color=color, linestyle="-")
            plt.plot(elapsed, subset["dl_used_mem"] / GB, label="DL RSS mem sum", color=color, linestyle=":")
            plt.plot(elapsed, subset["total_mem"] / GB, label="total_mem", color=color, linestyle="--")
        else:
            # Only the first dataloader gets a legend entry; the rest use an
            # empty label to prevent legend spam with many dataloaders.
            label = "" if dataloader_labelled else "Dataloader - RSS"
            dataloader_labelled = True
            plt.plot(elapsed, subset["rss"] / GB, label=label, color=color, linestyle="-")

    plt.xlabel("Time (s)")
    plt.ylabel("CPU Memory (GB)")
    plt.title(f"Memory Usage Over Time - {filename}")
    plt.legend(loc="upper left")
    plt.grid()

    # savefig does not create missing directories.
    if outdir:
        os.makedirs(outdir, exist_ok=True)
    out_file = f"{outdir}/{filename}.png"
    print(f"Plotting memory usage to {out_file}")
    plt.savefig(f"{out_file}")
    if show_plot:
        plt.show()
    else:
        # Nothing will display this figure, so release it.
        plt.close(fig)

if __name__ == "__main__":
    # Script entry point: plot the iterations/s view of the results in dev.csv.
    plot_anemoi_dataloader_benchmark(csv="dev.csv", plot_iter_per_s=True)