ChartScope/tools/data/generate_chart_image.py at main · davidhalladay/ChartScope · GitHub

1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
import requests
import json
import random
import os
from pathlib import Path
from tqdm import tqdm
import subprocess
from subprocess import STDOUT, check_output
from joblib import Parallel, delayed
import threading

def save_error_outputs(error_message, output_dir_path):
    """Append ``error_message`` to the log file at ``output_dir_path``.

    Args:
        error_message: Text to write. It is written verbatim; no newline
            or separator is added.
        output_dir_path: Path of the log *file* (despite the name, this is
            passed straight to ``open``). The file is created if missing
            and appended to otherwise.
    """
    # Explicit UTF-8: the text may contain non-ASCII characters (e.g. from a
    # traceback), and the platform default encoding is not guaranteed to be
    # able to represent them.
    with open(output_dir_path, 'a', encoding='utf-8') as log_file:
        log_file.write(error_message)

def run_command(cmd, error_count, output_dir_name, chart_type, image_name):
    """Run ``cmd`` as a subprocess and log its output if it fails.

    Args:
        cmd: Argument list handed to ``subprocess.check_output``.
        error_count: List used as a failure tally; ``1`` is appended to it
            for every failed command.
        output_dir_name: Root directory for error logs.
        chart_type: Sub-directory of ``output_dir_name`` that receives the
            log; it is created if it does not exist.
        image_name: Image file name; ``.png`` is replaced by ``.txt`` to
            derive the log file name.

    Returns:
        ``True`` if running the command raised (non-zero exit status,
        10-second timeout, failure to start, ...), ``False`` otherwise.
    """
    log_dir = Path(os.path.join(output_dir_name, chart_type))
    log_dir.mkdir(parents=True, exist_ok=True)

    try:
        # stderr is merged into stdout so a failure's traceback ends up in
        # the exception's ``output`` attribute.
        check_output(cmd, stderr=STDOUT, timeout=10)
    except Exception as e:
        # Deliberately broad: any failure of the child command is recorded
        # and reported to the caller instead of aborting the batch.
        error_count.append(1)
        log_path = os.path.join(output_dir_name, chart_type, image_name.replace('.png', '.txt'))
        captured = getattr(e, 'output', None)
        if captured is not None:
            # errors="replace": the child may emit bytes that are not valid
            # UTF-8, and a decode error here would escape the handler.
            message = captured.decode("utf-8", errors="replace")
        else:
            message = str(e)
        save_error_outputs(message, log_path)
        return True
    return False

def generate_chart_image(chart_type, exp_path, chart_img_output_dir, error_output_dir, sample_json_size=None, sample_script_size=None):
    """Render each (JSON file, plotting script) pair of one chart type.

    For every file in ``<exp_path>/<chart_type>/json`` and every script in
    ``<exp_path>/<chart_type>/code`` this runs
    ``python3 <script> <json> <image>`` through ``run_command``, targeting
    ``<chart_img_output_dir>/<chart_type>/chart/<json stem>_<script stem>.png``.
    Images that already exist are not regenerated. The final error rate is
    printed and appended to ``<chart_img_output_dir>/error_rate.txt``.

    Args:
        chart_type: Name of the chart-type sub-directory to process.
        exp_path: Root directory holding the per-chart-type ``json`` and
            ``code`` folders.
        chart_img_output_dir: Root directory for generated images and the
            ``error_rate.txt`` summary.
        error_output_dir: Root directory passed to ``run_command`` for
            error logs.
        sample_json_size: If not ``None``, number of JSON files to sample
            at random (capped at the number available).
        sample_script_size: If not ``None``, number of scripts to sample at
            random (capped at the number available).
    """
    error_count = []
    # Starts at 1 so the error-rate divisions below can never divide by zero.
    total_count = 1

    # create a directory to save the generated chart images
    chart_output_dir_path = os.path.join(chart_img_output_dir, chart_type, 'chart')
    Path(chart_output_dir_path).mkdir(parents=True, exist_ok=True)

    json_dir = os.path.join(exp_path, chart_type, 'json')
    code_dir = os.path.join(exp_path, chart_type, 'code')

    json_files = sorted(os.listdir(json_dir))
    if sample_json_size is not None:
        # random.sample raises ValueError when k exceeds the population,
        # so cap the request at what is actually available.
        json_files = random.sample(json_files, k=min(sample_json_size, len(json_files)))
    python_scripts = sorted(os.listdir(code_dir))
    if sample_script_size is not None:
        python_scripts = random.sample(python_scripts, k=min(sample_script_size, len(python_scripts)))

    # A script is given k real attempts; from then on, if its failure rate is
    # larger than threshold, it is skipped for the rest of the json files.
    k = 5
    threshold = 0.5
    # Per-script counters. Only runs that were actually attempted are
    # counted, so a skipped script's rate cannot decay and re-enable it.
    attempts = dict.fromkeys(python_scripts, 0)
    failures = dict.fromkeys(python_scripts, 0)

    pbar = tqdm(json_files, leave=False)
    for json_file in pbar:
        pbar.set_description("Processing {}. Error rate: {} \n".format(chart_type, float(sum(error_count))/total_count))
        json_path = os.path.join(json_dir, json_file)
        json_stem = json_file.split('.json')[0]

        for script_name in tqdm(python_scripts):
            # Every pair counts toward the denominator, including skipped ones.
            total_count += 1

            # give up on scripts that keep failing after the warm-up window
            if attempts[script_name] >= k and failures[script_name] / attempts[script_name] > threshold:
                continue

            image_name = json_stem + '_{}.png'.format(script_name.split('.')[0])
            image_save_path = os.path.join(chart_output_dir_path, image_name)

            # if image_save_path exists, skip
            if os.path.exists(image_save_path):
                continue

            command = ['python3', os.path.join(code_dir, script_name), json_path, image_save_path]
            attempts[script_name] += 1
            if run_command(command, error_count, error_output_dir, chart_type, image_name):
                failures[script_name] += 1

    print('Error rate: ', sum(error_count)/total_count)

    with open(os.path.join(chart_img_output_dir, 'error_rate.txt'), 'a') as f:
        f.write('Chart type: {}. Error rate: {}. \n'.format(chart_type, sum(error_count)/total_count))

if __name__ == '__main__':

    # Input/output locations, relative to the current working directory.
    exp_path = 'data/final'
    chart_img_output_dir = exp_path  # 'exp/full_gpt4_v1/outputs_chart_examples'
    error_output_dir = 'data/rawdata/merged_python_error_logs'
    metadata_path = 'data/metadata.json'
    # None disables sampling: every JSON file and every script is used.
    sample_json_size = None
    sample_script_size = None

    os.makedirs(error_output_dir, exist_ok=True)

    # Sub-directories found under exp_path, sorted by name. This list is not
    # referenced again in this block; the chart types that are processed come
    # from the metadata file below.
    chart_types = sorted(
        entry for entry in os.listdir(exp_path)
        if os.path.isdir(os.path.join(exp_path, entry))
    )

    # Chart-type names come from the metadata file; spaces become underscores
    # before the names are used as directory names.
    with open(metadata_path, 'r') as metadata_file:
        metadata = json.load(metadata_file)

    core_type_list = [label.replace(' ', '_') for label in metadata['core_chart_type']]
    advanced_type_list = [label.replace(' ', '_') for label in metadata['advanced_chart_type']]
    all_type_list = [*core_type_list, *advanced_type_list]

    # One job per chart type, executed with a single joblib worker.
    jobs = (
        delayed(generate_chart_image)(
            type_name,
            exp_path,
            chart_img_output_dir,
            error_output_dir,
            sample_json_size,
            sample_script_size,
        )
        for type_name in all_type_list
    )
    Parallel(n_jobs=1)(jobs)

    # Report how many entries each chart type's image directory contains.
    for chart_type in all_type_list:
        chart_output_dir_path = os.path.join(chart_img_output_dir, chart_type, 'chart')
        image_total = len(os.listdir(chart_output_dir_path))
        print(chart_type, image_total)