-
Notifications
You must be signed in to change notification settings - Fork 1
Expand file tree
/
Copy pathgenerate_json_data_and_qa.py
More file actions
201 lines (146 loc) · 11.7 KB
/
generate_json_data_and_qa.py
File metadata and controls
201 lines (146 loc) · 11.7 KB
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
import requests
import json
import random
import os
import time
from pathlib import Path
import time
from tqdm import tqdm
import os
import sys
sys.path.append('.')
from utils.utils import GPT
from utils.output_templates import cognition_QA_template, cognition_QA_template_v2
def get_gpt_input_prompts_data(selected_type, type_definition, selected_topic, template, template_readme):
    """Build the two-turn prompt sequence used to generate chart JSON data.

    Args:
        selected_type: Human-readable chart type name (e.g. "bar").
        type_definition: Textual definition of the chart type.
        selected_topic: Topic string the background story must relate to.
        template: JSON template text the model output must follow.
        template_readme: README text explaining each template attribute.

    Returns:
        A tuple ``(prompts_data, complexity_idx)`` where ``prompts_data`` is a
        two-element list (story prompt, JSON-generation prompt) and
        ``complexity_idx`` is 0 (low complexity) or 1 (high complexity).
    """
    complexity_statements = [
        f"The complexity of the story should be low, focusing on a simple case of an {selected_type} chart with a single category representing the data.",
        f"The complexity of the story should be high, focusing on a complex case of an {selected_type} chart with multiple categories representing the data, rather than a single category."
    ]
    # Pick low/high complexity uniformly at random; randrange replaces the
    # original random.sample([...], k=1)[0] round-trip through a list.
    complexity_idx = random.randrange(len(complexity_statements))
    complexity_statement = complexity_statements[complexity_idx]
    prompts_data = [
        f"Here is the definition of {selected_type} chart: {type_definition}. Generate {selected_type} chart data by following these steps: Firstly, produce a short background story (around 100 words) for the chart data. The topic should be related to the {selected_topic}. The story can be like appeared on an article from a newspaper, magazine, report, academic paper, etc. The story should include a trend (such as Upward Trend, Downward Trend, Stable Trend, Cyclical Trend, Seasonal Trend, Random Trend, Exponential Trend, Linear Trend, Logarithmic Trend, or Parabolic Trend), but it does not need to contain specific numbers. {complexity_statement} Just return the story without additional comments.",
        f"Secondly, based on the generated story, produce specific numbers for it and generate the chart data in the following JSON format: {template}. You can refer to this README file for the meaning and type of each attribute or label in the JSON template. README: {template_readme}. Note that the chart data should follow the trend mentioned in the story."
    ]
    return prompts_data, complexity_idx
def get_gpt_input_prompts_QA(selected_type, type_definition, chart_data, data_readme, num_questions_per_QA, cognition_template_txt):
    """Build the single-turn prompt used to generate QA pairs for a chart.

    NOTE(review): the original literal was wrapped across two physical lines
    by a copy/paste (a syntax error); it is rejoined into one f-string here.

    Args:
        selected_type: Human-readable chart type name.
        type_definition: Textual definition of the chart type.
        chart_data: The generated chart data (JSON text) to ask about.
        data_readme: README text describing the chart-data schema.
        num_questions_per_QA: Number of QA pairs requested per category.
        cognition_template_txt: JSON output template the model must follow.

    Returns:
        A one-element list containing the full QA-generation prompt.
    """
    prompts_data = [
        f"Below are the definition of a {selected_type} chart, the chart data, and the README for the data: \n Definition: {type_definition} \n Chart data: {chart_data} \n README: {data_readme}. \n Imagine you are examining a chart image with the raw data provided above. Follow the instructions below to generate question-answer pairs for this chart image: 1. Descriptive Caption: Create a descriptive caption of 100 words based on the chart data, title, chart type, etc. This caption should be objective and should not include any summarization or findings. 2. Summary: Write a summary of 100 words based on the given chart. This summary should describe the chart data and give some findings. Avoid trivial summaries and ensure that you do not just describe the data objectively as in the descriptive caption. 3. Literal Question-Answer Pairs: Design {num_questions_per_QA} question-answer pairs that ask for specific values or data points from the chart. Each question should have a direct answer found explicitly in the chart data. Examples: 'What is the value of the first data point?', 'What is the value of X in year Y?', and 'Is the value of N year 30?'. 4. Inferential Question-Answer Pairs: Create {num_questions_per_QA} question-answer pairs that require understanding the chart data as a whole and making inferences. These questions might ask about extreme values (highest, lowest, etc.), comparisons, or conditions met by data points. Examples include: 'What is the highest value in the chart?', 'Which category has the lowest value?', 'How many years have X value under 500?', and 'Is the value of A higher than that of B at T time?'. 5. Challenging Reasoning Question-Answer Pairs: Develop {num_questions_per_QA} question-answer pairs that require higher-level mathematical reasoning. These questions might involve calculations like averages over a period, median values, sums, differences, ratios, and other statistical values. Examples include: 'What is the average value for the period 2010-2020?', 'What is the sum of values for category X?', and 'Is the value of A higher than that of B by 13?' Here are some important rules when creating QAs: Please do not copy the example questions. Note: All questions can be yes/no, numerical, or open-ended with a definite answer. Do not create questions where the answer is uncertain or unknown. These {num_questions_per_QA} questions should cover different aspects of the chart without repetition. It is crucial that each question's answer is accurate. Here are some essential rules for answers: For each question, please produce a long answer first, which includes the explanation or calculation process to get the answer (as detailed as possible). After that, generate a short answer that only extracts the answer (unit should follow the definition in the chart data so don't include unit in the short answer) from the long answer. The return should be a JSON following the format below: {cognition_template_txt}"
    ]
    return prompts_data
def extract_and_save_json_from_gpt_response(outputs, save_dir_path, count):
    """Parse a GPT response as JSON and save it to ``<count:06>.json``.

    Args:
        outputs: Raw model response text; must be valid JSON.
        save_dir_path: Existing directory to write the file into.
        count: Sample index; zero-padded to six digits in the filename.

    Raises:
        json.JSONDecodeError: If ``outputs`` is not valid JSON (the caller
            catches this and logs it via ``save_error_outputs``).
    """
    output_json = json.loads(outputs)
    with open(os.path.join(save_dir_path, f'{count:06}.json'), 'w') as f:
        json.dump(output_json, f, indent=4)
def save_story_from_gpt_response(outputs, save_dir_path, count):
    """Save the generated background story verbatim to ``<count:06>.txt``.

    Args:
        outputs: Story text from the model, written unmodified.
        save_dir_path: Existing directory to write the file into.
        count: Sample index; zero-padded to six digits in the filename.
    """
    with open(os.path.join(save_dir_path, f'{count:06}.txt'), 'w') as f:
        f.write(outputs)
def save_error_outputs(error_message, output_dir_path):
    """Append an error message to ``error_message.txt`` in the output dir.

    Append mode is deliberate: errors from many iterations accumulate in a
    single per-type log file rather than overwriting each other.

    Args:
        error_message: Text to append (no newline is added automatically).
        output_dir_path: Existing directory holding the error log.
    """
    with open(os.path.join(output_dir_path, 'error_message.txt'), 'a') as f:
        f.write(error_message)
def load_template_json_and_readme(template_path):
    """Load the chart-type definition, JSON template, and README texts.

    Args:
        template_path: Directory containing ``definition.md``, ``base.json``
            and ``README.md`` for one chart type.

    Returns:
        A tuple ``(definition, template_json, readme)`` of raw file contents.

    Raises:
        FileNotFoundError: If any of the three expected files is missing.
    """
    with open(os.path.join(template_path, 'definition.md'), 'r') as f:
        definition = f.read()
    with open(os.path.join(template_path, 'base.json'), 'r') as f:
        template_json = f.read()
    with open(os.path.join(template_path, 'README.md'), 'r') as f:
        readme = f.read()
    return definition, template_json, readme
def load_topics_json(template_dir_path, all_type_list):
    """Load candidate topics for every chart type from ``topics.json`` files.

    Args:
        template_dir_path: Root directory containing one subdirectory per
            chart type, each with a ``topics.json`` mapping topic -> description.
        all_type_list: Chart type directory names (underscore form).

    Returns:
        Dict mapping each chart type to a list of formatted strings of the
        form ``"<topic> (<description>) "`` (trailing space kept for prompt
        concatenation compatibility).
    """
    topic_dict = dict()
    for selected_type_ in all_type_list:
        with open(os.path.join(template_dir_path, selected_type_, 'topics.json'), 'r') as f:
            topics = json.load(f)
        topic_dict[selected_type_] = [f"{topic} ({des}) " for topic, des in topics.items()]
    return topic_dict
def main():
    """Generate chart JSON data, stories, and QA pairs for every chart type.

    For ``num_iter`` rounds over all chart types it: (1) prompts the GPT
    wrapper to write a background story and matching chart JSON, (2) prompts
    it again to produce QA pairs for that JSON, and (3) saves per-sample
    files plus per-type topic/complexity annotation indexes. Failures for a
    single sample are logged to that type's error file and skipped.
    """
    # Template and metadata paths.
    metadata_path = 'data/metadata.json'
    template_dir_path = 'data/rawdata/json_templates/json_templates'
    with open(metadata_path, 'r') as f:
        metadata = json.load(f)
    num_iter = 5
    output_dir_name = 'data/rawdata/data_and_qa/outputs'
    # Setting for QA generation: pairs requested per question category.
    num_questions_per_QA = 5
    # GPT backend selection; deploy name only applies to hosted deployments.
    GPT_BASE = 'Qwen3'  # gpt
    GPT_DEPLOY_NAME = ''  # gpt-o4-mini
    my_GPT = GPT(GPT_BASE, GPT_DEPLOY_NAME)
    # Build the full chart-type list (directory names use underscores).
    core_type_list = metadata['core_chart_type']
    advanced_type_list = metadata['advanced_chart_type']
    all_type_list = core_type_list + advanced_type_list
    all_type_list = [selected_type.replace(' ', '_') for selected_type in all_type_list]
    # Load candidate topics per chart type.
    topic_dict = load_topics_json(template_dir_path, all_type_list)
    print("Generating json data...")
    # Per-type bookkeeping: next sample index, chosen topic, chosen complexity.
    count_for_each_type = {t: 0 for t in all_type_list}
    topic_for_each_type = {t: dict() for t in all_type_list}
    complexity_for_each_type = {t: dict() for t in all_type_list}
    # 'iteration' (not 'iter') to avoid shadowing the builtin.
    for iteration in tqdm(range(num_iter)):
        for selected_type_ in tqdm(all_type_list):
            selected_type = selected_type_.replace('_', ' ')
            # Create output directories for this chart type.
            json_output_dir_path = os.path.join(output_dir_name, selected_type_, 'json')
            qa_output_dir_path = os.path.join(output_dir_name, selected_type_, 'qa')
            story_output_dir_path = os.path.join(output_dir_name, selected_type_, 'story')
            base_output_dir_path = os.path.join(output_dir_name, selected_type_)
            Path(json_output_dir_path).mkdir(parents=True, exist_ok=True)
            os.makedirs(qa_output_dir_path, exist_ok=True)
            os.makedirs(story_output_dir_path, exist_ok=True)
            # Raw GPT transcripts go to a sibling '<output>_gpt' tree.
            json_gpt_output_dir_path = os.path.join(output_dir_name + '_gpt', selected_type_, 'json')
            qa_gpt_output_dir_path = os.path.join(output_dir_name + '_gpt', selected_type_, 'qa')
            Path(json_gpt_output_dir_path).mkdir(parents=True, exist_ok=True)
            os.makedirs(qa_gpt_output_dir_path, exist_ok=True)
            json_template_dir_path = os.path.join(template_dir_path, selected_type_)
            type_definition, template, template_readme = load_template_json_and_readme(json_template_dir_path)
            ######### Start generate json data and QA based on the template and readme #########
            selected_topic = random.sample(topic_dict[selected_type_], k=1)[0]
            count = count_for_each_type[selected_type_]
            topic_for_each_type[selected_type_][f'{count:06}'] = selected_topic
            try:
                ###################
                # JSON data generation
                ###################
                prompts, complex_idx = get_gpt_input_prompts_data(selected_type, type_definition, selected_topic, template, template_readme)
                complexity_for_each_type[selected_type_][f'{count:06}'] = complex_idx
                outputs = my_GPT.get_json_expert_gpt_response(prompts, os.path.join(json_gpt_output_dir_path, f'json_gpt_outputs_{count}.txt'))
                # Save the story (turn 1) and the chart JSON (turn 2).
                save_story_from_gpt_response(outputs[0].choices[0].message.content, story_output_dir_path, count)
                extract_and_save_json_from_gpt_response(outputs[1].choices[0].message.content, json_output_dir_path, count)
                input_json_data = outputs[1].choices[0].message.content
                ###################
                # QA generation
                ###################
                prompts = get_gpt_input_prompts_QA(selected_type, type_definition, input_json_data, template_readme, num_questions_per_QA, cognition_QA_template_v2)
                outputs = my_GPT.get_QA_expert_gpt_response(prompts, os.path.join(qa_gpt_output_dir_path, f'qa_gpt_outputs_{count}.txt'))
                extract_and_save_json_from_gpt_response(outputs[0].choices[0].message.content, qa_output_dir_path, count)
                # Only advance the per-type counter on full success so a
                # failed sample's index is reused next round.
                count += 1
                count_for_each_type[selected_type_] = count
            except Exception as e:
                # Best-effort logging: record the failure and move on.
                save_error_outputs(repr(e), base_output_dir_path)
    # Persist the per-sample topic and complexity annotations.
    with open(os.path.join(output_dir_name, 'annotation_topic.json'), 'w') as f:
        json.dump(topic_for_each_type, f, indent=4)
    with open(os.path.join(output_dir_name, 'annotation_complexity.json'), 'w') as f:
        json.dump(complexity_for_each_type, f, indent=4)


if __name__ == '__main__':
    main()