-
Notifications
You must be signed in to change notification settings - Fork 1
Expand file tree
/
Copy pathgenerate_json_data_and_qa.py
More file actions
201 lines (146 loc) · 11.7 KB
/
generate_json_data_and_qa.py
File metadata and controls
201 lines (146 loc) · 11.7 KB
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
import requests
import json
import random
import os
import time
from pathlib import Path
import time
from tqdm import tqdm
import os
import sys
sys.path.append('.')
from utils.utils import GPT
from utils.output_templates import cognition_QA_template, cognition_QA_template_v2
def get_gpt_input_prompts_data(selected_type, type_definition, selected_topic, template, template_readme):
    """Build the two-turn prompt sequence used to generate chart JSON data.

    Args:
        selected_type: Human-readable chart type name (e.g. "bar").
        type_definition: Textual definition of the chart type.
        selected_topic: Topic string the background story must relate to.
        template: JSON template text the model output must follow.
        template_readme: README text explaining each template attribute.

    Returns:
        A tuple ``(prompts_data, complexity_idx)`` where ``prompts_data`` is a
        two-element list (story prompt, JSON-generation prompt) and
        ``complexity_idx`` is 0 (low complexity) or 1 (high complexity).
    """
    complexity_statements = [
        f"The complexity of the story should be low, focusing on a simple case of an {selected_type} chart with a single category representing the data.",
        f"The complexity of the story should be high, focusing on a complex case of an {selected_type} chart with multiple categories representing the data, rather than a single category."
    ]
    # Pick low/high complexity uniformly at random; randrange replaces the
    # original random.sample([...], k=1)[0] round-trip through a list.
    complexity_idx = random.randrange(len(complexity_statements))
    complexity_statement = complexity_statements[complexity_idx]
    prompts_data = [
        f"Here is the definition of {selected_type} chart: {type_definition}. Generate {selected_type} chart data by following these steps: Firstly, produce a short background story (around 100 words) for the chart data. The topic should be related to the {selected_topic}. The story can be like appeared on an article from a newspaper, magazine, report, academic paper, etc. The story should include a trend (such as Upward Trend, Downward Trend, Stable Trend, Cyclical Trend, Seasonal Trend, Random Trend, Exponential Trend, Linear Trend, Logarithmic Trend, or Parabolic Trend), but it does not need to contain specific numbers. {complexity_statement} Just return the story without additional comments.",
        f"Secondly, based on the generated story, produce specific numbers for it and generate the chart data in the following JSON format: {template}. You can refer to this README file for the meaning and type of each attribute or label in the JSON template. README: {template_readme}. Note that the chart data should follow the trend mentioned in the story."
    ]
    return prompts_data, complexity_idx
def get_gpt_input_prompts_QA(selected_type, type_definition, chart_data, data_readme, num_questions_per_QA, cognition_template_txt):
    """Build the single-turn prompt used to generate QA pairs for a chart.

    NOTE(review): the original literal was wrapped across two physical lines
    by a copy/paste (a syntax error); it is rejoined into one f-string here.

    Args:
        selected_type: Human-readable chart type name.
        type_definition: Textual definition of the chart type.
        chart_data: The generated chart data (JSON text) to ask about.
        data_readme: README text describing the chart-data schema.
        num_questions_per_QA: Number of QA pairs requested per category.
        cognition_template_txt: JSON output template the model must follow.

    Returns:
        A one-element list containing the full QA-generation prompt.
    """
    prompts_data = [
        f"Below are the definition of a {selected_type} chart, the chart data, and the README for the data: \n Definition: {type_definition} \n Chart data: {chart_data} \n README: {data_readme}. \n Imagine you are examining a chart image with the raw data provided above. Follow the instructions below to generate question-answer pairs for this chart image: 1. Descriptive Caption: Create a descriptive caption of 100 words based on the chart data, title, chart type, etc. This caption should be objective and should not include any summarization or findings. 2. Summary: Write a summary of 100 words based on the given chart. This summary should describe the chart data and give some findings. Avoid trivial summaries and ensure that you do not just describe the data objectively as in the descriptive caption. 3. Literal Question-Answer Pairs: Design {num_questions_per_QA} question-answer pairs that ask for specific values or data points from the chart. Each question should have a direct answer found explicitly in the chart data. Examples: 'What is the value of the first data point?', 'What is the value of X in year Y?', and 'Is the value of N year 30?'. 4. Inferential Question-Answer Pairs: Create {num_questions_per_QA} question-answer pairs that require understanding the chart data as a whole and making inferences. These questions might ask about extreme values (highest, lowest, etc.), comparisons, or conditions met by data points. Examples include: 'What is the highest value in the chart?', 'Which category has the lowest value?', 'How many years have X value under 500?', and 'Is the value of A higher than that of B at T time?'. 5. Challenging Reasoning Question-Answer Pairs: Develop {num_questions_per_QA} question-answer pairs that require higher-level mathematical reasoning. These questions might involve calculations like averages over a period, median values, sums, differences, ratios, and other statistical values. Examples include: 'What is the average value for the period 2010-2020?', 'What is the sum of values for category X?', and 'Is the value of A higher than that of B by 13?' Here are some important rules when creating QAs: Please do not copy the example questions. Note: All questions can be yes/no, numerical, or open-ended with a definite answer. Do not create questions where the answer is uncertain or unknown. These {num_questions_per_QA} questions should cover different aspects of the chart without repetition. It is crucial that each question's answer is accurate. Here are some essential rules for answers: For each question, please produce a long answer first, which includes the explanation or calculation process to get the answer (as detailed as possible). After that, generate a short answer that only extracts the answer (unit should follow the definition in the chart data so don't include unit in the short answer) from the long answer. The return should be a JSON following the format below: {cognition_template_txt}"
    ]
    return prompts_data
def extract_and_save_json_from_gpt_response(outputs, save_dir_path, count):
    """Parse a GPT response as JSON and save it to ``<count:06>.json``.

    Args:
        outputs: Raw model response text; must be valid JSON.
        save_dir_path: Existing directory to write the file into.
        count: Sample index; zero-padded to six digits in the filename.

    Raises:
        json.JSONDecodeError: If ``outputs`` is not valid JSON (the caller
            catches this and logs it via ``save_error_outputs``).
    """
    output_json = json.loads(outputs)
    with open(os.path.join(save_dir_path, f'{count:06}.json'), 'w') as f:
        json.dump(output_json, f, indent=4)
def save_story_from_gpt_response(outputs, save_dir_path, count):
    """Save the generated background story verbatim to ``<count:06>.txt``.

    Args:
        outputs: Story text from the model, written unmodified.
        save_dir_path: Existing directory to write the file into.
        count: Sample index; zero-padded to six digits in the filename.
    """
    with open(os.path.join(save_dir_path, f'{count:06}.txt'), 'w') as f:
        f.write(outputs)
def save_error_outputs(error_message, output_dir_path):
    """Append an error message to ``error_message.txt`` in the output dir.

    Append mode is deliberate: errors from many iterations accumulate in a
    single per-type log file rather than overwriting each other.

    Args:
        error_message: Text to append (no newline is added automatically).
        output_dir_path: Existing directory holding the error log.
    """
    with open(os.path.join(output_dir_path, 'error_message.txt'), 'a') as f:
        f.write(error_message)
def load_template_json_and_readme(template_path):
    """Load the chart-type definition, JSON template, and README texts.

    Args:
        template_path: Directory containing ``definition.md``, ``base.json``
            and ``README.md`` for one chart type.

    Returns:
        A tuple ``(definition, template_json, readme)`` of raw file contents.

    Raises:
        FileNotFoundError: If any of the three expected files is missing.
    """
    with open(os.path.join(template_path, 'definition.md'), 'r') as f:
        definition = f.read()
    with open(os.path.join(template_path, 'base.json'), 'r') as f:
        template_json = f.read()
    with open(os.path.join(template_path, 'README.md'), 'r') as f:
        readme = f.read()
    return definition, template_json, readme
def load_topics_json(template_dir_path, all_type_list):
    """Load candidate topics for every chart type from ``topics.json`` files.

    Args:
        template_dir_path: Root directory containing one subdirectory per
            chart type, each with a ``topics.json`` mapping topic -> description.
        all_type_list: Chart type directory names (underscore form).

    Returns:
        Dict mapping each chart type to a list of formatted strings of the
        form ``"<topic> (<description>) "`` (trailing space kept for prompt
        concatenation compatibility).
    """
    topic_dict = dict()
    for selected_type_ in all_type_list:
        with open(os.path.join(template_dir_path, selected_type_, 'topics.json'), 'r') as f:
            topics = json.load(f)
        topic_dict[selected_type_] = [f"{topic} ({des}) " for topic, des in topics.items()]
    return topic_dict
def main():
    """Generate chart JSON data, stories, and QA pairs for every chart type.

    For ``num_iter`` rounds over all chart types it: (1) prompts the GPT
    wrapper to write a background story and matching chart JSON, (2) prompts
    it again to produce QA pairs for that JSON, and (3) saves per-sample
    files plus per-type topic/complexity annotation indexes. Failures for a
    single sample are logged to that type's error file and skipped.
    """
    # Template and metadata paths.
    metadata_path = 'data/metadata.json'
    template_dir_path = 'data/rawdata/json_templates/json_templates'
    with open(metadata_path, 'r') as f:
        metadata = json.load(f)
    num_iter = 5
    output_dir_name = 'data/rawdata/data_and_qa/outputs'
    # Setting for QA generation: pairs requested per question category.
    num_questions_per_QA = 5
    # GPT backend selection; deploy name only applies to hosted deployments.
    GPT_BASE = 'Qwen3'  # gpt
    GPT_DEPLOY_NAME = ''  # gpt-o4-mini
    my_GPT = GPT(GPT_BASE, GPT_DEPLOY_NAME)
    # Build the full chart-type list (directory names use underscores).
    core_type_list = metadata['core_chart_type']
    advanced_type_list = metadata['advanced_chart_type']
    all_type_list = core_type_list + advanced_type_list
    all_type_list = [selected_type.replace(' ', '_') for selected_type in all_type_list]
    # Load candidate topics per chart type.
    topic_dict = load_topics_json(template_dir_path, all_type_list)
    print("Generating json data...")
    # Per-type bookkeeping: next sample index, chosen topic, chosen complexity.
    count_for_each_type = {t: 0 for t in all_type_list}
    topic_for_each_type = {t: dict() for t in all_type_list}
    complexity_for_each_type = {t: dict() for t in all_type_list}
    # 'iteration' (not 'iter') to avoid shadowing the builtin.
    for iteration in tqdm(range(num_iter)):
        for selected_type_ in tqdm(all_type_list):
            selected_type = selected_type_.replace('_', ' ')
            # Create output directories for this chart type.
            json_output_dir_path = os.path.join(output_dir_name, selected_type_, 'json')
            qa_output_dir_path = os.path.join(output_dir_name, selected_type_, 'qa')
            story_output_dir_path = os.path.join(output_dir_name, selected_type_, 'story')
            base_output_dir_path = os.path.join(output_dir_name, selected_type_)
            Path(json_output_dir_path).mkdir(parents=True, exist_ok=True)
            os.makedirs(qa_output_dir_path, exist_ok=True)
            os.makedirs(story_output_dir_path, exist_ok=True)
            # Raw GPT transcripts go to a sibling '<output>_gpt' tree.
            json_gpt_output_dir_path = os.path.join(output_dir_name + '_gpt', selected_type_, 'json')
            qa_gpt_output_dir_path = os.path.join(output_dir_name + '_gpt', selected_type_, 'qa')
            Path(json_gpt_output_dir_path).mkdir(parents=True, exist_ok=True)
            os.makedirs(qa_gpt_output_dir_path, exist_ok=True)
            json_template_dir_path = os.path.join(template_dir_path, selected_type_)
            type_definition, template, template_readme = load_template_json_and_readme(json_template_dir_path)
            ######### Start generate json data and QA based on the template and readme #########
            selected_topic = random.sample(topic_dict[selected_type_], k=1)[0]
            count = count_for_each_type[selected_type_]
            topic_for_each_type[selected_type_][f'{count:06}'] = selected_topic
            try:
                ###################
                # JSON data generation
                ###################
                prompts, complex_idx = get_gpt_input_prompts_data(selected_type, type_definition, selected_topic, template, template_readme)
                complexity_for_each_type[selected_type_][f'{count:06}'] = complex_idx
                outputs = my_GPT.get_json_expert_gpt_response(prompts, os.path.join(json_gpt_output_dir_path, f'json_gpt_outputs_{count}.txt'))
                # Save the story (turn 1) and the chart JSON (turn 2).
                save_story_from_gpt_response(outputs[0].choices[0].message.content, story_output_dir_path, count)
                extract_and_save_json_from_gpt_response(outputs[1].choices[0].message.content, json_output_dir_path, count)
                input_json_data = outputs[1].choices[0].message.content
                ###################
                # QA generation
                ###################
                prompts = get_gpt_input_prompts_QA(selected_type, type_definition, input_json_data, template_readme, num_questions_per_QA, cognition_QA_template_v2)
                outputs = my_GPT.get_QA_expert_gpt_response(prompts, os.path.join(qa_gpt_output_dir_path, f'qa_gpt_outputs_{count}.txt'))
                extract_and_save_json_from_gpt_response(outputs[0].choices[0].message.content, qa_output_dir_path, count)
                # Only advance the per-type counter on full success so a
                # failed sample's index is reused next round.
                count += 1
                count_for_each_type[selected_type_] = count
            except Exception as e:
                # Best-effort logging: record the failure and move on.
                save_error_outputs(repr(e), base_output_dir_path)
    # Persist the per-sample topic and complexity annotations.
    with open(os.path.join(output_dir_name, 'annotation_topic.json'), 'w') as f:
        json.dump(topic_for_each_type, f, indent=4)
    with open(os.path.join(output_dir_name, 'annotation_complexity.json'), 'w') as f:
        json.dump(complexity_for_each_type, f, indent=4)


if __name__ == '__main__':
    main()