# NOTE(review): line structure rebuilt from a whitespace-collapsed copy of this patch.
# The 4-space indentation and the position of the blank context line are assumed; confirm
# against the target files before applying. The added lines were rewritten, so the
# post-image blob ids on the `index` lines are still those of the original patch.
diff --git a/or1_scripts/data_preprocess/download_and_filter_data_1p5b.py b/or1_scripts/data_preprocess/download_and_filter_data_1p5b.py
index 045faa2..5ab20d5 100644
--- a/or1_scripts/data_preprocess/download_and_filter_data_1p5b.py
+++ b/or1_scripts/data_preprocess/download_and_filter_data_1p5b.py
@@ -66,6 +66,15 @@ def filter_fn(example):
     math_data_list = [item for item in data_list if item['ability'] == 'math']
     code_data_list = [item for item in data_list if item['ability'] == 'code']
 
+    # Keep only the ground-truth entries whose value is not None for every code sample.
+    for sample in code_data_list:
+        reward_model = sample['reward_model']
+        reward_model['ground_truth'] = {
+            key: value
+            for key, value in reward_model['ground_truth'].items()
+            if value is not None
+        }
+
     local_dir = args.local_dir
     hdfs_dir = args.hdfs_dir
     os.makedirs(local_dir, exist_ok=True)
diff --git a/or1_scripts/data_preprocess/download_and_filter_data_32b.py b/or1_scripts/data_preprocess/download_and_filter_data_32b.py
index 4bb3541..8bcf946 100644
--- a/or1_scripts/data_preprocess/download_and_filter_data_32b.py
+++ b/or1_scripts/data_preprocess/download_and_filter_data_32b.py
@@ -66,6 +66,15 @@ def filter_fn(example):
     math_data_list = [item for item in data_list if item['ability'] == 'math']
     code_data_list = [item for item in data_list if item['ability'] == 'code']
 
+    # Keep only the ground-truth entries whose value is not None for every code sample.
+    for sample in code_data_list:
+        reward_model = sample['reward_model']
+        reward_model['ground_truth'] = {
+            key: value
+            for key, value in reward_model['ground_truth'].items()
+            if value is not None
+        }
+
     local_dir = args.local_dir
     hdfs_dir = args.hdfs_dir
     os.makedirs(local_dir, exist_ok=True)
diff --git a/or1_scripts/data_preprocess/download_and_filter_data_7b.py b/or1_scripts/data_preprocess/download_and_filter_data_7b.py
index 2de63ff..6b472c2 100644
--- a/or1_scripts/data_preprocess/download_and_filter_data_7b.py
+++ b/or1_scripts/data_preprocess/download_and_filter_data_7b.py
@@ -66,6 +66,15 @@ def filter_fn(example):
     math_data_list = [item for item in data_list if item['ability'] == 'math']
     code_data_list = [item for item in data_list if item['ability'] == 'code']
 
+    # Keep only the ground-truth entries whose value is not None for every code sample.
+    for sample in code_data_list:
+        reward_model = sample['reward_model']
+        reward_model['ground_truth'] = {
+            key: value
+            for key, value in reward_model['ground_truth'].items()
+            if value is not None
+        }
+
     local_dir = args.local_dir
     hdfs_dir = args.hdfs_dir
     os.makedirs(local_dir, exist_ok=True)