From baa5abc38c556e4bec73d1fca360c1cdd6c4fee9 Mon Sep 17 00:00:00 2001
From: geemi725
Date: Thu, 12 Jun 2025 15:52:16 -0700
Subject: [PATCH 1/2] default eval-mode

---
 grade_outputs.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/grade_outputs.py b/grade_outputs.py
index 8043811..0d4da39 100644
--- a/grade_outputs.py
+++ b/grade_outputs.py
@@ -72,7 +72,7 @@ async def grade_answers(
                 target=str(row["target"]),
                 predicted=str(row["predicted"]),
                 unsure=None,
-                evaluation_mode=row["evaluation_mode"],
+                evaluation_mode=row.get("evaluation_mode", "llm_verifier"),
                 partial_match=True,
                 llm_match=True,
             )

From ae948d8bccee3bfb6785fca7a99606c3a9f2f95c Mon Sep 17 00:00:00 2001
From: geemi725
Date: Thu, 12 Jun 2025 16:14:05 -0700
Subject: [PATCH 2/2] eval_mode columns

---
 generate_zeroshot_evals.py | 22 ++++++++++++----------
 1 file changed, 12 insertions(+), 10 deletions(-)

diff --git a/generate_zeroshot_evals.py b/generate_zeroshot_evals.py
index 3f8a22c..3a07fbb 100644
--- a/generate_zeroshot_evals.py
+++ b/generate_zeroshot_evals.py
@@ -105,16 +105,18 @@ async def evaluate(
             )
         )
 
-        results.append(
-            {
-                "uuid": query.id,
-                "question": query.question,
-                "predicted": query.predicted,
-                "target": query.target,
-                "unsure": query.unsure,
-                "evaluation_mode": query.evaluation_mode,
-            }
-        )
+        result_dict = {
+            "uuid": query.id,
+            "question": query.question,
+            "predicted": query.predicted,
+            "target": query.target,
+            "unsure": query.unsure,
+        }
+
+        if query.evaluation_mode is not None:
+            result_dict["evaluation_mode"] = query.evaluation_mode
+
+        results.append(result_dict)
 
     # make directory if it doesn't exist
     if not os.path.exists(output_dir):