diff --git a/bixbench/graders.py b/bixbench/graders.py index f2e81b0..ddf9682 100644 --- a/bixbench/graders.py +++ b/bixbench/graders.py @@ -159,8 +159,7 @@ def _grade_range_verifier( ) -> GradeResult: lower, upper = ast.literal_eval(target) correct = lower <= float(predicted) <= upper - if correct: - grade_type = GradeType.CORRECT + grade_type = GradeType.CORRECT if correct else GradeType.INCORRECT return GradeResult( grade=grade_type.numeric_grade, diff --git a/pyproject.toml b/pyproject.toml index a941145..697cd98 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -56,6 +56,9 @@ check-hidden = true ignore-words-list = "coefficent,LasR" skip = "uv.lock,bixbench_results/baseline_eval_data/*,bixbench-v1.5_results/*" +[tool.pytest.ini_options] +pythonpath = ["."] + [tool.refurb] enable_all = true ignore = [ diff --git a/tests/test_utils.py b/tests/test_utils.py index ab30405..482803e 100644 --- a/tests/test_utils.py +++ b/tests/test_utils.py @@ -109,6 +109,28 @@ async def test_grade_mcq_answer( assert grade_result.refusal == expected_refusal +@pytest.mark.asyncio +@pytest.mark.parametrize( + ("target", "predicted", "expected_grade", "expected_correct"), + [ + pytest.param("(1.0, 3.0)", "1.0", 1, True, id="lower_bound"), + pytest.param("(1.0, 3.0)", "2.0", 1, True, id="inside_range"), + pytest.param("(1.0, 3.0)", "3.0", 1, True, id="upper_bound"), + pytest.param("(1.0, 3.0)", "4.0", 0, False, id="outside_range"), + ], +) +async def test_grade_range_answer( + target: str, predicted: str, expected_grade: int, expected_correct: bool +): + grade_result = await MCQGrader().grade( + target, predicted, evaluation_mode="range_verifier" + ) + + assert grade_result.grade == expected_grade + assert grade_result.correct is expected_correct + assert grade_result.refusal is False + + @pytest.mark.parametrize( ("grades", "is_refused", "metrics"), [