From 15712d761563b23f9065aac7ca8a8c3082692a80 Mon Sep 17 00:00:00 2001 From: jatinkumar300403 Date: Sun, 21 Jun 2026 11:22:10 +0530 Subject: [PATCH] feat(eval): add abstention detection and comparison-aware scoring for static_json Add a consistent policy for comparison-only scenarios so the scorer can distinguish between an agent that refuses to answer (abstention) and one that provides a valid comparison in natural language instead of structured JSON. Changes: - Add abstained flag to ScorerResult for downstream filtering - Introduce is_abstained() helper detecting empty or decline-to-answer responses - Grant 0.5 partial credit when gold values appear in plain-text answers - Add tests for abstention, correct comparison, and wrong comparison Signed-off-by: jatinkumar300403 --- src/evaluation/models.py | 1 + src/evaluation/scorers/static_json.py | 78 +++++++++++++++---- .../tests/test_static_json_scorer.py | 69 ++++++++++++++-- 3 files changed, 129 insertions(+), 19 deletions(-) diff --git a/src/evaluation/models.py b/src/evaluation/models.py index 353619f2..05f9abb8 100644 --- a/src/evaluation/models.py +++ b/src/evaluation/models.py @@ -77,6 +77,7 @@ class ScorerResult(BaseModel): scorer: str passed: bool score: float = 0.0 + abstained: bool = False rationale: str = "" details: dict[str, Any] = Field(default_factory=dict) diff --git a/src/evaluation/scorers/static_json.py b/src/evaluation/scorers/static_json.py index a26c53db..73b8eab0 100644 --- a/src/evaluation/scorers/static_json.py +++ b/src/evaluation/scorers/static_json.py @@ -114,9 +114,7 @@ def _extract_balanced_structure(content: str) -> str: (content.find("("), "(", ")"), ] candidates = [ - (idx, open_ch, close_ch) - for idx, open_ch, close_ch in candidates - if idx != -1 + (idx, open_ch, close_ch) for idx, open_ch, close_ch in candidates if idx != -1 ] if not candidates: @@ -367,9 +365,7 @@ def evaluate_static_json( precision = exact_matches / total_model_keys if total_model_keys else 0.0 recall = exact_matches / total_gold_keys if total_gold_keys else 0.0 f1 = ( - 2 * precision * recall / (precision + recall) - if precision + recall > 0 - else 0.0 + 2 * precision * recall / (precision + recall) if precision + recall > 0 else 0.0 ) partial_exact = exact_matches / total_gold_keys if total_gold_keys else 0.0 @@ -392,6 +388,7 @@ def evaluate_static_json( details=details, ) + def evaluate_static_json_batch( pairs: list[tuple[Any, Any]], *, @@ -439,6 +436,30 @@ def evaluate_static_json_batch( "examples": [score.to_dict() for score in scores], } + +def is_abstained(answer: Any) -> bool: + """Detect if the answer is empty or explicitly abstains.""" + if answer is None: + return True + raw = str(answer).strip().lower() + if not raw: + return True + content = extract_answer_text(answer).strip().lower() + if not content: + return True + + abstention_phrases = [ + "i don't know", + "i do not know", + "i cannot answer", + "i can't answer", + "not enough information", + "unable to determine", + "cannot determine", + ] + return any(phrase in raw or phrase in content for phrase in abstention_phrases) + + class StaticJsonScorer: """Evaluation scorer wrapper for the trajectory-based pipeline.""" @@ -466,20 +487,51 @@ def __call__( static_score = evaluate_static_json(gold_answer, answer) passed = static_score.strict_exact_match_accuracy == 1.0 + score = round(static_score.f1, 3) + abstained = False + rationale = ( + "strict structured match" + if passed + else "structured answer differs from ground truth" + ) + + if not passed and static_score.exact_value_matches == 0: + if is_abstained(answer): + abstained = True + rationale = "agent abstained from answering" + else: + gold_flat = flatten_answer(gold_answer) + if gold_flat and len(gold_flat) <= 2: + raw_answer_lower = str(answer).lower() + values_found = sum( + 1 + for val in gold_flat.values() + if str(val).lower() in raw_answer_lower + ) + if values_found == len(gold_flat): + score = 0.5 + rationale = "comparison match (values found in text but structure missing)" + static_score.details.append( + KeyComparison( + key="comparison_match", + gold_value=str(list(gold_flat.values())), + model_value="found in text", + exact=False, + match_type="comparison", + similarity=0.5, + ) + ) return ScorerResult( scorer=self.name, passed=passed, - score=round(static_score.f1, 3), - rationale=( - "strict structured match" - if passed - else "structured answer differs from ground truth" - ), + score=score, + abstained=abstained, + rationale=rationale, details=static_score.to_dict(), ) def install(name: str = "static_json") -> None: """Register the static JSON scorer.""" - register(name, StaticJsonScorer(name=name)) \ No newline at end of file + register(name, StaticJsonScorer(name=name)) diff --git a/src/evaluation/tests/test_static_json_scorer.py b/src/evaluation/tests/test_static_json_scorer.py index 175a320c..2ce9c7b2 100644 --- a/src/evaluation/tests/test_static_json_scorer.py +++ b/src/evaluation/tests/test_static_json_scorer.py @@ -1,10 +1,13 @@ +from evaluation.models import Scenario from evaluation.scorers.static_json import ( + StaticJsonScorer, evaluate_static_json, evaluate_static_json_batch, flatten_answer, parse_structured_answer, ) + def test_parse_json_object_from_noisy_markdown_answer(): raw = 'Answer:\n```json\n{"energy": 3, "material": 12}\n```' @@ -122,11 +125,6 @@ def test_batch_evaluation(): assert result["strict_exact_match_accuracy"] == 0.5 - -from evaluation.models import Scenario -from evaluation.scorers.static_json import StaticJsonScorer - - def test_static_json_scorer_wrapper_exact_match(): scenario = Scenario.from_raw( { @@ -147,4 +145,63 @@ def test_static_json_scorer_wrapper_exact_match(): assert result.scorer == "static_json" assert result.passed is True assert result.score == 1.0 - assert result.details["strict_exact_match_accuracy"] == 1.0 \ No newline at end of file + assert result.details["strict_exact_match_accuracy"] == 1.0 + + +def test_static_json_scorer_abstention(): + scenario = Scenario.from_raw( + { + "id": "12", + "text": "Which machine should be prioritized?", + "expected_answer": '{"machine": "Motor_B", "severity": "Zone_D"}', + "scoring_method": "static_json", + } + ) + scorer = StaticJsonScorer() + result = scorer(scenario, "I don't know the answer.", "") + assert result.passed is False + assert result.score == 0.0 + assert result.abstained is True + assert result.rationale == "agent abstained from answering" + + +def test_static_json_scorer_comparison_match(): + scenario = Scenario.from_raw( + { + "id": "13", + "text": "Which machine should be prioritized?", + "expected_answer": '{"machine": "Motor_B", "severity": "Zone_D"}', + "scoring_method": "static_json", + } + ) + scorer = StaticJsonScorer() + result = scorer( + scenario, + "Motor_B should be prioritized because it is in Zone_D severity.", + "", + ) + assert result.passed is False + assert result.score == 0.5 + assert result.abstained is False + assert "comparison match" in result.rationale + + +def test_static_json_scorer_wrong_comparison(): + scenario = Scenario.from_raw( + { + "id": "14", + "text": "Which machine should be prioritized?", + "expected_answer": '{"machine": "Motor_B", "severity": "Zone_D"}', + "scoring_method": "static_json", + } + ) + scorer = StaticJsonScorer() + result = scorer( + scenario, + "Motor_A is the machine to prioritize because it is in Zone_C.", + "", + ) + assert result.passed is False + assert result.score == 0.0 + assert result.abstained is False + assert "structured answer differs" in result.rationale