From 15712d761563b23f9065aac7ca8a8c3082692a80 Mon Sep 17 00:00:00 2001
From: jatinkumar300403 <jatin_johnny@yahoo.com>
Date: Sun, 21 Jun 2026 11:22:10 +0530
Subject: [PATCH] feat(eval): add abstention detection and comparison-aware
 scoring for static_json

Add a consistent policy for comparison-only scenarios so the scorer can distinguish between an agent that refuses to answer (abstention) and one that provides a valid comparison in natural language instead of structured JSON.

Changes:

- Add an `abstained` flag (default False) to ScorerResult for downstream filtering

- Introduce is_abstained() helper that flags None/blank answers and answers containing a known decline-to-answer phrase (case-insensitive)

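- Grant 0.5 partial credit when no structured value matches, the agent did not abstain, the gold answer has at most two values, and every gold value appears (case-insensitively) in the plain-text answer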

- Add tests for abstention, correct comparison, and wrong comparison

Signed-off-by: jatinkumar300403 <jatin_johnny@yahoo.com>
---
 src/evaluation/models.py                      |  1 +
 src/evaluation/scorers/static_json.py         | 78 +++++++++++++++----
 .../tests/test_static_json_scorer.py          | 69 ++++++++++++++--
 3 files changed, 129 insertions(+), 19 deletions(-)

diff --git a/src/evaluation/models.py b/src/evaluation/models.py
index 353619f2..05f9abb8 100644
--- a/src/evaluation/models.py
+++ b/src/evaluation/models.py
@@ -77,6 +77,7 @@ class ScorerResult(BaseModel):
     scorer: str
     passed: bool
     score: float = 0.0
+    abstained: bool = False
     rationale: str = ""
     details: dict[str, Any] = Field(default_factory=dict)
 
diff --git a/src/evaluation/scorers/static_json.py b/src/evaluation/scorers/static_json.py
index a26c53db..73b8eab0 100644
--- a/src/evaluation/scorers/static_json.py
+++ b/src/evaluation/scorers/static_json.py
@@ -114,9 +114,7 @@ def _extract_balanced_structure(content: str) -> str:
         (content.find("("), "(", ")"),
     ]
     candidates = [
-        (idx, open_ch, close_ch)
-        for idx, open_ch, close_ch in candidates
-        if idx != -1
+        (idx, open_ch, close_ch) for idx, open_ch, close_ch in candidates if idx != -1
     ]
 
     if not candidates:
@@ -367,9 +365,7 @@ def evaluate_static_json(
     precision = exact_matches / total_model_keys if total_model_keys else 0.0
     recall = exact_matches / total_gold_keys if total_gold_keys else 0.0
     f1 = (
-        2 * precision * recall / (precision + recall)
-        if precision + recall > 0
-        else 0.0
+        2 * precision * recall / (precision + recall) if precision + recall > 0 else 0.0
     )
 
     partial_exact = exact_matches / total_gold_keys if total_gold_keys else 0.0
@@ -392,6 +388,7 @@ def evaluate_static_json(
         details=details,
     )
 
+
 def evaluate_static_json_batch(
     pairs: list[tuple[Any, Any]],
     *,
@@ -439,6 +436,30 @@ def evaluate_static_json_batch(
         "examples": [score.to_dict() for score in scores],
     }
 
+
+def is_abstained(answer: Any) -> bool:
+    """Detect if the answer is empty or explicitly abstains."""
+    if answer is None:
+        return True
+    raw = str(answer).strip().lower()
+    if not raw:
+        return True
+    content = extract_answer_text(answer).strip().lower()
+    if not content:
+        return True
+
+    abstention_phrases = [
+        "i don't know",
+        "i do not know",
+        "i cannot answer",
+        "i can't answer",
+        "not enough information",
+        "unable to determine",
+        "cannot determine",
+    ]
+    return any(phrase in raw or phrase in content for phrase in abstention_phrases)
+
+
 class StaticJsonScorer:
     """Evaluation scorer wrapper for the trajectory-based pipeline."""
 
@@ -466,20 +487,51 @@ def __call__(
 
         static_score = evaluate_static_json(gold_answer, answer)
         passed = static_score.strict_exact_match_accuracy == 1.0
+        score = round(static_score.f1, 3)
+        abstained = False
+        rationale = (
+            "strict structured match"
+            if passed
+            else "structured answer differs from ground truth"
+        )
+
+        if not passed and static_score.exact_value_matches == 0:
+            if is_abstained(answer):
+                abstained = True
+                rationale = "agent abstained from answering"
+            else:
+                gold_flat = flatten_answer(gold_answer)
+                if gold_flat and len(gold_flat) <= 2:
+                    raw_answer_lower = str(answer).lower()
+                    values_found = sum(
+                        1
+                        for val in gold_flat.values()
+                        if str(val).lower() in raw_answer_lower
+                    )
+                    if values_found == len(gold_flat):
+                        score = 0.5
+                        rationale = "comparison match (values found in text but structure missing)"
+                        static_score.details.append(
+                            KeyComparison(
+                                key="comparison_match",
+                                gold_value=str(list(gold_flat.values())),
+                                model_value="found in text",
+                                exact=False,
+                                match_type="comparison",
+                                similarity=0.5,
+                            )
+                        )
 
         return ScorerResult(
             scorer=self.name,
             passed=passed,
-            score=round(static_score.f1, 3),
-            rationale=(
-                "strict structured match"
-                if passed
-                else "structured answer differs from ground truth"
-            ),
+            score=score,
+            abstained=abstained,
+            rationale=rationale,
             details=static_score.to_dict(),
         )
 
 
 def install(name: str = "static_json") -> None:
     """Register the static JSON scorer."""
-    register(name, StaticJsonScorer(name=name))
\ No newline at end of file
+    register(name, StaticJsonScorer(name=name))
diff --git a/src/evaluation/tests/test_static_json_scorer.py b/src/evaluation/tests/test_static_json_scorer.py
index 175a320c..2ce9c7b2 100644
--- a/src/evaluation/tests/test_static_json_scorer.py
+++ b/src/evaluation/tests/test_static_json_scorer.py
@@ -1,10 +1,13 @@
+from evaluation.models import Scenario
 from evaluation.scorers.static_json import (
+    StaticJsonScorer,
     evaluate_static_json,
     evaluate_static_json_batch,
     flatten_answer,
     parse_structured_answer,
 )
 
+
 def test_parse_json_object_from_noisy_markdown_answer():
     raw = 'Answer:\n```json\n{"energy": 3, "material": 12}\n```'
 
@@ -122,11 +125,6 @@ def test_batch_evaluation():
     assert result["strict_exact_match_accuracy"] == 0.5
 
 
-
-from evaluation.models import Scenario
-from evaluation.scorers.static_json import StaticJsonScorer
-
-
 def test_static_json_scorer_wrapper_exact_match():
     scenario = Scenario.from_raw(
         {
@@ -147,4 +145,63 @@ def test_static_json_scorer_wrapper_exact_match():
     assert result.scorer == "static_json"
     assert result.passed is True
     assert result.score == 1.0
-    assert result.details["strict_exact_match_accuracy"] == 1.0
\ No newline at end of file
+    assert result.details["strict_exact_match_accuracy"] == 1.0
+
+
+def test_static_json_scorer_abstention():
+    scenario = Scenario.from_raw(
+        {
+            "id": "12",
+            "text": "Which machine should be prioritized?",
+            "expected_answer": '{"machine": "Motor_B", "severity": "Zone_D"}',
+            "scoring_method": "static_json",
+        }
+    )
+    scorer = StaticJsonScorer()
+    result = scorer(scenario, "I don't know the answer.", "")
+    assert result.passed is False
+    assert result.score == 0.0
+    assert result.abstained is True
+    assert result.rationale == "agent abstained from answering"
+
+
+def test_static_json_scorer_comparison_match():
+    scenario = Scenario.from_raw(
+        {
+            "id": "13",
+            "text": "Which machine should be prioritized?",
+            "expected_answer": '{"machine": "Motor_B", "severity": "Zone_D"}',
+            "scoring_method": "static_json",
+        }
+    )
+    scorer = StaticJsonScorer()
+    result = scorer(
+        scenario,
+        "Motor_B should be prioritized because it is in Zone_D severity.",
+        "",
+    )
+    assert result.passed is False
+    assert result.score == 0.5
+    assert result.abstained is False
+    assert "comparison match" in result.rationale
+
+
+def test_static_json_scorer_wrong_comparison():
+    scenario = Scenario.from_raw(
+        {
+            "id": "14",
+            "text": "Which machine should be prioritized?",
+            "expected_answer": '{"machine": "Motor_B", "severity": "Zone_D"}',
+            "scoring_method": "static_json",
+        }
+    )
+    scorer = StaticJsonScorer()
+    result = scorer(
+        scenario,
+        "Motor_A is the machine to prioritize because it is in Zone_C.",
+        "",
+    )
+    assert result.passed is False
+    assert result.score == 0.0
+    assert result.abstained is False
+    assert "structured answer differs" in result.rationale