Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
1 change: 1 addition & 0 deletions src/evaluation/models.py
Original file line number Diff line number Diff line change
Expand Up @@ -77,6 +77,7 @@ class ScorerResult(BaseModel):
scorer: str
passed: bool
score: float = 0.0
abstained: bool = False
rationale: str = ""
details: dict[str, Any] = Field(default_factory=dict)

Expand Down
78 changes: 65 additions & 13 deletions src/evaluation/scorers/static_json.py
Original file line number Diff line number Diff line change
Expand Up @@ -114,9 +114,7 @@ def _extract_balanced_structure(content: str) -> str:
(content.find("("), "(", ")"),
]
candidates = [
(idx, open_ch, close_ch)
for idx, open_ch, close_ch in candidates
if idx != -1
(idx, open_ch, close_ch) for idx, open_ch, close_ch in candidates if idx != -1
]

if not candidates:
Expand Down Expand Up @@ -367,9 +365,7 @@ def evaluate_static_json(
precision = exact_matches / total_model_keys if total_model_keys else 0.0
recall = exact_matches / total_gold_keys if total_gold_keys else 0.0
f1 = (
2 * precision * recall / (precision + recall)
if precision + recall > 0
else 0.0
2 * precision * recall / (precision + recall) if precision + recall > 0 else 0.0
)

partial_exact = exact_matches / total_gold_keys if total_gold_keys else 0.0
Expand All @@ -392,6 +388,7 @@ def evaluate_static_json(
details=details,
)


def evaluate_static_json_batch(
pairs: list[tuple[Any, Any]],
*,
Expand Down Expand Up @@ -439,6 +436,30 @@ def evaluate_static_json_batch(
"examples": [score.to_dict() for score in scores],
}


def is_abstained(answer: Any) -> bool:
    """Detect if the answer is empty or explicitly abstains.

    An answer counts as abstained when it is ``None``, when its string form
    is blank, when the text returned by ``extract_answer_text`` is blank, or
    when either of those texts contains one of the known abstention phrases
    (case-insensitive substring match).

    Typographic apostrophes (U+2018 / U+2019) are folded to ``'`` before
    matching so that "I don’t know" is recognised the same way as
    "I don't know".

    Args:
        answer: The agent's answer; any object, converted with ``str()``.

    Returns:
        ``True`` if the answer is empty or contains an abstention phrase,
        ``False`` otherwise.
    """
    if answer is None:
        return True

    # Immutable tuple: the phrase set is fixed and only ever iterated.
    abstention_phrases = (
        "i don't know",
        "i do not know",
        "i cannot answer",
        "i can't answer",
        "not enough information",
        "unable to determine",
        "cannot determine",
    )

    def _normalize(text: str) -> str:
        # Fold curly apostrophes so contractions match the ASCII phrases above.
        folded = text.replace("\u2019", "'").replace("\u2018", "'")
        return folded.strip().lower()

    raw = _normalize(str(answer))
    if not raw:
        return True
    # A hit in the raw text already decides the outcome, so the extraction
    # helper is only consulted when the raw text is inconclusive.
    if any(phrase in raw for phrase in abstention_phrases):
        return True

    content = _normalize(extract_answer_text(answer))
    if not content:
        return True
    return any(phrase in content for phrase in abstention_phrases)


class StaticJsonScorer:
"""Evaluation scorer wrapper for the trajectory-based pipeline."""

Expand Down Expand Up @@ -466,20 +487,51 @@ def __call__(

static_score = evaluate_static_json(gold_answer, answer)
passed = static_score.strict_exact_match_accuracy == 1.0
score = round(static_score.f1, 3)
abstained = False
rationale = (
"strict structured match"
if passed
else "structured answer differs from ground truth"
)

if not passed and static_score.exact_value_matches == 0:
if is_abstained(answer):
abstained = True
rationale = "agent abstained from answering"
else:
gold_flat = flatten_answer(gold_answer)
if gold_flat and len(gold_flat) <= 2:
raw_answer_lower = str(answer).lower()
values_found = sum(
1
for val in gold_flat.values()
if str(val).lower() in raw_answer_lower
)
if values_found == len(gold_flat):
score = 0.5
rationale = "comparison match (values found in text but structure missing)"
static_score.details.append(
KeyComparison(
key="comparison_match",
gold_value=str(list(gold_flat.values())),
model_value="found in text",
exact=False,
match_type="comparison",
similarity=0.5,
)
)

return ScorerResult(
scorer=self.name,
passed=passed,
score=round(static_score.f1, 3),
rationale=(
"strict structured match"
if passed
else "structured answer differs from ground truth"
),
score=score,
abstained=abstained,
rationale=rationale,
details=static_score.to_dict(),
)


def install(name: str = "static_json") -> None:
    """Register the static JSON scorer.

    Args:
        name: Key under which the scorer is registered; it is also passed to
            ``StaticJsonScorer`` as its ``name``.
    """
    # Register exactly once; a second identical call would only build and
    # register another scorer instance under the same name.
    register(name, StaticJsonScorer(name=name))
69 changes: 63 additions & 6 deletions src/evaluation/tests/test_static_json_scorer.py
Original file line number Diff line number Diff line change
@@ -1,10 +1,13 @@
from evaluation.models import Scenario
from evaluation.scorers.static_json import (
StaticJsonScorer,
evaluate_static_json,
evaluate_static_json_batch,
flatten_answer,
parse_structured_answer,
)


def test_parse_json_object_from_noisy_markdown_answer():
raw = 'Answer:\n```json\n{"energy": 3, "material": 12}\n```'

Expand Down Expand Up @@ -122,11 +125,6 @@ def test_batch_evaluation():
assert result["strict_exact_match_accuracy"] == 0.5



from evaluation.models import Scenario
from evaluation.scorers.static_json import StaticJsonScorer


def test_static_json_scorer_wrapper_exact_match():
scenario = Scenario.from_raw(
{
Expand All @@ -147,4 +145,63 @@ def test_static_json_scorer_wrapper_exact_match():
assert result.scorer == "static_json"
assert result.passed is True
assert result.score == 1.0
assert result.details["strict_exact_match_accuracy"] == 1.0
assert result.details["strict_exact_match_accuracy"] == 1.0


def test_static_json_scorer_abstention():
    """An explicit "I don't know" reply is reported as an abstention."""
    raw_scenario = {
        "id": "12",
        "text": "Which machine should be prioritized?",
        "expected_answer": '{"machine": "Motor_B", "severity": "Zone_D"}',
        "scoring_method": "static_json",
    }
    scenario = Scenario.from_raw(raw_scenario)
    score_answer = StaticJsonScorer()

    outcome = score_answer(scenario, "I don't know the answer.", "")

    # Abstaining is neither a pass nor partial credit, but it is flagged.
    assert outcome.passed is False
    assert outcome.score == 0.0
    assert outcome.abstained is True
    assert outcome.rationale == "agent abstained from answering"


def test_static_json_scorer_comparison_match():
    """Free-text reply containing every gold value earns the 0.5 comparison score."""
    raw_scenario = {
        "id": "13",
        "text": "Which machine should be prioritized?",
        "expected_answer": '{"machine": "Motor_B", "severity": "Zone_D"}',
        "scoring_method": "static_json",
    }
    scenario = Scenario.from_raw(raw_scenario)
    score_answer = StaticJsonScorer()
    # Both gold values appear in the sentence, but not as structured output.
    free_text_reply = "Motor_B should be prioritized because it is in Zone_D severity."

    outcome = score_answer(scenario, free_text_reply, "")

    assert outcome.passed is False
    assert outcome.score == 0.5
    assert outcome.abstained is False
    assert "comparison match" in outcome.rationale


def test_static_json_scorer_wrong_comparison():
    """Free-text reply naming the wrong values gets no credit and is not an abstention."""
    raw_scenario = {
        "id": "14",
        "text": "Which machine should be prioritized?",
        "expected_answer": '{"machine": "Motor_B", "severity": "Zone_D"}',
        "scoring_method": "static_json",
    }
    scenario = Scenario.from_raw(raw_scenario)
    score_answer = StaticJsonScorer()
    # Neither gold value (Motor_B, Zone_D) occurs in this reply.
    wrong_reply = "Motor_A is the machine to prioritize because it is in Zone_C."

    outcome = score_answer(scenario, wrong_reply, "")

    assert outcome.passed is False
    assert outcome.score == 0.0
    assert outcome.abstained is False
    assert "structured answer differs" in outcome.rationale