Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
4 changes: 1 addition & 3 deletions .github/workflows/validate-evaluators.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -20,9 +20,7 @@ jobs:

- name: Install dependencies
run: |
pip install pyyaml
# TODO: switch to `pip install agentevals-grader-sdk` once published to PyPI
pip install "agentevals-grader-sdk @ git+https://github.com/agentevals-dev/agentevals.git#subdirectory=packages/grader-sdk-py"
pip install pyyaml agentevals-evaluator-sdk

- name: Discover and validate all evaluators
run: |
Expand Down
1 change: 1 addition & 0 deletions .gitignore
Original file line number Diff line number Diff line change
@@ -0,0 +1 @@
.venv/
4 changes: 2 additions & 2 deletions README.md
Original file line number Diff line number Diff line change
Expand Up @@ -107,13 +107,13 @@ author: your-github-username
Run the validation script to catch issues before submitting:

```bash
pip install agentevals-grader-sdk pyyaml
pip install pyyaml agentevals-evaluator-sdk
python scripts/validate_evaluator.py evaluators/my_evaluator
```

This checks:
- **Manifest schema** -- required fields, entrypoint exists, name matches directory
- **Syntax and imports** -- compiles cleanly, uses `@grader` decorator
- **Syntax and imports** -- compiles cleanly, uses `@evaluator` decorator
- **Smoke run** -- runs the evaluator with synthetic input and validates the `EvalResult` output (correct types for `score`, `details`, `status`, etc.)

You can also test with a full eval run:
Expand Down
59 changes: 59 additions & 0 deletions evaluators/contains/contains.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,59 @@
"""Substring containment evaluator.

Scores each invocation 1.0 if final_response contains the configured substring,
otherwise 0.0.

Config:
substring (str): Required. If omitted or empty after stripping, returns NOT_EVALUATED.
case_insensitive (bool, default False): Lowercase both sides before the containment check.

Usage in eval_config.yaml:
config:
substring: "expected phrase"
"""

from __future__ import annotations

from agentevals_evaluator_sdk import EvalInput, EvalResult, EvalStatus, evaluator


@evaluator
def contains(input: EvalInput) -> EvalResult:
    """Score each invocation 1.0 if its final response contains the configured substring.

    Config:
        substring (str): Required; leading/trailing whitespace is stripped.
            If missing or empty, returns NOT_EVALUATED for every invocation.
        case_insensitive (bool, default False): Lowercase both the substring
            and each response before the containment check.

    Returns:
        EvalResult whose overall score is the mean of the per-invocation
        0.0/1.0 scores, with one "issues" entry per failing invocation.
    """
    substring = (input.config.get("substring") or "").strip()
    n = len(input.invocations)
    if not substring:
        # Cannot evaluate without a target substring; leave every invocation unscored.
        return EvalResult(
            score=0.0,
            status=EvalStatus.NOT_EVALUATED,
            per_invocation_scores=[None] * n,
            details={"reason": "missing config: substring"},
        )

    case_insensitive = bool(input.config.get("case_insensitive", False))
    # Normalize the needle once up front; responses are normalized in the loop.
    needle = substring.lower() if case_insensitive else substring

    scores: list[float] = []
    issues: list[str] = []

    for inv in input.invocations:
        haystack = inv.final_response or ""
        if case_insensitive:
            haystack = haystack.lower()
        if needle in haystack:
            scores.append(1.0)
        else:
            scores.append(0.0)
            issues.append(f"{inv.invocation_id}: response does not contain {substring!r}")

    # Mean per-invocation score; 0.0 when there are no invocations at all.
    overall = sum(scores) / len(scores) if scores else 0.0
    return EvalResult(
        score=overall,
        per_invocation_scores=scores,
        details={"issues": issues} if issues else None,
    )


if __name__ == "__main__":
contains.run()
6 changes: 6 additions & 0 deletions evaluators/contains/evaluator.yaml
Original file line number Diff line number Diff line change
@@ -0,0 +1,6 @@
name: contains
description: Scores whether each final response contains a configured substring (case-sensitive or case-insensitive)
language: python
entrypoint: contains.py
tags: [string, contains]
author: agentevals-dev
60 changes: 60 additions & 0 deletions evaluators/equals/equals.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,60 @@
"""Exact string match evaluator.

Config:
expected (str): Required. If omitted, returns NOT_EVALUATED.
case_insensitive (bool, default False): Lowercase both strings before comparing.
strip (bool, default True): Strip whitespace before compare.

Usage:
config:
expected: "4"
"""

from __future__ import annotations

from agentevals_evaluator_sdk import EvalInput, EvalResult, EvalStatus, evaluator


@evaluator
def equals(input: EvalInput) -> EvalResult:
    """Score each invocation 1.0 when its final response equals the expected string.

    Config:
        expected (str): Required. If omitted, returns NOT_EVALUATED.
        case_insensitive (bool, default False): Lowercase before comparing.
        strip (bool, default True): Strip surrounding whitespace before comparing.
    """
    expected = input.config.get("expected")
    if expected is None:
        # No reference string configured; nothing can be scored.
        count = len(input.invocations)
        return EvalResult(
            score=0.0,
            status=EvalStatus.NOT_EVALUATED,
            per_invocation_scores=[None] * count,
            details={"reason": "missing config: expected"},
        )

    fold_case = bool(input.config.get("case_insensitive", False))
    trim = bool(input.config.get("strip", True))

    def canon(text: str) -> str:
        # Apply the configured normalization steps: strip first, then lowercase.
        if trim:
            text = text.strip()
        return text.lower() if fold_case else text

    target = canon(str(expected))
    per_scores: list[float] = []
    problems: list[str] = []

    for inv in input.invocations:
        raw = inv.final_response or ""
        if canon(raw) == target:
            per_scores.append(1.0)
        else:
            per_scores.append(0.0)
            problems.append(
                f"{inv.invocation_id}: expected {expected!r}, got {inv.final_response!r}"
            )

    mean_score = sum(per_scores) / len(per_scores) if per_scores else 0.0
    return EvalResult(
        score=mean_score,
        per_invocation_scores=per_scores,
        details={"issues": problems} if problems else None,
    )


if __name__ == "__main__":
equals.run()
6 changes: 6 additions & 0 deletions evaluators/equals/evaluator.yaml
Original file line number Diff line number Diff line change
@@ -0,0 +1,6 @@
name: equals
description: Scores whether each final response exactly matches a configured expected string
language: python
entrypoint: equals.py
tags: [string, equals]
author: agentevals-dev
6 changes: 6 additions & 0 deletions evaluators/is_json/evaluator.yaml
Original file line number Diff line number Diff line change
@@ -0,0 +1,6 @@
name: is_json
description: Scores whether each final response parses as JSON (optional markdown code fence extraction)
language: python
entrypoint: is_json.py
tags: [json, structured]
author: agentevals-dev
56 changes: 56 additions & 0 deletions evaluators/is_json/is_json.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,56 @@
"""JSON parse check evaluator.

Tries to parse final_response as JSON. Optionally extracts fenced ```json ... ``` blocks.

Config:
extract_markdown_fence (bool, default True): Strip ```json fences if present.

Usage:
config:
extract_markdown_fence: true
"""

from __future__ import annotations

import json
import re

from agentevals_evaluator_sdk import EvalInput, EvalResult, evaluator

_FENCE = re.compile(r"^```(?:json)?\s*\n?(.*?)\n?```\s*$", re.DOTALL | re.IGNORECASE)


def _parse_json_payload(text: str, extract_fence: bool) -> object:
raw = (text or "").strip()
if extract_fence:
m = _FENCE.match(raw)
if m:
raw = m.group(1).strip()
return json.loads(raw)


@evaluator
def is_json(input: EvalInput) -> EvalResult:
    """Score each invocation 1.0 when its final response parses as JSON.

    Config:
        extract_markdown_fence (bool, default True): Unwrap a ```json fence
            around the response before attempting to parse it.
    """
    unwrap_fence = bool(input.config.get("extract_markdown_fence", True))

    per_scores: list[float] = []
    problems: list[str] = []

    for inv in input.invocations:
        try:
            _parse_json_payload(inv.final_response or "", unwrap_fence)
        except (json.JSONDecodeError, TypeError, ValueError) as exc:
            per_scores.append(0.0)
            problems.append(f"{inv.invocation_id}: not valid JSON ({exc})")
        else:
            # Successful parse; the decoded value itself is not needed.
            per_scores.append(1.0)

    overall = sum(per_scores) / len(per_scores) if per_scores else 0.0
    return EvalResult(
        score=overall,
        per_invocation_scores=per_scores,
        details={"issues": problems} if problems else None,
    )


if __name__ == "__main__":
is_json.run()
6 changes: 6 additions & 0 deletions evaluators/levenshtein_ratio/evaluator.yaml
Original file line number Diff line number Diff line change
@@ -0,0 +1,6 @@
name: levenshtein_ratio
description: Scores similarity of each response to a reference string using normalized Levenshtein distance
language: python
entrypoint: levenshtein_ratio.py
tags: [string, levenshtein]
author: agentevals-dev
82 changes: 82 additions & 0 deletions evaluators/levenshtein_ratio/levenshtein_ratio.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,82 @@
"""Normalized Levenshtein similarity evaluator.

Score for an invocation is 1.0 - (edit_distance / max(len(a), len(b), 1)), clamped to [0, 1].

Config:
expected (str): Required. If omitted, returns NOT_EVALUATED.
case_insensitive (bool, default False): Compare lowercased strings.

Usage:
config:
expected: "reference answer"
"""

from __future__ import annotations

from agentevals_evaluator_sdk import EvalInput, EvalResult, EvalStatus, evaluator


def _levenshtein(a: str, b: str) -> int:
"""Classic O(nm) edit distance."""
if len(a) < len(b):
a, b = b, a
if not b:
return len(a)
prev = list(range(len(b) + 1))
for i, ca in enumerate(a):
cur = [i + 1]
for j, cb in enumerate(b):
ins = prev[j + 1] + 1
delete = cur[j] + 1
sub = prev[j] + (ca != cb)
cur.append(min(ins, delete, sub))
prev = cur
return prev[-1]


@evaluator
def levenshtein_ratio(input: EvalInput) -> EvalResult:
    """Score each invocation by normalized Levenshtein similarity to `expected`.

    Per-invocation score is 1.0 - distance / max(len(a), len(b), 1), clamped
    to [0, 1].

    Config:
        expected (str): Required. If omitted, returns NOT_EVALUATED.
        case_insensitive (bool, default False): Lowercase before comparing.
    """
    expected = input.config.get("expected")
    if expected is None:
        count = len(input.invocations)
        return EvalResult(
            score=0.0,
            status=EvalStatus.NOT_EVALUATED,
            per_invocation_scores=[None] * count,
            details={"reason": "missing config: expected"},
        )

    fold_case = bool(input.config.get("case_insensitive", False))
    reference = str(expected).lower() if fold_case else str(expected)

    per_scores: list[float] = []
    rows: list[dict] = []

    for inv in input.invocations:
        response = inv.final_response or ""
        candidate = response.lower() if fold_case else response
        distance = _levenshtein(candidate, reference)
        # Normalize by the longer string (floor of 1 avoids division by zero),
        # then clamp the similarity into [0, 1].
        longest = max(len(candidate), len(reference), 1)
        similarity = min(1.0, max(0.0, 1.0 - distance / longest))
        per_scores.append(similarity)
        rows.append(
            {
                "invocation_id": inv.invocation_id,
                "distance": distance,
                "similarity": similarity,
            }
        )

    overall = sum(per_scores) / len(per_scores) if per_scores else 0.0
    return EvalResult(
        score=overall,
        per_invocation_scores=per_scores,
        details={"per_invocation": rows},
    )


if __name__ == "__main__":
levenshtein_ratio.run()
6 changes: 6 additions & 0 deletions evaluators/regex_match/evaluator.yaml
Original file line number Diff line number Diff line change
@@ -0,0 +1,6 @@
name: regex_match
description: Scores whether each final response matches a configured regular expression
language: python
entrypoint: regex_match.py
tags: [regex]
author: agentevals-dev
Loading
Loading