diff --git a/.github/workflows/validate-evaluators.yaml b/.github/workflows/validate-evaluators.yaml index 565286b..f494cb2 100644 --- a/.github/workflows/validate-evaluators.yaml +++ b/.github/workflows/validate-evaluators.yaml @@ -20,9 +20,7 @@ jobs: - name: Install dependencies run: | - pip install pyyaml - # TODO: switch to `pip install agentevals-grader-sdk` once published to PyPI - pip install "agentevals-grader-sdk @ git+https://github.com/agentevals-dev/agentevals.git#subdirectory=packages/grader-sdk-py" + pip install pyyaml agentevals-evaluator-sdk - name: Discover and validate all evaluators run: | diff --git a/.gitignore b/.gitignore new file mode 100644 index 0000000..0cafc1c --- /dev/null +++ b/.gitignore @@ -0,0 +1 @@ +.venv/ \ No newline at end of file diff --git a/README.md b/README.md index c94955a..0a7fac5 100644 --- a/README.md +++ b/README.md @@ -107,13 +107,13 @@ author: your-github-username Run the validation script to catch issues before submitting: ```bash -pip install agentevals-grader-sdk pyyaml +pip install pyyaml agentevals-evaluator-sdk python scripts/validate_evaluator.py evaluators/my_evaluator ``` This checks: - **Manifest schema** -- required fields, entrypoint exists, name matches directory -- **Syntax and imports** -- compiles cleanly, uses `@grader` decorator +- **Syntax and imports** -- compiles cleanly, uses `@evaluator` decorator - **Smoke run** -- runs the evaluator with synthetic input and validates the `EvalResult` output (correct types for `score`, `details`, `status`, etc.) You can also test with a full eval run: diff --git a/evaluators/contains/contains.py b/evaluators/contains/contains.py new file mode 100644 index 0000000..cbe6eca --- /dev/null +++ b/evaluators/contains/contains.py @@ -0,0 +1,59 @@ +"""Substring containment evaluator. + +Scores each invocation 1.0 if final_response contains the configured substring, +otherwise 0.0. + +Config: + substring (str): Required. If omitted, returns NOT_EVALUATED. 
"""Substring containment evaluator.

Scores each invocation 1.0 if final_response contains the configured substring,
otherwise 0.0.

Config:
    substring (str): Required. If omitted, returns NOT_EVALUATED.
    case_insensitive (bool, default False): Compare lowercased strings.

Usage in eval_config.yaml:
    config:
      substring: "expected phrase"
"""

from __future__ import annotations

from agentevals_evaluator_sdk import EvalInput, EvalResult, EvalStatus, evaluator


@evaluator
def contains(input: EvalInput) -> EvalResult:
    """Score 1.0 per invocation whose final_response contains the substring."""
    substring = (input.config.get("substring") or "").strip()
    n = len(input.invocations)
    if not substring:
        # No substring configured: opt out rather than report a failing score.
        return EvalResult(
            score=0.0,
            status=EvalStatus.NOT_EVALUATED,
            per_invocation_scores=[None] * n,
            details={"reason": "missing config: substring"},
        )

    case_insensitive = bool(input.config.get("case_insensitive", False))
    # Normalize the needle once up front; each haystack is normalized in the loop.
    needle = substring.lower() if case_insensitive else substring

    scores: list[float] = []
    issues: list[str] = []

    for inv in input.invocations:
        haystack = inv.final_response or ""
        if case_insensitive:
            haystack = haystack.lower()
        if needle in haystack:
            scores.append(1.0)
        else:
            scores.append(0.0)
            issues.append(f"{inv.invocation_id}: response does not contain {substring!r}")

    overall = sum(scores) / len(scores) if scores else 0.0
    return EvalResult(
        score=overall,
        per_invocation_scores=scores,
        details={"issues": issues} if issues else None,
    )


if __name__ == "__main__":
    contains.run()
"""Exact string match evaluator.

Config:
    expected (str): Required. If omitted, returns NOT_EVALUATED.
    case_insensitive (bool, default False): Compare normalized strings.
    strip (bool, default True): Strip whitespace before compare.

Usage:
    config:
      expected: "4"
"""

from __future__ import annotations

from agentevals_evaluator_sdk import EvalInput, EvalResult, EvalStatus, evaluator


@evaluator
def equals(input: EvalInput) -> EvalResult:
    """Score 1.0 per invocation whose final_response equals `expected`."""
    expected = input.config.get("expected")
    if expected is None:
        # No reference value to compare against: opt out rather than fail.
        n = len(input.invocations)
        return EvalResult(
            score=0.0,
            status=EvalStatus.NOT_EVALUATED,
            per_invocation_scores=[None] * n,
            details={"reason": "missing config: expected"},
        )

    case_insensitive = bool(input.config.get("case_insensitive", False))
    strip = bool(input.config.get("strip", True))

    def norm(s: str) -> str:
        # Apply the configured normalization (strip, then case fold) before compare.
        t = s.strip() if strip else s
        return t.lower() if case_insensitive else t

    exp = norm(str(expected))
    scores: list[float] = []
    issues: list[str] = []

    for inv in input.invocations:
        got = norm(inv.final_response or "")
        if got == exp:
            scores.append(1.0)
        else:
            scores.append(0.0)
            issues.append(
                f"{inv.invocation_id}: expected {expected!r}, got {inv.final_response!r}"
            )

    overall = sum(scores) / len(scores) if scores else 0.0
    return EvalResult(
        score=overall,
        per_invocation_scores=scores,
        details={"issues": issues} if issues else None,
    )


if __name__ == "__main__":
    equals.run()
b/evaluators/is_json/evaluator.yaml @@ -0,0 +1,6 @@ +name: is_json +description: Scores whether each final response parses as JSON (optional markdown code fence extraction) +language: python +entrypoint: is_json.py +tags: [json, structured] +author: agentevals-dev diff --git a/evaluators/is_json/is_json.py b/evaluators/is_json/is_json.py new file mode 100644 index 0000000..deb4980 --- /dev/null +++ b/evaluators/is_json/is_json.py @@ -0,0 +1,56 @@ +"""JSON parse check evaluator. + +Tries to parse final_response as JSON. Optionally extracts fenced ```json ... ``` blocks. + +Config: + extract_markdown_fence (bool, default True): Strip ```json fences if present. + +Usage: + config: + extract_markdown_fence: true +""" + +from __future__ import annotations + +import json +import re + +from agentevals_evaluator_sdk import EvalInput, EvalResult, evaluator + +_FENCE = re.compile(r"^```(?:json)?\s*\n?(.*?)\n?```\s*$", re.DOTALL | re.IGNORECASE) + + +def _parse_json_payload(text: str, extract_fence: bool) -> object: + raw = (text or "").strip() + if extract_fence: + m = _FENCE.match(raw) + if m: + raw = m.group(1).strip() + return json.loads(raw) + + +@evaluator +def is_json(input: EvalInput) -> EvalResult: + extract_fence = bool(input.config.get("extract_markdown_fence", True)) + + scores: list[float] = [] + issues: list[str] = [] + + for inv in input.invocations: + try: + _parse_json_payload(inv.final_response or "", extract_fence) + scores.append(1.0) + except (json.JSONDecodeError, TypeError, ValueError) as exc: + scores.append(0.0) + issues.append(f"{inv.invocation_id}: not valid JSON ({exc})") + + overall = sum(scores) / len(scores) if scores else 0.0 + return EvalResult( + score=overall, + per_invocation_scores=scores, + details={"issues": issues} if issues else None, + ) + + +if __name__ == "__main__": + is_json.run() diff --git a/evaluators/levenshtein_ratio/evaluator.yaml b/evaluators/levenshtein_ratio/evaluator.yaml new file mode 100644 index 0000000..a9263db --- 
"""Normalized Levenshtein similarity evaluator.

Score for an invocation is 1.0 - (edit_distance / max(len(a), len(b), 1)), clamped to [0, 1].

Config:
    expected (str): Required. If omitted, returns NOT_EVALUATED.
    case_insensitive (bool, default False): Compare lowercased strings.

Usage:
    config:
      expected: "reference answer"
"""

from __future__ import annotations

from agentevals_evaluator_sdk import EvalInput, EvalResult, EvalStatus, evaluator


def _levenshtein(a: str, b: str) -> int:
    """Classic O(nm) edit distance (two-row dynamic programming)."""
    # Keep the shorter string in the inner dimension to minimize row size.
    if len(a) < len(b):
        a, b = b, a
    if not b:
        return len(a)
    row = list(range(len(b) + 1))
    for i, ch_a in enumerate(a, start=1):
        next_row = [i]
        for j, ch_b in enumerate(b):
            cost_insert = row[j + 1] + 1
            cost_delete = next_row[j] + 1
            cost_substitute = row[j] + (1 if ch_a != ch_b else 0)
            next_row.append(min(cost_insert, cost_delete, cost_substitute))
        row = next_row
    return row[-1]


@evaluator
def levenshtein_ratio(input: EvalInput) -> EvalResult:
    """Score each response by normalized Levenshtein similarity to `expected`."""
    expected = input.config.get("expected")
    if expected is None:
        # No reference string configured: opt out rather than fail.
        count = len(input.invocations)
        return EvalResult(
            score=0.0,
            status=EvalStatus.NOT_EVALUATED,
            per_invocation_scores=[None] * count,
            details={"reason": "missing config: expected"},
        )

    fold_case = bool(input.config.get("case_insensitive", False))
    reference = str(expected).lower() if fold_case else str(expected)

    similarities: list[float] = []
    rows: list[dict] = []

    for inv in input.invocations:
        response = inv.final_response or ""
        if fold_case:
            response = response.lower()
        distance = _levenshtein(response, reference)
        # Normalize by the longer string (floor 1 avoids division by zero).
        similarity = 1.0 - distance / max(len(response), len(reference), 1)
        similarity = min(1.0, max(0.0, similarity))
        similarities.append(similarity)
        rows.append(
            {
                "invocation_id": inv.invocation_id,
                "distance": distance,
                "similarity": similarity,
            }
        )

    overall = sum(similarities) / len(similarities) if similarities else 0.0
    return EvalResult(
        score=overall,
        per_invocation_scores=similarities,
        details={"per_invocation": rows},
    )


if __name__ == "__main__":
    levenshtein_ratio.run()
+ +Usage: + config: + pattern: "^The answer" + flags: IGNORECASE +""" + +from __future__ import annotations + +import re + +from agentevals_evaluator_sdk import EvalInput, EvalResult, EvalStatus, evaluator + +_FLAG_MAP = { + "IGNORECASE": re.IGNORECASE, + "MULTILINE": re.MULTILINE, + "DOTALL": re.DOTALL, +} + + +@evaluator +def regex_match(input: EvalInput) -> EvalResult: + pattern = input.config.get("pattern") + n = len(input.invocations) + if not pattern: + return EvalResult( + score=0.0, + status=EvalStatus.NOT_EVALUATED, + per_invocation_scores=[None] * n, + details={"reason": "missing config: pattern"}, + ) + + flag_names = input.config.get("flags") + flags = 0 + if isinstance(flag_names, str): + for part in flag_names.replace("|", ",").split(","): + key = part.strip().upper() + if key in _FLAG_MAP: + flags |= _FLAG_MAP[key] + elif isinstance(flag_names, list): + for part in flag_names: + key = str(part).strip().upper() + if key in _FLAG_MAP: + flags |= _FLAG_MAP[key] + + try: + rx = re.compile(str(pattern), flags) + except re.error as exc: + return EvalResult( + score=0.0, + status=EvalStatus.NOT_EVALUATED, + per_invocation_scores=[None] * n, + details={"reason": "invalid regex pattern", "error": str(exc)}, + ) + + scores: list[float] = [] + issues: list[str] = [] + + for inv in input.invocations: + text = inv.final_response or "" + if rx.search(text): + scores.append(1.0) + else: + scores.append(0.0) + issues.append(f"{inv.invocation_id}: no match for pattern {pattern!r}") + + overall = sum(scores) / len(scores) if scores else 0.0 + return EvalResult( + score=overall, + per_invocation_scores=scores, + details={"issues": issues} if issues else None, + ) + + +if __name__ == "__main__": + regex_match.run() diff --git a/evaluators/tool_coverage/tool_coverage.py b/evaluators/tool_coverage/tool_coverage.py index 3bc7a5b..7a17d38 100644 --- a/evaluators/tool_coverage/tool_coverage.py +++ b/evaluators/tool_coverage/tool_coverage.py @@ -18,7 +18,7 @@ def 
tool_coverage(input: EvalInput) -> EvalResult: details: list[str] = [] for inv in input.invocations: - actual = len(inv.tool_calls) + actual = len(inv.intermediate_steps.tool_calls) if actual >= min_calls: scores.append(1.0) else: diff --git a/evaluators/tool_sequence_match/evaluator.yaml b/evaluators/tool_sequence_match/evaluator.yaml new file mode 100644 index 0000000..b58999c --- /dev/null +++ b/evaluators/tool_sequence_match/evaluator.yaml @@ -0,0 +1,6 @@ +name: tool_sequence_match +description: Scores whether tool calls match an expected list of tool names (order-sensitive or multiset) +language: python +entrypoint: tool_sequence_match.py +tags: [tools, trajectory] +author: agentevals-dev diff --git a/evaluators/tool_sequence_match/tool_sequence_match.py b/evaluators/tool_sequence_match/tool_sequence_match.py new file mode 100644 index 0000000..a729707 --- /dev/null +++ b/evaluators/tool_sequence_match/tool_sequence_match.py @@ -0,0 +1,78 @@ +"""Expected tool call sequence evaluator. + +Compares the ordered list of tool names in each invocation to config. + +Config: + expected_tool_names (list[str]): Required non-empty. Otherwise returns NOT_EVALUATED. + require_order (bool, default True): If False, compares multisets (same counts per name). 
+ +Usage: + config: + expected_tool_names: ["search", "calculator"] + require_order: true +""" + +from __future__ import annotations + +from collections import Counter + +from agentevals_evaluator_sdk import EvalInput, EvalResult, EvalStatus, evaluator + + +@evaluator +def tool_sequence_match(input: EvalInput) -> EvalResult: + expected = input.config.get("expected_tool_names") + n = len(input.invocations) + if expected is None or not isinstance(expected, list): + return EvalResult( + score=0.0, + status=EvalStatus.NOT_EVALUATED, + per_invocation_scores=[None] * n, + details={"reason": "missing or invalid config: expected_tool_names (need a list of names)"}, + ) + if not expected: + return EvalResult( + score=0.0, + status=EvalStatus.NOT_EVALUATED, + per_invocation_scores=[None] * n, + details={"reason": "missing or empty config: expected_tool_names"}, + ) + + want = [str(x) for x in expected] + require_order = bool(input.config.get("require_order", True)) + + scores: list[float] = [] + issues: list[str] = [] + + for inv in input.invocations: + actual = [] + for call in inv.intermediate_steps.tool_calls or []: + if isinstance(call, dict): + n = call.get("name") + if n is not None: + actual.append(str(n)) + + if require_order: + ok = actual == want + else: + ok = Counter(actual) == Counter(want) + + if ok: + scores.append(1.0) + else: + scores.append(0.0) + issues.append( + f"{inv.invocation_id}: expected {want!r}, got {actual!r} " + f"(require_order={require_order})" + ) + + overall = sum(scores) / len(scores) if scores else 0.0 + return EvalResult( + score=overall, + per_invocation_scores=scores, + details={"issues": issues} if issues else None, + ) + + +if __name__ == "__main__": + tool_sequence_match.run() diff --git a/scripts/test_input.json b/scripts/test_input.json index 79d41f5..7b5939c 100644 --- a/scripts/test_input.json +++ b/scripts/test_input.json @@ -7,8 +7,10 @@ "invocation_id": "ci-test-001", "user_content": "What is 2+2?", "final_response": "The 
answer is 4.", - "tool_calls": [{"name": "calculator", "args": {"expr": "2+2"}}], - "tool_responses": [{"name": "calculator", "output": "4"}] + "intermediate_steps": { + "tool_calls": [{"name": "calculator", "args": {"expr": "2+2"}}], + "tool_responses": [{"name": "calculator", "output": "4"}] + } } ], "expected_invocations": null diff --git a/scripts/validate_evaluator.py b/scripts/validate_evaluator.py index e80570e..13379e3 100644 --- a/scripts/validate_evaluator.py +++ b/scripts/validate_evaluator.py @@ -94,14 +94,14 @@ def validate_syntax(evaluator_dir: Path, manifest: dict) -> bool: _ok(f"Python syntax valid ({entry_path})") source = entry_path.read_text() - if "agentevals_grader_sdk" not in source: + if "agentevals_evaluator_sdk" not in source: _fail( - f"{entry_path} does not import agentevals_grader_sdk. " + f"{entry_path} does not import agentevals_evaluator_sdk. " f"Evaluators must use the SDK or implement the stdin/stdout protocol." ) return False - if "@grader" not in source: - _fail(f"{entry_path} does not use the @grader decorator") + if "@evaluator" not in source: + _fail(f"{entry_path} does not use the @evaluator decorator") return False if 'if __name__ == "__main__"' not in source and "if __name__ == '__main__'" not in source: _fail(f"{entry_path} missing 'if __name__ == \"__main__\"' block with .run() call") @@ -158,7 +158,7 @@ def validate_smoke_run(evaluator_dir: Path, manifest: dict) -> bool: if not stdout: stderr_preview = result.stderr.strip()[:500] _fail( - f"Evaluator produced no output on stdout" + "Evaluator produced no output on stdout" + (f"\n stderr: {stderr_preview}" if stderr_preview else "") ) return False @@ -208,10 +208,17 @@ def validate_smoke_run(evaluator_dir: Path, manifest: dict) -> bool: f"got {type(per_inv).__name__}" ) return False + for i, x in enumerate(per_inv): + if x is not None and not isinstance(x, (int, float)): + _fail( + f"'per_invocation_scores[{i}]' must be a number or null, " + f"got {type(x).__name__}" + ) + 
return False # Full Pydantic validation via the SDK if available try: - from agentevals_grader_sdk import EvalResult + from agentevals_evaluator_sdk import EvalResult EvalResult.model_validate(output) _ok("Output validates against EvalResult schema (Pydantic)") except ImportError: