Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
31 changes: 31 additions & 0 deletions .github/workflows/validate-graders.yaml
Original file line number Diff line number Diff line change
@@ -0,0 +1,31 @@
# CI workflow: validates every grader directory on pull requests that touch
# the graders or the validation tooling itself.
name: Validate graders

on:
  pull_request:
    # Only run when graders or the validation script/fixture change.
    paths:
      - "graders/**"
      - "scripts/validate_grader.py"
      - "scripts/test_input.json"

jobs:
  validate:
    runs-on: ubuntu-latest
    steps:
      - uses: actions/checkout@v4

      - name: Set up Python
        uses: actions/setup-python@v5
        with:
          python-version: "3.12"

      - name: Install dependencies
        run: pip install agentevals-grader-sdk pyyaml

      # Each immediate subdirectory of graders/ is one grader; pass them all
      # to the validation script in a single invocation.
      - name: Discover and validate all graders
        run: |
          grader_dirs=$(find graders -mindepth 1 -maxdepth 1 -type d | sort)
          if [ -z "$grader_dirs" ]; then
            echo "No grader directories found."
            exit 0
          fi
          python scripts/validate_grader.py $grader_dirs
20 changes: 16 additions & 4 deletions README.md
Original file line number Diff line number Diff line change
Expand Up @@ -99,15 +99,27 @@ tags: [quality, tools]
author: your-github-username
```

### 4. Test locally
### 4. Validate locally

Add it to an eval config as a local `type: code` grader and run it:
Run the validation script to catch issues before submitting:

```bash
pip install agentevals-grader-sdk pyyaml
python scripts/validate_grader.py graders/my_grader
```

This checks:
- **Manifest schema** -- required fields, entrypoint exists, name matches directory
- **Syntax and imports** -- compiles cleanly, uses `@grader` decorator
- **Smoke run** -- runs the grader with synthetic input and validates the `EvalResult` output (correct types for `score`, `details`, `status`, etc.)

You can also test with a full eval run:

```yaml
metrics:
- name: my_grader
type: code
path: ./my_grader/my_grader.py
path: ./graders/my_grader/my_grader.py
threshold: 0.5
```

Expand All @@ -133,7 +145,7 @@ graders/

3. Open a PR against `main`

A CI workflow will validate your `grader.yaml` manifest. Once merged, the workflow regenerates `index.yaml` automatically, and your grader becomes available to everyone via `agentevals grader list`.
CI will automatically validate your grader (manifest, syntax, and smoke run). Once merged, a separate workflow regenerates `index.yaml`, and your grader becomes available to everyone via `agentevals grader list`.

## Supported languages

Expand Down
2 changes: 1 addition & 1 deletion graders/peters_grader/peters_grader.py
Original file line number Diff line number Diff line change
Expand Up @@ -14,4 +14,4 @@

@grader
def peters_grader(input: EvalInput) -> EvalResult:
    # Example grader returning a fixed score; `details` must be a dict
    # (plain strings fail EvalResult validation).
    return EvalResult(score=0.123, details={"message": "All good"})
15 changes: 15 additions & 0 deletions scripts/test_input.json
Original file line number Diff line number Diff line change
@@ -0,0 +1,15 @@
{
"metric_name": "test",
"threshold": 0.5,
"config": {},
"invocations": [
{
"invocation_id": "ci-test-001",
"user_content": "What is 2+2?",
"final_response": "The answer is 4.",
"tool_calls": [{"name": "calculator", "args": {"expr": "2+2"}}],
"tool_responses": [{"name": "calculator", "output": "4"}]
}
],
"expected_invocations": null
}
282 changes: 282 additions & 0 deletions scripts/validate_grader.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,282 @@
#!/usr/bin/env python3
"""Validate a grader directory: manifest, syntax, and smoke run.

Usage:
python scripts/validate_grader.py graders/my_grader
python scripts/validate_grader.py graders/* # validate all
"""

from __future__ import annotations

import json
import subprocess
import sys
from pathlib import Path

import yaml

# Location of this script and the synthetic grader input fed to smoke runs.
SCRIPT_DIR = Path(__file__).resolve().parent
TEST_INPUT = SCRIPT_DIR / "test_input.json"

# Fields every grader.yaml manifest must define.
REQUIRED_MANIFEST_FIELDS = {"name", "description", "language", "entrypoint"}
# Allowed values for the optional "status" field of a grader's output.
VALID_STATUSES = {"PASSED", "FAILED", "NOT_EVALUATED"}

# Expected entrypoint file extension(s) for each supported language.
LANGUAGE_EXTENSIONS = {
    "python": {".py"},
    "javascript": {".js"},
    "typescript": {".ts"},
}


def _fail(msg: str) -> None:
print(f" FAIL: {msg}", file=sys.stderr)


def _ok(msg: str) -> None:
print(f" OK: {msg}")


def validate_manifest(grader_dir: Path) -> dict | None:
    """Validate ``grader.yaml`` inside *grader_dir*.

    Reports the first problem found via ``_fail`` and returns ``None``,
    or returns the parsed manifest mapping when everything checks out.
    """
    path = grader_dir / "grader.yaml"
    if not path.exists():
        _fail(f"Missing grader.yaml in {grader_dir}")
        return None

    # Parse; reject unreadable YAML or a top-level value that isn't a mapping.
    try:
        data = yaml.safe_load(path.read_text())
    except yaml.YAMLError as exc:
        _fail(f"Invalid YAML in {path}: {exc}")
        return None

    if not isinstance(data, dict):
        _fail(f"grader.yaml must be a YAML mapping, got {type(data).__name__}")
        return None

    absent = REQUIRED_MANIFEST_FIELDS.difference(data)
    if absent:
        _fail(f"grader.yaml missing required fields: {sorted(absent)}")
        return None

    # The declared entrypoint must exist relative to the grader directory.
    entry = grader_dir / data["entrypoint"]
    if not entry.exists():
        _fail(f"Entrypoint file not found: {entry}")
        return None

    # The registry keys graders by directory name; the manifest must agree.
    expected_name = grader_dir.name
    if data["name"] != expected_name:
        _fail(
            f"Manifest name '{data['name']}' does not match "
            f"directory name '{expected_name}'"
        )
        return None

    _ok(f"Manifest valid ({path})")
    return data


def validate_syntax(grader_dir: Path, manifest: dict) -> bool:
    """Check syntax and basic structure of the grader source file.

    Python entrypoints are byte-compiled and checked for SDK usage;
    JS/TS entrypoints only have their extension verified. Returns True
    when all applicable checks pass.
    """
    lang = manifest.get("language", "python")
    entry = grader_dir / manifest["entrypoint"]

    if lang == "python":
        # Byte-compile in a subprocess so a syntax error cannot affect us.
        proc = subprocess.run(
            [sys.executable, "-m", "py_compile", str(entry)],
            capture_output=True,
            text=True,
        )
        if proc.returncode != 0:
            _fail(f"Syntax error in {entry}:\n{proc.stderr}")
            return False
        _ok(f"Python syntax valid ({entry})")

        # Cheap textual checks: SDK import and decorator usage.
        code = entry.read_text()
        if "agentevals_grader_sdk" not in code:
            _fail(
                f"{entry} does not import agentevals_grader_sdk. "
                f"Graders must use the SDK or implement the stdin/stdout protocol."
            )
            return False
        if "@grader" not in code:
            _fail(f"{entry} does not use the @grader decorator")
            return False
        _ok("Imports and decorator present")

    elif lang in ("javascript", "typescript"):
        suffix = Path(manifest["entrypoint"]).suffix
        allowed = LANGUAGE_EXTENSIONS.get(lang, set())
        if suffix not in allowed:
            _fail(
                f"Entrypoint extension '{suffix}' doesn't match "
                f"language '{lang}' (expected {allowed})"
            )
            return False
        _ok(f"Extension matches language ({entry})")

    return True


def validate_smoke_run(grader_dir: Path, manifest: dict) -> bool:
    """Run the grader with synthetic input and validate the output.

    Feeds ``scripts/test_input.json`` to the grader on stdin and checks
    that stdout is a JSON object satisfying the EvalResult contract:
    ``score`` in [0.0, 1.0], and correctly-typed ``status``, ``details``,
    and ``per_invocation_scores`` when present. Returns True on success.
    """
    language = manifest.get("language", "python")
    entrypoint = manifest["entrypoint"]
    entry_path = grader_dir / entrypoint

    if language == "python":
        cmd = [sys.executable, str(entry_path)]
    elif language in ("javascript", "typescript"):
        # NOTE(review): plain `node` cannot execute .ts files directly —
        # confirm TypeScript graders are precompiled or run via a loader.
        cmd = ["node", str(entry_path)]
    else:
        _ok(f"Skipping smoke run for unsupported language: {language}")
        return True

    test_input = TEST_INPUT.read_text()

    # A hung or unrunnable grader must fail *this* grader, not crash the
    # whole validator, so trap TimeoutExpired and FileNotFoundError here.
    try:
        result = subprocess.run(
            cmd,
            input=test_input,
            capture_output=True,
            text=True,
            timeout=30,
        )
    except subprocess.TimeoutExpired:
        _fail("Grader timed out after 30 seconds")
        return False
    except FileNotFoundError as exc:
        # e.g. `node` is not installed on the runner.
        _fail(f"Could not execute grader command {cmd[0]!r}: {exc}")
        return False

    if result.returncode != 0:
        stderr_preview = result.stderr.strip()[:500]
        _fail(
            f"Grader exited with code {result.returncode}\n"
            f" stderr: {stderr_preview}"
        )
        return False

    stdout = result.stdout.strip()
    if not stdout:
        stderr_preview = result.stderr.strip()[:500]
        _fail(
            f"Grader produced no output on stdout"
            + (f"\n stderr: {stderr_preview}" if stderr_preview else "")
        )
        return False

    try:
        output = json.loads(stdout)
    except json.JSONDecodeError as exc:
        _fail(f"Grader stdout is not valid JSON: {exc}\n stdout: {stdout[:200]}")
        return False

    # Validate score: required, numeric (bool is an int subclass and must
    # be rejected explicitly), and within [0.0, 1.0].
    score = output.get("score")
    if score is None:
        _fail("Output missing required 'score' field")
        return False
    if isinstance(score, bool) or not isinstance(score, (int, float)):
        _fail(f"'score' must be a number, got {type(score).__name__}: {score}")
        return False
    if score < 0.0 or score > 1.0:
        _fail(f"'score' must be in [0.0, 1.0], got {score}")
        return False

    # Validate status: optional, but must be a known value when present.
    status = output.get("status")
    if status is not None and status not in VALID_STATUSES:
        _fail(
            f"'status' must be one of {sorted(VALID_STATUSES)} or null, "
            f"got '{status}'"
        )
        return False

    # Validate details type: optional mapping.
    details = output.get("details")
    if details is not None and not isinstance(details, dict):
        _fail(
            f"'details' must be a dict or null, "
            f"got {type(details).__name__}: {details!r}"
        )
        return False

    # Validate per_invocation_scores type: optional list.
    per_inv = output.get("per_invocation_scores")
    if per_inv is not None:
        if not isinstance(per_inv, list):
            _fail(
                f"'per_invocation_scores' must be a list, "
                f"got {type(per_inv).__name__}"
            )
            return False

    # Full Pydantic validation via the SDK if available; skip gracefully in
    # environments without the SDK installed.
    try:
        from agentevals_grader_sdk import EvalResult
        EvalResult.model_validate(output)
        _ok("Output validates against EvalResult schema (Pydantic)")
    except ImportError:
        _ok("Output JSON structure valid (SDK not installed, skipped Pydantic check)")
    except Exception as exc:
        _fail(f"Output fails EvalResult validation: {exc}")
        return False

    _ok(f"Smoke run passed (score={score})")
    return True


def validate_grader(grader_dir: Path) -> bool:
    """Run manifest, syntax, and smoke-run checks on one grader directory."""
    print(f"\nValidating: {grader_dir}")
    print(f"{'─' * 50}")

    # Each stage short-circuits: later checks depend on earlier ones.
    manifest = validate_manifest(grader_dir)
    if manifest is None:
        return False

    return validate_syntax(grader_dir, manifest) and validate_smoke_run(
        grader_dir, manifest
    )


def main() -> None:
    """CLI entry point.

    Validates every directory named on the command line, prints a summary,
    and exits 1 if any grader failed or 2 if none were found/given.
    """
    args = sys.argv[1:]
    if not args:
        print(f"Usage: {sys.argv[0]} <grader_dir> [<grader_dir> ...]", file=sys.stderr)
        sys.exit(2)

    outcomes: dict[str, bool] = {}
    for path in map(Path, args):
        # Quietly skip non-grader arguments (shell globs may match files).
        if not path.is_dir():
            print(f"\nSkipping {path} (not a directory)", file=sys.stderr)
            continue
        if not (path / "grader.yaml").exists():
            print(f"\nSkipping {path} (no grader.yaml)", file=sys.stderr)
            continue
        outcomes[str(path)] = validate_grader(path)

    print(f"\n{'=' * 50}")
    print("Summary:")
    failed = 0
    for name, passed in outcomes.items():
        print(f" [{'PASS' if passed else 'FAIL'}] {name}")
        if not passed:
            failed += 1

    if not outcomes:
        print(" No graders found to validate.")
        sys.exit(2)

    print()
    if failed:
        print(f"{failed} of {len(outcomes)} grader(s) failed.")
        sys.exit(1)
    print(f"All {len(outcomes)} grader(s) passed.")


if __name__ == "__main__":
main()
Loading