diff --git a/.github/workflows/validate-graders.yaml b/.github/workflows/validate-graders.yaml
new file mode 100644
index 0000000..732087b
--- /dev/null
+++ b/.github/workflows/validate-graders.yaml
@@ -0,0 +1,31 @@
+name: Validate graders
+
+on:
+  pull_request:
+    paths:
+      - "graders/**"
+      - "scripts/validate_grader.py"
+      - "scripts/test_input.json"
+
+jobs:
+  validate:
+    runs-on: ubuntu-latest
+    steps:
+      - uses: actions/checkout@v4
+
+      - name: Set up Python
+        uses: actions/setup-python@v5
+        with:
+          python-version: "3.12"
+
+      - name: Install dependencies
+        run: pip install agentevals-grader-sdk pyyaml
+
+      - name: Discover and validate all graders
+        run: |
+          grader_dirs=$(find graders -mindepth 1 -maxdepth 1 -type d | sort)
+          if [ -z "$grader_dirs" ]; then
+            echo "No grader directories found."
+            exit 0
+          fi
+          python scripts/validate_grader.py $grader_dirs
diff --git a/README.md b/README.md
index 5fb2c64..c88b485 100644
--- a/README.md
+++ b/README.md
@@ -99,15 +99,27 @@
 tags: [quality, tools]
 author: your-github-username
 ```
 
-### 4. Test locally
+### 4. Validate locally
 
-Add it to an eval config as a local `type: code` grader and run it:
+Run the validation script to catch issues before submitting:
+
+```bash
+pip install agentevals-grader-sdk pyyaml
+python scripts/validate_grader.py graders/my_grader
+```
+
+This checks:
+- **Manifest schema** -- required fields, entrypoint exists, name matches directory
+- **Syntax and imports** -- compiles cleanly, uses `@grader` decorator
+- **Smoke run** -- runs the grader with synthetic input and validates the `EvalResult` output (correct types for `score`, `details`, `status`, etc.)
+
+You can also test with a full eval run:
 
 ```yaml
 metrics:
   - name: my_grader
     type: code
-    path: ./my_grader/my_grader.py
+    path: ./graders/my_grader/my_grader.py
     threshold: 0.5
 ```
@@ -133,7 +145,7 @@ graders/
 
 3. Open a PR against `main`
 
-A CI workflow will validate your `grader.yaml` manifest. Once merged, the workflow regenerates `index.yaml` automatically, and your grader becomes available to everyone via `agentevals grader list`.
+CI will automatically validate your grader (manifest, syntax, and smoke run). Once merged, a separate workflow regenerates `index.yaml`, and your grader becomes available to everyone via `agentevals grader list`.
 
 ## Supported languages
 
diff --git a/graders/peters_grader/peters_grader.py b/graders/peters_grader/peters_grader.py
index 8a5272a..9112661 100644
--- a/graders/peters_grader/peters_grader.py
+++ b/graders/peters_grader/peters_grader.py
@@ -14,4 +14,4 @@
 
 @grader
 def peters_grader(input: EvalInput) -> EvalResult:
-    return EvalResult(score=0.123, details="All good")
+    return EvalResult(score=0.123, details={"message": "All good"})
diff --git a/scripts/test_input.json b/scripts/test_input.json
new file mode 100644
index 0000000..79d41f5
--- /dev/null
+++ b/scripts/test_input.json
@@ -0,0 +1,15 @@
+{
+  "metric_name": "test",
+  "threshold": 0.5,
+  "config": {},
+  "invocations": [
+    {
+      "invocation_id": "ci-test-001",
+      "user_content": "What is 2+2?",
+      "final_response": "The answer is 4.",
+      "tool_calls": [{"name": "calculator", "args": {"expr": "2+2"}}],
+      "tool_responses": [{"name": "calculator", "output": "4"}]
+    }
+  ],
+  "expected_invocations": null
+}
diff --git a/scripts/validate_grader.py b/scripts/validate_grader.py
new file mode 100644
index 0000000..88b1360
--- /dev/null
+++ b/scripts/validate_grader.py
@@ -0,0 +1,282 @@
+#!/usr/bin/env python3
+"""Validate a grader directory: manifest, syntax, and smoke run.
+
+Usage:
+    python scripts/validate_grader.py graders/my_grader
+    python scripts/validate_grader.py graders/*   # validate all
+"""
+
+from __future__ import annotations
+
+import json
+import subprocess
+import sys
+from pathlib import Path
+
+import yaml
+
+SCRIPT_DIR = Path(__file__).resolve().parent
+TEST_INPUT = SCRIPT_DIR / "test_input.json"
+
+REQUIRED_MANIFEST_FIELDS = {"name", "description", "language", "entrypoint"}
+VALID_STATUSES = {"PASSED", "FAILED", "NOT_EVALUATED"}
+
+# Expected entrypoint file extensions per declared language.
+LANGUAGE_EXTENSIONS = {
+    "python": {".py"},
+    "javascript": {".js"},
+    "typescript": {".ts"},
+}
+
+
+def _fail(msg: str) -> None:
+    # Failures go to stderr so CI logs separate them from progress output.
+    print(f"  FAIL: {msg}", file=sys.stderr)
+
+
+def _ok(msg: str) -> None:
+    print(f"  OK: {msg}")
+
+
+def validate_manifest(grader_dir: Path) -> dict | None:
+    """Check grader.yaml exists and has required fields. Returns parsed manifest or None."""
+    manifest_path = grader_dir / "grader.yaml"
+    if not manifest_path.exists():
+        _fail(f"Missing grader.yaml in {grader_dir}")
+        return None
+
+    try:
+        manifest = yaml.safe_load(manifest_path.read_text())
+    except yaml.YAMLError as exc:
+        _fail(f"Invalid YAML in {manifest_path}: {exc}")
+        return None
+
+    if not isinstance(manifest, dict):
+        _fail(f"grader.yaml must be a YAML mapping, got {type(manifest).__name__}")
+        return None
+
+    missing = REQUIRED_MANIFEST_FIELDS - set(manifest.keys())
+    if missing:
+        _fail(f"grader.yaml missing required fields: {sorted(missing)}")
+        return None
+
+    entrypoint = manifest["entrypoint"]
+    entry_path = grader_dir / entrypoint
+    if not entry_path.exists():
+        _fail(f"Entrypoint file not found: {entry_path}")
+        return None
+
+    # The registry index is keyed by directory name; the manifest must agree.
+    dir_name = grader_dir.name
+    if manifest["name"] != dir_name:
+        _fail(
+            f"Manifest name '{manifest['name']}' does not match "
+            f"directory name '{dir_name}'"
+        )
+        return None
+
+    _ok(f"Manifest valid ({manifest_path})")
+    return manifest
+
+
+def validate_syntax(grader_dir: Path, manifest: dict) -> bool:
+    """Check syntax and basic structure of the grader source file."""
+    language = manifest.get("language", "python")
+    entrypoint = manifest["entrypoint"]
+    entry_path = grader_dir / entrypoint
+
+    if language == "python":
+        # py_compile in a subprocess: a syntax error must not crash the validator.
+        result = subprocess.run(
+            [sys.executable, "-m", "py_compile", str(entry_path)],
+            capture_output=True,
+            text=True,
+        )
+        if result.returncode != 0:
+            _fail(f"Syntax error in {entry_path}:\n{result.stderr}")
+            return False
+        _ok(f"Python syntax valid ({entry_path})")
+
+        # Cheap textual checks; the smoke run below verifies actual behavior.
+        source = entry_path.read_text()
+        if "agentevals_grader_sdk" not in source:
+            _fail(
+                f"{entry_path} does not import agentevals_grader_sdk. "
+                f"Graders must use the SDK or implement the stdin/stdout protocol."
+            )
+            return False
+        if "@grader" not in source:
+            _fail(f"{entry_path} does not use the @grader decorator")
+            return False
+        _ok("Imports and decorator present")
+
+    elif language in ("javascript", "typescript"):
+        ext = Path(entrypoint).suffix
+        expected = LANGUAGE_EXTENSIONS.get(language, set())
+        if ext not in expected:
+            _fail(
+                f"Entrypoint extension '{ext}' doesn't match "
+                f"language '{language}' (expected {expected})"
+            )
+            return False
+        _ok(f"Extension matches language ({entry_path})")
+
+    return True
+
+
+def validate_smoke_run(grader_dir: Path, manifest: dict) -> bool:
+    """Run the grader with synthetic input and validate the output."""
+    language = manifest.get("language", "python")
+    entrypoint = manifest["entrypoint"]
+    entry_path = grader_dir / entrypoint
+
+    if language == "python":
+        cmd = [sys.executable, str(entry_path)]
+    elif language in ("javascript", "typescript"):
+        cmd = ["node", str(entry_path)]
+    else:
+        _ok(f"Skipping smoke run for unsupported language: {language}")
+        return True
+
+    test_input = TEST_INPUT.read_text()
+
+    # A hung grader or a missing runtime must surface as a clean FAIL,
+    # not an uncaught traceback from the validator itself.
+    try:
+        result = subprocess.run(
+            cmd,
+            input=test_input,
+            capture_output=True,
+            text=True,
+            timeout=30,
+        )
+    except subprocess.TimeoutExpired:
+        _fail("Grader timed out after 30 seconds")
+        return False
+    except FileNotFoundError:
+        _fail(f"Runtime not found: {cmd[0]}")
+        return False
+
+    if result.returncode != 0:
+        stderr_preview = result.stderr.strip()[:500]
+        _fail(
+            f"Grader exited with code {result.returncode}\n"
+            f"  stderr: {stderr_preview}"
+        )
+        return False
+
+    stdout = result.stdout.strip()
+    if not stdout:
+        stderr_preview = result.stderr.strip()[:500]
+        _fail(
+            f"Grader produced no output on stdout"
+            + (f"\n  stderr: {stderr_preview}" if stderr_preview else "")
+        )
+        return False
+
+    try:
+        output = json.loads(stdout)
+    except json.JSONDecodeError as exc:
+        _fail(f"Grader stdout is not valid JSON: {exc}\n  stdout: {stdout[:200]}")
+        return False
+
+    # Validate score
+    score = output.get("score")
+    if score is None:
+        _fail("Output missing required 'score' field")
+        return False
+    if not isinstance(score, (int, float)):
+        _fail(f"'score' must be a number, got {type(score).__name__}: {score}")
+        return False
+    if score < 0.0 or score > 1.0:
+        _fail(f"'score' must be in [0.0, 1.0], got {score}")
+        return False
+
+    # Validate status
+    status = output.get("status")
+    if status is not None and status not in VALID_STATUSES:
+        _fail(
+            f"'status' must be one of {sorted(VALID_STATUSES)} or null, "
+            f"got '{status}'"
+        )
+        return False
+
+    # Validate details type
+    details = output.get("details")
+    if details is not None and not isinstance(details, dict):
+        _fail(
+            f"'details' must be a dict or null, "
+            f"got {type(details).__name__}: {details!r}"
+        )
+        return False
+
+    # Validate per_invocation_scores type
+    per_inv = output.get("per_invocation_scores")
+    if per_inv is not None:
+        if not isinstance(per_inv, list):
+            _fail(
+                f"'per_invocation_scores' must be a list, "
+                f"got {type(per_inv).__name__}"
+            )
+            return False
+
+    # Full Pydantic validation via the SDK if available
+    try:
+        from agentevals_grader_sdk import EvalResult
+
+        EvalResult.model_validate(output)
+        _ok("Output validates against EvalResult schema (Pydantic)")
+    except ImportError:
+        _ok("Output JSON structure valid (SDK not installed, skipped Pydantic check)")
+    except Exception as exc:
+        _fail(f"Output fails EvalResult validation: {exc}")
+        return False
+
+    _ok(f"Smoke run passed (score={score})")
+    return True
+
+
+def validate_grader(grader_dir: Path) -> bool:
+    """Run all validations on a single grader directory."""
+    print(f"\nValidating: {grader_dir}")
+    print(f"{'─' * 50}")
+
+    # Each stage short-circuits: later stages assume the earlier ones passed.
+    manifest = validate_manifest(grader_dir)
+    if manifest is None:
+        return False
+
+    if not validate_syntax(grader_dir, manifest):
+        return False
+
+    if not validate_smoke_run(grader_dir, manifest):
+        return False
+
+    return True
+
+
+def main() -> None:
+    if len(sys.argv) < 2:
+        print(f"Usage: {sys.argv[0]} <grader_dir> [<grader_dir> ...]", file=sys.stderr)
+        sys.exit(2)
+
+    dirs = [Path(arg) for arg in sys.argv[1:]]
+    results: dict[str, bool] = {}
+
+    for d in dirs:
+        # Tolerate stray arguments (e.g. files matched by a shell glob).
+        if not d.is_dir():
+            print(f"\nSkipping {d} (not a directory)", file=sys.stderr)
+            continue
+        if not (d / "grader.yaml").exists():
+            print(f"\nSkipping {d} (no grader.yaml)", file=sys.stderr)
+            continue
+        results[str(d)] = validate_grader(d)
+
+    print(f"\n{'=' * 50}")
+    print("Summary:")
+    all_passed = True
+    for name, passed in results.items():
+        icon = "PASS" if passed else "FAIL"
+        print(f"  [{icon}] {name}")
+        if not passed:
+            all_passed = False
+
+    if not results:
+        print("  No graders found to validate.")
+        sys.exit(2)
+
+    print()
+    if all_passed:
+        print(f"All {len(results)} grader(s) passed.")
+    else:
+        failed = sum(1 for v in results.values() if not v)
+        print(f"{failed} of {len(results)} grader(s) failed.")
+        sys.exit(1)
+
+
+if __name__ == "__main__":
+    main()