diff --git a/.github/workflows/validate-evaluators.yaml b/.github/workflows/validate-evaluators.yaml index 565286b..f494cb2 100644 --- a/.github/workflows/validate-evaluators.yaml +++ b/.github/workflows/validate-evaluators.yaml @@ -20,9 +20,7 @@ jobs: - name: Install dependencies run: | - pip install pyyaml - # TODO: switch to `pip install agentevals-grader-sdk` once published to PyPI - pip install "agentevals-grader-sdk @ git+https://github.com/agentevals-dev/agentevals.git#subdirectory=packages/grader-sdk-py" + pip install pyyaml agentevals-evaluator-sdk - name: Discover and validate all evaluators run: | diff --git a/.gitignore b/.gitignore new file mode 100644 index 0000000..0cafc1c --- /dev/null +++ b/.gitignore @@ -0,0 +1 @@ +.venv/ \ No newline at end of file diff --git a/README.md b/README.md index c94955a..0a7fac5 100644 --- a/README.md +++ b/README.md @@ -107,13 +107,13 @@ author: your-github-username Run the validation script to catch issues before submitting: ```bash -pip install agentevals-grader-sdk pyyaml +pip install pyyaml agentevals-evaluator-sdk python scripts/validate_evaluator.py evaluators/my_evaluator ``` This checks: - **Manifest schema** -- required fields, entrypoint exists, name matches directory -- **Syntax and imports** -- compiles cleanly, uses `@grader` decorator +- **Syntax and imports** -- compiles cleanly, uses `@evaluator` decorator - **Smoke run** -- runs the evaluator with synthetic input and validates the `EvalResult` output (correct types for `score`, `details`, `status`, etc.) You can also test with a full eval run: diff --git a/evaluators/contains/contains.py b/evaluators/contains/contains.py new file mode 100644 index 0000000..cbe6eca --- /dev/null +++ b/evaluators/contains/contains.py @@ -0,0 +1,59 @@ +"""Substring containment evaluator. + +Scores each invocation 1.0 if final_response contains the configured substring, +otherwise 0.0. + +Config: + substring (str): Required. If omitted, returns NOT_EVALUATED. 
"""Substring containment evaluator.

Scores each invocation 1.0 if final_response contains the configured substring,
otherwise 0.0.

Config:
    substring (str): Required. If omitted, returns NOT_EVALUATED.
    case_insensitive (bool, default False): Compare lowercased strings.

Usage in eval_config.yaml:
    config:
      substring: "expected phrase"
"""

from __future__ import annotations

from agentevals_evaluator_sdk import EvalInput, EvalResult, EvalStatus, evaluator


@evaluator
def contains(input: EvalInput) -> EvalResult:
    """Score 1.0 per invocation whose final_response contains the substring."""
    substring = (input.config.get("substring") or "").strip()
    n = len(input.invocations)
    if not substring:
        # No substring configured: opt out rather than report a failing score.
        return EvalResult(
            score=0.0,
            status=EvalStatus.NOT_EVALUATED,
            per_invocation_scores=[None] * n,
            details={"reason": "missing config: substring"},
        )

    case_insensitive = bool(input.config.get("case_insensitive", False))
    # Normalize the needle once up front; each haystack is normalized in the loop.
    needle = substring.lower() if case_insensitive else substring

    scores: list[float] = []
    issues: list[str] = []

    for inv in input.invocations:
        haystack = inv.final_response or ""
        if case_insensitive:
            haystack = haystack.lower()
        if needle in haystack:
            scores.append(1.0)
        else:
            scores.append(0.0)
            issues.append(f"{inv.invocation_id}: response does not contain {substring!r}")

    overall = sum(scores) / len(scores) if scores else 0.0
    return EvalResult(
        score=overall,
        per_invocation_scores=scores,
        details={"issues": issues} if issues else None,
    )


if __name__ == "__main__":
    contains.run()
"""Exact string match evaluator.

Config:
    expected (str): Required. If omitted, returns NOT_EVALUATED.
    case_insensitive (bool, default False): Compare normalized strings.
    strip (bool, default True): Strip whitespace before compare.

Usage:
    config:
      expected: "4"
"""

from __future__ import annotations

from agentevals_evaluator_sdk import EvalInput, EvalResult, EvalStatus, evaluator


@evaluator
def equals(input: EvalInput) -> EvalResult:
    """Score 1.0 per invocation whose final_response equals `expected`."""
    expected = input.config.get("expected")
    if expected is None:
        # No reference value to compare against: opt out rather than fail.
        n = len(input.invocations)
        return EvalResult(
            score=0.0,
            status=EvalStatus.NOT_EVALUATED,
            per_invocation_scores=[None] * n,
            details={"reason": "missing config: expected"},
        )

    case_insensitive = bool(input.config.get("case_insensitive", False))
    strip = bool(input.config.get("strip", True))

    def norm(s: str) -> str:
        # Apply the configured normalization (strip, then case fold) before compare.
        t = s.strip() if strip else s
        return t.lower() if case_insensitive else t

    exp = norm(str(expected))
    scores: list[float] = []
    issues: list[str] = []

    for inv in input.invocations:
        got = norm(inv.final_response or "")
        if got == exp:
            scores.append(1.0)
        else:
            scores.append(0.0)
            issues.append(
                f"{inv.invocation_id}: expected {expected!r}, got {inv.final_response!r}"
            )

    overall = sum(scores) / len(scores) if scores else 0.0
    return EvalResult(
        score=overall,
        per_invocation_scores=scores,
        details={"issues": issues} if issues else None,
    )


if __name__ == "__main__":
    equals.run()
b/evaluators/is_json/evaluator.yaml @@ -0,0 +1,6 @@ +name: is_json +description: Scores whether each final response parses as JSON (optional markdown code fence extraction) +language: python +entrypoint: is_json.py +tags: [json, structured] +author: agentevals-dev diff --git a/evaluators/is_json/is_json.py b/evaluators/is_json/is_json.py new file mode 100644 index 0000000..deb4980 --- /dev/null +++ b/evaluators/is_json/is_json.py @@ -0,0 +1,56 @@ +"""JSON parse check evaluator. + +Tries to parse final_response as JSON. Optionally extracts fenced ```json ... ``` blocks. + +Config: + extract_markdown_fence (bool, default True): Strip ```json fences if present. + +Usage: + config: + extract_markdown_fence: true +""" + +from __future__ import annotations + +import json +import re + +from agentevals_evaluator_sdk import EvalInput, EvalResult, evaluator + +_FENCE = re.compile(r"^```(?:json)?\s*\n?(.*?)\n?```\s*$", re.DOTALL | re.IGNORECASE) + + +def _parse_json_payload(text: str, extract_fence: bool) -> object: + raw = (text or "").strip() + if extract_fence: + m = _FENCE.match(raw) + if m: + raw = m.group(1).strip() + return json.loads(raw) + + +@evaluator +def is_json(input: EvalInput) -> EvalResult: + extract_fence = bool(input.config.get("extract_markdown_fence", True)) + + scores: list[float] = [] + issues: list[str] = [] + + for inv in input.invocations: + try: + _parse_json_payload(inv.final_response or "", extract_fence) + scores.append(1.0) + except (json.JSONDecodeError, TypeError, ValueError) as exc: + scores.append(0.0) + issues.append(f"{inv.invocation_id}: not valid JSON ({exc})") + + overall = sum(scores) / len(scores) if scores else 0.0 + return EvalResult( + score=overall, + per_invocation_scores=scores, + details={"issues": issues} if issues else None, + ) + + +if __name__ == "__main__": + is_json.run() diff --git a/evaluators/levenshtein_ratio/evaluator.yaml b/evaluators/levenshtein_ratio/evaluator.yaml new file mode 100644 index 0000000..a9263db --- 
"""Normalized Levenshtein similarity evaluator.

Score for an invocation is 1.0 - (edit_distance / max(len(a), len(b), 1)), clamped to [0, 1].

Config:
    expected (str): Required. If omitted, returns NOT_EVALUATED.
    case_insensitive (bool, default False): Compare lowercased strings.

Usage:
    config:
      expected: "reference answer"
"""

from __future__ import annotations

from agentevals_evaluator_sdk import EvalInput, EvalResult, EvalStatus, evaluator


def _levenshtein(a: str, b: str) -> int:
    """Classic O(nm) edit distance (two-row dynamic programming)."""
    # Keep the shorter string in the inner dimension to minimize row size.
    if len(a) < len(b):
        a, b = b, a
    if not b:
        return len(a)
    row = list(range(len(b) + 1))
    for i, ch_a in enumerate(a, start=1):
        next_row = [i]
        for j, ch_b in enumerate(b):
            cost_insert = row[j + 1] + 1
            cost_delete = next_row[j] + 1
            cost_substitute = row[j] + (1 if ch_a != ch_b else 0)
            next_row.append(min(cost_insert, cost_delete, cost_substitute))
        row = next_row
    return row[-1]


@evaluator
def levenshtein_ratio(input: EvalInput) -> EvalResult:
    """Score each response by normalized Levenshtein similarity to `expected`."""
    expected = input.config.get("expected")
    if expected is None:
        # No reference string configured: opt out rather than fail.
        count = len(input.invocations)
        return EvalResult(
            score=0.0,
            status=EvalStatus.NOT_EVALUATED,
            per_invocation_scores=[None] * count,
            details={"reason": "missing config: expected"},
        )

    fold_case = bool(input.config.get("case_insensitive", False))
    reference = str(expected).lower() if fold_case else str(expected)

    similarities: list[float] = []
    rows: list[dict] = []

    for inv in input.invocations:
        response = inv.final_response or ""
        if fold_case:
            response = response.lower()
        distance = _levenshtein(response, reference)
        # Normalize by the longer string (floor 1 avoids division by zero).
        similarity = 1.0 - distance / max(len(response), len(reference), 1)
        similarity = min(1.0, max(0.0, similarity))
        similarities.append(similarity)
        rows.append(
            {
                "invocation_id": inv.invocation_id,
                "distance": distance,
                "similarity": similarity,
            }
        )

    overall = sum(similarities) / len(similarities) if similarities else 0.0
    return EvalResult(
        score=overall,
        per_invocation_scores=similarities,
        details={"per_invocation": rows},
    )


if __name__ == "__main__":
    levenshtein_ratio.run()
+ +Usage: + config: + pattern: "^The answer" + flags: IGNORECASE +""" + +from __future__ import annotations + +import re + +from agentevals_evaluator_sdk import EvalInput, EvalResult, EvalStatus, evaluator + +_FLAG_MAP = { + "IGNORECASE": re.IGNORECASE, + "MULTILINE": re.MULTILINE, + "DOTALL": re.DOTALL, +} + + +@evaluator +def regex_match(input: EvalInput) -> EvalResult: + pattern = input.config.get("pattern") + n = len(input.invocations) + if not pattern: + return EvalResult( + score=0.0, + status=EvalStatus.NOT_EVALUATED, + per_invocation_scores=[None] * n, + details={"reason": "missing config: pattern"}, + ) + + flag_names = input.config.get("flags") + flags = 0 + if isinstance(flag_names, str): + for part in flag_names.replace("|", ",").split(","): + key = part.strip().upper() + if key in _FLAG_MAP: + flags |= _FLAG_MAP[key] + elif isinstance(flag_names, list): + for part in flag_names: + key = str(part).strip().upper() + if key in _FLAG_MAP: + flags |= _FLAG_MAP[key] + + try: + rx = re.compile(str(pattern), flags) + except re.error as exc: + return EvalResult( + score=0.0, + status=EvalStatus.NOT_EVALUATED, + per_invocation_scores=[None] * n, + details={"reason": "invalid regex pattern", "error": str(exc)}, + ) + + scores: list[float] = [] + issues: list[str] = [] + + for inv in input.invocations: + text = inv.final_response or "" + if rx.search(text): + scores.append(1.0) + else: + scores.append(0.0) + issues.append(f"{inv.invocation_id}: no match for pattern {pattern!r}") + + overall = sum(scores) / len(scores) if scores else 0.0 + return EvalResult( + score=overall, + per_invocation_scores=scores, + details={"issues": issues} if issues else None, + ) + + +if __name__ == "__main__": + regex_match.run() diff --git a/evaluators/tool_coverage/tool_coverage.py b/evaluators/tool_coverage/tool_coverage.py index 3bc7a5b..7a17d38 100644 --- a/evaluators/tool_coverage/tool_coverage.py +++ b/evaluators/tool_coverage/tool_coverage.py @@ -18,7 +18,7 @@ def 
tool_coverage(input: EvalInput) -> EvalResult: details: list[str] = [] for inv in input.invocations: - actual = len(inv.tool_calls) + actual = len(inv.intermediate_steps.tool_calls) if actual >= min_calls: scores.append(1.0) else: diff --git a/evaluators/tool_sequence_match/evaluator.yaml b/evaluators/tool_sequence_match/evaluator.yaml new file mode 100644 index 0000000..b58999c --- /dev/null +++ b/evaluators/tool_sequence_match/evaluator.yaml @@ -0,0 +1,6 @@ +name: tool_sequence_match +description: Scores whether tool calls match an expected list of tool names (order-sensitive or multiset) +language: python +entrypoint: tool_sequence_match.py +tags: [tools, trajectory] +author: agentevals-dev diff --git a/evaluators/tool_sequence_match/tool_sequence_match.py b/evaluators/tool_sequence_match/tool_sequence_match.py new file mode 100644 index 0000000..a729707 --- /dev/null +++ b/evaluators/tool_sequence_match/tool_sequence_match.py @@ -0,0 +1,78 @@ +"""Expected tool call sequence evaluator. + +Compares the ordered list of tool names in each invocation to config. + +Config: + expected_tool_names (list[str]): Required non-empty. Otherwise returns NOT_EVALUATED. + require_order (bool, default True): If False, compares multisets (same counts per name). 
+ +Usage: + config: + expected_tool_names: ["search", "calculator"] + require_order: true +""" + +from __future__ import annotations + +from collections import Counter + +from agentevals_evaluator_sdk import EvalInput, EvalResult, EvalStatus, evaluator + + +@evaluator +def tool_sequence_match(input: EvalInput) -> EvalResult: + expected = input.config.get("expected_tool_names") + n = len(input.invocations) + if expected is None or not isinstance(expected, list): + return EvalResult( + score=0.0, + status=EvalStatus.NOT_EVALUATED, + per_invocation_scores=[None] * n, + details={"reason": "missing or invalid config: expected_tool_names (need a list of names)"}, + ) + if not expected: + return EvalResult( + score=0.0, + status=EvalStatus.NOT_EVALUATED, + per_invocation_scores=[None] * n, + details={"reason": "missing or empty config: expected_tool_names"}, + ) + + want = [str(x) for x in expected] + require_order = bool(input.config.get("require_order", True)) + + scores: list[float] = [] + issues: list[str] = [] + + for inv in input.invocations: + actual = [] + for call in inv.intermediate_steps.tool_calls or []: + if isinstance(call, dict): + n = call.get("name") + if n is not None: + actual.append(str(n)) + + if require_order: + ok = actual == want + else: + ok = Counter(actual) == Counter(want) + + if ok: + scores.append(1.0) + else: + scores.append(0.0) + issues.append( + f"{inv.invocation_id}: expected {want!r}, got {actual!r} " + f"(require_order={require_order})" + ) + + overall = sum(scores) / len(scores) if scores else 0.0 + return EvalResult( + score=overall, + per_invocation_scores=scores, + details={"issues": issues} if issues else None, + ) + + +if __name__ == "__main__": + tool_sequence_match.run() diff --git a/scripts/test_input.json b/scripts/test_input.json index 79d41f5..7b5939c 100644 --- a/scripts/test_input.json +++ b/scripts/test_input.json @@ -7,8 +7,10 @@ "invocation_id": "ci-test-001", "user_content": "What is 2+2?", "final_response": "The 
answer is 4.", - "tool_calls": [{"name": "calculator", "args": {"expr": "2+2"}}], - "tool_responses": [{"name": "calculator", "output": "4"}] + "intermediate_steps": { + "tool_calls": [{"name": "calculator", "args": {"expr": "2+2"}}], + "tool_responses": [{"name": "calculator", "output": "4"}] + } } ], "expected_invocations": null diff --git a/scripts/validate_evaluator.py b/scripts/validate_evaluator.py index e80570e..13379e3 100644 --- a/scripts/validate_evaluator.py +++ b/scripts/validate_evaluator.py @@ -94,14 +94,14 @@ def validate_syntax(evaluator_dir: Path, manifest: dict) -> bool: _ok(f"Python syntax valid ({entry_path})") source = entry_path.read_text() - if "agentevals_grader_sdk" not in source: + if "agentevals_evaluator_sdk" not in source: _fail( - f"{entry_path} does not import agentevals_grader_sdk. " + f"{entry_path} does not import agentevals_evaluator_sdk. " f"Evaluators must use the SDK or implement the stdin/stdout protocol." ) return False - if "@grader" not in source: - _fail(f"{entry_path} does not use the @grader decorator") + if "@evaluator" not in source: + _fail(f"{entry_path} does not use the @evaluator decorator") return False if 'if __name__ == "__main__"' not in source and "if __name__ == '__main__'" not in source: _fail(f"{entry_path} missing 'if __name__ == \"__main__\"' block with .run() call") @@ -158,7 +158,7 @@ def validate_smoke_run(evaluator_dir: Path, manifest: dict) -> bool: if not stdout: stderr_preview = result.stderr.strip()[:500] _fail( - f"Evaluator produced no output on stdout" + "Evaluator produced no output on stdout" + (f"\n stderr: {stderr_preview}" if stderr_preview else "") ) return False @@ -208,10 +208,17 @@ def validate_smoke_run(evaluator_dir: Path, manifest: dict) -> bool: f"got {type(per_inv).__name__}" ) return False + for i, x in enumerate(per_inv): + if x is not None and not isinstance(x, (int, float)): + _fail( + f"'per_invocation_scores[{i}]' must be a number or null, " + f"got {type(x).__name__}" + ) + 
return False # Full Pydantic validation via the SDK if available try: - from agentevals_grader_sdk import EvalResult + from agentevals_evaluator_sdk import EvalResult EvalResult.model_validate(output) _ok("Output validates against EvalResult schema (Pydantic)") except ImportError: