From bd6cc7b2cb635185b1dacc839c1fe8f718428e12 Mon Sep 17 00:00:00 2001
From: wiliyam <wiliyam@users.noreply.github.com>
Date: Wed, 1 Apr 2026 23:19:34 +0000
Subject: [PATCH 1/4] feat: add StringCheckGrader support for OpenAI Evals
 backend (#95)

---
 src/agentevals/config.py              | 37 ++++++++++++++++++++++-----
 src/agentevals/openai_eval_backend.py |  9 +++++++
 2 files changed, 39 insertions(+), 7 deletions(-)

diff --git a/src/agentevals/config.py b/src/agentevals/config.py
index f7a3149..836c633 100644
--- a/src/agentevals/config.py
+++ b/src/agentevals/config.py
@@ -53,6 +53,19 @@ class RemoteEvaluatorDef(BaseEvaluatorDef):
     ref: str = Field(description="Source-specific reference (e.g. path within the repo).")
 
 
+_VALID_STRING_CHECK_OPERATIONS = frozenset(
+    {
+        "eq",
+        "ne",
+        "like",
+        "ilike",
+        "contains",
+        "not_contains",
+        "starts_with",
+        "ends_with",
+    }
+)
+
 _VALID_SIMILARITY_METRICS = frozenset(
     {
         "fuzzy_match",
@@ -83,13 +96,23 @@ class OpenAIEvalDef(BaseModel):
     @classmethod
     def _validate_grader(cls, v: dict[str, Any]) -> dict[str, Any]:
         grader_type = v.get("type")
-        if grader_type != "text_similarity":
-            raise ValueError(f"Only 'text_similarity' grader type is currently supported, got '{grader_type}'")
-        metric = v.get("evaluation_metric")
-        if not metric:
-            raise ValueError("'evaluation_metric' is required for text_similarity grader")
-        if metric not in _VALID_SIMILARITY_METRICS:
-            raise ValueError(f"Unknown evaluation_metric '{metric}'. Valid: {sorted(_VALID_SIMILARITY_METRICS)}")
+        if grader_type == "text_similarity":
+            metric = v.get("evaluation_metric")
+            if not metric:
+                raise ValueError("'evaluation_metric' is required for text_similarity grader")
+            if metric not in _VALID_SIMILARITY_METRICS:
+                raise ValueError(f"Unknown evaluation_metric '{metric}'. Valid: {sorted(_VALID_SIMILARITY_METRICS)}")
+        elif grader_type == "string_check":
+            operation = v.get("operation")
+            if not operation:
+                raise ValueError("'operation' is required for string_check grader")
+            if operation not in _VALID_STRING_CHECK_OPERATIONS:
+                raise ValueError(f"Unknown operation '{operation}'. Valid: {sorted(_VALID_STRING_CHECK_OPERATIONS)}")
+            if "reference" not in v:
+                raise ValueError("'reference' is required for string_check grader")
+        else:
+            supported = "'text_similarity', 'string_check'"
+            raise ValueError(f"Unsupported grader type '{grader_type}'. Supported: {supported}")
         return v
 
 
diff --git a/src/agentevals/openai_eval_backend.py b/src/agentevals/openai_eval_backend.py
index a6e9c00..c92881b 100644
--- a/src/agentevals/openai_eval_backend.py
+++ b/src/agentevals/openai_eval_backend.py
@@ -51,6 +51,15 @@ def _build_testing_criteria(evaluator_def: OpenAIEvalDef) -> dict[str, Any]:
             "pass_threshold": evaluator_def.threshold,
         }
 
+    if grader_type == "string_check":
+        return {
+            "type": "string_check",
+            "name": evaluator_def.name,
+            "input": "{{ item.actual_response }}",
+            "reference": grader["reference"],
+            "operation": grader["operation"],
+        }
+
     raise ValueError(f"Unsupported grader type: {grader_type}")
 
 

From a50231585f9e73333c0945df2ae2c82f74ad7910 Mon Sep 17 00:00:00 2001
From: wiliyam <wiliyam@users.noreply.github.com>
Date: Thu, 2 Apr 2026 12:35:17 +0000
Subject: [PATCH 2/4] fix: address review feedback - grader-aware schema, gate
 expected_invocations on grader type, use _SUPPORTED_GRADER_TYPES constant

---
 src/agentevals/config.py              | 109 +++++++-------------------
 src/agentevals/openai_eval_backend.py |  32 +++++++-
 2 files changed, 57 insertions(+), 84 deletions(-)

diff --git a/src/agentevals/config.py b/src/agentevals/config.py
index 836c633..b6258aa 100644
--- a/src/agentevals/config.py
+++ b/src/agentevals/config.py
@@ -53,19 +53,6 @@ class RemoteEvaluatorDef(BaseEvaluatorDef):
     ref: str = Field(description="Source-specific reference (e.g. path within the repo).")
 
 
-_VALID_STRING_CHECK_OPERATIONS = frozenset(
-    {
-        "eq",
-        "ne",
-        "like",
-        "ilike",
-        "contains",
-        "not_contains",
-        "starts_with",
-        "ends_with",
-    }
-)
-
 _VALID_SIMILARITY_METRICS = frozenset(
     {
         "fuzzy_match",
@@ -82,6 +69,22 @@ class RemoteEvaluatorDef(BaseEvaluatorDef):
     }
 )
 
+_VALID_STRING_CHECK_OPERATIONS = frozenset(
+    {
+        # Operations defined by the OpenAI string_check grader. Per the OpenAI
+        # graders reference these four are the only accepted values ("like" and
+        # "ilike" are its case-sensitive / case-insensitive containment checks).
+        # Any other name would pass validation here and fail at the API instead.
+        "eq",
+        "ne",
+        "like",
+        "ilike",
+    }
+)
+
+# All supported grader types — use this constant in error messages and checks.
+_SUPPORTED_GRADER_TYPES = frozenset({"text_similarity", "string_check"})
+
 
 class OpenAIEvalDef(BaseModel):
     """An evaluator that delegates grading to the OpenAI Evals API."""
@@ -96,23 +99,31 @@ class OpenAIEvalDef(BaseModel):
     @classmethod
     def _validate_grader(cls, v: dict[str, Any]) -> dict[str, Any]:
         grader_type = v.get("type")
+        if grader_type not in _SUPPORTED_GRADER_TYPES:
+            raise ValueError(
+                f"Unsupported grader type '{grader_type}'. "
+                f"Supported: {sorted(_SUPPORTED_GRADER_TYPES)}"
+            )
+
         if grader_type == "text_similarity":
             metric = v.get("evaluation_metric")
             if not metric:
                 raise ValueError("'evaluation_metric' is required for text_similarity grader")
             if metric not in _VALID_SIMILARITY_METRICS:
-                raise ValueError(f"Unknown evaluation_metric '{metric}'. Valid: {sorted(_VALID_SIMILARITY_METRICS)}")
+                raise ValueError(
+                    f"Unknown evaluation_metric '{metric}'. Valid: {sorted(_VALID_SIMILARITY_METRICS)}"
+                )
         elif grader_type == "string_check":
             operation = v.get("operation")
             if not operation:
                 raise ValueError("'operation' is required for string_check grader")
             if operation not in _VALID_STRING_CHECK_OPERATIONS:
-                raise ValueError(f"Unknown operation '{operation}'. Valid: {sorted(_VALID_STRING_CHECK_OPERATIONS)}")
+                raise ValueError(
+                    f"Unknown operation '{operation}'. Valid: {sorted(_VALID_STRING_CHECK_OPERATIONS)}"
+                )
             if "reference" not in v:
                 raise ValueError("'reference' is required for string_check grader")
-        else:
-            supported = "'text_similarity', 'string_check'"
-            raise ValueError(f"Unsupported grader type '{grader_type}'. Supported: {supported}")
+
         return v
 
 
@@ -120,65 +131,3 @@ def _validate_grader(cls, v: dict[str, Any]) -> dict[str, Any]:
     BuiltinMetricDef | CodeEvaluatorDef | RemoteEvaluatorDef | OpenAIEvalDef,
     Field(discriminator="type"),
 ]
-
-
-class EvalRunConfig(BaseModel):
-    trace_files: list[str] = Field(description="Paths to trace files (Jaeger JSON or OTLP JSON).")
-
-    eval_set_file: str | None = Field(
-        default=None,
-        description="Path to a golden eval set JSON file (ADK EvalSet format).",
-    )
-
-    metrics: list[str] = Field(
-        default_factory=lambda: ["tool_trajectory_avg_score"],
-        description="List of built-in metric names to evaluate.",
-    )
-
-    custom_evaluators: list[CustomEvaluatorDef] = Field(
-        default_factory=list,
-        description="Custom evaluator definitions.",
-    )
-
-    trace_format: str = Field(
-        default="jaeger-json",
-        description="Format of the trace files (jaeger-json or otlp-json).",
-    )
-
-    judge_model: str | None = Field(
-        default=None,
-        description="LLM model for judge-based metrics.",
-    )
-
-    threshold: float | None = Field(
-        default=None,
-        description="Score threshold for pass/fail.",
-    )
-
-    trajectory_match_type: str | None = Field(
-        default=None,
-        description="Match type for tool_trajectory_avg_score: 'EXACT', 'IN_ORDER', or 'ANY_ORDER'. Default: EXACT.",
-    )
-
-    @field_validator("trajectory_match_type")
-    @classmethod
-    def _validate_trajectory_match_type(cls, v: str | None) -> str | None:
-        valid = {"EXACT", "IN_ORDER", "ANY_ORDER"}
-        if v is not None and v.upper() not in valid:
-            raise ValueError(f"Invalid trajectory_match_type '{v}'. Valid values: {sorted(valid)}")
-        return v.upper() if v is not None else v
-
-    output_format: str = Field(
-        default="table",
-        description="Output format: 'table', 'json', or 'summary'.",
-    )
-
-    max_concurrent_traces: int = Field(
-        default=10,
-        description="Maximum number of traces to evaluate concurrently.",
-    )
-
-    max_concurrent_evals: int = Field(
-        default=5,
-        description="Maximum number of concurrent metric evaluations (LLM API calls).",
-    )
diff --git a/src/agentevals/openai_eval_backend.py b/src/agentevals/openai_eval_backend.py
index c92881b..e9dff70 100644
--- a/src/agentevals/openai_eval_backend.py
+++ b/src/agentevals/openai_eval_backend.py
@@ -22,6 +22,7 @@
 
 _POLL_INTERVAL_SECONDS = 2
 
+# Schema for graders that compare actual vs expected (e.g. text_similarity).
 _TEXT_PAIR_SCHEMA = {
     "type": "object",
     "properties": {
@@ -31,6 +32,22 @@
     "required": ["actual_response", "expected_response"],
 }
 
+# Schema for graders that only need the actual response (e.g. string_check).
+_ACTUAL_ONLY_SCHEMA = {
+    "type": "object",
+    "properties": {
+        "actual_response": {"type": "string"},
+    },
+    "required": ["actual_response"],
+}
+
+
+def _get_item_schema(grader_type: str) -> dict[str, Any]:
+    """Return the appropriate item schema for the given grader type."""
+    if grader_type == "string_check":
+        return _ACTUAL_ONLY_SCHEMA
+    return _TEXT_PAIR_SCHEMA
+
 
 def _build_testing_criteria(evaluator_def: OpenAIEvalDef) -> dict[str, Any]:
     """Build the OpenAI testing_criteria dict from the evaluator config.
@@ -120,13 +137,20 @@ async def evaluate_openai_eval(
             error="OPENAI_API_KEY environment variable is not set.",
         )
 
-    if expected_invocations is None:
+    grader_type = evaluator_def.grader.get("type", "")
+
+    # string_check graders use a static reference from config and don't need
+    # expected_invocations — only text_similarity requires a golden eval set.
+    if grader_type != "string_check" and expected_invocations is None:
         return MetricResult(
             metric_name=evaluator_def.name,
-            error="OpenAI text_similarity grader requires expected invocations (golden eval set).",
+            error=f"OpenAI {grader_type} grader requires expected invocations (golden eval set).",
         )
 
-    items = _build_jsonl_items(actual_invocations, expected_invocations)
+    items = _build_jsonl_items(
+        actual_invocations,
+        expected_invocations if expected_invocations is not None else [],
+    )
     if not items:
         return MetricResult(
             metric_name=evaluator_def.name,
@@ -144,7 +168,7 @@ async def evaluate_openai_eval(
             name=f"agentevals-{evaluator_def.name}",
             data_source_config={
                 "type": "custom",
-                "item_schema": _TEXT_PAIR_SCHEMA,
+                "item_schema": _get_item_schema(grader_type),
                 "include_sample_schema": False,
             },
             testing_criteria=[testing_criteria],

From 95fdbc00bcc4d0454663b29fb13b5b73ec3273a6 Mon Sep 17 00:00:00 2001
From: wiliyam <wiliyam@users.noreply.github.com>
Date: Thu, 2 Apr 2026 13:45:23 +0000
Subject: [PATCH 3/4] fix: restore EvalRunConfig, revert validator order,
 include grader-relevant keys in details

---
 src/agentevals/config.py              | 74 ++++++++++++++++++++++++---
 src/agentevals/openai_eval_backend.py | 10 +++-
 2 files changed, 77 insertions(+), 7 deletions(-)

diff --git a/src/agentevals/config.py b/src/agentevals/config.py
index b6258aa..645d8db 100644
--- a/src/agentevals/config.py
+++ b/src/agentevals/config.py
@@ -82,7 +82,7 @@ class RemoteEvaluatorDef(BaseEvaluatorDef):
     }
 )
 
-# All supported grader types — use this constant in error messages and checks.
+# Supported grader types — shown in the unsupported-type error; keep in sync with _validate_grader branches.
 _SUPPORTED_GRADER_TYPES = frozenset({"text_similarity", "string_check"})
 
 
@@ -99,11 +99,6 @@ class OpenAIEvalDef(BaseModel):
     @classmethod
     def _validate_grader(cls, v: dict[str, Any]) -> dict[str, Any]:
         grader_type = v.get("type")
-        if grader_type not in _SUPPORTED_GRADER_TYPES:
-            raise ValueError(
-                f"Unsupported grader type '{grader_type}'. "
-                f"Supported: {sorted(_SUPPORTED_GRADER_TYPES)}"
-            )
 
         if grader_type == "text_similarity":
             metric = v.get("evaluation_metric")
@@ -123,6 +118,11 @@ def _validate_grader(cls, v: dict[str, Any]) -> dict[str, Any]:
                 )
             if "reference" not in v:
                 raise ValueError("'reference' is required for string_check grader")
+        else:
+            raise ValueError(
+                f"Unsupported grader type '{grader_type}'. "
+                f"Supported: {sorted(_SUPPORTED_GRADER_TYPES)}"
+            )
 
         return v
 
@@ -131,3 +131,65 @@ def _validate_grader(cls, v: dict[str, Any]) -> dict[str, Any]:
     BuiltinMetricDef | CodeEvaluatorDef | RemoteEvaluatorDef | OpenAIEvalDef,
     Field(discriminator="type"),
 ]
+
+
+class EvalRunConfig(BaseModel):
+    trace_files: list[str] = Field(description="Paths to trace files (Jaeger JSON or OTLP JSON).")
+
+    eval_set_file: str | None = Field(
+        default=None,
+        description="Path to a golden eval set JSON file (ADK EvalSet format).",
+    )
+
+    metrics: list[str] = Field(
+        default_factory=lambda: ["tool_trajectory_avg_score"],
+        description="List of built-in metric names to evaluate.",
+    )
+
+    custom_evaluators: list[CustomEvaluatorDef] = Field(
+        default_factory=list,
+        description="Custom evaluator definitions.",
+    )
+
+    trace_format: str = Field(
+        default="jaeger-json",
+        description="Format of the trace files (jaeger-json or otlp-json).",
+    )
+
+    judge_model: str | None = Field(
+        default=None,
+        description="LLM model for judge-based metrics.",
+    )
+
+    threshold: float | None = Field(
+        default=None,
+        description="Score threshold for pass/fail.",
+    )
+
+    trajectory_match_type: str | None = Field(
+        default=None,
+        description="Match type for tool_trajectory_avg_score: 'EXACT', 'IN_ORDER', or 'ANY_ORDER'. Default: EXACT.",
+    )
+
+    @field_validator("trajectory_match_type")
+    @classmethod
+    def _validate_trajectory_match_type(cls, v: str | None) -> str | None:
+        valid = {"EXACT", "IN_ORDER", "ANY_ORDER"}
+        if v is not None and v.upper() not in valid:
+            raise ValueError(f"Invalid trajectory_match_type '{v}'. Valid values: {sorted(valid)}")
+        return v.upper() if v is not None else v
+
+    output_format: str = Field(
+        default="table",
+        description="Output format: 'table', 'json', or 'summary'.",
+    )
+
+    max_concurrent_traces: int = Field(
+        default=10,
+        description="Maximum number of traces to evaluate concurrently.",
+    )
+
+    max_concurrent_evals: int = Field(
+        default=5,
+        description="Maximum number of concurrent metric evaluations (LLM API calls).",
+    )
diff --git a/src/agentevals/openai_eval_backend.py b/src/agentevals/openai_eval_backend.py
index e9dff70..c31c544 100644
--- a/src/agentevals/openai_eval_backend.py
+++ b/src/agentevals/openai_eval_backend.py
@@ -258,10 +258,18 @@ async def _collect_results(client: Any, eval_id: str, run_id: str, run: Any, eva
     total = result_counts.total if result_counts else 0
     eval_status = "PASSED" if failed == 0 and total > 0 else "FAILED"
 
+    grader_type = evaluator_def.grader.get("type", "")
+    # Include the grader-relevant key depending on type
+    # (evaluation_metric for text_similarity, operation for string_check)
+    if grader_type == "string_check":
+        grader_detail_key = "operation"
+    else:
+        grader_detail_key = "evaluation_metric"
+
     details: dict[str, Any] = {
         "openai_eval_id": eval_id,
         "openai_run_id": run_id,
-        "evaluation_metric": evaluator_def.grader.get("evaluation_metric"),
+        grader_detail_key: evaluator_def.grader.get(grader_detail_key),
         "result_counts": {"passed": passed, "failed": failed, "total": total},
     }
     per_criteria = getattr(run, "per_testing_criteria_results", None)

From b100aea3070e0ec808e69ecf9df4d6f2f8882c8c Mon Sep 17 00:00:00 2001
From: wiliyam <wiliyam@users.noreply.github.com>
Date: Fri, 3 Apr 2026 00:04:55 +0000
Subject: [PATCH 4/4] fix: make JSONL builder grader-aware, use if-not pattern
 for reference check

---
 src/agentevals/config.py              |  2 +-
 src/agentevals/openai_eval_backend.py | 29 ++++++++++++++++-----------
 2 files changed, 18 insertions(+), 13 deletions(-)

diff --git a/src/agentevals/config.py b/src/agentevals/config.py
index 645d8db..1543270 100644
--- a/src/agentevals/config.py
+++ b/src/agentevals/config.py
@@ -116,7 +116,7 @@ def _validate_grader(cls, v: dict[str, Any]) -> dict[str, Any]:
                 raise ValueError(
                     f"Unknown operation '{operation}'. Valid: {sorted(_VALID_STRING_CHECK_OPERATIONS)}"
                 )
-            if "reference" not in v:
+            if not v.get("reference"):
                 raise ValueError("'reference' is required for string_check grader")
         else:
             raise ValueError(
diff --git a/src/agentevals/openai_eval_backend.py b/src/agentevals/openai_eval_backend.py
index c31c544..9e4f24a 100644
--- a/src/agentevals/openai_eval_backend.py
+++ b/src/agentevals/openai_eval_backend.py
@@ -83,22 +83,26 @@ def _build_testing_criteria(evaluator_def: OpenAIEvalDef) -> dict[str, Any]:
 def _build_jsonl_items(
     actual_invocations: list[Invocation],
     expected_invocations: list[Invocation],
+    grader_type: str = "",
 ) -> list[dict[str, Any]]:
+    """Build JSONL items matching the grader-aware item schema.
+
+    string_check graders use a static reference from config and only need
+    ``actual_response`` in each item.  All other graders (e.g. text_similarity)
+    also require ``expected_response``.
+    """
+    include_expected = grader_type != "string_check"
     items = []
     for i, actual_inv in enumerate(actual_invocations):
         actual_text = _content_to_text(actual_inv.final_response)
-        if i < len(expected_invocations):
-            expected_text = _content_to_text(expected_invocations[i].final_response)
-        else:
-            expected_text = ""
-        items.append(
-            {
-                "item": {
-                    "actual_response": actual_text,
-                    "expected_response": expected_text,
-                }
-            }
-        )
+        item: dict[str, Any] = {"actual_response": actual_text}
+        if include_expected:
+            if i < len(expected_invocations):
+                expected_text = _content_to_text(expected_invocations[i].final_response)
+            else:
+                expected_text = ""
+            item["expected_response"] = expected_text
+        items.append({"item": item})
     return items
 
 
@@ -150,6 +154,7 @@ async def evaluate_openai_eval(
     items = _build_jsonl_items(
         actual_invocations,
         expected_invocations if expected_invocations is not None else [],
+        grader_type=grader_type,
     )
     if not items:
         return MetricResult(