From bd6cc7b2cb635185b1dacc839c1fe8f718428e12 Mon Sep 17 00:00:00 2001 From: wiliyam Date: Wed, 1 Apr 2026 23:19:34 +0000 Subject: [PATCH 1/4] feat: add StringCheckGrader support for OpenAI Evals backend (#95) --- src/agentevals/config.py | 37 ++++++++++++++++++++++----- src/agentevals/openai_eval_backend.py | 9 +++++++ 2 files changed, 39 insertions(+), 7 deletions(-) diff --git a/src/agentevals/config.py b/src/agentevals/config.py index f7a3149..836c633 100644 --- a/src/agentevals/config.py +++ b/src/agentevals/config.py @@ -53,6 +53,19 @@ class RemoteEvaluatorDef(BaseEvaluatorDef): ref: str = Field(description="Source-specific reference (e.g. path within the repo).") +_VALID_STRING_CHECK_OPERATIONS = frozenset( + { + "eq", + "ne", + "like", + "ilike", + "contains", + "not_contains", + "starts_with", + "ends_with", + } +) + _VALID_SIMILARITY_METRICS = frozenset( { "fuzzy_match", @@ -83,13 +96,23 @@ class OpenAIEvalDef(BaseModel): @classmethod def _validate_grader(cls, v: dict[str, Any]) -> dict[str, Any]: grader_type = v.get("type") - if grader_type != "text_similarity": - raise ValueError(f"Only 'text_similarity' grader type is currently supported, got '{grader_type}'") - metric = v.get("evaluation_metric") - if not metric: - raise ValueError("'evaluation_metric' is required for text_similarity grader") - if metric not in _VALID_SIMILARITY_METRICS: - raise ValueError(f"Unknown evaluation_metric '{metric}'. Valid: {sorted(_VALID_SIMILARITY_METRICS)}") + if grader_type == "text_similarity": + metric = v.get("evaluation_metric") + if not metric: + raise ValueError("'evaluation_metric' is required for text_similarity grader") + if metric not in _VALID_SIMILARITY_METRICS: + raise ValueError(f"Unknown evaluation_metric '{metric}'. Valid: {sorted(_VALID_SIMILARITY_METRICS)}") + elif grader_type == "string_check": + operation = v.get("operation") + if not operation: + raise ValueError("'operation' is required for string_check grader") + if operation not in _VALID_STRING_CHECK_OPERATIONS: + raise ValueError(f"Unknown operation '{operation}'. Valid: {sorted(_VALID_STRING_CHECK_OPERATIONS)}") + if "reference" not in v: + raise ValueError("'reference' is required for string_check grader") + else: + supported = "'text_similarity', 'string_check'" + raise ValueError(f"Unsupported grader type '{grader_type}'. Supported: {supported}") return v diff --git a/src/agentevals/openai_eval_backend.py b/src/agentevals/openai_eval_backend.py index a6e9c00..c92881b 100644 --- a/src/agentevals/openai_eval_backend.py +++ b/src/agentevals/openai_eval_backend.py @@ -51,6 +51,15 @@ def _build_testing_criteria(evaluator_def: OpenAIEvalDef) -> dict[str, Any]: "pass_threshold": evaluator_def.threshold, } + if grader_type == "string_check": + return { + "type": "string_check", + "name": evaluator_def.name, + "input": "{{ item.actual_response }}", + "reference": grader["reference"], + "operation": grader["operation"], + } + raise ValueError(f"Unsupported grader type: {grader_type}") From a50231585f9e73333c0945df2ae2c82f74ad7910 Mon Sep 17 00:00:00 2001 From: wiliyam Date: Thu, 2 Apr 2026 12:35:17 +0000 Subject: [PATCH 2/4] fix: address review feedback - grader-aware schema, gate expected_invocations on grader type, use _SUPPORTED_GRADER_TYPES constant --- src/agentevals/config.py | 109 +++++++------------------- src/agentevals/openai_eval_backend.py | 32 +++++++- 2 files changed, 57 insertions(+), 84 deletions(-) diff --git a/src/agentevals/config.py b/src/agentevals/config.py index 836c633..b6258aa 100644 --- a/src/agentevals/config.py +++ b/src/agentevals/config.py @@ -53,19 +53,6 @@ class RemoteEvaluatorDef(BaseEvaluatorDef): ref: str = Field(description="Source-specific reference (e.g. path within the repo).") -_VALID_STRING_CHECK_OPERATIONS = frozenset( - { - "eq", - "ne", - "like", - "ilike", - "contains", - "not_contains", - "starts_with", - "ends_with", - } -) - _VALID_SIMILARITY_METRICS = frozenset( { "fuzzy_match", @@ -82,6 +69,22 @@ class RemoteEvaluatorDef(BaseEvaluatorDef): } ) +_VALID_STRING_CHECK_OPERATIONS = frozenset( + { + "eq", + "ne", + "like", + "ilike", + "contains", + "not_contains", + "starts_with", + "ends_with", + } +) + +# All supported grader types — use this constant in error messages and checks. +_SUPPORTED_GRADER_TYPES = frozenset({"text_similarity", "string_check"}) + class OpenAIEvalDef(BaseModel): """An evaluator that delegates grading to the OpenAI Evals API.""" @@ -96,23 +99,31 @@ class OpenAIEvalDef(BaseModel): @classmethod def _validate_grader(cls, v: dict[str, Any]) -> dict[str, Any]: grader_type = v.get("type") + if grader_type not in _SUPPORTED_GRADER_TYPES: + raise ValueError( + f"Unsupported grader type '{grader_type}'. " + f"Supported: {sorted(_SUPPORTED_GRADER_TYPES)}" + ) + if grader_type == "text_similarity": metric = v.get("evaluation_metric") if not metric: raise ValueError("'evaluation_metric' is required for text_similarity grader") if metric not in _VALID_SIMILARITY_METRICS: - raise ValueError(f"Unknown evaluation_metric '{metric}'. Valid: {sorted(_VALID_SIMILARITY_METRICS)}") + raise ValueError( + f"Unknown evaluation_metric '{metric}'. Valid: {sorted(_VALID_SIMILARITY_METRICS)}" + ) elif grader_type == "string_check": operation = v.get("operation") if not operation: raise ValueError("'operation' is required for string_check grader") if operation not in _VALID_STRING_CHECK_OPERATIONS: - raise ValueError(f"Unknown operation '{operation}'. Valid: {sorted(_VALID_STRING_CHECK_OPERATIONS)}") + raise ValueError( + f"Unknown operation '{operation}'. Valid: {sorted(_VALID_STRING_CHECK_OPERATIONS)}" + ) if "reference" not in v: raise ValueError("'reference' is required for string_check grader") - else: - supported = "'text_similarity', 'string_check'" - raise ValueError(f"Unsupported grader type '{grader_type}'. Supported: {supported}") + return v @@ -120,65 +131,3 @@ def _validate_grader(cls, v: dict[str, Any]) -> dict[str, Any]: BuiltinMetricDef | CodeEvaluatorDef | RemoteEvaluatorDef | OpenAIEvalDef, Field(discriminator="type"), ] - - -class EvalRunConfig(BaseModel): - trace_files: list[str] = Field(description="Paths to trace files (Jaeger JSON or OTLP JSON).") - - eval_set_file: str | None = Field( - default=None, - description="Path to a golden eval set JSON file (ADK EvalSet format).", - ) - - metrics: list[str] = Field( - default_factory=lambda: ["tool_trajectory_avg_score"], - description="List of built-in metric names to evaluate.", - ) - - custom_evaluators: list[CustomEvaluatorDef] = Field( - default_factory=list, - description="Custom evaluator definitions.", - ) - - trace_format: str = Field( - default="jaeger-json", - description="Format of the trace files (jaeger-json or otlp-json).", - ) - - judge_model: str | None = Field( - default=None, - description="LLM model for judge-based metrics.", - ) - - threshold: float | None = Field( - default=None, - description="Score threshold for pass/fail.", - ) - - trajectory_match_type: str | None = Field( - default=None, - description="Match type for tool_trajectory_avg_score: 'EXACT', 'IN_ORDER', or 'ANY_ORDER'. Default: EXACT.", - ) - - @field_validator("trajectory_match_type") - @classmethod - def _validate_trajectory_match_type(cls, v: str | None) -> str | None: - valid = {"EXACT", "IN_ORDER", "ANY_ORDER"} - if v is not None and v.upper() not in valid: - raise ValueError(f"Invalid trajectory_match_type '{v}'. Valid values: {sorted(valid)}") - return v.upper() if v is not None else v - - output_format: str = Field( - default="table", - description="Output format: 'table', 'json', or 'summary'.", - ) - - max_concurrent_traces: int = Field( - default=10, - description="Maximum number of traces to evaluate concurrently.", - ) - - max_concurrent_evals: int = Field( - default=5, - description="Maximum number of concurrent metric evaluations (LLM API calls).", - ) diff --git a/src/agentevals/openai_eval_backend.py b/src/agentevals/openai_eval_backend.py index c92881b..e9dff70 100644 --- a/src/agentevals/openai_eval_backend.py +++ b/src/agentevals/openai_eval_backend.py @@ -22,6 +22,7 @@ _POLL_INTERVAL_SECONDS = 2 +# Schema for graders that compare actual vs expected (e.g. text_similarity). _TEXT_PAIR_SCHEMA = { "type": "object", "properties": { @@ -31,6 +32,22 @@ "required": ["actual_response", "expected_response"], } +# Schema for graders that only need the actual response (e.g. string_check). +_ACTUAL_ONLY_SCHEMA = { + "type": "object", + "properties": { + "actual_response": {"type": "string"}, + }, + "required": ["actual_response"], +} + + +def _get_item_schema(grader_type: str) -> dict[str, Any]: + """Return the appropriate item schema for the given grader type.""" + if grader_type == "string_check": + return _ACTUAL_ONLY_SCHEMA + return _TEXT_PAIR_SCHEMA + def _build_testing_criteria(evaluator_def: OpenAIEvalDef) -> dict[str, Any]: """Build the OpenAI testing_criteria dict from the evaluator config. @@ -120,13 +137,20 @@ async def evaluate_openai_eval( error="OPENAI_API_KEY environment variable is not set.", ) - if expected_invocations is None: + grader_type = evaluator_def.grader.get("type", "") + + # string_check graders use a static reference from config and don't need + # expected_invocations — only text_similarity requires a golden eval set. + if grader_type != "string_check" and expected_invocations is None: return MetricResult( metric_name=evaluator_def.name, - error="OpenAI text_similarity grader requires expected invocations (golden eval set).", + error=f"OpenAI {grader_type} grader requires expected invocations (golden eval set).", ) - items = _build_jsonl_items(actual_invocations, expected_invocations) + items = _build_jsonl_items( + actual_invocations, + expected_invocations if expected_invocations is not None else [], + ) if not items: return MetricResult( metric_name=evaluator_def.name, @@ -144,7 +168,7 @@ async def evaluate_openai_eval( name=f"agentevals-{evaluator_def.name}", data_source_config={ "type": "custom", - "item_schema": _TEXT_PAIR_SCHEMA, + "item_schema": _get_item_schema(grader_type), "include_sample_schema": False, }, testing_criteria=[testing_criteria], From 95fdbc00bcc4d0454663b29fb13b5b73ec3273a6 Mon Sep 17 00:00:00 2001 From: wiliyam Date: Thu, 2 Apr 2026 13:45:23 +0000 Subject: [PATCH 3/4] fix: restore EvalRunConfig, revert validator order, include grader-relevant keys in details --- src/agentevals/config.py | 74 ++++++++++++++++++++++++--- src/agentevals/openai_eval_backend.py | 10 +++- 2 files changed, 77 insertions(+), 7 deletions(-) diff --git a/src/agentevals/config.py b/src/agentevals/config.py index b6258aa..645d8db 100644 --- a/src/agentevals/config.py +++ b/src/agentevals/config.py @@ -82,7 +82,7 @@ class RemoteEvaluatorDef(BaseEvaluatorDef): } ) -# All supported grader types — use this constant in error messages and checks. +# All supported grader types — used in error messages and type checks. _SUPPORTED_GRADER_TYPES = frozenset({"text_similarity", "string_check"}) @@ -99,11 +99,6 @@ class OpenAIEvalDef(BaseModel): @classmethod def _validate_grader(cls, v: dict[str, Any]) -> dict[str, Any]: grader_type = v.get("type") - if grader_type not in _SUPPORTED_GRADER_TYPES: - raise ValueError( - f"Unsupported grader type '{grader_type}'. " - f"Supported: {sorted(_SUPPORTED_GRADER_TYPES)}" - ) if grader_type == "text_similarity": metric = v.get("evaluation_metric") @@ -123,6 +118,11 @@ def _validate_grader(cls, v: dict[str, Any]) -> dict[str, Any]: ) if "reference" not in v: raise ValueError("'reference' is required for string_check grader") + else: + raise ValueError( + f"Unsupported grader type '{grader_type}'. " + f"Supported: {sorted(_SUPPORTED_GRADER_TYPES)}" + ) return v @@ -131,3 +131,65 @@ def _validate_grader(cls, v: dict[str, Any]) -> dict[str, Any]: BuiltinMetricDef | CodeEvaluatorDef | RemoteEvaluatorDef | OpenAIEvalDef, Field(discriminator="type"), ] + + +class EvalRunConfig(BaseModel): + trace_files: list[str] = Field(description="Paths to trace files (Jaeger JSON or OTLP JSON).") + + eval_set_file: str | None = Field( + default=None, + description="Path to a golden eval set JSON file (ADK EvalSet format).", + ) + + metrics: list[str] = Field( + default_factory=lambda: ["tool_trajectory_avg_score"], + description="List of built-in metric names to evaluate.", + ) + + custom_evaluators: list[CustomEvaluatorDef] = Field( + default_factory=list, + description="Custom evaluator definitions.", + ) + + trace_format: str = Field( + default="jaeger-json", + description="Format of the trace files (jaeger-json or otlp-json).", + ) + + judge_model: str | None = Field( + default=None, + description="LLM model for judge-based metrics.", + ) + + threshold: float | None = Field( + default=None, + description="Score threshold for pass/fail.", + ) + + trajectory_match_type: str | None = Field( + default=None, + description="Match type for tool_trajectory_avg_score: 'EXACT', 'IN_ORDER', or 'ANY_ORDER'. Default: EXACT.", + ) + + @field_validator("trajectory_match_type") + @classmethod + def _validate_trajectory_match_type(cls, v: str | None) -> str | None: + valid = {"EXACT", "IN_ORDER", "ANY_ORDER"} + if v is not None and v.upper() not in valid: + raise ValueError(f"Invalid trajectory_match_type '{v}'. Valid values: {sorted(valid)}") + return v.upper() if v is not None else v + + output_format: str = Field( + default="table", + description="Output format: 'table', 'json', or 'summary'.", + ) + + max_concurrent_traces: int = Field( + default=10, + description="Maximum number of traces to evaluate concurrently.", + ) + + max_concurrent_evals: int = Field( + default=5, + description="Maximum number of concurrent metric evaluations (LLM API calls).", + ) diff --git a/src/agentevals/openai_eval_backend.py b/src/agentevals/openai_eval_backend.py index e9dff70..c31c544 100644 --- a/src/agentevals/openai_eval_backend.py +++ b/src/agentevals/openai_eval_backend.py @@ -258,10 +258,18 @@ async def _collect_results(client: Any, eval_id: str, run_id: str, run: Any, eva total = result_counts.total if result_counts else 0 eval_status = "PASSED" if failed == 0 and total > 0 else "FAILED" + grader_type = evaluator_def.grader.get("type", "") + # Include the grader-relevant key depending on type + # (evaluation_metric for text_similarity, operation for string_check) + if grader_type == "string_check": + grader_detail_key = "operation" + else: + grader_detail_key = "evaluation_metric" + details: dict[str, Any] = { "openai_eval_id": eval_id, "openai_run_id": run_id, - "evaluation_metric": evaluator_def.grader.get("evaluation_metric"), + grader_detail_key: evaluator_def.grader.get(grader_detail_key), "result_counts": {"passed": passed, "failed": failed, "total": total}, } per_criteria = getattr(run, "per_testing_criteria_results", None) From b100aea3070e0ec808e69ecf9df4d6f2f8882c8c Mon Sep 17 00:00:00 2001 From: wiliyam Date: Fri, 3 Apr 2026 00:04:55 +0000 Subject: [PATCH 4/4] fix: make JSONL builder grader-aware, use if-not pattern for reference check --- src/agentevals/config.py | 2 +- src/agentevals/openai_eval_backend.py | 29 ++++++++++++++++----------- 2 files changed, 18 insertions(+), 13 deletions(-) diff --git a/src/agentevals/config.py b/src/agentevals/config.py index 645d8db..1543270 100644 --- a/src/agentevals/config.py +++ b/src/agentevals/config.py @@ -116,7 +116,7 @@ def _validate_grader(cls, v: dict[str, Any]) -> dict[str, Any]: raise ValueError( f"Unknown operation '{operation}'. Valid: {sorted(_VALID_STRING_CHECK_OPERATIONS)}" ) - if "reference" not in v: + if not v.get("reference"): raise ValueError("'reference' is required for string_check grader") else: raise ValueError( diff --git a/src/agentevals/openai_eval_backend.py b/src/agentevals/openai_eval_backend.py index c31c544..9e4f24a 100644 --- a/src/agentevals/openai_eval_backend.py +++ b/src/agentevals/openai_eval_backend.py @@ -83,22 +83,26 @@ def _build_testing_criteria(evaluator_def: OpenAIEvalDef) -> dict[str, Any]: def _build_jsonl_items( actual_invocations: list[Invocation], expected_invocations: list[Invocation], + grader_type: str = "", ) -> list[dict[str, Any]]: + """Build JSONL items matching the grader-aware item schema. + + string_check graders use a static reference from config and only need + ``actual_response`` in each item. All other graders (e.g. text_similarity) + also require ``expected_response``. + """ + include_expected = grader_type != "string_check" items = [] for i, actual_inv in enumerate(actual_invocations): actual_text = _content_to_text(actual_inv.final_response) - if i < len(expected_invocations): - expected_text = _content_to_text(expected_invocations[i].final_response) - else: - expected_text = "" - items.append( - { - "item": { - "actual_response": actual_text, - "expected_response": expected_text, - } - } - ) + item: dict[str, Any] = {"actual_response": actual_text} + if include_expected: + if i < len(expected_invocations): + expected_text = _content_to_text(expected_invocations[i].final_response) + else: + expected_text = "" + item["expected_response"] = expected_text + items.append({"item": item}) return items @@ -150,6 +154,7 @@ async def evaluate_openai_eval( items = _build_jsonl_items( actual_invocations, expected_invocations if expected_invocations is not None else [], + grader_type=grader_type, ) if not items: return MetricResult(