Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
48 changes: 41 additions & 7 deletions src/agentevals/config.py
Original file line number Diff line number Diff line change
Expand Up @@ -69,6 +69,22 @@ class RemoteEvaluatorDef(BaseEvaluatorDef):
}
)

_VALID_STRING_CHECK_OPERATIONS = frozenset(
Copy link
Copy Markdown
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

These are not all valid, please fix it.

{
"eq",
"ne",
"like",
"ilike",
"contains",
"not_contains",
"starts_with",
"ends_with",
}
)

# All supported grader types — used in error messages and type checks.
_SUPPORTED_GRADER_TYPES = frozenset({"text_similarity", "string_check"})


class OpenAIEvalDef(BaseModel):
"""An evaluator that delegates grading to the OpenAI Evals API."""
Expand All @@ -83,13 +99,31 @@ class OpenAIEvalDef(BaseModel):
@classmethod
def _validate_grader(cls, v: dict[str, Any]) -> dict[str, Any]:
    """Validate the grader config dict for a supported grader type.

    Args:
        v: Raw grader mapping from the evaluator definition.

    Returns:
        The validated mapping, unchanged.

    Raises:
        ValueError: If the grader type is unsupported, or a required
            field is missing/unknown for the given type.
    """
    grader_type = v.get("type")

    if grader_type == "text_similarity":
        # text_similarity compares actual vs expected using a named metric.
        metric = v.get("evaluation_metric")
        if not metric:
            raise ValueError("'evaluation_metric' is required for text_similarity grader")
        if metric not in _VALID_SIMILARITY_METRICS:
            raise ValueError(
                f"Unknown evaluation_metric '{metric}'. Valid: {sorted(_VALID_SIMILARITY_METRICS)}"
            )
    elif grader_type == "string_check":
        # string_check compares the actual response against a static
        # reference string using one of the supported operations.
        operation = v.get("operation")
        if not operation:
            raise ValueError("'operation' is required for string_check grader")
        if operation not in _VALID_STRING_CHECK_OPERATIONS:
            raise ValueError(
                f"Unknown operation '{operation}'. Valid: {sorted(_VALID_STRING_CHECK_OPERATIONS)}"
            )
        if not v.get("reference"):
            raise ValueError("'reference' is required for string_check grader")
    else:
        raise ValueError(
            f"Unsupported grader type '{grader_type}'. "
            f"Supported: {sorted(_SUPPORTED_GRADER_TYPES)}"
        )

    return v


Expand Down
80 changes: 63 additions & 17 deletions src/agentevals/openai_eval_backend.py
Original file line number Diff line number Diff line change
Expand Up @@ -22,6 +22,7 @@

_POLL_INTERVAL_SECONDS = 2

# Schema for graders that compare actual vs expected (e.g. text_similarity).
_TEXT_PAIR_SCHEMA = {
"type": "object",
"properties": {
Expand All @@ -31,6 +32,22 @@
"required": ["actual_response", "expected_response"],
}

# Schema for graders that only need the actual response (e.g. string_check).
_ACTUAL_ONLY_SCHEMA = {
"type": "object",
"properties": {
"actual_response": {"type": "string"},
},
"required": ["actual_response"],
}


def _get_item_schema(grader_type: str) -> dict[str, Any]:
    """Pick the JSONL item schema that matches *grader_type*.

    string_check graders only submit the actual response; every other
    grader type submits the actual/expected pair.
    """
    return _ACTUAL_ONLY_SCHEMA if grader_type == "string_check" else _TEXT_PAIR_SCHEMA


def _build_testing_criteria(evaluator_def: OpenAIEvalDef) -> dict[str, Any]:
"""Build the OpenAI testing_criteria dict from the evaluator config.
Expand All @@ -51,28 +68,41 @@ def _build_testing_criteria(evaluator_def: OpenAIEvalDef) -> dict[str, Any]:
"pass_threshold": evaluator_def.threshold,
}

if grader_type == "string_check":
return {
"type": "string_check",
"name": evaluator_def.name,
"input": "{{ item.actual_response }}",
"reference": grader["reference"],
"operation": grader["operation"],
}

raise ValueError(f"Unsupported grader type: {grader_type}")


def _build_jsonl_items(
    actual_invocations: list[Invocation],
    expected_invocations: list[Invocation],
    grader_type: str = "",
) -> list[dict[str, Any]]:
    """Build JSONL items matching the grader-aware item schema.

    string_check graders use a static reference from config and only need
    ``actual_response`` in each item. All other graders (e.g.
    text_similarity) also require ``expected_response``.

    Args:
        actual_invocations: Invocations produced by the agent under test.
        expected_invocations: Golden invocations, aligned by index; may be
            shorter than ``actual_invocations`` (missing entries map to "").
        grader_type: Grader type string; controls whether
            ``expected_response`` is included.

    Returns:
        A list of ``{"item": {...}}`` dicts ready for JSONL upload.
    """
    include_expected = grader_type != "string_check"
    items = []
    for i, actual_inv in enumerate(actual_invocations):
        actual_text = _content_to_text(actual_inv.final_response)
        item: dict[str, Any] = {"actual_response": actual_text}
        if include_expected:
            # Pad with an empty string when the golden set is shorter than
            # the actual run, so every item satisfies the pair schema.
            if i < len(expected_invocations):
                expected_text = _content_to_text(expected_invocations[i].final_response)
            else:
                expected_text = ""
            item["expected_response"] = expected_text
        items.append({"item": item})
    return items


Expand Down Expand Up @@ -111,13 +141,21 @@ async def evaluate_openai_eval(
error="OPENAI_API_KEY environment variable is not set.",
)

if expected_invocations is None:
grader_type = evaluator_def.grader.get("type", "")

# string_check graders use a static reference from config and don't need
# expected_invocations — only text_similarity requires a golden eval set.
if grader_type != "string_check" and expected_invocations is None:
return MetricResult(
metric_name=evaluator_def.name,
error="OpenAI text_similarity grader requires expected invocations (golden eval set).",
error=f"OpenAI {grader_type} grader requires expected invocations (golden eval set).",
)

items = _build_jsonl_items(actual_invocations, expected_invocations)
items = _build_jsonl_items(
actual_invocations,
expected_invocations if expected_invocations is not None else [],
grader_type=grader_type,
)
if not items:
return MetricResult(
metric_name=evaluator_def.name,
Expand All @@ -135,7 +173,7 @@ async def evaluate_openai_eval(
name=f"agentevals-{evaluator_def.name}",
data_source_config={
"type": "custom",
"item_schema": _TEXT_PAIR_SCHEMA,
"item_schema": _get_item_schema(grader_type),
"include_sample_schema": False,
},
testing_criteria=[testing_criteria],
Expand Down Expand Up @@ -225,10 +263,18 @@ async def _collect_results(client: Any, eval_id: str, run_id: str, run: Any, eva
total = result_counts.total if result_counts else 0
eval_status = "PASSED" if failed == 0 and total > 0 else "FAILED"

grader_type = evaluator_def.grader.get("type", "")
# Include the grader-relevant key depending on type
# (evaluation_metric for text_similarity, operation for string_check)
if grader_type == "string_check":
grader_detail_key = "operation"
else:
grader_detail_key = "evaluation_metric"

details: dict[str, Any] = {
"openai_eval_id": eval_id,
"openai_run_id": run_id,
"evaluation_metric": evaluator_def.grader.get("evaluation_metric"),
grader_detail_key: evaluator_def.grader.get(grader_detail_key),
"result_counts": {"passed": passed, "failed": failed, "total": total},
}
per_criteria = getattr(run, "per_testing_criteria_results", None)
Expand Down