From 91c00a6224e444f87971080e4e8161270f7bc710 Mon Sep 17 00:00:00 2001 From: krisztianfekete Date: Tue, 7 Apr 2026 12:42:59 +0200 Subject: [PATCH] extend sdk with perf metrics Signed-off-by: krisztianfekete --- .../src/agentevals_evaluator_sdk/types.py | 2 ++ src/agentevals/_protocol.py | 2 ++ src/agentevals/custom_evaluators.py | 31 ++++++++++++++----- src/agentevals/runner.py | 1 + 4 files changed, 29 insertions(+), 7 deletions(-) diff --git a/packages/evaluator-sdk-py/src/agentevals_evaluator_sdk/types.py b/packages/evaluator-sdk-py/src/agentevals_evaluator_sdk/types.py index 51974b1..3c408e4 100644 --- a/packages/evaluator-sdk-py/src/agentevals_evaluator_sdk/types.py +++ b/packages/evaluator-sdk-py/src/agentevals_evaluator_sdk/types.py @@ -25,6 +25,7 @@ class ToolResponseData(BaseModel): name: str output: str = "" + status: Optional[str] = None class IntermediateStepData(BaseModel): @@ -51,6 +52,7 @@ class InvocationData(BaseModel): user_content: str = "" final_response: Optional[str] = None intermediate_steps: IntermediateStepData = Field(default_factory=IntermediateStepData) + performance_metrics: Optional[dict[str, Any]] = None class EvalInput(BaseModel): diff --git a/src/agentevals/_protocol.py b/src/agentevals/_protocol.py index 4508490..de44350 100644 --- a/src/agentevals/_protocol.py +++ b/src/agentevals/_protocol.py @@ -34,6 +34,7 @@ class ToolResponseData(BaseModel): name: str output: str = "" + status: Optional[str] = None class IntermediateStepData(BaseModel): @@ -50,6 +51,7 @@ class InvocationData(BaseModel): user_content: str = "" final_response: Optional[str] = None intermediate_steps: IntermediateStepData = Field(default_factory=IntermediateStepData) + performance_metrics: Optional[dict[str, Any]] = None class EvalInput(BaseModel): diff --git a/src/agentevals/custom_evaluators.py b/src/agentevals/custom_evaluators.py index 47bee97..74444ff 100644 --- a/src/agentevals/custom_evaluators.py +++ b/src/agentevals/custom_evaluators.py @@ -25,6 +25,7 @@ from agentevals._protocol import ( EvalInput, EvalResult, + IntermediateStepData, InvocationData, ToolCallData, ToolResponseData, @@ -296,33 +297,45 @@ def _extract_tool_responses_from_invocation(inv: Invocation) -> list[ToolRespons for tr in inv.intermediate_data.tool_responses or []: name = "" output = "" + status = None if hasattr(tr, "name"): name = tr.name or "" if hasattr(tr, "response"): output = str(tr.response) if tr.response else "" elif hasattr(tr, "output"): output = str(tr.output) if tr.output else "" - responses.append(ToolResponseData(name=name, output=output)) + if hasattr(tr, "status") and tr.status: + status = str(tr.status) + responses.append(ToolResponseData(name=name, output=output, status=status)) return responses -def invocation_to_data(inv: Invocation) -> InvocationData: +def invocation_to_data( + inv: Invocation, + performance_metrics: dict[str, Any] | None = None, +) -> InvocationData: """Convert an ADK Invocation to a simplified InvocationData for the protocol.""" return InvocationData( invocation_id=inv.invocation_id or "", user_content=_content_to_text(inv.user_content), final_response=_content_to_text(inv.final_response) or None, - tool_calls=_extract_tool_calls_from_invocation(inv), - tool_responses=_extract_tool_responses_from_invocation(inv), + intermediate_steps=IntermediateStepData( + tool_calls=_extract_tool_calls_from_invocation(inv), + tool_responses=_extract_tool_responses_from_invocation(inv), + ), + performance_metrics=performance_metrics, ) -def invocations_to_data(invocations: list[Invocation] | None) -> list[InvocationData] | None: +def invocations_to_data( + invocations: list[Invocation] | None, + performance_metrics: dict[str, Any] | None = None, +) -> list[InvocationData] | None: """Convert a list of ADK Invocations, or return None.""" if invocations is None: return None - return [invocation_to_data(inv) for inv in invocations] + return [invocation_to_data(inv, performance_metrics=performance_metrics) for inv in invocations] # --------------------------------------------------------------------------- @@ -382,11 +395,13 @@ def __init__( metric_name: str, threshold: float = 0.5, config: dict[str, Any] | None = None, + performance_metrics: dict[str, Any] | None = None, ): self._backend = backend self._metric_name = metric_name self._threshold = threshold self._config = config or {} + self._performance_metrics = performance_metrics async def evaluate_invocations( self, @@ -399,7 +414,7 @@ async def evaluate_invocations( metric_name=self._metric_name, threshold=self._threshold, config=self._config, - invocations=invocations_to_data(actual_invocations) or [], + invocations=invocations_to_data(actual_invocations, performance_metrics=self._performance_metrics) or [], expected_invocations=invocations_to_data(expected_invocations), ) @@ -416,6 +431,7 @@ async def evaluate_custom_evaluator( evaluator_def, actual_invocations: list[Invocation], expected_invocations: list[Invocation] | None, + performance_metrics: dict[str, Any] | None = None, ): """Evaluate a single custom evaluator and return a ``MetricResult``. @@ -468,6 +484,7 @@ async def evaluate_custom_evaluator( metric_name=evaluator_def.name, threshold=evaluator_def.threshold, config=evaluator_def.config, + performance_metrics=performance_metrics, ) try: diff --git a/src/agentevals/runner.py b/src/agentevals/runner.py index 0c6134e..76c2f91 100644 --- a/src/agentevals/runner.py +++ b/src/agentevals/runner.py @@ -261,6 +261,7 @@ async def _eval_custom_with_semaphore(evaluator_def: CustomEvaluatorDef) -> Metr evaluator_def=evaluator_def, actual_invocations=actual_invocations, expected_invocations=expected_invocations, + performance_metrics=performance_metrics, ) result.duration_ms = (time.monotonic() - t0) * 1000 return await _append_result(result)