Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
3 changes: 2 additions & 1 deletion .gitignore
Original file line number Diff line number Diff line change
@@ -1 +1,2 @@
.venv/
.venv/
__pycache__/
Binary file not shown.
6 changes: 6 additions & 0 deletions evaluators/time_efficiency/evaluator.yaml
Original file line number Diff line number Diff line change
@@ -0,0 +1,6 @@
name: time_efficiency
description: Scores how quickly the agent resolved relative to a time budget
language: python
Copy link
Copy Markdown
Collaborator

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

can you remove this file 'evaluators/bertscore/pycache/bertscore.cpython-314.pyc' ?

Copy link
Copy Markdown
Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Good catch! This file was already removed in commit 08b0905. The .gitignore also includes pycache/ so it won't be accidentally committed again.

entrypoint: time_efficiency.py
tags: [performance, time, latency, efficiency, budget]
author: henrikrexed
55 changes: 55 additions & 0 deletions evaluators/time_efficiency/time_efficiency.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,55 @@
"""Community evaluator: time_efficiency

Scores resolution time relative to a budget. Extracts duration_s from
performance_metrics when available, otherwise returns NOT_EVALUATED.

Config: max_duration_s (float, default 120)
"""

from agentevals_evaluator_sdk import EvalInput, EvalResult, EvalStatus, evaluator


def _extract_duration(inv) -> float | None:
perf = inv.performance_metrics
if not isinstance(perf, dict):
return None

d = perf.get("duration_s") or perf.get("duration")
if d is not None:
return float(d)
return None


@evaluator
def time_efficiency(input: EvalInput) -> EvalResult:
    """Score each invocation's resolution time against a time budget.

    Per-invocation score decays linearly from 1.0 (zero elapsed time) to
    0.0 (at or past ``max_duration_s``). Invocations without duration data
    score 0.0. Returns NOT_EVALUATED when no invocation has duration data
    or when the configured budget is not a positive number.

    Config:
        max_duration_s (float, default 120): time budget in seconds.
    """
    max_duration = input.config.get("max_duration_s", 120.0)

    # Guard: a zero/negative (or non-numeric) budget would otherwise cause a
    # ZeroDivisionError (or TypeError) in the score computation below.
    if not isinstance(max_duration, (int, float)) or max_duration <= 0:
        return EvalResult(
            score=0.0,
            status=EvalStatus.NOT_EVALUATED,
            details={"reason": f"invalid max_duration_s: {max_duration!r}"},
        )

    scores: list[float] = []
    details_items: list[str] = []
    has_data = False

    for inv in input.invocations:
        duration = _extract_duration(inv)
        if duration is None:
            scores.append(0.0)
            details_items.append(f"{inv.invocation_id}: no duration data")
            continue

        has_data = True
        # Linear decay, clamped to [0, 1].
        score = max(0.0, min(1.0, 1.0 - (duration / max_duration)))
        scores.append(score)
        details_items.append(f"{inv.invocation_id}: {duration:.1f}s / {max_duration:.1f}s")

    if not has_data:
        return EvalResult(
            score=0.0,
            status=EvalStatus.NOT_EVALUATED,
            details={"reason": "no duration data in any invocation"},
        )

    overall = sum(scores) / len(scores) if scores else 0.0
    return EvalResult(score=overall, per_invocation_scores=scores, details={"time_details": details_items})


if __name__ == "__main__":
    time_efficiency.run()
6 changes: 6 additions & 0 deletions evaluators/token_efficiency/evaluator.yaml
Original file line number Diff line number Diff line change
@@ -0,0 +1,6 @@
# Registry metadata for the token_efficiency community evaluator.
name: token_efficiency
description: Scores how efficiently the agent used tokens relative to a budget
language: python
entrypoint: token_efficiency.py
tags: [performance, tokens, efficiency, budget]
author: henrikrexed
63 changes: 63 additions & 0 deletions evaluators/token_efficiency/token_efficiency.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,63 @@
"""Community evaluator: token_efficiency

Scores token usage relative to a budget. Extracts input/output tokens from
performance_metrics when available, otherwise returns NOT_EVALUATED.

Config: max_input_tokens (int, default 150000), max_output_tokens (int, default 50000)
"""

from agentevals_evaluator_sdk import EvalInput, EvalResult, EvalStatus, evaluator


def _extract_tokens(inv) -> dict | None:
perf = inv.performance_metrics
if not isinstance(perf, dict):
return None

input_t = perf.get("input_tokens") or perf.get("prompt_tokens")
output_t = perf.get("output_tokens") or perf.get("completion_tokens")
Comment on lines +17 to +18
Copy link
Copy Markdown
Collaborator

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Here, or will drop zero token values.

if input_t is not None or output_t is not None:
return {"input_tokens": int(input_t or 0), "output_tokens": int(output_t or 0)}

return None


@evaluator
def token_efficiency(input: EvalInput) -> EvalResult:
    """Score token usage per invocation against configured budgets.

    Each invocation is scored by how much of its input/output token budget
    remains; the worse of the two dimensions becomes the invocation score.
    Invocations without token data score 0.0; if no invocation has data,
    returns NOT_EVALUATED.

    Config:
        max_input_tokens (int, default 150000): input-token budget.
        max_output_tokens (int, default 50000): output-token budget.
    """
    input_budget = input.config.get("max_input_tokens", 150000)
    output_budget = input.config.get("max_output_tokens", 50000)

    def _budget_score(used: int, budget: int) -> float:
        # Linear score: 1.0 with nothing used, 0.0 at or over budget; a
        # non-positive budget disables that dimension (perfect score).
        if budget <= 0:
            return 1.0
        return max(0.0, min(1.0, 1.0 - (used / budget)))

    per_scores: list[float] = []
    breakdown: list[str] = []
    found_any = False

    for inv in input.invocations:
        usage = _extract_tokens(inv)
        if usage is None:
            per_scores.append(0.0)
            breakdown.append(f"{inv.invocation_id}: no token data")
            continue

        found_any = True
        score = min(
            _budget_score(usage["input_tokens"], input_budget),
            _budget_score(usage["output_tokens"], output_budget),
        )
        per_scores.append(score)
        breakdown.append(
            f"{inv.invocation_id}: {usage['input_tokens']}in/{input_budget} + "
            f"{usage['output_tokens']}out/{output_budget} -> {score:.2f}"
        )

    if not found_any:
        return EvalResult(
            score=0.0,
            status=EvalStatus.NOT_EVALUATED,
            details={"reason": "no token data in any invocation"},
        )

    overall = sum(per_scores) / len(per_scores) if per_scores else 0.0
    return EvalResult(score=overall, per_invocation_scores=per_scores, details={"token_details": breakdown})


if __name__ == "__main__":
    token_efficiency.run()
6 changes: 6 additions & 0 deletions evaluators/tool_efficiency/evaluator.yaml
Original file line number Diff line number Diff line change
@@ -0,0 +1,6 @@
# Registry metadata for the tool_efficiency community evaluator.
name: tool_efficiency
description: Scores whether the agent used tools effectively — penalizes waste, duplicates, and errors
language: python
entrypoint: tool_efficiency.py
tags: [performance, tools, efficiency, budget]
author: henrikrexed
80 changes: 80 additions & 0 deletions evaluators/tool_efficiency/tool_efficiency.py
Copy link
Copy Markdown
Collaborator

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Please return NOT_EVALUATED when it makes sense to keep it consistent with other evaluators.

Original file line number Diff line number Diff line change
@@ -0,0 +1,80 @@
"""Community evaluator: tool_efficiency

Scores tool usage effectiveness. Penalizes duplicate calls (same tool + args),
error responses, and budget overruns.

Config: max_tool_calls (int, default 15), min_tool_calls (int, default 0),
penalize_duplicates (bool, default true), penalize_errors (bool, default true)
"""

import json
from agentevals_evaluator_sdk import EvalInput, EvalResult, evaluator


def _call_signature(call) -> str:
name = call.get("name", "") if isinstance(call, dict) else getattr(call, "name", "")
Copy link
Copy Markdown
Collaborator

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Can you please just use attribute access to match the codebase conventions?

args = call.get("args", {}) if isinstance(call, dict) else getattr(call, "args", {})
try:
args_str = json.dumps(args, sort_keys=True, default=str)
except (TypeError, ValueError):
args_str = str(args)
return f"{name}::{args_str}"


def _is_error_response(response) -> bool:
"""Check if a tool response indicates an error via its status field."""
status = response.get("status", "") if isinstance(response, dict) else getattr(response, "status", "")
return str(status).lower() in ("error", "failed", "failure")


@evaluator
def tool_efficiency(input: EvalInput) -> EvalResult:
    """Score tool-usage effectiveness per invocation.

    A call is "useful" if it is neither a duplicate (same tool + args) nor
    answered with an error response; the invocation score is the useful
    fraction, scaled down when the call count exceeds ``max_tool_calls``.
    Zero tool calls score 1.0 unless ``min_tool_calls`` > 0, in which case
    they score 0.0 (tools were required but unused).

    Config:
        max_tool_calls (int, default 15): budget before the overrun penalty.
        min_tool_calls (int, default 0): >0 makes tool use mandatory.
        penalize_duplicates (bool, default True): count repeat calls as waste.
        penalize_errors (bool, default True): count error responses as waste.
    """
    max_tool_calls = input.config.get("max_tool_calls", 15)
    min_tool_calls = input.config.get("min_tool_calls", 0)
    penalize_duplicates = input.config.get("penalize_duplicates", True)
    penalize_errors = input.config.get("penalize_errors", True)

    scores: list[float] = []
    details_items: list[str] = []

    for inv in input.invocations:
        # NOTE(review): intermediate_steps.tool_calls/tool_responses are not
        # part of the standard ADK Invocation format — confirm availability.
        tool_calls = inv.intermediate_steps.tool_calls if inv.intermediate_steps else []
        tool_responses = inv.intermediate_steps.tool_responses if inv.intermediate_steps else []
        total = len(tool_calls)

        if total == 0:
            if min_tool_calls > 0:
                scores.append(0.0)
                details_items.append(f"{inv.invocation_id}: no tool calls (min required: {min_tool_calls})")
            else:
                scores.append(1.0)
                details_items.append(f"{inv.invocation_id}: no tool calls (tools optional)")
            continue

        dupes = 0
        if penalize_duplicates:
            seen: dict[str, int] = {}
            for call in tool_calls:
                sig = _call_signature(call)
                seen[sig] = seen.get(sig, 0) + 1
            dupes = sum(c - 1 for c in seen.values() if c > 1)

        errors = sum(1 for r in tool_responses if _is_error_response(r)) if penalize_errors else 0
        useful = max(0, total - dupes - errors)

        efficiency = useful / total
        # Guard against max_tool_calls == 0, which would divide by zero; a
        # non-positive budget disables the overrun penalty entirely.
        if max_tool_calls > 0:
            budget_factor = max(0.0, 1.0 - max(0, total - max_tool_calls) / max_tool_calls)
        else:
            budget_factor = 1.0
        score = max(0.0, min(1.0, efficiency * budget_factor))
        scores.append(score)

        parts = [f"total={total}", f"useful={useful}"]
        if dupes:
            parts.append(f"dupes={dupes}")
        if errors:
            parts.append(f"errors={errors}")
        details_items.append(f"{inv.invocation_id}: {', '.join(parts)}")

    overall = sum(scores) / len(scores) if scores else 0.0
    return EvalResult(score=overall, per_invocation_scores=scores, details={"tool_details": details_items})


if __name__ == "__main__":
    tool_efficiency.run()