-
Notifications
You must be signed in to change notification settings - Fork 1
feat: add token_efficiency, tool_efficiency, time_efficiency evaluators #7
New issue
Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.
By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.
Already on GitHub? Sign in to your account
base: main
Are you sure you want to change the base?
Changes from all commits
08b0905
8371bc0
4e9899d
beadc5c
c5d1684
File filter
Filter by extension
Conversations
Jump to
Diff view
Diff view
There are no files selected for viewing
| Original file line number | Diff line number | Diff line change |
|---|---|---|
| @@ -1 +1,2 @@ | ||
| .venv/ | ||
| .venv/ | ||
| __pycache__/ |
| Original file line number | Diff line number | Diff line change |
|---|---|---|
| @@ -0,0 +1,6 @@ | ||
| name: time_efficiency | ||
| description: Scores how quickly the agent resolved relative to a time budget | ||
| language: python | ||
| entrypoint: time_efficiency.py | ||
| tags: [performance, time, latency, efficiency, budget] | ||
| author: henrikrexed | ||
| Original file line number | Diff line number | Diff line change |
|---|---|---|
| @@ -0,0 +1,55 @@ | ||
| """Community evaluator: time_efficiency | ||
|
|
||
| Scores resolution time relative to a budget. Extracts duration_s from | ||
| performance_metrics when available, otherwise returns NOT_EVALUATED. | ||
|
|
||
| Config: max_duration_s (float, default 120) | ||
| """ | ||
|
|
||
| from agentevals_evaluator_sdk import EvalInput, EvalResult, EvalStatus, evaluator | ||
|
|
||
|
|
||
| def _extract_duration(inv) -> float | None: | ||
| perf = inv.performance_metrics | ||
| if not isinstance(perf, dict): | ||
| return None | ||
|
|
||
| d = perf.get("duration_s") or perf.get("duration") | ||
| if d is not None: | ||
| return float(d) | ||
| return None | ||
|
|
||
|
|
||
@evaluator
def time_efficiency(input: EvalInput) -> EvalResult:
    """Score how quickly the agent resolved relative to a time budget.

    Config:
        max_duration_s (float, default 120): per-invocation time budget.
            A non-positive budget disables the check (scores 1.0), matching
            the zero-guard convention used by token_efficiency.

    Returns NOT_EVALUATED when no invocation carries duration data.
    """
    max_duration = input.config.get("max_duration_s", 120.0)

    scores: list[float] = []
    details_items: list[str] = []
    has_data = False

    for inv in input.invocations:
        duration = _extract_duration(inv)
        if duration is None:
            # Missing data still contributes a 0.0 per-invocation score.
            scores.append(0.0)
            details_items.append(f"{inv.invocation_id}: no duration data")
            continue

        has_data = True
        # Guard against a zero/negative budget to avoid ZeroDivisionError;
        # treat it as "no budget" (perfect score), like token_efficiency.
        if max_duration > 0:
            score = max(0.0, min(1.0, 1.0 - (duration / max_duration)))
        else:
            score = 1.0
        scores.append(score)
        details_items.append(f"{inv.invocation_id}: {duration:.1f}s / {max_duration:.1f}s")

    if not has_data:
        return EvalResult(
            score=0.0,
            status=EvalStatus.NOT_EVALUATED,
            details={"reason": "no duration data in any invocation"},
        )

    overall = sum(scores) / len(scores) if scores else 0.0
    return EvalResult(score=overall, per_invocation_scores=scores, details={"time_details": details_items})
|
Collaborator
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. Please use |
||
|
|
||
|
|
||
if __name__ == "__main__":
    # Run this evaluator standalone via the SDK's CLI entrypoint.
    time_efficiency.run()
| Original file line number | Diff line number | Diff line change |
|---|---|---|
| @@ -0,0 +1,6 @@ | ||
| name: token_efficiency | ||
| description: Scores how efficiently the agent used tokens relative to a budget | ||
| language: python | ||
| entrypoint: token_efficiency.py | ||
| tags: [performance, tokens, efficiency, budget] | ||
| author: henrikrexed |
| Original file line number | Diff line number | Diff line change |
|---|---|---|
| @@ -0,0 +1,63 @@ | ||
| """Community evaluator: token_efficiency | ||
|
|
||
| Scores token usage relative to a budget. Extracts input/output tokens from | ||
| performance_metrics when available, otherwise returns NOT_EVALUATED. | ||
|
|
||
| Config: max_input_tokens (int, default 150000), max_output_tokens (int, default 50000) | ||
| """ | ||
|
|
||
| from agentevals_evaluator_sdk import EvalInput, EvalResult, EvalStatus, evaluator | ||
|
|
||
|
|
||
| def _extract_tokens(inv) -> dict | None: | ||
| perf = inv.performance_metrics | ||
| if not isinstance(perf, dict): | ||
| return None | ||
|
|
||
| input_t = perf.get("input_tokens") or perf.get("prompt_tokens") | ||
| output_t = perf.get("output_tokens") or perf.get("completion_tokens") | ||
|
Comment on lines
+17
to
+18
Collaborator
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. Here, |
||
| if input_t is not None or output_t is not None: | ||
| return {"input_tokens": int(input_t or 0), "output_tokens": int(output_t or 0)} | ||
|
|
||
| return None | ||
|
|
||
|
|
||
@evaluator
def token_efficiency(input: EvalInput) -> EvalResult:
    """Score token usage against configured input/output budgets.

    Config:
        max_input_tokens (int, default 150000)
        max_output_tokens (int, default 50000)

    Each invocation scores the *worse* of its input-budget and output-budget
    ratios; a non-positive budget disables that side of the check.
    Returns NOT_EVALUATED when no invocation carries token data.
    """
    max_input = input.config.get("max_input_tokens", 150000)
    max_output = input.config.get("max_output_tokens", 50000)

    def _budget_score(used: int, budget: int) -> float:
        # A non-positive budget means "unlimited": perfect score.
        if budget <= 0:
            return 1.0
        return max(0.0, min(1.0, 1.0 - used / budget))

    scores: list[float] = []
    details_items: list[str] = []
    has_data = False

    for inv in input.invocations:
        tokens = _extract_tokens(inv)
        if tokens is None:
            scores.append(0.0)
            details_items.append(f"{inv.invocation_id}: no token data")
            continue

        has_data = True
        in_used = tokens["input_tokens"]
        out_used = tokens["output_tokens"]
        score = min(_budget_score(in_used, max_input), _budget_score(out_used, max_output))
        scores.append(score)
        details_items.append(
            f"{inv.invocation_id}: {in_used}in/{max_input} + "
            f"{out_used}out/{max_output} -> {score:.2f}"
        )

    if not has_data:
        return EvalResult(
            score=0.0,
            status=EvalStatus.NOT_EVALUATED,
            details={"reason": "no token data in any invocation"},
        )

    overall = sum(scores) / len(scores) if scores else 0.0
    return EvalResult(score=overall, per_invocation_scores=scores, details={"token_details": details_items})
|
|
||
|
|
||
if __name__ == "__main__":
    # Run this evaluator standalone via the SDK's CLI entrypoint.
    token_efficiency.run()
| Original file line number | Diff line number | Diff line change |
|---|---|---|
| @@ -0,0 +1,6 @@ | ||
| name: tool_efficiency | ||
| description: Scores whether the agent used tools effectively — penalizes waste, duplicates, and errors | ||
| language: python | ||
| entrypoint: tool_efficiency.py | ||
| tags: [performance, tools, efficiency, budget] | ||
| author: henrikrexed |
|
Collaborator
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. Please return |
| Original file line number | Diff line number | Diff line change |
|---|---|---|
| @@ -0,0 +1,80 @@ | ||
| """Community evaluator: tool_efficiency | ||
|
|
||
| Scores tool usage effectiveness. Penalizes duplicate calls (same tool + args), | ||
| error responses, and budget overruns. | ||
|
|
||
| Config: max_tool_calls (int, default 15), min_tool_calls (int, default 0), | ||
| penalize_duplicates (bool, default true), penalize_errors (bool, default true) | ||
| """ | ||
|
|
||
| import json | ||
| from agentevals_evaluator_sdk import EvalInput, EvalResult, evaluator | ||
|
|
||
|
|
||
| def _call_signature(call) -> str: | ||
| name = call.get("name", "") if isinstance(call, dict) else getattr(call, "name", "") | ||
|
Collaborator
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. Can you please just use attribute access to match the codebase conventions? |
||
| args = call.get("args", {}) if isinstance(call, dict) else getattr(call, "args", {}) | ||
| try: | ||
| args_str = json.dumps(args, sort_keys=True, default=str) | ||
| except (TypeError, ValueError): | ||
| args_str = str(args) | ||
| return f"{name}::{args_str}" | ||
|
|
||
|
|
||
| def _is_error_response(response) -> bool: | ||
| """Check if a tool response indicates an error via its status field.""" | ||
| status = response.get("status", "") if isinstance(response, dict) else getattr(response, "status", "") | ||
| return str(status).lower() in ("error", "failed", "failure") | ||
|
|
||
|
|
||
@evaluator
def tool_efficiency(input: EvalInput) -> EvalResult:
    """Score tool-usage effectiveness per invocation.

    Penalizes duplicate calls (same tool + args), error responses, and
    budget overruns.

    Config:
        max_tool_calls (int, default 15): soft budget. A non-positive value
            disables the budget penalty — this also guards the division
            below against ZeroDivisionError when it is configured as 0.
        min_tool_calls (int, default 0): when > 0, zero tool calls scores
            0.0 ("tools are required"); at the default 0, zero tool calls
            scores 1.0 ("tools are optional").
        penalize_duplicates (bool, default True)
        penalize_errors (bool, default True)
    """
    max_tool_calls = input.config.get("max_tool_calls", 15)
    min_tool_calls = input.config.get("min_tool_calls", 0)
    penalize_duplicates = input.config.get("penalize_duplicates", True)
    penalize_errors = input.config.get("penalize_errors", True)

    scores: list[float] = []
    details_items: list[str] = []

    for inv in input.invocations:
        # NOTE(review): intermediate_steps.tool_calls/tool_responses are not
        # part of the standard ADK Invocation format — confirm availability.
        tool_calls = inv.intermediate_steps.tool_calls if inv.intermediate_steps else []
        tool_responses = inv.intermediate_steps.tool_responses if inv.intermediate_steps else []
        total = len(tool_calls)

        if total == 0:
            if min_tool_calls > 0:
                scores.append(0.0)
                details_items.append(f"{inv.invocation_id}: no tool calls (min required: {min_tool_calls})")
            else:
                scores.append(1.0)
                details_items.append(f"{inv.invocation_id}: no tool calls (tools optional)")
            continue

        dupes = 0
        if penalize_duplicates:
            seen: dict[str, int] = {}
            for call in tool_calls:
                sig = _call_signature(call)
                seen[sig] = seen.get(sig, 0) + 1
            dupes = sum(c - 1 for c in seen.values() if c > 1)

        errors = sum(1 for r in tool_responses if _is_error_response(r)) if penalize_errors else 0
        useful = max(0, total - dupes - errors)

        efficiency = useful / total
        # Guard against a zero/negative budget to avoid ZeroDivisionError;
        # treat it as "no budget" (no overrun penalty).
        if max_tool_calls > 0:
            budget_factor = max(0.0, 1.0 - max(0, total - max_tool_calls) / max_tool_calls)
        else:
            budget_factor = 1.0
        score = max(0.0, min(1.0, efficiency * budget_factor))
        scores.append(score)

        parts = [f"total={total}", f"useful={useful}"]
        if dupes:
            parts.append(f"dupes={dupes}")
        if errors:
            parts.append(f"errors={errors}")
        details_items.append(f"{inv.invocation_id}: {', '.join(parts)}")

    overall = sum(scores) / len(scores) if scores else 0.0
    return EvalResult(score=overall, per_invocation_scores=scores, details={"tool_details": details_items})
|
|
||
|
|
||
if __name__ == "__main__":
    # Run this evaluator standalone via the SDK's CLI entrypoint.
    tool_efficiency.run()
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
can you remove this file 'evaluators/bertscore/__pycache__/bertscore.cpython-314.pyc' ?
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
Good catch! This file was already removed in commit 08b0905. The .gitignore also includes __pycache__/ so it won't be accidentally committed again.