diff --git a/.gitignore b/.gitignore
index 0cafc1c..a230a78 100644
--- a/.gitignore
+++ b/.gitignore
@@ -1 +1,2 @@
-.venv/
\ No newline at end of file
+.venv/
+__pycache__/
diff --git a/evaluators/bertscore/__pycache__/bertscore.cpython-314.pyc b/evaluators/bertscore/__pycache__/bertscore.cpython-314.pyc
deleted file mode 100644
index 551cc69..0000000
Binary files a/evaluators/bertscore/__pycache__/bertscore.cpython-314.pyc and /dev/null differ
diff --git a/evaluators/time_efficiency/evaluator.yaml b/evaluators/time_efficiency/evaluator.yaml
new file mode 100644
index 0000000..cdd2dc8
--- /dev/null
+++ b/evaluators/time_efficiency/evaluator.yaml
@@ -0,0 +1,6 @@
+name: time_efficiency
+description: Scores how quickly the agent resolved relative to a time budget
+language: python
+entrypoint: time_efficiency.py
+tags: [performance, time, latency, efficiency, budget]
+author: henrikrexed
diff --git a/evaluators/time_efficiency/time_efficiency.py b/evaluators/time_efficiency/time_efficiency.py
new file mode 100644
index 0000000..f5069e1
--- /dev/null
+++ b/evaluators/time_efficiency/time_efficiency.py
@@ -0,0 +1,55 @@
+"""Community evaluator: time_efficiency
+
+Scores resolution time relative to a budget. Extracts duration_s from
+performance_metrics when available, otherwise returns NOT_EVALUATED.
+
+Config: max_duration_s (float, default 120)
+"""
+
+from agentevals_evaluator_sdk import EvalInput, EvalResult, EvalStatus, evaluator
+
+
+def _extract_duration(inv) -> float | None:
+    perf = inv.performance_metrics
+    if not isinstance(perf, dict):
+        return None
+
+    d = perf.get("duration_s") or perf.get("duration")
+    if d is not None:
+        return float(d)
+    return None
+
+
+@evaluator
+def time_efficiency(input: EvalInput) -> EvalResult:
+    max_duration = input.config.get("max_duration_s", 120.0)
+
+    scores: list[float] = []
+    details_items: list[str] = []
+    has_data = False
+
+    for inv in input.invocations:
+        duration = _extract_duration(inv)
+        if duration is None:
+            scores.append(0.0)
+            details_items.append(f"{inv.invocation_id}: no duration data")
+            continue
+
+        has_data = True
+        score = max(0.0, min(1.0, 1.0 - (duration / max_duration)))
+        scores.append(score)
+        details_items.append(f"{inv.invocation_id}: {duration:.1f}s / {max_duration:.1f}s")
+
+    if not has_data:
+        return EvalResult(
+            score=0.0,
+            status=EvalStatus.NOT_EVALUATED,
+            details={"reason": "no duration data in any invocation"},
+        )
+
+    overall = sum(scores) / len(scores) if scores else 0.0
+    return EvalResult(score=overall, per_invocation_scores=scores, details={"time_details": details_items})
+
+
+if __name__ == "__main__":
+    time_efficiency.run()
diff --git a/evaluators/token_efficiency/evaluator.yaml b/evaluators/token_efficiency/evaluator.yaml
new file mode 100644
index 0000000..e6d23c6
--- /dev/null
+++ b/evaluators/token_efficiency/evaluator.yaml
@@ -0,0 +1,6 @@
+name: token_efficiency
+description: Scores how efficiently the agent used tokens relative to a budget
+language: python
+entrypoint: token_efficiency.py
+tags: [performance, tokens, efficiency, budget]
+author: henrikrexed
diff --git a/evaluators/token_efficiency/token_efficiency.py b/evaluators/token_efficiency/token_efficiency.py
new file mode 100644
index 0000000..cd12975
--- /dev/null
+++ b/evaluators/token_efficiency/token_efficiency.py
@@ -0,0 +1,63 @@
+"""Community evaluator: token_efficiency
+
+Scores token usage relative to a budget. Extracts input/output tokens from
+performance_metrics when available, otherwise returns NOT_EVALUATED.
+
+Config: max_input_tokens (int, default 150000), max_output_tokens (int, default 50000)
+"""
+
+from agentevals_evaluator_sdk import EvalInput, EvalResult, EvalStatus, evaluator
+
+
+def _extract_tokens(inv) -> dict | None:
+    perf = inv.performance_metrics
+    if not isinstance(perf, dict):
+        return None
+
+    input_t = perf.get("input_tokens") or perf.get("prompt_tokens")
+    output_t = perf.get("output_tokens") or perf.get("completion_tokens")
+    if input_t is not None or output_t is not None:
+        return {"input_tokens": int(input_t or 0), "output_tokens": int(output_t or 0)}
+
+    return None
+
+
+@evaluator
+def token_efficiency(input: EvalInput) -> EvalResult:
+    max_input = input.config.get("max_input_tokens", 150000)
+    max_output = input.config.get("max_output_tokens", 50000)
+
+    scores: list[float] = []
+    details_items: list[str] = []
+    has_data = False
+
+    for inv in input.invocations:
+        tokens = _extract_tokens(inv)
+        if tokens is None:
+            scores.append(0.0)
+            details_items.append(f"{inv.invocation_id}: no token data")
+            continue
+
+        has_data = True
+        input_score = max(0.0, min(1.0, 1.0 - (tokens["input_tokens"] / max_input))) if max_input > 0 else 1.0
+        output_score = max(0.0, min(1.0, 1.0 - (tokens["output_tokens"] / max_output))) if max_output > 0 else 1.0
+        score = min(input_score, output_score)
+        scores.append(score)
+        details_items.append(
+            f"{inv.invocation_id}: {tokens['input_tokens']}in/{max_input} + "
+            f"{tokens['output_tokens']}out/{max_output} -> {score:.2f}"
+        )
+
+    if not has_data:
+        return EvalResult(
+            score=0.0,
+            status=EvalStatus.NOT_EVALUATED,
+            details={"reason": "no token data in any invocation"},
+        )
+
+    overall = sum(scores) / len(scores) if scores else 0.0
+    return EvalResult(score=overall, per_invocation_scores=scores, details={"token_details": details_items})
+
+
+if __name__ == "__main__":
+    token_efficiency.run()
diff --git a/evaluators/tool_efficiency/evaluator.yaml b/evaluators/tool_efficiency/evaluator.yaml
new file mode 100644
index 0000000..0092f17
--- /dev/null
+++ b/evaluators/tool_efficiency/evaluator.yaml
@@ -0,0 +1,6 @@
+name: tool_efficiency
+description: Scores whether the agent used tools effectively — penalizes waste, duplicates, and errors
+language: python
+entrypoint: tool_efficiency.py
+tags: [performance, tools, efficiency, budget]
+author: henrikrexed
diff --git a/evaluators/tool_efficiency/tool_efficiency.py b/evaluators/tool_efficiency/tool_efficiency.py
new file mode 100644
index 0000000..d03e400
--- /dev/null
+++ b/evaluators/tool_efficiency/tool_efficiency.py
@@ -0,0 +1,80 @@
+"""Community evaluator: tool_efficiency
+
+Scores tool usage effectiveness. Penalizes duplicate calls (same tool + args),
+error responses, and budget overruns.
+
+Config: max_tool_calls (int, default 15), min_tool_calls (int, default 0),
+        penalize_duplicates (bool, default true), penalize_errors (bool, default true)
+"""
+
+import json
+from agentevals_evaluator_sdk import EvalInput, EvalResult, evaluator
+
+
+def _call_signature(call) -> str:
+    name = call.get("name", "") if isinstance(call, dict) else getattr(call, "name", "")
+    args = call.get("args", {}) if isinstance(call, dict) else getattr(call, "args", {})
+    try:
+        args_str = json.dumps(args, sort_keys=True, default=str)
+    except (TypeError, ValueError):
+        args_str = str(args)
+    return f"{name}::{args_str}"
+
+
+def _is_error_response(response) -> bool:
+    """Check if a tool response indicates an error via its status field."""
+    status = response.get("status", "") if isinstance(response, dict) else getattr(response, "status", "")
+    return str(status).lower() in ("error", "failed", "failure")
+
+
+@evaluator
+def tool_efficiency(input: EvalInput) -> EvalResult:
+    max_tool_calls = input.config.get("max_tool_calls", 15)
+    min_tool_calls = input.config.get("min_tool_calls", 0)
+    penalize_duplicates = input.config.get("penalize_duplicates", True)
+    penalize_errors = input.config.get("penalize_errors", True)
+
+    scores: list[float] = []
+    details_items: list[str] = []
+
+    for inv in input.invocations:
+        tool_calls = inv.intermediate_steps.tool_calls if inv.intermediate_steps else []
+        tool_responses = inv.intermediate_steps.tool_responses if inv.intermediate_steps else []
+        total = len(tool_calls)
+
+        if total == 0:
+            if min_tool_calls > 0:
+                scores.append(0.0)
+                details_items.append(f"{inv.invocation_id}: no tool calls (min required: {min_tool_calls})")
+            else:
+                scores.append(1.0)
+                details_items.append(f"{inv.invocation_id}: no tool calls (tools optional)")
+            continue
+
+        dupes = 0
+        if penalize_duplicates:
+            seen: dict[str, int] = {}
+            for call in tool_calls:
+                sig = _call_signature(call)
+                seen[sig] = seen.get(sig, 0) + 1
+            dupes = sum(c - 1 for c in seen.values() if c > 1)
+
+        errors = sum(1 for r in tool_responses if _is_error_response(r)) if penalize_errors else 0
+        useful = max(0, total - dupes - errors)
+
+        efficiency = useful / total
+        budget_factor = max(0.0, 1.0 - max(0, total - max_tool_calls) / max_tool_calls)
+        score = max(0.0, min(1.0, efficiency * budget_factor))
+        scores.append(score)
+
+        parts = [f"total={total}", f"useful={useful}"]
+        if dupes: parts.append(f"dupes={dupes}")
+        if errors: parts.append(f"errors={errors}")
+        details_items.append(f"{inv.invocation_id}: {', '.join(parts)}")
+
+    overall = sum(scores) / len(scores) if scores else 0.0
+    return EvalResult(score=overall, per_invocation_scores=scores, details={"tool_details": details_items})
+
+
+if __name__ == "__main__":
+    tool_efficiency.run()