Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
3 changes: 2 additions & 1 deletion .gitignore
Original file line number Diff line number Diff line change
@@ -1 +1,2 @@
.venv/
.venv/
__pycache__/
Binary file not shown.
6 changes: 6 additions & 0 deletions evaluators/time_efficiency/evaluator.yaml
Original file line number Diff line number Diff line change
@@ -0,0 +1,6 @@
name: time_efficiency
description: Scores how quickly the agent resolved relative to a time budget
language: python
Copy link
Copy Markdown
Collaborator

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

can you remove this file 'evaluators/bertscore/pycache/bertscore.cpython-314.pyc' ?

Copy link
Copy Markdown
Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Good catch! This file was already removed in commit 08b0905. The .gitignore also includes pycache/ so it won't be accidentally committed again.

entrypoint: time_efficiency.py
tags: [performance, time, latency, efficiency, budget]
author: henrikrexed
55 changes: 55 additions & 0 deletions evaluators/time_efficiency/time_efficiency.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,55 @@
"""Community evaluator: time_efficiency

Scores resolution time relative to a budget. Extracts duration_s from
performance_metrics when available, otherwise returns NOT_EVALUATED.

Config: max_duration_s (float, default 120)
"""

from agentevals_evaluator_sdk import EvalInput, EvalResult, EvalStatus, evaluator


def _extract_duration(inv) -> float | None:
perf = inv.performance_metrics
if not isinstance(perf, dict):
return None

d = perf.get("duration_s") or perf.get("duration")
if d is not None:
return float(d)
return None


@evaluator
def time_efficiency(input: EvalInput) -> EvalResult:
    """Score each invocation's resolution time against a time budget.

    Per-invocation score decays linearly from 1.0 (zero elapsed time) to
    0.0 (at or past ``max_duration_s``). Invocations without duration data
    score 0.0. Returns NOT_EVALUATED when no invocation has duration data
    or when the configured budget is not a positive number.

    Config:
        max_duration_s (float, default 120): time budget in seconds.
    """
    max_duration = input.config.get("max_duration_s", 120.0)

    # Guard: a zero/negative (or non-numeric) budget would otherwise cause a
    # ZeroDivisionError (or TypeError) in the score computation below.
    if not isinstance(max_duration, (int, float)) or max_duration <= 0:
        return EvalResult(
            score=0.0,
            status=EvalStatus.NOT_EVALUATED,
            details={"reason": f"invalid max_duration_s: {max_duration!r}"},
        )

    scores: list[float] = []
    details_items: list[str] = []
    has_data = False

    for inv in input.invocations:
        duration = _extract_duration(inv)
        if duration is None:
            scores.append(0.0)
            details_items.append(f"{inv.invocation_id}: no duration data")
            continue

        has_data = True
        # Linear decay, clamped to [0, 1].
        score = max(0.0, min(1.0, 1.0 - (duration / max_duration)))
        scores.append(score)
        details_items.append(f"{inv.invocation_id}: {duration:.1f}s / {max_duration:.1f}s")

    if not has_data:
        return EvalResult(
            score=0.0,
            status=EvalStatus.NOT_EVALUATED,
            details={"reason": "no duration data in any invocation"},
        )

    overall = sum(scores) / len(scores) if scores else 0.0
    return EvalResult(score=overall, per_invocation_scores=scores, details={"time_details": details_items})


if __name__ == "__main__":
    time_efficiency.run()
6 changes: 6 additions & 0 deletions evaluators/token_efficiency/evaluator.yaml
Original file line number Diff line number Diff line change
@@ -0,0 +1,6 @@
# Registry metadata for the token_efficiency community evaluator.
name: token_efficiency
description: Scores how efficiently the agent used tokens relative to a budget
language: python
entrypoint: token_efficiency.py
tags: [performance, tokens, efficiency, budget]
author: henrikrexed
63 changes: 63 additions & 0 deletions evaluators/token_efficiency/token_efficiency.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,63 @@
"""Community evaluator: token_efficiency

Scores token usage relative to a budget. Extracts input/output tokens from
performance_metrics when available, otherwise returns NOT_EVALUATED.

Config: max_input_tokens (int, default 150000), max_output_tokens (int, default 50000)
"""

from agentevals_evaluator_sdk import EvalInput, EvalResult, EvalStatus, evaluator


def _extract_tokens(inv) -> dict | None:
perf = inv.performance_metrics
if not isinstance(perf, dict):
return None

input_t = perf.get("input_tokens") or perf.get("prompt_tokens")
output_t = perf.get("output_tokens") or perf.get("completion_tokens")
Comment on lines +17 to +18
Copy link
Copy Markdown
Collaborator

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Here, or will drop zero token values.

if input_t is not None or output_t is not None:
return {"input_tokens": int(input_t or 0), "output_tokens": int(output_t or 0)}

return None


@evaluator
def token_efficiency(input: EvalInput) -> EvalResult:
    """Score token usage per invocation against configured budgets.

    Each invocation is scored by how much of its input/output token budget
    remains; the worse of the two dimensions becomes the invocation score.
    Invocations without token data score 0.0; if no invocation has data,
    returns NOT_EVALUATED.

    Config:
        max_input_tokens (int, default 150000): input-token budget.
        max_output_tokens (int, default 50000): output-token budget.
    """
    input_budget = input.config.get("max_input_tokens", 150000)
    output_budget = input.config.get("max_output_tokens", 50000)

    def _budget_score(used: int, budget: int) -> float:
        # Linear score: 1.0 with nothing used, 0.0 at or over budget; a
        # non-positive budget disables that dimension (perfect score).
        if budget <= 0:
            return 1.0
        return max(0.0, min(1.0, 1.0 - (used / budget)))

    per_scores: list[float] = []
    breakdown: list[str] = []
    found_any = False

    for inv in input.invocations:
        usage = _extract_tokens(inv)
        if usage is None:
            per_scores.append(0.0)
            breakdown.append(f"{inv.invocation_id}: no token data")
            continue

        found_any = True
        score = min(
            _budget_score(usage["input_tokens"], input_budget),
            _budget_score(usage["output_tokens"], output_budget),
        )
        per_scores.append(score)
        breakdown.append(
            f"{inv.invocation_id}: {usage['input_tokens']}in/{input_budget} + "
            f"{usage['output_tokens']}out/{output_budget} -> {score:.2f}"
        )

    if not found_any:
        return EvalResult(
            score=0.0,
            status=EvalStatus.NOT_EVALUATED,
            details={"reason": "no token data in any invocation"},
        )

    overall = sum(per_scores) / len(per_scores) if per_scores else 0.0
    return EvalResult(score=overall, per_invocation_scores=per_scores, details={"token_details": breakdown})


if __name__ == "__main__":
    token_efficiency.run()
6 changes: 6 additions & 0 deletions evaluators/tool_efficiency/evaluator.yaml
Original file line number Diff line number Diff line change
@@ -0,0 +1,6 @@
# Registry metadata for the tool_efficiency community evaluator.
name: tool_efficiency
description: Scores whether the agent used tools effectively — penalizes waste, duplicates, and errors
language: python
entrypoint: tool_efficiency.py
tags: [performance, tools, efficiency, budget]
author: henrikrexed
80 changes: 80 additions & 0 deletions evaluators/tool_efficiency/tool_efficiency.py
Copy link
Copy Markdown
Collaborator

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Please return NOT_EVALUATED when it makes sense to keep it consistent with other evaluators.

Original file line number Diff line number Diff line change
@@ -0,0 +1,80 @@
"""Community evaluator: tool_efficiency

Scores tool usage effectiveness. Penalizes duplicate calls (same tool + args),
error responses, and budget overruns.

Config: max_tool_calls (int, default 15), min_tool_calls (int, default 0),
penalize_duplicates (bool, default true), penalize_errors (bool, default true)
"""

import json
from agentevals_evaluator_sdk import EvalInput, EvalResult, evaluator


def _call_signature(call) -> str:
name = call.get("name", "") if isinstance(call, dict) else getattr(call, "name", "")
Copy link
Copy Markdown
Collaborator

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Can you please just use attribute access to match the codebase conventions?

args = call.get("args", {}) if isinstance(call, dict) else getattr(call, "args", {})
try:
args_str = json.dumps(args, sort_keys=True, default=str)
except (TypeError, ValueError):
args_str = str(args)
return f"{name}::{args_str}"


def _is_error_response(response) -> bool:
"""Check if a tool response indicates an error via its status field."""
status = response.get("status", "") if isinstance(response, dict) else getattr(response, "status", "")
return str(status).lower() in ("error", "failed", "failure")


@evaluator
def tool_efficiency(input: EvalInput) -> EvalResult:
    """Score tool-usage effectiveness per invocation.

    A call is "useful" if it is neither a duplicate (same tool + args) nor
    answered with an error response; the invocation score is the useful
    fraction, scaled down when the call count exceeds ``max_tool_calls``.
    Zero tool calls score 1.0 unless ``min_tool_calls`` > 0, in which case
    they score 0.0 (tools were required but unused).

    Config:
        max_tool_calls (int, default 15): budget before the overrun penalty.
        min_tool_calls (int, default 0): >0 makes tool use mandatory.
        penalize_duplicates (bool, default True): count repeat calls as waste.
        penalize_errors (bool, default True): count error responses as waste.
    """
    max_tool_calls = input.config.get("max_tool_calls", 15)
    min_tool_calls = input.config.get("min_tool_calls", 0)
    penalize_duplicates = input.config.get("penalize_duplicates", True)
    penalize_errors = input.config.get("penalize_errors", True)

    scores: list[float] = []
    details_items: list[str] = []

    for inv in input.invocations:
        # NOTE(review): intermediate_steps.tool_calls/tool_responses are not
        # part of the standard ADK Invocation format — confirm availability.
        tool_calls = inv.intermediate_steps.tool_calls if inv.intermediate_steps else []
        tool_responses = inv.intermediate_steps.tool_responses if inv.intermediate_steps else []
        total = len(tool_calls)

        if total == 0:
            if min_tool_calls > 0:
                scores.append(0.0)
                details_items.append(f"{inv.invocation_id}: no tool calls (min required: {min_tool_calls})")
            else:
                scores.append(1.0)
                details_items.append(f"{inv.invocation_id}: no tool calls (tools optional)")
            continue

        dupes = 0
        if penalize_duplicates:
            seen: dict[str, int] = {}
            for call in tool_calls:
                sig = _call_signature(call)
                seen[sig] = seen.get(sig, 0) + 1
            dupes = sum(c - 1 for c in seen.values() if c > 1)

        errors = sum(1 for r in tool_responses if _is_error_response(r)) if penalize_errors else 0
        useful = max(0, total - dupes - errors)

        efficiency = useful / total
        # Guard against max_tool_calls == 0, which would divide by zero; a
        # non-positive budget disables the overrun penalty entirely.
        if max_tool_calls > 0:
            budget_factor = max(0.0, 1.0 - max(0, total - max_tool_calls) / max_tool_calls)
        else:
            budget_factor = 1.0
        score = max(0.0, min(1.0, efficiency * budget_factor))
        scores.append(score)

        parts = [f"total={total}", f"useful={useful}"]
        if dupes:
            parts.append(f"dupes={dupes}")
        if errors:
            parts.append(f"errors={errors}")
        details_items.append(f"{inv.invocation_id}: {', '.join(parts)}")

    overall = sum(scores) / len(scores) if scores else 0.0
    return EvalResult(score=overall, per_invocation_scores=scores, details={"tool_details": details_items})


if __name__ == "__main__":
    tool_efficiency.run()