From 61c7f122afea318584e69e55d2324ba9fe21afe3 Mon Sep 17 00:00:00 2001
From: krisztianfekete <git@krisztianfekete.org>
Date: Thu, 26 Mar 2026 12:33:03 +0100
Subject: [PATCH 1/4] add venv support for custom evals

---
 examples/custom_evaluators/eval_config.yaml |  10 ++
 src/agentevals/custom_evaluators.py         |  56 +++++++---
 src/agentevals/evaluator/sources.py         |  24 +++-
 src/agentevals/evaluator/venv.py            | 117 ++++++++++++++++++++
 4 files changed, 191 insertions(+), 16 deletions(-)
 create mode 100644 src/agentevals/evaluator/venv.py

diff --git a/examples/custom_evaluators/eval_config.yaml b/examples/custom_evaluators/eval_config.yaml
index 59171b6..a7c1f92 100644
--- a/examples/custom_evaluators/eval_config.yaml
+++ b/examples/custom_evaluators/eval_config.yaml
@@ -33,3 +33,13 @@ evaluators:
     threshold: 0.110
     executor: local
 
+  # TODO switch to GitHub once PR is approved.
+  - name: bertscore
+    type: code
+    path: ../evaluators/evaluators/bertscore/bertscore.py
+    threshold: 0.7
+    timeout: 300
+    config:
+      expected: "There are two Helm releases installed in the cluster: kagent in namespace kagent (revision 2, deployed, chart kagent-0.7.14) and kagent-crds in namespace kagent (revision 1, deployed, chart kagent-crds-0.7.14)."
+      metric: "f1"
+
diff --git a/src/agentevals/custom_evaluators.py b/src/agentevals/custom_evaluators.py
index 3889476..2a9dff2 100644
--- a/src/agentevals/custom_evaluators.py
+++ b/src/agentevals/custom_evaluators.py
@@ -68,8 +68,12 @@ def extensions(self) -> tuple[str, ...]:
         """File extensions this runtime handles (e.g. ``(".py",)``)."""
 
     @abc.abstractmethod
-    def build_command(self, path: Path) -> list[str]:
-        """Return the argv list to execute *path*."""
+    def build_command(self, path: Path, python: Path | None = None) -> list[str]:
+        """Return the argv list to execute *path*.
+
+        For Python runtimes, *python* may point to a venv interpreter.
+        Non-Python runtimes ignore this parameter.
+        """
 
     def is_available(self) -> bool:
         """Return True if the runtime's interpreter is found on the system."""
@@ -89,8 +93,9 @@ def name(self) -> str:
     def extensions(self) -> tuple[str, ...]:
         return (".py",)
 
-    def build_command(self, path: Path) -> list[str]:
-        return [sys.executable, str(path)]
+    def build_command(self, path: Path, python: Path | None = None) -> list[str]:
+        exe = str(python) if python else sys.executable
+        return [exe, str(path)]
 
     def is_available(self) -> bool:
         return True
@@ -105,7 +110,7 @@ def name(self) -> str:
     def extensions(self) -> tuple[str, ...]:
         return (".js", ".ts")
 
-    def build_command(self, path: Path) -> list[str]:
+    def build_command(self, path: Path, python: Path | None = None) -> list[str]:
         node = shutil.which("node")
         if not node:
             raise RuntimeError("Node.js not found on PATH (required for .js/.ts evaluators)")
@@ -203,19 +208,22 @@ class SubprocessBackend(EvaluatorBackend):
     """Runs a local code file (.py, .js, .ts, …) as a subprocess.
 
     The correct interpreter is resolved from the file extension via the
-    :data:`_RUNTIMES` registry.
+    :data:`_RUNTIMES` registry.  When *venv_python* is provided, Python
+    evaluators run inside that virtual environment instead of the host
+    interpreter.
     """
 
-    def __init__(self, path: Path, timeout: int = 30):
+    def __init__(self, path: Path, timeout: int = 30, venv_python: Path | None = None):
         self._path = path.resolve()
         self._runtime = _resolve_runtime(self._path)
         self._timeout = timeout
+        self._venv_python = venv_python
 
         if not self._path.exists():
             raise FileNotFoundError(f"Evaluator file not found: {self._path}")
 
     async def run(self, eval_input: EvalInput, metric_name: str) -> EvalResult:
-        cmd = self._runtime.build_command(self._path)
+        cmd = self._runtime.build_command(self._path, self._venv_python)
         return await _run_subprocess(cmd, eval_input.model_dump_json(), self._timeout, metric_name)
 
 
@@ -223,20 +231,22 @@ async def run(self, eval_input: EvalInput, metric_name: str) -> EvalResult:
 # Executor factory
 # ---------------------------------------------------------------------------
 
-_EXECUTOR_FACTORIES: dict[str, Callable[[Path, int], EvaluatorBackend]] = {
-    "local": lambda path, timeout: SubprocessBackend(path, timeout),
+_EXECUTOR_FACTORIES: dict[str, Callable[..., EvaluatorBackend]] = {
+    "local": lambda path, timeout, venv_python=None: SubprocessBackend(path, timeout, venv_python),
 }
 
 
-def create_executor(executor_name: str, path: Path, timeout: int = 30) -> EvaluatorBackend:
+def create_executor(
+    executor_name: str, path: Path, timeout: int = 30, venv_python: Path | None = None
+) -> EvaluatorBackend:
     """Construct an EvaluatorBackend by executor name (e.g. 'local', 'docker')."""
     factory = _EXECUTOR_FACTORIES.get(executor_name)
     if factory is None:
         raise ValueError(f"Unknown executor '{executor_name}'. Available: {sorted(_EXECUTOR_FACTORIES.keys())}")
-    return factory(path, timeout)
+    return factory(path, timeout, venv_python)
 
 
-def register_executor(name: str, factory: Callable[[Path, int], EvaluatorBackend]) -> None:
+def register_executor(name: str, factory: Callable[..., EvaluatorBackend]) -> None:
     """Register a new executor factory (e.g. for Docker support)."""
     _EXECUTOR_FACTORIES[name] = factory
 
@@ -425,7 +435,25 @@ async def evaluate_custom_evaluator(
         evaluator_def = await get_default_resolver().resolve(evaluator_def)
 
     if isinstance(evaluator_def, CodeEvaluatorDef):
-        backend = create_executor(evaluator_def.executor, Path(evaluator_def.path), evaluator_def.timeout)
+        evaluator_path = Path(evaluator_def.path)
+
+        # Set up a venv if the evaluator ships a requirements.txt.
+        venv_python: Path | None = None
+        if evaluator_path.suffix == ".py":
+            from .evaluator.venv import ensure_venv_async
+
+            try:
+                venv_python = await ensure_venv_async(evaluator_path)
+            except Exception as exc:
+                logger.error("Failed to set up venv for '%s': %s", evaluator_def.name, exc)
+                return MetricResult(
+                    metric_name=evaluator_def.name,
+                    error=f"Dependency installation failed: {exc}",
+                )
+
+        backend = create_executor(
+            evaluator_def.executor, evaluator_path, evaluator_def.timeout, venv_python=venv_python
+        )
     else:
         raise ValueError(f"Unsupported custom evaluator type: {type(evaluator_def).__name__}")
 
diff --git a/src/agentevals/evaluator/sources.py b/src/agentevals/evaluator/sources.py
index 9ef5d1f..5122c29 100644
--- a/src/agentevals/evaluator/sources.py
+++ b/src/agentevals/evaluator/sources.py
@@ -216,8 +216,22 @@ async def fetch_evaluator(self, ref: str, dest: Path) -> Path:
             resp = await client.get(url, headers=self._headers(), timeout=30)
             resp.raise_for_status()
 
-        dest.parent.mkdir(parents=True, exist_ok=True)
-        dest.write_text(resp.text, encoding="utf-8")  # noqa: ASYNC240
+            dest.parent.mkdir(parents=True, exist_ok=True)
+            dest.write_text(resp.text, encoding="utf-8")  # noqa: ASYNC240
+
+            # Also try to fetch requirements.txt from the same directory.
+            ref_dir = str(Path(ref).parent)
+            req_ref = f"{ref_dir}/requirements.txt"
+            req_url = self._raw_url(req_ref)
+            try:
+                req_resp = await client.get(req_url, headers=self._headers(), timeout=15)
+                if req_resp.status_code == 200:
+                    req_dest = dest.parent / "requirements.txt"
+                    req_dest.write_text(req_resp.text, encoding="utf-8")  # noqa: ASYNC240
+                    logger.info("Downloaded requirements.txt for evaluator")
+            except httpx.HTTPError:
+                logger.debug("No requirements.txt found for evaluator (or download failed)")
+
         return dest
 
 
@@ -267,6 +281,12 @@ async def fetch_evaluator(self, ref: str, dest: Path) -> Path:
 
         dest.parent.mkdir(parents=True, exist_ok=True)
         shutil.copy2(src, dest)
+
+        # Also copy requirements.txt if it exists alongside the source file.
+        req_src = src.parent / "requirements.txt"
+        if req_src.exists():
+            shutil.copy2(req_src, dest.parent / "requirements.txt")
+
         return dest
 
 
diff --git a/src/agentevals/evaluator/venv.py b/src/agentevals/evaluator/venv.py
new file mode 100644
index 0000000..43aca99
--- /dev/null
+++ b/src/agentevals/evaluator/venv.py
@@ -0,0 +1,117 @@
+"""Virtual environment management for evaluators with dependencies.
+
+When an evaluator ships a ``requirements.txt`` alongside its entrypoint, we
+create a cached venv, install the dependencies (plus the evaluator SDK), and
+return the path to that venv's Python interpreter so the evaluator subprocess
+runs in isolation.
+"""
+
+from __future__ import annotations
+
+import asyncio
+import hashlib
+import logging
+import shutil
+import subprocess
+import sys
+from pathlib import Path
+
+logger = logging.getLogger(__name__)
+
+_VENV_CACHE_DIR = Path.home() / ".cache" / "agentevals" / "venvs"
+_HASH_FILE = ".requirements_hash"
+
+# Per-evaluator locks to prevent concurrent venv creation for the same evaluator.
+_venv_locks: dict[str, asyncio.Lock] = {}
+
+
+def _venv_python(venv_dir: Path) -> Path:  # interpreter path inside the venv rooted at venv_dir
+    if sys.platform == "win32":  # Windows layout: Scripts/python.exe
+        return venv_dir / "Scripts" / "python.exe"
+    return venv_dir / "bin" / "python"  # every other platform: bin/python
+
+
+def _venv_key(evaluator_path: Path) -> str:
+    """Cache directory name: the parent directory's name plus 8 hex chars of the SHA-256 of its resolved path."""
+    resolved = evaluator_path.resolve()
+    name = resolved.parent.name
+    path_hash = hashlib.sha256(str(resolved.parent).encode()).hexdigest()[:8]  # keyed on the directory, not the file
+    return f"{name}-{path_hash}"
+
+
+def _is_venv_valid(venv_dir: Path, req_hash: str) -> bool:  # interpreter present and stored hash matches
+    hash_file = venv_dir / _HASH_FILE
+    return _venv_python(venv_dir).exists() and hash_file.exists() and hash_file.read_text().strip() == req_hash
+
+
+def _create_venv(venv_dir: Path, uv: str | None) -> None:  # (re)create venv_dir from scratch
+    if venv_dir.exists():
+        shutil.rmtree(venv_dir)  # discard whatever was there before building anew
+    cmd = (  # `uv venv` pinned to the host interpreter when a uv path is given, else stdlib `venv`
+        [uv, "venv", str(venv_dir), "--python", sys.executable] if uv else [sys.executable, "-m", "venv", str(venv_dir)]
+    )
+    subprocess.run(cmd, check=True, capture_output=True)  # non-zero exit raises CalledProcessError
+
+
+def _install_deps(venv_dir: Path, requirements: Path, uv: str | None) -> None:  # SDK first, then requirements
+    python = str(_venv_python(venv_dir))
+    sdk_spec = "agentevals-evaluator-sdk"  # installed unconditionally, before the evaluator's own requirements
+
+    if uv:
+        base = [uv, "pip", "install", "--python", python]  # uv targets the venv through --python
+    else:
+        base = [python, "-m", "pip", "install"]  # run pip with the venv's own interpreter
+
+    subprocess.run(base + [sdk_spec], check=True, capture_output=True)
+    logger.info("Installing dependencies from %s ...", requirements.name)
+    subprocess.run(base + ["-r", str(requirements)], check=True)
+
+
+# ---------------------------------------------------------------------------
+# Public API
+# ---------------------------------------------------------------------------
+
+
+def ensure_venv(evaluator_path: Path) -> Path | None:
+    """Return the cached venv's Python for *evaluator_path*, or ``None`` when it has no ``requirements.txt``.
+
+    Rebuilds a missing/stale venv; raises ``RuntimeError`` with the tool's stderr (decoded leniently) on failure.
+    """
+    requirements = evaluator_path.resolve().parent / "requirements.txt"
+    if not requirements.exists():
+        return None
+
+    req_hash = hashlib.sha256(requirements.read_bytes()).hexdigest()  # any content change invalidates the cache
+    venv_dir = _VENV_CACHE_DIR / _venv_key(evaluator_path)
+
+    if _is_venv_valid(venv_dir, req_hash):
+        logger.debug("Using cached venv for %s at %s", evaluator_path.name, venv_dir)
+        return _venv_python(venv_dir)
+
+    uv = shutil.which("uv")  # None when uv is not on PATH
+    logger.info(
+        "Setting up environment for evaluator '%s' (using %s). This may take a while on first run...",
+        evaluator_path.stem,
+        "uv" if uv else "venv+pip",
+    )
+
+    try:
+        _create_venv(venv_dir, uv)
+        _install_deps(venv_dir, requirements, uv)
+    except subprocess.CalledProcessError as exc:
+        stderr = exc.stderr.decode(errors="replace") if isinstance(exc.stderr, bytes) else (exc.stderr or "")
+        raise RuntimeError(f"Failed to set up environment for evaluator '{evaluator_path.stem}': {stderr}") from exc
+
+    (venv_dir / _HASH_FILE).write_text(req_hash)  # written last: a failed build leaves no hash and is rebuilt
+    logger.info("Environment ready for '%s'", evaluator_path.stem)
+    return _venv_python(venv_dir)
+
+
+async def ensure_venv_async(evaluator_path: Path) -> Path | None:
+    """Run :func:`ensure_venv` in a worker thread, serialized per venv key by an ``asyncio.Lock``."""
+    venv_key = _venv_key(evaluator_path)
+    if venv_key not in _venv_locks:
+        _venv_locks[venv_key] = asyncio.Lock()  # created lazily, one lock per cache directory name
+
+    async with _venv_locks[venv_key]:
+        return await asyncio.to_thread(ensure_venv, evaluator_path)  # blocking setup runs off the event loop

From 72eb775d16924c51b864e4d980997383433cd6d4 Mon Sep 17 00:00:00 2001
From: krisztianfekete <git@krisztianfekete.org>
Date: Thu, 26 Mar 2026 16:36:38 +0100
Subject: [PATCH 2/4] address review comments

---
 examples/custom_evaluators/eval_config.yaml | 11 -----------
 src/agentevals/evaluator/sources.py         |  4 ++--
 src/agentevals/evaluator/venv.py            |  1 +
 3 files changed, 3 insertions(+), 13 deletions(-)

diff --git a/examples/custom_evaluators/eval_config.yaml b/examples/custom_evaluators/eval_config.yaml
index a7c1f92..d3bd261 100644
--- a/examples/custom_evaluators/eval_config.yaml
+++ b/examples/custom_evaluators/eval_config.yaml
@@ -32,14 +32,3 @@ evaluators:
     ref: evaluators/random_evaluator/random_evaluator.py
     threshold: 0.110
     executor: local
-
-  # TODO switch to GitHub once PR is approved.
-  - name: bertscore
-    type: code
-    path: ../evaluators/evaluators/bertscore/bertscore.py
-    threshold: 0.7
-    timeout: 300
-    config:
-      expected: "There are two Helm releases installed in the cluster: kagent in namespace kagent (revision 2, deployed, chart kagent-0.7.14) and kagent-crds in namespace kagent (revision 1, deployed, chart kagent-crds-0.7.14)."
-      metric: "f1"
-
diff --git a/src/agentevals/evaluator/sources.py b/src/agentevals/evaluator/sources.py
index 5122c29..8fe022c 100644
--- a/src/agentevals/evaluator/sources.py
+++ b/src/agentevals/evaluator/sources.py
@@ -8,7 +8,7 @@
 import os
 import time
 from dataclasses import asdict, dataclass, field
-from pathlib import Path
+from pathlib import Path, PurePosixPath
 
 import yaml
 
@@ -220,7 +220,7 @@ async def fetch_evaluator(self, ref: str, dest: Path) -> Path:
             dest.write_text(resp.text, encoding="utf-8")  # noqa: ASYNC240
 
             # Also try to fetch requirements.txt from the same directory.
-            ref_dir = str(Path(ref).parent)
+            ref_dir = str(PurePosixPath(ref).parent)
             req_ref = f"{ref_dir}/requirements.txt"
             req_url = self._raw_url(req_ref)
             try:
diff --git a/src/agentevals/evaluator/venv.py b/src/agentevals/evaluator/venv.py
index 43aca99..82d7401 100644
--- a/src/agentevals/evaluator/venv.py
+++ b/src/agentevals/evaluator/venv.py
@@ -96,6 +96,7 @@ def ensure_venv(evaluator_path: Path) -> Path | None:
     )
 
     try:
+        venv_dir.parent.mkdir(parents=True, exist_ok=True)
         _create_venv(venv_dir, uv)
         _install_deps(venv_dir, requirements, uv)
     except subprocess.CalledProcessError as exc:

From ac54608345c1a8bbae2c500e295135e528525417 Mon Sep 17 00:00:00 2001
From: krisztianfekete <git@krisztianfekete.org>
Date: Fri, 27 Mar 2026 10:39:01 +0100
Subject: [PATCH 3/4] address review feedback

---
 src/agentevals/custom_evaluators.py | 44 +++++++++++++----------------
 src/agentevals/evaluator/venv.py    |  2 +-
 2 files changed, 21 insertions(+), 25 deletions(-)

diff --git a/src/agentevals/custom_evaluators.py b/src/agentevals/custom_evaluators.py
index 2a9dff2..43f91c7 100644
--- a/src/agentevals/custom_evaluators.py
+++ b/src/agentevals/custom_evaluators.py
@@ -68,12 +68,8 @@ def extensions(self) -> tuple[str, ...]:
         """File extensions this runtime handles (e.g. ``(".py",)``)."""
 
     @abc.abstractmethod
-    def build_command(self, path: Path, python: Path | None = None) -> list[str]:
-        """Return the argv list to execute *path*.
-
-        For Python runtimes, *python* may point to a venv interpreter.
-        Non-Python runtimes ignore this parameter.
-        """
+    def build_command(self, path: Path) -> list[str]:
+        """Return the argv list to execute *path*."""
 
     def is_available(self) -> bool:
         """Return True if the runtime's interpreter is found on the system."""
@@ -85,6 +81,9 @@ def is_available(self) -> bool:
 
 
 class PythonRuntime(Runtime):
+    def __init__(self, python_path: Path | None = None):
+        self._python_path = python_path
+
     @property
     def name(self) -> str:
         return "Python"
@@ -93,8 +92,8 @@ def name(self) -> str:
     def extensions(self) -> tuple[str, ...]:
         return (".py",)
 
-    def build_command(self, path: Path, python: Path | None = None) -> list[str]:
-        exe = str(python) if python else sys.executable
+    def build_command(self, path: Path) -> list[str]:
+        exe = str(self._python_path) if self._python_path else sys.executable
         return [exe, str(path)]
 
     def is_available(self) -> bool:
@@ -110,7 +109,7 @@ def name(self) -> str:
     def extensions(self) -> tuple[str, ...]:
         return (".js", ".ts")
 
-    def build_command(self, path: Path, python: Path | None = None) -> list[str]:
+    def build_command(self, path: Path) -> list[str]:
         node = shutil.which("node")
         if not node:
             raise RuntimeError("Node.js not found on PATH (required for .js/.ts evaluators)")
@@ -208,22 +207,20 @@ class SubprocessBackend(EvaluatorBackend):
     """Runs a local code file (.py, .js, .ts, …) as a subprocess.
 
     The correct interpreter is resolved from the file extension via the
-    :data:`_RUNTIMES` registry.  When *venv_python* is provided, Python
-    evaluators run inside that virtual environment instead of the host
-    interpreter.
+    :data:`_RUNTIMES` registry.  Pass a pre-configured *runtime* to override
+    the default (e.g. a :class:`PythonRuntime` with a venv interpreter).
     """
 
-    def __init__(self, path: Path, timeout: int = 30, venv_python: Path | None = None):
+    def __init__(self, path: Path, timeout: int = 30, runtime: Runtime | None = None):
         self._path = path.resolve()
-        self._runtime = _resolve_runtime(self._path)
+        self._runtime = runtime or _resolve_runtime(self._path)
         self._timeout = timeout
-        self._venv_python = venv_python
 
         if not self._path.exists():
             raise FileNotFoundError(f"Evaluator file not found: {self._path}")
 
     async def run(self, eval_input: EvalInput, metric_name: str) -> EvalResult:
-        cmd = self._runtime.build_command(self._path, self._venv_python)
+        cmd = self._runtime.build_command(self._path)
         return await _run_subprocess(cmd, eval_input.model_dump_json(), self._timeout, metric_name)
 
 
@@ -232,18 +229,18 @@ async def run(self, eval_input: EvalInput, metric_name: str) -> EvalResult:
 # ---------------------------------------------------------------------------
 
 _EXECUTOR_FACTORIES: dict[str, Callable[..., EvaluatorBackend]] = {
-    "local": lambda path, timeout, venv_python=None: SubprocessBackend(path, timeout, venv_python),
+    "local": lambda path, timeout, runtime=None: SubprocessBackend(path, timeout, runtime),
 }
 
 
 def create_executor(
-    executor_name: str, path: Path, timeout: int = 30, venv_python: Path | None = None
+    executor_name: str, path: Path, timeout: int = 30, runtime: Runtime | None = None
 ) -> EvaluatorBackend:
     """Construct an EvaluatorBackend by executor name (e.g. 'local', 'docker')."""
     factory = _EXECUTOR_FACTORIES.get(executor_name)
     if factory is None:
         raise ValueError(f"Unknown executor '{executor_name}'. Available: {sorted(_EXECUTOR_FACTORIES.keys())}")
-    return factory(path, timeout, venv_python)
+    return factory(path, timeout, runtime)
 
 
 def register_executor(name: str, factory: Callable[..., EvaluatorBackend]) -> None:
@@ -437,8 +434,7 @@ async def evaluate_custom_evaluator(
     if isinstance(evaluator_def, CodeEvaluatorDef):
         evaluator_path = Path(evaluator_def.path)
 
-        # Set up a venv if the evaluator ships a requirements.txt.
-        venv_python: Path | None = None
+        runtime: Runtime | None = None
         if evaluator_path.suffix == ".py":
             from .evaluator.venv import ensure_venv_async
 
@@ -450,10 +446,10 @@ async def evaluate_custom_evaluator(
                     metric_name=evaluator_def.name,
                     error=f"Dependency installation failed: {exc}",
                 )
+            if venv_python:
+                runtime = PythonRuntime(python_path=venv_python)
 
-        backend = create_executor(
-            evaluator_def.executor, evaluator_path, evaluator_def.timeout, venv_python=venv_python
-        )
+        backend = create_executor(evaluator_def.executor, evaluator_path, evaluator_def.timeout, runtime=runtime)
     else:
         raise ValueError(f"Unsupported custom evaluator type: {type(evaluator_def).__name__}")
 
diff --git a/src/agentevals/evaluator/venv.py b/src/agentevals/evaluator/venv.py
index 82d7401..931489a 100644
--- a/src/agentevals/evaluator/venv.py
+++ b/src/agentevals/evaluator/venv.py
@@ -64,7 +64,7 @@ def _install_deps(venv_dir: Path, requirements: Path, uv: str | None) -> None:
 
     subprocess.run(base + [sdk_spec], check=True, capture_output=True)
     logger.info("Installing dependencies from %s ...", requirements.name)
-    subprocess.run(base + ["-r", str(requirements)], check=True)
+    subprocess.run(base + ["-r", str(requirements)], check=True, capture_output=True)
 
 
 # ---------------------------------------------------------------------------

From 74ce5dfbb7d3552e620ff4b1e907b82a87b5e6da Mon Sep 17 00:00:00 2001
From: krisztianfekete <git@krisztianfekete.org>
Date: Fri, 27 Mar 2026 11:28:56 +0100
Subject: [PATCH 4/4] address follow-up feedback, apply the same approach to
 NodeRuntime

---
 src/agentevals/custom_evaluators.py | 29 +++++++++++++++++------------
 src/agentevals/evaluator/venv.py    |  3 ++-
 2 files changed, 19 insertions(+), 13 deletions(-)

diff --git a/src/agentevals/custom_evaluators.py b/src/agentevals/custom_evaluators.py
index 43f91c7..785af73 100644
--- a/src/agentevals/custom_evaluators.py
+++ b/src/agentevals/custom_evaluators.py
@@ -82,7 +82,7 @@ def is_available(self) -> bool:
 
 class PythonRuntime(Runtime):
     def __init__(self, python_path: Path | None = None):
-        self._python_path = python_path
+        self._exe = str(python_path) if python_path else sys.executable
 
     @property
     def name(self) -> str:
@@ -93,14 +93,16 @@ def extensions(self) -> tuple[str, ...]:
         return (".py",)
 
     def build_command(self, path: Path) -> list[str]:
-        exe = str(self._python_path) if self._python_path else sys.executable
-        return [exe, str(path)]
+        return [self._exe, str(path)]
 
     def is_available(self) -> bool:
         return True
 
 
 class NodeRuntime(Runtime):
+    def __init__(self) -> None:
+        self._exe = shutil.which("node")
+
     @property
     def name(self) -> str:
         return "Node.js"
@@ -110,10 +112,12 @@ def extensions(self) -> tuple[str, ...]:
         return (".js", ".ts")
 
     def build_command(self, path: Path) -> list[str]:
-        node = shutil.which("node")
-        if not node:
+        if not self._exe:
             raise RuntimeError("Node.js not found on PATH (required for .js/.ts evaluators)")
-        return [node, str(path)]
+        return [self._exe, str(path)]
+
+    def is_available(self) -> bool:
+        return self._exe is not None
 
 
 _RUNTIMES: list[Runtime] = [
@@ -229,18 +233,16 @@ async def run(self, eval_input: EvalInput, metric_name: str) -> EvalResult:
 # ---------------------------------------------------------------------------
 
 _EXECUTOR_FACTORIES: dict[str, Callable[..., EvaluatorBackend]] = {
-    "local": lambda path, timeout, runtime=None: SubprocessBackend(path, timeout, runtime),
+    "local": lambda path, timeout: SubprocessBackend(path, timeout),
 }
 
 
-def create_executor(
-    executor_name: str, path: Path, timeout: int = 30, runtime: Runtime | None = None
-) -> EvaluatorBackend:
+def create_executor(executor_name: str, path: Path, timeout: int = 30) -> EvaluatorBackend:
     """Construct an EvaluatorBackend by executor name (e.g. 'local', 'docker')."""
     factory = _EXECUTOR_FACTORIES.get(executor_name)
     if factory is None:
         raise ValueError(f"Unknown executor '{executor_name}'. Available: {sorted(_EXECUTOR_FACTORIES.keys())}")
-    return factory(path, timeout, runtime)
+    return factory(path, timeout)
 
 
 def register_executor(name: str, factory: Callable[..., EvaluatorBackend]) -> None:
@@ -449,7 +451,10 @@ async def evaluate_custom_evaluator(
             if venv_python:
                 runtime = PythonRuntime(python_path=venv_python)
 
-        backend = create_executor(evaluator_def.executor, evaluator_path, evaluator_def.timeout, runtime=runtime)
+        if runtime is not None and evaluator_def.executor == "local":  # venv interpreter is local-only
+            backend = SubprocessBackend(evaluator_path, evaluator_def.timeout, runtime=runtime)
+        else:  # other executor names are resolved (or rejected) by create_executor, never silently run locally
+            backend = create_executor(evaluator_def.executor, evaluator_path, evaluator_def.timeout)
     else:
         raise ValueError(f"Unsupported custom evaluator type: {type(evaluator_def).__name__}")
 
diff --git a/src/agentevals/evaluator/venv.py b/src/agentevals/evaluator/venv.py
index 931489a..fcbfc88 100644
--- a/src/agentevals/evaluator/venv.py
+++ b/src/agentevals/evaluator/venv.py
@@ -11,6 +11,7 @@
 import asyncio
 import hashlib
 import logging
+import os
 import shutil
 import subprocess
 import sys
@@ -18,7 +19,7 @@
 
 logger = logging.getLogger(__name__)
 
-_VENV_CACHE_DIR = Path.home() / ".cache" / "agentevals" / "venvs"
+_VENV_CACHE_DIR = Path(os.environ.get("XDG_CACHE_HOME") or Path.home() / ".cache") / "agentevals" / "venvs"
 _HASH_FILE = ".requirements_hash"
 
 # Per-evaluator locks to prevent concurrent venv creation for the same evaluator.