From 1157cee420e4611f47673d18287700ea8abf5e05 Mon Sep 17 00:00:00 2001
From: Farnaz Kohankhaki <fkohankh8@gmail.com>
Date: Tue, 6 Jan 2026 19:03:04 -0800
Subject: [PATCH 01/10] Move LBO code to legacy directory

---
 legacy/README.md                                         | 9 +++++++++
 .../example_cfg/plot_lbo_results_cfg.yaml                | 0
 .../example_scripts}/plot_lbo_results.py                 | 0
 {src => legacy/src}/lbo.py                               | 0
 {src => legacy/src}/run_lbo.py                           | 0
 {src => legacy/src}/utils/capability_discovery_utils.py  | 0
 {src => legacy/src}/utils/lbo_utils.py                   | 0
 {tests/src => legacy/tests}/test_lbo.py                  | 0
 {tests/src => legacy/tests}/test_lbo_utils.py            | 0
 9 files changed, 9 insertions(+)
 create mode 100644 legacy/README.md
 rename {example_scripts => legacy/example_scripts}/example_cfg/plot_lbo_results_cfg.yaml (100%)
 rename {example_scripts => legacy/example_scripts}/plot_lbo_results.py (100%)
 rename {src => legacy/src}/lbo.py (100%)
 rename {src => legacy/src}/run_lbo.py (100%)
 rename {src => legacy/src}/utils/capability_discovery_utils.py (100%)
 rename {src => legacy/src}/utils/lbo_utils.py (100%)
 rename {tests/src => legacy/tests}/test_lbo.py (100%)
 rename {tests/src => legacy/tests}/test_lbo_utils.py (100%)

diff --git a/legacy/README.md b/legacy/README.md
new file mode 100644
index 00000000..5439c779
--- /dev/null
+++ b/legacy/README.md
@@ -0,0 +1,9 @@
+# Legacy LBO Code
+
+This directory contains legacy code for **Latent Bayesian Optimization (LBO)** from an earlier version of the repository. LBO was used for intelligent capability selection during evaluation.
+
+## Compatible Version
+
+This LBO code is compatible with the repository at commit [`a224c5ec`](https://github.com/VectorInstitute/automated_capability_evaluation/tree/a224c5ec7dd208e04ef2edc059e6e7a2d0d4bcf6). That commit contains the full working version of the codebase used for the **initial paper submission**.
+
+**This code does not work with the current codebase.** It predates the standardization of the generation and evaluation pipelines. If you need to understand how LBO integrated with the rest of the system, refer to the commit linked above.
diff --git a/example_scripts/example_cfg/plot_lbo_results_cfg.yaml b/legacy/example_scripts/example_cfg/plot_lbo_results_cfg.yaml
similarity index 100%
rename from example_scripts/example_cfg/plot_lbo_results_cfg.yaml
rename to legacy/example_scripts/example_cfg/plot_lbo_results_cfg.yaml
diff --git a/example_scripts/plot_lbo_results.py b/legacy/example_scripts/plot_lbo_results.py
similarity index 100%
rename from example_scripts/plot_lbo_results.py
rename to legacy/example_scripts/plot_lbo_results.py
diff --git a/src/lbo.py b/legacy/src/lbo.py
similarity index 100%
rename from src/lbo.py
rename to legacy/src/lbo.py
diff --git a/src/run_lbo.py b/legacy/src/run_lbo.py
similarity index 100%
rename from src/run_lbo.py
rename to legacy/src/run_lbo.py
diff --git a/src/utils/capability_discovery_utils.py b/legacy/src/utils/capability_discovery_utils.py
similarity index 100%
rename from src/utils/capability_discovery_utils.py
rename to legacy/src/utils/capability_discovery_utils.py
diff --git a/src/utils/lbo_utils.py b/legacy/src/utils/lbo_utils.py
similarity index 100%
rename from src/utils/lbo_utils.py
rename to legacy/src/utils/lbo_utils.py
diff --git a/tests/src/test_lbo.py b/legacy/tests/test_lbo.py
similarity index 100%
rename from tests/src/test_lbo.py
rename to legacy/tests/test_lbo.py
diff --git a/tests/src/test_lbo_utils.py b/legacy/tests/test_lbo_utils.py
similarity index 100%
rename from tests/src/test_lbo_utils.py
rename to legacy/tests/test_lbo_utils.py

From 96c148e77f765fe4658553a26ec139bf5f73851c Mon Sep 17 00:00:00 2001
From: Farnaz Kohankhaki <fkohankh8@gmail.com>
Date: Wed, 7 Jan 2026 00:59:11 -0800
Subject: [PATCH 02/10] updated inspect version. updated generation schema to
 simplify read and write objects. added eval pipeline schemas and
 implementation.

---
 README.md                                     |  61 +++-
 poetry.lock                                   | 309 ++++++++++++++---
 pyproject.toml                                |   2 +-
 src/cfg/run_cfg.yaml                          | 160 ++++-----
 src/eval_stages/__init__.py                   |  18 +
 src/eval_stages/prompts.py                    |   9 +
 src/eval_stages/stage0_setup_and_dataset.py   | 260 +++++++++++++++
 src/eval_stages/stage1_eval_execution.py      | 263 +++++++++++++++
 src/eval_stages/stage2_score_aggregation.py   | 231 +++++++++++++
 src/run_eval_pipeline.py                      | 149 +++++++++
 src/schemas/EVALUATION_PIPELINE_SCHEMAS.md    | 315 ++++++++++++++++++
 ...EMAS.md => GENERATION_PIPELINE_SCHEMAS.md} |   6 +-
 src/schemas/README.md                         |  30 +-
 src/schemas/__init__.py                       |  36 +-
 src/schemas/area_schemas.py                   |  10 +-
 src/schemas/capability_schemas.py             |  20 +-
 src/schemas/eval_io_utils.py                  | 142 ++++++++
 src/schemas/eval_schemas.py                   | 134 ++++++++
 src/schemas/solution_schemas.py               |  26 +-
 src/schemas/task_schemas.py                   |  30 +-
 src/schemas/validation_schemas.py             |  49 +--
 21 files changed, 1986 insertions(+), 274 deletions(-)
 create mode 100644 src/eval_stages/__init__.py
 create mode 100644 src/eval_stages/prompts.py
 create mode 100644 src/eval_stages/stage0_setup_and_dataset.py
 create mode 100644 src/eval_stages/stage1_eval_execution.py
 create mode 100644 src/eval_stages/stage2_score_aggregation.py
 create mode 100644 src/run_eval_pipeline.py
 create mode 100644 src/schemas/EVALUATION_PIPELINE_SCHEMAS.md
 rename src/schemas/{PIPELINE_SCHEMAS.md => GENERATION_PIPELINE_SCHEMAS.md} (98%)
 create mode 100644 src/schemas/eval_io_utils.py
 create mode 100644 src/schemas/eval_schemas.py

diff --git a/README.md b/README.md
index 4ffea792..fb0dfa1c 100644
--- a/README.md
+++ b/README.md
@@ -50,9 +50,9 @@ gcloud auth application-default login
 
 2. Modify `src/cfg/run_cfg.yaml`, if required.
 
-### Base Pipeline
+### Base Generation Pipeline
 
-The base (non-agentic) pipeline consists of multiple stages that can be run sequentially or individually:
+The base (non-agentic) generation pipeline consists of multiple stages that can be run sequentially or individually:
 
 - **Stage 0**: Experiment and domain setup
 - **Stage 1**: Area generation
@@ -105,21 +105,56 @@ python -m src.run_base_pipeline stage=4 tasks_tag=_YYYYMMDD_HHMMSS solution_tag=
 python -m src.run_base_pipeline stage=5 solution_tag=_YYYYMMDD_HHMMSS validation_tag=_YYYYMMDD_HHMMSS
 ```
 
-### Evaluation of subject LLM on generated capabilities
+### Evaluation Pipeline
 
-Evaluates the subject LLM on the generated capabilities and calculates a score for each.
+The evaluation pipeline evaluates subject LLMs on the generated and validated tasks using [Inspect AI](https://inspect.aisi.org.uk/).
+
+- **Stage 0**: Setup and dataset preparation (validate inputs, create datasets - no LLM calls)
+- **Stage 1**: Evaluation execution (run Inspect AI evaluations - creates `eval_tag`)
+- **Stage 2**: Score aggregation (compute capability scores - no LLM calls)
+
+#### Run All Evaluation Stages
 
 ```bash
-python -m src.run_evaluation
+# Requires validation_tag from Generation Stage 5
+python -m src.run_eval_pipeline validation_tag=_YYYYMMDD_HHMMSS
 ```
 
-### Capability selection/generation using active learning
-
-Utilize the capability and the corresponding subject LLM score to select or generate a new capability.
+#### Run Individual Evaluation Stages
 
 ```bash
-python -m src.run_lbo
+# Stage 0: Setup and dataset preparation
+python -m src.run_eval_pipeline stage=0 validation_tag=_YYYYMMDD_HHMMSS
+
+# Stage 1: Evaluation execution (also runs Stage 0 first)
+python -m src.run_eval_pipeline stage=1 validation_tag=_YYYYMMDD_HHMMSS
+
+# Stage 2: Score aggregation (requires eval_tag from Stage 1)
+python -m src.run_eval_pipeline stage=2 eval_tag=_YYYYMMDD_HHMMSS
 ```
+
+#### Configure Subject LLMs
+
+Edit `src/cfg/run_cfg.yaml` to specify which LLMs to evaluate:
+
+```yaml
+eval_cfg:
+  subject_llms:
+    - name: gpt-4o
+      provider: openai
+    - name: claude-3-sonnet
+      provider: anthropic
+  judge_llm:
+    name: gpt-4o-mini
+    provider: openai
+```
+
+See `src/schemas/EVALUATION_PIPELINE_SCHEMAS.md` for detailed evaluation pipeline documentation.
+
+### Legacy: LBO (Latent Bayesian Optimization)
+
+The previous version of the repository included LBO for intelligent capability selection during evaluation. This functionality has been moved to the `legacy/` directory for reference. See `legacy/README.md` for details.
+
 ### Agentic Generation Scripts
 
 These scripts implement the multi-agent debate workflow for automated generation of areas, capabilities, tasks, and solutions.
@@ -276,9 +311,11 @@ python static_vs_generated.py
 When implementing new features or modifying existing pipeline stages:
 
 1. **Follow Schema Guidelines**: All data objects must use the schema classes defined in `src/schemas/`:
-   - Use `Domain`, `Area`, `Capability`, `Task`, `TaskSolution`, `ValidationResult` objects
-   - Load/save using schema IO functions from `src/schemas/io_utils.py` (e.g., `load_solution()`, `save_validation()`)
-   - See `src/schemas/PIPELINE_SCHEMAS.md` for detailed schema documentation
+   - **Generation Pipeline**: Use `Domain`, `Area`, `Capability`, `Task`, `TaskSolution`, `ValidationResult` objects
+   - **Evaluation Pipeline**: Use `EvalConfig`, `EvalDataset`, `CapabilityScore` objects
+   - Load/save using schema IO functions from `src/schemas/io_utils.py` and `src/schemas/eval_io_utils.py`
+   - See `src/schemas/GENERATION_PIPELINE_SCHEMAS.md` for generation pipeline documentation
+   - See `src/schemas/EVALUATION_PIPELINE_SCHEMAS.md` for evaluation pipeline documentation
 
 2. **Use Model Call Utilities**: All LLM interactions must use the standardized model client utilities:
    - Import from `src.utils.model_client_utils`
diff --git a/poetry.lock b/poetry.lock
index dceb53a5..b6b06d04 100644
--- a/poetry.lock
+++ b/poetry.lock
@@ -1,4 +1,4 @@
-# This file is automatically @generated by Poetry 2.1.2 and should not be changed by hand.
+# This file is automatically @generated by Poetry 2.2.1 and should not be changed by hand.
 
 [[package]]
 name = "ag2"
@@ -84,6 +84,26 @@ websockets = ["websockets (>=14.0,<16)"]
 websurfer = ["beautifulsoup4", "markdownify", "pathvalidate", "pdfminer-six"]
 wikipedia = ["wikipedia-api (>=0.8.1,<1.0)"]
 
+[[package]]
+name = "aioboto3"
+version = "15.2.0"
+description = "Async boto3 wrapper"
+optional = false
+python-versions = ">=3.9"
+groups = ["main"]
+files = [
+    {file = "aioboto3-15.2.0-py3-none-any.whl", hash = "sha256:3582f033543ee7671ae27b1df538f2095bcc91be4a3a78e7498b5ce6c654f26b"},
+    {file = "aioboto3-15.2.0.tar.gz", hash = "sha256:6a151ee0aa0f4b9af6031e6446f28460991fcc50a4ac54a1650d145319d5e2e5"},
+]
+
+[package.dependencies]
+aiobotocore = {version = "2.24.2", extras = ["boto3"]}
+aiofiles = ">=23.2.1"
+
+[package.extras]
+chalice = ["chalice (>=1.24.0)"]
+s3cse = ["cryptography (>=44.0.1)"]
+
 [[package]]
 name = "aiobotocore"
 version = "2.24.2"
@@ -99,6 +119,7 @@ files = [
 [package.dependencies]
 aiohttp = ">=3.9.2,<4.0.0"
 aioitertools = ">=0.5.1,<1.0.0"
+boto3 = {version = ">=1.40.15,<1.40.19", optional = true, markers = "extra == \"boto3\""}
 botocore = ">=1.40.15,<1.40.19"
 jmespath = ">=0.7.1,<2.0.0"
 multidict = ">=6.0.0,<7.0.0"
@@ -1758,6 +1779,107 @@ type1 = ["xattr ; sys_platform == \"darwin\""]
 unicode = ["unicodedata2 (>=15.1.0) ; python_version <= \"3.12\""]
 woff = ["brotli (>=1.0.1) ; platform_python_implementation == \"CPython\"", "brotlicffi (>=0.8.0) ; platform_python_implementation != \"CPython\"", "zopfli (>=0.1.4)"]
 
+[[package]]
+name = "frozendict"
+version = "2.4.7"
+description = "A simple immutable dictionary"
+optional = false
+python-versions = ">=3.6"
+groups = ["main"]
+files = [
+    {file = "frozendict-2.4.7-cp310-cp310-macosx_10_9_universal2.whl", hash = "sha256:bd37c087a538944652363cfd77fb7abe8100cc1f48afea0b88b38bf0f469c3d2"},
+    {file = "frozendict-2.4.7-cp310-cp310-macosx_10_9_x86_64.whl", hash = "sha256:2b96f224a5431889f04b2bc99c0e9abe285679464273ead83d7d7f2a15907d35"},
+    {file = "frozendict-2.4.7-cp310-cp310-macosx_11_0_arm64.whl", hash = "sha256:5c1781f28c4bbb177644b3cb6d5cf7da59be374b02d91cdde68d1d5ef32e046b"},
+    {file = "frozendict-2.4.7-cp310-cp310-manylinux1_i686.manylinux_2_28_i686.manylinux_2_5_i686.whl", hash = "sha256:8a06f6c3d3b8d487226fdde93f621e04a54faecc5bf5d9b16497b8f9ead0ac3e"},
+    {file = "frozendict-2.4.7-cp310-cp310-manylinux2014_aarch64.manylinux_2_17_aarch64.manylinux_2_28_aarch64.whl", hash = "sha256:b809d1c861436a75b2b015dbfd94f6154fa4e7cb0a70e389df1d5f6246b21d1e"},
+    {file = "frozendict-2.4.7-cp310-cp310-manylinux2014_armv7l.manylinux_2_17_armv7l.manylinux_2_31_armv7l.whl", hash = "sha256:75eefdf257a84ea73d553eb80d0abbff0af4c9df62529e4600fd3f96ff17eeb3"},
+    {file = "frozendict-2.4.7-cp310-cp310-manylinux2014_ppc64le.manylinux_2_17_ppc64le.manylinux_2_28_ppc64le.whl", hash = "sha256:a4d2b27d8156922c9739dd2ff4f3934716e17cfd1cf6fb61aa17af7d378555e9"},
+    {file = "frozendict-2.4.7-cp310-cp310-manylinux2014_s390x.manylinux_2_17_s390x.manylinux_2_28_s390x.whl", hash = "sha256:2ebd953c41408acfb8041ff9e6c3519c09988fb7e007df7ab6b56e229029d788"},
+    {file = "frozendict-2.4.7-cp310-cp310-manylinux2014_x86_64.manylinux_2_17_x86_64.manylinux_2_28_x86_64.whl", hash = "sha256:4c64d34b802912ee6d107936e970b90750385a1fdfd38d310098b2918ba4cbf2"},
+    {file = "frozendict-2.4.7-cp310-cp310-manylinux_2_31_riscv64.manylinux_2_39_riscv64.whl", hash = "sha256:294a7d7d51dd979021a8691b46aedf9bd4a594ce3ed33a4bdf0a712d6929d712"},
+    {file = "frozendict-2.4.7-cp310-cp310-musllinux_1_2_aarch64.whl", hash = "sha256:f65d1b90e9ddc791ea82ef91a9ae0ab27ef6c0cfa88fadfa0e5ca5a22f8fa22f"},
+    {file = "frozendict-2.4.7-cp310-cp310-musllinux_1_2_armv7l.whl", hash = "sha256:82d5272d08451bcef6fb6235a0a04cf1816b6b6815cec76be5ace1de17e0c1a4"},
+    {file = "frozendict-2.4.7-cp310-cp310-musllinux_1_2_i686.whl", hash = "sha256:5943c3f683d3f32036f6ca975e920e383d85add1857eee547742de9c1f283716"},
+    {file = "frozendict-2.4.7-cp310-cp310-musllinux_1_2_ppc64le.whl", hash = "sha256:88c6bea948da03087035bb9ca9625305d70e084aa33f11e17048cb7dda4ca293"},
+    {file = "frozendict-2.4.7-cp310-cp310-musllinux_1_2_riscv64.whl", hash = "sha256:ffd1a9f9babec9119712e76a39397d8aa0d72ef8c4ccad917c6175d7e7f81b74"},
+    {file = "frozendict-2.4.7-cp310-cp310-musllinux_1_2_s390x.whl", hash = "sha256:0ff6f57854cc8aa8b30947ec005f9246d96e795a78b21441614e85d39b708822"},
+    {file = "frozendict-2.4.7-cp310-cp310-musllinux_1_2_x86_64.whl", hash = "sha256:d774df483c12d6cba896eb9a1337bbc5ad3f564eb18cfaaee3e95fb4402f2a86"},
+    {file = "frozendict-2.4.7-cp310-cp310-win32.whl", hash = "sha256:a10d38fa300f6bef230fae1fdb4bc98706b78c8a3a2f3140fde748469ef3cfe8"},
+    {file = "frozendict-2.4.7-cp310-cp310-win_amd64.whl", hash = "sha256:dd518f300e5eb6a8827bee380f2e1a31c01dc0af069b13abdecd4e5769bd8a97"},
+    {file = "frozendict-2.4.7-cp310-cp310-win_arm64.whl", hash = "sha256:3842cfc2d69df5b9978f2e881b7678a282dbdd6846b11b5159f910bc633cbe4f"},
+    {file = "frozendict-2.4.7-cp36-cp36m-macosx_10_9_x86_64.whl", hash = "sha256:735be62d757e1e7e496ccb6401efe82b473faa653e95eec0826cd7819a29a34c"},
+    {file = "frozendict-2.4.7-cp36-cp36m-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:fff8584e3bbdc5c1713cd016fbf4b88babfffd4e5e89b39020f2a208dd24c900"},
+    {file = "frozendict-2.4.7-cp36-cp36m-manylinux_2_17_armv7l.manylinux2014_armv7l.manylinux_2_31_armv7l.whl", hash = "sha256:91a06ee46b3e3ef3b237046b914c0c905eab9fdfeac677e9b51473b482e24c28"},
+    {file = "frozendict-2.4.7-cp36-cp36m-manylinux_2_17_ppc64le.manylinux2014_ppc64le.whl", hash = "sha256:fd7ba56cf6340c732ecb78787c4e9600c4bd01372af7313ded21037126d33ec6"},
+    {file = "frozendict-2.4.7-cp36-cp36m-manylinux_2_17_s390x.manylinux2014_s390x.whl", hash = "sha256:d1b4426457757c30ad86b57cdbcc0adaa328399f1ec3d231a0a2ce7447248987"},
+    {file = "frozendict-2.4.7-cp36-cp36m-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:b22d337c76b765cb7961d4ee47fe29f89e30921eb47bf856b14dc7641f4df3e5"},
+    {file = "frozendict-2.4.7-cp36-cp36m-manylinux_2_5_i686.manylinux1_i686.manylinux_2_17_i686.manylinux2014_i686.whl", hash = "sha256:57134ef5df1dd32229c148c75a7b89245dbdb89966a155d6dfd4bda653e8c7af"},
+    {file = "frozendict-2.4.7-cp36-cp36m-musllinux_1_2_aarch64.whl", hash = "sha256:c89617a784e1c24a31f5aa4809402f8072a26b64ddbc437897f6391ff69b0ee9"},
+    {file = "frozendict-2.4.7-cp36-cp36m-musllinux_1_2_armv7l.whl", hash = "sha256:176dd384dfe1d0d79449e05f67764c57c6f0f3095378bf00deb33165d5d2df5b"},
+    {file = "frozendict-2.4.7-cp36-cp36m-musllinux_1_2_i686.whl", hash = "sha256:b1a94e8935c69ae30043b465af496f447950f2c03660aee8657074084faae0b3"},
+    {file = "frozendict-2.4.7-cp36-cp36m-musllinux_1_2_ppc64le.whl", hash = "sha256:c570649ceccfa5e11ad9351e9009dc484c315a51a56aa02ced07ae97644bb7aa"},
+    {file = "frozendict-2.4.7-cp36-cp36m-musllinux_1_2_s390x.whl", hash = "sha256:e0d450c9d444befe2668bf9386ac2945a2f38152248d58f6b3feea63db59ba08"},
+    {file = "frozendict-2.4.7-cp36-cp36m-musllinux_1_2_x86_64.whl", hash = "sha256:7469912c1a04102457871ff675aebe600dbb7e79a6450a166cc8079b88f6ca79"},
+    {file = "frozendict-2.4.7-cp36-cp36m-win32.whl", hash = "sha256:2808bab8e21887a8c106cca5f6f0ab5bda7ee81e159409a10f53d57542ccd99c"},
+    {file = "frozendict-2.4.7-cp36-cp36m-win_amd64.whl", hash = "sha256:ca17ac727ffeeba6c46f5a88e0284a7cb1520fb03127645fcdd7041080adf849"},
+    {file = "frozendict-2.4.7-cp37-cp37m-macosx_10_9_x86_64.whl", hash = "sha256:8ef11dd996208c5a96eab0683f7a17cb4b992948464d2498520efd75a10a2aac"},
+    {file = "frozendict-2.4.7-cp37-cp37m-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:b960e700dc95faca7dd6919d0dce183ef89bfe01554d323cf5de7331a2e80f83"},
+    {file = "frozendict-2.4.7-cp37-cp37m-manylinux_2_17_armv7l.manylinux2014_armv7l.manylinux_2_31_armv7l.whl", hash = "sha256:fc43257a06e6117da6a8a0779243b974cdb9205fed82e32eb669f6746c75d27d"},
+    {file = "frozendict-2.4.7-cp37-cp37m-manylinux_2_17_ppc64le.manylinux2014_ppc64le.whl", hash = "sha256:0ece525da7d0aa3eb56c3e479f30612028d545081c15450d67d771a303ee7d4c"},
+    {file = "frozendict-2.4.7-cp37-cp37m-manylinux_2_17_s390x.manylinux2014_s390x.whl", hash = "sha256:7ddffe7c0b3be414f88185e212758989c65b497315781290eb029e2c1e1fd64e"},
+    {file = "frozendict-2.4.7-cp37-cp37m-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:05dd27415f913cd11649009f53d97eb565ce7b76787d7869c4733738c10e8d27"},
+    {file = "frozendict-2.4.7-cp37-cp37m-manylinux_2_5_i686.manylinux1_i686.manylinux_2_17_i686.manylinux2014_i686.whl", hash = "sha256:0664092614d2b9d0aa404731f33ad5459a54fe8dab9d1fd45aa714fa6de4d0ef"},
+    {file = "frozendict-2.4.7-cp37-cp37m-musllinux_1_2_aarch64.whl", hash = "sha256:830d181781bb263c9fa430b81f82c867546f5dcb368e73931c8591f533a04afb"},
+    {file = "frozendict-2.4.7-cp37-cp37m-musllinux_1_2_armv7l.whl", hash = "sha256:c93827e0854393cd904b927ceb529afc17776706f5b9e45c7eaf6a40b3fc7b25"},
+    {file = "frozendict-2.4.7-cp37-cp37m-musllinux_1_2_i686.whl", hash = "sha256:6d30dbba6eb1497c695f3108c2c292807e7a237c67a1b9ff92c04e89969d22d1"},
+    {file = "frozendict-2.4.7-cp37-cp37m-musllinux_1_2_ppc64le.whl", hash = "sha256:ec846bde66b75d68518c7b24a0a46d09db0aee5a6aefd2209d9901faf6e9df21"},
+    {file = "frozendict-2.4.7-cp37-cp37m-musllinux_1_2_s390x.whl", hash = "sha256:1df8e22f7d24172c08434b10911f3971434bb5a59b4d1b0078ae33a623625294"},
+    {file = "frozendict-2.4.7-cp37-cp37m-musllinux_1_2_x86_64.whl", hash = "sha256:39abe54264ae69a0b2e00fabdb5118604f36a5b927d33e7532cd594c5142ebf4"},
+    {file = "frozendict-2.4.7-cp37-cp37m-win32.whl", hash = "sha256:d10c2ea7c90ba204cd053167ba214d0cdd00f3184c7b8d117a56d7fd2b0c6553"},
+    {file = "frozendict-2.4.7-cp37-cp37m-win_amd64.whl", hash = "sha256:346a53640f15c1640a3503f60ba99df39e4ab174979f10db4304bbb378df5cbd"},
+    {file = "frozendict-2.4.7-cp38-cp38-macosx_10_9_universal2.whl", hash = "sha256:cc520f3f4af14f456143a534d554175dbc0f0636ffd653e63675cd591862a9d9"},
+    {file = "frozendict-2.4.7-cp38-cp38-macosx_10_9_x86_64.whl", hash = "sha256:7fd0d0bd3a79e009dddbf5fedfd927ad495c218cd7b13a112d28a37e2079725c"},
+    {file = "frozendict-2.4.7-cp38-cp38-macosx_11_0_arm64.whl", hash = "sha256:a404857e48d85a517bb5b974d740f8c4fccb25d8df98885f3a2a4d950870b845"},
+    {file = "frozendict-2.4.7-cp38-cp38-manylinux1_i686.manylinux_2_28_i686.manylinux_2_5_i686.whl", hash = "sha256:f42e2c25d3eee4ea3da88466f38ed0dce8c622a1a9d92572e5ee53b7a6bb9ef1"},
+    {file = "frozendict-2.4.7-cp38-cp38-manylinux2014_aarch64.manylinux_2_17_aarch64.manylinux_2_28_aarch64.whl", hash = "sha256:a1a083e9ee7a1904e545a6307c7db1dd76200077520fcbf7a98d886f81b57dd7"},
+    {file = "frozendict-2.4.7-cp38-cp38-manylinux2014_armv7l.manylinux_2_17_armv7l.manylinux_2_31_armv7l.whl", hash = "sha256:f556ea05d9c5f6dae50d57ce6234e4ab1fbf4551dd0d52b4fed6ef537d9f3d3c"},
+    {file = "frozendict-2.4.7-cp38-cp38-manylinux2014_ppc64le.manylinux_2_17_ppc64le.manylinux_2_28_ppc64le.whl", hash = "sha256:739ee81e574f33b46f1e6d9312f3ec2c549bdd574a4ebb6bf106775c9d85ca7b"},
+    {file = "frozendict-2.4.7-cp38-cp38-manylinux2014_s390x.manylinux_2_17_s390x.manylinux_2_28_s390x.whl", hash = "sha256:48ab42b01952bc11543577de9fe5d9ca7c41b35dda36326a07fb47d84b3d5f22"},
+    {file = "frozendict-2.4.7-cp38-cp38-manylinux2014_x86_64.manylinux_2_17_x86_64.manylinux_2_28_x86_64.whl", hash = "sha256:34233deb8d09e798e874a6ac00b054d2e842164d982ebd43eb91b9f0a6a34876"},
+    {file = "frozendict-2.4.7-cp38-cp38-manylinux_2_31_riscv64.manylinux_2_39_riscv64.whl", hash = "sha256:76bd99f3508cb2ec87976f2e3fe7d92fb373a661cacffb863013d15e4cfaf0eb"},
+    {file = "frozendict-2.4.7-cp38-cp38-musllinux_1_2_aarch64.whl", hash = "sha256:a265e95e7087f44b88a6d78a63ea95a2ca0eb0a21ab4f76047f4c164a8beb413"},
+    {file = "frozendict-2.4.7-cp38-cp38-musllinux_1_2_armv7l.whl", hash = "sha256:1662f1b72b4f4a2ffdfdc4981ece275ca11f90244208ac1f1fc2c17fc9c9437a"},
+    {file = "frozendict-2.4.7-cp38-cp38-musllinux_1_2_i686.whl", hash = "sha256:2e5d2c30f4a3fea83a14b0a5722f21c10de5c755ab5637c70de5eb60886d58cd"},
+    {file = "frozendict-2.4.7-cp38-cp38-musllinux_1_2_ppc64le.whl", hash = "sha256:2cf0a665bf2f1ce69d3cd8b6d3574b1d32ae00981a16fa1d255d2da8a2e44b7c"},
+    {file = "frozendict-2.4.7-cp38-cp38-musllinux_1_2_riscv64.whl", hash = "sha256:708382875c3cfe91be625dddcba03dee2dfdadbad2c431568a8c7f2f2af0bbee"},
+    {file = "frozendict-2.4.7-cp38-cp38-musllinux_1_2_s390x.whl", hash = "sha256:7fe194f37052a8f45a1a8507e36229e28b79f3d21542ae55ea6a18c6a444f625"},
+    {file = "frozendict-2.4.7-cp38-cp38-musllinux_1_2_x86_64.whl", hash = "sha256:d8930877a2dd40461968d9238d95c754e51b33ce7d2a45500f88ffeed5cb7202"},
+    {file = "frozendict-2.4.7-cp38-cp38-win32.whl", hash = "sha256:6991469a889ee8a108fe5ed1b044447c7b7a07da9067e93c59cbfac8c1d625cf"},
+    {file = "frozendict-2.4.7-cp38-cp38-win_amd64.whl", hash = "sha256:ebae8f4a07372acfc3963fc8d68070cdaab70272c3dd836f057ebbe9b7d38643"},
+    {file = "frozendict-2.4.7-cp39-cp39-macosx_10_9_universal2.whl", hash = "sha256:1c521ad3d747aa475e9040e231f5f1847c04423bae5571c010a9d969e6983c40"},
+    {file = "frozendict-2.4.7-cp39-cp39-macosx_10_9_x86_64.whl", hash = "sha256:70e655c3aa5f893807830f549a7275031a181dbebeaf74c461b51adc755d9335"},
+    {file = "frozendict-2.4.7-cp39-cp39-macosx_11_0_arm64.whl", hash = "sha256:11d35075f979c96f528d74ccbf89322a7ef8211977dd566bc384985ebce689be"},
+    {file = "frozendict-2.4.7-cp39-cp39-manylinux1_i686.manylinux_2_28_i686.manylinux_2_5_i686.whl", hash = "sha256:d4d7ec24d3bfcfac3baf4dffd7fcea3fa8474b087ce32696232132064aa062cf"},
+    {file = "frozendict-2.4.7-cp39-cp39-manylinux2014_aarch64.manylinux_2_17_aarch64.manylinux_2_28_aarch64.whl", hash = "sha256:5694417864875ca959932e3b98e2b7d5d27c75177bf510939d0da583712ddf58"},
+    {file = "frozendict-2.4.7-cp39-cp39-manylinux2014_armv7l.manylinux_2_17_armv7l.manylinux_2_31_armv7l.whl", hash = "sha256:57a754671c5746e11140363aa2f4e7a75c8607de6e85a2bf89dcd1daf51885a7"},
+    {file = "frozendict-2.4.7-cp39-cp39-manylinux2014_ppc64le.manylinux_2_17_ppc64le.manylinux_2_28_ppc64le.whl", hash = "sha256:313e0e1d8b22b317aa1f7dd48aec8cbb0416ddd625addf7648a69148fcb9ccff"},
+    {file = "frozendict-2.4.7-cp39-cp39-manylinux2014_s390x.manylinux_2_17_s390x.manylinux_2_28_s390x.whl", hash = "sha256:176a66094428b9fd66270927b9787e3b8b1c9505ef92723c7b0ef1923dbe3c4a"},
+    {file = "frozendict-2.4.7-cp39-cp39-manylinux2014_x86_64.manylinux_2_17_x86_64.manylinux_2_28_x86_64.whl", hash = "sha256:de1fff2683d8af01299ec01eb21a24b6097ce92015fc1fbefa977cecf076a3fc"},
+    {file = "frozendict-2.4.7-cp39-cp39-manylinux_2_31_riscv64.manylinux_2_39_riscv64.whl", hash = "sha256:115a822ecd754574e11205e0880e9d61258d960863d6fd1b90883aa800f6d3b3"},
+    {file = "frozendict-2.4.7-cp39-cp39-musllinux_1_2_aarch64.whl", hash = "sha256:de8d2c98777ba266f5466e211778d4e3bd00635a207c54f6f7511d8613b86dd3"},
+    {file = "frozendict-2.4.7-cp39-cp39-musllinux_1_2_armv7l.whl", hash = "sha256:1e307be0e1f26cbc9593f6bdad5238a1408a50f39f63c9c39eb93c7de5926767"},
+    {file = "frozendict-2.4.7-cp39-cp39-musllinux_1_2_i686.whl", hash = "sha256:78a55f320ca924545494ce153df02d4349156cd95dc4603c1f0e80c42c889249"},
+    {file = "frozendict-2.4.7-cp39-cp39-musllinux_1_2_ppc64le.whl", hash = "sha256:e89492dfcc4c27a718f8b5a4c8df1a2dec6c689718cccd70cb2ceba69ab8c642"},
+    {file = "frozendict-2.4.7-cp39-cp39-musllinux_1_2_riscv64.whl", hash = "sha256:1e801d62e35df24be2c6f7f43c114058712efa79a8549c289437754dad0207a3"},
+    {file = "frozendict-2.4.7-cp39-cp39-musllinux_1_2_s390x.whl", hash = "sha256:3ed9e2f3547a59f4ef5c233614c6faa6221d33004cb615ae1c07ffc551cfe178"},
+    {file = "frozendict-2.4.7-cp39-cp39-musllinux_1_2_x86_64.whl", hash = "sha256:ad0448ed5569f0a9b9b010af9fb5b6d9bdc0b4b877a3ddb188396c4742e62284"},
+    {file = "frozendict-2.4.7-cp39-cp39-win32.whl", hash = "sha256:eab9ef8a9268042e819de03079b984eb0894f05a7b63c4e5319b1cf1ef362ba7"},
+    {file = "frozendict-2.4.7-cp39-cp39-win_amd64.whl", hash = "sha256:8dfe2f4840b043436ee5bdd07b0fa5daecedf086e6957e7df050a56ab6db078d"},
+    {file = "frozendict-2.4.7-cp39-cp39-win_arm64.whl", hash = "sha256:cc2085926872a1b26deda4b81b2254d2e5d2cb2c4d7b327abe4c820b7c93f40b"},
+    {file = "frozendict-2.4.7-py3-none-any.whl", hash = "sha256:972af65924ea25cf5b4d9326d549e69a9a4918d8a76a9d3a7cd174d98b237550"},
+    {file = "frozendict-2.4.7.tar.gz", hash = "sha256:e478fb2a1391a56c8a6e10cc97c4a9002b410ecd1ac28c18d780661762e271bd"},
+]
+
 [[package]]
 name = "frozenlist"
 version = "1.7.0"
@@ -2208,6 +2330,8 @@ files = [
     {file = "greenlet-3.2.4-cp310-cp310-manylinux_2_24_x86_64.manylinux_2_28_x86_64.whl", hash = "sha256:c2ca18a03a8cfb5b25bc1cbe20f3d9a4c80d8c3b13ba3df49ac3961af0b1018d"},
     {file = "greenlet-3.2.4-cp310-cp310-musllinux_1_1_aarch64.whl", hash = "sha256:9fe0a28a7b952a21e2c062cd5756d34354117796c6d9215a87f55e38d15402c5"},
     {file = "greenlet-3.2.4-cp310-cp310-musllinux_1_1_x86_64.whl", hash = "sha256:8854167e06950ca75b898b104b63cc646573aa5fef1353d4508ecdd1ee76254f"},
+    {file = "greenlet-3.2.4-cp310-cp310-musllinux_1_2_aarch64.whl", hash = "sha256:f47617f698838ba98f4ff4189aef02e7343952df3a615f847bb575c3feb177a7"},
+    {file = "greenlet-3.2.4-cp310-cp310-musllinux_1_2_x86_64.whl", hash = "sha256:af41be48a4f60429d5cad9d22175217805098a9ef7c40bfef44f7669fb9d74d8"},
     {file = "greenlet-3.2.4-cp310-cp310-win_amd64.whl", hash = "sha256:73f49b5368b5359d04e18d15828eecc1806033db5233397748f4ca813ff1056c"},
     {file = "greenlet-3.2.4-cp311-cp311-macosx_11_0_universal2.whl", hash = "sha256:96378df1de302bc38e99c3a9aa311967b7dc80ced1dcc6f171e99842987882a2"},
     {file = "greenlet-3.2.4-cp311-cp311-manylinux2014_aarch64.manylinux_2_17_aarch64.whl", hash = "sha256:1ee8fae0519a337f2329cb78bd7a8e128ec0f881073d43f023c7b8d4831d5246"},
@@ -2217,6 +2341,8 @@ files = [
     {file = "greenlet-3.2.4-cp311-cp311-manylinux_2_24_x86_64.manylinux_2_28_x86_64.whl", hash = "sha256:2523e5246274f54fdadbce8494458a2ebdcdbc7b802318466ac5606d3cded1f8"},
     {file = "greenlet-3.2.4-cp311-cp311-musllinux_1_1_aarch64.whl", hash = "sha256:1987de92fec508535687fb807a5cea1560f6196285a4cde35c100b8cd632cc52"},
     {file = "greenlet-3.2.4-cp311-cp311-musllinux_1_1_x86_64.whl", hash = "sha256:55e9c5affaa6775e2c6b67659f3a71684de4c549b3dd9afca3bc773533d284fa"},
+    {file = "greenlet-3.2.4-cp311-cp311-musllinux_1_2_aarch64.whl", hash = "sha256:c9c6de1940a7d828635fbd254d69db79e54619f165ee7ce32fda763a9cb6a58c"},
+    {file = "greenlet-3.2.4-cp311-cp311-musllinux_1_2_x86_64.whl", hash = "sha256:03c5136e7be905045160b1b9fdca93dd6727b180feeafda6818e6496434ed8c5"},
     {file = "greenlet-3.2.4-cp311-cp311-win_amd64.whl", hash = "sha256:9c40adce87eaa9ddb593ccb0fa6a07caf34015a29bf8d344811665b573138db9"},
     {file = "greenlet-3.2.4-cp312-cp312-macosx_11_0_universal2.whl", hash = "sha256:3b67ca49f54cede0186854a008109d6ee71f66bd57bb36abd6d0a0267b540cdd"},
     {file = "greenlet-3.2.4-cp312-cp312-manylinux2014_aarch64.manylinux_2_17_aarch64.whl", hash = "sha256:ddf9164e7a5b08e9d22511526865780a576f19ddd00d62f8a665949327fde8bb"},
@@ -2226,6 +2352,8 @@ files = [
     {file = "greenlet-3.2.4-cp312-cp312-manylinux_2_24_x86_64.manylinux_2_28_x86_64.whl", hash = "sha256:3b3812d8d0c9579967815af437d96623f45c0f2ae5f04e366de62a12d83a8fb0"},
     {file = "greenlet-3.2.4-cp312-cp312-musllinux_1_1_aarch64.whl", hash = "sha256:abbf57b5a870d30c4675928c37278493044d7c14378350b3aa5d484fa65575f0"},
     {file = "greenlet-3.2.4-cp312-cp312-musllinux_1_1_x86_64.whl", hash = "sha256:20fb936b4652b6e307b8f347665e2c615540d4b42b3b4c8a321d8286da7e520f"},
+    {file = "greenlet-3.2.4-cp312-cp312-musllinux_1_2_aarch64.whl", hash = "sha256:ee7a6ec486883397d70eec05059353b8e83eca9168b9f3f9a361971e77e0bcd0"},
+    {file = "greenlet-3.2.4-cp312-cp312-musllinux_1_2_x86_64.whl", hash = "sha256:326d234cbf337c9c3def0676412eb7040a35a768efc92504b947b3e9cfc7543d"},
     {file = "greenlet-3.2.4-cp312-cp312-win_amd64.whl", hash = "sha256:a7d4e128405eea3814a12cc2605e0e6aedb4035bf32697f72deca74de4105e02"},
     {file = "greenlet-3.2.4-cp313-cp313-macosx_11_0_universal2.whl", hash = "sha256:1a921e542453fe531144e91e1feedf12e07351b1cf6c9e8a3325ea600a715a31"},
     {file = "greenlet-3.2.4-cp313-cp313-manylinux2014_aarch64.manylinux_2_17_aarch64.whl", hash = "sha256:cd3c8e693bff0fff6ba55f140bf390fa92c994083f838fece0f63be121334945"},
@@ -2235,6 +2363,8 @@ files = [
     {file = "greenlet-3.2.4-cp313-cp313-manylinux_2_24_x86_64.manylinux_2_28_x86_64.whl", hash = "sha256:23768528f2911bcd7e475210822ffb5254ed10d71f4028387e5a99b4c6699671"},
     {file = "greenlet-3.2.4-cp313-cp313-musllinux_1_1_aarch64.whl", hash = "sha256:00fadb3fedccc447f517ee0d3fd8fe49eae949e1cd0f6a611818f4f6fb7dc83b"},
     {file = "greenlet-3.2.4-cp313-cp313-musllinux_1_1_x86_64.whl", hash = "sha256:d25c5091190f2dc0eaa3f950252122edbbadbb682aa7b1ef2f8af0f8c0afefae"},
+    {file = "greenlet-3.2.4-cp313-cp313-musllinux_1_2_aarch64.whl", hash = "sha256:6e343822feb58ac4d0a1211bd9399de2b3a04963ddeec21530fc426cc121f19b"},
+    {file = "greenlet-3.2.4-cp313-cp313-musllinux_1_2_x86_64.whl", hash = "sha256:ca7f6f1f2649b89ce02f6f229d7c19f680a6238af656f61e0115b24857917929"},
     {file = "greenlet-3.2.4-cp313-cp313-win_amd64.whl", hash = "sha256:554b03b6e73aaabec3745364d6239e9e012d64c68ccd0b8430c64ccc14939a8b"},
     {file = "greenlet-3.2.4-cp314-cp314-macosx_11_0_universal2.whl", hash = "sha256:49a30d5fda2507ae77be16479bdb62a660fa51b1eb4928b524975b3bde77b3c0"},
     {file = "greenlet-3.2.4-cp314-cp314-manylinux2014_aarch64.manylinux_2_17_aarch64.whl", hash = "sha256:299fd615cd8fc86267b47597123e3f43ad79c9d8a22bebdce535e53550763e2f"},
@@ -2242,6 +2372,8 @@ files = [
     {file = "greenlet-3.2.4-cp314-cp314-manylinux2014_s390x.manylinux_2_17_s390x.whl", hash = "sha256:b4a1870c51720687af7fa3e7cda6d08d801dae660f75a76f3845b642b4da6ee1"},
     {file = "greenlet-3.2.4-cp314-cp314-manylinux2014_x86_64.manylinux_2_17_x86_64.whl", hash = "sha256:061dc4cf2c34852b052a8620d40f36324554bc192be474b9e9770e8c042fd735"},
     {file = "greenlet-3.2.4-cp314-cp314-manylinux_2_24_x86_64.manylinux_2_28_x86_64.whl", hash = "sha256:44358b9bf66c8576a9f57a590d5f5d6e72fa4228b763d0e43fee6d3b06d3a337"},
+    {file = "greenlet-3.2.4-cp314-cp314-musllinux_1_2_aarch64.whl", hash = "sha256:2917bdf657f5859fbf3386b12d68ede4cf1f04c90c3a6bc1f013dd68a22e2269"},
+    {file = "greenlet-3.2.4-cp314-cp314-musllinux_1_2_x86_64.whl", hash = "sha256:015d48959d4add5d6c9f6c5210ee3803a830dce46356e3bc326d6776bde54681"},
     {file = "greenlet-3.2.4-cp314-cp314-win_amd64.whl", hash = "sha256:e37ab26028f12dbb0ff65f29a8d3d44a765c61e729647bf2ddfbbed621726f01"},
     {file = "greenlet-3.2.4-cp39-cp39-macosx_11_0_universal2.whl", hash = "sha256:b6a7c19cf0d2742d0809a4c05975db036fdff50cd294a93632d6a310bf9ac02c"},
     {file = "greenlet-3.2.4-cp39-cp39-manylinux2014_aarch64.manylinux_2_17_aarch64.whl", hash = "sha256:27890167f55d2387576d1f41d9487ef171849ea0359ce1510ca6e06c8bece11d"},
@@ -2251,6 +2383,8 @@ files = [
     {file = "greenlet-3.2.4-cp39-cp39-manylinux_2_24_x86_64.manylinux_2_28_x86_64.whl", hash = "sha256:c9913f1a30e4526f432991f89ae263459b1c64d1608c0d22a5c79c287b3c70df"},
     {file = "greenlet-3.2.4-cp39-cp39-musllinux_1_1_aarch64.whl", hash = "sha256:b90654e092f928f110e0007f572007c9727b5265f7632c2fa7415b4689351594"},
     {file = "greenlet-3.2.4-cp39-cp39-musllinux_1_1_x86_64.whl", hash = "sha256:81701fd84f26330f0d5f4944d4e92e61afe6319dcd9775e39396e39d7c3e5f98"},
+    {file = "greenlet-3.2.4-cp39-cp39-musllinux_1_2_aarch64.whl", hash = "sha256:28a3c6b7cd72a96f61b0e4b2a36f681025b60ae4779cc73c1535eb5f29560b10"},
+    {file = "greenlet-3.2.4-cp39-cp39-musllinux_1_2_x86_64.whl", hash = "sha256:52206cd642670b0b320a1fd1cbfd95bca0e043179c1d8a045f2c6109dfe973be"},
     {file = "greenlet-3.2.4-cp39-cp39-win32.whl", hash = "sha256:65458b409c1ed459ea899e939f0e1cdb14f58dbc803f2f93c5eab5694d32671b"},
     {file = "greenlet-3.2.4-cp39-cp39-win_amd64.whl", hash = "sha256:d2e685ade4dafd447ede19c31277a224a239a0a1a4eca4e6390efedf20260cfb"},
     {file = "greenlet-3.2.4.tar.gz", hash = "sha256:0dca0d95ff849f9a364385f36ab49f50065d76964944638be9691e1832e9f86d"},
@@ -2680,55 +2814,60 @@ files = [
 
 [[package]]
 name = "inspect-ai"
-version = "0.3.122"
+version = "0.3.159"
 description = "Framework for large language model evaluations"
 optional = false
 python-versions = ">=3.10"
 groups = ["main"]
 files = [
-    {file = "inspect_ai-0.3.122-py3-none-any.whl", hash = "sha256:c40958a0e74e91bb2e7a14059e4bfada73757e00d2a6cbdf5754bd3f8e92a955"},
-    {file = "inspect_ai-0.3.122.tar.gz", hash = "sha256:77b18a72603a79f05630216e577f92b9bc404c616bce080ccb67028c8368428c"},
+    {file = "inspect_ai-0.3.159-py3-none-any.whl", hash = "sha256:71f87fd242d4fb61998691143c5dd3613fd8009840a4503d3a1fa48b504fd6b3"},
+    {file = "inspect_ai-0.3.159.tar.gz", hash = "sha256:91d42ba18ac10c5ad9d4ee25e01dda142c86058a12462e5ba051b041f8afe2ed"},
 ]
 
 [package.dependencies]
+aioboto3 = ">=13.0.0"
 aiohttp = ">=3.9.0"
 anyio = ">=4.8.0"
-beautifulsoup4 = "*"
-click = ">=8.1.3,<8.2.0 || >8.2.0"
+beautifulsoup4 = ">=4.10.0"
+boto3 = "*"
+click = ">=8.1.3,<8.2.0 || >8.2.0,<8.2.2"
 debugpy = "*"
 docstring-parser = ">=0.16"
 exceptiongroup = {version = ">=1.0.2", markers = "python_version < \"3.11\""}
-fsspec = ">=2023.1.0,<=2025.3.0"
+frozendict = ">=2.4.6"
+fsspec = ">=2023.1.0,<=2025.9.0"
 httpx = "*"
 ijson = ">=3.2.0"
 jsonlines = ">=3.0.0"
 jsonpatch = ">=1.32"
-jsonpath-ng = ">=1.7.0"
+jsonpath-ng = ">=1.6.0"
 jsonref = ">=1.1.0"
 jsonschema = ">3.1.1"
 mmh3 = ">3.1.0"
-nest_asyncio = "*"
+nest_asyncio2 = "*"
 numpy = "*"
 platformdirs = ">=2.3.0"
 psutil = "*"
 pydantic = ">=2.11.4"
 python-dotenv = ">=0.16.0"
 pyyaml = "*"
-rich = ">=13.3.3,<14.0.0"
+rich = ">=13.3.3,<14.0.0 || >14.0.0"
 s3fs = ">=2023"
 semver = ">=3.0.0"
 shortuuid = "*"
 sniffio = "*"
 tenacity = "*"
-textual = ">=0.86.2,<v3.0.0"
+textual = ">=2.1.0"
+tiktoken = ">=0.12.0"
 typing_extensions = ">=4.9.0"
+universal-pathlib = ">=0.2.6"
 zipp = ">=3.19.1"
 
 [package.extras]
-dev = ["aioboto3", "anthropic (>=0.52.0)", "azure-ai-inference", "azure-identity", "google-genai", "griffe", "groq", "ipython", "jsonpath-ng", "markdown", "mcp (>=1.10.0)", "mistralai", "moto[server]", "mypy (>=1.17.0)", "nbformat", "openai", "pandas (>=2.0.0)", "pandas-stubs", "panflute", "pip", "pre-commit", "pyarrow (>=10.0.1)", "pyarrow-stubs", "pylint", "pytest", "pytest-asyncio", "pytest-cov", "pytest-dotenv", "pytest-mock", "pytest-watcher", "pytest-xdist", "ruff (==0.9.6)", "textual-dev (>=0.86.2)", "together", "transformer-lens", "trio", "types-Markdown", "types-PyYAML", "types-aioboto3", "types-beautifulsoup4", "types-boto3", "types-botocore", "types-jsonpatch", "types-jsonschema", "types-protobuf", "types-psutil", "types-python-dateutil"]
+dev = ["adlfs (>=2025.8.0)", "anthropic (>=0.62.0)", "azure-ai-inference", "azure-identity", "fastapi", "google-genai", "griffe", "groq", "huggingface_hub", "inspect_scout", "ipython", "jsonpath-ng", "markdown", "mcp (>=1.10.0)", "mistralai", "moto[server]", "mypy (>=1.17.0)", "nbformat", "openai", "pandas (>=2.0.0)", "pandas-stubs", "panflute", "pip", "pre-commit", "pyarrow (>=10.0.1)", "pyarrow-stubs", "pylint", "pytest", "pytest-asyncio", "pytest-cov", "pytest-dotenv", "pytest-mock", "pytest-watcher", "pytest-xdist", "ruff (==0.9.6)", "textual-dev (>=0.86.2)", "together", "trio", "types-Markdown", "types-PyYAML", "types-aioboto3", "types-beautifulsoup4", "types-boto3", "types-botocore", "types-grpcio", "types-jsonpatch", "types-jsonschema", "types-protobuf", "types-psutil", "types-python-dateutil", "uvicorn", "xai_sdk"]
 dev-mcp-tests = ["mcp-server-fetch", "mcp_server_git"]
 dist = ["build", "twine"]
-doc = ["griffe", "jupyter", "markdown", "panflute", "quarto-cli (==1.7.32)"]
+doc = ["click (>=8.2.0)", "griffe", "jupyter", "markdown", "panflute", "quarto-cli (==1.7.32)"]
 
 [[package]]
 name = "ipykernel"
@@ -4430,12 +4569,24 @@ version = "1.6.0"
 description = "Patch asyncio to allow nested event loops"
 optional = false
 python-versions = ">=3.5"
-groups = ["main", "docs"]
+groups = ["docs"]
 files = [
     {file = "nest_asyncio-1.6.0-py3-none-any.whl", hash = "sha256:87af6efd6b5e897c81050477ef65c62e2b2f35d51703cae01aff2905b1852e1c"},
     {file = "nest_asyncio-1.6.0.tar.gz", hash = "sha256:6f172d5449aca15afd6c646851f4e31e02c598d553a667e38cafa997cfec55fe"},
 ]
 
+[[package]]
+name = "nest-asyncio2"
+version = "1.7.1"
+description = "Patch asyncio to allow nested event loops"
+optional = false
+python-versions = ">=3.5"
+groups = ["main"]
+files = [
+    {file = "nest_asyncio2-1.7.1-py3-none-any.whl", hash = "sha256:f83bc1744c3cfa7d47fd29431e5e168db6cb76eda1bb20108955c32f60d7eddf"},
+    {file = "nest_asyncio2-1.7.1.tar.gz", hash = "sha256:a1fe5bbbd20894dcceb1842322d74992c5834d5ab692af2c4f59a9a4fcf75fe8"},
+]
+
 [[package]]
 name = "networkx"
 version = "3.4.2"
@@ -5611,8 +5762,8 @@ files = [
 
 [package.dependencies]
 numpy = [
-    {version = ">=1.23.2", markers = "python_version == \"3.11\""},
     {version = ">=1.26.0", markers = "python_version >= \"3.12\""},
+    {version = ">=1.23.2", markers = "python_version == \"3.11\""},
     {version = ">=1.22.4", markers = "python_version < \"3.11\""},
 ]
 python-dateutil = ">=2.8.2"
@@ -5672,6 +5823,18 @@ files = [
 qa = ["flake8 (==5.0.4)", "mypy (==0.971)", "types-setuptools (==67.2.0.1)"]
 testing = ["docopt", "pytest"]
 
+[[package]]
+name = "pathlib-abc"
+version = "0.5.2"
+description = "Backport of pathlib ABCs"
+optional = false
+python-versions = ">=3.9"
+groups = ["main"]
+files = [
+    {file = "pathlib_abc-0.5.2-py3-none-any.whl", hash = "sha256:4c9d94cf1b23af417ce7c0417b43333b06a106c01000b286c99de230d95eefbb"},
+    {file = "pathlib_abc-0.5.2.tar.gz", hash = "sha256:fcd56f147234645e2c59c7ae22808b34c364bb231f685ddd9f96885aed78a94c"},
+]
+
 [[package]]
 name = "pathspec"
 version = "0.12.1"
@@ -6518,8 +6681,8 @@ files = [
 astroid = ">=3.3.8,<=3.4.0.dev0"
 colorama = {version = ">=0.4.5", markers = "sys_platform == \"win32\""}
 dill = [
-    {version = ">=0.3.6", markers = "python_version >= \"3.11\""},
     {version = ">=0.3.7", markers = "python_version >= \"3.12\""},
+    {version = ">=0.3.6", markers = "python_version == \"3.11\""},
     {version = ">=0.2", markers = "python_version < \"3.11\""},
 ]
 isort = ">=4.2.5,<5.13 || >5.13,<7"
@@ -8164,43 +8327,69 @@ files = [
 
 [[package]]
 name = "tiktoken"
-version = "0.11.0"
+version = "0.12.0"
 description = "tiktoken is a fast BPE tokeniser for use with OpenAI's models"
 optional = false
 python-versions = ">=3.9"
 groups = ["main"]
 files = [
-    {file = "tiktoken-0.11.0-cp310-cp310-macosx_10_12_x86_64.whl", hash = "sha256:8a9b517d6331d7103f8bef29ef93b3cca95fa766e293147fe7bacddf310d5917"},
-    {file = "tiktoken-0.11.0-cp310-cp310-macosx_11_0_arm64.whl", hash = "sha256:b4ddb1849e6bf0afa6cc1c5d809fb980ca240a5fffe585a04e119519758788c0"},
-    {file = "tiktoken-0.11.0-cp310-cp310-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:10331d08b5ecf7a780b4fe4d0281328b23ab22cdb4ff65e68d56caeda9940ecc"},
-    {file = "tiktoken-0.11.0-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:b062c82300341dc87e0258c69f79bed725f87e753c21887aea90d272816be882"},
-    {file = "tiktoken-0.11.0-cp310-cp310-musllinux_1_2_x86_64.whl", hash = "sha256:195d84bec46169af3b1349a1495c151d37a0ff4cba73fd08282736be7f92cc6c"},
-    {file = "tiktoken-0.11.0-cp310-cp310-win_amd64.whl", hash = "sha256:fe91581b0ecdd8783ce8cb6e3178f2260a3912e8724d2f2d49552b98714641a1"},
-    {file = "tiktoken-0.11.0-cp311-cp311-macosx_10_12_x86_64.whl", hash = "sha256:4ae374c46afadad0f501046db3da1b36cd4dfbfa52af23c998773682446097cf"},
-    {file = "tiktoken-0.11.0-cp311-cp311-macosx_11_0_arm64.whl", hash = "sha256:25a512ff25dc6c85b58f5dd4f3d8c674dc05f96b02d66cdacf628d26a4e4866b"},
-    {file = "tiktoken-0.11.0-cp311-cp311-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:2130127471e293d385179c1f3f9cd445070c0772be73cdafb7cec9a3684c0458"},
-    {file = "tiktoken-0.11.0-cp311-cp311-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:21e43022bf2c33f733ea9b54f6a3f6b4354b909f5a73388fb1b9347ca54a069c"},
-    {file = "tiktoken-0.11.0-cp311-cp311-musllinux_1_2_x86_64.whl", hash = "sha256:adb4e308eb64380dc70fa30493e21c93475eaa11669dea313b6bbf8210bfd013"},
-    {file = "tiktoken-0.11.0-cp311-cp311-win_amd64.whl", hash = "sha256:ece6b76bfeeb61a125c44bbefdfccc279b5288e6007fbedc0d32bfec602df2f2"},
-    {file = "tiktoken-0.11.0-cp312-cp312-macosx_10_13_x86_64.whl", hash = "sha256:fd9e6b23e860973cf9526544e220b223c60badf5b62e80a33509d6d40e6c8f5d"},
-    {file = "tiktoken-0.11.0-cp312-cp312-macosx_11_0_arm64.whl", hash = "sha256:6a76d53cee2da71ee2731c9caa747398762bda19d7f92665e882fef229cb0b5b"},
-    {file = "tiktoken-0.11.0-cp312-cp312-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:6ef72aab3ea240646e642413cb363b73869fed4e604dcfd69eec63dc54d603e8"},
-    {file = "tiktoken-0.11.0-cp312-cp312-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:7f929255c705efec7a28bf515e29dc74220b2f07544a8c81b8d69e8efc4578bd"},
-    {file = "tiktoken-0.11.0-cp312-cp312-musllinux_1_2_x86_64.whl", hash = "sha256:61f1d15822e4404953d499fd1dcc62817a12ae9fb1e4898033ec8fe3915fdf8e"},
-    {file = "tiktoken-0.11.0-cp312-cp312-win_amd64.whl", hash = "sha256:45927a71ab6643dfd3ef57d515a5db3d199137adf551f66453be098502838b0f"},
-    {file = "tiktoken-0.11.0-cp313-cp313-macosx_10_13_x86_64.whl", hash = "sha256:a5f3f25ffb152ee7fec78e90a5e5ea5b03b4ea240beed03305615847f7a6ace2"},
-    {file = "tiktoken-0.11.0-cp313-cp313-macosx_11_0_arm64.whl", hash = "sha256:7dc6e9ad16a2a75b4c4be7208055a1f707c9510541d94d9cc31f7fbdc8db41d8"},
-    {file = "tiktoken-0.11.0-cp313-cp313-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:5a0517634d67a8a48fd4a4ad73930c3022629a85a217d256a6e9b8b47439d1e4"},
-    {file = "tiktoken-0.11.0-cp313-cp313-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:7fb4effe60574675118b73c6fbfd3b5868e5d7a1f570d6cc0d18724b09ecf318"},
-    {file = "tiktoken-0.11.0-cp313-cp313-musllinux_1_2_x86_64.whl", hash = "sha256:94f984c9831fd32688aef4348803b0905d4ae9c432303087bae370dc1381a2b8"},
-    {file = "tiktoken-0.11.0-cp313-cp313-win_amd64.whl", hash = "sha256:2177ffda31dec4023356a441793fed82f7af5291120751dee4d696414f54db0c"},
-    {file = "tiktoken-0.11.0-cp39-cp39-macosx_10_12_x86_64.whl", hash = "sha256:13220f12c9e82e399377e768640ddfe28bea962739cc3a869cad98f42c419a89"},
-    {file = "tiktoken-0.11.0-cp39-cp39-macosx_11_0_arm64.whl", hash = "sha256:7f2db627f5c74477c0404b4089fd8a28ae22fa982a6f7d9c7d4c305c375218f3"},
-    {file = "tiktoken-0.11.0-cp39-cp39-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:2302772f035dceb2bcf8e55a735e4604a0b51a6dd50f38218ff664d46ec43807"},
-    {file = "tiktoken-0.11.0-cp39-cp39-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:20b977989afe44c94bcc50db1f76971bb26dca44218bd203ba95925ef56f8e7a"},
-    {file = "tiktoken-0.11.0-cp39-cp39-musllinux_1_2_x86_64.whl", hash = "sha256:669a1aa1ad6ebf1b3c26b45deb346f345da7680f845b5ea700bba45c20dea24c"},
-    {file = "tiktoken-0.11.0-cp39-cp39-win_amd64.whl", hash = "sha256:e363f33c720a055586f730c00e330df4c7ea0024bf1c83a8a9a9dbc054c4f304"},
-    {file = "tiktoken-0.11.0.tar.gz", hash = "sha256:3c518641aee1c52247c2b97e74d8d07d780092af79d5911a6ab5e79359d9b06a"},
+    {file = "tiktoken-0.12.0-cp310-cp310-macosx_10_12_x86_64.whl", hash = "sha256:3de02f5a491cfd179aec916eddb70331814bd6bf764075d39e21d5862e533970"},
+    {file = "tiktoken-0.12.0-cp310-cp310-macosx_11_0_arm64.whl", hash = "sha256:b6cfb6d9b7b54d20af21a912bfe63a2727d9cfa8fbda642fd8322c70340aad16"},
+    {file = "tiktoken-0.12.0-cp310-cp310-manylinux_2_28_aarch64.whl", hash = "sha256:cde24cdb1b8a08368f709124f15b36ab5524aac5fa830cc3fdce9c03d4fb8030"},
+    {file = "tiktoken-0.12.0-cp310-cp310-manylinux_2_28_x86_64.whl", hash = "sha256:6de0da39f605992649b9cfa6f84071e3f9ef2cec458d08c5feb1b6f0ff62e134"},
+    {file = "tiktoken-0.12.0-cp310-cp310-musllinux_1_2_aarch64.whl", hash = "sha256:6faa0534e0eefbcafaccb75927a4a380463a2eaa7e26000f0173b920e98b720a"},
+    {file = "tiktoken-0.12.0-cp310-cp310-musllinux_1_2_x86_64.whl", hash = "sha256:82991e04fc860afb933efb63957affc7ad54f83e2216fe7d319007dab1ba5892"},
+    {file = "tiktoken-0.12.0-cp310-cp310-win_amd64.whl", hash = "sha256:6fb2995b487c2e31acf0a9e17647e3b242235a20832642bb7a9d1a181c0c1bb1"},
+    {file = "tiktoken-0.12.0-cp311-cp311-macosx_10_12_x86_64.whl", hash = "sha256:6e227c7f96925003487c33b1b32265fad2fbcec2b7cf4817afb76d416f40f6bb"},
+    {file = "tiktoken-0.12.0-cp311-cp311-macosx_11_0_arm64.whl", hash = "sha256:c06cf0fcc24c2cb2adb5e185c7082a82cba29c17575e828518c2f11a01f445aa"},
+    {file = "tiktoken-0.12.0-cp311-cp311-manylinux_2_28_aarch64.whl", hash = "sha256:f18f249b041851954217e9fd8e5c00b024ab2315ffda5ed77665a05fa91f42dc"},
+    {file = "tiktoken-0.12.0-cp311-cp311-manylinux_2_28_x86_64.whl", hash = "sha256:47a5bc270b8c3db00bb46ece01ef34ad050e364b51d406b6f9730b64ac28eded"},
+    {file = "tiktoken-0.12.0-cp311-cp311-musllinux_1_2_aarch64.whl", hash = "sha256:508fa71810c0efdcd1b898fda574889ee62852989f7c1667414736bcb2b9a4bd"},
+    {file = "tiktoken-0.12.0-cp311-cp311-musllinux_1_2_x86_64.whl", hash = "sha256:a1af81a6c44f008cba48494089dd98cccb8b313f55e961a52f5b222d1e507967"},
+    {file = "tiktoken-0.12.0-cp311-cp311-win_amd64.whl", hash = "sha256:3e68e3e593637b53e56f7237be560f7a394451cb8c11079755e80ae64b9e6def"},
+    {file = "tiktoken-0.12.0-cp312-cp312-macosx_10_13_x86_64.whl", hash = "sha256:b97f74aca0d78a1ff21b8cd9e9925714c15a9236d6ceacf5c7327c117e6e21e8"},
+    {file = "tiktoken-0.12.0-cp312-cp312-macosx_11_0_arm64.whl", hash = "sha256:2b90f5ad190a4bb7c3eb30c5fa32e1e182ca1ca79f05e49b448438c3e225a49b"},
+    {file = "tiktoken-0.12.0-cp312-cp312-manylinux_2_28_aarch64.whl", hash = "sha256:65b26c7a780e2139e73acc193e5c63ac754021f160df919add909c1492c0fb37"},
+    {file = "tiktoken-0.12.0-cp312-cp312-manylinux_2_28_x86_64.whl", hash = "sha256:edde1ec917dfd21c1f2f8046b86348b0f54a2c0547f68149d8600859598769ad"},
+    {file = "tiktoken-0.12.0-cp312-cp312-musllinux_1_2_aarch64.whl", hash = "sha256:35a2f8ddd3824608b3d650a000c1ef71f730d0c56486845705a8248da00f9fe5"},
+    {file = "tiktoken-0.12.0-cp312-cp312-musllinux_1_2_x86_64.whl", hash = "sha256:83d16643edb7fa2c99eff2ab7733508aae1eebb03d5dfc46f5565862810f24e3"},
+    {file = "tiktoken-0.12.0-cp312-cp312-win_amd64.whl", hash = "sha256:ffc5288f34a8bc02e1ea7047b8d041104791d2ddbf42d1e5fa07822cbffe16bd"},
+    {file = "tiktoken-0.12.0-cp313-cp313-macosx_10_13_x86_64.whl", hash = "sha256:775c2c55de2310cc1bc9a3ad8826761cbdc87770e586fd7b6da7d4589e13dab3"},
+    {file = "tiktoken-0.12.0-cp313-cp313-macosx_11_0_arm64.whl", hash = "sha256:a01b12f69052fbe4b080a2cfb867c4de12c704b56178edf1d1d7b273561db160"},
+    {file = "tiktoken-0.12.0-cp313-cp313-manylinux_2_28_aarch64.whl", hash = "sha256:01d99484dc93b129cd0964f9d34eee953f2737301f18b3c7257bf368d7615baa"},
+    {file = "tiktoken-0.12.0-cp313-cp313-manylinux_2_28_x86_64.whl", hash = "sha256:4a1a4fcd021f022bfc81904a911d3df0f6543b9e7627b51411da75ff2fe7a1be"},
+    {file = "tiktoken-0.12.0-cp313-cp313-musllinux_1_2_aarch64.whl", hash = "sha256:981a81e39812d57031efdc9ec59fa32b2a5a5524d20d4776574c4b4bd2e9014a"},
+    {file = "tiktoken-0.12.0-cp313-cp313-musllinux_1_2_x86_64.whl", hash = "sha256:9baf52f84a3f42eef3ff4e754a0db79a13a27921b457ca9832cf944c6be4f8f3"},
+    {file = "tiktoken-0.12.0-cp313-cp313-win_amd64.whl", hash = "sha256:b8a0cd0c789a61f31bf44851defbd609e8dd1e2c8589c614cc1060940ef1f697"},
+    {file = "tiktoken-0.12.0-cp313-cp313t-macosx_10_13_x86_64.whl", hash = "sha256:d5f89ea5680066b68bcb797ae85219c72916c922ef0fcdd3480c7d2315ffff16"},
+    {file = "tiktoken-0.12.0-cp313-cp313t-macosx_11_0_arm64.whl", hash = "sha256:b4e7ed1c6a7a8a60a3230965bdedba8cc58f68926b835e519341413370e0399a"},
+    {file = "tiktoken-0.12.0-cp313-cp313t-manylinux_2_28_aarch64.whl", hash = "sha256:fc530a28591a2d74bce821d10b418b26a094bf33839e69042a6e86ddb7a7fb27"},
+    {file = "tiktoken-0.12.0-cp313-cp313t-manylinux_2_28_x86_64.whl", hash = "sha256:06a9f4f49884139013b138920a4c393aa6556b2f8f536345f11819389c703ebb"},
+    {file = "tiktoken-0.12.0-cp313-cp313t-musllinux_1_2_aarch64.whl", hash = "sha256:04f0e6a985d95913cabc96a741c5ffec525a2c72e9df086ff17ebe35985c800e"},
+    {file = "tiktoken-0.12.0-cp313-cp313t-musllinux_1_2_x86_64.whl", hash = "sha256:0ee8f9ae00c41770b5f9b0bb1235474768884ae157de3beb5439ca0fd70f3e25"},
+    {file = "tiktoken-0.12.0-cp313-cp313t-win_amd64.whl", hash = "sha256:dc2dd125a62cb2b3d858484d6c614d136b5b848976794edfb63688d539b8b93f"},
+    {file = "tiktoken-0.12.0-cp314-cp314-macosx_10_13_x86_64.whl", hash = "sha256:a90388128df3b3abeb2bfd1895b0681412a8d7dc644142519e6f0a97c2111646"},
+    {file = "tiktoken-0.12.0-cp314-cp314-macosx_11_0_arm64.whl", hash = "sha256:da900aa0ad52247d8794e307d6446bd3cdea8e192769b56276695d34d2c9aa88"},
+    {file = "tiktoken-0.12.0-cp314-cp314-manylinux_2_28_aarch64.whl", hash = "sha256:285ba9d73ea0d6171e7f9407039a290ca77efcdb026be7769dccc01d2c8d7fff"},
+    {file = "tiktoken-0.12.0-cp314-cp314-manylinux_2_28_x86_64.whl", hash = "sha256:d186a5c60c6a0213f04a7a802264083dea1bbde92a2d4c7069e1a56630aef830"},
+    {file = "tiktoken-0.12.0-cp314-cp314-musllinux_1_2_aarch64.whl", hash = "sha256:604831189bd05480f2b885ecd2d1986dc7686f609de48208ebbbddeea071fc0b"},
+    {file = "tiktoken-0.12.0-cp314-cp314-musllinux_1_2_x86_64.whl", hash = "sha256:8f317e8530bb3a222547b85a58583238c8f74fd7a7408305f9f63246d1a0958b"},
+    {file = "tiktoken-0.12.0-cp314-cp314-win_amd64.whl", hash = "sha256:399c3dd672a6406719d84442299a490420b458c44d3ae65516302a99675888f3"},
+    {file = "tiktoken-0.12.0-cp314-cp314t-macosx_10_13_x86_64.whl", hash = "sha256:c2c714c72bc00a38ca969dae79e8266ddec999c7ceccd603cc4f0d04ccd76365"},
+    {file = "tiktoken-0.12.0-cp314-cp314t-macosx_11_0_arm64.whl", hash = "sha256:cbb9a3ba275165a2cb0f9a83f5d7025afe6b9d0ab01a22b50f0e74fee2ad253e"},
+    {file = "tiktoken-0.12.0-cp314-cp314t-manylinux_2_28_aarch64.whl", hash = "sha256:dfdfaa5ffff8993a3af94d1125870b1d27aed7cb97aa7eb8c1cefdbc87dbee63"},
+    {file = "tiktoken-0.12.0-cp314-cp314t-manylinux_2_28_x86_64.whl", hash = "sha256:584c3ad3d0c74f5269906eb8a659c8bfc6144a52895d9261cdaf90a0ae5f4de0"},
+    {file = "tiktoken-0.12.0-cp314-cp314t-musllinux_1_2_aarch64.whl", hash = "sha256:54c891b416a0e36b8e2045b12b33dd66fb34a4fe7965565f1b482da50da3e86a"},
+    {file = "tiktoken-0.12.0-cp314-cp314t-musllinux_1_2_x86_64.whl", hash = "sha256:5edb8743b88d5be814b1a8a8854494719080c28faaa1ccbef02e87354fe71ef0"},
+    {file = "tiktoken-0.12.0-cp314-cp314t-win_amd64.whl", hash = "sha256:f61c0aea5565ac82e2ec50a05e02a6c44734e91b51c10510b084ea1b8e633a71"},
+    {file = "tiktoken-0.12.0-cp39-cp39-macosx_10_12_x86_64.whl", hash = "sha256:d51d75a5bffbf26f86554d28e78bfb921eae998edc2675650fd04c7e1f0cdc1e"},
+    {file = "tiktoken-0.12.0-cp39-cp39-macosx_11_0_arm64.whl", hash = "sha256:09eb4eae62ae7e4c62364d9ec3a57c62eea707ac9a2b2c5d6bd05de6724ea179"},
+    {file = "tiktoken-0.12.0-cp39-cp39-manylinux_2_28_aarch64.whl", hash = "sha256:df37684ace87d10895acb44b7f447d4700349b12197a526da0d4a4149fde074c"},
+    {file = "tiktoken-0.12.0-cp39-cp39-manylinux_2_28_x86_64.whl", hash = "sha256:4c9614597ac94bb294544345ad8cf30dac2129c05e2db8dc53e082f355857af7"},
+    {file = "tiktoken-0.12.0-cp39-cp39-musllinux_1_2_aarch64.whl", hash = "sha256:20cf97135c9a50de0b157879c3c4accbb29116bcf001283d26e073ff3b345946"},
+    {file = "tiktoken-0.12.0-cp39-cp39-musllinux_1_2_x86_64.whl", hash = "sha256:15d875454bbaa3728be39880ddd11a5a2a9e548c29418b41e8fd8a767172b5ec"},
+    {file = "tiktoken-0.12.0-cp39-cp39-win_amd64.whl", hash = "sha256:2cff3688ba3c639ebe816f8d58ffbbb0aa7433e23e08ab1cade5d175fc973fb3"},
+    {file = "tiktoken-0.12.0.tar.gz", hash = "sha256:b18ba7ee2b093863978fcb14f74b3707cdc8d4d4d3836853ce7ec60772139931"},
 ]
 
 [package.dependencies]
@@ -8592,6 +8781,28 @@ files = [
 [package.extras]
 test = ["coverage", "pytest", "pytest-cov"]
 
+[[package]]
+name = "universal-pathlib"
+version = "0.3.7"
+description = "pathlib api extended to use fsspec backends"
+optional = false
+python-versions = ">=3.9"
+groups = ["main"]
+files = [
+    {file = "universal_pathlib-0.3.7-py3-none-any.whl", hash = "sha256:fb95117b20b5981f86ef9d887fddbf9c61d3596634ba42cccea444931d87c201"},
+    {file = "universal_pathlib-0.3.7.tar.gz", hash = "sha256:36331056fa59a7d7cd3b61b4045f3a3418f446f23ec1a01d281c4510814b4b05"},
+]
+
+[package.dependencies]
+fsspec = ">=2024.5.0"
+pathlib-abc = ">=0.5.1,<0.6.0"
+
+[package.extras]
+dev = ["adlfs (>=2024)", "cheroot", "fsspec[adl,gcs,github,http,s3,smb,ssh] (>=2024.5.0)", "gcsfs (>=2024.5.0)", "huggingface_hub", "moto[s3,server]", "pyftpdlib", "s3fs (>=2024.5.0)", "typing_extensions ; python_version < \"3.11\"", "webdav4[fsspec]", "wsgidav"]
+dev-third-party = ["pydantic", "pydantic-settings"]
+tests = ["mypy (>=1.10.0)", "packaging", "pydantic (>=2)", "pylint (>=2.17.4)", "pytest (>=8)", "pytest-cov (>=4.1.0)", "pytest-mock (>=3.12.0)", "pytest-mypy-plugins (>=3.1.2)", "pytest-sugar (>=0.9.7)"]
+typechecking = ["mypy (>=1.10.0)", "pytest-mypy-plugins (>=3.1.2)"]
+
 [[package]]
 name = "urllib3"
 version = "2.5.0"
@@ -9284,4 +9495,4 @@ cffi = ["cffi (>=1.17,<2.0) ; platform_python_implementation != \"PyPy\" and pyt
 [metadata]
 lock-version = "2.1"
 python-versions = ">=3.10, <3.13"
-content-hash = "2dbaf7bb11506a213ffe1235df6de7ad4a1943ff5a1ad29e98c9c108d3210230"
+content-hash = "2ca08429df55e63e3001da780f9032e2da40b906d8092b2bb6e97e5e44b09d34"
diff --git a/pyproject.toml b/pyproject.toml
index f0337423..d5055b82 100644
--- a/pyproject.toml
+++ b/pyproject.toml
@@ -13,7 +13,7 @@ dependencies = [
     "datasets>=3.2.0",
     "google-cloud-storage>=3.0.0",
     "hydra-core>=1.3.2",
-    "inspect-ai>=0.3.80",
+    "inspect-ai>=0.3.159",
     "langchain_openai>=0.3.6",
     "langchain>=0.3.19",
     "matplotlib>=3.10.0",
diff --git a/src/cfg/run_cfg.yaml b/src/cfg/run_cfg.yaml
index 7924a76d..337e07ca 100644
--- a/src/cfg/run_cfg.yaml
+++ b/src/cfg/run_cfg.yaml
@@ -1,3 +1,22 @@
+# =============================================================================
+# EXPERIMENT CONFIGURATION
+# =============================================================================
+
+exp_cfg:
+  exp_id: "test_exp"
+  seed: 37
+  trial_run: false
+
+global_cfg:
+  domain: personal finance
+  output_dir: base_output/
+  pipeline_type: base
+
+# =============================================================================
+# GENERATION PIPELINE
+# =============================================================================
+
+# LLM for generation stages (1-5)
 scientist_llm:
   name: o4-mini
   provider: openai
@@ -14,126 +33,63 @@ scientist_llm:
       temperature: 0.7
       max_tokens: 2048
       seed: 42
-    judge_llm:
-      temperature: 1.0
-      max_tokens: 2048
-      seed: 42
     task_verify:
       temperature: 0.7
       max_tokens: 2048
       seed: 42
-  local_launch_cfg:
-    # Number of threads to use for local LLM
-    max_num_seqs: 1
-    # Type of GPU to use for local LLM
-    partition: "a40"
-    # QoS for local LLM
-    qos: "m2"
-    # Time limit for local LLM
-    time: "01:00:00"
-
-subject_llm:
-  name: o1-mini
-  provider: openai
-  generation_cfg:
-    temperature: 0.7
-    max_tokens: 2048
-    seed: 42
-  local_launch_cfg:
-    # Type of GPU to use for local LLM
-    partition: "a100"
-    # Number of nodes to use for local LLM
-    num_nodes: 1
-    # Number of GPUs to use for local LLM
-    gpus_per_node: 4
-    # QoS for local LLM
-    qos: "deadline"
-    # Account for local LLM
-    account: "deadline"
-    # Time limit for local LLM
-    time: "10:00:00"
-    # vLLM args
-    vllm_args: "--max-model-len=8192,--max-num-seqs=50,--compilation-config=0,--tensor-parallel-size=4,--pipeline-parallel-size=1"
 
-prompt_cfg:
-  sys_msg: Complete the given task to the best of your ability.
-
-# Diverse task generation configuration (Stage 3)
-task_generation_cfg:
-  tasks_per_blueprint: 1  # Number of tasks to generate per blueprint
-  min_subtopics: 1  # Suggested minimum number of sub-topics
-  max_subtopics: 1  # Suggested maximum number of sub-topics
+# Stage control
+stage: "all"  # Which stage to run: 0, 1, 2, 3, 4, 5, or "all"
 
-# Task verification configuration (Stage 5)
-task_verification_cfg:
-  pass_threshold: 0.8  # Minimum pass rate to consider successful
-  strict_mode: false  # If true, all alignment criteria must pass
+# Stage tags (for running individual stages or resuming)
+areas_tag: null           # Stage 1 output tag (required for stage 2 standalone)
+capabilities_tag: null    # Stage 2 output tag (required for stage 3 standalone)
+tasks_tag: null           # Stage 3 output tag (required for stage 4 standalone)
+solution_tag: null        # Stage 4 output tag (required for stage 5 standalone)
+validation_tag: null      # Stage 5 output tag (required for eval pipeline)
 
-# Area generation configuration (Stage 1)
+# Stage 1: Area generation
 areas_cfg:
-  num_areas: 2  # Number of areas to generate
+  num_areas: 2
 
-# Capability generation configuration (Stage 2)
+# Stage 2: Capability generation
 capabilities_cfg:
-  capabilities_dir: ./ace-output/
-  results_dir: gs://ace-artifacts
-  inspect_evals_dir: /fs01/projects/aieng/public/ace/inspect_evals/src/ace_evals
-  num_seed_capabilities: 1
   num_capabilities: 4
-  num_capabilities_buffer: 0.5  # Raised from 0.1 to compensate for filtering
-  num_gen_capabilities_per_run: 1  # Raised from 1 for more diversity per batch
-  num_gen_tasks_per_capability: 100
-  num_gen_tasks_buffer: 0.0
-  task_gen_few_shot: false
-  task_gen_prompt_version: "v1"
-  num_eval_tasks_per_capability: 2
-  capabilities_gen_retry_attempts: 5
-  tasks_gen_retry_attempts: 3
-  concurrency_task_solver: 2
-  concurrency_task_verifier: 2
-  concurrency_task_eval: 2
-  inspect_eval_log_level: "info"
-
-lbo_cfg:
-  num_lbo_runs: 2
-  pipeline_id: "no_discovery"
-  train_frac: 0.5
-  num_initial_train: 2
-  acquisition_function: "variance"
+  num_capabilities_buffer: 0.1
+  num_gen_capabilities_per_run: 1
 
+# Embedding config (used for capability filtering in Stage 2)
 embedding_cfg:
-  embedding_model: "text-embedding-3-small"
-  embedding_size: 256
-  filtering_similarity_threshold: 0.85  # Raised from 0.7 to keep more diverse capabilities
-
-dimensionality_reduction_cfg:
-  reduce_dimensionality_method: "pca"
-  reduced_dimensionality_size: 2
-  no_discovery_reduced_dimensionality_method: "pca"
-  no_discovery_reduced_dimensionality_size: 2
+  embedding_model: text-embedding-3-small
+  embedding_size: 1536
+  filtering_similarity_threshold: 0.85
 
-exp_cfg:
-  seed: 37
-  trial_run: false
-  exp_id: "test_exp"
+# Stage 3: Task generation
+task_generation_cfg:
+  tasks_per_blueprint: 1
+  min_subtopics: 1
+  max_subtopics: 1
 
-# Stage control
-stage: "all"  # Which stage to run: 0, 1, 2, 3, 4, 5, or "all"
-areas_tag: null  # Areas tag from Stage 1 (required for stage 2 standalone)
-capabilities_tag: null  # Capabilities tag from Stage 2 (required for stage 3 standalone)
-tasks_tag: null  # Tasks tag from Stage 3 (required for stage 4 standalone)
-solution_tag: null  # Solution tag from Stage 4 (required for stage 5 standalone)
-validation_tag: null  # Validation tag from Stage 5 (optional for resume)
+# =============================================================================
+# EVALUATION PIPELINE
+# =============================================================================
 
-# Debug settings
-use_langchain: false  # Set to false for easier debugging (disables LangChain features)
+eval_cfg:
+  # LLMs to evaluate (required)
+  subject_llms:
+    - name: gpt-4o
+      provider: openai
+    - name: claude-3-sonnet
+      provider: anthropic
 
-# Global configuration
+  # Judge LLM for scoring (required)
+  judge_llm:
+    name: gpt-4o-mini
+    provider: openai
 
-global_cfg:
-  domain: personal finance
-  output_dir: base_output/ #Base output directory for all agentic outputs
-  pipeline_type: base
+# =============================================================================
+# HYDRA
+# =============================================================================
 
 defaults:
   - _self_
diff --git a/src/eval_stages/__init__.py b/src/eval_stages/__init__.py
new file mode 100644
index 00000000..ff7eaa4c
--- /dev/null
+++ b/src/eval_stages/__init__.py
@@ -0,0 +1,18 @@
+"""Evaluation pipeline stages.
+
+Stage 0: Setup and Dataset Preparation (no LLM calls)
+Stage 1: Evaluation Execution (runs subject LLMs, creates eval_tag)
+Stage 2: Score Aggregation (no LLM calls)
+"""
+
+from src.eval_stages.stage0_setup_and_dataset import EvalSetupError, run_eval_stage0
+from src.eval_stages.stage1_eval_execution import run_eval_stage1
+from src.eval_stages.stage2_score_aggregation import run_eval_stage2
+
+
+__all__ = [
+    "run_eval_stage0",
+    "run_eval_stage1",
+    "run_eval_stage2",
+    "EvalSetupError",
+]
diff --git a/src/eval_stages/prompts.py b/src/eval_stages/prompts.py
new file mode 100644
index 00000000..e58f606e
--- /dev/null
+++ b/src/eval_stages/prompts.py
@@ -0,0 +1,9 @@
+"""Prompts for evaluation pipeline stages."""
+
+# Default prompt template for Inspect AI evaluation
+# Used in Stage 0 (Setup and Dataset Preparation) when creating EvalDataset
+DEFAULT_EVAL_PROMPT_TEMPLATE = """You are an expert. Solve the following problem.
+
+Problem: {input}
+
+Provide your final answer."""
diff --git a/src/eval_stages/stage0_setup_and_dataset.py b/src/eval_stages/stage0_setup_and_dataset.py
new file mode 100644
index 00000000..c67b9657
--- /dev/null
+++ b/src/eval_stages/stage0_setup_and_dataset.py
@@ -0,0 +1,260 @@
+"""Eval Stage 0: Setup and Dataset Preparation.
+
+This stage:
+1. Validates that required generation outputs exist
+2. Converts validated tasks to Inspect-compatible format
+
+No LLM calls, deterministic transformation. Datasets are saved under
+eval/datasets/<validation_tag>/ since they are tied to the validation source.
+"""
+
+import json
+import logging
+from collections import defaultdict
+from pathlib import Path
+from typing import Dict, List, Tuple
+
+from omegaconf import DictConfig
+
+from src.eval_stages.prompts import DEFAULT_EVAL_PROMPT_TEMPLATE
+from src.schemas.eval_io_utils import save_eval_dataset
+from src.schemas.eval_schemas import EvalConfig, EvalDataset
+from src.schemas.validation_schemas import ValidationResult
+
+
+logger = logging.getLogger(__name__)
+
+
+class EvalSetupError(Exception):
+    """Raised when Stage 0 cannot proceed: missing inputs, config, or tasks."""
+
+    pass
+
+
+def _validate_inputs(
+    experiment_dir: Path,
+    validation_tag: str,
+    eval_cfg: dict,
+) -> None:
+    """Fail fast when anything Stage 0 depends on is missing.
+
+    Args:
+        experiment_dir: Root directory of the experiment
+        validation_tag: Sub-directory name under ``validation/`` to evaluate
+        eval_cfg: Mapping expected to hold ``subject_llms`` and ``judge_llm``
+
+    Raises
+    ------
+        EvalSetupError: On the first check that does not pass
+    """
+    # experiment.json must sit directly inside the experiment directory
+    manifest = experiment_dir / "experiment.json"
+    if not manifest.exists():
+        raise EvalSetupError(f"Experiment file not found: {manifest}")
+
+    # The requested validation run must be on disk ...
+    tag_dir = experiment_dir / "validation" / validation_tag
+    if not tag_dir.exists():
+        raise EvalSetupError(f"Validation directory not found: {tag_dir}")
+
+    # ... and hold at least one JSON file at any depth
+    has_json = any(True for _ in tag_dir.rglob("*.json"))
+    if not has_json:
+        raise EvalSetupError(f"No validation files found in: {tag_dir}")
+
+    # Subject models are mandatory; a missing or empty entry is rejected
+    if not eval_cfg.get("subject_llms"):
+        raise EvalSetupError("subject_llms must be specified in eval_cfg")
+
+    # The judge model is mandatory as well
+    if not eval_cfg.get("judge_llm"):
+        raise EvalSetupError("judge_llm must be specified in eval_cfg")
+
+
+def _find_validated_tasks(
+    experiment_dir: Path, validation_tag: str
+) -> List[Tuple[Path, ValidationResult]]:
+    """Find all validated tasks (verification=true) for a given tag.
+
+    Args:
+        experiment_dir: Path to experiment directory
+        validation_tag: Tag from generation Stage 5
+
+    Returns
+    -------
+        (file_path, ValidationResult) tuples for verified tasks, sorted by path
+    """
+    validation_dir = experiment_dir / "validation" / validation_tag
+    # rglob yields paths in no particular order; sort so reruns match exactly
+    validated_tasks = []
+    for vf in sorted(validation_dir.rglob("*.json")):
+        with open(vf, "r", encoding="utf-8") as f:
+            data = json.load(f)
+
+        # Skip metadata-only files and any JSON that is not an object
+        if not isinstance(data, dict) or "verification" not in data:
+            continue
+
+        # Only include verified tasks (a falsy flag means rejected)
+        if data.get("verification", False):
+            validation_result = ValidationResult.from_dict(data)
+            validated_tasks.append((vf, validation_result))
+
+    return validated_tasks
+
+
+def _group_by_capability(
+    validated_tasks: List[Tuple[Path, ValidationResult]],
+) -> Dict[Tuple[str, str], List[ValidationResult]]:
+    """Bucket verified tasks under the capability they were written for.
+
+    Args:
+        validated_tasks: (file_path, ValidationResult) pairs; the path is unused
+
+    Returns
+    -------
+        Mapping from (area_id, capability_id) to that capability's results
+    """
+    buckets = defaultdict(list)
+    for _path, result in validated_tasks:
+        # Both halves of the key are read from the task's capability object
+        capability = result.task_solution.task_obj.capability
+        key = (capability.area.area_id, capability.capability_id)
+        buckets[key].append(result)
+    return buckets
+
+
+def _create_eval_dataset(
+    area_id: str,
+    capability_id: str,
+    validations: List[ValidationResult],
+    prompt_template: str = DEFAULT_EVAL_PROMPT_TEMPLATE,
+) -> EvalDataset:
+    """Assemble the EvalDataset for a single capability.
+
+    Args:
+        area_id: Area identifier stored on the dataset
+        capability_id: Capability identifier stored on the dataset
+        validations: Verified results for this capability (must be non-empty)
+        prompt_template: Template string stored alongside the tasks
+
+    Returns
+    -------
+        EvalDataset holding one id/input/target record per validation
+    """
+    # All entries are expected to share one capability, so the first one
+    # supplies the name and domain.
+    capability = validations[0].task_solution.task_obj.capability
+
+    # One record per validated task, in the order received
+    tasks = [
+        {
+            "id": entry.task_solution.task_id,
+            "input": entry.task_solution.task,
+            "target": entry.task_solution.solution,
+        }
+        for entry in validations
+    ]
+
+    # The template is only recorded on the dataset here; this function does
+    # not apply it to the task text.
+    return EvalDataset(
+        area_id=area_id,
+        capability_id=capability_id,
+        capability_name=capability.name,
+        domain=capability.area.domain.name,
+        tasks=tasks,
+        num_tasks=len(tasks),
+        prompt_template=prompt_template,
+    )
+
+
+def run_eval_stage0(
+    cfg: DictConfig,
+    validation_tag: str,
+) -> EvalConfig:
+    """Eval Stage 0: Setup and Dataset Preparation.
+
+    Validates inputs, then writes one dataset.json per capability.
+
+    Args:
+        cfg: Config providing exp_cfg.exp_id, global_cfg.output_dir, eval_cfg
+        validation_tag: Tag from generation Stage 5 (required)
+
+    Returns
+    -------
+        EvalConfig with an empty eval_tag, for use in subsequent stages
+
+    Raises
+    ------
+        EvalSetupError: If an input check fails or no verified task is found
+    """
+    # Get experiment info from config
+    exp_id = cfg.exp_cfg.exp_id
+    output_base_dir = Path(cfg.global_cfg.output_dir)
+    experiment_dir = output_base_dir / exp_id
+    eval_cfg = cfg.get("eval_cfg", {})
+
+    logger.info(
+        "Eval Stage 0: exp_id=%s | validation_tag=%s",
+        exp_id,
+        validation_tag,
+    )
+
+    # Validate all inputs (raises EvalSetupError on the first failed check)
+    _validate_inputs(experiment_dir, validation_tag, eval_cfg)
+    logger.info("Validation checks passed")
+
+    # Create EvalConfig (no tag yet - that's created in Stage 1)
+    eval_config = EvalConfig(
+        experiment_id=exp_id,
+        eval_tag="",  # Will be set in Stage 1
+        subject_llms=eval_cfg.get("subject_llms"),
+        judge_llm=eval_cfg.get("judge_llm"),
+        validation_tag=validation_tag,
+    )
+
+    # Find all validated tasks
+    validated_tasks = _find_validated_tasks(experiment_dir, validation_tag)
+    logger.info("Found %d validated tasks", len(validated_tasks))
+
+    if not validated_tasks:
+        raise EvalSetupError(
+            f"No validated tasks (verification=true) found in: {validation_tag}"
+        )
+
+    # Group by capability
+    grouped = _group_by_capability(validated_tasks)
+    logger.info("Found %d capabilities with validated tasks", len(grouped))
+
+    # Create and save datasets (tied to validation_tag, not eval_tag)
+    datasets_dir = experiment_dir / "eval" / "datasets" / validation_tag
+    num_created = 0
+
+    for (area_id, cap_id), validations in grouped.items():
+        # An existing dataset.json is kept untouched, so reruns are idempotent
+        dataset_path = datasets_dir / area_id / cap_id / "dataset.json"
+        if dataset_path.exists():
+            logger.info("  Skipping %s/%s (already exists)", area_id, cap_id)
+            continue
+
+        # Create dataset (uses the default prompt template)
+        dataset = _create_eval_dataset(area_id, cap_id, validations)
+
+        # Save dataset
+        save_eval_dataset(dataset, dataset_path)
+        logger.info(
+            "  Created dataset for %s/%s (%d tasks)",
+            area_id,
+            cap_id,
+            dataset.num_tasks,
+        )
+        num_created += 1
+
+    logger.info(
+        "Eval Stage 0: Created %d datasets in %s",
+        num_created,
+        datasets_dir,
+    )
+
+    return eval_config
diff --git a/src/eval_stages/stage1_eval_execution.py b/src/eval_stages/stage1_eval_execution.py
new file mode 100644
index 00000000..a662dc38
--- /dev/null
+++ b/src/eval_stages/stage1_eval_execution.py
@@ -0,0 +1,263 @@
+"""Eval Stage 1: Evaluation Execution.
+
+This stage runs Inspect AI evaluation for each capability with each subject LLM.
+Creates eval_tag for this evaluation run since this is where LLM calls happen.
+
+See: https://inspect.aisi.org.uk/
+"""
+
+import logging
+from pathlib import Path
+from typing import List
+
+from inspect_ai import Task
+from inspect_ai import eval as inspect_eval
+from inspect_ai.dataset import MemoryDataset, Sample
+from inspect_ai.scorer import model_graded_fact
+from inspect_ai.solver import generate
+from omegaconf import DictConfig
+
+from src.schemas.eval_io_utils import load_eval_dataset, save_eval_config
+from src.schemas.eval_schemas import EvalConfig, EvalDataset
+from src.schemas.metadata_schemas import PipelineMetadata
+from src.utils.timestamp_utils import iso_timestamp, timestamp_tag
+
+
+logger = logging.getLogger(__name__)
+
+
+def _find_datasets(datasets_dir: Path) -> List[Path]:
+    """Collect every ``dataset.json`` stored beneath a directory.
+
+    Args:
+        datasets_dir: Directory to search recursively
+
+    Returns
+    -------
+        Paths of the files found; empty when the directory does not exist
+    """
+    if datasets_dir.exists():
+        return list(datasets_dir.rglob("dataset.json"))
+    return []
+
+
+def _check_eval_completed(
+    results_dir: Path, subject_llm: str, area_id: str, capability_id: str
+) -> bool:
+    """Check if evaluation was already completed for this combination.
+
+    Args:
+        results_dir: Path to results directory
+        subject_llm: Subject LLM name
+        area_id: Area identifier
+        capability_id: Capability identifier
+
+    Returns
+    -------
+        True if at least one Inspect log (.json or .eval) exists
+    """
+    result_dir = results_dir / subject_llm / area_id / capability_id
+    # Inspect writes .eval logs unless JSON is requested, so accept both
+    if result_dir.exists():
+        log_files = [*result_dir.glob("*.json"), *result_dir.glob("*.eval")]
+        return len(log_files) > 0
+    return False
+
+
+def _create_inspect_task(
+    dataset: EvalDataset,
+    judge_model: str,
+) -> "Task":
+    """Wrap an EvalDataset in an Inspect Task graded by a judge model.
+
+    Args:
+        dataset: Source of the id/input/target task records
+        judge_model: Grader model string (e.g., "openai/gpt-4o-mini")
+
+    Returns
+    -------
+        Task that generates one answer per sample and scores it with the judge
+    """
+    # Translate each stored record into an Inspect Sample
+    samples = []
+    for record in dataset.tasks:
+        sample = Sample(
+            input=record["input"],
+            target=record["target"],
+            id=record["id"],
+        )
+        samples.append(sample)
+
+    # Samples are already in memory, so no file-backed dataset is needed
+    in_memory = MemoryDataset(samples)
+
+    # Plain generation as the solver, model-graded factual check as the scorer
+    return Task(
+        dataset=in_memory,
+        solver=generate(),
+        scorer=model_graded_fact(model=judge_model),
+    )
+
+
+def _run_inspect_eval(
+    dataset: EvalDataset,
+    subject_llm: str,
+    judge_llm: dict,
+    output_dir: Path,
+) -> bool:
+    """Run Inspect evaluation for a single capability/LLM combination.
+
+    Args:
+        dataset: EvalDataset to evaluate
+        subject_llm: Subject LLM (e.g., "openai/gpt-4o")
+        judge_llm: Judge LLM config dict with "provider" and "name" keys
+        output_dir: Directory to save Inspect logs (created if missing)
+
+    Returns
+    -------
+        True only if every returned Inspect log has status "success"
+    """
+    # Format model names for Inspect (provider/model)
+    judge_model = f"{judge_llm['provider']}/{judge_llm['name']}"
+
+    try:
+        task = _create_inspect_task(dataset, judge_model)
+        output_dir.mkdir(parents=True, exist_ok=True)
+        logs = inspect_eval(
+            task,
+            model=subject_llm,
+            log_dir=str(output_dir),
+        )
+
+        # By default Inspect logs task errors instead of raising them, so a
+        # normal return from eval() is not proof that the run succeeded.
+        failed = [log for log in logs if log.status != "success"]
+        if failed:
+            raise RuntimeError(f"log status is {failed[0].status!r}")
+        return True
+
+    except Exception as e:
+        logger.error(
+            "Inspect evaluation failed for %s/%s with %s: %s",
+            dataset.area_id,
+            dataset.capability_id,
+            subject_llm,
+            e,
+        )
+        return False
+
+
+def run_eval_stage1(
+    cfg: DictConfig,
+    eval_config: EvalConfig,
+) -> str:
+    """Eval Stage 1: Evaluation Execution.
+
+    Runs Inspect evaluation for each capability with each subject LLM.
+    Creates eval_tag, stores it on eval_config, and writes eval_config.json.
+
+    Args:
+        cfg: Configuration object
+        eval_config: EvalConfig from Stage 0 (its eval_tag is overwritten)
+
+    Returns
+    -------
+        The eval_tag for this evaluation run
+    """
+    # Derive paths from config
+    exp_id = cfg.exp_cfg.exp_id
+    output_base_dir = Path(cfg.global_cfg.output_dir)
+    experiment_dir = output_base_dir / exp_id
+    validation_tag = eval_config.validation_tag
+
+    # Create eval_tag for this run
+    eval_tag = timestamp_tag()
+
+    logger.info(
+        "Eval Stage 1: Running evaluations (eval_tag=%s)",
+        eval_tag,
+    )
+
+    # Find datasets (saved under validation_tag from Stage 0)
+    datasets_dir = experiment_dir / "eval" / "datasets" / validation_tag
+    dataset_paths = _find_datasets(datasets_dir)
+    logger.info("Found %d datasets", len(dataset_paths))
+
+    if not dataset_paths:
+        raise ValueError(f"No datasets found in {datasets_dir}. Run Stage 0 first.")
+
+    # Load datasets
+    datasets = [load_eval_dataset(p) for p in dataset_paths]
+
+    # Results live under eval_tag; eval_dir and results_dir are the same path
+    eval_dir = experiment_dir / "eval" / "results" / eval_tag
+    results_dir = eval_dir
+
+    # Update eval_config with the tag and save it
+    eval_config.eval_tag = eval_tag
+    metadata = PipelineMetadata(
+        experiment_id=exp_id,
+        output_base_dir=str(output_base_dir),
+        timestamp=iso_timestamp(),
+        input_stage_tag=validation_tag,
+        output_stage_tag=eval_tag,
+        resume=False,
+    )
+    eval_config_path = eval_dir / "eval_config.json"
+    save_eval_config(eval_config, metadata, eval_config_path)
+    logger.info("Saved eval_config.json to %s", eval_config_path)
+
+    # Run evaluations (skipped pairs are not counted in num_evals)
+    subject_llms = eval_config.subject_llms
+    judge_llm = eval_config.judge_llm
+
+    num_evals = 0
+    total_combinations = len(datasets) * len(subject_llms)
+
+    for dataset in datasets:
+        for llm_config in subject_llms:
+            llm_name = llm_config["name"]
+            # Construct full model string: provider/model_name
+            subject_model = f"{llm_config['provider']}/{llm_name}"
+
+            # Resume check (presumably a no-op: eval_tag above is freshly made)
+            if _check_eval_completed(
+                results_dir, llm_name, dataset.area_id, dataset.capability_id
+            ):
+                logger.info(
+                    "  Skipping %s/%s with %s (already completed)",
+                    dataset.area_id,
+                    dataset.capability_id,
+                    llm_name,
+                )
+                continue
+
+            # Results are keyed by model name only (provider is not in the path)
+            output_dir = (
+                results_dir / llm_name / dataset.area_id / dataset.capability_id
+            )
+
+            logger.info(
+                "  Evaluating %s/%s with %s",
+                dataset.area_id,
+                dataset.capability_id,
+                subject_model,
+            )
+
+            success = _run_inspect_eval(
+                dataset=dataset,
+                subject_llm=subject_model,
+                judge_llm=judge_llm,
+                output_dir=output_dir,
+            )
+
+            if success:
+                num_evals += 1
+
+    logger.info(
+        "Eval Stage 1: Completed %d/%d evaluations",
+        num_evals,
+        total_combinations,
+    )
+
+    return eval_tag
diff --git a/src/eval_stages/stage2_score_aggregation.py b/src/eval_stages/stage2_score_aggregation.py
new file mode 100644
index 00000000..2855cb61
--- /dev/null
+++ b/src/eval_stages/stage2_score_aggregation.py
@@ -0,0 +1,231 @@
+"""Eval Stage 2: Score Aggregation.
+
+This stage computes final capability scores from raw Inspect results.
+No LLM calls, just aggregation of results from Stage 1.
+
+See: https://inspect.aisi.org.uk/
+"""
+
+import logging
+import math
+from pathlib import Path
+from typing import Dict, List
+
+from inspect_ai.log import read_eval_log
+from omegaconf import DictConfig
+
+from src.schemas.eval_io_utils import (
+    load_eval_config,
+    load_eval_dataset,
+    save_capability_scores,
+)
+from src.schemas.eval_schemas import CapabilityScore
+
+
+logger = logging.getLogger(__name__)
+
+
+def _find_result_dirs(results_dir: Path, subject_llm: str) -> List[Path]:
+    """List the per-capability result directories of one subject LLM.
+
+    Args:
+        results_dir: Root of this run's results
+        subject_llm: Name of the sub-directory holding that LLM's results
+
+    Returns
+    -------
+        Directories found exactly two levels down; empty if the LLM has none
+    """
+    llm_root = results_dir / subject_llm
+    if not llm_root.exists():
+        return []
+
+    # Expected layout: <area_id>/<capability_id>/ ; plain files are ignored
+    return [
+        cap_dir
+        for area_dir in llm_root.iterdir()
+        if area_dir.is_dir()
+        for cap_dir in area_dir.iterdir()
+        if cap_dir.is_dir()
+    ]
+
+
+def _compute_stats(scores: List[float]) -> Dict:
+    """Summarise scores as their mean and the standard error of that mean.
+
+    Args:
+        scores: Per-task score values (0.0 to 1.0)
+
+    Returns
+    -------
+        Dict with 'mean', 'std_err', 'num_tasks'; all zero for empty input
+    """
+    n = len(scores)
+    if n == 0:
+        return {"mean": 0.0, "std_err": 0.0, "num_tasks": 0}
+
+    mean = sum(scores) / n
+
+    # A single score has no spread to estimate, so its error stays at zero
+    std_err = 0.0
+    if n > 1:
+        # Sample variance (n - 1 denominator), then sigma / sqrt(n)
+        variance = sum((s - mean) ** 2 for s in scores) / (n - 1)
+        std_err = math.sqrt(variance) / math.sqrt(n)
+
+    return {"mean": mean, "std_err": std_err, "num_tasks": n}
+
+
+def _parse_inspect_logs(result_dir: Path) -> Dict:
+    """Parse Inspect logs to extract scores.
+
+    Args:
+        result_dir: Path to capability result directory
+
+    Returns
+    -------
+        Dict with 'mean', 'std_err', 'num_tasks' (all zero if nothing parsed)
+    """
+    # Inspect writes .eval logs unless JSON is requested; read_eval_log reads both
+    log_files = [*result_dir.glob("*.json"), *result_dir.glob("*.eval")]
+
+    if not log_files:
+        logger.warning("No log files found in %s", result_dir)
+        return {"mean": 0.0, "std_err": 0.0, "num_tasks": 0}
+
+    scores = []
+
+    for log_file in log_files:
+        try:
+            log = read_eval_log(str(log_file))
+
+            # Extract scores from samples
+            # In Inspect AI 0.3.159+, sample.scores is dict[str, Score] | None
+            if log.samples:
+                for sample in log.samples:
+                    if sample.scores:
+                        # Iterate over all scorers (usually just one)
+                        for score_obj in sample.scores.values():
+                            if score_obj.value is not None:
+                                # Numbers count as-is; only C/I strings are mapped
+                                score_val = score_obj.value
+                                if isinstance(score_val, (int, float)):
+                                    scores.append(float(score_val))
+                                elif score_val == "C":  # Correct
+                                    scores.append(1.0)
+                                elif score_val == "I":  # Incorrect
+                                    scores.append(0.0)
+
+        except Exception as e:
+            logger.warning("Failed to parse log %s: %s", log_file, e)
+            continue
+
+    return _compute_stats(scores)
+
+
+def run_eval_stage2(
+    cfg: DictConfig,
+    eval_tag: str,
+) -> str:
+    """Eval Stage 2: Score Aggregation.
+
+    Reads Inspect logs; writes capability_scores.json per subject LLM with results.
+
+    Args:
+        cfg: Config providing exp_cfg.exp_id and global_cfg.output_dir
+        eval_tag: Tag from Eval Stage 1 (its eval_config.json must exist)
+
+    Returns
+    -------
+        The eval_tag (same as input, for chaining)
+    """
+    # Derive paths from config
+    exp_id = cfg.exp_cfg.exp_id
+    output_base_dir = Path(cfg.global_cfg.output_dir)
+    experiment_dir = output_base_dir / exp_id
+    results_dir = experiment_dir / "eval" / "results" / eval_tag
+
+    # Load eval config from Stage 1
+    eval_config_path = results_dir / "eval_config.json"
+    if not eval_config_path.exists():
+        raise ValueError(
+            f"eval_config.json not found at {eval_config_path}. Run Stage 1 first."
+        )
+    eval_config, _ = load_eval_config(eval_config_path)
+
+    logger.info("Eval Stage 2: Aggregating scores (eval_tag=%s)", eval_tag)
+
+    # Find datasets (saved under validation_tag)
+    validation_tag = eval_config.validation_tag
+    datasets_dir = experiment_dir / "eval" / "datasets" / validation_tag
+
+    scores_dir = experiment_dir / "eval" / "scores" / eval_tag
+
+    # Load datasets for capability info
+    dataset_map = {}  # (area_id, cap_id) -> EvalDataset
+    for dataset_path in datasets_dir.rglob("dataset.json"):
+        dataset = load_eval_dataset(dataset_path)
+        dataset_map[(dataset.area_id, dataset.capability_id)] = dataset
+
+    num_llms_processed = 0
+
+    for llm_config in eval_config.subject_llms:
+        llm_name = llm_config["name"]
+        logger.info("  Processing results for %s", llm_name)
+
+        # Find all result directories for this LLM
+        result_dirs = _find_result_dirs(results_dir, llm_name)
+
+        if not result_dirs:
+            logger.warning("  No results found for %s", llm_name)
+            continue
+
+        capability_scores = []
+
+        for result_dir in result_dirs:
+            # The two trailing path components are area_id and capability_id
+            cap_id = result_dir.name
+            area_id = result_dir.parent.name
+
+            # Get capability info from dataset
+            dataset = dataset_map.get((area_id, cap_id))
+            if not dataset:
+                logger.warning(
+                    "  No dataset found for %s/%s, skipping",
+                    area_id,
+                    cap_id,
+                )
+                continue
+
+            # Parse Inspect logs (a directory without usable logs yields zeros)
+            parsed = _parse_inspect_logs(result_dir)
+
+            # Create CapabilityScore
+            score = CapabilityScore(
+                area_id=area_id,
+                capability_id=cap_id,
+                capability_name=dataset.capability_name,
+                subject_llm=llm_name,
+                mean=parsed["mean"],
+                std_err=parsed["std_err"],
+                num_tasks=parsed["num_tasks"],
+            )
+            capability_scores.append(score)
+
+        # Save scores for this LLM (nothing is written when the list is empty)
+        if capability_scores:
+            scores_path = scores_dir / llm_name / "capability_scores.json"
+            save_capability_scores(capability_scores, scores_path)
+            logger.info(
+                "  Saved %d capability scores for %s",
+                len(capability_scores),
+                llm_name,
+            )
+            num_llms_processed += 1
+
+    logger.info(
+        "Eval Stage 2: Aggregated scores for %d LLMs",
+        num_llms_processed,
+    )
+
+    return eval_tag
diff --git a/src/run_eval_pipeline.py b/src/run_eval_pipeline.py
new file mode 100644
index 00000000..36485ea7
--- /dev/null
+++ b/src/run_eval_pipeline.py
@@ -0,0 +1,149 @@
+"""Evaluation pipeline for running LLM evaluations on generated tasks.
+
+This module orchestrates the evaluation pipeline:
+- Stage 0: Setup and Dataset Preparation (no LLM calls, no tag)
+- Stage 1: Evaluation Execution (runs subject LLMs, creates eval_tag)
+- Stage 2: Score Aggregation (no LLM calls)
+
+Usage:
+    # Run all stages
+    python -m src.run_eval_pipeline validation_tag=_YYYYMMDD_HHMMSS
+
+    # Run specific stage
+    python -m src.run_eval_pipeline stage=0 validation_tag=_YYYYMMDD_HHMMSS
+    python -m src.run_eval_pipeline stage=1 validation_tag=_YYYYMMDD_HHMMSS
+    python -m src.run_eval_pipeline stage=2 eval_tag=_YYYYMMDD_HHMMSS
+"""
+
+import logging
+from pathlib import Path
+
+import hydra
+from omegaconf import DictConfig
+
+from src.eval_stages import (
+    EvalSetupError,
+    run_eval_stage0,
+    run_eval_stage1,
+    run_eval_stage2,
+)
+
+
+logger = logging.getLogger(__name__)
+
+
+@hydra.main(version_base=None, config_path="cfg", config_name="run_cfg")
+def main(cfg: DictConfig) -> None:
+    """Run the eval stage(s) chosen by cfg.stage; expected errors are only logged."""
+    # Get stage to run (default: "all")
+    stage = cfg.get("stage", "all")
+
+    # Get tags from config
+    validation_tag = cfg.get("validation_tag")
+    eval_tag = cfg.get("eval_tag")
+
+    logger.info("=" * 60)
+    logger.info("EVALUATION PIPELINE")
+    logger.info("=" * 60)
+    logger.info("Stage: %s", stage)
+    logger.info("Experiment ID: %s", cfg.exp_cfg.exp_id)
+    logger.info("validation_tag: %s", validation_tag)
+    logger.info("eval_tag: %s", eval_tag)
+    logger.info("=" * 60)
+
+    # Run all stages sequentially
+    if stage == "all":
+        if not validation_tag:
+            logger.error("validation_tag is required")
+            logger.error(
+                "Usage: python -m src.run_eval_pipeline validation_tag=_YYYYMMDD_HHMMSS"
+            )
+            return
+
+        try:
+            # Stage 0: Setup and Dataset Preparation
+            logger.info("Running Eval Stage 0: Setup and Dataset Preparation")
+            eval_config = run_eval_stage0(cfg, validation_tag)
+            logger.info("Eval Stage 0 complete.")
+
+            # Stage 1: Evaluation Execution
+            logger.info("Running Eval Stage 1: Evaluation Execution")
+            eval_tag = run_eval_stage1(cfg, eval_config)
+            logger.info("Eval Stage 1 complete. eval_tag=%s", eval_tag)
+
+            # Stage 2: Score Aggregation
+            logger.info("Running Eval Stage 2: Score Aggregation")
+            run_eval_stage2(cfg, eval_tag)
+            logger.info("Eval Stage 2 complete.")
+
+            # Get results dir for final message
+            exp_id = cfg.exp_cfg.exp_id
+            output_base_dir = Path(cfg.global_cfg.output_dir)
+            scores_dir = output_base_dir / exp_id / "eval" / "scores" / eval_tag
+
+            logger.info("=" * 60)
+            logger.info("EVALUATION PIPELINE COMPLETE")
+            logger.info("Scores in: %s", scores_dir)
+            logger.info("=" * 60)
+
+        except EvalSetupError as e:
+            logger.error("Evaluation setup failed: %s", e)
+            return
+        except ValueError as e:
+            logger.error("Evaluation failed: %s", e)
+            return
+
+    # Run specific stage
+    elif stage == 0:
+        if not validation_tag:
+            logger.error("validation_tag is required for stage 0")
+            logger.error(
+                "Usage: python -m src.run_eval_pipeline stage=0 "
+                "validation_tag=_YYYYMMDD_HHMMSS"
+            )
+            return
+
+        try:
+            run_eval_stage0(cfg, validation_tag)  # returned config is not needed here
+            logger.info("Eval Stage 0 complete. Datasets created.")
+        except EvalSetupError as e:
+            logger.error("Evaluation setup failed: %s", e)
+
+    elif stage == 1:
+        if not validation_tag:
+            logger.error("validation_tag is required for stage 1")
+            logger.error(
+                "Usage: python -m src.run_eval_pipeline stage=1 "
+                "validation_tag=_YYYYMMDD_HHMMSS"
+            )
+            return
+
+        try:
+            # Stage 0 is rerun because Stage 1 takes its EvalConfig as input
+            eval_config = run_eval_stage0(cfg, validation_tag)
+            eval_tag = run_eval_stage1(cfg, eval_config)
+            logger.info("Eval Stage 1 complete. eval_tag=%s", eval_tag)
+        except (EvalSetupError, ValueError) as e:
+            logger.error("Stage 1 failed: %s", e)
+
+    elif stage == 2:
+        if not eval_tag:
+            logger.error("eval_tag is required for stage 2")
+            logger.error(
+                "Usage: python -m src.run_eval_pipeline stage=2 "
+                "eval_tag=_YYYYMMDD_HHMMSS"
+            )
+            return
+
+        try:
+            run_eval_stage2(cfg, eval_tag)
+            logger.info("Eval Stage 2 complete.")
+        except ValueError as e:
+            logger.error("Stage 2 failed: %s", e)
+
+    else:
+        logger.error("Invalid stage: %s. Use 'all', 0, 1, or 2", stage)
+
+if __name__ == "__main__":
+    main()
diff --git a/src/schemas/EVALUATION_PIPELINE_SCHEMAS.md b/src/schemas/EVALUATION_PIPELINE_SCHEMAS.md
new file mode 100644
index 00000000..4d7d2f87
--- /dev/null
+++ b/src/schemas/EVALUATION_PIPELINE_SCHEMAS.md
@@ -0,0 +1,315 @@
+# ACE Evaluation Pipeline Standardized Schemas
+
+The **evaluation pipeline** takes the validated tasks and solutions from the generation pipeline and evaluates subject LLMs on them using Inspect AI. It produces capability scores that measure how well each subject LLM performs on each capability.
+
+This document defines the standardized input and output formats for each stage of the evaluation pipeline. These schemas ensure consistency across different implementations and enable interoperability between pipeline stages.
+
+## Pipeline Stages
+
+The evaluation pipeline consists of three stages:
+
+0. **Setup and Dataset Preparation**: Validate inputs, convert tasks to Inspect format (no LLM calls)
+1. **Evaluation Execution**: Run Inspect evaluation with subject LLMs (creates `eval_tag`)
+2. **Score Aggregation**: Compute capability scores from raw results (no LLM calls)
+
+---
+
+## Implementation Approach
+
+**Pipeline Pattern:**
+- **Stage 0**: Deterministic data transformation (no LLM calls; consumes an existing `validation_tag`, creates no new tag)
+- **Stage 1**: LLM-dependent evaluation (creates `eval_tag` for results)
+- **Stage 2**: Deterministic aggregation (uses `eval_tag` from Stage 1)
+
+**Shared Config:**
+The evaluation pipeline uses the **same configuration file** as the generation pipeline (`src/cfg/run_cfg.yaml`), with an evaluation-specific section (`eval_cfg`).
+
+**Resumability:**
+- **Stage 0**: Idempotent - skips datasets that already exist
+- **Stage 1**: Can resume by skipping completed evaluations (pass same `eval_tag`)
+
+---
+
+## Configuration
+
+```yaml
+eval_cfg:
+  # Subject LLMs to evaluate (required)
+  subject_llms:
+    - name: "gpt-4o"
+      provider: "openai"
+    - name: "claude-3-sonnet"
+      provider: "anthropic"
+
+  # Judge LLM for scoring (required)
+  judge_llm:
+    name: "gpt-4o-mini"
+    provider: "openai"
+```
+
+---
+
+## Naming Conventions
+
+See [GENERATION_PIPELINE_SCHEMAS.md](GENERATION_PIPELINE_SCHEMAS.md) for naming conventions. Tags follow the same format: `_YYYYMMDD_HHMMSS`.
+
+---
+
+## Directory Structure
+
+Evaluation outputs are stored in an `eval/` subdirectory within the experiment directory (see [GENERATION_PIPELINE_SCHEMAS.md](GENERATION_PIPELINE_SCHEMAS.md) for generation structure):
+
+```
+<experiment_id>/
+  eval/
+    datasets/                              # Stage 0 output
+      <validation_tag>/                    # Tied to validation source
+        <area_id>/
+          <capability_id>/
+            dataset.json                   # EvalDataset
+
+    results/                               # Stage 1 output
+      <eval_tag>/
+        eval_config.json                   # EvalConfig saved here
+        <subject_llm>/
+          <area_id>/
+            <capability_id>/               # Inspect logs
+
+    scores/                                # Stage 2 output
+      <eval_tag>/
+        <subject_llm>/
+          capability_scores.json           # List[CapabilityScore]
+```
+
+**Example:**
+```
+r0_10x10/
+  eval/
+    datasets/
+      _20251017_091500/
+        area_000/
+          cap_000/dataset.json
+          cap_001/dataset.json
+    results/
+      _20251020_143000/
+        eval_config.json
+        gpt-4o/
+          area_000/
+            cap_000/
+            cap_001/
+        claude-3-sonnet/
+          area_000/
+            cap_000/
+            cap_001/
+    scores/
+      _20251020_143000/
+        gpt-4o/capability_scores.json
+        claude-3-sonnet/capability_scores.json
+```
+
+---
+
+## Dataclasses
+
+The evaluation pipeline uses 3 dataclasses and reuses `PipelineMetadata` from the generation pipeline (see [GENERATION_PIPELINE_SCHEMAS.md](GENERATION_PIPELINE_SCHEMAS.md#pipelinemetadata)).
+
+**File:** `eval_schemas.py` in `src/schemas/`
+
+### EvalConfig
+
+Configuration for the evaluation run.
+
+**Fields:**
+- `experiment_id`: String (required)
+- `eval_tag`: String (set in Stage 1)
+- `subject_llms`: List[Dict] (required, each dict has "name" and "provider")
+- `judge_llm`: Dict (required, has "name" and "provider")
+- `validation_tag`: String (required, tag from generation Stage 5)
+
+### EvalDataset
+
+Dataset prepared for Inspect evaluation. Contains all info for one capability.
+
+**Fields:**
+- `area_id`: String (required)
+- `capability_id`: String (required)
+- `capability_name`: String (required)
+- `domain`: String (required)
+- `tasks`: List[Dict] (required, each dict has "id", "input", "target")
+- `num_tasks`: Integer (required)
+- `prompt_template`: String (required)
+
+### CapabilityScore
+
+Score for a single capability from evaluation.
+
+**Fields:**
+- `area_id`: String (required)
+- `capability_id`: String (required)
+- `capability_name`: String (required)
+- `subject_llm`: String (required)
+- `mean`: Float (required, 0.0 to 1.0)
+- `std_err`: Float (required)
+- `num_tasks`: Integer (required)
+
+---
+
+## Eval Stage 0: Setup and Dataset Preparation
+
+### Purpose
+Validate inputs and convert validated tasks to Inspect-compatible format.
+
+### Input
+- **validation_tag**: String - Tag from generation Stage 5 (required)
+- **Configuration**: `eval_cfg` section from config YAML (subject_llms and judge_llm required)
+
+### Validation Checks
+1. Generation `experiment.json` exists
+2. Validation outputs exist at `validation/<validation_tag>/`
+3. `subject_llms` and `judge_llm` are configured
+
+### Output: `dataset.json` (per capability)
+
+**Stage Output:** EvalDataset dataclass
+**Save Function:** `save_eval_dataset(dataset: EvalDataset, output_path: Path)`
+
+**File Path:** `<output_dir>/<experiment_id>/eval/datasets/<validation_tag>/<area_id>/<capability_id>/dataset.json`
+
+```json
+{
+  "area_id": "area_000",
+  "capability_id": "cap_000",
+  "capability_name": "compound_interest",
+  "domain": "personal_finance",
+  "tasks": [
+    {"id": "task_000", "input": "What is the future value of $1000...", "target": "1647.01"},
+    {"id": "task_001", "input": "Calculate the present value of $5000...", "target": "3402.92"}
+  ],
+  "num_tasks": 10,
+  "prompt_template": "..."
+}
+```
+
+**Returns:** EvalConfig object (for use in Stage 1)
+
+---
+
+## Eval Stage 1: Evaluation Execution
+
+### Purpose
+Run Inspect evaluation for each capability with each subject LLM.
+
+### Input
+- **eval_config**: EvalConfig from Stage 0
+
+### Tag Handling
+- **Creates**: `eval_tag` for this evaluation run
+- **Resume**: If `results/<eval_tag>/<subject_llm>/<area_id>/<capability_id>/` exists with logs, skip
+
+### Output: Inspect logs + `eval_config.json`
+
+**Stage Output:** Raw Inspect AI logs (stored by Inspect directly)
+
+**File Path:** `<output_dir>/<experiment_id>/eval/results/<eval_tag>/<subject_llm>/<area_id>/<capability_id>/`
+
+The `eval_config.json` is saved to `results/<eval_tag>/eval_config.json` for reference.
+
+**Returns:** `eval_tag` string
+
+---
+
+## Eval Stage 2: Score Aggregation
+
+### Purpose
+Compute final capability scores from raw Inspect results.
+
+### Input
+- **eval_tag**: Tag from Stage 1
+
+### Output: `capability_scores.json` (per subject LLM)
+
+**Stage Output:** List[CapabilityScore]
+**Save Function:** `save_capability_scores(scores: List[CapabilityScore], output_path: Path)`
+
+**File Path:** `<output_dir>/<experiment_id>/eval/scores/<eval_tag>/<subject_llm>/capability_scores.json`
+
+```json
+[
+  {
+    "area_id": "area_000",
+    "capability_id": "cap_000",
+    "capability_name": "compound_interest",
+    "subject_llm": "gpt-4o",
+    "mean": 0.90,
+    "std_err": 0.03,
+    "num_tasks": 10
+  }
+]
+```
+
+**Returns:** `eval_tag` string
+
+---
+
+## Usage
+
+### Run Full Evaluation
+
+```bash
+# Basic usage - evaluate all capabilities
+python -m src.run_eval_pipeline validation_tag=_20251017_091500
+```
+
+### Run Specific Stages
+
+```bash
+# Run only Stage 0 (setup + dataset preparation)
+python -m src.run_eval_pipeline stage=0 validation_tag=_20251017_091500
+
+# Run Stage 0 + Stage 1 (setup, datasets, and evaluation)
+python -m src.run_eval_pipeline stage=1 validation_tag=_20251017_091500
+
+# Run only Stage 2 (score aggregation) - requires eval_tag from Stage 1
+python -m src.run_eval_pipeline stage=2 eval_tag=_20251020_143000
+```
+
+---
+
+## IO Utilities
+
+The following functions are provided in `src/schemas/eval_io_utils.py`:
+
+### Save Functions
+- `save_eval_config(config: EvalConfig, metadata: PipelineMetadata, output_path: Path)`
+- `save_eval_dataset(dataset: EvalDataset, output_path: Path)`
+- `save_capability_scores(scores: List[CapabilityScore], output_path: Path)`
+
+### Load Functions
+- `load_eval_config(file_path: Path) -> Tuple[EvalConfig, PipelineMetadata]`
+- `load_eval_dataset(file_path: Path) -> EvalDataset`
+- `load_capability_scores(file_path: Path) -> List[CapabilityScore]`
+
+### Helper Functions
+- Use `timestamp_tag()` from `src.utils.timestamp_utils` to generate tags
+- `get_experiment_dir(output_base_dir: str, experiment_id: str) -> Path`
+
+---
+
+## Relationship to Generation Pipeline
+
+The evaluation pipeline depends on the generation pipeline outputs:
+
+| Eval Stage | Depends On | Generation Stage |
+|------------|------------|------------------|
+| Eval Stage 0 | `experiment.json` | Stage 0 |
+| Eval Stage 0 | `validation/<validation_tag>/` | Stage 5 |
+
+---
+
+## Legacy: LBO Support
+
+The previous version of the repository included **Latent Bayesian Optimization (LBO)** for intelligent capability selection during evaluation. This functionality has been moved to the `legacy/` directory for reference.
+
+See `legacy/README.md` for details on the LBO implementation and how it was used.
+
+---
diff --git a/src/schemas/PIPELINE_SCHEMAS.md b/src/schemas/GENERATION_PIPELINE_SCHEMAS.md
similarity index 98%
rename from src/schemas/PIPELINE_SCHEMAS.md
rename to src/schemas/GENERATION_PIPELINE_SCHEMAS.md
index d2b8b868..94f705bd 100644
--- a/src/schemas/PIPELINE_SCHEMAS.md
+++ b/src/schemas/GENERATION_PIPELINE_SCHEMAS.md
@@ -312,16 +312,16 @@ All pipeline outputs include a `metadata` object (represented by the `PipelineMe
 
 **Fields:**
 - `task_id`: String (required)
-- `task`: String (required, the task/problem text from Stage 3)
+- `task`: String (required, the task/problem text)
+- `task_solution`: TaskSolution (required, the full task solution being validated)
 - `verification`: Boolean (required, overall validation status - whether the solution is verified/valid)
 - `feedback`: String (required, detailed feedback on the validation)
-- `task_obj`: Task (required, Task dataclass object with full hierarchy)
 - `score`: Float (optional, validation score, typically 0.0 to 1.0)
 - `generation_metadata`: Dict (optional, nested dictionary containing process-specific information)
   - This field can contain any validation-specific data (e.g., validation method, criteria details, error details)
   - Structure is flexible and depends on the validation method
 
-**Note:** When serialized to JSON, the `task_obj` object is flattened to `capability` (string), `capability_id` (string), `area` (string), `area_id` (string), `domain` (string), and `domain_id` (string) fields.
+**Note:** When serialized to JSON, the `task_solution` object is flattened to include all TaskSolution fields (task_id, task, solution, reasoning, numerical_answer) plus the capability/area/domain hierarchy.
 
 ---
 
diff --git a/src/schemas/README.md b/src/schemas/README.md
index a2bc443e..edbea200 100644
--- a/src/schemas/README.md
+++ b/src/schemas/README.md
@@ -4,8 +4,10 @@ This directory contains standardized schemas for all ACE pipeline stages, ensuri
 
 ## Structure
 
-- **[`PIPELINE_SCHEMAS.md`](PIPELINE_SCHEMAS.md)** - Complete documentation of input/output formats for each stage
-- **Python Dataclasses** - Type-safe data structures for each stage:
+### Generation Pipeline
+
+- **[`GENERATION_PIPELINE_SCHEMAS.md`](GENERATION_PIPELINE_SCHEMAS.md)** - Documentation for generation pipeline stages
+- **Python Dataclasses** - Type-safe data structures:
   - [`experiment_schemas.py`](experiment_schemas.py) - Experiment (Stage 0)
   - [`domain_schemas.py`](domain_schemas.py) - Domain (Stage 0)
   - [`metadata_schemas.py`](metadata_schemas.py) - Common metadata (PipelineMetadata)
@@ -14,8 +16,16 @@ This directory contains standardized schemas for all ACE pipeline stages, ensuri
   - [`task_schemas.py`](task_schemas.py) - Task generation (Stage 3)
   - [`solution_schemas.py`](solution_schemas.py) - Solution generation (Stage 4)
   - [`validation_schemas.py`](validation_schemas.py) - Validation (Stage 5)
-- **I/O Utilities** - Save and load functions:
-  - [`io_utils.py`](io_utils.py) - Functions to save/load all stage outputs (save/load functions for all 7 stage outputs)
+- **I/O Utilities**:
+  - [`io_utils.py`](io_utils.py) - Save/load functions for generation pipeline outputs
+
+### Evaluation Pipeline
+
+- **[`EVALUATION_PIPELINE_SCHEMAS.md`](EVALUATION_PIPELINE_SCHEMAS.md)** - Documentation for evaluation pipeline stages
+- **Python Dataclasses**:
+  - [`eval_schemas.py`](eval_schemas.py) - EvalConfig, EvalDataset, CapabilityScore
+- **I/O Utilities**:
+  - [`eval_io_utils.py`](eval_io_utils.py) - Save/load functions for evaluation pipeline outputs
 
 ## Usage
 
@@ -77,6 +87,8 @@ areas, metadata = load_areas(Path("output/areas.json"))
 
 ## Pipeline Stages
 
+### Generation Pipeline
+
 0. **Experiment Setup** → `Experiment`, `Domain`
 1. **Area Generation** → `Area`
 2. **Capability Generation** → `Capability`
@@ -84,4 +96,12 @@ areas, metadata = load_areas(Path("output/areas.json"))
 4. **Solution Generation** → `TaskSolution`
 5. **Validation** → `ValidationResult`
 
-See [`PIPELINE_SCHEMAS.md`](PIPELINE_SCHEMAS.md) for detailed specifications.
+See [`GENERATION_PIPELINE_SCHEMAS.md`](GENERATION_PIPELINE_SCHEMAS.md) for detailed specifications.
+
+### Evaluation Pipeline
+
+0. **Setup and Dataset Preparation** → `EvalConfig`, `EvalDataset`
+1. **Evaluation Execution** → Inspect AI logs (creates `eval_tag`)
+2. **Score Aggregation** → `CapabilityScore`
+
+See [`EVALUATION_PIPELINE_SCHEMAS.md`](EVALUATION_PIPELINE_SCHEMAS.md) for detailed specifications.
diff --git a/src/schemas/__init__.py b/src/schemas/__init__.py
index 29e46fc9..2811f309 100644
--- a/src/schemas/__init__.py
+++ b/src/schemas/__init__.py
@@ -7,6 +7,23 @@
 from src.schemas.area_schemas import Area
 from src.schemas.capability_schemas import Capability
 from src.schemas.domain_schemas import Domain
+from src.schemas.eval_io_utils import (
+    get_eval_dir,
+    get_experiment_dir,
+    load_capability_scores,
+    load_eval_config,
+    load_eval_dataset,
+    save_capability_scores,
+    save_eval_config,
+    save_eval_dataset,
+)
+
+# Evaluation pipeline schemas
+from src.schemas.eval_schemas import (
+    CapabilityScore,
+    EvalConfig,
+    EvalDataset,
+)
 from src.schemas.experiment_schemas import Experiment
 from src.schemas.io_utils import (
     load_areas,
@@ -46,7 +63,7 @@
     "TaskSolution",
     # Validation schemas
     "ValidationResult",
-    # I/O functions - Save
+    # I/O functions - Save (Generation)
     "save_experiment",
     "save_domain",
     "save_areas",
@@ -54,7 +71,7 @@
     "save_tasks",
     "save_solution",
     "save_validation",
-    # I/O functions - Load
+    # I/O functions - Load (Generation)
     "load_experiment",
     "load_domain",
     "load_areas",
@@ -62,4 +79,19 @@
     "load_tasks",
     "load_solution",
     "load_validation",
+    # Evaluation schemas
+    "EvalConfig",
+    "EvalDataset",
+    "CapabilityScore",
+    # I/O functions - Save (Evaluation)
+    "save_eval_config",
+    "save_eval_dataset",
+    "save_capability_scores",
+    # I/O functions - Load (Evaluation)
+    "load_eval_config",
+    "load_eval_dataset",
+    "load_capability_scores",
+    # Helper functions
+    "get_experiment_dir",
+    "get_eval_dir",
 ]
diff --git a/src/schemas/area_schemas.py b/src/schemas/area_schemas.py
index 311eb8a4..68c5fa8c 100644
--- a/src/schemas/area_schemas.py
+++ b/src/schemas/area_schemas.py
@@ -36,10 +36,12 @@ def to_dict(self):
     @classmethod
     def from_dict(cls, data: dict):
         """Create from dictionary."""
-        domain = Domain(
-            name=data["domain"],
-            domain_id=data["domain_id"],
-            description=data.get("domain_description"),
+        domain = Domain.from_dict(
+            {
+                "name": data["domain"],
+                "domain_id": data["domain_id"],
+                "description": data.get("domain_description"),
+            }
         )
         return cls(
             name=data["name"],
diff --git a/src/schemas/capability_schemas.py b/src/schemas/capability_schemas.py
index 8cfc74c8..7e6e680c 100644
--- a/src/schemas/capability_schemas.py
+++ b/src/schemas/capability_schemas.py
@@ -8,7 +8,6 @@
 from typing import Dict, Optional
 
 from src.schemas.area_schemas import Area
-from src.schemas.domain_schemas import Domain
 
 
 @dataclass
@@ -40,16 +39,15 @@ def to_dict(self):
     @classmethod
     def from_dict(cls, data: dict):
         """Create from dictionary."""
-        domain = Domain(
-            name=data["domain"],
-            domain_id=data["domain_id"],
-            description=data.get("domain_description"),
-        )
-        area = Area(
-            name=data["area"],
-            area_id=data["area_id"],
-            domain=domain,
-            description=data["area_description"],
+        area = Area.from_dict(
+            {
+                "name": data["area"],
+                "area_id": data["area_id"],
+                "description": data["area_description"],
+                "domain": data["domain"],
+                "domain_id": data["domain_id"],
+                "domain_description": data.get("domain_description"),
+            }
         )
         return cls(
             name=data["name"],
diff --git a/src/schemas/eval_io_utils.py b/src/schemas/eval_io_utils.py
new file mode 100644
index 00000000..0aca7c37
--- /dev/null
+++ b/src/schemas/eval_io_utils.py
@@ -0,0 +1,142 @@
+"""I/O utilities for saving and loading evaluation pipeline outputs."""
+
+import json
+from pathlib import Path
+from typing import List, Tuple
+
+from src.schemas.eval_schemas import (
+    CapabilityScore,
+    EvalConfig,
+    EvalDataset,
+)
+from src.schemas.metadata_schemas import PipelineMetadata
+
+
+# Save functions
+
+
+def save_eval_config(
+    config: EvalConfig, metadata: PipelineMetadata, output_path: Path
+) -> None:
+    """Save eval config to JSON file.
+
+    Args:
+        config: EvalConfig dataclass
+        metadata: PipelineMetadata dataclass
+        output_path: Path to save the JSON file
+    """
+    output_path.parent.mkdir(parents=True, exist_ok=True)
+    data = {
+        "metadata": metadata.to_dict(),
+        **config.to_dict(),
+    }
+    with open(output_path, "w", encoding="utf-8") as f:
+        json.dump(data, f, indent=2, ensure_ascii=False)
+
+
+def save_eval_dataset(dataset: EvalDataset, output_path: Path) -> None:
+    """Save eval dataset to JSON file.
+
+    Args:
+        dataset: EvalDataset dataclass
+        output_path: Path to save the JSON file
+    """
+    output_path.parent.mkdir(parents=True, exist_ok=True)
+    with open(output_path, "w", encoding="utf-8") as f:
+        json.dump(dataset.to_dict(), f, indent=2, ensure_ascii=False)
+
+
+def save_capability_scores(scores: List[CapabilityScore], output_path: Path) -> None:
+    """Save capability scores to JSON file.
+
+    Args:
+        scores: List of CapabilityScore dataclasses
+        output_path: Path to save the JSON file
+    """
+    output_path.parent.mkdir(parents=True, exist_ok=True)
+    data = [score.to_dict() for score in scores]
+    with open(output_path, "w", encoding="utf-8") as f:
+        json.dump(data, f, indent=2, ensure_ascii=False)
+
+
+# Load functions
+
+
+def load_eval_config(file_path: Path) -> Tuple[EvalConfig, PipelineMetadata]:
+    """Load eval config from JSON file.
+
+    Args:
+        file_path: Path to the JSON file
+
+    Returns
+    -------
+        Tuple of (EvalConfig, PipelineMetadata)
+    """
+    with open(file_path, "r", encoding="utf-8") as f:
+        data = json.load(f)
+    metadata = PipelineMetadata.from_dict(data["metadata"])
+    # Config fields are at top level (alongside metadata)
+    config_data = {k: v for k, v in data.items() if k != "metadata"}
+    config = EvalConfig.from_dict(config_data)
+    return config, metadata
+
+
+def load_eval_dataset(file_path: Path) -> EvalDataset:
+    """Load eval dataset from JSON file.
+
+    Args:
+        file_path: Path to the JSON file
+
+    Returns
+    -------
+        EvalDataset dataclass
+    """
+    with open(file_path, "r", encoding="utf-8") as f:
+        data = json.load(f)
+    return EvalDataset.from_dict(data)
+
+
+def load_capability_scores(file_path: Path) -> List[CapabilityScore]:
+    """Load capability scores from JSON file.
+
+    Args:
+        file_path: Path to the JSON file
+
+    Returns
+    -------
+        List of CapabilityScore dataclasses
+    """
+    with open(file_path, "r", encoding="utf-8") as f:
+        data = json.load(f)
+    return [CapabilityScore.from_dict(item) for item in data]
+
+
+# Helper functions
+
+
+def get_experiment_dir(output_base_dir: str, experiment_id: str) -> Path:
+    """Get the experiment directory path.
+
+    Args:
+        output_base_dir: Base output directory
+        experiment_id: Experiment identifier
+
+    Returns
+    -------
+        Path to experiment directory
+    """
+    return Path(output_base_dir) / experiment_id
+
+
+def get_eval_dir(experiment_dir: Path, eval_tag: str) -> Path:
+    """Get the eval output directory path.
+
+    Args:
+        experiment_dir: Path to experiment directory
+        eval_tag: Eval tag
+
+    Returns
+    -------
+        Path to eval directory
+    """
+    return experiment_dir / "eval" / eval_tag
diff --git a/src/schemas/eval_schemas.py b/src/schemas/eval_schemas.py
new file mode 100644
index 00000000..8a553926
--- /dev/null
+++ b/src/schemas/eval_schemas.py
@@ -0,0 +1,134 @@
+"""Schemas for evaluation pipeline stages.
+
+Defines dataclasses for evaluation pipeline:
+- EvalConfig: Configuration for evaluation run (Stage 0 output)
+- EvalDataset: Dataset for one capability (Stage 0 output)
+- CapabilityScore: Score for one capability (Stage 2 output)
+"""
+
+from dataclasses import dataclass
+from typing import Any, Dict, List
+
+
+@dataclass
+class EvalConfig:
+    """Configuration for the evaluation run.
+
+    Created by Eval Stage 0 (Eval Setup). Contains all configuration needed
+    to run the evaluation pipeline, including references to generation outputs.
+    """
+
+    experiment_id: str
+    eval_tag: str
+    subject_llms: List[
+        Dict[str, str]
+    ]  # [{"name": "gpt-4o", "provider": "openai"}, ...]
+    judge_llm: Dict[str, str]  # {"name": "gpt-4o-mini", "provider": "openai"}
+    validation_tag: str  # Tag from generation Stage 5
+
+    def to_dict(self) -> Dict[str, Any]:
+        """Convert to dictionary for JSON serialization."""
+        return {
+            "experiment_id": self.experiment_id,
+            "eval_tag": self.eval_tag,
+            "subject_llms": self.subject_llms,
+            "judge_llm": self.judge_llm,
+            "validation_tag": self.validation_tag,
+        }
+
+    @classmethod
+    def from_dict(cls, data: Dict[str, Any]) -> "EvalConfig":
+        """Create from dictionary."""
+        return cls(
+            experiment_id=data["experiment_id"],
+            eval_tag=data["eval_tag"],
+            subject_llms=data["subject_llms"],
+            judge_llm=data["judge_llm"],
+            validation_tag=data["validation_tag"],
+        )
+
+
+@dataclass
+class EvalDataset:
+    """Dataset prepared for Inspect evaluation.
+
+    Created by Eval Stage 0 (Dataset Preparation). Contains all info needed
+    to run Inspect evaluation for one capability.
+    """
+
+    area_id: str
+    capability_id: str
+    capability_name: str
+    domain: str
+    tasks: List[
+        Dict[str, str]
+    ]  # [{"id": "task_000", "input": "...", "target": "..."}, ...]
+    num_tasks: int
+    prompt_template: str
+
+    def to_dict(self) -> Dict[str, Any]:
+        """Convert to dictionary for JSON serialization."""
+        return {
+            "area_id": self.area_id,
+            "capability_id": self.capability_id,
+            "capability_name": self.capability_name,
+            "domain": self.domain,
+            "tasks": self.tasks,
+            "num_tasks": self.num_tasks,
+            "prompt_template": self.prompt_template,
+        }
+
+    @classmethod
+    def from_dict(cls, data: Dict[str, Any]) -> "EvalDataset":
+        """Create from dictionary."""
+        return cls(
+            area_id=data["area_id"],
+            capability_id=data["capability_id"],
+            capability_name=data["capability_name"],
+            domain=data["domain"],
+            tasks=data["tasks"],
+            num_tasks=data["num_tasks"],
+            prompt_template=data["prompt_template"],
+        )
+
+
+@dataclass
+class CapabilityScore:
+    """Score for a single capability from evaluation.
+
+    Created by Eval Stage 2 (Score Aggregation). Represents the evaluation
+    result for one capability with one subject LLM.
+    """
+
+    area_id: str
+    capability_id: str
+    capability_name: str
+    subject_llm: str
+    mean: float  # 0.0 to 1.0
+    std_err: float
+    num_tasks: int
+
+    def to_dict(self) -> Dict[str, Any]:
+        """Convert to dictionary for JSON serialization."""
+        return {
+            "area_id": self.area_id,
+            "capability_id": self.capability_id,
+            "capability_name": self.capability_name,
+            "subject_llm": self.subject_llm,
+            "mean": self.mean,
+            "std_err": self.std_err,
+            "num_tasks": self.num_tasks,
+        }
+
+    @classmethod
+    def from_dict(cls, data: Dict[str, Any]) -> "CapabilityScore":
+        """Create from dictionary."""
+        return cls(
+            area_id=data["area_id"],
+            capability_id=data["capability_id"],
+            capability_name=data["capability_name"],
+            subject_llm=data["subject_llm"],
+            mean=data["mean"],
+            std_err=data["std_err"],
+            num_tasks=data["num_tasks"],
+        )
diff --git a/src/schemas/solution_schemas.py b/src/schemas/solution_schemas.py
index e4547a75..684f6939 100644
--- a/src/schemas/solution_schemas.py
+++ b/src/schemas/solution_schemas.py
@@ -7,9 +7,6 @@
 from dataclasses import dataclass, field
 from typing import Dict, Optional
 
-from src.schemas.area_schemas import Area
-from src.schemas.capability_schemas import Capability
-from src.schemas.domain_schemas import Domain
 from src.schemas.task_schemas import Task
 
 
@@ -50,28 +47,7 @@ def to_dict(self):
     @classmethod
     def from_dict(cls, data: dict):
         """Create from dictionary."""
-        domain = Domain(
-            name=data["domain"],
-            domain_id=data["domain_id"],
-            description=data.get("domain_description"),
-        )
-        area = Area(
-            name=data["area"],
-            area_id=data["area_id"],
-            domain=domain,
-            description=data["area_description"],
-        )
-        capability = Capability(
-            name=data["capability"],
-            capability_id=data["capability_id"],
-            area=area,
-            description=data["capability_description"],
-        )
-        task_obj = Task(
-            task_id=data["task_id"],
-            task=data["task"],
-            capability=capability,
-        )
+        task_obj = Task.from_dict(data)
         return cls(
             task_id=data["task_id"],
             task=data["task"],
diff --git a/src/schemas/task_schemas.py b/src/schemas/task_schemas.py
index 2b67068a..94f84458 100644
--- a/src/schemas/task_schemas.py
+++ b/src/schemas/task_schemas.py
@@ -7,9 +7,7 @@
 from dataclasses import dataclass, field
 from typing import Dict, Optional
 
-from src.schemas.area_schemas import Area
 from src.schemas.capability_schemas import Capability
-from src.schemas.domain_schemas import Domain
 
 
 @dataclass
@@ -42,22 +40,18 @@ def to_dict(self):
     @classmethod
     def from_dict(cls, data: dict):
         """Create from dictionary."""
-        domain = Domain(
-            name=data["domain"],
-            domain_id=data["domain_id"],
-            description=data.get("domain_description"),
-        )
-        area = Area(
-            name=data["area"],
-            area_id=data["area_id"],
-            domain=domain,
-            description=data["area_description"],
-        )
-        capability = Capability(
-            name=data["capability"],
-            capability_id=data["capability_id"],
-            area=area,
-            description=data["capability_description"],
+        capability = Capability.from_dict(
+            {
+                "name": data["capability"],
+                "capability_id": data["capability_id"],
+                "description": data["capability_description"],
+                "area": data["area"],
+                "area_id": data["area_id"],
+                "area_description": data["area_description"],
+                "domain": data["domain"],
+                "domain_id": data["domain_id"],
+                "domain_description": data.get("domain_description"),
+            }
         )
         return cls(
             task_id=data["task_id"],
diff --git a/src/schemas/validation_schemas.py b/src/schemas/validation_schemas.py
index 02ec8eef..d4812072 100644
--- a/src/schemas/validation_schemas.py
+++ b/src/schemas/validation_schemas.py
@@ -7,10 +7,7 @@
 from dataclasses import dataclass, field
 from typing import Dict, Optional
 
-from src.schemas.area_schemas import Area
-from src.schemas.capability_schemas import Capability
-from src.schemas.domain_schemas import Domain
-from src.schemas.task_schemas import Task
+from src.schemas.solution_schemas import TaskSolution
 
 
 @dataclass
@@ -19,28 +16,17 @@ class ValidationResult:
 
     task_id: str
     task: str
+    task_solution: TaskSolution
     verification: bool
     feedback: str
-    task_obj: Task
     score: Optional[float] = None
     generation_metadata: Optional[Dict] = field(default_factory=dict)
 
     def to_dict(self):
         """Convert to dictionary."""
-        result = {
-            "task_id": self.task_id,
-            "task": self.task,
-            "verification": self.verification,
-            "feedback": self.feedback,
-            "capability_id": self.task_obj.capability.capability_id,
-            "capability": self.task_obj.capability.name,
-            "capability_description": self.task_obj.capability.description,
-            "area": self.task_obj.capability.area.name,
-            "area_id": self.task_obj.capability.area.area_id,
-            "area_description": self.task_obj.capability.area.description,
-            "domain": self.task_obj.capability.area.domain.name,
-            "domain_id": self.task_obj.capability.area.domain.domain_id,
-        }
+        result = self.task_solution.to_dict()
+        result["verification"] = self.verification
+        result["feedback"] = self.feedback
         if self.score is not None:
             result["score"] = self.score
         if self.generation_metadata:
@@ -50,34 +36,13 @@ def to_dict(self):
     @classmethod
     def from_dict(cls, data: dict):
         """Create from dictionary."""
-        domain = Domain(
-            name=data["domain"],
-            domain_id=data["domain_id"],
-            description=data.get("domain_description"),
-        )
-        area = Area(
-            name=data["area"],
-            area_id=data["area_id"],
-            domain=domain,
-            description=data["area_description"],
-        )
-        capability = Capability(
-            name=data["capability"],
-            capability_id=data["capability_id"],
-            area=area,
-            description=data["capability_description"],
-        )
-        task_obj = Task(
-            task_id=data["task_id"],
-            task=data["task"],
-            capability=capability,
-        )
+        task_solution = TaskSolution.from_dict(data)
         return cls(
             task_id=data["task_id"],
             task=data["task"],
+            task_solution=task_solution,
             verification=data["verification"],
             feedback=data["feedback"],
-            task_obj=task_obj,
             score=data.get("score"),
             generation_metadata=data.get("generation_metadata", {}),
         )

From e2b5e419b5035a91751c33e661dd8b06773f622d Mon Sep 17 00:00:00 2001
From: Farnaz Kohankhaki <fkohankh8@gmail.com>
Date: Wed, 7 Jan 2026 01:14:00 -0800
Subject: [PATCH 03/10] Fix duplicate capability IDs across generation batches
 and remove the LBO check from check_cfg

---
 src/base_stages/generate_capabilities.py    |  5 ++-
 src/base_stages/validate_tasks.py           |  9 +----
 src/eval_stages/stage0_setup_and_dataset.py | 45 ++++++++++++---------
 src/eval_stages/stage1_eval_execution.py    | 31 +++++++++-----
 src/run_eval_pipeline.py                    | 28 ++++---------
 src/utils/data_utils.py                     |  7 ----
 6 files changed, 61 insertions(+), 64 deletions(-)

diff --git a/src/base_stages/generate_capabilities.py b/src/base_stages/generate_capabilities.py
index 05dbcda2..7b365246 100644
--- a/src/base_stages/generate_capabilities.py
+++ b/src/base_stages/generate_capabilities.py
@@ -52,6 +52,7 @@ def generate_capabilities(
             num_capabilities=min(num_capabilities_per_run, num_capabilities_left),
             client=client,
             prev_capabilities=capabilities,
+            id_offset=len(capabilities),  # Pass offset for unique IDs
         )
         capabilities.extend(run_capabilities)
         num_capabilities_left -= len(run_capabilities)
@@ -64,6 +65,7 @@ def generate_capabilities_using_llm(
     num_capabilities: int,
     client: ChatCompletionClient,
     prev_capabilities: List[Capability],
+    id_offset: int = 0,
 ) -> List[Capability]:
     """Generate capabilities using LLM.
 
@@ -72,6 +74,7 @@ def generate_capabilities_using_llm(
         num_capabilities: Number of capabilities to generate
         client: ChatCompletionClient for API calls
         prev_capabilities: Previously generated capabilities
+        id_offset: Offset for capability IDs to ensure uniqueness across batches
 
     Returns
     -------
@@ -99,7 +102,7 @@ def generate_capabilities_using_llm(
 
     for idx, capability_dict in enumerate(gen_capabilities_dict):
         try:
-            capability_id = f"cap_{idx:03d}"
+            capability_id = f"cap_{(idx + id_offset):03d}"
             capability = Capability(
                 name=capability_dict["name"],
                 capability_id=capability_id,
diff --git a/src/base_stages/validate_tasks.py b/src/base_stages/validate_tasks.py
index 408d2569..09739e03 100644
--- a/src/base_stages/validate_tasks.py
+++ b/src/base_stages/validate_tasks.py
@@ -7,7 +7,6 @@
 
 from src.base_stages.prompts import format_verification_prompt
 from src.schemas.solution_schemas import TaskSolution
-from src.schemas.task_schemas import Task
 from src.schemas.validation_schemas import ValidationResult
 from src.utils.model_client_utils import ModelCallMode, async_call_model
 
@@ -79,18 +78,12 @@ def validate_tasks(
 
             overall_aligned = response.get("overall_verdict", "Fail") == "Pass"
 
-            task = Task(
-                task_id=task_solution.task_id,
-                task=task_solution.task,
-                capability=capability,
-            )
-
             validation_result = ValidationResult(
                 task_id=task_solution.task_id,
                 task=task_solution.task,
+                task_solution=task_solution,
                 verification=overall_aligned,
                 feedback=response.get("explanation", ""),
-                task_obj=task,
                 generation_metadata={
                     "method": "validate_tasks",
                     "subtopic_aligned": response.get("blueprint_alignment", "No")
diff --git a/src/eval_stages/stage0_setup_and_dataset.py b/src/eval_stages/stage0_setup_and_dataset.py
index c67b9657..9f7ba496 100644
--- a/src/eval_stages/stage0_setup_and_dataset.py
+++ b/src/eval_stages/stage0_setup_and_dataset.py
@@ -17,9 +17,11 @@
 from omegaconf import DictConfig
 
 from src.eval_stages.prompts import DEFAULT_EVAL_PROMPT_TEMPLATE
-from src.schemas.eval_io_utils import save_eval_dataset
+from src.schemas.eval_io_utils import save_eval_config, save_eval_dataset
 from src.schemas.eval_schemas import EvalConfig, EvalDataset
+from src.schemas.metadata_schemas import PipelineMetadata
 from src.schemas.validation_schemas import ValidationResult
+from src.utils.timestamp_utils import iso_timestamp
 
 
 logger = logging.getLogger(__name__)
@@ -172,19 +174,16 @@ def _create_eval_dataset(
 def run_eval_stage0(
     cfg: DictConfig,
     validation_tag: str,
-) -> EvalConfig:
+) -> None:
     """Eval Stage 0: Setup and Dataset Preparation.
 
     Validates inputs and creates datasets for evaluation.
+    Saves eval_config.json for Stage 1 to read.
 
     Args:
         cfg: Configuration object
         validation_tag: Tag from generation Stage 5 (required)
 
-    Returns
-    -------
-        EvalConfig object for use in subsequent stages
-
     Raises
     ------
         EvalSetupError: If validation fails
@@ -205,15 +204,6 @@ def run_eval_stage0(
     _validate_inputs(experiment_dir, validation_tag, eval_cfg)
     logger.info("Validation checks passed")
 
-    # Create EvalConfig (no tag yet - that's created in Stage 1)
-    eval_config = EvalConfig(
-        experiment_id=exp_id,
-        eval_tag="",  # Will be set in Stage 1
-        subject_llms=eval_cfg.get("subject_llms"),
-        judge_llm=eval_cfg.get("judge_llm"),
-        validation_tag=validation_tag,
-    )
-
     # Find all validated tasks
     validated_tasks = _find_validated_tasks(experiment_dir, validation_tag)
     logger.info("Found %d validated tasks", len(validated_tasks))
@@ -251,10 +241,29 @@ def run_eval_stage0(
         )
         num_created += 1
 
+    # Create and save EvalConfig (eval_tag will be set in Stage 1)
+    eval_config = EvalConfig(
+        experiment_id=exp_id,
+        eval_tag="",  # Will be set in Stage 1
+        subject_llms=eval_cfg.get("subject_llms"),
+        judge_llm=eval_cfg.get("judge_llm"),
+        validation_tag=validation_tag,
+    )
+
+    metadata = PipelineMetadata(
+        experiment_id=exp_id,
+        output_base_dir=str(output_base_dir),
+        timestamp=iso_timestamp(),
+        input_stage_tag=validation_tag,
+        output_stage_tag="",  # No output tag for Stage 0
+        resume=False,
+    )
+
+    eval_config_path = datasets_dir / "eval_config.json"
+    save_eval_config(eval_config, metadata, eval_config_path)
+
     logger.info(
-        "Eval Stage 0: Created %d datasets in %s",
+        "Eval Stage 0: Created %d datasets, saved eval_config.json to %s",
         num_created,
         datasets_dir,
     )
-
-    return eval_config
diff --git a/src/eval_stages/stage1_eval_execution.py b/src/eval_stages/stage1_eval_execution.py
index a662dc38..143341bd 100644
--- a/src/eval_stages/stage1_eval_execution.py
+++ b/src/eval_stages/stage1_eval_execution.py
@@ -17,8 +17,12 @@
 from inspect_ai.solver import generate
 from omegaconf import DictConfig
 
-from src.schemas.eval_io_utils import load_eval_dataset, save_eval_config
-from src.schemas.eval_schemas import EvalConfig, EvalDataset
+from src.schemas.eval_io_utils import (
+    load_eval_config,
+    load_eval_dataset,
+    save_eval_config,
+)
+from src.schemas.eval_schemas import EvalDataset
 from src.schemas.metadata_schemas import PipelineMetadata
 from src.utils.timestamp_utils import iso_timestamp, timestamp_tag
 
@@ -149,7 +153,7 @@ def _run_inspect_eval(
 
 def run_eval_stage1(
     cfg: DictConfig,
-    eval_config: EvalConfig,
+    validation_tag: str,
 ) -> str:
     """Eval Stage 1: Evaluation Execution.
 
@@ -158,7 +162,7 @@ def run_eval_stage1(
 
     Args:
         cfg: Configuration object
-        eval_config: EvalConfig from Stage 0
+        validation_tag: Tag from generation Stage 5 (required)
 
     Returns
     -------
@@ -168,7 +172,15 @@ def run_eval_stage1(
     exp_id = cfg.exp_cfg.exp_id
     output_base_dir = Path(cfg.global_cfg.output_dir)
     experiment_dir = output_base_dir / exp_id
-    validation_tag = eval_config.validation_tag
+
+    # Load eval_config from Stage 0
+    datasets_dir = experiment_dir / "eval" / "datasets" / validation_tag
+    eval_config_path = datasets_dir / "eval_config.json"
+    if not eval_config_path.exists():
+        raise ValueError(
+            f"eval_config.json not found at {eval_config_path}. Run Stage 0 first."
+        )
+    eval_config, _ = load_eval_config(eval_config_path)
 
     # Create eval_tag for this run
     eval_tag = timestamp_tag()
@@ -179,7 +191,6 @@ def run_eval_stage1(
     )
 
     # Find datasets (saved under validation_tag from Stage 0)
-    datasets_dir = experiment_dir / "eval" / "datasets" / validation_tag
     dataset_paths = _find_datasets(datasets_dir)
     logger.info("Found %d datasets", len(dataset_paths))
 
@@ -193,7 +204,7 @@ def run_eval_stage1(
     eval_dir = experiment_dir / "eval" / "results" / eval_tag
     results_dir = eval_dir
 
-    # Update eval_config with the tag and save it
+    # Update eval_config with the tag and save it to results dir
     eval_config.eval_tag = eval_tag
     metadata = PipelineMetadata(
         experiment_id=exp_id,
@@ -203,9 +214,9 @@ def run_eval_stage1(
         output_stage_tag=eval_tag,
         resume=False,
     )
-    eval_config_path = eval_dir / "eval_config.json"
-    save_eval_config(eval_config, metadata, eval_config_path)
-    logger.info("Saved eval_config.json to %s", eval_config_path)
+    results_config_path = eval_dir / "eval_config.json"
+    save_eval_config(eval_config, metadata, results_config_path)
+    logger.info("Saved eval_config.json to %s", results_config_path)
 
     # Run evaluations
     subject_llms = eval_config.subject_llms
diff --git a/src/run_eval_pipeline.py b/src/run_eval_pipeline.py
index 36485ea7..5b0b0e63 100644
--- a/src/run_eval_pipeline.py
+++ b/src/run_eval_pipeline.py
@@ -1,9 +1,9 @@
 """Evaluation pipeline for running LLM evaluations on generated tasks.
 
 This module orchestrates the evaluation pipeline:
-- Stage 0: Setup and Dataset Preparation (no LLM calls, no tag)
+- Stage 0: Setup and Dataset Preparation
 - Stage 1: Evaluation Execution (runs subject LLMs, creates eval_tag)
-- Stage 2: Score Aggregation (no LLM calls)
+- Stage 2: Score Aggregation
 
 Usage:
     # Run all stages
@@ -16,7 +16,6 @@
 """
 
 import logging
-from pathlib import Path
 
 import hydra
 from omegaconf import DictConfig
@@ -63,12 +62,12 @@ def main(cfg: DictConfig) -> None:
         try:
             # Stage 0: Setup and Dataset Preparation
             logger.info("Running Eval Stage 0: Setup and Dataset Preparation")
-            eval_config = run_eval_stage0(cfg, validation_tag)
+            run_eval_stage0(cfg, validation_tag)
             logger.info("Eval Stage 0 complete.")
 
             # Stage 1: Evaluation Execution
             logger.info("Running Eval Stage 1: Evaluation Execution")
-            eval_tag = run_eval_stage1(cfg, eval_config)
+            eval_tag = run_eval_stage1(cfg, validation_tag)
             logger.info("Eval Stage 1 complete. eval_tag=%s", eval_tag)
 
             # Stage 2: Score Aggregation
@@ -76,16 +75,6 @@ def main(cfg: DictConfig) -> None:
             run_eval_stage2(cfg, eval_tag)
             logger.info("Eval Stage 2 complete.")
 
-            # Get results dir for final message
-            exp_id = cfg.exp_cfg.exp_id
-            output_base_dir = Path(cfg.global_cfg.output_dir)
-            scores_dir = output_base_dir / exp_id / "eval" / "scores" / eval_tag
-
-            logger.info("=" * 60)
-            logger.info("EVALUATION PIPELINE COMPLETE")
-            logger.info("Scores in: %s", scores_dir)
-            logger.info("=" * 60)
-
         except EvalSetupError as e:
             logger.error("Evaluation setup failed: %s", e)
             return
@@ -104,7 +93,7 @@ def main(cfg: DictConfig) -> None:
             return
 
         try:
-            eval_config = run_eval_stage0(cfg, validation_tag)
+            run_eval_stage0(cfg, validation_tag)
             logger.info("Eval Stage 0 complete. Datasets created.")
         except EvalSetupError as e:
             logger.error("Evaluation setup failed: %s", e)
@@ -119,11 +108,10 @@ def main(cfg: DictConfig) -> None:
             return
 
         try:
-            # Run Stage 0 first to get eval_config
-            eval_config = run_eval_stage0(cfg, validation_tag)
-            eval_tag = run_eval_stage1(cfg, eval_config)
+            # Stage 1 reads eval_config from Stage 0's output
+            eval_tag = run_eval_stage1(cfg, validation_tag)
             logger.info("Eval Stage 1 complete. eval_tag=%s", eval_tag)
-        except (EvalSetupError, ValueError) as e:
+        except ValueError as e:
             logger.error("Stage 1 failed: %s", e)
 
     elif stage == 2:
diff --git a/src/utils/data_utils.py b/src/utils/data_utils.py
index 21ced2f5..2d936f92 100644
--- a/src/utils/data_utils.py
+++ b/src/utils/data_utils.py
@@ -266,13 +266,6 @@ def check_cfg(cfg: DictConfig, logger: logging.Logger) -> None:
     additional_c = cfg.capabilities_cfg.num_gen_capabilities_per_run - rem_c
     if rem_c != 0:
         logger.warning(f"{additional_c} additional capabilities might be generated.")
-    if "discover_new" in cfg.lbo_cfg.pipeline_id:
-        assert (
-            cfg.dimensionality_reduction_cfg.discover_new_reduced_dimensionality_method
-            in ["pca", "cut-embedding"]
-        ), (
-            "The dimensionality reduction method must be either 'pca' or 'cut-embedding' when using the discover_new pipelines."
-        )
 
 
 def get_run_id(cfg: DictConfig) -> str:

From 1d3c39c27753ac11eb8745370f43c54dad10e8b7 Mon Sep 17 00:00:00 2001
From: Farnaz Kohankhaki <fkohankh8@gmail.com>
Date: Wed, 7 Jan 2026 01:27:37 -0800
Subject: [PATCH 04/10] Simplify TaskSolution.to_dict by delegating to task_obj.to_dict

---
 src/schemas/solution_schemas.py | 17 +++--------------
 1 file changed, 3 insertions(+), 14 deletions(-)

diff --git a/src/schemas/solution_schemas.py b/src/schemas/solution_schemas.py
index 684f6939..37360e5d 100644
--- a/src/schemas/solution_schemas.py
+++ b/src/schemas/solution_schemas.py
@@ -24,20 +24,9 @@ class TaskSolution:
 
     def to_dict(self):
         """Convert to dictionary."""
-        result = {
-            "task_id": self.task_id,
-            "task": self.task,
-            "solution": self.solution,
-            "reasoning": self.reasoning,
-            "capability_id": self.task_obj.capability.capability_id,
-            "capability": self.task_obj.capability.name,
-            "capability_description": self.task_obj.capability.description,
-            "area": self.task_obj.capability.area.name,
-            "area_id": self.task_obj.capability.area.area_id,
-            "area_description": self.task_obj.capability.area.description,
-            "domain": self.task_obj.capability.area.domain.name,
-            "domain_id": self.task_obj.capability.area.domain.domain_id,
-        }
+        result = self.task_obj.to_dict()
+        result["solution"] = self.solution
+        result["reasoning"] = self.reasoning
         if self.numerical_answer is not None:
             result["numerical_answer"] = self.numerical_answer
         if self.generation_metadata:

From 2c61f7edcea8538c4447dfb34e6c8996c6378df9 Mon Sep 17 00:00:00 2001
From: Farnaz Kohankhaki <fkohankh8@gmail.com>
Date: Tue, 3 Feb 2026 19:50:44 -0800
Subject: [PATCH 05/10] Fix eval pipeline type errors and schema attribute
 access
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

- Fix stage0: task_obj → task, capability.name → capability_name,
  domain.name → domain_name, ts.task → ts.task_statement
- Fix stage1/stage2: Add proper generic type hints (Dict[str, Any])
- Fix eval_schemas.py: Correct stage numbers in docstrings
  (Stage 1/3 → Stage 0/2)
---
 src/eval_stages/stage0_setup_and_dataset.py | 18 +++++++++---------
 src/eval_stages/stage1_eval_execution.py    |  4 ++--
 src/eval_stages/stage2_score_aggregation.py | 12 ++++++------
 src/schemas/eval_schemas.py                 | 10 +++++-----
 4 files changed, 22 insertions(+), 22 deletions(-)

diff --git a/src/eval_stages/stage0_setup_and_dataset.py b/src/eval_stages/stage0_setup_and_dataset.py
index 9f7ba496..9d166bd6 100644
--- a/src/eval_stages/stage0_setup_and_dataset.py
+++ b/src/eval_stages/stage0_setup_and_dataset.py
@@ -12,7 +12,7 @@
 import logging
 from collections import defaultdict
 from pathlib import Path
-from typing import Dict, List, Tuple
+from typing import Any, Dict, List, Tuple
 
 from omegaconf import DictConfig
 
@@ -36,7 +36,7 @@ class EvalSetupError(Exception):
 def _validate_inputs(
     experiment_dir: Path,
     validation_tag: str,
-    eval_cfg: dict,
+    eval_cfg: Dict[str, Any],
 ) -> None:
     """Validate all required inputs exist.
 
@@ -120,8 +120,8 @@ def _group_by_capability(
     grouped = defaultdict(list)
     for _, validation in validated_tasks:
         task_solution = validation.task_solution
-        area_id = task_solution.task_obj.capability.area.area_id
-        cap_id = task_solution.task_obj.capability.capability_id
+        area_id = task_solution.task.capability.area.area_id
+        cap_id = task_solution.task.capability.capability_id
         grouped[(area_id, cap_id)].append(validation)
     return grouped
 
@@ -146,16 +146,16 @@ def _create_eval_dataset(
     """
     # Get capability info from first validation
     first = validations[0]
-    capability = first.task_solution.task_obj.capability
+    capability = first.task_solution.task.capability
 
     # Build tasks list
-    tasks = []
+    tasks: List[Dict[str, str]] = []
     for v in validations:
         ts = v.task_solution
         tasks.append(
             {
                 "id": ts.task_id,
-                "input": ts.task,
+                "input": ts.task_statement,
                 "target": ts.solution,
             }
         )
@@ -163,8 +163,8 @@ def _create_eval_dataset(
     return EvalDataset(
         area_id=area_id,
         capability_id=capability_id,
-        capability_name=capability.name,
-        domain=capability.area.domain.name,
+        capability_name=capability.capability_name,
+        domain=capability.area.domain.domain_name,
         tasks=tasks,
         num_tasks=len(tasks),
         prompt_template=prompt_template,
diff --git a/src/eval_stages/stage1_eval_execution.py b/src/eval_stages/stage1_eval_execution.py
index 143341bd..0eb69a54 100644
--- a/src/eval_stages/stage1_eval_execution.py
+++ b/src/eval_stages/stage1_eval_execution.py
@@ -8,7 +8,7 @@
 
 import logging
 from pathlib import Path
-from typing import List
+from typing import Dict, List
 
 from inspect_ai import Task
 from inspect_ai import eval as inspect_eval
@@ -106,7 +106,7 @@ def _create_inspect_task(
 def _run_inspect_eval(
     dataset: EvalDataset,
     subject_llm: str,
-    judge_llm: dict,
+    judge_llm: Dict[str, str],
     output_dir: Path,
 ) -> bool:
     """Run Inspect evaluation for a single capability/LLM combination.
diff --git a/src/eval_stages/stage2_score_aggregation.py b/src/eval_stages/stage2_score_aggregation.py
index 2855cb61..ecc75b1c 100644
--- a/src/eval_stages/stage2_score_aggregation.py
+++ b/src/eval_stages/stage2_score_aggregation.py
@@ -9,7 +9,7 @@
 import logging
 import math
 from pathlib import Path
-from typing import Dict, List
+from typing import Any, Dict, List
 
 from inspect_ai.log import read_eval_log
 from omegaconf import DictConfig
@@ -50,7 +50,7 @@ def _find_result_dirs(results_dir: Path, subject_llm: str) -> List[Path]:
     return result_dirs
 
 
-def _compute_stats(scores: List[float]) -> Dict:
+def _compute_stats(scores: List[float]) -> Dict[str, Any]:
     """Compute mean and standard error from scores.
 
     Args:
@@ -76,7 +76,7 @@ def _compute_stats(scores: List[float]) -> Dict:
     return {"mean": mean, "std_err": std_err, "num_tasks": n}
 
 
-def _parse_inspect_logs(result_dir: Path) -> Dict:
+def _parse_inspect_logs(result_dir: Path) -> Dict[str, Any]:
     """Parse Inspect logs to extract scores.
 
     Args:
@@ -188,8 +188,8 @@ def run_eval_stage2(
             area_id = result_dir.parent.name
 
             # Get capability info from dataset
-            dataset = dataset_map.get((area_id, cap_id))
-            if not dataset:
+            cap_dataset = dataset_map.get((area_id, cap_id))
+            if cap_dataset is None:
                 logger.warning(
                     "  No dataset found for %s/%s, skipping",
                     area_id,
@@ -204,7 +204,7 @@ def run_eval_stage2(
             score = CapabilityScore(
                 area_id=area_id,
                 capability_id=cap_id,
-                capability_name=dataset.capability_name,
+                capability_name=cap_dataset.capability_name,
                 subject_llm=llm_name,
                 mean=parsed["mean"],
                 std_err=parsed["std_err"],
diff --git a/src/schemas/eval_schemas.py b/src/schemas/eval_schemas.py
index 8a553926..27f06991 100644
--- a/src/schemas/eval_schemas.py
+++ b/src/schemas/eval_schemas.py
@@ -2,8 +2,8 @@
 
 Defines dataclasses for evaluation pipeline:
 - EvalConfig: Configuration for evaluation run (Stage 0 output)
-- EvalDataset: Dataset for one capability (Stage 1 output)
-- CapabilityScore: Score for one capability (Stage 3 output)
+- EvalDataset: Dataset for one capability (Stage 0 output)
+- CapabilityScore: Score for one capability (Stage 2 output)
 """
 
 from dataclasses import dataclass
@@ -52,8 +52,8 @@ def from_dict(cls, data: Dict[str, Any]) -> "EvalConfig":
 class EvalDataset:
     """Dataset prepared for Inspect evaluation.
 
-    Created by Eval Stage 1 (Dataset Preparation). Contains all info needed
-    to run Inspect evaluation for one capability.
+    Created by Eval Stage 0 (Setup and Dataset Preparation). Contains all info
+    needed to run Inspect evaluation for one capability.
     """
 
     area_id: str
@@ -96,7 +96,7 @@ def from_dict(cls, data: Dict[str, Any]) -> "EvalDataset":
 class CapabilityScore:
     """Score for a single capability from evaluation.
 
-    Created by Eval Stage 3 (Score Aggregation). Represents the evaluation
+    Created by Eval Stage 2 (Score Aggregation). Represents the evaluation
     result for one capability with one subject LLM.
     """
 

From 777e688971bfb9ae94d86f3eab01fd35038a3653 Mon Sep 17 00:00:00 2001
From: Farnaz Kohankhaki <fkohankh8@gmail.com>
Date: Wed, 4 Feb 2026 00:46:03 -0800
Subject: [PATCH 06/10] Exclude legacy tests from pytest (broken imports)

---
 pyproject.toml | 2 ++
 1 file changed, 2 insertions(+)

diff --git a/pyproject.toml b/pyproject.toml
index d5055b82..2d3b3ab1 100644
--- a/pyproject.toml
+++ b/pyproject.toml
@@ -177,6 +177,8 @@ env = [
 filterwarnings = [
     "ignore::DeprecationWarning",
 ]
+# Exclude legacy tests (imports are broken after the code was moved). NOTE: setting norecursedirs replaces pytest's default exclusion patterns.
+norecursedirs = ["legacy"]
 
 [tool.coverage]
     [tool.coverage.run]

From f85d00be67405fc0f15e248e8d1a0a24ef32cc9e Mon Sep 17 00:00:00 2001
From: Farnaz Kohankhaki <fkohankh8@gmail.com>
Date: Thu, 5 Feb 2026 12:49:55 -0800
Subject: [PATCH 07/10] clarify eval pipeline outputs and scoring details in
 evaluation pipeline schema

---
 src/schemas/EVALUATION_PIPELINE_SCHEMAS.md | 62 ++++++++++++++++++----
 1 file changed, 51 insertions(+), 11 deletions(-)

diff --git a/src/schemas/EVALUATION_PIPELINE_SCHEMAS.md b/src/schemas/EVALUATION_PIPELINE_SCHEMAS.md
index 4d7d2f87..8df63a81 100644
--- a/src/schemas/EVALUATION_PIPELINE_SCHEMAS.md
+++ b/src/schemas/EVALUATION_PIPELINE_SCHEMAS.md
@@ -14,6 +14,17 @@ The evaluation pipeline consists of three stages:
 
 ---
 
+## Implementation Files
+
+- [`src/run_eval_pipeline.py`](../run_eval_pipeline.py) (pipeline entrypoint)
+- [`src/eval_stages/stage0_setup_and_dataset.py`](../eval_stages/stage0_setup_and_dataset.py)
+- [`src/eval_stages/stage1_eval_execution.py`](../eval_stages/stage1_eval_execution.py)
+- [`src/eval_stages/stage2_score_aggregation.py`](../eval_stages/stage2_score_aggregation.py)
+- [`src/schemas/eval_schemas.py`](eval_schemas.py)
+- [`src/schemas/eval_io_utils.py`](eval_io_utils.py)
+
+---
+
 ## Implementation Approach
 
 **Pipeline Pattern:**
@@ -22,11 +33,14 @@ The evaluation pipeline consists of three stages:
 - **Stage 2**: Deterministic aggregation (uses `eval_tag` from Stage 1)
 
 **Shared Config:**
-The evaluation pipeline uses the **same configuration file** as the generation pipeline (`src/cfg/run_cfg.yaml`), with an evaluation-specific section (`eval_cfg`).
+The evaluation pipeline uses the **same configuration file** as the generation pipeline
+([`src/cfg/run_cfg.yaml`](../cfg/run_cfg.yaml)), with an evaluation-specific section
+(`eval_cfg`).
 
 **Resumability:**
 - **Stage 0**: Idempotent - skips datasets that already exist
-- **Stage 1**: Can resume by skipping completed evaluations (pass same `eval_tag`)
+- **Stage 1**: Creates a fresh `eval_tag` by default; skips any completed evaluations
+  only if you re-run Stage 1 with the same `eval_tag` programmatically
 
 ---
 
@@ -51,13 +65,16 @@ eval_cfg:
 
 ## Naming Conventions
 
-See [GENERATION_PIPELINE_SCHEMAS.md](GENERATION_PIPELINE_SCHEMAS.md) for naming conventions. Tags follow the same format: `_YYYYMMDD_HHMMSS`.
+See [`src/schemas/GENERATION_PIPELINE_SCHEMAS.md`](GENERATION_PIPELINE_SCHEMAS.md) for
+naming conventions. Tags follow the same format: `_YYYYMMDD_HHMMSS`.
 
 ---
 
 ## Directory Structure
 
-Evaluation outputs are stored in an `eval/` subdirectory within the experiment directory (see [GENERATION_PIPELINE_SCHEMAS.md](GENERATION_PIPELINE_SCHEMAS.md) for generation structure):
+Evaluation outputs are stored in an `eval/` subdirectory within the experiment directory
+(see [`src/schemas/GENERATION_PIPELINE_SCHEMAS.md`](GENERATION_PIPELINE_SCHEMAS.md) for
+generation structure):
 
 ```
 <experiment_id>/
@@ -74,6 +91,7 @@ Evaluation outputs are stored in an `eval/` subdirectory within the experiment d
         <subject_llm>/
           <area_id>/
             <capability_id>/               # Inspect logs
+              *.json                       # Inspect log files (per run)
 
     scores/                                # Stage 2 output
       <eval_tag>/
@@ -111,9 +129,10 @@ r0_10x10/
 
 ## Dataclasses
 
-The evaluation pipeline uses 3 dataclasses, plus reuses `PipelineMetadata` from the generation pipeline (see [GENERATION_PIPELINE_SCHEMAS.md](GENERATION_PIPELINE_SCHEMAS.md#pipelinemetadata)).
+The evaluation pipeline uses 3 dataclasses, plus reuses `PipelineMetadata` from the
+generation pipeline (see [`src/schemas/GENERATION_PIPELINE_SCHEMAS.md`](GENERATION_PIPELINE_SCHEMAS.md#pipelinemetadata)).
 
-**File:** `eval_schemas.py` in `src/schemas/`
+**File:** [`src/schemas/eval_schemas.py`](eval_schemas.py)
 
 ### EvalConfig
 
@@ -173,6 +192,9 @@ Validate inputs and convert validated tasks to Inspect-compatible format.
 **Stage Output:** EvalDataset dataclass
 **Save Function:** `save_eval_dataset(dataset: EvalDataset, output_path: Path)`
 
+**Also Saved:** `eval_config.json` (EvalConfig + PipelineMetadata)
+**Path:** `<output_dir>/<experiment_id>/eval/datasets/<validation_tag>/eval_config.json`
+
 **File Path:** `<output_dir>/<experiment_id>/eval/datasets/<validation_tag>/<area_id>/<capability_id>/dataset.json`
 
 ```json
@@ -203,8 +225,12 @@ Run Inspect evaluation for each capability with each subject LLM.
 - **eval_config**: EvalConfig from Stage 0
 
 ### Tag Handling
-- **Creates**: `eval_tag` for this evaluation run
-- **Resume**: If `results/<eval_tag>/<subject_llm>/<area_id>/<capability_id>/` exists with logs, skip
+- **Creates**: `eval_tag` for this evaluation run (generated by `timestamp_tag()` in
+  [`src/utils/timestamp_utils.py`](../utils/timestamp_utils.py))
+- **Resume behavior**: Stage 1 always creates a fresh `eval_tag` via
+  `timestamp_tag()`. The implementation skips any `(subject_llm, area_id, capability_id)`
+  that already has log files under the **current** `eval_tag` directory. This is only
+  relevant if you re-run Stage 1 using the same `eval_tag` programmatically.
 
 ### Output: Inspect logs + `eval_config.json`
 
@@ -212,10 +238,19 @@ Run Inspect evaluation for each capability with each subject LLM.
 
 **File Path:** `<output_dir>/<experiment_id>/eval/results/<eval_tag>/<subject_llm>/<area_id>/<capability_id>/`
 
-The `eval_config.json` is saved to `results/<eval_tag>/eval_config.json` for reference.
+The `eval_config.json` is saved to
+`<output_dir>/<experiment_id>/eval/results/<eval_tag>/eval_config.json` for reference.
 
 **Returns:** `eval_tag` string
 
+### Scoring Details (Per-Task)
+- Each task in `EvalDataset.tasks` becomes an Inspect `Sample` with `id=task["id"]`
+  (see [`src/eval_stages/stage1_eval_execution.py`](../eval_stages/stage1_eval_execution.py)).
+- The judge model scores each sample via `model_graded_fact` during Stage 1.
+- Per-task scores live **only** in the Inspect log JSON files, under
+  `samples[].scores`. These scores are aggregated in Stage 2; there is no separate
+  per-task score file written by this pipeline.
+
 ---
 
 ## Eval Stage 2: Score Aggregation
@@ -233,6 +268,10 @@ Compute final capability scores from raw Inspect results.
 
 **File Path:** `<output_dir>/<experiment_id>/eval/scores/<eval_tag>/<subject_llm>/capability_scores.json`
 
+**Aggregation Note:** Stage 2 reads all Inspect log JSON files under
+`<output_dir>/<experiment_id>/eval/results/<eval_tag>/<subject_llm>/<area_id>/<capability_id>/`
+and aggregates `samples[].scores` into `mean`, `std_err`, and `num_tasks`.
+
 ```json
 [
   {
@@ -277,7 +316,7 @@ python -m src.run_eval_pipeline stage=2 eval_tag=_20251020_143000
 
 ## IO Utilities
 
-The following functions are provided in `src/schemas/eval_io_utils.py`:
+The following functions are provided in [`src/schemas/eval_io_utils.py`](eval_io_utils.py):
 
 ### Save Functions
 - `save_eval_config(config: EvalConfig, metadata: PipelineMetadata, path: Path)`
@@ -290,7 +329,8 @@ The following functions are provided in `src/schemas/eval_io_utils.py`:
 - `load_capability_scores(path: Path) -> List[CapabilityScore]`
 
 ### Helper Functions
-- Use `timestamp_tag()` from `src.utils.timestamp_utils` to generate tags
+- Use `timestamp_tag()` from [`src/utils/timestamp_utils.py`](../utils/timestamp_utils.py)
+  to generate tags
 - `get_experiment_dir(output_base_dir: str, experiment_id: str) -> Path`
 
 ---

From 8d1ac85241c6be943c8e3e58ea108320438e8b9a Mon Sep 17 00:00:00 2001
From: Farnaz Kohankhaki <fkohankh8@gmail.com>
Date: Fri, 6 Feb 2026 01:57:04 -0800
Subject: [PATCH 08/10] fix eval io path + schema docs

---
 src/schemas/EVALUATION_PIPELINE_SCHEMAS.md | 49 +++++++++++-----------
 src/schemas/eval_io_utils.py               |  4 +-
 2 files changed, 26 insertions(+), 27 deletions(-)

diff --git a/src/schemas/EVALUATION_PIPELINE_SCHEMAS.md b/src/schemas/EVALUATION_PIPELINE_SCHEMAS.md
index 8df63a81..5bb727a6 100644
--- a/src/schemas/EVALUATION_PIPELINE_SCHEMAS.md
+++ b/src/schemas/EVALUATION_PIPELINE_SCHEMAS.md
@@ -1,6 +1,6 @@
 # ACE Evaluation Pipeline Standardized Schemas
 
-The **evaluation pipeline** takes the validated tasks and solutions from the generation pipeline and evaluates subject LLMs on them using Inspect AI. It produces capability scores that measure how well each subject LLM performs on each capability.
+The **evaluation pipeline** takes the validated tasks and solutions from the generation pipeline and evaluates subject LLMs on them using [Inspect](https://inspect.aisi.org.uk/). It produces capability scores that measure how well each subject LLM performs on each capability.
 
 This document defines the standardized input and output formats for each stage of the evaluation pipeline. These schemas ensure consistency across different implementations and enable interoperability between pipeline stages.
 
@@ -14,17 +14,6 @@ The evaluation pipeline consists of three stages:
 
 ---
 
-## Implementation Files
-
-- [`src/run_eval_pipeline.py`](../run_eval_pipeline.py) (pipeline entrypoint)
-- [`src/eval_stages/stage0_setup_and_dataset.py`](../eval_stages/stage0_setup_and_dataset.py)
-- [`src/eval_stages/stage1_eval_execution.py`](../eval_stages/stage1_eval_execution.py)
-- [`src/eval_stages/stage2_score_aggregation.py`](../eval_stages/stage2_score_aggregation.py)
-- [`src/schemas/eval_schemas.py`](eval_schemas.py)
-- [`src/schemas/eval_io_utils.py`](eval_io_utils.py)
-
----
-
 ## Implementation Approach
 
 **Pipeline Pattern:**
@@ -39,8 +28,8 @@ The evaluation pipeline uses the **same configuration file** as the generation p
 
 **Resumability:**
 - **Stage 0**: Idempotent - skips datasets that already exist
-- **Stage 1**: Creates a fresh `eval_tag` by default; skips any completed evaluations
-  only if you re-run Stage 1 with the same `eval_tag` programmatically
+- **Stage 1**: Creates a fresh `eval_tag` by default. If you provide an existing
+  `eval_tag`, Stage 1 resumes and skips capability/LLM runs with complete logs.
 
 ---
 
@@ -73,8 +62,7 @@ naming conventions. Tags follow the same format: `_YYYYMMDD_HHMMSS`.
 ## Directory Structure
 
 Evaluation outputs are stored in an `eval/` subdirectory within the experiment directory
-(see [`src/schemas/GENERATION_PIPELINE_SCHEMAS.md`](GENERATION_PIPELINE_SCHEMAS.md) for
-generation structure):
+(see [`src/schemas/GENERATION_PIPELINE_SCHEMAS.md`](GENERATION_PIPELINE_SCHEMAS.md) for generation structure):
 
 ```
 <experiment_id>/
@@ -212,7 +200,7 @@ Validate inputs and convert validated tasks to Inspect-compatible format.
 }
 ```
 
-**Returns:** EvalConfig object (for use in Stage 1)
+**Returns:** None (writes `eval_config.json` for Stage 1)
 
 ---
 
@@ -223,14 +211,15 @@ Run Inspect evaluation for each capability with each subject LLM.
 
 ### Input
 - **eval_config**: EvalConfig from Stage 0
+- **eval_tag** (optional): Existing tag to resume an interrupted Stage 1 run
 
 ### Tag Handling
-- **Creates**: `eval_tag` for this evaluation run (generated by `timestamp_tag()` in
+- **Creates**: New `eval_tag` if none is provided (generated by `timestamp_tag()` in
   [`src/utils/timestamp_utils.py`](../utils/timestamp_utils.py))
-- **Resume behavior**: Stage 1 always creates a fresh `eval_tag` via
-  `timestamp_tag()`. The implementation skips any `(subject_llm, area_id, capability_id)`
-  that already has log files under the **current** `eval_tag` directory. This is only
-  relevant if you re-run Stage 1 using the same `eval_tag` programmatically.
+- **Resume**: If `eval_tag` is provided, Stage 1 writes into that tag and skips
+  `(subject_llm, area_id, capability_id)` combinations that already have complete logs.
+  For incomplete combinations, Stage 1 retries from the best failed or interrupted log
+  via Inspect `eval_retry`; if no retryable log exists, it runs a fresh evaluation.
 
 ### Output: Inspect logs + `eval_config.json`
 
@@ -243,6 +232,13 @@ The `eval_config.json` is saved to
 
 **Returns:** `eval_tag` string
 
+**Completion Criterion:** A capability/LLM run is treated as complete only when
+scored task IDs in the log exactly match expected dataset task IDs for that capability.
+
+**Stage 1 Summary Logs:** At the end of each run, Stage 1 logs the counters
+`completed_this_run`, `skipped_completed`, `resumed`, `failed`, `incomplete`, and
+`total`.
+
 ### Scoring Details (Per-Task)
 - Each task in `EvalDataset.tasks` becomes an Inspect `Sample` with `id=task["id"]`
   (see [`src/eval_stages/stage1_eval_execution.py`](../eval_stages/stage1_eval_execution.py)).
@@ -268,9 +264,9 @@ Compute final capability scores from raw Inspect results.
 
 **File Path:** `<output_dir>/<experiment_id>/eval/scores/<eval_tag>/<subject_llm>/capability_scores.json`
 
-**Aggregation Note:** Stage 2 reads all Inspect log JSON files under
+**Aggregation Note:** Stage 2 reads Inspect log JSON files under
 `<output_dir>/<experiment_id>/eval/results/<eval_tag>/<subject_llm>/<area_id>/<capability_id>/`
-and aggregates `samples[].scores` into `mean`, `std_err`, and `num_tasks`.
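+and aggregates only the single log that best matches the expected task IDs (exact match first, then most matched tasks) into `mean`, `std_err`, and `num_tasks`, which avoids double-counting retries.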
 
 ```json
 [
@@ -305,9 +301,12 @@ python -m src.run_eval_pipeline validation_tag=_20251017_091500
 # Run only Stage 0 (setup + dataset preparation)
 python -m src.run_eval_pipeline stage=0 validation_tag=_20251017_091500
 
-# Run Stage 0 + Stage 1 (setup, datasets, and evaluation)
+# Run only Stage 1 (requires Stage 0 outputs)
 python -m src.run_eval_pipeline stage=1 validation_tag=_20251017_091500
 
+# Resume Stage 1 with an existing eval_tag
+python -m src.run_eval_pipeline stage=1 validation_tag=_20251017_091500 eval_tag=_20251020_143000
+
 # Run only Stage 2 (score aggregation) - requires eval_tag from Stage 1
 python -m src.run_eval_pipeline stage=2 eval_tag=_20251020_143000
 ```
diff --git a/src/schemas/eval_io_utils.py b/src/schemas/eval_io_utils.py
index 0aca7c37..d1a9e343 100644
--- a/src/schemas/eval_io_utils.py
+++ b/src/schemas/eval_io_utils.py
@@ -137,6 +137,6 @@ def get_eval_dir(experiment_dir: Path, eval_tag: str) -> Path:
 
     Returns
     -------
-        Path to eval directory
+        Path to eval Stage 1 results directory
     """
-    return experiment_dir / "eval" / eval_tag
+    return experiment_dir / "eval" / "results" / eval_tag

From 88977c72f39166070f1123840582af26f640906f Mon Sep 17 00:00:00 2001
From: Farnaz Kohankhaki <fkohankh8@gmail.com>
Date: Fri, 6 Feb 2026 01:58:34 -0800
Subject: [PATCH 09/10] add eval_tag to run_cfg for eval stage resume

---
 src/cfg/run_cfg.yaml | 1 +
 1 file changed, 1 insertion(+)

diff --git a/src/cfg/run_cfg.yaml b/src/cfg/run_cfg.yaml
index 337e07ca..d7de581b 100644
--- a/src/cfg/run_cfg.yaml
+++ b/src/cfg/run_cfg.yaml
@@ -47,6 +47,7 @@ capabilities_tag: null    # Stage 2 output tag (required for stage 3 standalone)
 tasks_tag: null           # Stage 3 output tag (required for stage 4 standalone)
 solution_tag: null        # Stage 4 output tag (required for stage 5 standalone)
 validation_tag: null      # Stage 5 output tag (required for eval pipeline)
+eval_tag: null            # Eval Stage 1 output tag (required for Eval Stage 2, optional for Eval Stage 1 resume)
 
 # Stage 1: Area generation
 areas_cfg:

From 6952a5bb4205a60b1ecff74ce2c3a4483cc60472 Mon Sep 17 00:00:00 2001
From: Farnaz Kohankhaki <fkohankh8@gmail.com>
Date: Fri, 6 Feb 2026 02:49:59 -0800
Subject: [PATCH 10/10] fix stage1: strict task-id completion and JSON retry
 resume. fix stage 2 accordingly.

---
 src/eval_stages/stage1_eval_execution.py    | 287 ++++++++++++++------
 src/eval_stages/stage2_score_aggregation.py | 187 ++++++++-----
 src/run_eval_pipeline.py                    |   8 +-
 3 files changed, 330 insertions(+), 152 deletions(-)

diff --git a/src/eval_stages/stage1_eval_execution.py b/src/eval_stages/stage1_eval_execution.py
index 0eb69a54..a2aa08a1 100644
--- a/src/eval_stages/stage1_eval_execution.py
+++ b/src/eval_stages/stage1_eval_execution.py
@@ -1,18 +1,20 @@
 """Eval Stage 1: Evaluation Execution.
 
 This stage runs Inspect AI evaluation for each capability with each subject LLM.
-Creates eval_tag for this evaluation run since this is where LLM calls happen.
+Creates a new eval_tag by default, or reuses a provided eval_tag in resume mode.
 
 See: https://inspect.aisi.org.uk/
 """
 
 import logging
 from pathlib import Path
-from typing import Dict, List
+from typing import Dict, List, Optional, Set
 
 from inspect_ai import Task
 from inspect_ai import eval as inspect_eval
+from inspect_ai import eval_retry as inspect_eval_retry
 from inspect_ai.dataset import MemoryDataset, Sample
+from inspect_ai.log import read_eval_log
 from inspect_ai.scorer import model_graded_fact
 from inspect_ai.solver import generate
 from omegaconf import DictConfig
@@ -31,57 +33,126 @@
 
 
 def _find_datasets(datasets_dir: Path) -> List[Path]:
-    """Find all dataset files.
-
-    Args:
-        datasets_dir: Path to datasets directory
-
-    Returns
-    -------
-        List of paths to dataset.json files
-    """
+    """Return all Stage 0 dataset files."""
     if not datasets_dir.exists():
         return []
-    return list(datasets_dir.rglob("dataset.json"))
+    return sorted(datasets_dir.rglob("dataset.json"))
+
+
+def _find_inspect_logs(result_dir: Path) -> List[Path]:
+    """Find Inspect JSON log files for a capability result directory."""
+    return sorted(result_dir.glob("*.json"))
+
+
+def _score_value_to_float(value: object) -> Optional[float]:
+    """Convert an Inspect score value to float when possible."""
+    if isinstance(value, (int, float)):
+        return float(value)
+
+    if isinstance(value, str):
+        upper = value.strip().upper()
+        if upper == "C":
+            return 1.0
+        if upper == "I":
+            return 0.0
+        try:
+            return float(value)
+        except ValueError:
+            return None
+
+    return None
+
+
+def _scored_sample_ids_from_log(log: object) -> Set[str]:
+    """Return scored sample IDs from a parsed Inspect log object."""
+    samples = getattr(log, "samples", None)
+    if not samples:
+        return set()
+
+    scored_ids: Set[str] = set()
+    for sample in samples:
+        sample_id = str(getattr(sample, "id", ""))
+        sample_scores = getattr(sample, "scores", None)
+        if not sample_id or not sample_scores:
+            continue
+
+        for score_obj in sample_scores.values():
+            if _score_value_to_float(getattr(score_obj, "value", None)) is not None:
+                scored_ids.add(sample_id)
+                break
+
+    return scored_ids
+
+
+def _scored_sample_ids(log_file: Path) -> Set[str]:
+    """Return sample IDs with at least one interpretable score."""
+    try:
+        log = read_eval_log(str(log_file))
+    except Exception:
+        return set()
+    return _scored_sample_ids_from_log(log)
 
 
 def _check_eval_completed(
-    results_dir: Path, subject_llm: str, area_id: str, capability_id: str
+    results_dir: Path,
+    subject_llm: str,
+    area_id: str,
+    capability_id: str,
+    expected_task_ids: Set[str],
 ) -> bool:
-    """Check if evaluation was already completed for this combination.
-
-    Args:
-        results_dir: Path to results directory
-        subject_llm: Subject LLM name
-        area_id: Area identifier
-        capability_id: Capability identifier
-
-    Returns
-    -------
-        True if evaluation results exist
-    """
+    """Return True if scored task IDs exactly match expected task IDs."""
+    if not expected_task_ids:
+        return False
+
     result_dir = results_dir / subject_llm / area_id / capability_id
-    # Check if directory exists and has any log files
     if result_dir.exists():
-        log_files = list(result_dir.glob("*.json"))
-        return len(log_files) > 0
+        for log_file in _find_inspect_logs(result_dir):
+            if _scored_sample_ids(log_file) == expected_task_ids:
+                return True
     return False
 
 
+def _find_retry_log(
+    result_dir: Path,
+    expected_task_ids: Set[str],
+) -> Optional[Path]:
+    """Find the best failed/incomplete log to resume with Inspect eval_retry."""
+    if not result_dir.exists():
+        return None
+
+    candidates: List[tuple[Path, int]] = []
+    for log_file in _find_inspect_logs(result_dir):
+        try:
+            log = read_eval_log(str(log_file))
+        except Exception:
+            continue
+
+        scored_ids = _scored_sample_ids_from_log(log)
+        if scored_ids == expected_task_ids:
+            continue
+
+        status = str(getattr(log, "status", "")).lower()
+        invalidated = bool(getattr(log, "invalidated", False))
+        is_retryable = invalidated or status in {"started", "error", "cancelled"}
+        if is_retryable:
+            matched_expected = len(scored_ids & expected_task_ids)
+            candidates.append((log_file, matched_expected))
+
+    if not candidates:
+        return None
+
+    best_log, _ = max(
+        candidates,
+        key=lambda item: (item[1], item[0].stat().st_mtime, item[0].name),
+    )
+    return best_log
+
+
 def _create_inspect_task(
     dataset: EvalDataset,
     judge_model: str,
 ) -> "Task":
-    """Create an Inspect Task from EvalDataset.
-
-    Args:
-        dataset: EvalDataset with tasks
-        judge_model: Model to use for grading (e.g., "openai/gpt-4o-mini")
-
-    Returns
-    -------
-        Inspect Task object
-    """
+    """Build an Inspect task for one capability dataset."""
     # Create Inspect samples from our dataset
     samples = [
         Sample(
@@ -109,18 +180,7 @@ def _run_inspect_eval(
     judge_llm: Dict[str, str],
     output_dir: Path,
 ) -> bool:
-    """Run Inspect evaluation for a single capability/LLM combination.
-
-    Args:
-        dataset: EvalDataset to evaluate
-        subject_llm: Subject LLM (e.g., "openai/gpt-4o")
-        judge_llm: Judge LLM config dict
-        output_dir: Directory to save Inspect logs
-
-    Returns
-    -------
-        True if evaluation succeeded
-    """
+    """Run a fresh Inspect eval for one capability/LLM pair."""
     # Format model names for Inspect (provider/model)
     judge_model = f"{judge_llm['provider']}/{judge_llm['name']}"
 
@@ -136,6 +196,7 @@ def _run_inspect_eval(
             task,
             model=subject_llm,
             log_dir=str(output_dir),
+            log_format="json",
         )
 
         return True
@@ -151,23 +212,30 @@ def _run_inspect_eval(
         return False
 
 
+def _run_inspect_retry(
+    retry_log_path: Path,
+    output_dir: Path,
+) -> bool:
+    """Run Inspect eval_retry from a prior failed log."""
+    try:
+        output_dir.mkdir(parents=True, exist_ok=True)
+        inspect_eval_retry(
+            str(retry_log_path),
+            log_dir=str(output_dir),
+            log_format="json",
+        )
+        return True
+    except Exception as e:
+        logger.error("Inspect eval_retry failed for %s: %s", retry_log_path, e)
+        return False
+
+
 def run_eval_stage1(
     cfg: DictConfig,
     validation_tag: str,
+    eval_tag: Optional[str] = None,
 ) -> str:
-    """Eval Stage 1: Evaluation Execution.
-
-    Runs Inspect evaluation for each capability with each subject LLM.
-    Creates eval_tag since this is where LLM calls happen.
-
-    Args:
-        cfg: Configuration object
-        validation_tag: Tag from generation Stage 5 (required)
-
-    Returns
-    -------
-        The eval_tag for this evaluation run
-    """
+    """Run Stage 1 evals and return the eval tag."""
     # Derive paths from config
     exp_id = cfg.exp_cfg.exp_id
     output_base_dir = Path(cfg.global_cfg.output_dir)
@@ -182,12 +250,15 @@ def run_eval_stage1(
         )
     eval_config, _ = load_eval_config(eval_config_path)
 
-    # Create eval_tag for this run
-    eval_tag = timestamp_tag()
+    # Create eval_tag for this run (or reuse existing one for resume)
+    is_resume = eval_tag is not None
+    if eval_tag is None:
+        eval_tag = timestamp_tag()
 
     logger.info(
-        "Eval Stage 1: Running evaluations (eval_tag=%s)",
+        "Eval Stage 1: Running evaluations (eval_tag=%s, resume=%s)",
         eval_tag,
+        is_resume,
     )
 
     # Find datasets (saved under validation_tag from Stage 0)
@@ -212,7 +283,7 @@ def run_eval_stage1(
         timestamp=iso_timestamp(),
         input_stage_tag=validation_tag,
         output_stage_tag=eval_tag,
-        resume=False,
+        resume=is_resume,
     )
     results_config_path = eval_dir / "eval_config.json"
     save_eval_config(eval_config, metadata, results_config_path)
@@ -222,10 +293,15 @@ def run_eval_stage1(
     subject_llms = eval_config.subject_llms
     judge_llm = eval_config.judge_llm
 
-    num_evals = 0
+    num_completed_this_run = 0
+    num_skipped_completed = 0
+    num_failed = 0
+    num_incomplete = 0
+    num_resumed = 0
     total_combinations = len(datasets) * len(subject_llms)
 
     for dataset in datasets:
+        expected_task_ids = {str(task["id"]) for task in dataset.tasks}
         for llm_config in subject_llms:
             llm_name = llm_config["name"]
             # Construct full model string: provider/model_name
@@ -233,7 +309,11 @@ def run_eval_stage1(
 
             # Check if already completed (resume)
             if _check_eval_completed(
-                results_dir, llm_name, dataset.area_id, dataset.capability_id
+                results_dir,
+                llm_name,
+                dataset.area_id,
+                dataset.capability_id,
+                expected_task_ids,
             ):
                 logger.info(
                     "  Skipping %s/%s with %s (already completed)",
@@ -241,6 +321,7 @@ def run_eval_stage1(
                     dataset.capability_id,
                     llm_name,
                 )
+                num_skipped_completed += 1
                 continue
 
             # Run evaluation
@@ -248,26 +329,66 @@ def run_eval_stage1(
                 results_dir / llm_name / dataset.area_id / dataset.capability_id
             )
 
-            logger.info(
-                "  Evaluating %s/%s with %s",
-                dataset.area_id,
-                dataset.capability_id,
-                subject_model,
+            retry_log = (
+                _find_retry_log(output_dir, expected_task_ids) if is_resume else None
             )
+            if retry_log is not None:
+                logger.info(
+                    "  Resuming %s/%s with %s from %s",
+                    dataset.area_id,
+                    dataset.capability_id,
+                    subject_model,
+                    retry_log.name,
+                )
+                success = _run_inspect_retry(
+                    retry_log_path=retry_log,
+                    output_dir=output_dir,
+                )
+                num_resumed += 1
+            else:
+                logger.info(
+                    "  Evaluating %s/%s with %s",
+                    dataset.area_id,
+                    dataset.capability_id,
+                    subject_model,
+                )
 
-            success = _run_inspect_eval(
-                dataset=dataset,
-                subject_llm=subject_model,
-                judge_llm=judge_llm,
-                output_dir=output_dir,
-            )
+                success = _run_inspect_eval(
+                    dataset=dataset,
+                    subject_llm=subject_model,
+                    judge_llm=judge_llm,
+                    output_dir=output_dir,
+                )
 
             if success:
-                num_evals += 1
+                if _check_eval_completed(
+                    results_dir,
+                    llm_name,
+                    dataset.area_id,
+                    dataset.capability_id,
+                    expected_task_ids,
+                ):
+                    num_completed_this_run += 1
+                else:
+                    logger.warning(
+                        "  Incomplete evaluation output for %s/%s with %s "
+                        "(task IDs mismatch: missing or extra scored tasks)",
+                        dataset.area_id,
+                        dataset.capability_id,
+                        llm_name,
+                    )
+                    num_incomplete += 1
+            else:
+                num_failed += 1
 
     logger.info(
-        "Eval Stage 1: Completed %d/%d evaluations",
-        num_evals,
+        "Eval Stage 1 summary: completed_this_run=%d skipped_completed=%d "
+        "resumed=%d failed=%d incomplete=%d total=%d",
+        num_completed_this_run,
+        num_skipped_completed,
+        num_resumed,
+        num_failed,
+        num_incomplete,
         total_combinations,
     )
 
diff --git a/src/eval_stages/stage2_score_aggregation.py b/src/eval_stages/stage2_score_aggregation.py
index ecc75b1c..e855dc17 100644
--- a/src/eval_stages/stage2_score_aggregation.py
+++ b/src/eval_stages/stage2_score_aggregation.py
@@ -9,7 +9,7 @@
 import logging
 import math
 from pathlib import Path
-from typing import Any, Dict, List
+from typing import Any, Dict, List, Optional, Set, Tuple
 
 from inspect_ai.log import read_eval_log
 from omegaconf import DictConfig
@@ -26,40 +26,28 @@
 
 
 def _find_result_dirs(results_dir: Path, subject_llm: str) -> List[Path]:
-    """Find all result directories for a subject LLM.
-
-    Args:
-        results_dir: Path to results directory
-        subject_llm: Subject LLM name
-
-    Returns
-    -------
-        List of paths to capability result directories
-    """
+    """Return capability result directories for one subject model."""
     llm_results_dir = results_dir / subject_llm
     if not llm_results_dir.exists():
         return []
 
     # Find all directories with structure: <area_id>/<capability_id>/
     result_dirs = []
-    for area_dir in llm_results_dir.iterdir():
+    for area_dir in sorted(llm_results_dir.iterdir()):
         if area_dir.is_dir():
-            for cap_dir in area_dir.iterdir():
+            for cap_dir in sorted(area_dir.iterdir()):
                 if cap_dir.is_dir():
                     result_dirs.append(cap_dir)
     return result_dirs
 
 
-def _compute_stats(scores: List[float]) -> Dict[str, Any]:
-    """Compute mean and standard error from scores.
+def _find_inspect_logs(result_dir: Path) -> List[Path]:
+    """Find Inspect JSON log files for a capability result directory."""
+    return sorted(result_dir.glob("*.json"))
 
-    Args:
-        scores: List of score values (0.0 to 1.0)
 
-    Returns
-    -------
-        Dict with 'mean', 'std_err', 'num_tasks'
-    """
+def _compute_stats(scores: List[float]) -> Dict[str, Any]:
+    """Compute mean, standard error, and sample count."""
     if not scores:
         return {"mean": 0.0, "std_err": 0.0, "num_tasks": 0}
 
@@ -76,69 +64,107 @@ def _compute_stats(scores: List[float]) -> Dict[str, Any]:
     return {"mean": mean, "std_err": std_err, "num_tasks": n}
 
 
-def _parse_inspect_logs(result_dir: Path) -> Dict[str, Any]:
-    """Parse Inspect logs to extract scores.
+def _score_value_to_float(value: object) -> Optional[float]:
+    """Convert a score value to float when possible."""
+    if isinstance(value, (int, float)):
+        return float(value)
+
+    if isinstance(value, str):
+        upper = value.strip().upper()
+        if upper == "C":
+            return 1.0
+        if upper == "I":
+            return 0.0
+        try:
+            return float(value)
+        except ValueError:
+            return None
+
+    return None
+
 
-    Args:
-        result_dir: Path to capability result directory
+def _extract_scores_from_log(log_file: Path) -> Dict[str, float]:
+    """Extract one score per sample ID from a single Inspect log file."""
+    scores: Dict[str, float] = {}
+    log = read_eval_log(str(log_file))
 
-    Returns
-    -------
-        Dict with 'mean', 'std_err', 'num_tasks'
-    """
-    # Find Inspect log files (they have .json extension)
-    log_files = list(result_dir.glob("*.json"))
+    if not log.samples:
+        return scores
+
+    for sample in log.samples:
+        sample_id = str(getattr(sample, "id", ""))
+        if not sample_id or not sample.scores:
+            continue
+
+        # Count at most one score per sample to avoid duplicating across scorers.
+        for score_obj in sample.scores.values():
+            score_value = _score_value_to_float(getattr(score_obj, "value", None))
+            if score_value is not None:
+                scores[sample_id] = score_value
+                break
+
+    return scores
+
+
+def _parse_inspect_logs(
+    result_dir: Path, expected_task_ids: Set[str]
+) -> Dict[str, Any]:
+    """Parse logs and return stats for the best-matching retry log."""
+    # Find Inspect log files (.json)
+    log_files = _find_inspect_logs(result_dir)
 
     if not log_files:
         logger.warning("No log files found in %s", result_dir)
         return {"mean": 0.0, "std_err": 0.0, "num_tasks": 0}
 
-    scores = []
-
+    log_scores: List[Tuple[Path, List[float], Set[str]]] = []
     for log_file in log_files:
         try:
-            log = read_eval_log(str(log_file))
-
-            # Extract scores from samples
-            # In Inspect AI 0.3.159+, sample.scores is dict[str, Score] | None
-            if log.samples:
-                for sample in log.samples:
-                    if sample.scores:
-                        # Iterate over all scorers (usually just one)
-                        for _scorer_name, score_obj in sample.scores.items():
-                            if score_obj.value is not None:
-                                # Score value can be numeric or string
-                                score_val = score_obj.value
-                                if isinstance(score_val, (int, float)):
-                                    scores.append(float(score_val))
-                                elif score_val == "C":  # Correct
-                                    scores.append(1.0)
-                                elif score_val == "I":  # Incorrect
-                                    scores.append(0.0)
-
+            scored_by_id = _extract_scores_from_log(log_file)
+            scored_ids = set(scored_by_id.keys())
+            matched_scores = [
+                scored_by_id[task_id]
+                for task_id in expected_task_ids
+                if task_id in scored_by_id
+            ]
+            log_scores.append((log_file, matched_scores, scored_ids))
         except Exception as e:
             logger.warning("Failed to parse log %s: %s", log_file, e)
             continue
 
-    return _compute_stats(scores)
+    if not log_scores:
+        return {"mean": 0.0, "std_err": 0.0, "num_tasks": 0, "exact_match": False}
+
+    # If multiple logs exist, prefer exact task-id match, then best coverage.
+    # This avoids double-counting retries in the same capability directory.
+    selected_log, selected_scores, selected_ids = max(
+        log_scores,
+        key=lambda x: (
+            x[2] == expected_task_ids,
+            len(x[1]),
+            x[0].stat().st_mtime,
+            x[0].name,
+        ),
+    )
+
+    if len(log_scores) > 1:
+        logger.info(
+            "Multiple logs found in %s; selected %s with %d scored samples",
+            result_dir,
+            selected_log.name,
+            len(selected_scores),
+        )
+
+    stats = _compute_stats(selected_scores)
+    stats["exact_match"] = selected_ids == expected_task_ids
+    return stats
 
 
 def run_eval_stage2(
     cfg: DictConfig,
     eval_tag: str,
 ) -> str:
-    """Eval Stage 2: Score Aggregation.
-
-    Computes final capability scores from raw Inspect results.
-
-    Args:
-        cfg: Configuration object
-        eval_tag: Tag from Eval Stage 1
-
-    Returns
-    -------
-        The eval_tag (same as input, for chaining)
-    """
+    """Run Stage 2 score aggregation and return eval_tag."""
     # Derive paths from config
     exp_id = cfg.exp_cfg.exp_id
     output_base_dir = Path(cfg.global_cfg.output_dir)
@@ -163,10 +189,15 @@ def run_eval_stage2(
 
     # Load datasets for capability info
     dataset_map = {}  # (area_id, cap_id) -> EvalDataset
-    for dataset_path in datasets_dir.rglob("dataset.json"):
+    for dataset_path in sorted(datasets_dir.rglob("dataset.json")):
         dataset = load_eval_dataset(dataset_path)
         dataset_map[(dataset.area_id, dataset.capability_id)] = dataset
 
+    if not dataset_map:
+        raise ValueError(
+            f"No datasets found in {datasets_dir}. Run Eval Stage 0 first."
+        )
+
     num_llms_processed = 0
 
     for llm_config in eval_config.subject_llms:
@@ -197,8 +228,28 @@ def run_eval_stage2(
                 )
                 continue
 
+            expected_task_ids = {str(task["id"]) for task in cap_dataset.tasks}
+
             # Parse Inspect logs
-            parsed = _parse_inspect_logs(result_dir)
+            parsed = _parse_inspect_logs(result_dir, expected_task_ids)
+
+            if parsed["num_tasks"] < cap_dataset.num_tasks:
+                logger.warning(
+                    "  Incomplete scoring for %s/%s with %s: %d/%d tasks scored",
+                    area_id,
+                    cap_id,
+                    llm_name,
+                    parsed["num_tasks"],
+                    cap_dataset.num_tasks,
+                )
+            elif not parsed.get("exact_match", False):
+                logger.warning(
+                    "  Task ID mismatch for %s/%s with %s "
+                    "(scored task IDs differ from dataset task IDs)",
+                    area_id,
+                    cap_id,
+                    llm_name,
+                )
 
             # Create CapabilityScore
             score = CapabilityScore(
@@ -212,6 +263,8 @@ def run_eval_stage2(
             )
             capability_scores.append(score)
 
+        capability_scores.sort(key=lambda s: (s.area_id, s.capability_id))
+
         # Save scores for this LLM
         if capability_scores:
             scores_path = scores_dir / llm_name / "capability_scores.json"
diff --git a/src/run_eval_pipeline.py b/src/run_eval_pipeline.py
index 5b0b0e63..541a762e 100644
--- a/src/run_eval_pipeline.py
+++ b/src/run_eval_pipeline.py
@@ -12,6 +12,8 @@
     # Run specific stage
     python -m src.run_eval_pipeline stage=0 validation_tag=_YYYYMMDD_HHMMSS
     python -m src.run_eval_pipeline stage=1 validation_tag=_YYYYMMDD_HHMMSS
+    python -m src.run_eval_pipeline stage=1 validation_tag=_YYYYMMDD_HHMMSS \
+        eval_tag=_YYYYMMDD_HHMMSS
     python -m src.run_eval_pipeline stage=2 eval_tag=_YYYYMMDD_HHMMSS
 """
 
@@ -36,6 +38,8 @@ def main(cfg: DictConfig) -> None:
     """Run the evaluation pipeline."""
     # Get stage to run (default: "all")
     stage = cfg.get("stage", "all")
+    if isinstance(stage, str) and stage.isdigit():
+        stage = int(stage)
 
     # Get tags from config
     validation_tag = cfg.get("validation_tag")
@@ -67,7 +71,7 @@ def main(cfg: DictConfig) -> None:
 
             # Stage 1: Evaluation Execution
             logger.info("Running Eval Stage 1: Evaluation Execution")
-            eval_tag = run_eval_stage1(cfg, validation_tag)
+            eval_tag = run_eval_stage1(cfg, validation_tag, eval_tag)
             logger.info("Eval Stage 1 complete. eval_tag=%s", eval_tag)
 
             # Stage 2: Score Aggregation
@@ -109,7 +113,7 @@ def main(cfg: DictConfig) -> None:
 
         try:
             # Stage 1 reads eval_config from Stage 0's output
-            eval_tag = run_eval_stage1(cfg, validation_tag)
+            eval_tag = run_eval_stage1(cfg, validation_tag, eval_tag)
             logger.info("Eval Stage 1 complete. eval_tag=%s", eval_tag)
         except ValueError as e:
             logger.error("Stage 1 failed: %s", e)