diff --git a/legacy/README.md b/legacy/README.md
new file mode 100644
index 00000000..5439c779
--- /dev/null
+++ b/legacy/README.md
@@ -0,0 +1,9 @@
+# Legacy LBO Code
+
+This directory contains legacy code for **Latent Bayesian Optimization (LBO)** from an earlier version of the repository. LBO was used for intelligent capability selection during evaluation.
+
+## Compatible Version
+
+This LBO code is compatible with the repository at commit [`a224c5ec`](https://github.com/VectorInstitute/automated_capability_evaluation/tree/a224c5ec7dd208e04ef2edc059e6e7a2d0d4bcf6). That commit contains the full working version of the codebase used for the **initial paper submission**.
+
+**This code does not work with the current codebase.** It predates the standardization of the generation and evaluation pipelines. If you need to understand how LBO integrated with the rest of the system, refer to the commit linked under "Compatible Version" above.
diff --git a/example_scripts/example_cfg/plot_lbo_results_cfg.yaml b/legacy/example_scripts/example_cfg/plot_lbo_results_cfg.yaml
similarity index 100%
rename from example_scripts/example_cfg/plot_lbo_results_cfg.yaml
rename to legacy/example_scripts/example_cfg/plot_lbo_results_cfg.yaml
diff --git a/example_scripts/plot_lbo_results.py b/legacy/example_scripts/plot_lbo_results.py
similarity index 100%
rename from example_scripts/plot_lbo_results.py
rename to legacy/example_scripts/plot_lbo_results.py
diff --git a/src/lbo.py b/legacy/src/lbo.py
similarity index 100%
rename from src/lbo.py
rename to legacy/src/lbo.py
diff --git a/src/run_lbo.py b/legacy/src/run_lbo.py
similarity index 100%
rename from src/run_lbo.py
rename to legacy/src/run_lbo.py
diff --git a/src/utils/capability_discovery_utils.py b/legacy/src/utils/capability_discovery_utils.py
similarity index 100%
rename from src/utils/capability_discovery_utils.py
rename to legacy/src/utils/capability_discovery_utils.py
diff --git a/src/utils/lbo_utils.py b/legacy/src/utils/lbo_utils.py
similarity index 100%
rename from src/utils/lbo_utils.py
rename to legacy/src/utils/lbo_utils.py
diff --git a/tests/src/test_lbo.py b/legacy/tests/test_lbo.py
similarity index 100%
rename from tests/src/test_lbo.py
rename to legacy/tests/test_lbo.py
diff --git a/tests/src/test_lbo_utils.py b/legacy/tests/test_lbo_utils.py
similarity index 100%
rename from tests/src/test_lbo_utils.py
rename to legacy/tests/test_lbo_utils.py
diff --git a/poetry.lock b/poetry.lock
index dceb53a5..b6b06d04 100644
--- a/poetry.lock
+++ b/poetry.lock
@@ -1,4 +1,4 @@
-# This file is automatically @generated by Poetry 2.1.2 and should not be changed by hand.
+# This file is automatically @generated by Poetry 2.2.1 and should not be changed by hand.
 
 [[package]]
 name = "ag2"
@@ -84,6 +84,26 @@ websockets = ["websockets (>=14.0,<16)"]
 websurfer = ["beautifulsoup4", "markdownify", "pathvalidate", "pdfminer-six"]
 wikipedia = ["wikipedia-api (>=0.8.1,<1.0)"]
 
+[[package]]
+name = "aioboto3"
+version = "15.2.0"
+description = "Async boto3 wrapper"
+optional = false
+python-versions = ">=3.9"
+groups = ["main"]
+files = [
+    {file = "aioboto3-15.2.0-py3-none-any.whl", hash = "sha256:3582f033543ee7671ae27b1df538f2095bcc91be4a3a78e7498b5ce6c654f26b"},
+    {file = "aioboto3-15.2.0.tar.gz", hash = "sha256:6a151ee0aa0f4b9af6031e6446f28460991fcc50a4ac54a1650d145319d5e2e5"},
+]
+
+[package.dependencies]
+aiobotocore = {version = "2.24.2", extras = ["boto3"]}
+aiofiles = ">=23.2.1"
+
+[package.extras]
+chalice = ["chalice (>=1.24.0)"]
+s3cse = ["cryptography (>=44.0.1)"]
+
 [[package]]
 name = "aiobotocore"
 version = "2.24.2"
@@ -99,6 +119,7 @@ files = [
 [package.dependencies]
 aiohttp = ">=3.9.2,<4.0.0"
 aioitertools = ">=0.5.1,<1.0.0"
+boto3 = {version = ">=1.40.15,<1.40.19", optional = true, markers = "extra == \"boto3\""}
 botocore = ">=1.40.15,<1.40.19"
 jmespath = ">=0.7.1,<2.0.0"
 multidict = ">=6.0.0,<7.0.0"
@@ -1758,6 +1779,107 @@ type1 = ["xattr ; sys_platform == \"darwin\""]
 unicode = ["unicodedata2 (>=15.1.0) ; python_version <= \"3.12\""]
 woff = ["brotli (>=1.0.1) ; platform_python_implementation == \"CPython\"", "brotlicffi (>=0.8.0) ; platform_python_implementation != \"CPython\"", "zopfli (>=0.1.4)"]
 
+[[package]]
+name = "frozendict"
+version = "2.4.7"
+description = "A simple immutable dictionary"
+optional = false
+python-versions = ">=3.6"
+groups = ["main"]
+files = [
+    {file = "frozendict-2.4.7-cp310-cp310-macosx_10_9_universal2.whl", hash = "sha256:bd37c087a538944652363cfd77fb7abe8100cc1f48afea0b88b38bf0f469c3d2"},
+    {file = "frozendict-2.4.7-cp310-cp310-macosx_10_9_x86_64.whl", hash = "sha256:2b96f224a5431889f04b2bc99c0e9abe285679464273ead83d7d7f2a15907d35"},
+    {file = "frozendict-2.4.7-cp310-cp310-macosx_11_0_arm64.whl", hash = "sha256:5c1781f28c4bbb177644b3cb6d5cf7da59be374b02d91cdde68d1d5ef32e046b"},
+    {file = "frozendict-2.4.7-cp310-cp310-manylinux1_i686.manylinux_2_28_i686.manylinux_2_5_i686.whl", hash = "sha256:8a06f6c3d3b8d487226fdde93f621e04a54faecc5bf5d9b16497b8f9ead0ac3e"},
+    {file = "frozendict-2.4.7-cp310-cp310-manylinux2014_aarch64.manylinux_2_17_aarch64.manylinux_2_28_aarch64.whl", hash = "sha256:b809d1c861436a75b2b015dbfd94f6154fa4e7cb0a70e389df1d5f6246b21d1e"},
+    {file = "frozendict-2.4.7-cp310-cp310-manylinux2014_armv7l.manylinux_2_17_armv7l.manylinux_2_31_armv7l.whl", hash = "sha256:75eefdf257a84ea73d553eb80d0abbff0af4c9df62529e4600fd3f96ff17eeb3"},
+    {file = "frozendict-2.4.7-cp310-cp310-manylinux2014_ppc64le.manylinux_2_17_ppc64le.manylinux_2_28_ppc64le.whl", hash = "sha256:a4d2b27d8156922c9739dd2ff4f3934716e17cfd1cf6fb61aa17af7d378555e9"},
+    {file = "frozendict-2.4.7-cp310-cp310-manylinux2014_s390x.manylinux_2_17_s390x.manylinux_2_28_s390x.whl", hash = "sha256:2ebd953c41408acfb8041ff9e6c3519c09988fb7e007df7ab6b56e229029d788"},
+    {file = "frozendict-2.4.7-cp310-cp310-manylinux2014_x86_64.manylinux_2_17_x86_64.manylinux_2_28_x86_64.whl", hash = "sha256:4c64d34b802912ee6d107936e970b90750385a1fdfd38d310098b2918ba4cbf2"},
+    {file = "frozendict-2.4.7-cp310-cp310-manylinux_2_31_riscv64.manylinux_2_39_riscv64.whl", hash = "sha256:294a7d7d51dd979021a8691b46aedf9bd4a594ce3ed33a4bdf0a712d6929d712"},
+    {file = "frozendict-2.4.7-cp310-cp310-musllinux_1_2_aarch64.whl", hash = "sha256:f65d1b90e9ddc791ea82ef91a9ae0ab27ef6c0cfa88fadfa0e5ca5a22f8fa22f"},
+    {file = "frozendict-2.4.7-cp310-cp310-musllinux_1_2_armv7l.whl", hash = "sha256:82d5272d08451bcef6fb6235a0a04cf1816b6b6815cec76be5ace1de17e0c1a4"},
+    {file = "frozendict-2.4.7-cp310-cp310-musllinux_1_2_i686.whl", hash = "sha256:5943c3f683d3f32036f6ca975e920e383d85add1857eee547742de9c1f283716"},
+    {file = "frozendict-2.4.7-cp310-cp310-musllinux_1_2_ppc64le.whl", hash = "sha256:88c6bea948da03087035bb9ca9625305d70e084aa33f11e17048cb7dda4ca293"},
+    {file = "frozendict-2.4.7-cp310-cp310-musllinux_1_2_riscv64.whl", hash = "sha256:ffd1a9f9babec9119712e76a39397d8aa0d72ef8c4ccad917c6175d7e7f81b74"},
+    {file = "frozendict-2.4.7-cp310-cp310-musllinux_1_2_s390x.whl", hash = "sha256:0ff6f57854cc8aa8b30947ec005f9246d96e795a78b21441614e85d39b708822"},
+    {file = "frozendict-2.4.7-cp310-cp310-musllinux_1_2_x86_64.whl", hash = "sha256:d774df483c12d6cba896eb9a1337bbc5ad3f564eb18cfaaee3e95fb4402f2a86"},
+    {file = "frozendict-2.4.7-cp310-cp310-win32.whl", hash = "sha256:a10d38fa300f6bef230fae1fdb4bc98706b78c8a3a2f3140fde748469ef3cfe8"},
+    {file = "frozendict-2.4.7-cp310-cp310-win_amd64.whl", hash = "sha256:dd518f300e5eb6a8827bee380f2e1a31c01dc0af069b13abdecd4e5769bd8a97"},
+    {file = "frozendict-2.4.7-cp310-cp310-win_arm64.whl", hash = "sha256:3842cfc2d69df5b9978f2e881b7678a282dbdd6846b11b5159f910bc633cbe4f"},
+    {file = "frozendict-2.4.7-cp36-cp36m-macosx_10_9_x86_64.whl", hash = "sha256:735be62d757e1e7e496ccb6401efe82b473faa653e95eec0826cd7819a29a34c"},
+    {file = "frozendict-2.4.7-cp36-cp36m-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:fff8584e3bbdc5c1713cd016fbf4b88babfffd4e5e89b39020f2a208dd24c900"},
+    {file = "frozendict-2.4.7-cp36-cp36m-manylinux_2_17_armv7l.manylinux2014_armv7l.manylinux_2_31_armv7l.whl", hash = "sha256:91a06ee46b3e3ef3b237046b914c0c905eab9fdfeac677e9b51473b482e24c28"},
+    {file = "frozendict-2.4.7-cp36-cp36m-manylinux_2_17_ppc64le.manylinux2014_ppc64le.whl", hash = "sha256:fd7ba56cf6340c732ecb78787c4e9600c4bd01372af7313ded21037126d33ec6"},
+    {file = "frozendict-2.4.7-cp36-cp36m-manylinux_2_17_s390x.manylinux2014_s390x.whl", hash = "sha256:d1b4426457757c30ad86b57cdbcc0adaa328399f1ec3d231a0a2ce7447248987"},
+    {file = "frozendict-2.4.7-cp36-cp36m-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:b22d337c76b765cb7961d4ee47fe29f89e30921eb47bf856b14dc7641f4df3e5"},
+    {file = "frozendict-2.4.7-cp36-cp36m-manylinux_2_5_i686.manylinux1_i686.manylinux_2_17_i686.manylinux2014_i686.whl", hash = "sha256:57134ef5df1dd32229c148c75a7b89245dbdb89966a155d6dfd4bda653e8c7af"},
+    {file = "frozendict-2.4.7-cp36-cp36m-musllinux_1_2_aarch64.whl", hash = "sha256:c89617a784e1c24a31f5aa4809402f8072a26b64ddbc437897f6391ff69b0ee9"},
+    {file = "frozendict-2.4.7-cp36-cp36m-musllinux_1_2_armv7l.whl", hash = "sha256:176dd384dfe1d0d79449e05f67764c57c6f0f3095378bf00deb33165d5d2df5b"},
+    {file = "frozendict-2.4.7-cp36-cp36m-musllinux_1_2_i686.whl", hash = "sha256:b1a94e8935c69ae30043b465af496f447950f2c03660aee8657074084faae0b3"},
+    {file = "frozendict-2.4.7-cp36-cp36m-musllinux_1_2_ppc64le.whl", hash = "sha256:c570649ceccfa5e11ad9351e9009dc484c315a51a56aa02ced07ae97644bb7aa"},
+    {file = "frozendict-2.4.7-cp36-cp36m-musllinux_1_2_s390x.whl", hash = "sha256:e0d450c9d444befe2668bf9386ac2945a2f38152248d58f6b3feea63db59ba08"},
+    {file = "frozendict-2.4.7-cp36-cp36m-musllinux_1_2_x86_64.whl", hash = "sha256:7469912c1a04102457871ff675aebe600dbb7e79a6450a166cc8079b88f6ca79"},
+    {file = "frozendict-2.4.7-cp36-cp36m-win32.whl", hash = "sha256:2808bab8e21887a8c106cca5f6f0ab5bda7ee81e159409a10f53d57542ccd99c"},
+    {file = "frozendict-2.4.7-cp36-cp36m-win_amd64.whl", hash = "sha256:ca17ac727ffeeba6c46f5a88e0284a7cb1520fb03127645fcdd7041080adf849"},
+    {file = "frozendict-2.4.7-cp37-cp37m-macosx_10_9_x86_64.whl", hash = "sha256:8ef11dd996208c5a96eab0683f7a17cb4b992948464d2498520efd75a10a2aac"},
+    {file = "frozendict-2.4.7-cp37-cp37m-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:b960e700dc95faca7dd6919d0dce183ef89bfe01554d323cf5de7331a2e80f83"},
+    {file = "frozendict-2.4.7-cp37-cp37m-manylinux_2_17_armv7l.manylinux2014_armv7l.manylinux_2_31_armv7l.whl", hash = "sha256:fc43257a06e6117da6a8a0779243b974cdb9205fed82e32eb669f6746c75d27d"},
+    {file = "frozendict-2.4.7-cp37-cp37m-manylinux_2_17_ppc64le.manylinux2014_ppc64le.whl", hash = "sha256:0ece525da7d0aa3eb56c3e479f30612028d545081c15450d67d771a303ee7d4c"},
+    {file = "frozendict-2.4.7-cp37-cp37m-manylinux_2_17_s390x.manylinux2014_s390x.whl", hash = "sha256:7ddffe7c0b3be414f88185e212758989c65b497315781290eb029e2c1e1fd64e"},
+    {file = "frozendict-2.4.7-cp37-cp37m-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:05dd27415f913cd11649009f53d97eb565ce7b76787d7869c4733738c10e8d27"},
+    {file = "frozendict-2.4.7-cp37-cp37m-manylinux_2_5_i686.manylinux1_i686.manylinux_2_17_i686.manylinux2014_i686.whl", hash = "sha256:0664092614d2b9d0aa404731f33ad5459a54fe8dab9d1fd45aa714fa6de4d0ef"},
+    {file = "frozendict-2.4.7-cp37-cp37m-musllinux_1_2_aarch64.whl", hash = "sha256:830d181781bb263c9fa430b81f82c867546f5dcb368e73931c8591f533a04afb"},
+    {file = "frozendict-2.4.7-cp37-cp37m-musllinux_1_2_armv7l.whl", hash = "sha256:c93827e0854393cd904b927ceb529afc17776706f5b9e45c7eaf6a40b3fc7b25"},
+    {file = "frozendict-2.4.7-cp37-cp37m-musllinux_1_2_i686.whl", hash = "sha256:6d30dbba6eb1497c695f3108c2c292807e7a237c67a1b9ff92c04e89969d22d1"},
+    {file = "frozendict-2.4.7-cp37-cp37m-musllinux_1_2_ppc64le.whl", hash = "sha256:ec846bde66b75d68518c7b24a0a46d09db0aee5a6aefd2209d9901faf6e9df21"},
+    {file = "frozendict-2.4.7-cp37-cp37m-musllinux_1_2_s390x.whl", hash = "sha256:1df8e22f7d24172c08434b10911f3971434bb5a59b4d1b0078ae33a623625294"},
+    {file = "frozendict-2.4.7-cp37-cp37m-musllinux_1_2_x86_64.whl", hash = "sha256:39abe54264ae69a0b2e00fabdb5118604f36a5b927d33e7532cd594c5142ebf4"},
+    {file = "frozendict-2.4.7-cp37-cp37m-win32.whl", hash = "sha256:d10c2ea7c90ba204cd053167ba214d0cdd00f3184c7b8d117a56d7fd2b0c6553"},
+    {file = "frozendict-2.4.7-cp37-cp37m-win_amd64.whl", hash = "sha256:346a53640f15c1640a3503f60ba99df39e4ab174979f10db4304bbb378df5cbd"},
+    {file = "frozendict-2.4.7-cp38-cp38-macosx_10_9_universal2.whl", hash = "sha256:cc520f3f4af14f456143a534d554175dbc0f0636ffd653e63675cd591862a9d9"},
+    {file = "frozendict-2.4.7-cp38-cp38-macosx_10_9_x86_64.whl", hash = "sha256:7fd0d0bd3a79e009dddbf5fedfd927ad495c218cd7b13a112d28a37e2079725c"},
+    {file = "frozendict-2.4.7-cp38-cp38-macosx_11_0_arm64.whl", hash = "sha256:a404857e48d85a517bb5b974d740f8c4fccb25d8df98885f3a2a4d950870b845"},
+    {file = "frozendict-2.4.7-cp38-cp38-manylinux1_i686.manylinux_2_28_i686.manylinux_2_5_i686.whl", hash = "sha256:f42e2c25d3eee4ea3da88466f38ed0dce8c622a1a9d92572e5ee53b7a6bb9ef1"},
+    {file = "frozendict-2.4.7-cp38-cp38-manylinux2014_aarch64.manylinux_2_17_aarch64.manylinux_2_28_aarch64.whl", hash = "sha256:a1a083e9ee7a1904e545a6307c7db1dd76200077520fcbf7a98d886f81b57dd7"},
+    {file = "frozendict-2.4.7-cp38-cp38-manylinux2014_armv7l.manylinux_2_17_armv7l.manylinux_2_31_armv7l.whl", hash = "sha256:f556ea05d9c5f6dae50d57ce6234e4ab1fbf4551dd0d52b4fed6ef537d9f3d3c"},
+    {file = "frozendict-2.4.7-cp38-cp38-manylinux2014_ppc64le.manylinux_2_17_ppc64le.manylinux_2_28_ppc64le.whl", hash = "sha256:739ee81e574f33b46f1e6d9312f3ec2c549bdd574a4ebb6bf106775c9d85ca7b"},
+    {file = "frozendict-2.4.7-cp38-cp38-manylinux2014_s390x.manylinux_2_17_s390x.manylinux_2_28_s390x.whl", hash = "sha256:48ab42b01952bc11543577de9fe5d9ca7c41b35dda36326a07fb47d84b3d5f22"},
+    {file = "frozendict-2.4.7-cp38-cp38-manylinux2014_x86_64.manylinux_2_17_x86_64.manylinux_2_28_x86_64.whl", hash = "sha256:34233deb8d09e798e874a6ac00b054d2e842164d982ebd43eb91b9f0a6a34876"},
+    {file = "frozendict-2.4.7-cp38-cp38-manylinux_2_31_riscv64.manylinux_2_39_riscv64.whl", hash = "sha256:76bd99f3508cb2ec87976f2e3fe7d92fb373a661cacffb863013d15e4cfaf0eb"},
+    {file = "frozendict-2.4.7-cp38-cp38-musllinux_1_2_aarch64.whl", hash = "sha256:a265e95e7087f44b88a6d78a63ea95a2ca0eb0a21ab4f76047f4c164a8beb413"},
+    {file = "frozendict-2.4.7-cp38-cp38-musllinux_1_2_armv7l.whl", hash = "sha256:1662f1b72b4f4a2ffdfdc4981ece275ca11f90244208ac1f1fc2c17fc9c9437a"},
+    {file = "frozendict-2.4.7-cp38-cp38-musllinux_1_2_i686.whl", hash = "sha256:2e5d2c30f4a3fea83a14b0a5722f21c10de5c755ab5637c70de5eb60886d58cd"},
+    {file = "frozendict-2.4.7-cp38-cp38-musllinux_1_2_ppc64le.whl", hash = "sha256:2cf0a665bf2f1ce69d3cd8b6d3574b1d32ae00981a16fa1d255d2da8a2e44b7c"},
+    {file = "frozendict-2.4.7-cp38-cp38-musllinux_1_2_riscv64.whl", hash = "sha256:708382875c3cfe91be625dddcba03dee2dfdadbad2c431568a8c7f2f2af0bbee"},
+    {file = "frozendict-2.4.7-cp38-cp38-musllinux_1_2_s390x.whl", hash = "sha256:7fe194f37052a8f45a1a8507e36229e28b79f3d21542ae55ea6a18c6a444f625"},
+    {file = "frozendict-2.4.7-cp38-cp38-musllinux_1_2_x86_64.whl", hash = "sha256:d8930877a2dd40461968d9238d95c754e51b33ce7d2a45500f88ffeed5cb7202"},
+    {file = "frozendict-2.4.7-cp38-cp38-win32.whl", hash = "sha256:6991469a889ee8a108fe5ed1b044447c7b7a07da9067e93c59cbfac8c1d625cf"},
+    {file = "frozendict-2.4.7-cp38-cp38-win_amd64.whl", hash = "sha256:ebae8f4a07372acfc3963fc8d68070cdaab70272c3dd836f057ebbe9b7d38643"},
+    {file = "frozendict-2.4.7-cp39-cp39-macosx_10_9_universal2.whl", hash = "sha256:1c521ad3d747aa475e9040e231f5f1847c04423bae5571c010a9d969e6983c40"},
+    {file = "frozendict-2.4.7-cp39-cp39-macosx_10_9_x86_64.whl", hash = "sha256:70e655c3aa5f893807830f549a7275031a181dbebeaf74c461b51adc755d9335"},
+    {file = "frozendict-2.4.7-cp39-cp39-macosx_11_0_arm64.whl", hash = "sha256:11d35075f979c96f528d74ccbf89322a7ef8211977dd566bc384985ebce689be"},
+    {file = "frozendict-2.4.7-cp39-cp39-manylinux1_i686.manylinux_2_28_i686.manylinux_2_5_i686.whl", hash = "sha256:d4d7ec24d3bfcfac3baf4dffd7fcea3fa8474b087ce32696232132064aa062cf"},
+    {file = "frozendict-2.4.7-cp39-cp39-manylinux2014_aarch64.manylinux_2_17_aarch64.manylinux_2_28_aarch64.whl", hash = "sha256:5694417864875ca959932e3b98e2b7d5d27c75177bf510939d0da583712ddf58"},
+    {file = "frozendict-2.4.7-cp39-cp39-manylinux2014_armv7l.manylinux_2_17_armv7l.manylinux_2_31_armv7l.whl", hash = "sha256:57a754671c5746e11140363aa2f4e7a75c8607de6e85a2bf89dcd1daf51885a7"},
+    {file = "frozendict-2.4.7-cp39-cp39-manylinux2014_ppc64le.manylinux_2_17_ppc64le.manylinux_2_28_ppc64le.whl", hash = "sha256:313e0e1d8b22b317aa1f7dd48aec8cbb0416ddd625addf7648a69148fcb9ccff"},
+    {file = "frozendict-2.4.7-cp39-cp39-manylinux2014_s390x.manylinux_2_17_s390x.manylinux_2_28_s390x.whl", hash = "sha256:176a66094428b9fd66270927b9787e3b8b1c9505ef92723c7b0ef1923dbe3c4a"},
+    {file = "frozendict-2.4.7-cp39-cp39-manylinux2014_x86_64.manylinux_2_17_x86_64.manylinux_2_28_x86_64.whl", hash = "sha256:de1fff2683d8af01299ec01eb21a24b6097ce92015fc1fbefa977cecf076a3fc"},
+    {file = "frozendict-2.4.7-cp39-cp39-manylinux_2_31_riscv64.manylinux_2_39_riscv64.whl", hash = "sha256:115a822ecd754574e11205e0880e9d61258d960863d6fd1b90883aa800f6d3b3"},
+    {file = "frozendict-2.4.7-cp39-cp39-musllinux_1_2_aarch64.whl", hash = "sha256:de8d2c98777ba266f5466e211778d4e3bd00635a207c54f6f7511d8613b86dd3"},
+    {file = "frozendict-2.4.7-cp39-cp39-musllinux_1_2_armv7l.whl", hash = "sha256:1e307be0e1f26cbc9593f6bdad5238a1408a50f39f63c9c39eb93c7de5926767"},
+    {file = "frozendict-2.4.7-cp39-cp39-musllinux_1_2_i686.whl", hash = "sha256:78a55f320ca924545494ce153df02d4349156cd95dc4603c1f0e80c42c889249"},
+    {file = "frozendict-2.4.7-cp39-cp39-musllinux_1_2_ppc64le.whl", hash = "sha256:e89492dfcc4c27a718f8b5a4c8df1a2dec6c689718cccd70cb2ceba69ab8c642"},
+    {file = "frozendict-2.4.7-cp39-cp39-musllinux_1_2_riscv64.whl", hash = "sha256:1e801d62e35df24be2c6f7f43c114058712efa79a8549c289437754dad0207a3"},
+    {file = "frozendict-2.4.7-cp39-cp39-musllinux_1_2_s390x.whl", hash = "sha256:3ed9e2f3547a59f4ef5c233614c6faa6221d33004cb615ae1c07ffc551cfe178"},
+    {file = "frozendict-2.4.7-cp39-cp39-musllinux_1_2_x86_64.whl", hash = "sha256:ad0448ed5569f0a9b9b010af9fb5b6d9bdc0b4b877a3ddb188396c4742e62284"},
+    {file = "frozendict-2.4.7-cp39-cp39-win32.whl", hash = "sha256:eab9ef8a9268042e819de03079b984eb0894f05a7b63c4e5319b1cf1ef362ba7"},
+    {file = "frozendict-2.4.7-cp39-cp39-win_amd64.whl", hash = "sha256:8dfe2f4840b043436ee5bdd07b0fa5daecedf086e6957e7df050a56ab6db078d"},
+    {file = "frozendict-2.4.7-cp39-cp39-win_arm64.whl", hash = "sha256:cc2085926872a1b26deda4b81b2254d2e5d2cb2c4d7b327abe4c820b7c93f40b"},
+    {file = "frozendict-2.4.7-py3-none-any.whl", hash = "sha256:972af65924ea25cf5b4d9326d549e69a9a4918d8a76a9d3a7cd174d98b237550"},
+    {file = "frozendict-2.4.7.tar.gz", hash = "sha256:e478fb2a1391a56c8a6e10cc97c4a9002b410ecd1ac28c18d780661762e271bd"},
+]
+
 [[package]]
 name = "frozenlist"
 version = "1.7.0"
@@ -2208,6 +2330,8 @@ files = [
     {file = "greenlet-3.2.4-cp310-cp310-manylinux_2_24_x86_64.manylinux_2_28_x86_64.whl", hash = "sha256:c2ca18a03a8cfb5b25bc1cbe20f3d9a4c80d8c3b13ba3df49ac3961af0b1018d"},
     {file = "greenlet-3.2.4-cp310-cp310-musllinux_1_1_aarch64.whl", hash = "sha256:9fe0a28a7b952a21e2c062cd5756d34354117796c6d9215a87f55e38d15402c5"},
     {file = "greenlet-3.2.4-cp310-cp310-musllinux_1_1_x86_64.whl", hash = "sha256:8854167e06950ca75b898b104b63cc646573aa5fef1353d4508ecdd1ee76254f"},
+    {file = "greenlet-3.2.4-cp310-cp310-musllinux_1_2_aarch64.whl", hash = "sha256:f47617f698838ba98f4ff4189aef02e7343952df3a615f847bb575c3feb177a7"},
+    {file = "greenlet-3.2.4-cp310-cp310-musllinux_1_2_x86_64.whl", hash = "sha256:af41be48a4f60429d5cad9d22175217805098a9ef7c40bfef44f7669fb9d74d8"},
     {file = "greenlet-3.2.4-cp310-cp310-win_amd64.whl", hash = "sha256:73f49b5368b5359d04e18d15828eecc1806033db5233397748f4ca813ff1056c"},
     {file = "greenlet-3.2.4-cp311-cp311-macosx_11_0_universal2.whl", hash = "sha256:96378df1de302bc38e99c3a9aa311967b7dc80ced1dcc6f171e99842987882a2"},
     {file = "greenlet-3.2.4-cp311-cp311-manylinux2014_aarch64.manylinux_2_17_aarch64.whl", hash = "sha256:1ee8fae0519a337f2329cb78bd7a8e128ec0f881073d43f023c7b8d4831d5246"},
@@ -2217,6 +2341,8 @@ files = [
     {file = "greenlet-3.2.4-cp311-cp311-manylinux_2_24_x86_64.manylinux_2_28_x86_64.whl", hash = "sha256:2523e5246274f54fdadbce8494458a2ebdcdbc7b802318466ac5606d3cded1f8"},
     {file = "greenlet-3.2.4-cp311-cp311-musllinux_1_1_aarch64.whl", hash = "sha256:1987de92fec508535687fb807a5cea1560f6196285a4cde35c100b8cd632cc52"},
     {file = "greenlet-3.2.4-cp311-cp311-musllinux_1_1_x86_64.whl", hash = "sha256:55e9c5affaa6775e2c6b67659f3a71684de4c549b3dd9afca3bc773533d284fa"},
+    {file = "greenlet-3.2.4-cp311-cp311-musllinux_1_2_aarch64.whl", hash = "sha256:c9c6de1940a7d828635fbd254d69db79e54619f165ee7ce32fda763a9cb6a58c"},
+    {file = "greenlet-3.2.4-cp311-cp311-musllinux_1_2_x86_64.whl", hash = "sha256:03c5136e7be905045160b1b9fdca93dd6727b180feeafda6818e6496434ed8c5"},
     {file = "greenlet-3.2.4-cp311-cp311-win_amd64.whl", hash = "sha256:9c40adce87eaa9ddb593ccb0fa6a07caf34015a29bf8d344811665b573138db9"},
     {file = "greenlet-3.2.4-cp312-cp312-macosx_11_0_universal2.whl", hash = "sha256:3b67ca49f54cede0186854a008109d6ee71f66bd57bb36abd6d0a0267b540cdd"},
     {file = "greenlet-3.2.4-cp312-cp312-manylinux2014_aarch64.manylinux_2_17_aarch64.whl", hash = "sha256:ddf9164e7a5b08e9d22511526865780a576f19ddd00d62f8a665949327fde8bb"},
@@ -2226,6 +2352,8 @@ files = [
     {file = "greenlet-3.2.4-cp312-cp312-manylinux_2_24_x86_64.manylinux_2_28_x86_64.whl", hash = "sha256:3b3812d8d0c9579967815af437d96623f45c0f2ae5f04e366de62a12d83a8fb0"},
     {file = "greenlet-3.2.4-cp312-cp312-musllinux_1_1_aarch64.whl", hash = "sha256:abbf57b5a870d30c4675928c37278493044d7c14378350b3aa5d484fa65575f0"},
     {file = "greenlet-3.2.4-cp312-cp312-musllinux_1_1_x86_64.whl", hash = "sha256:20fb936b4652b6e307b8f347665e2c615540d4b42b3b4c8a321d8286da7e520f"},
+    {file = "greenlet-3.2.4-cp312-cp312-musllinux_1_2_aarch64.whl", hash = "sha256:ee7a6ec486883397d70eec05059353b8e83eca9168b9f3f9a361971e77e0bcd0"},
+    {file = "greenlet-3.2.4-cp312-cp312-musllinux_1_2_x86_64.whl", hash = "sha256:326d234cbf337c9c3def0676412eb7040a35a768efc92504b947b3e9cfc7543d"},
     {file = "greenlet-3.2.4-cp312-cp312-win_amd64.whl", hash = "sha256:a7d4e128405eea3814a12cc2605e0e6aedb4035bf32697f72deca74de4105e02"},
     {file = "greenlet-3.2.4-cp313-cp313-macosx_11_0_universal2.whl", hash = "sha256:1a921e542453fe531144e91e1feedf12e07351b1cf6c9e8a3325ea600a715a31"},
     {file = "greenlet-3.2.4-cp313-cp313-manylinux2014_aarch64.manylinux_2_17_aarch64.whl", hash = "sha256:cd3c8e693bff0fff6ba55f140bf390fa92c994083f838fece0f63be121334945"},
@@ -2235,6 +2363,8 @@ files = [
     {file = "greenlet-3.2.4-cp313-cp313-manylinux_2_24_x86_64.manylinux_2_28_x86_64.whl", hash = "sha256:23768528f2911bcd7e475210822ffb5254ed10d71f4028387e5a99b4c6699671"},
     {file = "greenlet-3.2.4-cp313-cp313-musllinux_1_1_aarch64.whl", hash = "sha256:00fadb3fedccc447f517ee0d3fd8fe49eae949e1cd0f6a611818f4f6fb7dc83b"},
     {file = "greenlet-3.2.4-cp313-cp313-musllinux_1_1_x86_64.whl", hash = "sha256:d25c5091190f2dc0eaa3f950252122edbbadbb682aa7b1ef2f8af0f8c0afefae"},
+    {file = "greenlet-3.2.4-cp313-cp313-musllinux_1_2_aarch64.whl", hash = "sha256:6e343822feb58ac4d0a1211bd9399de2b3a04963ddeec21530fc426cc121f19b"},
+    {file = "greenlet-3.2.4-cp313-cp313-musllinux_1_2_x86_64.whl", hash = "sha256:ca7f6f1f2649b89ce02f6f229d7c19f680a6238af656f61e0115b24857917929"},
     {file = "greenlet-3.2.4-cp313-cp313-win_amd64.whl", hash = "sha256:554b03b6e73aaabec3745364d6239e9e012d64c68ccd0b8430c64ccc14939a8b"},
     {file = "greenlet-3.2.4-cp314-cp314-macosx_11_0_universal2.whl", hash = "sha256:49a30d5fda2507ae77be16479bdb62a660fa51b1eb4928b524975b3bde77b3c0"},
     {file = "greenlet-3.2.4-cp314-cp314-manylinux2014_aarch64.manylinux_2_17_aarch64.whl", hash = "sha256:299fd615cd8fc86267b47597123e3f43ad79c9d8a22bebdce535e53550763e2f"},
@@ -2242,6 +2372,8 @@ files = [
     {file = "greenlet-3.2.4-cp314-cp314-manylinux2014_s390x.manylinux_2_17_s390x.whl", hash = "sha256:b4a1870c51720687af7fa3e7cda6d08d801dae660f75a76f3845b642b4da6ee1"},
     {file = "greenlet-3.2.4-cp314-cp314-manylinux2014_x86_64.manylinux_2_17_x86_64.whl", hash = "sha256:061dc4cf2c34852b052a8620d40f36324554bc192be474b9e9770e8c042fd735"},
     {file = "greenlet-3.2.4-cp314-cp314-manylinux_2_24_x86_64.manylinux_2_28_x86_64.whl", hash = "sha256:44358b9bf66c8576a9f57a590d5f5d6e72fa4228b763d0e43fee6d3b06d3a337"},
+    {file = "greenlet-3.2.4-cp314-cp314-musllinux_1_2_aarch64.whl", hash = "sha256:2917bdf657f5859fbf3386b12d68ede4cf1f04c90c3a6bc1f013dd68a22e2269"},
+    {file = "greenlet-3.2.4-cp314-cp314-musllinux_1_2_x86_64.whl", hash = "sha256:015d48959d4add5d6c9f6c5210ee3803a830dce46356e3bc326d6776bde54681"},
     {file = "greenlet-3.2.4-cp314-cp314-win_amd64.whl", hash = "sha256:e37ab26028f12dbb0ff65f29a8d3d44a765c61e729647bf2ddfbbed621726f01"},
     {file = "greenlet-3.2.4-cp39-cp39-macosx_11_0_universal2.whl", hash = "sha256:b6a7c19cf0d2742d0809a4c05975db036fdff50cd294a93632d6a310bf9ac02c"},
     {file = "greenlet-3.2.4-cp39-cp39-manylinux2014_aarch64.manylinux_2_17_aarch64.whl", hash = "sha256:27890167f55d2387576d1f41d9487ef171849ea0359ce1510ca6e06c8bece11d"},
@@ -2251,6 +2383,8 @@ files = [
     {file = "greenlet-3.2.4-cp39-cp39-manylinux_2_24_x86_64.manylinux_2_28_x86_64.whl", hash = "sha256:c9913f1a30e4526f432991f89ae263459b1c64d1608c0d22a5c79c287b3c70df"},
     {file = "greenlet-3.2.4-cp39-cp39-musllinux_1_1_aarch64.whl", hash = "sha256:b90654e092f928f110e0007f572007c9727b5265f7632c2fa7415b4689351594"},
     {file = "greenlet-3.2.4-cp39-cp39-musllinux_1_1_x86_64.whl", hash = "sha256:81701fd84f26330f0d5f4944d4e92e61afe6319dcd9775e39396e39d7c3e5f98"},
+    {file = "greenlet-3.2.4-cp39-cp39-musllinux_1_2_aarch64.whl", hash = "sha256:28a3c6b7cd72a96f61b0e4b2a36f681025b60ae4779cc73c1535eb5f29560b10"},
+    {file = "greenlet-3.2.4-cp39-cp39-musllinux_1_2_x86_64.whl", hash = "sha256:52206cd642670b0b320a1fd1cbfd95bca0e043179c1d8a045f2c6109dfe973be"},
     {file = "greenlet-3.2.4-cp39-cp39-win32.whl", hash = "sha256:65458b409c1ed459ea899e939f0e1cdb14f58dbc803f2f93c5eab5694d32671b"},
     {file = "greenlet-3.2.4-cp39-cp39-win_amd64.whl", hash = "sha256:d2e685ade4dafd447ede19c31277a224a239a0a1a4eca4e6390efedf20260cfb"},
     {file = "greenlet-3.2.4.tar.gz", hash = "sha256:0dca0d95ff849f9a364385f36ab49f50065d76964944638be9691e1832e9f86d"},
@@ -2680,55 +2814,60 @@ files = [
 
 [[package]]
 name = "inspect-ai"
-version = "0.3.122"
+version = "0.3.159"
 description = "Framework for large language model evaluations"
 optional = false
 python-versions = ">=3.10"
 groups = ["main"]
 files = [
-    {file = "inspect_ai-0.3.122-py3-none-any.whl", hash = "sha256:c40958a0e74e91bb2e7a14059e4bfada73757e00d2a6cbdf5754bd3f8e92a955"},
-    {file = "inspect_ai-0.3.122.tar.gz", hash = "sha256:77b18a72603a79f05630216e577f92b9bc404c616bce080ccb67028c8368428c"},
+    {file = "inspect_ai-0.3.159-py3-none-any.whl", hash = "sha256:71f87fd242d4fb61998691143c5dd3613fd8009840a4503d3a1fa48b504fd6b3"},
+    {file = "inspect_ai-0.3.159.tar.gz", hash = "sha256:91d42ba18ac10c5ad9d4ee25e01dda142c86058a12462e5ba051b041f8afe2ed"},
 ]
 
 [package.dependencies]
+aioboto3 = ">=13.0.0"
 aiohttp = ">=3.9.0"
 anyio = ">=4.8.0"
-beautifulsoup4 = "*"
-click = ">=8.1.3,<8.2.0 || >8.2.0"
+beautifulsoup4 = ">=4.10.0"
+boto3 = "*"
+click = ">=8.1.3,<8.2.0 || >8.2.0,<8.2.2"
 debugpy = "*"
 docstring-parser = ">=0.16"
 exceptiongroup = {version = ">=1.0.2", markers = "python_version < \"3.11\""}
-fsspec = ">=2023.1.0,<=2025.3.0"
+frozendict = ">=2.4.6"
+fsspec = ">=2023.1.0,<=2025.9.0"
 httpx = "*"
 ijson = ">=3.2.0"
 jsonlines = ">=3.0.0"
 jsonpatch = ">=1.32"
-jsonpath-ng = ">=1.7.0"
+jsonpath-ng = ">=1.6.0"
 jsonref = ">=1.1.0"
 jsonschema = ">3.1.1"
 mmh3 = ">3.1.0"
-nest_asyncio = "*"
+nest_asyncio2 = "*"
 numpy = "*"
 platformdirs = ">=2.3.0"
 psutil = "*"
 pydantic = ">=2.11.4"
 python-dotenv = ">=0.16.0"
 pyyaml = "*"
-rich = ">=13.3.3,<14.0.0"
+rich = ">=13.3.3,<14.0.0 || >14.0.0"
 s3fs = ">=2023"
 semver = ">=3.0.0"
 shortuuid = "*"
 sniffio = "*"
 tenacity = "*"
-textual = ">=0.86.2,<v3.0.0"
+textual = ">=2.1.0"
+tiktoken = ">=0.12.0"
 typing_extensions = ">=4.9.0"
+universal-pathlib = ">=0.2.6"
 zipp = ">=3.19.1"
 
 [package.extras]
-dev = ["aioboto3", "anthropic (>=0.52.0)", "azure-ai-inference", "azure-identity", "google-genai", "griffe", "groq", "ipython", "jsonpath-ng", "markdown", "mcp (>=1.10.0)", "mistralai", "moto[server]", "mypy (>=1.17.0)", "nbformat", "openai", "pandas (>=2.0.0)", "pandas-stubs", "panflute", "pip", "pre-commit", "pyarrow (>=10.0.1)", "pyarrow-stubs", "pylint", "pytest", "pytest-asyncio", "pytest-cov", "pytest-dotenv", "pytest-mock", "pytest-watcher", "pytest-xdist", "ruff (==0.9.6)", "textual-dev (>=0.86.2)", "together", "transformer-lens", "trio", "types-Markdown", "types-PyYAML", "types-aioboto3", "types-beautifulsoup4", "types-boto3", "types-botocore", "types-jsonpatch", "types-jsonschema", "types-protobuf", "types-psutil", "types-python-dateutil"]
+dev = ["adlfs (>=2025.8.0)", "anthropic (>=0.62.0)", "azure-ai-inference", "azure-identity", "fastapi", "google-genai", "griffe", "groq", "huggingface_hub", "inspect_scout", "ipython", "jsonpath-ng", "markdown", "mcp (>=1.10.0)", "mistralai", "moto[server]", "mypy (>=1.17.0)", "nbformat", "openai", "pandas (>=2.0.0)", "pandas-stubs", "panflute", "pip", "pre-commit", "pyarrow (>=10.0.1)", "pyarrow-stubs", "pylint", "pytest", "pytest-asyncio", "pytest-cov", "pytest-dotenv", "pytest-mock", "pytest-watcher", "pytest-xdist", "ruff (==0.9.6)", "textual-dev (>=0.86.2)", "together", "trio", "types-Markdown", "types-PyYAML", "types-aioboto3", "types-beautifulsoup4", "types-boto3", "types-botocore", "types-grpcio", "types-jsonpatch", "types-jsonschema", "types-protobuf", "types-psutil", "types-python-dateutil", "uvicorn", "xai_sdk"]
 dev-mcp-tests = ["mcp-server-fetch", "mcp_server_git"]
 dist = ["build", "twine"]
-doc = ["griffe", "jupyter", "markdown", "panflute", "quarto-cli (==1.7.32)"]
+doc = ["click (>=8.2.0)", "griffe", "jupyter", "markdown", "panflute", "quarto-cli (==1.7.32)"]
 
 [[package]]
 name = "ipykernel"
@@ -4430,12 +4569,24 @@ version = "1.6.0"
 description = "Patch asyncio to allow nested event loops"
 optional = false
 python-versions = ">=3.5"
-groups = ["main", "docs"]
+groups = ["docs"]
 files = [
     {file = "nest_asyncio-1.6.0-py3-none-any.whl", hash = "sha256:87af6efd6b5e897c81050477ef65c62e2b2f35d51703cae01aff2905b1852e1c"},
     {file = "nest_asyncio-1.6.0.tar.gz", hash = "sha256:6f172d5449aca15afd6c646851f4e31e02c598d553a667e38cafa997cfec55fe"},
 ]
 
+[[package]]
+name = "nest-asyncio2"
+version = "1.7.1"
+description = "Patch asyncio to allow nested event loops"
+optional = false
+python-versions = ">=3.5"
+groups = ["main"]
+files = [
+    {file = "nest_asyncio2-1.7.1-py3-none-any.whl", hash = "sha256:f83bc1744c3cfa7d47fd29431e5e168db6cb76eda1bb20108955c32f60d7eddf"},
+    {file = "nest_asyncio2-1.7.1.tar.gz", hash = "sha256:a1fe5bbbd20894dcceb1842322d74992c5834d5ab692af2c4f59a9a4fcf75fe8"},
+]
+
 [[package]]
 name = "networkx"
 version = "3.4.2"
@@ -5611,8 +5762,8 @@ files = [
 
 [package.dependencies]
 numpy = [
-    {version = ">=1.23.2", markers = "python_version == \"3.11\""},
     {version = ">=1.26.0", markers = "python_version >= \"3.12\""},
+    {version = ">=1.23.2", markers = "python_version == \"3.11\""},
     {version = ">=1.22.4", markers = "python_version < \"3.11\""},
 ]
 python-dateutil = ">=2.8.2"
@@ -5672,6 +5823,18 @@ files = [
 qa = ["flake8 (==5.0.4)", "mypy (==0.971)", "types-setuptools (==67.2.0.1)"]
 testing = ["docopt", "pytest"]
 
+[[package]]
+name = "pathlib-abc"
+version = "0.5.2"
+description = "Backport of pathlib ABCs"
+optional = false
+python-versions = ">=3.9"
+groups = ["main"]
+files = [
+    {file = "pathlib_abc-0.5.2-py3-none-any.whl", hash = "sha256:4c9d94cf1b23af417ce7c0417b43333b06a106c01000b286c99de230d95eefbb"},
+    {file = "pathlib_abc-0.5.2.tar.gz", hash = "sha256:fcd56f147234645e2c59c7ae22808b34c364bb231f685ddd9f96885aed78a94c"},
+]
+
 [[package]]
 name = "pathspec"
 version = "0.12.1"
@@ -6518,8 +6681,8 @@ files = [
 astroid = ">=3.3.8,<=3.4.0.dev0"
 colorama = {version = ">=0.4.5", markers = "sys_platform == \"win32\""}
 dill = [
-    {version = ">=0.3.6", markers = "python_version >= \"3.11\""},
     {version = ">=0.3.7", markers = "python_version >= \"3.12\""},
+    {version = ">=0.3.6", markers = "python_version == \"3.11\""},
     {version = ">=0.2", markers = "python_version < \"3.11\""},
 ]
 isort = ">=4.2.5,<5.13 || >5.13,<7"
@@ -8164,43 +8327,69 @@ files = [
 
 [[package]]
 name = "tiktoken"
-version = "0.11.0"
+version = "0.12.0"
 description = "tiktoken is a fast BPE tokeniser for use with OpenAI's models"
 optional = false
 python-versions = ">=3.9"
 groups = ["main"]
 files = [
-    {file = "tiktoken-0.11.0-cp310-cp310-macosx_10_12_x86_64.whl", hash = "sha256:8a9b517d6331d7103f8bef29ef93b3cca95fa766e293147fe7bacddf310d5917"},
-    {file = "tiktoken-0.11.0-cp310-cp310-macosx_11_0_arm64.whl", hash = "sha256:b4ddb1849e6bf0afa6cc1c5d809fb980ca240a5fffe585a04e119519758788c0"},
-    {file = "tiktoken-0.11.0-cp310-cp310-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:10331d08b5ecf7a780b4fe4d0281328b23ab22cdb4ff65e68d56caeda9940ecc"},
-    {file = "tiktoken-0.11.0-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:b062c82300341dc87e0258c69f79bed725f87e753c21887aea90d272816be882"},
-    {file = "tiktoken-0.11.0-cp310-cp310-musllinux_1_2_x86_64.whl", hash = "sha256:195d84bec46169af3b1349a1495c151d37a0ff4cba73fd08282736be7f92cc6c"},
-    {file = "tiktoken-0.11.0-cp310-cp310-win_amd64.whl", hash = "sha256:fe91581b0ecdd8783ce8cb6e3178f2260a3912e8724d2f2d49552b98714641a1"},
-    {file = "tiktoken-0.11.0-cp311-cp311-macosx_10_12_x86_64.whl", hash = "sha256:4ae374c46afadad0f501046db3da1b36cd4dfbfa52af23c998773682446097cf"},
-    {file = "tiktoken-0.11.0-cp311-cp311-macosx_11_0_arm64.whl", hash = "sha256:25a512ff25dc6c85b58f5dd4f3d8c674dc05f96b02d66cdacf628d26a4e4866b"},
-    {file = "tiktoken-0.11.0-cp311-cp311-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:2130127471e293d385179c1f3f9cd445070c0772be73cdafb7cec9a3684c0458"},
-    {file = "tiktoken-0.11.0-cp311-cp311-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:21e43022bf2c33f733ea9b54f6a3f6b4354b909f5a73388fb1b9347ca54a069c"},
-    {file = "tiktoken-0.11.0-cp311-cp311-musllinux_1_2_x86_64.whl", hash = "sha256:adb4e308eb64380dc70fa30493e21c93475eaa11669dea313b6bbf8210bfd013"},
-    {file = "tiktoken-0.11.0-cp311-cp311-win_amd64.whl", hash = "sha256:ece6b76bfeeb61a125c44bbefdfccc279b5288e6007fbedc0d32bfec602df2f2"},
-    {file = "tiktoken-0.11.0-cp312-cp312-macosx_10_13_x86_64.whl", hash = "sha256:fd9e6b23e860973cf9526544e220b223c60badf5b62e80a33509d6d40e6c8f5d"},
-    {file = "tiktoken-0.11.0-cp312-cp312-macosx_11_0_arm64.whl", hash = "sha256:6a76d53cee2da71ee2731c9caa747398762bda19d7f92665e882fef229cb0b5b"},
-    {file = "tiktoken-0.11.0-cp312-cp312-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:6ef72aab3ea240646e642413cb363b73869fed4e604dcfd69eec63dc54d603e8"},
-    {file = "tiktoken-0.11.0-cp312-cp312-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:7f929255c705efec7a28bf515e29dc74220b2f07544a8c81b8d69e8efc4578bd"},
-    {file = "tiktoken-0.11.0-cp312-cp312-musllinux_1_2_x86_64.whl", hash = "sha256:61f1d15822e4404953d499fd1dcc62817a12ae9fb1e4898033ec8fe3915fdf8e"},
-    {file = "tiktoken-0.11.0-cp312-cp312-win_amd64.whl", hash = "sha256:45927a71ab6643dfd3ef57d515a5db3d199137adf551f66453be098502838b0f"},
-    {file = "tiktoken-0.11.0-cp313-cp313-macosx_10_13_x86_64.whl", hash = "sha256:a5f3f25ffb152ee7fec78e90a5e5ea5b03b4ea240beed03305615847f7a6ace2"},
-    {file = "tiktoken-0.11.0-cp313-cp313-macosx_11_0_arm64.whl", hash = "sha256:7dc6e9ad16a2a75b4c4be7208055a1f707c9510541d94d9cc31f7fbdc8db41d8"},
-    {file = "tiktoken-0.11.0-cp313-cp313-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:5a0517634d67a8a48fd4a4ad73930c3022629a85a217d256a6e9b8b47439d1e4"},
-    {file = "tiktoken-0.11.0-cp313-cp313-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:7fb4effe60574675118b73c6fbfd3b5868e5d7a1f570d6cc0d18724b09ecf318"},
-    {file = "tiktoken-0.11.0-cp313-cp313-musllinux_1_2_x86_64.whl", hash = "sha256:94f984c9831fd32688aef4348803b0905d4ae9c432303087bae370dc1381a2b8"},
-    {file = "tiktoken-0.11.0-cp313-cp313-win_amd64.whl", hash = "sha256:2177ffda31dec4023356a441793fed82f7af5291120751dee4d696414f54db0c"},
-    {file = "tiktoken-0.11.0-cp39-cp39-macosx_10_12_x86_64.whl", hash = "sha256:13220f12c9e82e399377e768640ddfe28bea962739cc3a869cad98f42c419a89"},
-    {file = "tiktoken-0.11.0-cp39-cp39-macosx_11_0_arm64.whl", hash = "sha256:7f2db627f5c74477c0404b4089fd8a28ae22fa982a6f7d9c7d4c305c375218f3"},
-    {file = "tiktoken-0.11.0-cp39-cp39-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:2302772f035dceb2bcf8e55a735e4604a0b51a6dd50f38218ff664d46ec43807"},
-    {file = "tiktoken-0.11.0-cp39-cp39-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:20b977989afe44c94bcc50db1f76971bb26dca44218bd203ba95925ef56f8e7a"},
-    {file = "tiktoken-0.11.0-cp39-cp39-musllinux_1_2_x86_64.whl", hash = "sha256:669a1aa1ad6ebf1b3c26b45deb346f345da7680f845b5ea700bba45c20dea24c"},
-    {file = "tiktoken-0.11.0-cp39-cp39-win_amd64.whl", hash = "sha256:e363f33c720a055586f730c00e330df4c7ea0024bf1c83a8a9a9dbc054c4f304"},
-    {file = "tiktoken-0.11.0.tar.gz", hash = "sha256:3c518641aee1c52247c2b97e74d8d07d780092af79d5911a6ab5e79359d9b06a"},
+    {file = "tiktoken-0.12.0-cp310-cp310-macosx_10_12_x86_64.whl", hash = "sha256:3de02f5a491cfd179aec916eddb70331814bd6bf764075d39e21d5862e533970"},
+    {file = "tiktoken-0.12.0-cp310-cp310-macosx_11_0_arm64.whl", hash = "sha256:b6cfb6d9b7b54d20af21a912bfe63a2727d9cfa8fbda642fd8322c70340aad16"},
+    {file = "tiktoken-0.12.0-cp310-cp310-manylinux_2_28_aarch64.whl", hash = "sha256:cde24cdb1b8a08368f709124f15b36ab5524aac5fa830cc3fdce9c03d4fb8030"},
+    {file = "tiktoken-0.12.0-cp310-cp310-manylinux_2_28_x86_64.whl", hash = "sha256:6de0da39f605992649b9cfa6f84071e3f9ef2cec458d08c5feb1b6f0ff62e134"},
+    {file = "tiktoken-0.12.0-cp310-cp310-musllinux_1_2_aarch64.whl", hash = "sha256:6faa0534e0eefbcafaccb75927a4a380463a2eaa7e26000f0173b920e98b720a"},
+    {file = "tiktoken-0.12.0-cp310-cp310-musllinux_1_2_x86_64.whl", hash = "sha256:82991e04fc860afb933efb63957affc7ad54f83e2216fe7d319007dab1ba5892"},
+    {file = "tiktoken-0.12.0-cp310-cp310-win_amd64.whl", hash = "sha256:6fb2995b487c2e31acf0a9e17647e3b242235a20832642bb7a9d1a181c0c1bb1"},
+    {file = "tiktoken-0.12.0-cp311-cp311-macosx_10_12_x86_64.whl", hash = "sha256:6e227c7f96925003487c33b1b32265fad2fbcec2b7cf4817afb76d416f40f6bb"},
+    {file = "tiktoken-0.12.0-cp311-cp311-macosx_11_0_arm64.whl", hash = "sha256:c06cf0fcc24c2cb2adb5e185c7082a82cba29c17575e828518c2f11a01f445aa"},
+    {file = "tiktoken-0.12.0-cp311-cp311-manylinux_2_28_aarch64.whl", hash = "sha256:f18f249b041851954217e9fd8e5c00b024ab2315ffda5ed77665a05fa91f42dc"},
+    {file = "tiktoken-0.12.0-cp311-cp311-manylinux_2_28_x86_64.whl", hash = "sha256:47a5bc270b8c3db00bb46ece01ef34ad050e364b51d406b6f9730b64ac28eded"},
+    {file = "tiktoken-0.12.0-cp311-cp311-musllinux_1_2_aarch64.whl", hash = "sha256:508fa71810c0efdcd1b898fda574889ee62852989f7c1667414736bcb2b9a4bd"},
+    {file = "tiktoken-0.12.0-cp311-cp311-musllinux_1_2_x86_64.whl", hash = "sha256:a1af81a6c44f008cba48494089dd98cccb8b313f55e961a52f5b222d1e507967"},
+    {file = "tiktoken-0.12.0-cp311-cp311-win_amd64.whl", hash = "sha256:3e68e3e593637b53e56f7237be560f7a394451cb8c11079755e80ae64b9e6def"},
+    {file = "tiktoken-0.12.0-cp312-cp312-macosx_10_13_x86_64.whl", hash = "sha256:b97f74aca0d78a1ff21b8cd9e9925714c15a9236d6ceacf5c7327c117e6e21e8"},
+    {file = "tiktoken-0.12.0-cp312-cp312-macosx_11_0_arm64.whl", hash = "sha256:2b90f5ad190a4bb7c3eb30c5fa32e1e182ca1ca79f05e49b448438c3e225a49b"},
+    {file = "tiktoken-0.12.0-cp312-cp312-manylinux_2_28_aarch64.whl", hash = "sha256:65b26c7a780e2139e73acc193e5c63ac754021f160df919add909c1492c0fb37"},
+    {file = "tiktoken-0.12.0-cp312-cp312-manylinux_2_28_x86_64.whl", hash = "sha256:edde1ec917dfd21c1f2f8046b86348b0f54a2c0547f68149d8600859598769ad"},
+    {file = "tiktoken-0.12.0-cp312-cp312-musllinux_1_2_aarch64.whl", hash = "sha256:35a2f8ddd3824608b3d650a000c1ef71f730d0c56486845705a8248da00f9fe5"},
+    {file = "tiktoken-0.12.0-cp312-cp312-musllinux_1_2_x86_64.whl", hash = "sha256:83d16643edb7fa2c99eff2ab7733508aae1eebb03d5dfc46f5565862810f24e3"},
+    {file = "tiktoken-0.12.0-cp312-cp312-win_amd64.whl", hash = "sha256:ffc5288f34a8bc02e1ea7047b8d041104791d2ddbf42d1e5fa07822cbffe16bd"},
+    {file = "tiktoken-0.12.0-cp313-cp313-macosx_10_13_x86_64.whl", hash = "sha256:775c2c55de2310cc1bc9a3ad8826761cbdc87770e586fd7b6da7d4589e13dab3"},
+    {file = "tiktoken-0.12.0-cp313-cp313-macosx_11_0_arm64.whl", hash = "sha256:a01b12f69052fbe4b080a2cfb867c4de12c704b56178edf1d1d7b273561db160"},
+    {file = "tiktoken-0.12.0-cp313-cp313-manylinux_2_28_aarch64.whl", hash = "sha256:01d99484dc93b129cd0964f9d34eee953f2737301f18b3c7257bf368d7615baa"},
+    {file = "tiktoken-0.12.0-cp313-cp313-manylinux_2_28_x86_64.whl", hash = "sha256:4a1a4fcd021f022bfc81904a911d3df0f6543b9e7627b51411da75ff2fe7a1be"},
+    {file = "tiktoken-0.12.0-cp313-cp313-musllinux_1_2_aarch64.whl", hash = "sha256:981a81e39812d57031efdc9ec59fa32b2a5a5524d20d4776574c4b4bd2e9014a"},
+    {file = "tiktoken-0.12.0-cp313-cp313-musllinux_1_2_x86_64.whl", hash = "sha256:9baf52f84a3f42eef3ff4e754a0db79a13a27921b457ca9832cf944c6be4f8f3"},
+    {file = "tiktoken-0.12.0-cp313-cp313-win_amd64.whl", hash = "sha256:b8a0cd0c789a61f31bf44851defbd609e8dd1e2c8589c614cc1060940ef1f697"},
+    {file = "tiktoken-0.12.0-cp313-cp313t-macosx_10_13_x86_64.whl", hash = "sha256:d5f89ea5680066b68bcb797ae85219c72916c922ef0fcdd3480c7d2315ffff16"},
+    {file = "tiktoken-0.12.0-cp313-cp313t-macosx_11_0_arm64.whl", hash = "sha256:b4e7ed1c6a7a8a60a3230965bdedba8cc58f68926b835e519341413370e0399a"},
+    {file = "tiktoken-0.12.0-cp313-cp313t-manylinux_2_28_aarch64.whl", hash = "sha256:fc530a28591a2d74bce821d10b418b26a094bf33839e69042a6e86ddb7a7fb27"},
+    {file = "tiktoken-0.12.0-cp313-cp313t-manylinux_2_28_x86_64.whl", hash = "sha256:06a9f4f49884139013b138920a4c393aa6556b2f8f536345f11819389c703ebb"},
+    {file = "tiktoken-0.12.0-cp313-cp313t-musllinux_1_2_aarch64.whl", hash = "sha256:04f0e6a985d95913cabc96a741c5ffec525a2c72e9df086ff17ebe35985c800e"},
+    {file = "tiktoken-0.12.0-cp313-cp313t-musllinux_1_2_x86_64.whl", hash = "sha256:0ee8f9ae00c41770b5f9b0bb1235474768884ae157de3beb5439ca0fd70f3e25"},
+    {file = "tiktoken-0.12.0-cp313-cp313t-win_amd64.whl", hash = "sha256:dc2dd125a62cb2b3d858484d6c614d136b5b848976794edfb63688d539b8b93f"},
+    {file = "tiktoken-0.12.0-cp314-cp314-macosx_10_13_x86_64.whl", hash = "sha256:a90388128df3b3abeb2bfd1895b0681412a8d7dc644142519e6f0a97c2111646"},
+    {file = "tiktoken-0.12.0-cp314-cp314-macosx_11_0_arm64.whl", hash = "sha256:da900aa0ad52247d8794e307d6446bd3cdea8e192769b56276695d34d2c9aa88"},
+    {file = "tiktoken-0.12.0-cp314-cp314-manylinux_2_28_aarch64.whl", hash = "sha256:285ba9d73ea0d6171e7f9407039a290ca77efcdb026be7769dccc01d2c8d7fff"},
+    {file = "tiktoken-0.12.0-cp314-cp314-manylinux_2_28_x86_64.whl", hash = "sha256:d186a5c60c6a0213f04a7a802264083dea1bbde92a2d4c7069e1a56630aef830"},
+    {file = "tiktoken-0.12.0-cp314-cp314-musllinux_1_2_aarch64.whl", hash = "sha256:604831189bd05480f2b885ecd2d1986dc7686f609de48208ebbbddeea071fc0b"},
+    {file = "tiktoken-0.12.0-cp314-cp314-musllinux_1_2_x86_64.whl", hash = "sha256:8f317e8530bb3a222547b85a58583238c8f74fd7a7408305f9f63246d1a0958b"},
+    {file = "tiktoken-0.12.0-cp314-cp314-win_amd64.whl", hash = "sha256:399c3dd672a6406719d84442299a490420b458c44d3ae65516302a99675888f3"},
+    {file = "tiktoken-0.12.0-cp314-cp314t-macosx_10_13_x86_64.whl", hash = "sha256:c2c714c72bc00a38ca969dae79e8266ddec999c7ceccd603cc4f0d04ccd76365"},
+    {file = "tiktoken-0.12.0-cp314-cp314t-macosx_11_0_arm64.whl", hash = "sha256:cbb9a3ba275165a2cb0f9a83f5d7025afe6b9d0ab01a22b50f0e74fee2ad253e"},
+    {file = "tiktoken-0.12.0-cp314-cp314t-manylinux_2_28_aarch64.whl", hash = "sha256:dfdfaa5ffff8993a3af94d1125870b1d27aed7cb97aa7eb8c1cefdbc87dbee63"},
+    {file = "tiktoken-0.12.0-cp314-cp314t-manylinux_2_28_x86_64.whl", hash = "sha256:584c3ad3d0c74f5269906eb8a659c8bfc6144a52895d9261cdaf90a0ae5f4de0"},
+    {file = "tiktoken-0.12.0-cp314-cp314t-musllinux_1_2_aarch64.whl", hash = "sha256:54c891b416a0e36b8e2045b12b33dd66fb34a4fe7965565f1b482da50da3e86a"},
+    {file = "tiktoken-0.12.0-cp314-cp314t-musllinux_1_2_x86_64.whl", hash = "sha256:5edb8743b88d5be814b1a8a8854494719080c28faaa1ccbef02e87354fe71ef0"},
+    {file = "tiktoken-0.12.0-cp314-cp314t-win_amd64.whl", hash = "sha256:f61c0aea5565ac82e2ec50a05e02a6c44734e91b51c10510b084ea1b8e633a71"},
+    {file = "tiktoken-0.12.0-cp39-cp39-macosx_10_12_x86_64.whl", hash = "sha256:d51d75a5bffbf26f86554d28e78bfb921eae998edc2675650fd04c7e1f0cdc1e"},
+    {file = "tiktoken-0.12.0-cp39-cp39-macosx_11_0_arm64.whl", hash = "sha256:09eb4eae62ae7e4c62364d9ec3a57c62eea707ac9a2b2c5d6bd05de6724ea179"},
+    {file = "tiktoken-0.12.0-cp39-cp39-manylinux_2_28_aarch64.whl", hash = "sha256:df37684ace87d10895acb44b7f447d4700349b12197a526da0d4a4149fde074c"},
+    {file = "tiktoken-0.12.0-cp39-cp39-manylinux_2_28_x86_64.whl", hash = "sha256:4c9614597ac94bb294544345ad8cf30dac2129c05e2db8dc53e082f355857af7"},
+    {file = "tiktoken-0.12.0-cp39-cp39-musllinux_1_2_aarch64.whl", hash = "sha256:20cf97135c9a50de0b157879c3c4accbb29116bcf001283d26e073ff3b345946"},
+    {file = "tiktoken-0.12.0-cp39-cp39-musllinux_1_2_x86_64.whl", hash = "sha256:15d875454bbaa3728be39880ddd11a5a2a9e548c29418b41e8fd8a767172b5ec"},
+    {file = "tiktoken-0.12.0-cp39-cp39-win_amd64.whl", hash = "sha256:2cff3688ba3c639ebe816f8d58ffbbb0aa7433e23e08ab1cade5d175fc973fb3"},
+    {file = "tiktoken-0.12.0.tar.gz", hash = "sha256:b18ba7ee2b093863978fcb14f74b3707cdc8d4d4d3836853ce7ec60772139931"},
 ]
 
 [package.dependencies]
@@ -8592,6 +8781,28 @@ files = [
 [package.extras]
 test = ["coverage", "pytest", "pytest-cov"]
 
+[[package]]
+name = "universal-pathlib"
+version = "0.3.7"
+description = "pathlib api extended to use fsspec backends"
+optional = false
+python-versions = ">=3.9"
+groups = ["main"]
+files = [
+    {file = "universal_pathlib-0.3.7-py3-none-any.whl", hash = "sha256:fb95117b20b5981f86ef9d887fddbf9c61d3596634ba42cccea444931d87c201"},
+    {file = "universal_pathlib-0.3.7.tar.gz", hash = "sha256:36331056fa59a7d7cd3b61b4045f3a3418f446f23ec1a01d281c4510814b4b05"},
+]
+
+[package.dependencies]
+fsspec = ">=2024.5.0"
+pathlib-abc = ">=0.5.1,<0.6.0"
+
+[package.extras]
+dev = ["adlfs (>=2024)", "cheroot", "fsspec[adl,gcs,github,http,s3,smb,ssh] (>=2024.5.0)", "gcsfs (>=2024.5.0)", "huggingface_hub", "moto[s3,server]", "pyftpdlib", "s3fs (>=2024.5.0)", "typing_extensions ; python_version < \"3.11\"", "webdav4[fsspec]", "wsgidav"]
+dev-third-party = ["pydantic", "pydantic-settings"]
+tests = ["mypy (>=1.10.0)", "packaging", "pydantic (>=2)", "pylint (>=2.17.4)", "pytest (>=8)", "pytest-cov (>=4.1.0)", "pytest-mock (>=3.12.0)", "pytest-mypy-plugins (>=3.1.2)", "pytest-sugar (>=0.9.7)"]
+typechecking = ["mypy (>=1.10.0)", "pytest-mypy-plugins (>=3.1.2)"]
+
 [[package]]
 name = "urllib3"
 version = "2.5.0"
@@ -9284,4 +9495,4 @@ cffi = ["cffi (>=1.17,<2.0) ; platform_python_implementation != \"PyPy\" and pyt
 [metadata]
 lock-version = "2.1"
 python-versions = ">=3.10, <3.13"
-content-hash = "2dbaf7bb11506a213ffe1235df6de7ad4a1943ff5a1ad29e98c9c108d3210230"
+content-hash = "2ca08429df55e63e3001da780f9032e2da40b906d8092b2bb6e97e5e44b09d34"
diff --git a/pyproject.toml b/pyproject.toml
index f0337423..2d3b3ab1 100644
--- a/pyproject.toml
+++ b/pyproject.toml
@@ -13,7 +13,7 @@ dependencies = [
     "datasets>=3.2.0",
     "google-cloud-storage>=3.0.0",
     "hydra-core>=1.3.2",
-    "inspect-ai>=0.3.80",
+    "inspect-ai>=0.3.159",
     "langchain_openai>=0.3.6",
     "langchain>=0.3.19",
     "matplotlib>=3.10.0",
@@ -177,6 +177,8 @@ env = [
 filterwarnings = [
     "ignore::DeprecationWarning",
 ]
+# Exclude legacy tests (imports broke when the code was moved). NOTE: setting this replaces pytest's default norecursedirs.
+norecursedirs = ["legacy"]
 
 [tool.coverage]
     [tool.coverage.run]
diff --git a/src/cfg/run_cfg.yaml b/src/cfg/run_cfg.yaml
index 7924a76d..d7de581b 100644
--- a/src/cfg/run_cfg.yaml
+++ b/src/cfg/run_cfg.yaml
@@ -1,3 +1,22 @@
+# =============================================================================
+# EXPERIMENT CONFIGURATION
+# =============================================================================
+
+exp_cfg:
+  exp_id: "test_exp"
+  seed: 37
+  trial_run: false
+
+global_cfg:
+  domain: personal finance
+  output_dir: base_output/
+  pipeline_type: base
+
+# =============================================================================
+# GENERATION PIPELINE
+# =============================================================================
+
+# LLM for generation stages (1-5)
 scientist_llm:
   name: o4-mini
   provider: openai
@@ -14,126 +33,64 @@ scientist_llm:
       temperature: 0.7
       max_tokens: 2048
       seed: 42
-    judge_llm:
-      temperature: 1.0
-      max_tokens: 2048
-      seed: 42
     task_verify:
       temperature: 0.7
       max_tokens: 2048
       seed: 42
-  local_launch_cfg:
-    # Number of threads to use for local LLM
-    max_num_seqs: 1
-    # Type of GPU to use for local LLM
-    partition: "a40"
-    # QoS for local LLM
-    qos: "m2"
-    # Time limit for local LLM
-    time: "01:00:00"
-
-subject_llm:
-  name: o1-mini
-  provider: openai
-  generation_cfg:
-    temperature: 0.7
-    max_tokens: 2048
-    seed: 42
-  local_launch_cfg:
-    # Type of GPU to use for local LLM
-    partition: "a100"
-    # Number of nodes to use for local LLM
-    num_nodes: 1
-    # Number of GPUs to use for local LLM
-    gpus_per_node: 4
-    # QoS for local LLM
-    qos: "deadline"
-    # Account for local LLM
-    account: "deadline"
-    # Time limit for local LLM
-    time: "10:00:00"
-    # vLLM args
-    vllm_args: "--max-model-len=8192,--max-num-seqs=50,--compilation-config=0,--tensor-parallel-size=4,--pipeline-parallel-size=1"
 
-prompt_cfg:
-  sys_msg: Complete the given task to the best of your ability.
-
-# Diverse task generation configuration (Stage 3)
-task_generation_cfg:
-  tasks_per_blueprint: 1  # Number of tasks to generate per blueprint
-  min_subtopics: 1  # Suggested minimum number of sub-topics
-  max_subtopics: 1  # Suggested maximum number of sub-topics
+# Stage control
+stage: "all"  # Which stage to run: 0, 1, 2, 3, 4, 5, or "all"
 
-# Task verification configuration (Stage 5)
-task_verification_cfg:
-  pass_threshold: 0.8  # Minimum pass rate to consider successful
-  strict_mode: false  # If true, all alignment criteria must pass
+# Stage tags (for running individual stages or resuming)
+areas_tag: null           # Stage 1 output tag (required for stage 2 standalone)
+capabilities_tag: null    # Stage 2 output tag (required for stage 3 standalone)
+tasks_tag: null           # Stage 3 output tag (required for stage 4 standalone)
+solution_tag: null        # Stage 4 output tag (required for stage 5 standalone)
+validation_tag: null      # Stage 5 output tag (required for eval pipeline)
+eval_tag: null            # Eval Stage 1 output tag (required for eval stage 2, optional for stage 1 resume)
 
-# Area generation configuration (Stage 1)
+# Stage 1: Area generation
 areas_cfg:
-  num_areas: 2  # Number of areas to generate
+  num_areas: 2
 
-# Capability generation configuration (Stage 2)
+# Stage 2: Capability generation
 capabilities_cfg:
-  capabilities_dir: ./ace-output/
-  results_dir: gs://ace-artifacts
-  inspect_evals_dir: /fs01/projects/aieng/public/ace/inspect_evals/src/ace_evals
-  num_seed_capabilities: 1
   num_capabilities: 4
-  num_capabilities_buffer: 0.5  # Raised from 0.1 to compensate for filtering
-  num_gen_capabilities_per_run: 1  # Raised from 1 for more diversity per batch
-  num_gen_tasks_per_capability: 100
-  num_gen_tasks_buffer: 0.0
-  task_gen_few_shot: false
-  task_gen_prompt_version: "v1"
-  num_eval_tasks_per_capability: 2
-  capabilities_gen_retry_attempts: 5
-  tasks_gen_retry_attempts: 3
-  concurrency_task_solver: 2
-  concurrency_task_verifier: 2
-  concurrency_task_eval: 2
-  inspect_eval_log_level: "info"
-
-lbo_cfg:
-  num_lbo_runs: 2
-  pipeline_id: "no_discovery"
-  train_frac: 0.5
-  num_initial_train: 2
-  acquisition_function: "variance"
+  num_capabilities_buffer: 0.1
+  num_gen_capabilities_per_run: 1
 
+# Embedding config (used for capability filtering in Stage 2)
 embedding_cfg:
-  embedding_model: "text-embedding-3-small"
-  embedding_size: 256
-  filtering_similarity_threshold: 0.85  # Raised from 0.7 to keep more diverse capabilities
-
-dimensionality_reduction_cfg:
-  reduce_dimensionality_method: "pca"
-  reduced_dimensionality_size: 2
-  no_discovery_reduced_dimensionality_method: "pca"
-  no_discovery_reduced_dimensionality_size: 2
+  embedding_model: text-embedding-3-small
+  embedding_size: 1536
+  filtering_similarity_threshold: 0.85
 
-exp_cfg:
-  seed: 37
-  trial_run: false
-  exp_id: "test_exp"
+# Stage 3: Task generation
+task_generation_cfg:
+  tasks_per_blueprint: 1
+  min_subtopics: 1
+  max_subtopics: 1
 
-# Stage control
-stage: "all"  # Which stage to run: 0, 1, 2, 3, 4, 5, or "all"
-areas_tag: null  # Areas tag from Stage 1 (required for stage 2 standalone)
-capabilities_tag: null  # Capabilities tag from Stage 2 (required for stage 3 standalone)
-tasks_tag: null  # Tasks tag from Stage 3 (required for stage 4 standalone)
-solution_tag: null  # Solution tag from Stage 4 (required for stage 5 standalone)
-validation_tag: null  # Validation tag from Stage 5 (optional for resume)
+# =============================================================================
+# EVALUATION PIPELINE
+# =============================================================================
 
-# Debug settings
-use_langchain: false  # Set to false for easier debugging (disables LangChain features)
+eval_cfg:
+  # LLMs to evaluate (required)
+  subject_llms:
+    - name: gpt-4o
+      provider: openai
+    - name: claude-3-sonnet
+      provider: anthropic
 
-# Global configuration
+  # Judge LLM for scoring (required)
+  judge_llm:
+    name: gpt-4o-mini
+    provider: openai
 
-global_cfg:
-  domain: personal finance
-  output_dir: base_output/ #Base output directory for all agentic outputs
-  pipeline_type: base
+# =============================================================================
+# HYDRA
+# =============================================================================
 
 defaults:
   - _self_
diff --git a/src/eval_stages/__init__.py b/src/eval_stages/__init__.py
new file mode 100644
index 00000000..ff7eaa4c
--- /dev/null
+++ b/src/eval_stages/__init__.py
@@ -0,0 +1,18 @@
+"""Evaluation pipeline stages.
+
+Stage 0: Setup and Dataset Preparation (no LLM calls)
+Stage 1: Evaluation Execution (runs subject LLMs, creates eval_tag)
+Stage 2: Score Aggregation (no LLM calls)
+"""
+
+from src.eval_stages.stage0_setup_and_dataset import EvalSetupError, run_eval_stage0
+from src.eval_stages.stage1_eval_execution import run_eval_stage1
+from src.eval_stages.stage2_score_aggregation import run_eval_stage2
+
+
+__all__ = [
+    "run_eval_stage0",
+    "run_eval_stage1",
+    "run_eval_stage2",
+    "EvalSetupError",
+]
diff --git a/src/eval_stages/prompts.py b/src/eval_stages/prompts.py
new file mode 100644
index 00000000..e58f606e
--- /dev/null
+++ b/src/eval_stages/prompts.py
@@ -0,0 +1,9 @@
+"""Prompts for evaluation pipeline stages."""
+
+# Default prompt template for Inspect AI evaluation
+# Used in Eval Stage 0 (Setup and Dataset Preparation) when creating EvalDataset
+DEFAULT_EVAL_PROMPT_TEMPLATE = """You are an expert. Solve the following problem.
+
+Problem: {input}
+
+Provide your final answer."""
diff --git a/src/eval_stages/stage0_setup_and_dataset.py b/src/eval_stages/stage0_setup_and_dataset.py
new file mode 100644
index 00000000..9d166bd6
--- /dev/null
+++ b/src/eval_stages/stage0_setup_and_dataset.py
@@ -0,0 +1,269 @@
+"""Eval Stage 0: Setup and Dataset Preparation.
+
+This stage:
+1. Validates that required generation outputs exist
+2. Converts validated tasks to Inspect-compatible format
+
+No LLM calls; this is a deterministic transformation. Datasets are saved under
+eval/datasets/<validation_tag>/ since they are tied to the validation source.
+"""
+
+import json
+import logging
+from collections import defaultdict
+from pathlib import Path
+from typing import Any, Dict, List, Tuple
+
+from omegaconf import DictConfig
+
+from src.eval_stages.prompts import DEFAULT_EVAL_PROMPT_TEMPLATE
+from src.schemas.eval_io_utils import save_eval_config, save_eval_dataset
+from src.schemas.eval_schemas import EvalConfig, EvalDataset
+from src.schemas.metadata_schemas import PipelineMetadata
+from src.schemas.validation_schemas import ValidationResult
+from src.utils.timestamp_utils import iso_timestamp
+
+
+logger = logging.getLogger(__name__)
+
+
+class EvalSetupError(Exception):
+    """Error during evaluation setup."""
+
+    pass
+
+
+def _validate_inputs(
+    experiment_dir: Path,
+    validation_tag: str,
+    eval_cfg: Dict[str, Any],
+) -> None:
+    """Validate all required inputs exist.
+
+    Args:
+        experiment_dir: Path to experiment directory
+        validation_tag: Tag from generation Stage 5
+        eval_cfg: Evaluation config section
+
+    Raises
+    ------
+        EvalSetupError: If validation fails
+    """
+    # Check experiment.json exists
+    experiment_json = experiment_dir / "experiment.json"
+    if not experiment_json.exists():
+        raise EvalSetupError(f"Experiment file not found: {experiment_json}")
+
+    # Check validation directory exists
+    validation_dir = experiment_dir / "validation" / validation_tag
+    if not validation_dir.exists():
+        raise EvalSetupError(f"Validation directory not found: {validation_dir}")
+
+    # Check validation files exist
+    validation_files = list(validation_dir.rglob("*.json"))
+    if not validation_files:
+        raise EvalSetupError(f"No validation files found in: {validation_dir}")
+
+    # Check subject_llms configured
+    if not eval_cfg.get("subject_llms"):
+        raise EvalSetupError("subject_llms must be specified in eval_cfg")
+
+    # Check judge_llm configured
+    if not eval_cfg.get("judge_llm"):
+        raise EvalSetupError("judge_llm must be specified in eval_cfg")
+
+
+def _find_validated_tasks(
+    experiment_dir: Path, validation_tag: str
+) -> List[Tuple[Path, ValidationResult]]:
+    """Find all validated tasks (verification=true) for a given tag.
+
+    Args:
+        experiment_dir: Path to experiment directory
+        validation_tag: Tag from generation Stage 5
+
+    Returns
+    -------
+        List of (file_path, ValidationResult) tuples for verified tasks
+    """
+    validation_dir = experiment_dir / "validation" / validation_tag
+
+    validated_tasks = []
+    for vf in validation_dir.rglob("*.json"):
+        with open(vf, "r", encoding="utf-8") as f:
+            data = json.load(f)
+
+        # Skip metadata-only files
+        if "verification" not in data:
+            continue
+
+        # Only include verified tasks
+        if data.get("verification", False):
+            validation_result = ValidationResult.from_dict(data)
+            validated_tasks.append((vf, validation_result))
+
+    return validated_tasks
+
+
+def _group_by_capability(
+    validated_tasks: List[Tuple[Path, ValidationResult]],
+) -> Dict[Tuple[str, str], List[ValidationResult]]:
+    """Group validated tasks by capability.
+
+    Args:
+        validated_tasks: List of (file_path, ValidationResult) tuples
+
+    Returns
+    -------
+        Dict mapping (area_id, capability_id) to list of ValidationResults
+    """
+    grouped = defaultdict(list)
+    for _, validation in validated_tasks:
+        task_solution = validation.task_solution
+        area_id = task_solution.task.capability.area.area_id
+        cap_id = task_solution.task.capability.capability_id
+        grouped[(area_id, cap_id)].append(validation)
+    return grouped
+
+
+def _create_eval_dataset(
+    area_id: str,
+    capability_id: str,
+    validations: List[ValidationResult],
+    prompt_template: str = DEFAULT_EVAL_PROMPT_TEMPLATE,
+) -> EvalDataset:
+    """Create EvalDataset from validated tasks.
+
+    Args:
+        area_id: Area identifier
+        capability_id: Capability identifier
+        validations: List of ValidationResults for this capability
+        prompt_template: Template for formatting task prompts
+
+    Returns
+    -------
+        EvalDataset dataclass
+    """
+    # Get capability info from first validation
+    first = validations[0]
+    capability = first.task_solution.task.capability
+
+    # Build tasks list
+    tasks: List[Dict[str, str]] = []
+    for v in validations:
+        ts = v.task_solution
+        tasks.append(
+            {
+                "id": ts.task_id,
+                "input": ts.task_statement,
+                "target": ts.solution,
+            }
+        )
+
+    return EvalDataset(
+        area_id=area_id,
+        capability_id=capability_id,
+        capability_name=capability.capability_name,
+        domain=capability.area.domain.domain_name,
+        tasks=tasks,
+        num_tasks=len(tasks),
+        prompt_template=prompt_template,
+    )
+
+
+def run_eval_stage0(
+    cfg: DictConfig,
+    validation_tag: str,
+) -> None:
+    """Eval Stage 0: Setup and Dataset Preparation.
+
+    Validates inputs and creates datasets for evaluation.
+    Saves eval_config.json for Stage 1 to read.
+
+    Args:
+        cfg: Configuration object
+        validation_tag: Tag from generation Stage 5 (required)
+
+    Raises
+    ------
+        EvalSetupError: If validation fails
+    """
+    # Get experiment info from config
+    exp_id = cfg.exp_cfg.exp_id
+    output_base_dir = Path(cfg.global_cfg.output_dir)
+    experiment_dir = output_base_dir / exp_id
+    eval_cfg = cfg.get("eval_cfg", {})
+
+    logger.info(
+        "Eval Stage 0: exp_id=%s | validation_tag=%s",
+        exp_id,
+        validation_tag,
+    )
+
+    # Validate all inputs
+    _validate_inputs(experiment_dir, validation_tag, eval_cfg)
+    logger.info("Validation checks passed")
+
+    # Find all validated tasks
+    validated_tasks = _find_validated_tasks(experiment_dir, validation_tag)
+    logger.info("Found %d validated tasks", len(validated_tasks))
+
+    if not validated_tasks:
+        raise EvalSetupError(
+            f"No validated tasks (verification=true) found in: {validation_tag}"
+        )
+
+    # Group by capability
+    grouped = _group_by_capability(validated_tasks)
+    logger.info("Found %d capabilities with validated tasks", len(grouped))
+
+    # Create and save datasets (tied to validation_tag, not eval_tag)
+    datasets_dir = experiment_dir / "eval" / "datasets" / validation_tag
+    num_created = 0
+
+    for (area_id, cap_id), validations in grouped.items():
+        # Check if dataset already exists (idempotent)
+        dataset_path = datasets_dir / area_id / cap_id / "dataset.json"
+        if dataset_path.exists():
+            logger.info("  Skipping %s/%s (already exists)", area_id, cap_id)
+            continue
+
+        # Create dataset
+        dataset = _create_eval_dataset(area_id, cap_id, validations)
+
+        # Save dataset
+        save_eval_dataset(dataset, dataset_path)
+        logger.info(
+            "  Created dataset for %s/%s (%d tasks)",
+            area_id,
+            cap_id,
+            dataset.num_tasks,
+        )
+        num_created += 1
+
+    # Create and save EvalConfig (eval_tag will be set in Stage 1)
+    eval_config = EvalConfig(
+        experiment_id=exp_id,
+        eval_tag="",  # Will be set in Stage 1
+        subject_llms=eval_cfg.get("subject_llms"),
+        judge_llm=eval_cfg.get("judge_llm"),
+        validation_tag=validation_tag,
+    )
+
+    metadata = PipelineMetadata(
+        experiment_id=exp_id,
+        output_base_dir=str(output_base_dir),
+        timestamp=iso_timestamp(),
+        input_stage_tag=validation_tag,
+        output_stage_tag="",  # No output tag for Stage 0
+        resume=False,
+    )
+
+    eval_config_path = datasets_dir / "eval_config.json"
+    save_eval_config(eval_config, metadata, eval_config_path)
+
+    logger.info(
+        "Eval Stage 0: Created %d datasets, saved eval_config.json to %s",
+        num_created,
+        datasets_dir,
+    )
diff --git a/src/eval_stages/stage1_eval_execution.py b/src/eval_stages/stage1_eval_execution.py
new file mode 100644
index 00000000..a2aa08a1
--- /dev/null
+++ b/src/eval_stages/stage1_eval_execution.py
@@ -0,0 +1,395 @@
+"""Eval Stage 1: Evaluation Execution.
+
+This stage runs Inspect AI evaluation for each capability with each subject LLM.
+Creates a new eval_tag by default, or reuses a provided eval_tag in resume mode.
+
+See: https://inspect.aisi.org.uk/
+"""
+
+import logging
+from pathlib import Path
+from typing import Dict, List, Optional, Set
+
+from inspect_ai import Task
+from inspect_ai import eval as inspect_eval
+from inspect_ai import eval_retry as inspect_eval_retry
+from inspect_ai.dataset import MemoryDataset, Sample
+from inspect_ai.log import read_eval_log
+from inspect_ai.scorer import model_graded_fact
+from inspect_ai.solver import generate
+from omegaconf import DictConfig
+
+from src.schemas.eval_io_utils import (
+    load_eval_config,
+    load_eval_dataset,
+    save_eval_config,
+)
+from src.schemas.eval_schemas import EvalDataset
+from src.schemas.metadata_schemas import PipelineMetadata
+from src.utils.timestamp_utils import iso_timestamp, timestamp_tag
+
+
+logger = logging.getLogger(__name__)
+
+
+def _find_datasets(datasets_dir: Path) -> List[Path]:
+    """Collect every ``dataset.json`` found anywhere below ``datasets_dir``.
+
+    The result is sorted so callers see a stable order. A directory that does
+    not exist yields an empty list.
+    """
+    found: List[Path] = []
+    if datasets_dir.exists():
+        found.extend(datasets_dir.rglob("dataset.json"))
+        found.sort()
+    return found
+
+
+def _find_inspect_logs(result_dir: Path) -> List[Path]:
+    """List the ``*.json`` files directly inside ``result_dir``, sorted.
+
+    Only the top level of the directory is searched; sub-directories are not
+    descended into.
+    """
+    json_logs = list(result_dir.glob("*.json"))
+    json_logs.sort()
+    return json_logs
+
+
+def _score_value_to_float(value: object) -> Optional[float]:
+    """Convert an Inspect score value to float when possible.
+
+    Numbers are returned as floats. Strings are first matched, ignoring case
+    and surrounding whitespace, against Inspect's letter grades ("C" correct,
+    "I" incorrect, "P" partial, "N" no answer) and otherwise parsed as a
+    numeric literal.
+
+    Args:
+        value: Raw score value taken from an Inspect log sample.
+
+    Returns:
+        The numeric score, or ``None`` if the value cannot be interpreted.
+    """
+    if isinstance(value, (int, float)):
+        return float(value)
+
+    if isinstance(value, str):
+        # Grade letters and weights follow Inspect's standard value-to-float
+        # conversion, so partial-credit and no-answer grades count as scored.
+        letter_grades = {"C": 1.0, "I": 0.0, "P": 0.5, "N": 0.0}
+        text = value.strip()
+        grade = letter_grades.get(text.upper())
+        if grade is not None:
+            return grade
+        try:
+            return float(text)
+        except ValueError:
+            return None
+
+    return None
+
+
+def _scored_sample_ids_from_log(log: object) -> Set[str]:
+    """Collect the IDs of samples that carry at least one usable score.
+
+    A sample counts as scored when any of its score objects has a ``value``
+    that ``_score_value_to_float`` can convert. Samples with an empty ID or
+    without scores are ignored, as is a log without samples.
+    """
+    scored: Set[str] = set()
+    for entry in getattr(log, "samples", None) or ():
+        entry_id = str(getattr(entry, "id", ""))
+        entry_scores = getattr(entry, "scores", None)
+        if entry_id and entry_scores:
+            has_usable_score = any(
+                _score_value_to_float(getattr(candidate, "value", None)) is not None
+                for candidate in entry_scores.values()
+            )
+            if has_usable_score:
+                scored.add(entry_id)
+    return scored
+
+
+def _scored_sample_ids(log_file: Path) -> Set[str]:
+    """Return sample IDs with at least one interpretable score.
+
+    Reading is best-effort: a file that cannot be parsed as an Inspect log is
+    reported with a warning and treated as containing no scored samples.
+
+    Args:
+        log_file: Path of the Inspect log file to read.
+
+    Returns:
+        IDs of the scored samples; empty if the log cannot be read.
+    """
+    try:
+        log = read_eval_log(str(log_file))
+    except Exception as e:
+        # Keep the "unreadable means nothing scored" contract, but leave a
+        # trace so a corrupt log is not mistaken for an empty one.
+        logger.warning("Failed to read Inspect log %s: %s", log_file, e)
+        return set()
+    return _scored_sample_ids_from_log(log)
+
+
+def _check_eval_completed(
+    results_dir: Path,
+    subject_llm: str,
+    area_id: str,
+    capability_id: str,
+    expected_task_ids: Set[str],
+) -> bool:
+    """Report whether some log already scores exactly the expected tasks.
+
+    Looks in ``results_dir/<subject_llm>/<area_id>/<capability_id>`` and
+    returns True as soon as one log's scored sample IDs equal
+    ``expected_task_ids``. An empty expectation or a missing directory is
+    never considered complete.
+    """
+    if not expected_task_ids:
+        return False
+
+    capability_dir = results_dir / subject_llm / area_id / capability_id
+    if not capability_dir.exists():
+        return False
+
+    return any(
+        _scored_sample_ids(candidate) == expected_task_ids
+        for candidate in _find_inspect_logs(capability_dir)
+    )
+
+
+def _find_retry_log(
+    result_dir: Path,
+    expected_task_ids: Set[str],
+) -> Optional[Path]:
+    """Pick the failed or incomplete log that is the best starting point for a retry.
+
+    Unreadable logs and logs that already score exactly the expected tasks
+    are ignored. A remaining log qualifies when it is invalidated or its
+    status is "started", "error" or "cancelled". Among qualifying logs the one
+    covering the most expected tasks wins; ties go to the newest modification
+    time and then to the file name. Returns None if nothing qualifies.
+    """
+    if not result_dir.exists():
+        return None
+
+    retryable_statuses = {"started", "error", "cancelled"}
+    retryable: List[tuple[Path, int]] = []
+    for path in _find_inspect_logs(result_dir):
+        try:
+            parsed = read_eval_log(str(path))
+        except Exception:
+            continue
+
+        done_ids = _scored_sample_ids_from_log(parsed)
+        if done_ids == expected_task_ids:
+            continue
+
+        run_status = str(getattr(parsed, "status", "")).lower()
+        was_invalidated = bool(getattr(parsed, "invalidated", False))
+        if was_invalidated or run_status in retryable_statuses:
+            retryable.append((path, len(done_ids & expected_task_ids)))
+
+    if not retryable:
+        return None
+
+    def _preference(entry):
+        # Rank by coverage of expected tasks, then recency, then file name.
+        path, overlap = entry
+        return (overlap, path.stat().st_mtime, path.name)
+
+    return max(retryable, key=_preference)[0]
+
+
+def _create_inspect_task(
+    dataset: EvalDataset,
+    judge_model: str,
+) -> "Task":
+    """Wrap one capability dataset in an Inspect ``Task``.
+
+    Each entry of ``dataset.tasks`` becomes a ``Sample`` built from its
+    "input", "target" and "id" keys. The task uses the plain ``generate()``
+    solver and a ``model_graded_fact`` scorer driven by ``judge_model``.
+    """
+    inspect_samples = []
+    for item in dataset.tasks:
+        inspect_samples.append(
+            Sample(input=item["input"], target=item["target"], id=item["id"])
+        )
+
+    return Task(
+        dataset=MemoryDataset(inspect_samples),
+        solver=generate(),
+        scorer=model_graded_fact(model=judge_model),
+    )
+
+
+def _run_inspect_eval(
+    dataset: EvalDataset,
+    subject_llm: str,
+    judge_llm: Dict[str, str],
+    output_dir: Path,
+) -> bool:
+    """Start a fresh Inspect evaluation for one capability/LLM pair.
+
+    The judge is addressed as ``<provider>/<name>``. Logs are written to
+    ``output_dir`` (created on demand) in JSON format. Returns True when
+    Inspect returned without raising and False after logging any exception;
+    True does not by itself mean that every task was scored.
+    """
+    provider, model_name = judge_llm["provider"], judge_llm["name"]
+    judge_model = f"{provider}/{model_name}"
+
+    try:
+        inspect_task = _create_inspect_task(dataset, judge_model)
+        # Inspect writes its logs into this directory.
+        output_dir.mkdir(parents=True, exist_ok=True)
+        inspect_eval(
+            inspect_task,
+            model=subject_llm,
+            log_dir=str(output_dir),
+            log_format="json",
+        )
+    except Exception as e:
+        logger.error(
+            "Inspect evaluation failed for %s/%s with %s: %s",
+            dataset.area_id,
+            dataset.capability_id,
+            subject_llm,
+            e,
+        )
+        return False
+
+    return True
+
+
+def _run_inspect_retry(
+    retry_log_path: Path,
+    output_dir: Path,
+) -> bool:
+    """Continue a previous run by handing its log to Inspect's ``eval_retry``.
+
+    New logs are written to ``output_dir`` (created on demand) in JSON format.
+    Returns True when the retry call returned without raising and False after
+    logging any exception.
+    """
+    try:
+        output_dir.mkdir(parents=True, exist_ok=True)
+        inspect_eval_retry(
+            str(retry_log_path),
+            log_dir=str(output_dir),
+            log_format="json",
+        )
+    except Exception as e:
+        logger.error("Inspect eval_retry failed for %s: %s", retry_log_path, e)
+        return False
+
+    return True
+
+
+def run_eval_stage1(
+    cfg: DictConfig,
+    validation_tag: str,
+    eval_tag: Optional[str] = None,
+) -> str:
+    """Run Stage 1 evals and return the eval tag.
+
+    Every ``dataset.json`` found under the Stage 0 directory for
+    ``validation_tag`` is evaluated with every subject LLM listed in the
+    Stage 0 ``eval_config.json``. Logs for one pair are written below
+    ``<output_dir>/<exp_id>/eval/results/<eval_tag>/`` in
+    ``<llm_name>/<area_id>/<capability_id>/``.
+
+    Args:
+        cfg: Run configuration; ``cfg.exp_cfg.exp_id`` and
+            ``cfg.global_cfg.output_dir`` are read to derive all paths.
+        validation_tag: Tag of the Stage 0 datasets directory to evaluate.
+        eval_tag: Existing tag to resume. When ``None`` a new tag is created
+            with ``timestamp_tag()`` and the run is not treated as a resume.
+
+    Returns:
+        The eval tag used for this run.
+
+    Raises:
+        ValueError: If the Stage 0 ``eval_config.json`` does not exist or no
+            datasets are found.
+    """
+    # Derive paths from config
+    exp_id = cfg.exp_cfg.exp_id
+    output_base_dir = Path(cfg.global_cfg.output_dir)
+    experiment_dir = output_base_dir / exp_id
+
+    # Load eval_config from Stage 0
+    datasets_dir = experiment_dir / "eval" / "datasets" / validation_tag
+    eval_config_path = datasets_dir / "eval_config.json"
+    if not eval_config_path.exists():
+        raise ValueError(
+            f"eval_config.json not found at {eval_config_path}. Run Stage 0 first."
+        )
+    eval_config, _ = load_eval_config(eval_config_path)
+
+    # Create eval_tag for this run (or reuse existing one for resume)
+    # Resume mode is decided solely by whether the caller passed a tag.
+    is_resume = eval_tag is not None
+    if eval_tag is None:
+        eval_tag = timestamp_tag()
+
+    logger.info(
+        "Eval Stage 1: Running evaluations (eval_tag=%s, resume=%s)",
+        eval_tag,
+        is_resume,
+    )
+
+    # Find datasets (saved under validation_tag from Stage 0)
+    dataset_paths = _find_datasets(datasets_dir)
+    logger.info("Found %d datasets", len(dataset_paths))
+
+    if not dataset_paths:
+        raise ValueError(f"No datasets found in {datasets_dir}. Run Stage 0 first.")
+
+    # Load datasets
+    datasets = [load_eval_dataset(p) for p in dataset_paths]
+
+    # Setup results directory under eval_tag
+    # results_dir is an alias of eval_dir: the saved config and all logs share
+    # this root.
+    eval_dir = experiment_dir / "eval" / "results" / eval_tag
+    results_dir = eval_dir
+
+    # Update eval_config with the tag and save it to results dir
+    # This runs on every invocation, including resumes.
+    eval_config.eval_tag = eval_tag
+    metadata = PipelineMetadata(
+        experiment_id=exp_id,
+        output_base_dir=str(output_base_dir),
+        timestamp=iso_timestamp(),
+        input_stage_tag=validation_tag,
+        output_stage_tag=eval_tag,
+        resume=is_resume,
+    )
+    results_config_path = eval_dir / "eval_config.json"
+    save_eval_config(eval_config, metadata, results_config_path)
+    logger.info("Saved eval_config.json to %s", results_config_path)
+
+    # Run evaluations
+    subject_llms = eval_config.subject_llms
+    judge_llm = eval_config.judge_llm
+
+    # Per-run counters reported in the summary. "resumed" counts retry
+    # attempts and therefore overlaps with completed/failed/incomplete.
+    num_completed_this_run = 0
+    num_skipped_completed = 0
+    num_failed = 0
+    num_incomplete = 0
+    num_resumed = 0
+    total_combinations = len(datasets) * len(subject_llms)
+
+    for dataset in datasets:
+        # Task IDs are compared as strings against the IDs read from the logs.
+        expected_task_ids = {str(task["id"]) for task in dataset.tasks}
+        for llm_config in subject_llms:
+            llm_name = llm_config["name"]
+            # Construct full model string: provider/model_name
+            subject_model = f"{llm_config['provider']}/{llm_name}"
+
+            # Check if already completed (resume)
+            # The check also runs for a fresh tag, where no logs exist yet.
+            if _check_eval_completed(
+                results_dir,
+                llm_name,
+                dataset.area_id,
+                dataset.capability_id,
+                expected_task_ids,
+            ):
+                logger.info(
+                    "  Skipping %s/%s with %s (already completed)",
+                    dataset.area_id,
+                    dataset.capability_id,
+                    llm_name,
+                )
+                num_skipped_completed += 1
+                continue
+
+            # Run evaluation
+            # Result directories are keyed by the bare model name (no provider).
+            output_dir = (
+                results_dir / llm_name / dataset.area_id / dataset.capability_id
+            )
+
+            # Only a resumed run looks for an earlier log to retry; otherwise
+            # (or when none qualifies) a fresh evaluation is started.
+            retry_log = (
+                _find_retry_log(output_dir, expected_task_ids) if is_resume else None
+            )
+            if retry_log is not None:
+                logger.info(
+                    "  Resuming %s/%s with %s from %s",
+                    dataset.area_id,
+                    dataset.capability_id,
+                    subject_model,
+                    retry_log.name,
+                )
+                success = _run_inspect_retry(
+                    retry_log_path=retry_log,
+                    output_dir=output_dir,
+                )
+                # Counted once the retry was attempted, whatever its outcome.
+                num_resumed += 1
+            else:
+                logger.info(
+                    "  Evaluating %s/%s with %s",
+                    dataset.area_id,
+                    dataset.capability_id,
+                    subject_model,
+                )
+
+                success = _run_inspect_eval(
+                    dataset=dataset,
+                    subject_llm=subject_model,
+                    judge_llm=judge_llm,
+                    output_dir=output_dir,
+                )
+
+            # A run that returned without raising is still verified against the
+            # expected task IDs before it is counted as completed.
+            if success:
+                if _check_eval_completed(
+                    results_dir,
+                    llm_name,
+                    dataset.area_id,
+                    dataset.capability_id,
+                    expected_task_ids,
+                ):
+                    num_completed_this_run += 1
+                else:
+                    logger.warning(
+                        "  Incomplete evaluation output for %s/%s with %s "
+                        "(task IDs mismatch: missing or extra scored tasks)",
+                        dataset.area_id,
+                        dataset.capability_id,
+                        llm_name,
+                    )
+                    num_incomplete += 1
+            else:
+                num_failed += 1
+
+    logger.info(
+        "Eval Stage 1 summary: completed_this_run=%d skipped_completed=%d "
+        "resumed=%d failed=%d incomplete=%d total=%d",
+        num_completed_this_run,
+        num_skipped_completed,
+        num_resumed,
+        num_failed,
+        num_incomplete,
+        total_combinations,
+    )
+
+    return eval_tag
diff --git a/src/eval_stages/stage2_score_aggregation.py b/src/eval_stages/stage2_score_aggregation.py
new file mode 100644
index 00000000..e855dc17
--- /dev/null
+++ b/src/eval_stages/stage2_score_aggregation.py
@@ -0,0 +1,284 @@
+"""Eval Stage 2: Score Aggregation.
+
+This stage computes final capability scores from raw Inspect results.
+No LLM calls, just aggregation of results from Stage 1.
+
+See: https://inspect.aisi.org.uk/
+"""
+
+import logging
+import math
+from pathlib import Path
+from typing import Any, Dict, List, Optional, Set, Tuple
+
+from inspect_ai.log import read_eval_log
+from omegaconf import DictConfig
+
+from src.schemas.eval_io_utils import (
+    load_eval_config,
+    load_eval_dataset,
+    save_capability_scores,
+)
+from src.schemas.eval_schemas import CapabilityScore
+
+
+logger = logging.getLogger(__name__)
+
+
+def _find_result_dirs(results_dir: Path, subject_llm: str) -> List[Path]:
+    """List the ``<area_id>/<capability_id>`` directories of one subject model.
+
+    Areas and capabilities are each visited in sorted order and plain files
+    at either level are ignored. A model without a results directory yields
+    an empty list.
+    """
+    model_root = results_dir / subject_llm
+    if not model_root.exists():
+        return []
+
+    return [
+        capability_dir
+        for area_dir in sorted(model_root.iterdir())
+        if area_dir.is_dir()
+        for capability_dir in sorted(area_dir.iterdir())
+        if capability_dir.is_dir()
+    ]
+
+
+def _find_inspect_logs(result_dir: Path) -> List[Path]:
+    """Return the sorted ``*.json`` files at the top level of ``result_dir``."""
+    candidates = [path for path in result_dir.glob("*.json")]
+    candidates.sort()
+    return candidates
+
+
+def _compute_stats(scores: List[float]) -> Dict[str, Any]:
+    """Summarise scores as mean, standard error of the mean and count.
+
+    The standard error uses the sample standard deviation (``n - 1``
+    denominator) and is 0.0 for a single score. An empty list yields zeros
+    for all three fields.
+    """
+    if not scores:
+        return {"mean": 0.0, "std_err": 0.0, "num_tasks": 0}
+
+    count = len(scores)
+    average = sum(scores) / count
+
+    spread = 0.0
+    if count > 1:
+        squared_deviations = sum((value - average) ** 2 for value in scores)
+        sample_std = math.sqrt(squared_deviations / (count - 1))
+        spread = sample_std / math.sqrt(count)
+
+    return {"mean": average, "std_err": spread, "num_tasks": count}
+
+
+def _score_value_to_float(value: object) -> Optional[float]:
+    """Convert a score value to float when possible.
+
+    Numbers are returned as floats. Strings are first matched, ignoring case
+    and surrounding whitespace, against Inspect's letter grades ("C" correct,
+    "I" incorrect, "P" partial, "N" no answer) and otherwise parsed as a
+    numeric literal.
+
+    Args:
+        value: Raw score value taken from an Inspect log sample.
+
+    Returns:
+        The numeric score, or ``None`` if the value cannot be interpreted.
+    """
+    if isinstance(value, (int, float)):
+        return float(value)
+
+    if isinstance(value, str):
+        # Grade letters and weights follow Inspect's standard value-to-float
+        # conversion, so partial-credit and no-answer grades are aggregated
+        # instead of being dropped as unscored.
+        letter_grades = {"C": 1.0, "I": 0.0, "P": 0.5, "N": 0.0}
+        text = value.strip()
+        grade = letter_grades.get(text.upper())
+        if grade is not None:
+            return grade
+        try:
+            return float(text)
+        except ValueError:
+            return None
+
+    return None
+
+
+def _extract_scores_from_log(log_file: Path) -> Dict[str, float]:
+    """Read one Inspect log and map each scored sample ID to a single score.
+
+    For every sample the first score whose value converts to a float is
+    used, so a sample graded by several scorers is counted once. Samples
+    with an empty ID, without scores, or without any convertible score are
+    left out.
+    """
+    parsed_log = read_eval_log(str(log_file))
+
+    by_id: Dict[str, float] = {}
+    for entry in parsed_log.samples or ():
+        entry_id = str(getattr(entry, "id", ""))
+        if not (entry_id and entry.scores):
+            continue
+
+        # Lazily convert score values and stop at the first usable one.
+        converted = (
+            _score_value_to_float(getattr(candidate, "value", None))
+            for candidate in entry.scores.values()
+        )
+        first_usable = next((item for item in converted if item is not None), None)
+        if first_usable is not None:
+            by_id[entry_id] = first_usable
+
+    return by_id
+
+
+def _parse_inspect_logs(
+    result_dir: Path, expected_task_ids: Set[str]
+) -> Dict[str, Any]:
+    """Parse logs and return stats for the best-matching log.
+
+    When several logs exist (for example after retries) exactly one is
+    selected, so retried samples are never double-counted. Preference goes to
+    an exact task-ID match, then to the number of expected tasks scored, then
+    to the newest modification time, then to the file name.
+
+    Args:
+        result_dir: Capability result directory containing Inspect JSON logs.
+        expected_task_ids: Task IDs of the capability dataset.
+
+    Returns:
+        Dict with ``mean``, ``std_err`` and ``num_tasks`` computed over the
+        expected tasks scored in the selected log, plus ``exact_match``
+        (whether that log's scored IDs equal ``expected_task_ids``). All-zero
+        stats with ``exact_match`` False are returned when no log is usable.
+    """
+    # Find Inspect log files (.json)
+    log_files = _find_inspect_logs(result_dir)
+
+    if not log_files:
+        logger.warning("No log files found in %s", result_dir)
+        # Same keys as every other return path, so callers may index them.
+        return {"mean": 0.0, "std_err": 0.0, "num_tasks": 0, "exact_match": False}
+
+    # A fixed ID order keeps the float summation order, and therefore the
+    # reported mean/std_err, independent of per-process hash randomization.
+    ordered_task_ids = sorted(expected_task_ids)
+
+    log_scores: List[Tuple[Path, List[float], Set[str]]] = []
+    for log_file in log_files:
+        try:
+            scored_by_id = _extract_scores_from_log(log_file)
+        except Exception as e:
+            logger.warning("Failed to parse log %s: %s", log_file, e)
+            continue
+        matched_scores = [
+            scored_by_id[task_id]
+            for task_id in ordered_task_ids
+            if task_id in scored_by_id
+        ]
+        log_scores.append((log_file, matched_scores, set(scored_by_id)))
+
+    if not log_scores:
+        return {"mean": 0.0, "std_err": 0.0, "num_tasks": 0, "exact_match": False}
+
+    # If multiple logs exist, prefer exact task-id match, then best coverage.
+    # This avoids double-counting retries in the same capability directory.
+    selected_log, selected_scores, selected_ids = max(
+        log_scores,
+        key=lambda x: (
+            x[2] == expected_task_ids,
+            len(x[1]),
+            x[0].stat().st_mtime,
+            x[0].name,
+        ),
+    )
+
+    if len(log_scores) > 1:
+        logger.info(
+            "Multiple logs found in %s; selected %s with %d scored samples",
+            result_dir,
+            selected_log.name,
+            len(selected_scores),
+        )
+
+    stats = _compute_stats(selected_scores)
+    stats["exact_match"] = selected_ids == expected_task_ids
+    return stats
+
+
+def run_eval_stage2(
+    cfg: DictConfig,
+    eval_tag: str,
+) -> str:
+    """Run Stage 2 score aggregation and return eval_tag.
+
+    For every subject LLM in the Stage 1 ``eval_config.json`` the capability
+    result directories are parsed and one ``CapabilityScore`` per capability
+    is saved to
+    ``<output_dir>/<exp_id>/eval/scores/<eval_tag>/<llm_name>/capability_scores.json``.
+
+    Args:
+        cfg: Run configuration; ``cfg.exp_cfg.exp_id`` and
+            ``cfg.global_cfg.output_dir`` are read to derive all paths.
+        eval_tag: Tag of the Stage 1 results directory to aggregate.
+
+    Returns:
+        The ``eval_tag`` that was passed in.
+
+    Raises:
+        ValueError: If the Stage 1 ``eval_config.json`` does not exist or no
+            datasets are found for its ``validation_tag``.
+    """
+    # Derive paths from config
+    exp_id = cfg.exp_cfg.exp_id
+    output_base_dir = Path(cfg.global_cfg.output_dir)
+    experiment_dir = output_base_dir / exp_id
+    results_dir = experiment_dir / "eval" / "results" / eval_tag
+
+    # Load eval config from Stage 1
+    eval_config_path = results_dir / "eval_config.json"
+    if not eval_config_path.exists():
+        raise ValueError(
+            f"eval_config.json not found at {eval_config_path}. Run Stage 1 first."
+        )
+    eval_config, _ = load_eval_config(eval_config_path)
+
+    logger.info("Eval Stage 2: Aggregating scores (eval_tag=%s)", eval_tag)
+
+    # Find datasets (saved under validation_tag)
+    # The validation tag is read from the saved config, not passed by the caller.
+    validation_tag = eval_config.validation_tag
+    datasets_dir = experiment_dir / "eval" / "datasets" / validation_tag
+
+    scores_dir = experiment_dir / "eval" / "scores" / eval_tag
+
+    # Load datasets for capability info
+    dataset_map = {}  # (area_id, cap_id) -> EvalDataset
+    for dataset_path in sorted(datasets_dir.rglob("dataset.json")):
+        dataset = load_eval_dataset(dataset_path)
+        dataset_map[(dataset.area_id, dataset.capability_id)] = dataset
+
+    if not dataset_map:
+        raise ValueError(
+            f"No datasets found in {datasets_dir}. Run Eval Stage 0 first."
+        )
+
+    # Counts only LLMs for which a scores file was written.
+    num_llms_processed = 0
+
+    for llm_config in eval_config.subject_llms:
+        llm_name = llm_config["name"]
+        logger.info("  Processing results for %s", llm_name)
+
+        # Find all result directories for this LLM
+        result_dirs = _find_result_dirs(results_dir, llm_name)
+
+        if not result_dirs:
+            logger.warning("  No results found for %s", llm_name)
+            continue
+
+        capability_scores = []
+
+        for result_dir in result_dirs:
+            # Extract area_id and capability_id from path
+            # (layout: <llm_name>/<area_id>/<capability_id>)
+            cap_id = result_dir.name
+            area_id = result_dir.parent.name
+
+            # Get capability info from dataset
+            cap_dataset = dataset_map.get((area_id, cap_id))
+            if cap_dataset is None:
+                logger.warning(
+                    "  No dataset found for %s/%s, skipping",
+                    area_id,
+                    cap_id,
+                )
+                continue
+
+            # Task IDs are compared as strings against the IDs read from logs.
+            expected_task_ids = {str(task["id"]) for task in cap_dataset.tasks}
+
+            # Parse Inspect logs
+            parsed = _parse_inspect_logs(result_dir, expected_task_ids)
+
+            # Both conditions below only warn; the score is still recorded.
+            if parsed["num_tasks"] < cap_dataset.num_tasks:
+                logger.warning(
+                    "  Incomplete scoring for %s/%s with %s: %d/%d tasks scored",
+                    area_id,
+                    cap_id,
+                    llm_name,
+                    parsed["num_tasks"],
+                    cap_dataset.num_tasks,
+                )
+            # .get() tolerates a result dict that carries no exact_match key.
+            elif not parsed.get("exact_match", False):
+                logger.warning(
+                    "  Task ID mismatch for %s/%s with %s "
+                    "(scored task IDs differ from dataset task IDs)",
+                    area_id,
+                    cap_id,
+                    llm_name,
+                )
+
+            # Create CapabilityScore
+            score = CapabilityScore(
+                area_id=area_id,
+                capability_id=cap_id,
+                capability_name=cap_dataset.capability_name,
+                subject_llm=llm_name,
+                mean=parsed["mean"],
+                std_err=parsed["std_err"],
+                num_tasks=parsed["num_tasks"],
+            )
+            capability_scores.append(score)
+
+        # Stable output order regardless of directory listing order.
+        capability_scores.sort(key=lambda s: (s.area_id, s.capability_id))
+
+        # Save scores for this LLM
+        # Nothing is written when every result directory was skipped.
+        if capability_scores:
+            scores_path = scores_dir / llm_name / "capability_scores.json"
+            save_capability_scores(capability_scores, scores_path)
+            logger.info(
+                "  Saved %d capability scores for %s",
+                len(capability_scores),
+                llm_name,
+            )
+            num_llms_processed += 1
+
+    logger.info(
+        "Eval Stage 2: Aggregated scores for %d LLMs",
+        num_llms_processed,
+    )
+
+    return eval_tag
diff --git a/src/run_eval_pipeline.py b/src/run_eval_pipeline.py
new file mode 100644
index 00000000..541a762e
--- /dev/null
+++ b/src/run_eval_pipeline.py
@@ -0,0 +1,141 @@
+"""Evaluation pipeline for running LLM evaluations on generated tasks.
+
+This module orchestrates the evaluation pipeline:
+- Stage 0: Setup and Dataset Preparation
+- Stage 1: Evaluation Execution (runs subject LLMs, creates eval_tag)
+- Stage 2: Score Aggregation
+
+Usage:
+    # Run all stages
+    python -m src.run_eval_pipeline validation_tag=_YYYYMMDD_HHMMSS
+
+    # Run a specific stage
+    python -m src.run_eval_pipeline stage=0 validation_tag=_YYYYMMDD_HHMMSS
+    python -m src.run_eval_pipeline stage=1 validation_tag=_YYYYMMDD_HHMMSS
+    # Resume Stage 1 under an existing eval_tag
+    python -m src.run_eval_pipeline stage=1 validation_tag=_YYYYMMDD_HHMMSS \
+        eval_tag=_YYYYMMDD_HHMMSS
+    python -m src.run_eval_pipeline stage=2 eval_tag=_YYYYMMDD_HHMMSS
+"""
+
+import logging
+
+import hydra
+from omegaconf import DictConfig
+
+from src.eval_stages import (
+    EvalSetupError,
+    run_eval_stage0,
+    run_eval_stage1,
+    run_eval_stage2,
+)
+
+
+logger = logging.getLogger(__name__)
+
+
+@hydra.main(version_base=None, config_path="cfg", config_name="run_cfg")
+def main(cfg: DictConfig) -> None:
+    """Run the evaluation pipeline.
+
+    Dispatches on ``cfg.stage`` ("all" when unset, or 0, 1, 2). Stages 0 and 1
+    and "all" require ``cfg.validation_tag``; stage 2 requires
+    ``cfg.eval_tag``. Stage 1 creates an eval tag when none is given.
+
+    Missing tags, an invalid stage and the exceptions caught below are
+    reported through the logger; the function then returns normally instead
+    of raising.
+
+    Args:
+        cfg: Configuration object passed in by the ``hydra.main`` decorator.
+    """
+    # Get stage to run (default: "all")
+    stage = cfg.get("stage", "all")
+    # Normalise digit strings such as "0" to int so the comparisons below match.
+    if isinstance(stage, str) and stage.isdigit():
+        stage = int(stage)
+
+    # Get tags from config
+    validation_tag = cfg.get("validation_tag")
+    eval_tag = cfg.get("eval_tag")
+
+    logger.info("=" * 60)
+    logger.info("EVALUATION PIPELINE")
+    logger.info("=" * 60)
+    logger.info("Stage: %s", stage)
+    logger.info("Experiment ID: %s", cfg.exp_cfg.exp_id)
+    logger.info("validation_tag: %s", validation_tag)
+    logger.info("eval_tag: %s", eval_tag)
+    logger.info("=" * 60)
+
+    # Run all stages sequentially
+    if stage == "all":
+        if not validation_tag:
+            logger.error("validation_tag is required")
+            logger.error(
+                "Usage: python -m src.run_eval_pipeline validation_tag=_YYYYMMDD_HHMMSS"
+            )
+            return
+
+        try:
+            # Stage 0: Setup and Dataset Preparation
+            logger.info("Running Eval Stage 0: Setup and Dataset Preparation")
+            run_eval_stage0(cfg, validation_tag)
+            logger.info("Eval Stage 0 complete.")
+
+            # Stage 1: Evaluation Execution
+            # A configured eval_tag is passed through (resume); the tag that
+            # Stage 1 returns is the one handed to Stage 2.
+            logger.info("Running Eval Stage 1: Evaluation Execution")
+            eval_tag = run_eval_stage1(cfg, validation_tag, eval_tag)
+            logger.info("Eval Stage 1 complete. eval_tag=%s", eval_tag)
+
+            # Stage 2: Score Aggregation
+            logger.info("Running Eval Stage 2: Score Aggregation")
+            run_eval_stage2(cfg, eval_tag)
+            logger.info("Eval Stage 2 complete.")
+
+        # Any stage failing with one of these aborts the remaining stages.
+        except EvalSetupError as e:
+            logger.error("Evaluation setup failed: %s", e)
+            return
+        except ValueError as e:
+            logger.error("Evaluation failed: %s", e)
+            return
+
+    # Run specific stage
+    # NOTE(review): the single-stage branches each catch one exception type
+    # (stage 0: EvalSetupError; stages 1 and 2: ValueError), whereas "all"
+    # catches both - confirm this asymmetry is intended.
+    elif stage == 0:
+        if not validation_tag:
+            logger.error("validation_tag is required for stage 0")
+            logger.error(
+                "Usage: python -m src.run_eval_pipeline stage=0 "
+                "validation_tag=_YYYYMMDD_HHMMSS"
+            )
+            return
+
+        try:
+            run_eval_stage0(cfg, validation_tag)
+            logger.info("Eval Stage 0 complete. Datasets created.")
+        except EvalSetupError as e:
+            logger.error("Evaluation setup failed: %s", e)
+
+    elif stage == 1:
+        if not validation_tag:
+            logger.error("validation_tag is required for stage 1")
+            logger.error(
+                "Usage: python -m src.run_eval_pipeline stage=1 "
+                "validation_tag=_YYYYMMDD_HHMMSS"
+            )
+            return
+
+        try:
+            # Stage 1 reads eval_config from Stage 0's output
+            eval_tag = run_eval_stage1(cfg, validation_tag, eval_tag)
+            logger.info("Eval Stage 1 complete. eval_tag=%s", eval_tag)
+        except ValueError as e:
+            logger.error("Stage 1 failed: %s", e)
+
+    elif stage == 2:
+        if not eval_tag:
+            logger.error("eval_tag is required for stage 2")
+            logger.error(
+                "Usage: python -m src.run_eval_pipeline stage=2 "
+                "eval_tag=_YYYYMMDD_HHMMSS"
+            )
+            return
+
+        try:
+            run_eval_stage2(cfg, eval_tag)
+            logger.info("Eval Stage 2 complete.")
+        except ValueError as e:
+            logger.error("Stage 2 failed: %s", e)
+
+    else:
+        logger.error("Invalid stage: %s. Use 'all', 0, 1, or 2", stage)
+
+
+# Script entry point; main() is called without arguments because the
+# hydra.main decorator supplies ``cfg``.
+if __name__ == "__main__":
+    main()
diff --git a/src/schemas/EVALUATION_PIPELINE_SCHEMAS.md b/src/schemas/EVALUATION_PIPELINE_SCHEMAS.md
new file mode 100644
index 00000000..5bb727a6
--- /dev/null
+++ b/src/schemas/EVALUATION_PIPELINE_SCHEMAS.md
@@ -0,0 +1,354 @@
+# ACE Evaluation Pipeline Standardized Schemas
+
+The **evaluation pipeline** takes the validated tasks and solutions from the generation pipeline and evaluates subject LLMs on them using [Inspect](https://inspect.aisi.org.uk/). It produces capability scores that measure how well each subject LLM performs on each capability.
+
+This document defines the standardized input and output formats for each stage of the evaluation pipeline. These schemas ensure consistency across different implementations and enable interoperability between pipeline stages.
+
+## Pipeline Stages
+
+The evaluation pipeline consists of three stages:
+
+0. **Setup and Dataset Preparation**: Validate inputs, convert tasks to Inspect format (no LLM calls)
+1. **Evaluation Execution**: Run Inspect evaluation with subject LLMs (creates `eval_tag`)
+2. **Score Aggregation**: Compute capability scores from raw results (no LLM calls)
+
+---
+
+## Implementation Approach
+
+**Pipeline Pattern:**
+- **Stage 0**: Deterministic data transformation (no LLM, no tag needed)
+- **Stage 1**: LLM-dependent evaluation (creates `eval_tag` for results)
+- **Stage 2**: Deterministic aggregation (uses `eval_tag` from Stage 1)
+
+**Shared Config:**
+The evaluation pipeline uses the **same configuration file** as the generation pipeline
+([`src/cfg/run_cfg.yaml`](../cfg/run_cfg.yaml)), with an evaluation-specific section
+(`eval_cfg`).
+
+**Resumability:**
+- **Stage 0**: Idempotent - skips datasets that already exist
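+- **Stage 1**: Creates a fresh `eval_tag` by default. If you provide an existing
+  `eval_tag`, Stage 1 resumes: it skips capability/LLM runs whose logs are already
+  complete and retries failed or interrupted runs from their existing logs
+  (via Inspect's `eval_retry`).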
+
+---
+
+## Configuration
+
+```yaml
+eval_cfg:
+  # Subject LLMs to evaluate (required)
+  subject_llms:
+    - name: "gpt-4o"
+      provider: "openai"
+    - name: "claude-3-sonnet"
+      provider: "anthropic"
+
+  # Judge LLM for scoring (required)
+  judge_llm:
+    name: "gpt-4o-mini"
+    provider: "openai"
+```
+
+---
+
+## Naming Conventions
+
+See [`src/schemas/GENERATION_PIPELINE_SCHEMAS.md`](GENERATION_PIPELINE_SCHEMAS.md) for
+naming conventions. Tags follow the same format: `_YYYYMMDD_HHMMSS`.
+
+---
+
+## Directory Structure
+
+Evaluation outputs are stored in an `eval/` subdirectory within the experiment directory
+(see [`src/schemas/GENERATION_PIPELINE_SCHEMAS.md`](GENERATION_PIPELINE_SCHEMAS.md) for generation structure):
+
+```
+<experiment_id>/
+  eval/
+    datasets/                              # Stage 0 output
+      <validation_tag>/                    # Tied to validation source
+        eval_config.json                   # EvalConfig (eval_tag still empty)
+        <area_id>/
+          <capability_id>/
+            dataset.json                   # EvalDataset
+
+    results/                               # Stage 1 output
+      <eval_tag>/
+        eval_config.json                   # EvalConfig saved here
+        <subject_llm>/
+          <area_id>/
+            <capability_id>/               # Inspect logs
+              *.json                       # Inspect log files (per run)
+
+    scores/                                # Stage 2 output
+      <eval_tag>/
+        <subject_llm>/
+          capability_scores.json           # List[CapabilityScore]
+```
+
+**Example:**
+```
+r0_10x10/
+  eval/
+    datasets/
+      _20251017_091500/
+        eval_config.json
+        area_000/
+          cap_000/dataset.json
+          cap_001/dataset.json
+    results/
+      _20251020_143000/
+        eval_config.json
+        gpt-4o/
+          area_000/
+            cap_000/
+            cap_001/
+        claude-3-sonnet/
+          area_000/
+            cap_000/
+            cap_001/
+    scores/
+      _20251020_143000/
+        gpt-4o/capability_scores.json
+        claude-3-sonnet/capability_scores.json
+```
+
+---
+
+## Dataclasses
+
+The evaluation pipeline uses 3 dataclasses, plus reuses `PipelineMetadata` from the
+generation pipeline (see [`src/schemas/GENERATION_PIPELINE_SCHEMAS.md`](GENERATION_PIPELINE_SCHEMAS.md#pipelinemetadata)).
+
+**File:** [`src/schemas/eval_schemas.py`](eval_schemas.py)
+
+### EvalConfig
+
+Configuration for the evaluation run.
+
+**Fields:**
+- `experiment_id`: String (required)
+- `eval_tag`: String (set in Stage 1)
+- `subject_llms`: List[Dict] (required, each dict has "name" and "provider")
+- `judge_llm`: Dict (required, has "name" and "provider")
+- `validation_tag`: String (required, tag from generation Stage 5)
+
+### EvalDataset
+
+Dataset prepared for Inspect evaluation. Contains all info for one capability.
+
+**Fields:**
+- `area_id`: String (required)
+- `capability_id`: String (required)
+- `capability_name`: String (required)
+- `domain`: String (required)
+- `tasks`: List[Dict] (required, each dict has "id", "input", "target")
+- `num_tasks`: Integer (required)
+- `prompt_template`: String (required)
+
+### CapabilityScore
+
+Score for a single capability from evaluation.
+
+**Fields:**
+- `area_id`: String (required)
+- `capability_id`: String (required)
+- `capability_name`: String (required)
+- `subject_llm`: String (required)
+- `mean`: Float (required, 0.0 to 1.0)
+- `std_err`: Float (required)
+- `num_tasks`: Integer (required)
+
+---
+
+## Eval Stage 0: Setup and Dataset Preparation
+
+### Purpose
+Validate inputs and convert validated tasks to Inspect-compatible format.
+
+### Input
+- **validation_tag**: String - Tag from generation Stage 5 (required)
+- **Configuration**: `eval_cfg` section from config YAML (subject_llms and judge_llm required)
+
+### Validation Checks
+1. Generation `experiment.json` exists
+2. Validation outputs exist at `validation/<validation_tag>/`
+3. `subject_llms` and `judge_llm` are configured
+
+### Output: `dataset.json` (per capability)
+
+**Stage Output:** EvalDataset dataclass
+**Save Function:** `save_eval_dataset(dataset: EvalDataset, output_path: Path)`
+
+**Also Saved:** `eval_config.json` (EvalConfig + PipelineMetadata)
+**Path:** `<output_dir>/<experiment_id>/eval/datasets/<validation_tag>/eval_config.json`
+
+**File Path:** `<output_dir>/<experiment_id>/eval/datasets/<validation_tag>/<area_id>/<capability_id>/dataset.json`
+
+```json
+{
+  "area_id": "area_000",
+  "capability_id": "cap_000",
+  "capability_name": "compound_interest",
+  "domain": "personal_finance",
+  "tasks": [
+    {"id": "task_000", "input": "What is the future value of $1000...", "target": "1647.01"},
+    {"id": "task_001", "input": "Calculate the present value of $5000...", "target": "3402.92"}
+  ],
+  "num_tasks": 10,
+  "prompt_template": "..."
+}
+```
+
+**Returns:** None (writes `eval_config.json` for Stage 1)
+
+---
+
+## Eval Stage 1: Evaluation Execution
+
+### Purpose
+Run Inspect evaluation for each capability with each subject LLM.
+
+### Input
+- **eval_config**: EvalConfig from Stage 0
+- **eval_tag** (optional): Existing tag to resume an interrupted Stage 1 run
+
+### Tag Handling
+- **Creates**: New `eval_tag` if none is provided (generated by `timestamp_tag()` in
+  [`src/utils/timestamp_utils.py`](../utils/timestamp_utils.py))
+- **Resume**: If `eval_tag` is provided, Stage 1 writes into that tag and skips
+  `(subject_llm, area_id, capability_id)` combinations that already have complete logs.
+  For incomplete combinations with failed logs, Stage 1 uses Inspect `eval_retry`
+  from those log files.
+
+### Output: Inspect logs + `eval_config.json`
+
+**Stage Output:** Raw Inspect AI logs (stored by Inspect directly)
+
+**File Path:** `<output_dir>/<experiment_id>/eval/results/<eval_tag>/<subject_llm>/<area_id>/<capability_id>/`
+
+The `eval_config.json` is saved to
+`<output_dir>/<experiment_id>/eval/results/<eval_tag>/eval_config.json` for reference.
+
+**Returns:** `eval_tag` string
+
+**Completion Criterion:** A capability/LLM run is treated as complete only when
+scored task IDs in the log exactly match expected dataset task IDs for that capability.
+
+**Stage 1 Summary Logs:** Stage 1 logs a run summary that reports
+`completed_this_run`, `skipped_completed`, `resumed`, `failed`, `incomplete`, and
+`total`.
+
+### Scoring Details (Per-Task)
+- Each task in `EvalDataset.tasks` becomes an Inspect `Sample` with `id=task["id"]`
+  (see [`src/eval_stages/stage1_eval_execution.py`](../eval_stages/stage1_eval_execution.py)).
+- The judge model scores each sample via `model_graded_fact` during Stage 1.
+- Per-task scores live **only** in the Inspect log JSON files, under
+  `samples[].scores`. These scores are aggregated in Stage 2; there is no separate
+  per-task score file written by this pipeline.
+
+---
+
+## Eval Stage 2: Score Aggregation
+
+### Purpose
+Compute final capability scores from raw Inspect results.
+
+### Input
+- **eval_tag**: Tag from Stage 1
+
+### Output: `capability_scores.json` (per subject LLM)
+
+**Stage Output:** List[CapabilityScore]
+**Save Function:** `save_capability_scores(scores: List[CapabilityScore], output_path: Path)`
+
+**File Path:** `<output_dir>/<experiment_id>/eval/scores/<eval_tag>/<subject_llm>/capability_scores.json`
+
+**Aggregation Note:** Stage 2 reads Inspect log JSON files under
+`<output_dir>/<experiment_id>/eval/results/<eval_tag>/<subject_llm>/<area_id>/<capability_id>/`
+and uses the log with the most expected-task matches (to avoid retry double-counting).
+
+```json
+[
+  {
+    "area_id": "area_000",
+    "capability_id": "cap_000",
+    "capability_name": "compound_interest",
+    "subject_llm": "gpt-4o",
+    "mean": 0.90,
+    "std_err": 0.03,
+    "num_tasks": 10
+  }
+]
+```
+
+**Returns:** `eval_tag` string
+
+---
+
+## Usage
+
+### Run Full Evaluation
+
+```bash
+# Basic usage - evaluate all capabilities
+python -m src.run_eval_pipeline validation_tag=_20251017_091500
+```
+
+### Run Specific Stages
+
+```bash
+# Run only Stage 0 (setup + dataset preparation)
+python -m src.run_eval_pipeline stage=0 validation_tag=_20251017_091500
+
+# Run only Stage 1 (requires Stage 0 outputs)
+python -m src.run_eval_pipeline stage=1 validation_tag=_20251017_091500
+
+# Resume Stage 1 with an existing eval_tag
+python -m src.run_eval_pipeline stage=1 validation_tag=_20251017_091500 eval_tag=_20251020_143000
+
+# Run only Stage 2 (score aggregation) - requires eval_tag from Stage 1
+python -m src.run_eval_pipeline stage=2 eval_tag=_20251020_143000
+```
+
+---
+
+## IO Utilities
+
+The following functions are provided in [`src/schemas/eval_io_utils.py`](eval_io_utils.py):
+
+### Save Functions
+- `save_eval_config(config: EvalConfig, metadata: PipelineMetadata, output_path: Path)`
+- `save_eval_dataset(dataset: EvalDataset, output_path: Path)`
+- `save_capability_scores(scores: List[CapabilityScore], output_path: Path)`
+
+### Load Functions
+- `load_eval_config(file_path: Path) -> Tuple[EvalConfig, PipelineMetadata]`
+- `load_eval_dataset(file_path: Path) -> EvalDataset`
+- `load_capability_scores(file_path: Path) -> List[CapabilityScore]`
+
+### Helper Functions
+- Use `timestamp_tag()` from [`src/utils/timestamp_utils.py`](../utils/timestamp_utils.py) to generate tags
+- `get_experiment_dir(output_base_dir: str, experiment_id: str) -> Path`
+- `get_eval_dir(experiment_dir: Path, eval_tag: str) -> Path`
+
+---
+
+## Relationship to Generation Pipeline
+
+The evaluation pipeline depends on the generation pipeline outputs:
+
+| Eval Stage | Depends On | Generation Stage |
+|------------|------------|------------------|
+| Eval Stage 0 | `experiment.json` | Stage 0 |
+| Eval Stage 0 | `validation/<validation_tag>/` | Stage 5 |
+
+---
+
+## Legacy: LBO Support
+
+The previous version of the repository included **Latent Bayesian Optimization (LBO)** for intelligent capability selection during evaluation. This functionality has been moved to the `legacy/` directory for reference.
+
+See `legacy/README.md` for details on the LBO implementation and how it was used.
+
+---
diff --git a/src/schemas/__init__.py b/src/schemas/__init__.py
index 29e46fc9..2811f309 100644
--- a/src/schemas/__init__.py
+++ b/src/schemas/__init__.py
@@ -7,6 +7,23 @@
 from src.schemas.area_schemas import Area
 from src.schemas.capability_schemas import Capability
 from src.schemas.domain_schemas import Domain
+from src.schemas.eval_io_utils import (
+    get_eval_dir,
+    get_experiment_dir,
+    load_capability_scores,
+    load_eval_config,
+    load_eval_dataset,
+    save_capability_scores,
+    save_eval_config,
+    save_eval_dataset,
+)
+
+# Evaluation pipeline schemas
+from src.schemas.eval_schemas import (
+    CapabilityScore,
+    EvalConfig,
+    EvalDataset,
+)
 from src.schemas.experiment_schemas import Experiment
 from src.schemas.io_utils import (
     load_areas,
@@ -46,7 +63,7 @@
     "TaskSolution",
     # Validation schemas
     "ValidationResult",
-    # I/O functions - Save
+    # I/O functions - Save (Generation)
     "save_experiment",
     "save_domain",
     "save_areas",
@@ -54,7 +71,7 @@
     "save_tasks",
     "save_solution",
     "save_validation",
-    # I/O functions - Load
+    # I/O functions - Load (Generation)
     "load_experiment",
     "load_domain",
     "load_areas",
@@ -62,4 +79,19 @@
     "load_tasks",
     "load_solution",
     "load_validation",
+    # Evaluation schemas
+    "EvalConfig",
+    "EvalDataset",
+    "CapabilityScore",
+    # I/O functions - Save (Evaluation)
+    "save_eval_config",
+    "save_eval_dataset",
+    "save_capability_scores",
+    # I/O functions - Load (Evaluation)
+    "load_eval_config",
+    "load_eval_dataset",
+    "load_capability_scores",
+    # Helper functions
+    "get_experiment_dir",
+    "get_eval_dir",
 ]
diff --git a/src/schemas/eval_io_utils.py b/src/schemas/eval_io_utils.py
new file mode 100644
index 00000000..d1a9e343
--- /dev/null
+++ b/src/schemas/eval_io_utils.py
@@ -0,0 +1,142 @@
+"""I/O utilities for saving and loading evaluation pipeline outputs."""
+
+import json
+from pathlib import Path
+from typing import List, Tuple
+
+from src.schemas.eval_schemas import (
+    CapabilityScore,
+    EvalConfig,
+    EvalDataset,
+)
+from src.schemas.metadata_schemas import PipelineMetadata
+
+
+# Save functions
+
+
+def save_eval_config(
+    config: EvalConfig, metadata: PipelineMetadata, output_path: Path
+) -> None:
+    """Write the eval config and its pipeline metadata to one JSON file.
+
+    Args:
+        config: EvalConfig whose fields are written at the top level
+        metadata: PipelineMetadata written under the "metadata" key
+        output_path: Destination JSON file; missing parent dirs are created
+    """
+    output_path.parent.mkdir(parents=True, exist_ok=True)
+    # Metadata leads the document; the config fields sit alongside it.
+    payload = {"metadata": metadata.to_dict()}
+    payload.update(config.to_dict())
+
+    with output_path.open("w", encoding="utf-8") as handle:
+        json.dump(payload, handle, indent=2, ensure_ascii=False)
+
+
+def save_eval_dataset(dataset: EvalDataset, output_path: Path) -> None:
+    """Write an eval dataset to a JSON file.
+
+    Args:
+        dataset: EvalDataset to serialize through its ``to_dict`` method
+        output_path: Destination JSON file; missing parent dirs are created
+    """
+    output_path.parent.mkdir(parents=True, exist_ok=True)
+    with output_path.open("w", encoding="utf-8") as handle:
+        json.dump(dataset.to_dict(), handle, indent=2, ensure_ascii=False)
+
+
+def save_capability_scores(scores: List[CapabilityScore], output_path: Path) -> None:
+    """Write capability scores to a JSON file as a single array.
+
+    Args:
+        scores: CapabilityScore objects, serialized in the order given
+        output_path: Destination JSON file; missing parent dirs are created
+    """
+    output_path.parent.mkdir(parents=True, exist_ok=True)
+    records = [entry.to_dict() for entry in scores]
+    with output_path.open("w", encoding="utf-8") as handle:
+        json.dump(records, handle, indent=2, ensure_ascii=False)
+
+
+# Load functions
+
+
+def load_eval_config(file_path: Path) -> Tuple[EvalConfig, PipelineMetadata]:
+    """Read an eval config and its pipeline metadata from a JSON file.
+
+    Args:
+        file_path: JSON file holding config fields plus a "metadata" entry
+
+    Returns
+    -------
+        Tuple of (EvalConfig, PipelineMetadata)
+    """
+    with open(file_path, encoding="utf-8") as handle:
+        raw = json.load(handle)
+    pipeline_metadata = PipelineMetadata.from_dict(raw["metadata"])
+    # Every top-level key other than "metadata" is passed on to EvalConfig.
+    config_fields = {key: value for key, value in raw.items() if key != "metadata"}
+    eval_config = EvalConfig.from_dict(config_fields)
+    return eval_config, pipeline_metadata
+
+
+def load_eval_dataset(file_path: Path) -> EvalDataset:
+    """Read an eval dataset back from a JSON file.
+
+    Args:
+        file_path: JSON file to read
+
+    Returns
+    -------
+        EvalDataset built from the file contents
+    """
+    with open(file_path, encoding="utf-8") as handle:
+        raw = json.load(handle)
+    return EvalDataset.from_dict(raw)
+
+
+def load_capability_scores(file_path: Path) -> List[CapabilityScore]:
+    """Read capability scores back from a JSON file.
+
+    Args:
+        file_path: JSON file to read
+
+    Returns
+    -------
+        List of CapabilityScore objects, one per entry in the file
+    """
+    with open(file_path, encoding="utf-8") as handle:
+        raw_entries = json.load(handle)
+    return list(map(CapabilityScore.from_dict, raw_entries))
+
+
+# Helper functions
+
+
+def get_experiment_dir(output_base_dir: str, experiment_id: str) -> Path:
+    """Build the directory path for one experiment.
+
+    Args:
+        output_base_dir: Base output directory
+        experiment_id: Experiment identifier, used as the final path component
+
+    Returns
+    -------
+        ``output_base_dir`` joined with ``experiment_id`` as a Path
+    """
+    return Path(output_base_dir).joinpath(experiment_id)
+
+
+def get_eval_dir(experiment_dir: Path, eval_tag: str) -> Path:
+    """Build the Stage 1 results directory path for an eval tag.
+
+    Args:
+        experiment_dir: Path to experiment directory
+        eval_tag: Eval tag naming the run
+
+    Returns
+    -------
+        ``experiment_dir/eval/results/<eval_tag>`` as a Path
+    """
+    return experiment_dir.joinpath("eval", "results", eval_tag)
diff --git a/src/schemas/eval_schemas.py b/src/schemas/eval_schemas.py
new file mode 100644
index 00000000..27f06991
--- /dev/null
+++ b/src/schemas/eval_schemas.py
@@ -0,0 +1,134 @@
+"""Schemas for evaluation pipeline stages.
+
+Defines dataclasses for evaluation pipeline:
+- EvalConfig: Configuration for evaluation run (Stage 0 output)
+- EvalDataset: Dataset for one capability (Stage 0 output)
+- CapabilityScore: Score for one capability (Stage 2 output)
+"""
+
+from dataclasses import dataclass
+from typing import Any, Dict, List
+
+
+@dataclass
+class EvalConfig:
+    """Settings that describe one evaluation run.
+
+    Created by Eval Stage 0 (Eval Setup); it also references generation outputs.
+    """
+
+    experiment_id: str
+    eval_tag: str
+    # e.g. [{"name": "gpt-4o", "provider": "openai"}, ...]
+    subject_llms: List[Dict[str, str]]
+    judge_llm: Dict[str, str]  # {"name": "gpt-4o-mini", "provider": "openai"}
+    validation_tag: str  # Tag from generation Stage 5
+
+    def to_dict(self) -> Dict[str, Any]:
+        """Return every field in a dictionary suitable for JSON serialization."""
+        field_names = (
+            "experiment_id",
+            "eval_tag",
+            "subject_llms",
+            "judge_llm",
+            "validation_tag",
+        )
+        return {name: getattr(self, name) for name in field_names}
+
+    @classmethod
+    def from_dict(cls, data: Dict[str, Any]) -> "EvalConfig":
+        """Build an instance from a dictionary that holds every field."""
+        field_names = (
+            "experiment_id",
+            "eval_tag",
+            "subject_llms",
+            "judge_llm",
+            "validation_tag",
+        )
+        return cls(**{name: data[name] for name in field_names})
+
+
+@dataclass
+class EvalDataset:
+    """Inspect-ready dataset covering a single capability.
+
+    Created by Eval Stage 0 (Setup and Dataset Preparation).
+    """
+
+    area_id: str
+    capability_id: str
+    capability_name: str
+    domain: str
+    # e.g. [{"id": "task_000", "input": "...", "target": "..."}, ...]
+    tasks: List[Dict[str, str]]
+    num_tasks: int
+    prompt_template: str
+
+    def to_dict(self) -> Dict[str, Any]:
+        """Return every field in a dictionary suitable for JSON serialization."""
+        field_names = (
+            "area_id",
+            "capability_id",
+            "capability_name",
+            "domain",
+            "tasks",
+            "num_tasks",
+            "prompt_template",
+        )
+        return {name: getattr(self, name) for name in field_names}
+
+    @classmethod
+    def from_dict(cls, data: Dict[str, Any]) -> "EvalDataset":
+        """Build an instance from a dictionary that holds every field."""
+        field_names = (
+            "area_id",
+            "capability_id",
+            "capability_name",
+            "domain",
+            "tasks",
+            "num_tasks",
+            "prompt_template",
+        )
+        return cls(**{name: data[name] for name in field_names})
+
+
+@dataclass
+class CapabilityScore:
+    """Eval Stage 2 (Score Aggregation) result for one capability and one LLM."""
+
+    # What was scored, and for which subject model.
+    area_id: str
+    capability_id: str
+    capability_name: str
+    subject_llm: str
+    # Aggregated result fields.
+    mean: float  # 0.0 to 1.0
+    std_err: float
+    num_tasks: int
+
+    def to_dict(self) -> Dict[str, Any]:
+        """Return every field in a dictionary suitable for JSON serialization."""
+        field_names = (
+            "area_id",
+            "capability_id",
+            "capability_name",
+            "subject_llm",
+            "mean",
+            "std_err",
+            "num_tasks",
+        )
+        return {name: getattr(self, name) for name in field_names}
+
+    @classmethod
+    def from_dict(cls, data: Dict[str, Any]) -> "CapabilityScore":
+        """Build an instance from a dictionary that holds every field."""
+        field_names = (
+            "area_id",
+            "capability_id",
+            "capability_name",
+            "subject_llm",
+            "mean",
+            "std_err",
+            "num_tasks",
+        )
+        return cls(**{name: data[name] for name in field_names})