VectorInstitute · kohankhaki · Mar 20, 2026 · Jan 6, 2026 · Feb 3, 2026 · Feb 25, 2026
diff --git a/.gitignore b/.gitignore
@@ -140,8 +140,11 @@ dmypy.json
 .idea/
 
 logs/
+src/task_generation/init_book_chapter_text_files
+src/task_generation/other_scripts
 src/outputs/
 outputs/
+base_output/
 
 # inspect result logs
 seed_datasets_inspect_logs/

diff --git a/.pre-commit-config.yaml b/.pre-commit-config.yaml
@@ -44,14 +44,14 @@ repos:
     - id: doctest
       name: doctest
       entry: python3 -m doctest -o NORMALIZE_WHITESPACE
-      files: "^automated_capability_evaluation/"
+      files: "^(src|legacy|tests|example_scripts|wikipedia)/.*\\.py$"
       language: system
 
   - repo: local
     hooks:
     - id: pytest
       name: pytest
-      entry: python3 -m pytest -m "not integration_test"
+      entry: python3 -m pytest
       language: system
       pass_filenames: false
       always_run: true
@@ -65,5 +65,4 @@ ci:
     autoupdate_branch: ''
     autoupdate_commit_msg: '[pre-commit.ci] pre-commit autoupdate'
     autoupdate_schedule: weekly
-    skip: [pytest,doctest,mypy]
     submodules: false
diff --git a/README.md b/README.md
@@ -20,14 +20,6 @@ run:
 python3 -m poetry install --with test
 ```
 
-#### [Optional] Google Cloud Authentication
-
-The capability evaluation logs (evaluated using [Inspect](https://inspect.aisi.org.uk/)) are stored in a GCP bucket. Use the following command to log in using your GCP account:
-
-```bash
-gcloud auth application-default login
-```
-
 ## Run pipeline
 
 ### Configuration
@@ -40,16 +32,23 @@ gcloud auth application-default login
 - Rate limit vars (default values given):
     - RATE_LIMIT_CALLS=5
     - RATE_LIMIT_PERIOD=60
-- LangSmith tracing vars:
-    - LANGSMITH_TRACING=true
-    - LANGSMITH_ENDPOINT="https://api.smith.langchain.com"
-    - LANGSMITH_API_KEY=<langsmith_api_key>
-    - LANGSMITH_PROJECT="automated_capability_evaluation"
-- GCP env vars:
-    - GOOGLE_CLOUD_PROJECT=<project_id>
+- Langfuse tracing vars (for task solver and agent traces):
+    - LANGFUSE_PUBLIC_KEY=<langfuse_public_key>
+    - LANGFUSE_SECRET_KEY=<langfuse_secret_key>
+    - LANGFUSE_HOST=<langfuse_host> (optional, defaults to Langfuse Cloud)
 
 2. Modify `src/cfg/run_cfg.yaml`, if required.
 
+### Current Pipeline Status
+
+- The active default flow is the schema-based base pipeline in `src/base_stages/` (Stages 0-5).
+- Most of the old agentic generation stages were removed from the active code path because they were outdated (notably the legacy agentic area and capability generation modules).
+- Agentic task generation is experimental and currently Stage-3-focused. For usage and caveats, see `src/task_generation/INSTRUCTIONS.md`.
+- We kept the task solver module because it now includes tool-assisted solving support introduced in PR #62:
+  - `src/task_solver/tool_assisted_scientist.py`
+  - `src/tools/` (`ScientificToolKit`, safe Python execution, optional doc retrieval, tool-selection flow)
+  - details and examples: `src/tools/README.md`
+
 ### Base Pipeline
 
 The base (non-agentic) pipeline consists of multiple stages that can be run sequentially or individually:
@@ -105,127 +104,26 @@ python -m src.run_base_pipeline stage=4 tasks_tag=_YYYYMMDD_HHMMSS solution_tag=
 python -m src.run_base_pipeline stage=5 solution_tag=_YYYYMMDD_HHMMSS validation_tag=_YYYYMMDD_HHMMSS
 ```
 
-### Evaluation of subject LLM on generated capabilities
+### Evaluation Pipeline
 
-Evaluates the subject LLM on the generated capabilities and calculates a score for each.
+Evaluate subject LLMs on validated tasks and aggregate scores:
 
 ```bash
-python -m src.run_evaluation
-```
-
-### Capability selection/generation using active learning
+# Run all evaluation stages (setup -> execution -> aggregation)
+python -m src.run_eval_pipeline validation_tag=_YYYYMMDD_HHMMSS
 
-Utilize the capability and the corresponding subject LLM score to select or generate a new capability.
-
-```bash
-python -m src.run_lbo
+# Or run individual stages
+python -m src.run_eval_pipeline stage=0 validation_tag=_YYYYMMDD_HHMMSS
+python -m src.run_eval_pipeline stage=1 validation_tag=_YYYYMMDD_HHMMSS
+python -m src.run_eval_pipeline stage=2 eval_tag=_YYYYMMDD_HHMMSS
 ```
-### Agentic Generation Scripts
-
-These scripts implement the multi-agent debate workflow for automated generation of areas, capabilities, tasks, and solutions.
-All configurable parameters are defined in `src/cfg/agentic_config.yaml`.
 
-#### Understanding Pipeline Tags
+### Legacy Pipelines
 
-The pipeline uses **auto-generated tags** to organize outputs from each step. Understanding how tags work is essential for running the pipeline:
+Some historical pipelines and scripts were moved to `legacy/` and are not part of the active flow:
 
-- **Tag Format**: Tags are automatically generated timestamps in the format `_YYYYMMDD_HHMMSS` (e.g., `_20251104_143022`)
-- **Auto-Generation**: When you run a step (e.g., Generate Areas), the script automatically creates a tag and includes it in the output path
-- **Finding Tags**: After running a step, check the console output or the output directory to see the generated tag. The tag appears in the file path where outputs are saved
-- **Using Tags**: To run the next step in the pipeline, you need to specify the tag from the previous step's output:
-  - Step 2 (Generate Capabilities) needs `areas_tag` from Step 1
-  - Step 3 (Generate Tasks) needs `capabilities_tag` from Step 2
-  - Step 4 (Generate Solutions) needs `tasks_tag` from Step 3
-
-**Example Workflow**:
-1. Run `python -m src.agentic_area_generator` → outputs to `.../areas/_20251104_143022/areas.json`
-2. Use the tag `_20251104_143022` in the next step:
-   ```bash
-   python -m src.agentic_capability_generator pipeline_tags.areas_tag=_20251104_143022
-   ```
-3. The capability generator outputs to `.../capabilities/_20251104_150315/...`
-4. Use this new tag for the next step, and so on.
-
----
-
-#### 1. Generate Areas
-Generate domain areas using the scientist–moderator debate system:
-```bash
-python -m src.agentic_area_generator
-```
-
-This step auto-generates a tag (e.g., `_20251104_143022`) and outputs the results to:
-
-**Output location:**
-```
-~/<output_dir>/<domain>/<exp_id>/areas/<areas_tag>/areas.json
-```
-Where:
-- `<output_dir>` comes from `global_cfg.output_dir`
-- `<domain>` comes from `global_cfg.domain` (spaces replaced with underscores)
-- `<exp_id>` comes from `exp_cfg.exp_id`
-- `<areas_tag>` is the auto-generated tag for this run (use this tag in Step 2)
-
-#### 2. Generate Capabilities
-Generate capabilities for each area:
-```bash
-# Use the areas_tag from Step 1 (Generate Areas) output
-python -m src.agentic_capability_generator pipeline_tags.areas_tag=_YYYYMMDD_HHMMSS pipeline_tags.resume_capabilities_tag=_YYYYMMDD_HHMMSS
-```
-
-**Options:**
-- `pipeline_tags.areas_tag` specifies which set of areas to use when generating capabilities. This should be the `<areas_tag>` from the output of Step 1 (Generate Areas).
-- `pipeline_tags.resume_capabilities_tag` (optional) resumes a previous capability generation run.
-
-This step auto-generates a new tag for the capabilities output.
-
-**Output location:**
-```
-~/<output_dir>/<domain>/<exp_id>/capabilities/<capabilities_tag>/<area>/capabilities.json
-```
-Where:
-- `<capabilities_tag>` is the auto-generated tag for this run (use this tag in Step 3)
-
-
-#### 3. Generate Tasks
-Generate evaluation tasks for a specific capabilities tag:
-```bash
-# Use the capabilities_tag from Step 2 (Generate Capabilities) output
-python -m src.agentic_task_generator pipeline_tags.capabilities_tag=_YYYYMMDD_HHMMSS pipeline_tags.resume_tasks_tag=_YYYYMMDD_HHMMSS
-```
-
-**Options:**
-- `pipeline_tags.capabilities_tag` specifies which set of capabilities to use when generating tasks. This should be the `<capabilities_tag>` from the output of Step 2 (Generate Capabilities).
-- `pipeline_tags.resume_tasks_tag` (optional) resumes a previous task generation run.
-
-This step auto-generates a new tag for the tasks output.
-
-**Output location:**
-```
-~/<output_dir>/<domain>/<exp_id>/tasks/<tasks_tag>/[<area>]-[<capability>]/tasks.json
-```
-Where:
-- `<tasks_tag>` is the auto-generated tag for this run (use this tag in Step 4)
-
-#### 4. Generate Solutions
-Solve generated tasks using the multi-agent debate system:
-```bash
-# Use the tasks_tag from Step 3 (Generate Tasks) output
-python -m src.agentic_task_solver pipeline_tags.tasks_tag=_YYYYMMDD_HHMMSS pipeline_tags.resume_solutions_tag=_YYYYMMDD_HHMMSS
-```
-
-**Options:**
-- `pipeline_tags.tasks_tag` specifies which set of tasks to solve. This should be the `<tasks_tag>` from the output of Step 3 (Generate Tasks).
-- `pipeline_tags.resume_solutions_tag` (optional) resumes a previous solution generation run.
-
-This step auto-generates a new tag for the solutions output.
-
-**Output location:**
-```
-~/<output_dir>/<domain>/<exp_id>/task_solutions/<solutions_tag>/[<area>]-[<capability>]/<task_id>_solution.json
-```
-Where:
-- `<solutions_tag>` is the auto-generated tag for this run
+- `legacy/pre_schema_pipeline/`: older capability-centric scripts and examples.
+- `legacy/src/`: legacy LBO implementation from the original paper codebase.
 
 ### Wikipedia-Based Analysis Tools
 

diff --git a/example_scripts/README.md b/example_scripts/README.md
@@ -3,7 +3,7 @@
 
 Here we describe the steps required for reading and selecting pre-generated capabilities and their tasks, generating capability embeddings, filtering capabilities based on those embeddings, reducing dimensionality, and visualizing capabilities. All of these steps are implemented in the `train_test_embedding_visualization.py` script, which runs the process for both `train` and `test` capabilities. The directory containing the `train` and `test` capabilities and tasks is specified in the `train_test_embedding_visualization_cfg.yaml` file.
 
-You can also find the steps for loading and visualizing LLM scores in `plot_llm_capability_scores.py`. The scores can be plotted using a spider chart or a bar chart via the `plot_capability_scores_spider_and_bar_chart()` function.
+The older LLM score plotting script (`plot_llm_capability_scores.py`) has been archived in `legacy/pre_schema_pipeline/example_scripts/` because it depended on removed legacy modules.
 
 
 Step 1: Read the already generated and saved train capabilities:

diff --git a/example_scripts/compare_seed_capability_results.py b/example_scripts/compare_seed_capability_results.py
@@ -9,11 +9,11 @@
 import numpy as np
 from omegaconf import DictConfig
 
-from src.utils import constants
-from src.utils.capability_management_utils import get_previous_capabilities
-from src.utils.capability_utils import (
+from legacy.src.utils.capability_utils import (
     read_score_inspect_json,
 )
+from legacy.utils import legacy_constants as constants
+from legacy.utils.legacy_capability_management_utils import get_previous_capabilities
 
 
 def generate_latex_table(

diff --git a/example_scripts/train_test_embedding_visualization.py b/example_scripts/train_test_embedding_visualization.py
@@ -6,16 +6,16 @@
 import hydra
 from omegaconf import DictConfig
 
-from src.utils.capability_management_utils import (
+from legacy.utils.legacy_capability_management_utils import (
     filter_capabilities,
     get_previous_capabilities,
 )
-from src.utils.embedding_utils import (
+from legacy.utils.legacy_embedding_utils import (
     apply_dimensionality_reduction,
     apply_dimensionality_reduction_to_test_capabilities,
     generate_and_set_capabilities_embeddings,
 )
-from src.utils.visualization_utils import (
+from legacy.utils.legacy_visualization_utils import (
     generate_capability_heatmap,
     plot_hierarchical_capability_2d_embeddings,
 )

diff --git a/legacy/README.md b/legacy/README.md
@@ -1,9 +1,14 @@
-# Legacy LBO Code
+# Legacy Code
 
-This directory contains legacy code for **Latent Bayesian Optimization (LBO)** from an earlier version of the repository. LBO was used for intelligent capability selection during evaluation.
+This directory stores archived implementations that are no longer part of the active pipeline.
 
-## Compatible Version
+## Contents
 
-This LBO code is compatible with the repository at commit [`a224c5ec`](https://github.com/VectorInstitute/automated_capability_evaluation/tree/a224c5ec7dd208e04ef2edc059e6e7a2d0d4bcf6). That commit contains the full working version of the codebase used for the **initial paper submission**.
+- `legacy/src/`: Legacy LBO code from the earlier paper-era codebase.
+- `legacy/pre_schema_pipeline/`: Archived pre-standardization scripts and examples.
 
-**This code does not work with the current codebase.** This was the base legacy code before the generation and evaluation pipelines were standardized. If you need to understand how LBO integrated with the rest of the system, refer to that version.
+## LBO Compatibility Note
+
+The LBO code in `legacy/src/` is compatible with commit [`a224c5ec`](https://github.com/VectorInstitute/automated_capability_evaluation/tree/a224c5ec7dd208e04ef2edc059e6e7a2d0d4bcf6), which contains the original working version used for the initial paper submission.
+
+Most legacy code here is provided for reference and is not expected to run unchanged against the current active pipeline.
diff --git a/legacy/example_scripts/plot_lbo_results.py b/legacy/example_scripts/plot_lbo_results.py
@@ -9,7 +9,7 @@
 import numpy as np
 from omegaconf import DictConfig
 
-from src.utils import constants
+from legacy.utils import legacy_constants as constants
 
 
 logger = logging.getLogger(__name__)

diff --git a/legacy/pre_schema_pipeline/README.md b/legacy/pre_schema_pipeline/README.md
@@ -0,0 +1,17 @@
+# Archived Pre-Schema Pipeline Scripts
+
+This folder contains older scripts that depended on pre-standardization capabilities/LBO modules and are no longer wired into the active base/evaluation pipelines.
+
+## Archived from `src/`
+
+- `create_seed_capabilities.py`
+- `generate_tasks.py`
+- `get_seed_capability_results.py`
+- `run_embedding_eval.py`
+- `run_evaluation.py`
+
+## Archived from `example_scripts/`
+
+- `plot_llm_capability_scores.py`
+
+These files are preserved for historical context and selective reuse.
diff --git a/...ple_scripts/plot_llm_capability_scores.py → ...ple_scripts/plot_llm_capability_scores.py b/...ple_scripts/plot_llm_capability_scores.py → ...ple_scripts/plot_llm_capability_scores.py
@@ -5,14 +5,14 @@
 from omegaconf import DictConfig
 from tqdm import tqdm
 
-from src.utils.capability_discovery_utils import (
+from legacy.src.utils.capability_discovery_utils import (
     select_complete_capabilities,
 )
-from src.utils.capability_management_utils import (
+from legacy.utils.legacy_capability_management_utils import (
     get_previous_capabilities,
 )
-from src.utils.data_utils import get_run_id
-from src.utils.visualization_utils import (
+from legacy.utils.legacy_data_utils import get_run_id
+from legacy.utils.legacy_visualization_utils import (
     plot_capability_scores_spider_and_bar_chart,
 )
 

diff --git a/src/create_seed_capabilities.py → ..._pipeline/src/create_seed_capabilities.py b/src/create_seed_capabilities.py → ..._pipeline/src/create_seed_capabilities.py
@@ -11,8 +11,9 @@
 import hydra
 from omegaconf import DictConfig
 
-from capability import CapabilitySeedDataset
-from utils import constants, templates
+from legacy.src.capability import CapabilitySeedDataset
+from legacy.src.utils import templates
+from legacy.utils import legacy_constants as constants
 
 
 logger = logging.getLogger(__name__)

diff --git a/src/generate_tasks.py → ...pre_schema_pipeline/src/generate_tasks.py b/src/generate_tasks.py → ...pre_schema_pipeline/src/generate_tasks.py
@@ -8,10 +8,11 @@
 from langsmith import tracing_context
 from tenacity import Retrying, stop_after_attempt
 
-from src.capability import Capability, CapabilityState
-from src.model import Model
-from src.utils import constants, prompts
-from src.utils.capability_utils import extract_and_parse_response
+from legacy.src.capability import Capability, CapabilityState
+from legacy.src.model import Model
+from legacy.src.utils import prompts
+from legacy.src.utils.capability_utils import extract_and_parse_response
+from legacy.utils import legacy_constants as constants
 
 
 logger = logging.getLogger(__name__)

diff --git a/src/get_seed_capability_results.py → ...peline/src/get_seed_capability_results.py b/src/get_seed_capability_results.py → ...peline/src/get_seed_capability_results.py
@@ -8,7 +8,12 @@
 import numpy as np
 from omegaconf import DictConfig
 
-from utils.data_utils import copy_file, list_dir, read_json_file, write_json_file
+from legacy.utils.legacy_data_utils import (
+    copy_file,
+    list_dir,
+    read_json_file,
+    write_json_file,
+)
 
 
 logger = logging.getLogger(__name__)