From 80a7cd545172fce4cd0d97cd0e8712a72f123f47 Mon Sep 17 00:00:00 2001
From: Negiiiin <neginbaghbanzadeh@gmail.com>
Date: Tue, 17 Mar 2026 01:05:58 -0400
Subject: [PATCH 1/8] Added static benchmarks

---
 .gitignore                                    |   7 +
 scripts/flatten_inspect_logs.py               | 108 +++++++++
 scripts/static_benchmarks/bizbench_eval.sh    |  62 +++++
 .../static_benchmarks/finance_math_eval.sh    |  62 +++++
 scripts/static_benchmarks/hardmath_eval.sh    |  56 +++++
 scripts/static_benchmarks/harp_eval.sh        |  62 +++++
 scripts/static_benchmarks/math500_eval.sh     |  61 +++++
 scripts/static_benchmarks/minif2f_eval.sh     |  61 +++++
 scripts/static_benchmarks/omni_math_eval.sh   |  61 +++++
 scripts/static_benchmarks/orca_math_eval.sh   |  61 +++++
 scripts/static_benchmarks/proofnet_eval.sh    |  61 +++++
 scripts/static_benchmarks/stateval_eval.sh    |  57 +++++
 .../submit_all_static_benchmarks.sh           |  20 ++
 scripts/static_benchmarks/wemath_eval.sh      |  57 +++++
 src/eval_stages/__init__.py                   |   2 +
 src/eval_stages/stage0_static_benchmarks.py   | 196 ++++++++++++++++
 src/eval_stages/static_benchmarks/__init__.py |   2 +
 src/eval_stages/static_benchmarks/bizbench.py | 108 +++++++++
 .../static_benchmarks/finance_math.py         | 114 +++++++++
 src/eval_stages/static_benchmarks/hardmath.py | 119 ++++++++++
 src/eval_stages/static_benchmarks/harp.py     |  88 +++++++
 src/eval_stages/static_benchmarks/math500.py  |  78 +++++++
 .../static_benchmarks/mathvista.py            |  99 ++++++++
 src/eval_stages/static_benchmarks/minif2f.py  |  79 +++++++
 .../static_benchmarks/omni_math.py            |  78 +++++++
 .../static_benchmarks/orca_math.py            |  62 +++++
 src/eval_stages/static_benchmarks/proofnet.py |  86 +++++++
 src/eval_stages/static_benchmarks/specs.py    |  38 +++
 src/eval_stages/static_benchmarks/stateval.py |  49 ++++
 .../stateval_foundational.py                  | 219 ++++++++++++++++++
 .../static_benchmarks/stateval_research.py    |  96 ++++++++
 src/eval_stages/static_benchmarks/wemath.py   |  95 ++++++++
 src/run_eval_pipeline.py                      |  16 ++
 33 files changed, 2420 insertions(+)
 create mode 100644 scripts/flatten_inspect_logs.py
 create mode 100644 scripts/static_benchmarks/bizbench_eval.sh
 create mode 100644 scripts/static_benchmarks/finance_math_eval.sh
 create mode 100755 scripts/static_benchmarks/hardmath_eval.sh
 create mode 100644 scripts/static_benchmarks/harp_eval.sh
 create mode 100755 scripts/static_benchmarks/math500_eval.sh
 create mode 100644 scripts/static_benchmarks/minif2f_eval.sh
 create mode 100644 scripts/static_benchmarks/omni_math_eval.sh
 create mode 100644 scripts/static_benchmarks/orca_math_eval.sh
 create mode 100644 scripts/static_benchmarks/proofnet_eval.sh
 create mode 100755 scripts/static_benchmarks/stateval_eval.sh
 create mode 100755 scripts/static_benchmarks/submit_all_static_benchmarks.sh
 create mode 100755 scripts/static_benchmarks/wemath_eval.sh
 create mode 100644 src/eval_stages/stage0_static_benchmarks.py
 create mode 100644 src/eval_stages/static_benchmarks/__init__.py
 create mode 100644 src/eval_stages/static_benchmarks/bizbench.py
 create mode 100644 src/eval_stages/static_benchmarks/finance_math.py
 create mode 100644 src/eval_stages/static_benchmarks/hardmath.py
 create mode 100644 src/eval_stages/static_benchmarks/harp.py
 create mode 100644 src/eval_stages/static_benchmarks/math500.py
 create mode 100644 src/eval_stages/static_benchmarks/mathvista.py
 create mode 100644 src/eval_stages/static_benchmarks/minif2f.py
 create mode 100644 src/eval_stages/static_benchmarks/omni_math.py
 create mode 100644 src/eval_stages/static_benchmarks/orca_math.py
 create mode 100644 src/eval_stages/static_benchmarks/proofnet.py
 create mode 100644 src/eval_stages/static_benchmarks/specs.py
 create mode 100644 src/eval_stages/static_benchmarks/stateval.py
 create mode 100644 src/eval_stages/static_benchmarks/stateval_foundational.py
 create mode 100644 src/eval_stages/static_benchmarks/stateval_research.py
 create mode 100644 src/eval_stages/static_benchmarks/wemath.py

diff --git a/.gitignore b/.gitignore
index 762fa1f3..d4d61f2a 100644
--- a/.gitignore
+++ b/.gitignore
@@ -143,6 +143,13 @@ logs/
 src/outputs/
 outputs/
 
+# Evaluation outputs/results (large, generated)
+base_output/
+
+# Slurm / batch logs (often checked in accidentally)
+*.out
+*.err
+
 # inspect result logs
 seed_datasets_inspect_logs/
 seed_tasks_results/
diff --git a/scripts/flatten_inspect_logs.py b/scripts/flatten_inspect_logs.py
new file mode 100644
index 00000000..d6943a23
--- /dev/null
+++ b/scripts/flatten_inspect_logs.py
@@ -0,0 +1,140 @@
+"""Utility to flatten Inspect JSON logs into a simple, readable format.
+
+Given an Inspect eval log file (one of the large JSON files under
+base_output/<exp_id>/eval/results/<eval_tag>/<model>/<area>/<capability>/),
+this script writes out a JSONL file with, per row:
+
+- id:           sample id
+- question:     original input
+- ground_truth: target string
+- model_output: subject model's answer text
+- grade:        judge letter grade (if present, e.g. "C" or "I")
+
+Usage:
+    python scripts/flatten_inspect_logs.py \\
+        --log_path base_output/test_exp/eval/results/_20260316_031445/\\
+                  gpt-5-nano/static_benchmarks/integral/\\
+                  2026-03-15T23-14-46-04-00_task_mZxA3jKBseS2smuk4ppcxN.json \\
+        --out_path base_output/test_exp/eval/results/_20260316_031445/\\
+                  gpt-5-nano/static_benchmarks/integral/flat_integral.jsonl
+
+The first line of the JSONL file is a summary object with:
+- num_samples
+- num_correct   (rows graded "C")
+- num_incorrect (rows graded "I")
+- num_ungraded  (rows with any other or missing grade)
+- accuracy      (num_correct / num_samples, so ungraded rows count against it)
+- f1            (reported as equal to accuracy; no separate F1 is computed)
+"""
+
+from __future__ import annotations
+
+import argparse
+import json
+from pathlib import Path
+from typing import Any, Dict, List
+
+
+def _message_text(content: Any) -> Any:
+    """Reduce an Inspect message ``content`` value to plain answer text.
+
+    Strings (and missing values) are returned unchanged. A list of content
+    parts is reduced to the newline-joined text of its ``"text"`` parts, so
+    non-text parts (e.g. reasoning) are left out of ``model_output``.
+    """
+    if not isinstance(content, list):
+        return content
+    texts: List[str] = []
+    for part in content:
+        if isinstance(part, str):
+            texts.append(part)
+        elif isinstance(part, dict) and part.get("type") == "text":
+            texts.append(part.get("text") or "")
+    return "\n".join(texts)
+
+
+def _summarize(rows: List[Dict[str, Any]]) -> Dict[str, Any]:
+    """Build the summary object written as the first JSONL line."""
+    num_samples = len(rows)
+    num_correct = sum(1 for r in rows if r.get("grade") == "C")
+    num_incorrect = sum(1 for r in rows if r.get("grade") == "I")
+    accuracy = (num_correct / num_samples) if num_samples else 0.0
+    return {
+        "summary": True,
+        "num_samples": num_samples,
+        "num_correct": num_correct,
+        "num_incorrect": num_incorrect,
+        # Rows that are neither "C" nor "I"; they still sit in the denominator.
+        "num_ungraded": num_samples - num_correct - num_incorrect,
+        "accuracy": accuracy,
+        # In this binary setting with grades only, F1 is reported as accuracy.
+        "f1": accuracy,
+    }
+
+
+def flatten_inspect_log(log_path: Path) -> List[Dict[str, Any]]:
+    """Read an Inspect JSON log and return one flat row per sample.
+
+    Args:
+        log_path: Path to a single Inspect eval log in JSON format.
+
+    Returns:
+        A list of dicts with keys ``id``, ``question``, ``ground_truth``,
+        ``model_output`` and ``grade``; fields missing from the log are None.
+
+    Raises:
+        OSError: If the log file cannot be read.
+        json.JSONDecodeError: If the file is not valid JSON.
+    """
+    data = json.loads(log_path.read_text(encoding="utf-8"))
+
+    flattened: List[Dict[str, Any]] = []
+    # Tolerate a missing or null "samples" entry.
+    for s in data.get("samples") or []:
+        model_output = None
+        output = s.get("output") or {}
+        choices = output.get("choices") or []
+        if choices:
+            # Only the first choice is reported.
+            msg = (choices[0] or {}).get("message") or {}
+            model_output = _message_text(msg.get("content"))
+
+        # NOTE(review): only the "model_graded_fact" scorer is read; samples
+        # scored by any other scorer end up with grade None -- confirm intended.
+        scores = s.get("scores") or {}
+        fact = scores.get("model_graded_fact") or {}
+
+        flattened.append(
+            {
+                "id": s.get("id"),
+                "question": s.get("input"),
+                "ground_truth": s.get("target"),
+                "model_output": model_output,
+                "grade": fact.get("value"),
+            }
+        )
+
+    return flattened
+
+
+def main() -> None:
+    """Parse CLI arguments and write the summary plus flattened rows as JSONL."""
+    parser = argparse.ArgumentParser()
+    parser.add_argument("--log_path", type=str, required=True)
+    parser.add_argument("--out_path", type=str, required=True)
+    args = parser.parse_args()
+
+    log_path = Path(args.log_path)
+    out_path = Path(args.out_path)
+    out_path.parent.mkdir(parents=True, exist_ok=True)
+
+    rows = flatten_inspect_log(log_path)
+
+    with out_path.open("w", encoding="utf-8") as f:
+        f.write(json.dumps(_summarize(rows), ensure_ascii=False) + "\n")
+        for row in rows:
+            f.write(json.dumps(row, ensure_ascii=False) + "\n")
+
+
+if __name__ == "__main__":
+    main()
diff --git a/scripts/static_benchmarks/bizbench_eval.sh b/scripts/static_benchmarks/bizbench_eval.sh
new file mode 100644
index 00000000..5a067592
--- /dev/null
+++ b/scripts/static_benchmarks/bizbench_eval.sh
@@ -0,0 +1,78 @@
+#!/bin/bash
+#SBATCH --job-name=bizbench_eval
+#SBATCH --output=logs/bizbench_eval_%j.out
+#SBATCH --error=logs/bizbench_eval_%j.err
+#SBATCH --time=04:00:00
+#SBATCH --cpus-per-task=4
+#SBATCH --mem=16G
+#
+# Slurm job: evaluate subject models on kensho/bizbench (split=test, limit=30).
+# Runs stage 0_static, stage 1 and stage 2 of src.run_eval_pipeline under one
+# timestamped tag, then flattens the resulting Inspect logs to JSONL.
+# NOTE(review): SBATCH log paths are relative; confirm logs/ exists at submit time.
+
+set -euo pipefail
+
+# Default cluster checkout; set ACE_REPO_ROOT to run from another clone.
+cd "${ACE_REPO_ROOT:-/fs01/projects/DeepLesion/projects/new_ace/automated_capability_evaluation}"
+
+# One tag ties together the datasets, results and scores of this run.
+VALIDATION_TAG="_BIZBENCH_$(date +%Y%m%d_%H%M%S)"
+
+# Stage 0_static: build datasets from kensho/bizbench
+python -m src.run_eval_pipeline \
+  stage=0_static \
+  validation_tag="$VALIDATION_TAG" \
+  +static_benchmark_cfg.benchmark_id=kensho/bizbench \
+  +static_benchmark_cfg.split=test \
+  +static_benchmark_cfg.limit=30
+
+# Stage 1: run subject models on the static datasets
+python -m src.run_eval_pipeline \
+  stage=1 \
+  validation_tag="$VALIDATION_TAG" \
+  eval_tag="$VALIDATION_TAG"
+
+# Stage 2: aggregate scores
+python -m src.run_eval_pipeline \
+  stage=2 \
+  eval_tag="$VALIDATION_TAG"
+
+echo "Stage 0_static datasets: base_output/test_exp/eval/datasets/$VALIDATION_TAG"
+echo "Stage 1 results (Inspect logs): base_output/test_exp/eval/results/$VALIDATION_TAG"
+echo "Stage 2 scores: base_output/test_exp/eval/scores/$VALIDATION_TAG"
+
+# Optional: generate flattened JSONL views of Inspect logs for easier reading.
+# Layout walked below: <results>/<model>/<area>/<capability>/*_task_*.json
+RESULTS_DIR="base_output/test_exp/eval/results/$VALIDATION_TAG"
+if [ -d "$RESULTS_DIR" ]; then
+  echo "Flattening Inspect logs under $RESULTS_DIR ..."
+  for model_dir in "$RESULTS_DIR"/*/; do
+    [ -d "$model_dir" ] || continue
+    model_name="$(basename "$model_dir")"
+    for area_dir in "$model_dir"*/; do
+      [ -d "$area_dir" ] || continue
+      for cap_dir in "$area_dir"*/; do
+        [ -d "$cap_dir" ] || continue
+        cap_name="$(basename "$cap_dir")"
+        # Take the first matching log from the glob itself instead of parsing ls;
+        # an unmatched pattern stays literal, hence the -f check.
+        log_file=""
+        for candidate in "$cap_dir"*_task_*.json; do
+          if [ -f "$candidate" ]; then
+            log_file="$candidate"
+          fi
+          break
+        done
+        if [ -n "$log_file" ]; then
+          # cap_dir already ends in "/", so no extra separator is needed.
+          out_file="${cap_dir}flat_${cap_name}.jsonl"
+          python scripts/flatten_inspect_logs.py \
+            --log_path "$log_file" \
+            --out_path "$out_file"
+          echo "  Wrote flattened log for $model_name/$cap_name to $out_file"
+        fi
+      done
+    done
+  done
+fi
diff --git a/scripts/static_benchmarks/finance_math_eval.sh b/scripts/static_benchmarks/finance_math_eval.sh
new file mode 100644
index 00000000..24b8e257
--- /dev/null
+++ b/scripts/static_benchmarks/finance_math_eval.sh
@@ -0,0 +1,78 @@
+#!/bin/bash
+#SBATCH --job-name=finance_math_eval
+#SBATCH --output=logs/finance_math_eval_%j.out
+#SBATCH --error=logs/finance_math_eval_%j.err
+#SBATCH --time=04:00:00
+#SBATCH --cpus-per-task=4
+#SBATCH --mem=16G
+#
+# Slurm job: evaluate subject models on yale-nlp/FinanceMath (split=validation, limit=30).
+# Runs stage 0_static, stage 1 and stage 2 of src.run_eval_pipeline under one
+# timestamped tag, then flattens the resulting Inspect logs to JSONL.
+# NOTE(review): SBATCH log paths are relative; confirm logs/ exists at submit time.
+
+set -euo pipefail
+
+# Default cluster checkout; set ACE_REPO_ROOT to run from another clone.
+cd "${ACE_REPO_ROOT:-/fs01/projects/DeepLesion/projects/new_ace/automated_capability_evaluation}"
+
+# One tag ties together the datasets, results and scores of this run.
+VALIDATION_TAG="_FINANCE_MATH_$(date +%Y%m%d_%H%M%S)"
+
+# Stage 0_static: build datasets from yale-nlp/FinanceMath (validation split only)
+python -m src.run_eval_pipeline \
+  stage=0_static \
+  validation_tag="$VALIDATION_TAG" \
+  +static_benchmark_cfg.benchmark_id=yale-nlp/FinanceMath \
+  +static_benchmark_cfg.split=validation \
+  +static_benchmark_cfg.limit=30
+
+# Stage 1: run subject models on the static datasets
+python -m src.run_eval_pipeline \
+  stage=1 \
+  validation_tag="$VALIDATION_TAG" \
+  eval_tag="$VALIDATION_TAG"
+
+# Stage 2: aggregate scores
+python -m src.run_eval_pipeline \
+  stage=2 \
+  eval_tag="$VALIDATION_TAG"
+
+echo "Stage 0_static datasets: base_output/test_exp/eval/datasets/$VALIDATION_TAG"
+echo "Stage 1 results (Inspect logs): base_output/test_exp/eval/results/$VALIDATION_TAG"
+echo "Stage 2 scores: base_output/test_exp/eval/scores/$VALIDATION_TAG"
+
+# Optional: generate flattened JSONL views of Inspect logs for easier reading.
+# Layout walked below: <results>/<model>/<area>/<capability>/*_task_*.json
+RESULTS_DIR="base_output/test_exp/eval/results/$VALIDATION_TAG"
+if [ -d "$RESULTS_DIR" ]; then
+  echo "Flattening Inspect logs under $RESULTS_DIR ..."
+  for model_dir in "$RESULTS_DIR"/*/; do
+    [ -d "$model_dir" ] || continue
+    model_name="$(basename "$model_dir")"
+    for area_dir in "$model_dir"*/; do
+      [ -d "$area_dir" ] || continue
+      for cap_dir in "$area_dir"*/; do
+        [ -d "$cap_dir" ] || continue
+        cap_name="$(basename "$cap_dir")"
+        # Take the first matching log from the glob itself instead of parsing ls;
+        # an unmatched pattern stays literal, hence the -f check.
+        log_file=""
+        for candidate in "$cap_dir"*_task_*.json; do
+          if [ -f "$candidate" ]; then
+            log_file="$candidate"
+          fi
+          break
+        done
+        if [ -n "$log_file" ]; then
+          # cap_dir already ends in "/", so no extra separator is needed.
+          out_file="${cap_dir}flat_${cap_name}.jsonl"
+          python scripts/flatten_inspect_logs.py \
+            --log_path "$log_file" \
+            --out_path "$out_file"
+          echo "  Wrote flattened log for $model_name/$cap_name to $out_file"
+        fi
+      done
+    done
+  done
+fi
diff --git a/scripts/static_benchmarks/hardmath_eval.sh b/scripts/static_benchmarks/hardmath_eval.sh
new file mode 100755
index 00000000..a309e089
--- /dev/null
+++ b/scripts/static_benchmarks/hardmath_eval.sh
@@ -0,0 +1,77 @@
+#!/bin/bash
+#SBATCH --job-name=hardmath_eval
+#SBATCH --output=logs/hardmath_eval_%j.out
+#SBATCH --error=logs/hardmath_eval_%j.err
+#SBATCH --time=04:00:00
+#SBATCH --cpus-per-task=4
+#SBATCH --mem=16G
+#
+# Slurm job: evaluate subject models on HARDMath (limit=500, no split override).
+# Runs stage 0_static, stage 1 and stage 2 of src.run_eval_pipeline under one
+# timestamped tag, then flattens the resulting Inspect logs to JSONL.
+# NOTE(review): SBATCH log paths are relative; confirm logs/ exists at submit time.
+
+set -euo pipefail
+
+# Default cluster checkout; set ACE_REPO_ROOT to run from another clone.
+cd "${ACE_REPO_ROOT:-/fs01/projects/DeepLesion/projects/new_ace/automated_capability_evaluation}"
+
+# One tag ties together the datasets, results and scores of this run.
+VALIDATION_TAG="_HARDMATH_$(date +%Y%m%d_%H%M%S)"
+
+# Stage 0_static: build datasets from HARDMath
+python -m src.run_eval_pipeline \
+  stage=0_static \
+  validation_tag="$VALIDATION_TAG" \
+  +static_benchmark_cfg.benchmark_id=HARDMath \
+  +static_benchmark_cfg.limit=500
+
+# Stage 1: run subject models on the static datasets
+python -m src.run_eval_pipeline \
+  stage=1 \
+  validation_tag="$VALIDATION_TAG" \
+  eval_tag="$VALIDATION_TAG"
+
+# Stage 2: aggregate scores
+python -m src.run_eval_pipeline \
+  stage=2 \
+  eval_tag="$VALIDATION_TAG"
+
+echo "Stage 0_static datasets: base_output/test_exp/eval/datasets/$VALIDATION_TAG"
+echo "Stage 1 results (Inspect logs): base_output/test_exp/eval/results/$VALIDATION_TAG"
+echo "Stage 2 scores: base_output/test_exp/eval/scores/$VALIDATION_TAG"
+
+# Optional: generate flattened JSONL views of Inspect logs for easier reading.
+# Layout walked below: <results>/<model>/<area>/<capability>/*_task_*.json
+RESULTS_DIR="base_output/test_exp/eval/results/$VALIDATION_TAG"
+if [ -d "$RESULTS_DIR" ]; then
+  echo "Flattening Inspect logs under $RESULTS_DIR ..."
+  for model_dir in "$RESULTS_DIR"/*/; do
+    [ -d "$model_dir" ] || continue
+    model_name="$(basename "$model_dir")"
+    for area_dir in "$model_dir"*/; do
+      [ -d "$area_dir" ] || continue
+      for cap_dir in "$area_dir"*/; do
+        [ -d "$cap_dir" ] || continue
+        cap_name="$(basename "$cap_dir")"
+        # Take the first matching log from the glob itself instead of parsing ls;
+        # an unmatched pattern stays literal, hence the -f check.
+        log_file=""
+        for candidate in "$cap_dir"*_task_*.json; do
+          if [ -f "$candidate" ]; then
+            log_file="$candidate"
+          fi
+          break
+        done
+        if [ -n "$log_file" ]; then
+          # cap_dir already ends in "/", so no extra separator is needed.
+          out_file="${cap_dir}flat_${cap_name}.jsonl"
+          python scripts/flatten_inspect_logs.py \
+            --log_path "$log_file" \
+            --out_path "$out_file"
+          echo "  Wrote flattened log for $model_name/$cap_name to $out_file"
+        fi
+      done
+    done
+  done
+fi
diff --git a/scripts/static_benchmarks/harp_eval.sh b/scripts/static_benchmarks/harp_eval.sh
new file mode 100644
index 00000000..952cb81c
--- /dev/null
+++ b/scripts/static_benchmarks/harp_eval.sh
@@ -0,0 +1,78 @@
+#!/bin/bash
+#SBATCH --job-name=harp_eval
+#SBATCH --output=logs/harp_eval_%j.out
+#SBATCH --error=logs/harp_eval_%j.err
+#SBATCH --time=04:00:00
+#SBATCH --cpus-per-task=4
+#SBATCH --mem=16G
+#
+# Slurm job: evaluate subject models on aadityasingh/HARP (split=train, limit=30).
+# Runs stage 0_static, stage 1 and stage 2 of src.run_eval_pipeline under one
+# timestamped tag, then flattens the resulting Inspect logs to JSONL.
+# NOTE(review): SBATCH log paths are relative; confirm logs/ exists at submit time.
+
+set -euo pipefail
+
+# Default cluster checkout; set ACE_REPO_ROOT to run from another clone.
+cd "${ACE_REPO_ROOT:-/fs01/projects/DeepLesion/projects/new_ace/automated_capability_evaluation}"
+
+# One tag ties together the datasets, results and scores of this run.
+VALIDATION_TAG="_HARP_$(date +%Y%m%d_%H%M%S)"
+
+# Stage 0_static: build datasets from aadityasingh/HARP (main JSONL split)
+python -m src.run_eval_pipeline \
+  stage=0_static \
+  validation_tag="$VALIDATION_TAG" \
+  +static_benchmark_cfg.benchmark_id=aadityasingh/HARP \
+  +static_benchmark_cfg.split=train \
+  +static_benchmark_cfg.limit=30
+
+# Stage 1: run subject models on the static datasets
+python -m src.run_eval_pipeline \
+  stage=1 \
+  validation_tag="$VALIDATION_TAG" \
+  eval_tag="$VALIDATION_TAG"
+
+# Stage 2: aggregate scores
+python -m src.run_eval_pipeline \
+  stage=2 \
+  eval_tag="$VALIDATION_TAG"
+
+echo "Stage 0_static datasets: base_output/test_exp/eval/datasets/$VALIDATION_TAG"
+echo "Stage 1 results (Inspect logs): base_output/test_exp/eval/results/$VALIDATION_TAG"
+echo "Stage 2 scores: base_output/test_exp/eval/scores/$VALIDATION_TAG"
+
+# Optional: generate flattened JSONL views of Inspect logs for easier reading.
+# Layout walked below: <results>/<model>/<area>/<capability>/*_task_*.json
+RESULTS_DIR="base_output/test_exp/eval/results/$VALIDATION_TAG"
+if [ -d "$RESULTS_DIR" ]; then
+  echo "Flattening Inspect logs under $RESULTS_DIR ..."
+  for model_dir in "$RESULTS_DIR"/*/; do
+    [ -d "$model_dir" ] || continue
+    model_name="$(basename "$model_dir")"
+    for area_dir in "$model_dir"*/; do
+      [ -d "$area_dir" ] || continue
+      for cap_dir in "$area_dir"*/; do
+        [ -d "$cap_dir" ] || continue
+        cap_name="$(basename "$cap_dir")"
+        # Take the first matching log from the glob itself instead of parsing ls;
+        # an unmatched pattern stays literal, hence the -f check.
+        log_file=""
+        for candidate in "$cap_dir"*_task_*.json; do
+          if [ -f "$candidate" ]; then
+            log_file="$candidate"
+          fi
+          break
+        done
+        if [ -n "$log_file" ]; then
+          # cap_dir already ends in "/", so no extra separator is needed.
+          out_file="${cap_dir}flat_${cap_name}.jsonl"
+          python scripts/flatten_inspect_logs.py \
+            --log_path "$log_file" \
+            --out_path "$out_file"
+          echo "  Wrote flattened log for $model_name/$cap_name to $out_file"
+        fi
+      done
+    done
+  done
+fi
diff --git a/scripts/static_benchmarks/math500_eval.sh b/scripts/static_benchmarks/math500_eval.sh
new file mode 100755
index 00000000..dfa33048
--- /dev/null
+++ b/scripts/static_benchmarks/math500_eval.sh
@@ -0,0 +1,78 @@
+#!/bin/bash
+#SBATCH --job-name=math500_eval
+#SBATCH --output=logs/math500_eval_%j.out
+#SBATCH --error=logs/math500_eval_%j.err
+#SBATCH --time=02:00:00
+#SBATCH --cpus-per-task=4
+#SBATCH --mem=16G
+#
+# Slurm job: evaluate subject models on HuggingFaceH4/MATH-500 (split=test, limit=30).
+# Runs stage 0_static, stage 1 and stage 2 of src.run_eval_pipeline under one
+# timestamped tag, then flattens the resulting Inspect logs to JSONL.
+# NOTE(review): SBATCH log paths are relative; confirm logs/ exists at submit time.
+
+set -euo pipefail
+
+# Default cluster checkout; set ACE_REPO_ROOT to run from another clone.
+cd "${ACE_REPO_ROOT:-/fs01/projects/DeepLesion/projects/new_ace/automated_capability_evaluation}"
+
+# One tag ties together the datasets, results and scores of this run.
+VALIDATION_TAG="_MATH500_$(date +%Y%m%d_%H%M%S)"
+
+# Stage 0_static: build datasets from HF MATH-500
+python -m src.run_eval_pipeline \
+  stage=0_static \
+  validation_tag="$VALIDATION_TAG" \
+  +static_benchmark_cfg.benchmark_id=HuggingFaceH4/MATH-500 \
+  +static_benchmark_cfg.split=test \
+  +static_benchmark_cfg.limit=30
+
+# Stage 1: run subject models on the static datasets
+python -m src.run_eval_pipeline \
+  stage=1 \
+  validation_tag="$VALIDATION_TAG" \
+  eval_tag="$VALIDATION_TAG"
+
+# Stage 2: aggregate scores
+python -m src.run_eval_pipeline \
+  stage=2 \
+  eval_tag="$VALIDATION_TAG"
+
+echo "Stage 0_static datasets: base_output/test_exp/eval/datasets/$VALIDATION_TAG"
+echo "Stage 1 results (Inspect logs): base_output/test_exp/eval/results/$VALIDATION_TAG"
+echo "Stage 2 scores: base_output/test_exp/eval/scores/$VALIDATION_TAG"
+
+# Optional: generate flattened JSONL views of Inspect logs for easier reading.
+# Layout walked below: <results>/<model>/<area>/<capability>/*_task_*.json
+RESULTS_DIR="base_output/test_exp/eval/results/$VALIDATION_TAG"
+if [ -d "$RESULTS_DIR" ]; then
+  echo "Flattening Inspect logs under $RESULTS_DIR ..."
+  for model_dir in "$RESULTS_DIR"/*/; do
+    [ -d "$model_dir" ] || continue
+    model_name="$(basename "$model_dir")"
+    for area_dir in "$model_dir"*/; do
+      [ -d "$area_dir" ] || continue
+      for cap_dir in "$area_dir"*/; do
+        [ -d "$cap_dir" ] || continue
+        cap_name="$(basename "$cap_dir")"
+        # Take the first matching log from the glob itself instead of parsing ls;
+        # an unmatched pattern stays literal, hence the -f check.
+        log_file=""
+        for candidate in "$cap_dir"*_task_*.json; do
+          if [ -f "$candidate" ]; then
+            log_file="$candidate"
+          fi
+          break
+        done
+        if [ -n "$log_file" ]; then
+          # cap_dir already ends in "/", so no extra separator is needed.
+          out_file="${cap_dir}flat_${cap_name}.jsonl"
+          python scripts/flatten_inspect_logs.py \
+            --log_path "$log_file" \
+            --out_path "$out_file"
+          echo "  Wrote flattened log for $model_name/$cap_name to $out_file"
+        fi
+      done
+    done
+  done
+fi
diff --git a/scripts/static_benchmarks/minif2f_eval.sh b/scripts/static_benchmarks/minif2f_eval.sh
new file mode 100644
index 00000000..8e5bd38c
--- /dev/null
+++ b/scripts/static_benchmarks/minif2f_eval.sh
@@ -0,0 +1,78 @@
+#!/bin/bash
+#SBATCH --job-name=minif2f_eval
+#SBATCH --output=logs/minif2f_eval_%j.out
+#SBATCH --error=logs/minif2f_eval_%j.err
+#SBATCH --time=04:00:00
+#SBATCH --cpus-per-task=4
+#SBATCH --mem=16G
+#
+# Slurm job: evaluate subject models on Tonic/MiniF2F (split=train, limit=30).
+# Runs stage 0_static, stage 1 and stage 2 of src.run_eval_pipeline under one
+# timestamped tag, then flattens the resulting Inspect logs to JSONL.
+# NOTE(review): SBATCH log paths are relative; confirm logs/ exists at submit time.
+
+set -euo pipefail
+
+# Default cluster checkout; set ACE_REPO_ROOT to run from another clone.
+cd "${ACE_REPO_ROOT:-/fs01/projects/DeepLesion/projects/new_ace/automated_capability_evaluation}"
+
+# One tag ties together the datasets, results and scores of this run.
+VALIDATION_TAG="_MINIF2F_$(date +%Y%m%d_%H%M%S)"
+
+# Stage 0_static: build datasets from Tonic/MiniF2F (train split only)
+python -m src.run_eval_pipeline \
+  stage=0_static \
+  validation_tag="$VALIDATION_TAG" \
+  +static_benchmark_cfg.benchmark_id=Tonic/MiniF2F \
+  +static_benchmark_cfg.split=train \
+  +static_benchmark_cfg.limit=30
+
+# Stage 1: run subject models on the static datasets
+python -m src.run_eval_pipeline \
+  stage=1 \
+  validation_tag="$VALIDATION_TAG" \
+  eval_tag="$VALIDATION_TAG"
+
+# Stage 2: aggregate scores
+python -m src.run_eval_pipeline \
+  stage=2 \
+  eval_tag="$VALIDATION_TAG"
+
+echo "Stage 0_static datasets: base_output/test_exp/eval/datasets/$VALIDATION_TAG"
+echo "Stage 1 results (Inspect logs): base_output/test_exp/eval/results/$VALIDATION_TAG"
+echo "Stage 2 scores: base_output/test_exp/eval/scores/$VALIDATION_TAG"
+
+# Optional: generate flattened JSONL views of Inspect logs for easier reading.
+# Layout walked below: <results>/<model>/<area>/<capability>/*_task_*.json
+RESULTS_DIR="base_output/test_exp/eval/results/$VALIDATION_TAG"
+if [ -d "$RESULTS_DIR" ]; then
+  echo "Flattening Inspect logs under $RESULTS_DIR ..."
+  for model_dir in "$RESULTS_DIR"/*/; do
+    [ -d "$model_dir" ] || continue
+    model_name="$(basename "$model_dir")"
+    for area_dir in "$model_dir"*/; do
+      [ -d "$area_dir" ] || continue
+      for cap_dir in "$area_dir"*/; do
+        [ -d "$cap_dir" ] || continue
+        cap_name="$(basename "$cap_dir")"
+        # Take the first matching log from the glob itself instead of parsing ls;
+        # an unmatched pattern stays literal, hence the -f check.
+        log_file=""
+        for candidate in "$cap_dir"*_task_*.json; do
+          if [ -f "$candidate" ]; then
+            log_file="$candidate"
+          fi
+          break
+        done
+        if [ -n "$log_file" ]; then
+          # cap_dir already ends in "/", so no extra separator is needed.
+          out_file="${cap_dir}flat_${cap_name}.jsonl"
+          python scripts/flatten_inspect_logs.py \
+            --log_path "$log_file" \
+            --out_path "$out_file"
+          echo "  Wrote flattened log for $model_name/$cap_name to $out_file"
+        fi
+      done
+    done
+  done
+fi
diff --git a/scripts/static_benchmarks/omni_math_eval.sh b/scripts/static_benchmarks/omni_math_eval.sh
new file mode 100644
index 00000000..3cc4bc8a
--- /dev/null
+++ b/scripts/static_benchmarks/omni_math_eval.sh
@@ -0,0 +1,78 @@
+#!/bin/bash
+#SBATCH --job-name=omni_math_eval
+#SBATCH --output=logs/omni_math_eval_%j.out
+#SBATCH --error=logs/omni_math_eval_%j.err
+#SBATCH --time=04:00:00
+#SBATCH --cpus-per-task=4
+#SBATCH --mem=16G
+#
+# Slurm job: evaluate subject models on KbsdJames/Omni-MATH (split=test, limit=30).
+# Runs stage 0_static, stage 1 and stage 2 of src.run_eval_pipeline under one
+# timestamped tag, then flattens the resulting Inspect logs to JSONL.
+# NOTE(review): SBATCH log paths are relative; confirm logs/ exists at submit time.
+
+set -euo pipefail
+
+# Default cluster checkout; set ACE_REPO_ROOT to run from another clone.
+cd "${ACE_REPO_ROOT:-/fs01/projects/DeepLesion/projects/new_ace/automated_capability_evaluation}"
+
+# One tag ties together the datasets, results and scores of this run.
+VALIDATION_TAG="_OMNI_MATH_$(date +%Y%m%d_%H%M%S)"
+
+# Stage 0_static: build datasets from KbsdJames/Omni-MATH (test split)
+python -m src.run_eval_pipeline \
+  stage=0_static \
+  validation_tag="$VALIDATION_TAG" \
+  +static_benchmark_cfg.benchmark_id=KbsdJames/Omni-MATH \
+  +static_benchmark_cfg.split=test \
+  +static_benchmark_cfg.limit=30
+
+# Stage 1: run subject models on the static datasets
+python -m src.run_eval_pipeline \
+  stage=1 \
+  validation_tag="$VALIDATION_TAG" \
+  eval_tag="$VALIDATION_TAG"
+
+# Stage 2: aggregate scores
+python -m src.run_eval_pipeline \
+  stage=2 \
+  eval_tag="$VALIDATION_TAG"
+
+echo "Stage 0_static datasets: base_output/test_exp/eval/datasets/$VALIDATION_TAG"
+echo "Stage 1 results (Inspect logs): base_output/test_exp/eval/results/$VALIDATION_TAG"
+echo "Stage 2 scores: base_output/test_exp/eval/scores/$VALIDATION_TAG"
+
+# Optional: generate flattened JSONL views of Inspect logs for easier reading.
+# Layout walked below: <results>/<model>/<area>/<capability>/*_task_*.json
+RESULTS_DIR="base_output/test_exp/eval/results/$VALIDATION_TAG"
+if [ -d "$RESULTS_DIR" ]; then
+  echo "Flattening Inspect logs under $RESULTS_DIR ..."
+  for model_dir in "$RESULTS_DIR"/*/; do
+    [ -d "$model_dir" ] || continue
+    model_name="$(basename "$model_dir")"
+    for area_dir in "$model_dir"*/; do
+      [ -d "$area_dir" ] || continue
+      for cap_dir in "$area_dir"*/; do
+        [ -d "$cap_dir" ] || continue
+        cap_name="$(basename "$cap_dir")"
+        # Take the first matching log from the glob itself instead of parsing ls;
+        # an unmatched pattern stays literal, hence the -f check.
+        log_file=""
+        for candidate in "$cap_dir"*_task_*.json; do
+          if [ -f "$candidate" ]; then
+            log_file="$candidate"
+          fi
+          break
+        done
+        if [ -n "$log_file" ]; then
+          # cap_dir already ends in "/", so no extra separator is needed.
+          out_file="${cap_dir}flat_${cap_name}.jsonl"
+          python scripts/flatten_inspect_logs.py \
+            --log_path "$log_file" \
+            --out_path "$out_file"
+          echo "  Wrote flattened log for $model_name/$cap_name to $out_file"
+        fi
+      done
+    done
+  done
+fi
diff --git a/scripts/static_benchmarks/orca_math_eval.sh b/scripts/static_benchmarks/orca_math_eval.sh
new file mode 100644
index 00000000..4e914c9e
--- /dev/null
+++ b/scripts/static_benchmarks/orca_math_eval.sh
@@ -0,0 +1,78 @@
+#!/bin/bash
+#SBATCH --job-name=orca_math_eval
+#SBATCH --output=logs/orca_math_eval_%j.out
+#SBATCH --error=logs/orca_math_eval_%j.err
+#SBATCH --time=04:00:00
+#SBATCH --cpus-per-task=4
+#SBATCH --mem=16G
+#
+# Slurm job: evaluate subject models on microsoft/orca-math-word-problems-200k (split=train, limit=30).
+# Runs stage 0_static, stage 1 and stage 2 of src.run_eval_pipeline under one
+# timestamped tag, then flattens the resulting Inspect logs to JSONL.
+# NOTE(review): SBATCH log paths are relative; confirm logs/ exists at submit time.
+
+set -euo pipefail
+
+# Default cluster checkout; set ACE_REPO_ROOT to run from another clone.
+cd "${ACE_REPO_ROOT:-/fs01/projects/DeepLesion/projects/new_ace/automated_capability_evaluation}"
+
+# One tag ties together the datasets, results and scores of this run.
+VALIDATION_TAG="_ORCA_MATH_$(date +%Y%m%d_%H%M%S)"
+
+# Stage 0_static: build datasets from microsoft/orca-math-word-problems-200k (train split only)
+python -m src.run_eval_pipeline \
+  stage=0_static \
+  validation_tag="$VALIDATION_TAG" \
+  +static_benchmark_cfg.benchmark_id=microsoft/orca-math-word-problems-200k \
+  +static_benchmark_cfg.split=train \
+  +static_benchmark_cfg.limit=30
+
+# Stage 1: run subject models on the static datasets
+python -m src.run_eval_pipeline \
+  stage=1 \
+  validation_tag="$VALIDATION_TAG" \
+  eval_tag="$VALIDATION_TAG"
+
+# Stage 2: aggregate scores
+python -m src.run_eval_pipeline \
+  stage=2 \
+  eval_tag="$VALIDATION_TAG"
+
+echo "Stage 0_static datasets: base_output/test_exp/eval/datasets/$VALIDATION_TAG"
+echo "Stage 1 results (Inspect logs): base_output/test_exp/eval/results/$VALIDATION_TAG"
+echo "Stage 2 scores: base_output/test_exp/eval/scores/$VALIDATION_TAG"
+
+# Optional: generate flattened JSONL views of Inspect logs for easier reading.
+# Layout walked below: <results>/<model>/<area>/<capability>/*_task_*.json
+RESULTS_DIR="base_output/test_exp/eval/results/$VALIDATION_TAG"
+if [ -d "$RESULTS_DIR" ]; then
+  echo "Flattening Inspect logs under $RESULTS_DIR ..."
+  for model_dir in "$RESULTS_DIR"/*/; do
+    [ -d "$model_dir" ] || continue
+    model_name="$(basename "$model_dir")"
+    for area_dir in "$model_dir"*/; do
+      [ -d "$area_dir" ] || continue
+      for cap_dir in "$area_dir"*/; do
+        [ -d "$cap_dir" ] || continue
+        cap_name="$(basename "$cap_dir")"
+        # Take the first matching log from the glob itself instead of parsing ls;
+        # an unmatched pattern stays literal, hence the -f check.
+        log_file=""
+        for candidate in "$cap_dir"*_task_*.json; do
+          if [ -f "$candidate" ]; then
+            log_file="$candidate"
+          fi
+          break
+        done
+        if [ -n "$log_file" ]; then
+          # cap_dir already ends in "/", so no extra separator is needed.
+          out_file="${cap_dir}flat_${cap_name}.jsonl"
+          python scripts/flatten_inspect_logs.py \
+            --log_path "$log_file" \
+            --out_path "$out_file"
+          echo "  Wrote flattened log for $model_name/$cap_name to $out_file"
+        fi
+      done
+    done
+  done
+fi
diff --git a/scripts/static_benchmarks/proofnet_eval.sh b/scripts/static_benchmarks/proofnet_eval.sh
new file mode 100644
index 00000000..d200e516
--- /dev/null
+++ b/scripts/static_benchmarks/proofnet_eval.sh
@@ -0,0 +1,78 @@
+#!/bin/bash
+#SBATCH --job-name=proofnet_eval
+#SBATCH --output=logs/proofnet_eval_%j.out
+#SBATCH --error=logs/proofnet_eval_%j.err
+#SBATCH --time=04:00:00
+#SBATCH --cpus-per-task=4
+#SBATCH --mem=16G
+#
+# Slurm job: evaluate subject models on hoskinson-center/proofnet (split=validation, limit=30).
+# Runs stage 0_static, stage 1 and stage 2 of src.run_eval_pipeline under one
+# timestamped tag, then flattens the resulting Inspect logs to JSONL.
+# NOTE(review): SBATCH log paths are relative; confirm logs/ exists at submit time.
+
+set -euo pipefail
+
+# Default cluster checkout; set ACE_REPO_ROOT to run from another clone.
+cd "${ACE_REPO_ROOT:-/fs01/projects/DeepLesion/projects/new_ace/automated_capability_evaluation}"
+
+# One tag ties together the datasets, results and scores of this run.
+VALIDATION_TAG="_PROOFNET_$(date +%Y%m%d_%H%M%S)"
+
+# Stage 0_static: build datasets from hoskinson-center/proofnet (plain_text, validation)
+python -m src.run_eval_pipeline \
+  stage=0_static \
+  validation_tag="$VALIDATION_TAG" \
+  +static_benchmark_cfg.benchmark_id=hoskinson-center/proofnet \
+  +static_benchmark_cfg.split=validation \
+  +static_benchmark_cfg.limit=30
+
+# Stage 1: run subject models on the static datasets
+python -m src.run_eval_pipeline \
+  stage=1 \
+  validation_tag="$VALIDATION_TAG" \
+  eval_tag="$VALIDATION_TAG"
+
+# Stage 2: aggregate scores
+python -m src.run_eval_pipeline \
+  stage=2 \
+  eval_tag="$VALIDATION_TAG"
+
+echo "Stage 0_static datasets: base_output/test_exp/eval/datasets/$VALIDATION_TAG"
+echo "Stage 1 results (Inspect logs): base_output/test_exp/eval/results/$VALIDATION_TAG"
+echo "Stage 2 scores: base_output/test_exp/eval/scores/$VALIDATION_TAG"
+
+# Optional: generate flattened JSONL views of Inspect logs for easier reading.
+# Layout walked below: <results>/<model>/<area>/<capability>/*_task_*.json
+RESULTS_DIR="base_output/test_exp/eval/results/$VALIDATION_TAG"
+if [ -d "$RESULTS_DIR" ]; then
+  echo "Flattening Inspect logs under $RESULTS_DIR ..."
+  for model_dir in "$RESULTS_DIR"/*/; do
+    [ -d "$model_dir" ] || continue
+    model_name="$(basename "$model_dir")"
+    for area_dir in "$model_dir"*/; do
+      [ -d "$area_dir" ] || continue
+      for cap_dir in "$area_dir"*/; do
+        [ -d "$cap_dir" ] || continue
+        cap_name="$(basename "$cap_dir")"
+        # Take the first matching log from the glob itself instead of parsing ls;
+        # an unmatched pattern stays literal, hence the -f check.
+        log_file=""
+        for candidate in "$cap_dir"*_task_*.json; do
+          if [ -f "$candidate" ]; then
+            log_file="$candidate"
+          fi
+          break
+        done
+        if [ -n "$log_file" ]; then
+          # cap_dir already ends in "/", so no extra separator is needed.
+          out_file="${cap_dir}flat_${cap_name}.jsonl"
+          python scripts/flatten_inspect_logs.py \
+            --log_path "$log_file" \
+            --out_path "$out_file"
+          echo "  Wrote flattened log for $model_name/$cap_name to $out_file"
+        fi
+      done
+    done
+  done
+fi
diff --git a/scripts/static_benchmarks/stateval_eval.sh b/scripts/static_benchmarks/stateval_eval.sh
new file mode 100755
index 00000000..097fef77
--- /dev/null
+++ b/scripts/static_benchmarks/stateval_eval.sh
@@ -0,0 +1,78 @@
+#!/bin/bash
+#SBATCH --job-name=stateval_eval
+#SBATCH --output=logs/stateval_eval_%j.out
+#SBATCH --error=logs/stateval_eval_%j.err
+#SBATCH --time=06:00:00
+#SBATCH --cpus-per-task=4
+#SBATCH --mem=32G
+#
+# Slurm job: evaluate subject models on StatEval (split=train, limit=30).
+# Runs stage 0_static, stage 1 and stage 2 of src.run_eval_pipeline under one
+# timestamped tag, then flattens the resulting Inspect logs to JSONL.
+# NOTE(review): SBATCH log paths are relative; confirm logs/ exists at submit time.
+
+set -euo pipefail
+
+# Default cluster checkout; set ACE_REPO_ROOT to run from another clone.
+cd "${ACE_REPO_ROOT:-/fs01/projects/DeepLesion/projects/new_ace/automated_capability_evaluation}"
+
+# One tag ties together the datasets, results and scores of this run.
+VALIDATION_TAG="_STATEVAL_$(date +%Y%m%d_%H%M%S)"
+
+# Stage 0_static: build datasets from StatEval
+python -m src.run_eval_pipeline \
+  stage=0_static \
+  validation_tag="$VALIDATION_TAG" \
+  +static_benchmark_cfg.benchmark_id=StatEval \
+  +static_benchmark_cfg.split=train \
+  +static_benchmark_cfg.limit=30
+
+# Stage 1: run subject models on the static datasets
+python -m src.run_eval_pipeline \
+  stage=1 \
+  validation_tag="$VALIDATION_TAG" \
+  eval_tag="$VALIDATION_TAG"
+
+# Stage 2: aggregate scores
+python -m src.run_eval_pipeline \
+  stage=2 \
+  eval_tag="$VALIDATION_TAG"
+
+echo "Stage 0_static datasets: base_output/test_exp/eval/datasets/$VALIDATION_TAG"
+echo "Stage 1 results (Inspect logs): base_output/test_exp/eval/results/$VALIDATION_TAG"
+echo "Stage 2 scores: base_output/test_exp/eval/scores/$VALIDATION_TAG"
+
+# Optional: generate flattened JSONL views of Inspect logs for easier reading.
+# Layout walked below: <results>/<model>/<area>/<capability>/*_task_*.json
+RESULTS_DIR="base_output/test_exp/eval/results/$VALIDATION_TAG"
+if [ -d "$RESULTS_DIR" ]; then
+  echo "Flattening Inspect logs under $RESULTS_DIR ..."
+  for model_dir in "$RESULTS_DIR"/*/; do
+    [ -d "$model_dir" ] || continue
+    model_name="$(basename "$model_dir")"
+    for area_dir in "$model_dir"*/; do
+      [ -d "$area_dir" ] || continue
+      for cap_dir in "$area_dir"*/; do
+        [ -d "$cap_dir" ] || continue
+        cap_name="$(basename "$cap_dir")"
+        # Take the first matching log from the glob itself instead of parsing ls;
+        # an unmatched pattern stays literal, hence the -f check.
+        log_file=""
+        for candidate in "$cap_dir"*_task_*.json; do
+          if [ -f "$candidate" ]; then
+            log_file="$candidate"
+          fi
+          break
+        done
+        if [ -n "$log_file" ]; then
+          # cap_dir already ends in "/", so no extra separator is needed.
+          out_file="${cap_dir}flat_${cap_name}.jsonl"
+          python scripts/flatten_inspect_logs.py \
+            --log_path "$log_file" \
+            --out_path "$out_file"
+          echo "  Wrote flattened log for $model_name/$cap_name to $out_file"
+        fi
+      done
+    done
+  done
+fi
diff --git a/scripts/static_benchmarks/submit_all_static_benchmarks.sh b/scripts/static_benchmarks/submit_all_static_benchmarks.sh
new file mode 100755
index 00000000..2469076a
--- /dev/null
+++ b/scripts/static_benchmarks/submit_all_static_benchmarks.sh
@@ -0,0 +1,20 @@
+#!/bin/bash
+# Submit one Slurm job per static-benchmark evaluation script.
+set -euo pipefail
+
+cd /fs01/projects/DeepLesion/projects/new_ace/automated_capability_evaluation
+
+# Make scripts executable and create logs/ (target of the jobs' #SBATCH --output/--error, e.g. wemath_eval.sh).
+chmod +x scripts/static_benchmarks/*_eval.sh || true
+mkdir -p logs
+sbatch scripts/static_benchmarks/math500_eval.sh
+sbatch scripts/static_benchmarks/hardmath_eval.sh
+sbatch scripts/static_benchmarks/wemath_eval.sh
+sbatch scripts/static_benchmarks/stateval_eval.sh
+sbatch scripts/static_benchmarks/orca_math_eval.sh
+sbatch scripts/static_benchmarks/proofnet_eval.sh
+sbatch scripts/static_benchmarks/harp_eval.sh
+sbatch scripts/static_benchmarks/finance_math_eval.sh
+sbatch scripts/static_benchmarks/bizbench_eval.sh
+sbatch scripts/static_benchmarks/omni_math_eval.sh
+sbatch scripts/static_benchmarks/minif2f_eval.sh
diff --git a/scripts/static_benchmarks/wemath_eval.sh b/scripts/static_benchmarks/wemath_eval.sh
new file mode 100755
index 00000000..eff18509
--- /dev/null
+++ b/scripts/static_benchmarks/wemath_eval.sh
@@ -0,0 +1,57 @@
+#!/bin/bash
+#SBATCH --job-name=wemath_eval
+#SBATCH --output=logs/wemath_eval_%j.out
+#SBATCH --error=logs/wemath_eval_%j.err
+#SBATCH --time=04:00:00
+#SBATCH --cpus-per-task=4
+#SBATCH --mem=16G
+# Runs eval stages 0_static, 1 and 2 for We-Math, then flattens the resulting Inspect logs.
+set -euo pipefail
+
+cd /fs01/projects/DeepLesion/projects/new_ace/automated_capability_evaluation
+
+VALIDATION_TAG="_WEMATH_$(date +%Y%m%d_%H%M%S)"  # timestamped tag reused by all three stages below
+# Stage 0_static: build eval datasets from the We-Math "testmini" split (limit=30).
+python -m src.run_eval_pipeline \
+  stage=0_static \
+  validation_tag="$VALIDATION_TAG" \
+  +static_benchmark_cfg.benchmark_id=We-Math/We-Math \
+  +static_benchmark_cfg.split=testmini \
+  +static_benchmark_cfg.limit=30
+# Stage 1: the eval tag is set to the same value as the validation tag.
+python -m src.run_eval_pipeline \
+  stage=1 \
+  validation_tag="$VALIDATION_TAG" \
+  eval_tag="$VALIDATION_TAG"
+# Stage 2: runs against that same eval tag.
+python -m src.run_eval_pipeline \
+  stage=2 \
+  eval_tag="$VALIDATION_TAG"
+
+echo "Stage 0_static datasets: base_output/test_exp/eval/datasets/$VALIDATION_TAG"
+echo "Stage 1 results (Inspect logs): base_output/test_exp/eval/results/$VALIDATION_TAG"
+echo "Stage 2 scores: base_output/test_exp/eval/scores/$VALIDATION_TAG"
+# Walk results/<model>/<area>/<capability>/ and flatten the first *_task_*.json log found in each capability dir.
+RESULTS_DIR="base_output/test_exp/eval/results/$VALIDATION_TAG"
+if [ -d "$RESULTS_DIR" ]; then
+  echo "Flattening Inspect logs under $RESULTS_DIR ..."
+  for model_dir in "$RESULTS_DIR"/*/; do
+    [ -d "$model_dir" ] || continue  # an unmatched glob stays literal; skip it
+    model_name="$(basename "$model_dir")"
+    for area_dir in "$model_dir"*/; do
+      [ -d "$area_dir" ] || continue
+      for cap_dir in "$area_dir"*/; do
+        [ -d "$cap_dir" ] || continue
+        cap_name="$(basename "$cap_dir")"
+        log_file="$(ls "$cap_dir"/*_task_*.json 2>/dev/null | head -n 1 || true)"  # "|| true": no match must not abort under pipefail
+        if [ -n "$log_file" ]; then
+          out_file="$cap_dir/flat_${cap_name}.jsonl"
+          python scripts/flatten_inspect_logs.py \
+            --log_path "$log_file" \
+            --out_path "$out_file"
+          echo "  Wrote flattened log for $model_name/$cap_name to $out_file"
+        fi
+      done
+    done
+  done
+fi
diff --git a/src/eval_stages/__init__.py b/src/eval_stages/__init__.py
index ff7eaa4c..1a5ab861 100644
--- a/src/eval_stages/__init__.py
+++ b/src/eval_stages/__init__.py
@@ -6,12 +6,14 @@
 """
 
 from src.eval_stages.stage0_setup_and_dataset import EvalSetupError, run_eval_stage0
+from src.eval_stages.stage0_static_benchmarks import run_eval_stage0_static
 from src.eval_stages.stage1_eval_execution import run_eval_stage1
 from src.eval_stages.stage2_score_aggregation import run_eval_stage2
 
 
 __all__ = [
     "run_eval_stage0",
+    "run_eval_stage0_static",
     "run_eval_stage1",
     "run_eval_stage2",
     "EvalSetupError",
diff --git a/src/eval_stages/stage0_static_benchmarks.py b/src/eval_stages/stage0_static_benchmarks.py
new file mode 100644
index 00000000..7278b42f
--- /dev/null
+++ b/src/eval_stages/stage0_static_benchmarks.py
@@ -0,0 +1,196 @@
+"""Eval Stage 0_static: Static benchmark ingestion.
+
+This stage lets you reuse Eval Stages 1 and 2 on external/static benchmarks
+(e.g., Hugging Face datasets) that do not originate from this repo's
+generation/validation pipeline.
+
+It converts a benchmark-specific schema into the pipeline's EvalDataset JSON
+format and writes outputs under:
+
+    <output_dir>/<exp_id>/eval/datasets/<validation_tag>/
+
+so that Stage 1 can run unchanged (it only needs eval_config.json plus one or
+more dataset.json files).
+"""
+
+from __future__ import annotations
+
+import logging
+import re
+from pathlib import Path
+from typing import Any, Dict, List
+
+from omegaconf import DictConfig, OmegaConf
+
+from src.eval_stages.static_benchmarks.hardmath import (
+    build_eval_datasets_from_hardmath,
+)
+from src.eval_stages.static_benchmarks.math500 import build_eval_datasets_from_math500
+from src.eval_stages.static_benchmarks.mathvista import (
+    build_eval_datasets_from_mathvista,
+)
+from src.eval_stages.static_benchmarks.orca_math import (
+    build_eval_datasets_from_orca_math,
+)
+from src.eval_stages.static_benchmarks.minif2f import (
+    build_eval_datasets_from_minif2f,
+)
+from src.eval_stages.static_benchmarks.omni_math import (
+    build_eval_datasets_from_omni_math,
+)
+from src.eval_stages.static_benchmarks.harp import (
+    build_eval_datasets_from_harp,
+)
+from src.eval_stages.static_benchmarks.finance_math import (
+    build_eval_datasets_from_finance_math,
+)
+from src.eval_stages.static_benchmarks.bizbench import (
+    build_eval_datasets_from_bizbench,
+)
+from src.eval_stages.static_benchmarks.proofnet import (
+    build_eval_datasets_from_proofnet,
+)
+from src.eval_stages.static_benchmarks.stateval import build_eval_datasets_from_stateval
+from src.eval_stages.static_benchmarks.wemath import build_eval_datasets_from_wemath
+from src.eval_stages.static_benchmarks.specs import StaticBenchmarkSpec
+from src.schemas.eval_io_utils import save_eval_config, save_eval_dataset
+from src.schemas.eval_schemas import EvalConfig, EvalDataset
+from src.schemas.metadata_schemas import PipelineMetadata
+from src.utils.timestamp_utils import iso_timestamp
+
+
+logger = logging.getLogger(__name__)
+
+
+def _slugify(text: str) -> str:
+    """Lowercase ``text`` and collapse non-alphanumeric runs to "_" ("unknown" if nothing remains)."""
+    slug = re.sub(r"[^a-zA-Z0-9]+", "_", text.strip()).strip("_").lower()
+    return slug if slug else "unknown"
+
+
+def _build_datasets_from_spec(spec: StaticBenchmarkSpec) -> List[EvalDataset]:
+    """Route ``spec`` to the adapter registered for its ``benchmark_id``.
+
+    The ID is stripped of surrounding whitespace and compared, case-sensitively,
+    against each adapter's accepted aliases. A list is returned because a single
+    static benchmark may be split into several capabilities.
+
+    Raises:
+        ValueError: if the ID matches none of the known aliases.
+    """
+    # (accepted aliases, adapter) pairs, checked in declaration order.
+    alias_table = (
+        ({"HuggingFaceH4/MATH-500", "math500", "MATH-500"}, build_eval_datasets_from_math500),
+        ({"HARDMath", "hardmath", "HARDMATH"}, build_eval_datasets_from_hardmath),
+        ({"We-Math/We-Math", "We-Math", "wemath", "WE-MATH"}, build_eval_datasets_from_wemath),
+        ({"AI4Math/MathVista", "MathVista", "mathvista"}, build_eval_datasets_from_mathvista),
+        (
+            {
+                "microsoft/orca-math-word-problems-200k",
+                "orca-math-word-problems-200k",
+                "orca_math",
+                "OrcaMath",
+            },
+            build_eval_datasets_from_orca_math,
+        ),
+        ({"hoskinson-center/proofnet", "proofnet", "ProofNet"}, build_eval_datasets_from_proofnet),
+        ({"Tonic/MiniF2F", "MiniF2F", "minif2f"}, build_eval_datasets_from_minif2f),
+        ({"KbsdJames/Omni-MATH", "Omni-MATH", "omni_math"}, build_eval_datasets_from_omni_math),
+        ({"aadityasingh/HARP", "HARP", "harp"}, build_eval_datasets_from_harp),
+        ({"yale-nlp/FinanceMath", "FinanceMath", "finance_math"}, build_eval_datasets_from_finance_math),
+        ({"kensho/bizbench", "BizBench", "bizbench"}, build_eval_datasets_from_bizbench),
+        (
+            {
+                "0v01111/StatEval-Foundational-knowledge",
+                "StatEval-Foundational-knowledge",
+                "stateval_foundational",
+                "StatEval",
+                "stateval",
+            },
+            build_eval_datasets_from_stateval,
+        ),
+    )
+    requested_id = spec.benchmark_id.strip()
+    for aliases, build_datasets in alias_table:
+        if requested_id in aliases:
+            return build_datasets(spec)
+    raise ValueError(f"Unknown static benchmark_id: {spec.benchmark_id}")
+
+
+def run_eval_stage0_static(cfg: DictConfig, validation_tag: str) -> None:
+    """Write dataset.json files and eval_config.json for one static benchmark under eval/datasets/<validation_tag>/."""
+    exp_id = cfg.exp_cfg.exp_id
+    output_base_dir = Path(cfg.global_cfg.output_dir)
+    experiment_dir = output_base_dir / exp_id
+    eval_cfg: Dict[str, Any] = cfg.get("eval_cfg", {})
+
+    static_cfg: Dict[str, Any] = cfg.get("static_benchmark_cfg", {})
+    benchmark_id = static_cfg.get("benchmark_id")
+    if not benchmark_id:
+        raise ValueError(
+            "static_benchmark_cfg.benchmark_id is required for stage=0_static "
+            "(e.g. static_benchmark_cfg.benchmark_id=HuggingFaceH4/MATH-500)"
+        )
+    # Unset options fall back to "test" for the split and to StaticBenchmarkSpec's class-level values.
+    spec = StaticBenchmarkSpec(
+        benchmark_id=str(benchmark_id),
+        split=str(static_cfg.get("split", "test")),
+        limit=static_cfg.get("limit"),
+        area_id=str(static_cfg.get("area_id", StaticBenchmarkSpec.area_id)),
+        capability_id=static_cfg.get("capability_id"),
+        capability_name=static_cfg.get("capability_name"),
+        domain=str(static_cfg.get("domain", StaticBenchmarkSpec.domain)),
+    )
+
+    logger.info(
+        "Eval Stage 0_static: exp_id=%s | benchmark_id=%s | split=%s | limit=%s | validation_tag=%s",
+        exp_id,
+        spec.benchmark_id,
+        spec.split,
+        spec.limit,
+        validation_tag,
+    )
+
+    datasets = _build_datasets_from_spec(spec)  # raises ValueError for an unknown benchmark_id
+    total_tasks = sum(d.num_tasks for d in datasets)
+    if total_tasks == 0:  # also covers an empty ``datasets`` list
+        raise ValueError(f"No tasks created for benchmark: {spec.benchmark_id}")
+    # One dataset.json per dataset, at <datasets_dir>/<area_id>/<capability_id>/dataset.json.
+    datasets_dir = experiment_dir / "eval" / "datasets" / validation_tag
+    for dataset in datasets:
+        dataset_path = (
+            datasets_dir / dataset.area_id / dataset.capability_id / "dataset.json"
+        )
+        save_eval_dataset(dataset, dataset_path)
+        logger.info(
+            "Wrote dataset.json with %d tasks to %s",
+            dataset.num_tasks,
+            dataset_path,
+        )
+
+    # Convert Hydra containers to plain Python types for JSON serialization.
+    subject_llms_cfg = eval_cfg.get("subject_llms")
+    judge_llm_cfg = eval_cfg.get("judge_llm")
+    # Missing entries default to an empty list / empty dict.
+    subject_llms = OmegaConf.to_container(subject_llms_cfg, resolve=True) if subject_llms_cfg is not None else []
+    judge_llm = OmegaConf.to_container(judge_llm_cfg, resolve=True) if judge_llm_cfg is not None else {}
+
+    eval_config = EvalConfig(
+        experiment_id=exp_id,
+        eval_tag="",  # written empty by this stage
+        subject_llms=subject_llms,
+        judge_llm=judge_llm,
+        validation_tag=validation_tag,
+    )
+    metadata = PipelineMetadata(
+        experiment_id=exp_id,
+        output_base_dir=str(output_base_dir),
+        timestamp=iso_timestamp(),
+        input_stage_tag=spec.benchmark_id,  # the benchmark ID stands in for an upstream stage tag
+        output_stage_tag=None,
+        resume=False,
+    )
+    eval_config_path = datasets_dir / "eval_config.json"
+    save_eval_config(eval_config, metadata, eval_config_path)
+    logger.info("Wrote eval_config.json to %s", eval_config_path)
+
diff --git a/src/eval_stages/static_benchmarks/__init__.py b/src/eval_stages/static_benchmarks/__init__.py
new file mode 100644
index 00000000..18a46891
--- /dev/null
+++ b/src/eval_stages/static_benchmarks/__init__.py
@@ -0,0 +1,2 @@
+"""Adapters for static (external) benchmarks used by Eval Stage 0_static."""
+
diff --git a/src/eval_stages/static_benchmarks/bizbench.py b/src/eval_stages/static_benchmarks/bizbench.py
new file mode 100644
index 00000000..68cb24e5
--- /dev/null
+++ b/src/eval_stages/static_benchmarks/bizbench.py
@@ -0,0 +1,108 @@
+"""Adapter for the kensho/bizbench static benchmark.
+
+Dataset card: https://huggingface.co/datasets/kensho/bizbench
+
+Splits: train, test
+
+Columns (per dataset viewer):
+- question (str)
+- answer (str)
+- task (str)
+- context (str | None)
+- context_type (str)
+- options (list)
+- program (str | None)
+
+We use:
+- input: context (if present) + question + options (if present)
+- target: answer
+"""
+
+from __future__ import annotations
+
+from typing import Any, Dict, Iterable, List
+
+from datasets import load_dataset
+
+from src.eval_stages.prompts import DEFAULT_EVAL_PROMPT_TEMPLATE
+from src.eval_stages.static_benchmarks.specs import StaticBenchmarkSpec
+from src.schemas.eval_schemas import EvalDataset
+
+
+def _normalize_answer(val: Any) -> str:
+    """Return ``val`` as a stripped string; ``None`` gives "" and a dict yields its first non-None known field."""
+    if val is None:
+        return ""
+    chosen = val
+    if isinstance(val, dict):
+        present = [val[k] for k in ("answer", "label", "text", "value") if val.get(k) is not None]
+        chosen = present[0] if present else val
+    return str(chosen).strip()
+
+
+def _format_options(options: Any) -> str:
+    """Render ``options`` as "- item" bullet lines; empty or all-blank input gives ""."""
+    if not options:
+        return ""
+
+    # A non-list value is treated as one option.
+    raw_items = options if isinstance(options, list) else [options]
+    stripped = (str(item).strip() for item in raw_items)
+    bullets = [f"- {text}" for text in stripped if text]
+    return "\n".join(bullets)
+
+
+def _build_input(question: str, context: Any, options: Any) -> str:
+    """Assemble the task prompt from its parts.
+
+    Layout: an optional "Context:" section, the "Question:" section, then an
+    optional "Options:" section, separated by blank lines.
+    """
+    context_text = str(context).strip() if context is not None else ""
+    options_text = _format_options(options)
+
+    sections: List[str] = [f"Question:\n{question.strip()}"]
+    if context_text:
+        sections.insert(0, f"Context:\n{context_text}")
+    if options_text:
+        sections.append(f"Options:\n{options_text}")
+    return "\n\n".join(sections).strip()
+
+
+def _iter_bizbench_samples(split: str, limit: int | None) -> Iterable[Dict[str, Any]]:
+    """Yield rows of the requested kensho/bizbench split, at most ``limit`` of them when set."""
+    rows = load_dataset("kensho/bizbench", split=split)
+    capped = rows if limit is None else rows.select(range(min(limit, len(rows))))
+    yield from capped
+
+
+def build_eval_datasets_from_bizbench(spec: StaticBenchmarkSpec) -> List[EvalDataset]:
+    """Convert BizBench into a single EvalDataset (an empty list if no row is usable).
+
+    Rows with a missing/blank question or answer are skipped. Task IDs encode
+    the row's index in the loaded split, so skipped rows leave gaps.
+    """
+    tasks: List[Dict[str, str]] = []
+    for idx, row in enumerate(_iter_bizbench_samples(spec.split, spec.limit)):
+        # ``or ""`` keeps a null question from being stringified to "None".
+        question = str(row.get("question") or "").strip()
+        answer = _normalize_answer(row.get("answer"))
+        if not question or not answer:
+            continue
+        inp = _build_input(question, row.get("context"), row.get("options"))
+        tasks.append({"id": f"bizbench_{idx:05d}", "input": inp, "target": answer})
+
+    if not tasks:
+        return []
+
+    dataset = EvalDataset(
+        area_id=spec.area_id,
+        capability_id="bizbench",
+        capability_name="BizBench",
+        domain="finance",
+        tasks=tasks,
+        num_tasks=len(tasks),
+        prompt_template=DEFAULT_EVAL_PROMPT_TEMPLATE,
+    )
+    return [dataset]
+
diff --git a/src/eval_stages/static_benchmarks/finance_math.py b/src/eval_stages/static_benchmarks/finance_math.py
new file mode 100644
index 00000000..84b31e1a
--- /dev/null
+++ b/src/eval_stages/static_benchmarks/finance_math.py
@@ -0,0 +1,114 @@
+"""Adapter for the yale-nlp/FinanceMath static benchmark.
+
+Dataset card: https://huggingface.co/datasets/yale-nlp/FinanceMath
+
+FinanceMath is a finance-domain math reasoning benchmark with two splits:
+- validation: 200 examples with answers
+- test: 1000 examples (answers not publicly released)
+
+We expect to use the validation split for evaluation.
+
+Fields:
+- question_id: string
+- question: problem text
+- tables: list of markdown tables (strings)
+- python_solution: expert solution code (ignored here)
+- ground_truth: float, executed result rounded to 3 decimals
+- topic: financial area (ignored here)
+
+We use (tables + question) as input and ground_truth (string) as target.
+"""
+
+from __future__ import annotations
+
+from typing import Any, Dict, Iterable, List
+
+from datasets import load_dataset
+
+from src.eval_stages.prompts import DEFAULT_EVAL_PROMPT_TEMPLATE
+from src.eval_stages.static_benchmarks.specs import StaticBenchmarkSpec
+from src.schemas.eval_schemas import EvalDataset
+
+
+def _normalize_answer(val: Any) -> str:
+    """Return the answer as a stripped string ("" when ``val`` is None).
+
+    For a dict, the first non-None value among ``ground_truth``, ``value`` and
+    ``answer`` is used; floats pass through ``str`` with no extra rounding.
+    """
+    if val is None:
+        return ""
+    fields = val if isinstance(val, dict) else {}
+    picked = next((fields[k] for k in ("ground_truth", "value", "answer") if fields.get(k) is not None), val)
+    return str(picked).strip()
+
+
+def _build_input(question: str, tables: Any) -> str:
+    """Prefix the stripped question with a "Tables:" section when tables exist.
+
+    ``tables`` may be a list (each entry stringified, blanks dropped, joined by
+    a blank line) or any other value (stringified as one table). With no
+    usable table text the stripped question is returned on its own.
+    """
+    prompt = question.strip()
+    if not tables:
+        return prompt
+
+    entries = tables if isinstance(tables, list) else [tables]
+    rendered = "\n\n".join(text for text in (str(e).strip() for e in entries) if text)
+    if rendered:
+        prompt = f"Tables:\n{rendered}\n\nQuestion:\n{prompt}"
+    return prompt
+
+
+def _iter_finance_math_samples(
+    split: str,
+    limit: int | None,
+) -> Iterable[Dict[str, Any]]:
+    """Yield rows of the requested yale-nlp/FinanceMath split, truncated to ``limit`` if given."""
+    rows = load_dataset("yale-nlp/FinanceMath", split=split)
+    # ``select`` keeps the first ``limit`` rows (or all rows when fewer exist).
+    selected = rows if limit is None else rows.select(range(min(limit, len(rows))))
+    yield from selected
+
+
+def build_eval_datasets_from_finance_math(
+    spec: StaticBenchmarkSpec,
+) -> List[EvalDataset]:
+    """Convert FinanceMath into a single EvalDataset.
+
+    - input: tables (markdown) + question text
+    - target: ground_truth (normalized to string)
+    - domain: math
+    - capability_id: finance_math
+    Rows with a missing/null question or ground_truth are skipped; an empty
+    list is returned when no row is usable.
+    """
+    tasks: List[Dict[str, str]] = []
+
+    for idx, row in enumerate(_iter_finance_math_samples(spec.split, spec.limit)):
+        # ``or ""`` keeps a null question from turning into the literal "None".
+        question = str(row.get("question") or "").strip()
+        answer = _normalize_answer(row.get("ground_truth"))
+
+        if not question or not answer:
+            continue
+
+        inp = _build_input(question, row.get("tables"))
+        task_id = f"finance_math_{idx:05d}"
+        tasks.append({"id": task_id, "input": inp, "target": answer})
+
+    if not tasks:
+        return []
+
+    dataset = EvalDataset(
+        area_id=spec.area_id,
+        capability_id="finance_math",
+        capability_name="FinanceMath",
+        domain="math",
+        tasks=tasks,
+        num_tasks=len(tasks),
+        prompt_template=DEFAULT_EVAL_PROMPT_TEMPLATE,
+    )
+    return [dataset]
+
diff --git a/src/eval_stages/static_benchmarks/hardmath.py b/src/eval_stages/static_benchmarks/hardmath.py
new file mode 100644
index 00000000..5443cda1
--- /dev/null
+++ b/src/eval_stages/static_benchmarks/hardmath.py
@@ -0,0 +1,119 @@
+"""Adapter for the HARDMath static benchmark.
+
+Source JSON:
+- https://github.com/sarahmart/HARDMath/blob/main/data/HARDMath.json
+"""
+
+from __future__ import annotations
+
+import json
+import re
+from typing import Any, Dict, Iterable, List
+from urllib.request import urlopen
+
+from src.eval_stages.prompts import DEFAULT_EVAL_PROMPT_TEMPLATE
+from src.eval_stages.static_benchmarks.specs import StaticBenchmarkSpec
+from src.schemas.eval_schemas import EvalDataset
+
+
+HARDMATH_URL = (
+    "https://raw.githubusercontent.com/sarahmart/HARDMath/main/data/HARDMath.json"
+)
+
+
+def _slugify(text: str) -> str:
+    """Turn a free-form label into a lowercase ID safe to use as a directory name."""
+    collapsed = re.sub(r"[^a-zA-Z0-9]+", "_", text.strip())
+    return collapsed.strip("_").lower() or "unknown"
+
+
+def _iter_hardmath_samples(limit: int | None) -> Iterable[Dict[str, Any]]:
+    """Yield HARDMath rows in ascending numeric-key order, at most ``limit`` when set.
+
+    Network and JSON-parse errors from ``urlopen``/``json.load`` propagate to the caller.
+    """
+    # A timeout keeps a stalled connection from hanging the job indefinitely.
+    with urlopen(HARDMATH_URL, timeout=60) as f:
+        data = json.load(f)
+    # The JSON is a dict keyed by string indices ("0", "1", ...); sort numerically for reproducibility.
+    items = [data[k] for k in sorted(data, key=int)]
+    if limit is not None:
+        items = items[: max(0, limit)]
+    yield from items
+
+
+def build_eval_datasets_from_hardmath(spec: StaticBenchmarkSpec) -> List[EvalDataset]:
+    """Convert HARDMath into one EvalDataset per question_type.
+
+    We treat:
+    - domain: always "math"
+    - area_id: taken from spec.area_id (e.g. "math" or "static_benchmarks")
+    - capability_id / capability_name: derived from the "question_type" field
+      (e.g., "integral", "ODE", "polynomial_roots_corrections", ...).
+    Rows whose question or answer_val is missing, null or blank are skipped.
+    """
+    import math  # function-scope import; only needed for the precision check
+
+    by_qtype: Dict[str, List[Dict[str, str]]] = {}
+
+    for idx, row in enumerate(_iter_hardmath_samples(spec.limit)):
+        # JSON nulls arrive as None; ``or ""`` avoids the literal string "None".
+        question = str(row.get("question") or "").strip()
+        # Use the curated LaTeX-like final answer field.
+        raw_answer = row.get("answer_val")
+        answer = "" if raw_answer is None else str(raw_answer).strip()
+        if not question or not answer:
+            continue
+
+        qtype = str(row.get("question_type") or "").strip() or "unknown"
+        answer_type = str(row.get("answer_type") or "").strip()
+        precision = row.get("precision")
+
+        # Enrich the input with answer-type instructions so the subject model
+        # knows what form is expected.
+        extra_lines = []
+        if answer_type == "list":
+            extra_lines.append(
+                "Answer format: provide a Python-style list of expressions or numbers, in the order requested."
+            )
+        elif answer_type in {"integer", "float"}:
+            extra_lines.append(
+                f"Answer format: a single {answer_type} value (no explanation in the final line)."
+            )
+        elif answer_type == "math_expression":
+            extra_lines.append(
+                "Answer format: a single closed-form mathematical expression."
+            )
+        elif answer_type:
+            extra_lines.append(f"Answer format: {answer_type}.")
+
+        # NaN/inf cannot be converted by int(); skip the hint for such values.
+        if isinstance(precision, (int, float)) and math.isfinite(precision):
+            extra_lines.append(
+                f"If the answer is numeric, round to {int(precision)} decimal places."
+            )
+
+        input_text = question
+        if extra_lines:
+            input_text = question + "\n\n" + "\n".join(extra_lines)
+
+        by_qtype.setdefault(qtype, []).append(
+            {"id": f"hardmath_{idx:04d}", "input": input_text, "target": answer}
+        )
+
+    datasets: List[EvalDataset] = []
+    for qtype, tasks in sorted(by_qtype.items()):
+        capability_id = _slugify(qtype)
+        dataset = EvalDataset(
+            area_id=spec.area_id,
+            capability_id=capability_id,
+            capability_name=qtype,
+            domain="math",
+            tasks=tasks,
+            num_tasks=len(tasks),
+            prompt_template=DEFAULT_EVAL_PROMPT_TEMPLATE,
+        )
+        datasets.append(dataset)
+
+    return datasets
+
diff --git a/src/eval_stages/static_benchmarks/harp.py b/src/eval_stages/static_benchmarks/harp.py
new file mode 100644
index 00000000..4d5d2161
--- /dev/null
+++ b/src/eval_stages/static_benchmarks/harp.py
@@ -0,0 +1,88 @@
+"""Adapter for the HARP static benchmark.
+
+Repository: https://github.com/aadityasingh/HARP
+
+We use the main split HARP.jsonl (short-answer questions).
+
+Fields (from README):
+- problem: problem text
+- answer: ground truth answer
+We ignore other metadata fields.
+"""
+
+from __future__ import annotations
+
+from typing import Any, Dict, Iterable, List
+
+from datasets import load_dataset
+
+from src.eval_stages.prompts import DEFAULT_EVAL_PROMPT_TEMPLATE
+from src.eval_stages.static_benchmarks.specs import StaticBenchmarkSpec
+from src.schemas.eval_schemas import EvalDataset
+
+
+def _normalize_answer(val: Any) -> str:
+    """Return the answer as a stripped string; ``None`` maps to ""."""
+    if val is None:
+        return ""
+    if not isinstance(val, dict):
+        return str(val).strip()
+    # Dict answers: use the first known field holding a non-None value,
+    # falling back to the dict's own string form when none is set.
+    non_null = (val[k] for k in ("label", "text", "value", "answer") if val.get(k) is not None)
+    return str(next(non_null, val)).strip()
+
+
+def _iter_harp_samples(
+    split: str,
+    limit: int | None,
+) -> Iterable[Dict[str, Any]]:
+    """Yield rows from HARP.jsonl (single split), at most ``limit`` when set.
+
+    The dataset is hosted in the GitHub repo as a JSONL file in a zip archive.
+    We load it via datasets.load_dataset with a remote URL.
+    """
+    # Main short-answer split; datasets can read compressed JSONL directly.
+    data_files = "https://github.com/aadityasingh/HARP/raw/main/HARP.jsonl.zip"
+    ds = load_dataset("json", data_files=data_files, split="train")  # the ``split`` argument is not used here
+
+    if limit is not None:
+        ds = ds.select(range(min(limit, len(ds))))
+    yield from ds
+
+
+def build_eval_datasets_from_harp(spec: StaticBenchmarkSpec) -> List[EvalDataset]:
+    """Convert HARP into a single EvalDataset.
+
+    - input: problem (competition math problem text)
+    - target: answer (normalized to string)
+    - domain: math
+    - capability_id: harp
+    Rows with a missing, null or empty problem or answer are skipped.
+    """
+    tasks: List[Dict[str, str]] = []
+
+    for idx, row in enumerate(_iter_harp_samples(spec.split, spec.limit)):
+        # ``or ""`` keeps a null problem from being stringified to "None".
+        problem = str(row.get("problem") or "").strip()
+        answer = _normalize_answer(row.get("answer"))
+        if not problem or not answer:
+            continue
+
+        task_id = f"harp_{idx:05d}"
+        tasks.append({"id": task_id, "input": problem, "target": answer})
+
+    if not tasks:
+        return []
+
+    dataset = EvalDataset(
+        area_id=spec.area_id,
+        capability_id="harp",
+        capability_name="HARP",
+        domain="math",
+        tasks=tasks,
+        num_tasks=len(tasks),
+        prompt_template=DEFAULT_EVAL_PROMPT_TEMPLATE,
+    )
+    return [dataset]
+
diff --git a/src/eval_stages/static_benchmarks/math500.py b/src/eval_stages/static_benchmarks/math500.py
new file mode 100644
index 00000000..f8303925
--- /dev/null
+++ b/src/eval_stages/static_benchmarks/math500.py
@@ -0,0 +1,78 @@
+"""Adapter for the HuggingFaceH4/MATH-500 static benchmark.
+
+Dataset card: https://huggingface.co/datasets/HuggingFaceH4/MATH-500
+"""
+
+from __future__ import annotations
+
+import re
+from typing import Any, Dict, Iterable, List
+
+from datasets import load_dataset
+
+from src.eval_stages.prompts import DEFAULT_EVAL_PROMPT_TEMPLATE
+from src.schemas.eval_schemas import EvalDataset
+from src.eval_stages.static_benchmarks.specs import StaticBenchmarkSpec
+
+
+def _slugify(text: str) -> str:
+    """Map ``text`` to a lowercase alphanumeric/underscore ID, or "unknown" when empty."""
+    parts = [p for p in re.split(r"[^a-zA-Z0-9]+", text.strip()) if p]
+    return "_".join(parts).lower() or "unknown"
+
+
+def _iter_math500_samples(
+    split: str,
+    limit: int | None,
+) -> Iterable[Dict[str, Any]]:
+    """Iterate over HuggingFaceH4/MATH-500 rows in dataset order, capped at ``limit`` when set."""
+    dataset = load_dataset("HuggingFaceH4/MATH-500", split=split)
+    # With no limit the full split is streamed; otherwise only its leading rows.
+    head = dataset if limit is None else dataset.select(range(min(limit, len(dataset))))
+    yield from head
+
+
+def build_eval_datasets_from_math500(spec: StaticBenchmarkSpec) -> List[EvalDataset]:
+    """Convert HF MATH-500 into one EvalDataset per subject.
+
+    We treat:
+    - domain: always "math"
+    - area_id: taken from spec.area_id (e.g. "math" or "static_benchmarks")
+    - capability_id / capability_name: derived from the dataset "subject" column
+      (Prealgebra, Algebra, Geometry, ...).
+    Rows with a missing, null or blank problem or answer are skipped.
+    """
+    by_subject: Dict[str, List[Dict[str, str]]] = {}
+
+    for idx, row in enumerate(_iter_math500_samples(spec.split, spec.limit)):
+        # Null fields arrive as None; avoid stringifying them to "None".
+        problem = str(row.get("problem") or "").strip()
+        raw_answer = row.get("answer")
+        answer = "" if raw_answer is None else str(raw_answer).strip()
+        # A task without a target cannot be graded, so it is dropped too.
+        if not problem or not answer:
+            continue
+
+        unique_id = row.get("unique_id")
+        task_id = str(unique_id).strip() if unique_id else f"math500_{idx:04d}"
+        subject = str(row.get("subject") or "").strip() or "unknown"
+        by_subject.setdefault(subject, []).append(
+            {"id": task_id, "input": problem, "target": answer}
+        )
+
+    datasets: List[EvalDataset] = []
+    for subject, tasks in sorted(by_subject.items()):
+        dataset = EvalDataset(
+            area_id=spec.area_id,
+            capability_id=_slugify(subject),
+            capability_name=subject,
+            domain="math",
+            tasks=tasks,
+            num_tasks=len(tasks),
+            prompt_template=DEFAULT_EVAL_PROMPT_TEMPLATE,
+        )
+        datasets.append(dataset)
+
+    return datasets
+
+
diff --git a/src/eval_stages/static_benchmarks/mathvista.py b/src/eval_stages/static_benchmarks/mathvista.py
new file mode 100644
index 00000000..48f2ff92
--- /dev/null
+++ b/src/eval_stages/static_benchmarks/mathvista.py
@@ -0,0 +1,99 @@
+"""Adapter for the AI4Math/MathVista static benchmark.
+
+Dataset card: https://huggingface.co/datasets/AI4Math/MathVista
+
+This adapter focuses on the labeled ``testmini`` split, which provides
+answers for 1,000 examples. The "test" split does not expose labels.
+"""
+
+from __future__ import annotations
+
+import re
+from typing import Any, Dict, Iterable, List
+
+from datasets import load_dataset
+
+from src.eval_stages.prompts import DEFAULT_EVAL_PROMPT_TEMPLATE
+from src.eval_stages.static_benchmarks.specs import StaticBenchmarkSpec
+from src.schemas.eval_schemas import EvalDataset
+
+
+def _slugify(text: str) -> str:
+    """Convert arbitrary strings into safe directory-friendly IDs (not referenced elsewhere in this module)."""
+    normalized = text.strip()
+    return re.sub(r"[^a-zA-Z0-9]+", "_", normalized).strip("_").lower() or "unknown"
+
+
+def _iter_mathvista_samples(
+    split: str,
+    limit: int | None,
+) -> Iterable[Dict[str, Any]]:
+    """Yield AI4Math/MathVista rows for ``split`` in dataset order, at most ``limit`` when set."""
+    samples = load_dataset("AI4Math/MathVista", split=split)
+    # Keep only the leading ``limit`` rows; a limit above the split size keeps everything.
+    kept = samples if limit is None else samples.select(range(min(limit, len(samples))))
+    yield from kept
+
+
+def build_eval_datasets_from_mathvista(spec: StaticBenchmarkSpec) -> List[EvalDataset]:
+    """Convert MathVista into a single EvalDataset.
+
+    We treat:
+    - domain: always "math"
+    - area_id: taken from spec.area_id (e.g. "math" or "static_benchmarks")
+    - capability_id / capability_name: a single capability "mathvista"
+      covering all tasks in the chosen split (typically ``testmini``).
+    Rows with no usable prompt text or no answer are skipped ([] if none remain).
+    """
+    tasks: List[Dict[str, str]] = []
+
+    for idx, row in enumerate(_iter_mathvista_samples(spec.split, spec.limit)):
+        # ``or ""`` keeps null fields from leaking in as "None" (e.g. "[Image path: None]").
+        query = str(row.get("query") or "").strip()
+        question = str(row.get("question") or "").strip()
+        image_path = str(row.get("image") or "").strip()
+        choices = row.get("choices")
+
+        # Prefer the curated query prompt if present.
+        if query:
+            input_text = query
+        else:
+            parts: List[str] = []
+            if question:
+                parts.append(question)
+            if isinstance(choices, list) and choices:
+                labeled = [
+                    f"{chr(ord('A') + i)}. {str(opt).strip()}"
+                    for i, opt in enumerate(choices)
+                ]
+                parts.append("Options: " + " ".join(labeled))
+            input_text = "\n\n".join(parts).strip()
+
+        if image_path:
+            input_text = f"{input_text}\n\n[Image path: {image_path}]".strip()
+
+        raw_answer = row.get("answer")
+        answer = "" if raw_answer is None else str(raw_answer).strip()
+        if not input_text or not answer:
+            continue
+
+        raw_pid = row.get("pid")
+        pid = "" if raw_pid is None else str(raw_pid).strip()
+        tasks.append(
+            {"id": pid or f"mathvista_{idx:04d}", "input": input_text, "target": answer}
+        )
+
+    if not tasks:
+        return []
+
+    dataset = EvalDataset(
+        area_id=spec.area_id,
+        capability_id="mathvista",
+        capability_name="MathVista",
+        domain="math",
+        tasks=tasks,
+        num_tasks=len(tasks),
+        prompt_template=DEFAULT_EVAL_PROMPT_TEMPLATE,
+    )
+    return [dataset]
+
diff --git a/src/eval_stages/static_benchmarks/minif2f.py b/src/eval_stages/static_benchmarks/minif2f.py
new file mode 100644
index 00000000..97f0af14
--- /dev/null
+++ b/src/eval_stages/static_benchmarks/minif2f.py
@@ -0,0 +1,79 @@
+"""Adapter for the Tonic/MiniF2F static benchmark.
+
+Dataset card: https://huggingface.co/datasets/Tonic/MiniF2F
+
+MiniF2F contains mathematical problems with informal statements (LaTeX) and
+formal Lean statements. Single split: train (488 rows).
+
+Columns: name, split, informal_prefix, formal_statement, goal, header.
+We use informal_prefix as input and formal_statement as target (autoformalization).
+"""
+
+from __future__ import annotations
+
+import re
+from typing import Any, Dict, Iterable, List
+
+from datasets import load_dataset
+
+from src.eval_stages.prompts import DEFAULT_EVAL_PROMPT_TEMPLATE
+from src.eval_stages.static_benchmarks.specs import StaticBenchmarkSpec
+from src.schemas.eval_schemas import EvalDataset
+
+
+def _slugify(text: str) -> str:
+    """Return a lowercase ``[a-z0-9_]`` slug of *text* ("unknown" if empty)."""
+    collapsed = re.sub(r"[^a-zA-Z0-9]+", "_", text.strip())
+    return collapsed.strip("_").lower() or "unknown"
+
+
+def _iter_minif2f_samples(
+    split: str, limit: int | None
+) -> Iterable[Dict[str, Any]]:
+    """Stream Tonic/MiniF2F rows in dataset order, capped at ``limit`` if set."""
+    rows = load_dataset("Tonic/MiniF2F", split=split)
+    if limit is not None:
+        keep = min(limit, len(rows))
+        rows = rows.select(range(keep))
+    yield from rows
+
+
+def build_eval_datasets_from_minif2f(spec: StaticBenchmarkSpec) -> List[EvalDataset]:
+    """Convert MiniF2F into a single EvalDataset.
+
+    - input: informal_prefix (informal mathematical statement in LaTeX)
+    - target: formal_statement (formal theorem in Lean)
+    - domain: math
+    - capability_id: minif2f
+    Rows with a null or blank informal_prefix or formal_statement are skipped.
+    """
+    tasks: List[Dict[str, str]] = []
+    id_counts: Dict[str, int] = {}
+    for idx, row in enumerate(_iter_minif2f_samples(spec.split, spec.limit)):
+        # ``or ""`` maps a null column to "" so it is not stringified as "None".
+        informal = str(row.get("informal_prefix") or "").strip()
+        formal = str(row.get("formal_statement") or "").strip()
+        if not informal or not formal:
+            continue
+
+        raw_id = row.get("name")
+        base_id = _slugify(str(raw_id).strip()) if raw_id else f"minif2f_{idx:04d}"
+        # Suffix repeated IDs with their occurrence count to keep them distinct.
+        cnt = id_counts.get(base_id, 0)
+        id_counts[base_id] = cnt + 1
+        task_id = base_id if cnt == 0 else f"{base_id}_{cnt}"
+        tasks.append({"id": task_id, "input": informal, "target": formal})
+
+    if not tasks:
+        return []
+
+    dataset = EvalDataset(
+        area_id=spec.area_id,
+        capability_id="minif2f",
+        capability_name="MiniF2F",
+        domain="math",
+        tasks=tasks,
+        num_tasks=len(tasks),
+        prompt_template=DEFAULT_EVAL_PROMPT_TEMPLATE,
+    )
+    return [dataset]
diff --git a/src/eval_stages/static_benchmarks/omni_math.py b/src/eval_stages/static_benchmarks/omni_math.py
new file mode 100644
index 00000000..786e40a3
--- /dev/null
+++ b/src/eval_stages/static_benchmarks/omni_math.py
@@ -0,0 +1,78 @@
+"""Adapter for the KbsdJames/Omni-MATH static benchmark.
+
+Dataset card: https://huggingface.co/datasets/KbsdJames/Omni-MATH
+
+Omni-MATH is an Olympiad-level math benchmark (~4.4k problems). Single split: test.
+
+Columns: domain, difficulty, problem, solution, answer, source.
+We use problem as input and answer as target.
+"""
+
+from __future__ import annotations
+
+from typing import Any, Dict, Iterable, List
+
+from datasets import load_dataset
+
+from src.eval_stages.prompts import DEFAULT_EVAL_PROMPT_TEMPLATE
+from src.eval_stages.static_benchmarks.specs import StaticBenchmarkSpec
+from src.schemas.eval_schemas import EvalDataset
+
+
+def _normalize_answer(val: Any) -> str:
+    """Coerce a raw answer (None, dict, or scalar) into a stripped string."""
+    if val is None:
+        return ""
+    picked = val
+    if isinstance(val, dict):
+        candidates = (val.get(k) for k in ("label", "text", "value"))
+        # First non-null labelled field wins; otherwise stringify the dict.
+        picked = next((c for c in candidates if c is not None), val)
+    return str(picked).strip()
+
+
+def _iter_omni_math_samples(
+    split: str, limit: int | None
+) -> Iterable[Dict[str, Any]]:
+    """Stream KbsdJames/Omni-MATH rows in dataset order, capped at ``limit``."""
+    rows = load_dataset("KbsdJames/Omni-MATH", split=split)
+    if limit is not None:
+        keep = min(limit, len(rows))
+        rows = rows.select(range(keep))
+    yield from rows
+
+
+def build_eval_datasets_from_omni_math(spec: StaticBenchmarkSpec) -> List[EvalDataset]:
+    """Convert Omni-MATH into a single EvalDataset.
+
+    - input: problem (Olympiad-level math problem text)
+    - target: answer (normalized to string)
+    - domain: math
+    - capability_id: omni_math
+    Rows with a null or empty problem or answer are skipped.
+    """
+    tasks: List[Dict[str, str]] = []
+
+    for idx, row in enumerate(_iter_omni_math_samples(spec.split, spec.limit)):
+        # ``or ""`` keeps a null problem from being stringified as "None".
+        problem = str(row.get("problem") or "").strip()
+        answer = _normalize_answer(row.get("answer"))
+        if not problem or not answer:
+            continue
+
+        task_id = f"omni_math_{idx:05d}"
+        tasks.append({"id": task_id, "input": problem, "target": answer})
+
+    if not tasks:
+        return []
+
+    dataset = EvalDataset(
+        area_id=spec.area_id,
+        capability_id="omni_math",
+        capability_name="Omni-MATH",
+        domain="math",
+        tasks=tasks,
+        num_tasks=len(tasks),
+        prompt_template=DEFAULT_EVAL_PROMPT_TEMPLATE,
+    )
+    return [dataset]
diff --git a/src/eval_stages/static_benchmarks/orca_math.py b/src/eval_stages/static_benchmarks/orca_math.py
new file mode 100644
index 00000000..cb259369
--- /dev/null
+++ b/src/eval_stages/static_benchmarks/orca_math.py
@@ -0,0 +1,62 @@
+"""Adapter for the microsoft/orca-math-word-problems-200k static benchmark.
+
+Dataset card: https://huggingface.co/datasets/microsoft/orca-math-word-problems-200k
+
+Columns: question (math word problem), answer (step-by-step solution).
+Single split: train. Use +static_benchmark_cfg.split=train when running Stage 0.
+"""
+
+from __future__ import annotations
+
+from typing import Any, Dict, Iterable, List
+
+from datasets import load_dataset
+
+from src.eval_stages.prompts import DEFAULT_EVAL_PROMPT_TEMPLATE
+from src.eval_stages.static_benchmarks.specs import StaticBenchmarkSpec
+from src.schemas.eval_schemas import EvalDataset
+
+
+def _iter_orca_math_samples(
+    split: str, limit: int | None
+) -> Iterable[Dict[str, Any]]:
+    """Stream orca-math-word-problems-200k rows in order, capped at ``limit``."""
+    rows = load_dataset("microsoft/orca-math-word-problems-200k", split=split)
+    if limit is not None:
+        keep = min(limit, len(rows))
+        rows = rows.select(range(keep))
+    yield from rows
+
+
+def build_eval_datasets_from_orca_math(spec: StaticBenchmarkSpec) -> List[EvalDataset]:
+    """Convert Orca Math Word Problems into a single EvalDataset.
+
+    - input: question (math word problem text)
+    - target: answer (step-by-step solution from the dataset)
+    - domain: math
+    - capability_id: orca_math_word_problems
+    Rows with a null or blank question or answer are skipped.
+    """
+    tasks: List[Dict[str, str]] = []
+    for idx, row in enumerate(_iter_orca_math_samples(spec.split, spec.limit)):
+        # ``or ""`` maps a null column to "" so it is not stringified as "None".
+        question = str(row.get("question") or "").strip()
+        answer = str(row.get("answer") or "").strip()
+        if not question or not answer:
+            continue
+        task_id = f"orca_math_{idx:06d}"
+        tasks.append({"id": task_id, "input": question, "target": answer})
+
+    if not tasks:
+        return []
+
+    dataset = EvalDataset(
+        area_id=spec.area_id,
+        capability_id="orca_math_word_problems",
+        capability_name="Orca Math Word Problems",
+        domain="math",
+        tasks=tasks,
+        num_tasks=len(tasks),
+        prompt_template=DEFAULT_EVAL_PROMPT_TEMPLATE,
+    )
+    return [dataset]
diff --git a/src/eval_stages/static_benchmarks/proofnet.py b/src/eval_stages/static_benchmarks/proofnet.py
new file mode 100644
index 00000000..bf28642f
--- /dev/null
+++ b/src/eval_stages/static_benchmarks/proofnet.py
@@ -0,0 +1,86 @@
+"""Adapter for the hoskinson-center/proofnet static benchmark.
+
+Dataset card: https://huggingface.co/datasets/hoskinson-center/proofnet
+
+ProofNet is a benchmark for autoformalization and formal proving of undergraduate
+mathematics. Uses the "plain_text" config. Splits: validation (185), test (186).
+
+Columns: id, nl_statement (natural language theorem), nl_proof (natural language
+proof in LaTeX), formal_statement (Lean 3), src_header.
+We use nl_statement as input and nl_proof as target.
+"""
+
+from __future__ import annotations
+
+import re
+from typing import Any, Dict, Iterable, List
+
+from datasets import load_dataset
+
+from src.eval_stages.prompts import DEFAULT_EVAL_PROMPT_TEMPLATE
+from src.eval_stages.static_benchmarks.specs import StaticBenchmarkSpec
+from src.schemas.eval_schemas import EvalDataset
+
+
+def _slugify(text: str) -> str:
+    """Return a lowercase ``[a-z0-9_]`` slug of *text* ("unknown" if empty)."""
+    collapsed = re.sub(r"[^a-zA-Z0-9]+", "_", text.strip())
+    return collapsed.strip("_").lower() or "unknown"
+
+
+def _iter_proofnet_samples(
+    split: str, limit: int | None
+) -> Iterable[Dict[str, Any]]:
+    """Stream rows of the ProofNet ``plain_text`` config in dataset order.
+
+    When ``limit`` is not None, only the first ``limit`` rows are yielded.
+    """
+    rows = load_dataset("hoskinson-center/proofnet", "plain_text", split=split)
+    if limit is not None:
+        keep = min(limit, len(rows))
+        rows = rows.select(range(keep))
+    for row in rows:
+        yield row
+
+
+def build_eval_datasets_from_proofnet(spec: StaticBenchmarkSpec) -> List[EvalDataset]:
+    """Convert ProofNet into a single EvalDataset.
+
+    - input: nl_statement (natural language theorem statement)
+    - target: nl_proof (natural language proof)
+    - domain: math
+    - capability_id: proofnet
+    Rows with a null or blank nl_statement or nl_proof are skipped.
+    """
+    tasks: List[Dict[str, str]] = []
+    id_counts: Dict[str, int] = {}
+
+    for idx, row in enumerate(_iter_proofnet_samples(spec.split, spec.limit)):
+        # ``or ""`` maps a null column to "" so it is not stringified as "None".
+        nl_statement = str(row.get("nl_statement") or "").strip()
+        nl_proof = str(row.get("nl_proof") or "").strip()
+        if not nl_statement or not nl_proof:
+            continue
+
+        raw_id = row.get("id")
+        base_id = _slugify(str(raw_id).strip()) if raw_id else f"proofnet_{idx:04d}"
+        # Suffix repeated IDs with their occurrence count to keep them distinct.
+        cnt = id_counts.get(base_id, 0)
+        id_counts[base_id] = cnt + 1
+        task_id = base_id if cnt == 0 else f"{base_id}_{cnt}"
+
+        tasks.append({"id": task_id, "input": nl_statement, "target": nl_proof})
+
+    if not tasks:
+        return []
+
+    dataset = EvalDataset(
+        area_id=spec.area_id,
+        capability_id="proofnet",
+        capability_name="ProofNet",
+        domain="math",
+        tasks=tasks,
+        num_tasks=len(tasks),
+        prompt_template=DEFAULT_EVAL_PROMPT_TEMPLATE,
+    )
+    return [dataset]
diff --git a/src/eval_stages/static_benchmarks/specs.py b/src/eval_stages/static_benchmarks/specs.py
new file mode 100644
index 00000000..6ccb9dc0
--- /dev/null
+++ b/src/eval_stages/static_benchmarks/specs.py
@@ -0,0 +1,38 @@
+"""Shared specs and types for static benchmark adapters."""
+
+from __future__ import annotations
+
+from dataclasses import dataclass
+from typing import Optional
+
+
+@dataclass(frozen=True)
+class StaticBenchmarkSpec:
+    """Minimal, immutable (frozen) specification for ingesting a static benchmark.
+
+    Attributes
+    ----------
+    benchmark_id
+        Identifier used to select the adapter (e.g. "HuggingFaceH4/MATH-500").
+    split
+        Data split to load (e.g. "train", "test", "validation").
+    limit
+        Optional maximum number of rows to load (for smoke tests).
+    area_id
+        Area identifier used in EvalDataset (groups capabilities).
+    capability_id
+        Capability identifier in EvalDataset; if omitted, adapters may derive it.
+    capability_name
+        Human-readable capability name; if omitted, adapters may derive it.
+    domain
+        Domain label for EvalDataset (e.g. "math"); defaults to "external".
+    """
+
+    benchmark_id: str
+    split: str = "test"
+    limit: Optional[int] = None
+    area_id: str = "static_benchmarks"
+    capability_id: Optional[str] = None
+    capability_name: Optional[str] = None
+    domain: str = "external"
+
diff --git a/src/eval_stages/static_benchmarks/stateval.py b/src/eval_stages/static_benchmarks/stateval.py
new file mode 100644
index 00000000..8510335d
--- /dev/null
+++ b/src/eval_stages/static_benchmarks/stateval.py
@@ -0,0 +1,49 @@
+"""Unified StatEval benchmark: one benchmark, domain math, two capabilities.
+
+StatEval has two subsets:
+- Foundational Knowledge Dataset (0v01111/StatEval-Foundational-knowledge)
+- Statistical Research Dataset (0v01111/StatEval-Statistical-Research)
+
+This module exposes a single benchmark_id (StatEval / stateval) that loads
+both and produces two capabilities under one area "stateval" with domain "math":
+- foundational_knowledge
+- statistical_research
+"""
+
+from __future__ import annotations
+
+from typing import List
+
+from src.eval_stages.static_benchmarks.specs import StaticBenchmarkSpec
+from src.eval_stages.static_benchmarks.stateval_foundational import (
+    build_eval_datasets_from_stateval_foundational,
+)
+from src.eval_stages.static_benchmarks.stateval_research import (
+    build_eval_datasets_from_stateval_research,
+)
+from src.schemas.eval_schemas import EvalDataset
+
+STATEVAL_AREA_ID = "stateval"
+STATEVAL_DOMAIN = "math"
+
+
+def build_eval_datasets_from_stateval(spec: StaticBenchmarkSpec) -> List[EvalDataset]:
+    """Build the EvalDatasets of both StatEval subsets under one shared area.
+
+    ``spec`` is re-issued with area_id="stateval" and domain="math"; the same
+    split and limit apply to each subset. Foundational datasets come first.
+    """
+    # Force the StatEval area/domain; every other field passes through as-is.
+    shared_spec = StaticBenchmarkSpec(
+        benchmark_id=spec.benchmark_id,
+        split=spec.split,
+        limit=spec.limit,
+        area_id=STATEVAL_AREA_ID,
+        capability_id=spec.capability_id,
+        capability_name=spec.capability_name,
+        domain=STATEVAL_DOMAIN,
+    )
+
+    datasets = list(build_eval_datasets_from_stateval_foundational(shared_spec))
+    datasets.extend(build_eval_datasets_from_stateval_research(shared_spec))
+    return datasets
diff --git a/src/eval_stages/static_benchmarks/stateval_foundational.py b/src/eval_stages/static_benchmarks/stateval_foundational.py
new file mode 100644
index 00000000..6e076877
--- /dev/null
+++ b/src/eval_stages/static_benchmarks/stateval_foundational.py
@@ -0,0 +1,219 @@
+"""Adapter for the StatEval Foundational Knowledge static benchmark.
+
+Primary dataset:
+- https://huggingface.co/datasets/0v01111/StatEval-Foundational-knowledge
+
+The upstream JSON has mixed types for the "answer" column (string vs object),
+which breaks the default HuggingFace loader. We therefore always download the
+raw JSON/JSONL from the Hub and parse it with per-row answer normalization.
+"""
+
+from __future__ import annotations
+
+import json
+import re
+from pathlib import Path
+from typing import Any, Dict, Iterable, List
+
+from datasets import load_dataset
+from datasets.exceptions import DatasetGenerationError
+from huggingface_hub import hf_hub_download, list_repo_files
+
+from src.eval_stages.prompts import DEFAULT_EVAL_PROMPT_TEMPLATE
+from src.eval_stages.static_benchmarks.specs import StaticBenchmarkSpec
+from src.schemas.eval_schemas import EvalDataset
+
+REPO_ID = "0v01111/StatEval-Foundational-knowledge"
+
+
+def _slugify(text: str) -> str:
+    """Return a lowercase ``[a-z0-9_]`` slug of *text* ("unknown" if empty)."""
+    collapsed = re.sub(r"[^a-zA-Z0-9]+", "_", text.strip())
+    return collapsed.strip("_").lower() or "unknown"
+
+
+def _normalize_answer_in_row(row: Dict[str, Any]) -> None:
+    """Rewrite ``row["answer"]`` in place as a stripped string.
+
+    None -> ""; dict -> first non-null of label/text/final/value, else str(dict).
+    """
+    raw = row.get("answer")
+    if raw is None:
+        row["answer"] = ""
+        return
+    chosen = raw
+    if isinstance(raw, dict):
+        fields = (raw.get(k) for k in ("label", "text", "final", "value"))
+        chosen = next((f for f in fields if f is not None), raw)
+    row["answer"] = str(chosen).strip()
+
+
+def _load_foundational_raw(split: str, limit: int | None) -> Iterable[Dict[str, Any]]:
+    """Download the repo's JSON/JSONL file and yield rows with a string 'answer'.
+
+    The file is parsed by hand because the upstream 'answer' column mixes
+    strings and objects. ``split`` only matters when no file named
+    "Foundational-knowledge" exists; ``limit`` caps the raw rows scanned.
+    """
+    files = list_repo_files(REPO_ID, repo_type="dataset")
+    # Prefer the canonical Foundational-knowledge.jsonl if present.
+    filename = next((f for f in files if "Foundational-knowledge" in f), None)
+    if filename is None:
+        # Fallback: any JSON/JSONL file that mentions the split, else any JSON/JSONL.
+        json_files = [f for f in files if f.endswith((".json", ".jsonl"))]
+        if not json_files:
+            return
+        filename = next((f for f in json_files if split in f), json_files[0])
+
+    path = hf_hub_download(
+        repo_id=REPO_ID,
+        filename=filename,
+        repo_type="dataset",
+    )
+    text = Path(path).read_text(encoding="utf-8", errors="replace")
+    rows: List[Dict[str, Any]] = []
+    try:
+        data = json.loads(text)
+    except json.JSONDecodeError:
+        # Not a single JSON document: treat the file as JSON Lines.
+        for line in text.splitlines():
+            line = line.strip()
+            if not line:
+                continue
+            try:
+                obj = json.loads(line)
+            except json.JSONDecodeError:
+                continue  # best effort: skip malformed lines
+            if isinstance(obj, dict):
+                rows.append(obj)
+            elif isinstance(obj, list):
+                rows.extend(obj)
+    else:
+        if isinstance(data, list):
+            rows = data
+        elif isinstance(data, dict) and "data" in data:
+            rows = data["data"]
+        elif isinstance(data, dict) and data:
+            # Index-keyed object: numeric keys in numeric order, then the rest.
+            # Tuple keys keep ints and strs from being compared directly, which
+            # used to raise TypeError on mixed keys and silently drop every row.
+            keys = sorted(
+                data, key=lambda k: (0, int(k), "") if k.isdecimal() else (1, 0, k)
+            )
+            rows = [data[k] for k in keys if isinstance(data[k], dict)]
+
+    # ``limit`` counts raw rows, including non-dict entries that are skipped.
+    for i, row in enumerate(rows):
+        if limit is not None and i >= limit:
+            break
+        if not isinstance(row, dict):
+            continue
+        _normalize_answer_in_row(row)
+        yield row
+
+
+def _iter_stateval_foundational_samples(
+    split: str, limit: int | None
+) -> Iterable[Dict[str, Any]]:
+    """Stream StatEval-Foundational-knowledge rows via the raw-JSON loader.
+
+    ``load_dataset`` is bypassed on purpose: the official JSONL mixes types in
+    'answer', so every row comes from ``_load_foundational_raw`` instead.
+    """
+    for row in _load_foundational_raw(split, limit):
+        yield row
+
+
+def _extract_question_and_options(row: Dict[str, Any]) -> str:
+    """Render a StatEval row as prompt text: the stem plus any options.
+
+    The stem is the first truthy field among question/prompt/task/problem.
+    Options come from options/choices/mc_options: a list is lettered
+    "A.", "B.", ... and space-joined, while a string is used stripped.
+    With no options, only the stem is returned.
+    """
+    # First truthy stem field wins; an empty stem is allowed.
+    stem_value: Any = ""
+    for field in ("question", "prompt", "task", "problem"):
+        candidate = row.get(field)
+        if candidate:
+            stem_value = candidate
+            break
+    stem = str(stem_value).strip()
+
+    raw_options = row.get("options") or row.get("choices") or row.get("mc_options")
+    if isinstance(raw_options, list) and raw_options:
+        # Label list options A, B, C, ... in their original order.
+        rendered = " ".join(
+            f"{chr(ord('A') + pos)}. {str(choice).strip()}"
+            for pos, choice in enumerate(raw_options)
+        )
+    elif isinstance(raw_options, str):
+        rendered = raw_options.strip()
+    else:
+        rendered = ""
+
+    return f"{stem}\n\nOptions: {rendered}" if rendered else stem
+
+
+def _extract_answer(row: Dict[str, Any]) -> str:
+    """Return the row's 'answer' as one stripped target string.
+
+    - missing or None -> ""
+    - dict -> first non-null value among "label", "text", "final", "value";
+      if none is set, the whole dict is stringified
+    - anything else -> ``str(answer)``
+    """
+    raw = row.get("answer")
+    if raw is None:
+        return ""
+    if not isinstance(raw, dict):
+        return str(raw).strip()
+
+    for field in ("label", "text", "final", "value"):
+        picked = raw.get(field)
+        if picked is not None:
+            return str(picked).strip()
+    # No recognised field carried a value: stringify the whole object.
+    return str(raw).strip()
+
+
+def build_eval_datasets_from_stateval_foundational(
+    spec: StaticBenchmarkSpec,
+) -> List[EvalDataset]:
+    """Build the "foundational_knowledge" EvalDataset from StatEval rows.
+
+    Rows lacking prompt text or a target are dropped. ``area_id`` comes from
+    ``spec``; ``domain`` is ``spec.domain`` stripped and lowercased, or "math"
+    if blank. Returns an empty list when no row is usable.
+    """
+    samples = _iter_stateval_foundational_samples(spec.split, spec.limit)
+    tasks: List[Dict[str, str]] = []
+    for idx, row in enumerate(samples):
+        prompt = _extract_question_and_options(row)
+        expected = _extract_answer(row)
+        if not (prompt and expected):
+            continue
+
+        # Use the row's own id/ID when present, else a positional fallback.
+        row_id = str(row.get("id") or row.get("ID") or "").strip()
+        if not row_id:
+            row_id = f"stateval_fk_{idx:05d}"
+        tasks.append({"id": row_id, "input": prompt, "target": expected})
+
+    if not tasks:
+        return []
+
+    domain = (spec.domain or "math").strip().lower() or "math"
+    return [
+        EvalDataset(
+            area_id=spec.area_id,
+            capability_id="foundational_knowledge",
+            capability_name="Foundational Knowledge",
+            domain=domain,
+            tasks=tasks,
+            num_tasks=len(tasks),
+            prompt_template=DEFAULT_EVAL_PROMPT_TEMPLATE,
+        )
+    ]
+
diff --git a/src/eval_stages/static_benchmarks/stateval_research.py b/src/eval_stages/static_benchmarks/stateval_research.py
new file mode 100644
index 00000000..535b1ff8
--- /dev/null
+++ b/src/eval_stages/static_benchmarks/stateval_research.py
@@ -0,0 +1,96 @@
+"""Adapter for the StatEval Statistical Research static benchmark.
+
+Dataset: https://huggingface.co/datasets/0v01111/StatEval-Statistical-Research
+
+Research-level, proof-based tasks from papers. Exposed as a single
+capability "statistical_research" for the unified StatEval benchmark.
+"""
+
+from __future__ import annotations
+
+from typing import Any, Dict, Iterable, List
+
+from datasets import load_dataset
+
+from src.eval_stages.prompts import DEFAULT_EVAL_PROMPT_TEMPLATE
+from src.eval_stages.static_benchmarks.specs import StaticBenchmarkSpec
+from src.schemas.eval_schemas import EvalDataset
+
+
+def _iter_stateval_research_samples(
+    split: str, limit: int | None
+) -> Iterable[Dict[str, Any]]:
+    """Stream StatEval-Statistical-Research rows in order, capped at ``limit``."""
+    rows = load_dataset("0v01111/StatEval-Statistical-Research", split=split)
+    if limit is not None:
+        keep = min(limit, len(rows))
+        rows = rows.select(range(keep))
+    yield from rows
+
+
+def _extract_input(row: Dict[str, Any]) -> str:
+    """Return the stripped prompt text of a research row ("" if none).
+
+    The first truthy field among question, prompt, task, problem, and
+    context is used, in that order.
+    """
+    fields = ("question", "prompt", "task", "problem", "context")
+    for field in fields:
+        value = row.get(field)
+        if value:
+            # First truthy field wins, even if it strips down to "".
+            return str(value).strip()
+    return ""
+
+
+def _extract_answer(row: Dict[str, Any]) -> str:
+    """Return the research row's 'answer' as a stripped string ("" if None)."""
+    raw = row.get("answer")
+    if raw is None:
+        return ""
+    chosen = raw
+    if isinstance(raw, dict):
+        # First non-null known field wins; otherwise stringify the dict.
+        fields = (raw.get(k) for k in ("label", "text", "final", "value", "solution"))
+        chosen = next((v for v in fields if v is not None), raw)
+    return str(chosen).strip()
+
+
+def build_eval_datasets_from_stateval_research(
+    spec: StaticBenchmarkSpec,
+) -> List[EvalDataset]:
+    """Build the "statistical_research" EvalDataset from StatEval rows.
+
+    Rows lacking prompt text or a target are dropped. ``area_id`` comes from
+    ``spec``; ``domain`` is ``spec.domain`` stripped and lowercased, or "math"
+    if blank. Returns an empty list when no row is usable.
+    """
+    samples = _iter_stateval_research_samples(spec.split, spec.limit)
+    tasks: List[Dict[str, str]] = []
+    for idx, row in enumerate(samples):
+        prompt = _extract_input(row)
+        expected = _extract_answer(row)
+        if not (prompt and expected):
+            continue
+
+        # Use the row's own id/ID when present, else a positional fallback.
+        row_id = str(row.get("id") or row.get("ID") or "").strip()
+        if not row_id:
+            row_id = f"stateval_research_{idx:05d}"
+        tasks.append({"id": row_id, "input": prompt, "target": expected})
+
+    if not tasks:
+        return []
+
+    domain = (spec.domain or "math").strip().lower() or "math"
+    return [
+        EvalDataset(
+            area_id=spec.area_id,
+            capability_id="statistical_research",
+            capability_name="Statistical Research",
+            domain=domain,
+            tasks=tasks,
+            num_tasks=len(tasks),
+            prompt_template=DEFAULT_EVAL_PROMPT_TEMPLATE,
+        )
+    ]
diff --git a/src/eval_stages/static_benchmarks/wemath.py b/src/eval_stages/static_benchmarks/wemath.py
new file mode 100644
index 00000000..28f6d0de
--- /dev/null
+++ b/src/eval_stages/static_benchmarks/wemath.py
@@ -0,0 +1,95 @@
+"""Adapter for the We-Math/We-Math static benchmark.
+
+Dataset card: https://huggingface.co/datasets/We-Math/We-Math
+"""
+
+from __future__ import annotations
+
+import re
+from typing import Any, Dict, Iterable, List
+
+from datasets import load_dataset
+
+from src.eval_stages.prompts import DEFAULT_EVAL_PROMPT_TEMPLATE
+from src.eval_stages.static_benchmarks.specs import StaticBenchmarkSpec
+from src.schemas.eval_schemas import EvalDataset
+
+
+def _slugify(text: str) -> str:
+    """Return a lowercase ``[a-z0-9_]`` slug of *text* ("unknown" if empty)."""
+    collapsed = re.sub(r"[^a-zA-Z0-9]+", "_", text.strip())
+    return collapsed.strip("_").lower() or "unknown"
+
+
+def _iter_wemath_samples(split: str, limit: int | None) -> Iterable[Dict[str, Any]]:
+    """Stream We-Math/We-Math rows in dataset order, capped at ``limit`` if set.
+
+    The public config exposes a "testmini" split, so callers are expected to
+    pass static_benchmark_cfg.split=testmini in Hydra.
+    """
+    rows = load_dataset("We-Math/We-Math", split=split)
+    if limit is not None:
+        keep = min(limit, len(rows))
+        rows = rows.select(range(keep))
+    yield from rows
+
+
+def build_eval_datasets_from_wemath(spec: StaticBenchmarkSpec) -> List[EvalDataset]:
+    """Convert We-Math into EvalDatasets grouped by knowledge concept.
+
+    We treat:
+    - domain: always "math"
+    - area_id: taken from spec.area_id (e.g. "math" or "static_benchmarks")
+    - capability_id / capability_name: derived from the "knowledge_concept"
+      column (e.g., "Properties and Understanding of Squares").
+
+    Each task:
+    - input: question text plus options as a single string
+    - target: the correct option letter from the "answer" column
+
+    Rows with an image, or a null/blank question, option, or answer, are skipped.
+    """
+    by_concept: Dict[str, List[Dict[str, str]]] = {}
+    id_counts: Dict[str, int] = {}
+
+    for idx, row in enumerate(_iter_wemath_samples(spec.split, spec.limit)):
+        # Skip questions that have an image; this pipeline is text-only and does not pass images.
+        if row.get("image") is not None:
+            continue
+        # ``or ""`` maps null columns to "" so they are not stringified as "None".
+        question = str(row.get("question") or "").strip()
+        options = str(row.get("option") or "").strip()
+        answer = str(row.get("answer") or "").strip()
+        if not question or not options or not answer:
+            continue
+
+        concept = str(row.get("knowledge_concept") or "").strip() or "unknown"
+        base_id = str(row.get("ID") or "").strip() or f"wemath_{idx:04d}"
+        # Ensure uniqueness of task ids since Inspect requires unique ids.
+        cnt = id_counts.get(base_id, 0)
+        id_counts[base_id] = cnt + 1
+        task_id = base_id if cnt == 0 else f"{base_id}_{cnt}"
+
+        # Pack question and options into a single prompt input.
+        input_text = f"{question}\n\nOptions: {options}"
+        by_concept.setdefault(concept, []).append(
+            {"id": task_id, "input": input_text, "target": answer}
+        )
+
+    datasets: List[EvalDataset] = []
+    for concept, tasks in sorted(by_concept.items()):
+        capability_id = _slugify(concept)
+        capability_name = concept
+        dataset = EvalDataset(
+            area_id=spec.area_id,
+            capability_id=capability_id,
+            capability_name=capability_name,
+            domain="math",
+            tasks=tasks,
+            num_tasks=len(tasks),
+            prompt_template=DEFAULT_EVAL_PROMPT_TEMPLATE,
+        )
+        datasets.append(dataset)
+
+    return datasets
+
diff --git a/src/run_eval_pipeline.py b/src/run_eval_pipeline.py
index 541a762e..c1212189 100644
--- a/src/run_eval_pipeline.py
+++ b/src/run_eval_pipeline.py
@@ -25,6 +25,7 @@
 from src.eval_stages import (
     EvalSetupError,
     run_eval_stage0,
+    run_eval_stage0_static,
     run_eval_stage1,
     run_eval_stage2,
 )
@@ -133,6 +134,21 @@ def main(cfg: DictConfig) -> None:
         except ValueError as e:
             logger.error("Stage 2 failed: %s", e)
 
+    elif stage in {"0_static", "static0", "static"}:
+        if not validation_tag:
+            logger.error("validation_tag is required for stage 0_static")
+            logger.error(
+                "Usage: python -m src.run_eval_pipeline stage=0_static "
+                "validation_tag=_SOME_TAG static_benchmark_cfg.benchmark_id=HuggingFaceH4/MATH-500"
+            )
+            return
+
+        try:
+            run_eval_stage0_static(cfg, validation_tag)
+            logger.info("Eval Stage 0_static complete. Datasets created.")
+        except ValueError as e:
+            logger.error("Stage 0_static failed: %s", e)
+
     else:
         logger.error("Invalid stage: %s. Use 'all', 0, 1, or 2", stage)
 

From 3d13bd1b21c1b16d935b43e737c739c96fc206a1 Mon Sep 17 00:00:00 2001
From: Negiiiin <neginbaghbanzadeh@gmail.com>
Date: Thu, 19 Mar 2026 09:53:25 -0400
Subject: [PATCH 2/8] New Finance Benchmarks

---
 scripts/static_benchmarks/bizbench_eval.sh    |  16 +-
 .../static_benchmarks/finance_math_eval.sh    |  16 +-
 .../static_benchmarks/finance_tasks_eval.sh   |  65 +++++++
 .../submit_all_static_benchmarks.sh           |   2 +
 .../static_benchmarks/xfinbench_test_eval.sh  |  73 ++++++++
 src/eval_stages/stage0_static_benchmarks.py   |  18 ++
 src/eval_stages/stage1_eval_execution.py      |  85 ++++++---
 src/eval_stages/static_benchmarks/bizbench.py |  32 +++-
 .../static_benchmarks/finance_math.py         |  38 ++++-
 .../static_benchmarks/finance_tasks.py        | 121 +++++++++++++
 src/eval_stages/static_benchmarks/specs.py    |   6 +-
 .../static_benchmarks/xfinbench.py            | 161 ++++++++++++++++++
 12 files changed, 589 insertions(+), 44 deletions(-)
 create mode 100755 scripts/static_benchmarks/finance_tasks_eval.sh
 create mode 100755 scripts/static_benchmarks/xfinbench_test_eval.sh
 create mode 100644 src/eval_stages/static_benchmarks/finance_tasks.py
 create mode 100644 src/eval_stages/static_benchmarks/xfinbench.py

diff --git a/scripts/static_benchmarks/bizbench_eval.sh b/scripts/static_benchmarks/bizbench_eval.sh
index 5a067592..4620f7e6 100644
--- a/scripts/static_benchmarks/bizbench_eval.sh
+++ b/scripts/static_benchmarks/bizbench_eval.sh
@@ -1,16 +1,23 @@
 #!/bin/bash
 #SBATCH --job-name=bizbench_eval
-#SBATCH --output=logs/bizbench_eval_%j.out
-#SBATCH --error=logs/bizbench_eval_%j.err
+#SBATCH --output=logs/bizbench_eval_%A_%a.out
+#SBATCH --error=logs/bizbench_eval_%A_%a.err
 #SBATCH --time=04:00:00
 #SBATCH --cpus-per-task=4
 #SBATCH --mem=16G
+#SBATCH --array=0-9
 
 set -euo pipefail
 
 cd /fs01/projects/DeepLesion/projects/new_ace/automated_capability_evaluation
 
-VALIDATION_TAG="_BIZBENCH_$(date +%Y%m%d_%H%M%S)"
+# Allow running either via sbatch (with SLURM_ARRAY_TASK_ID set)
+# or directly (default to a single chunk 0).
+: "${SLURM_ARRAY_TASK_ID:=0}"
+
+CHUNK=500
+OFFSET=$((SLURM_ARRAY_TASK_ID * CHUNK))
+VALIDATION_TAG="_BIZBENCH_${SLURM_ARRAY_TASK_ID}_$(date +%Y%m%d_%H%M%S)"
 
 # Stage 0_static: build datasets from kensho/bizbench
 python -m src.run_eval_pipeline \
@@ -18,7 +25,8 @@ python -m src.run_eval_pipeline \
   validation_tag="$VALIDATION_TAG" \
   +static_benchmark_cfg.benchmark_id=kensho/bizbench \
   +static_benchmark_cfg.split=test \
-  +static_benchmark_cfg.limit=30
+  +static_benchmark_cfg.offset="$OFFSET" \
+  +static_benchmark_cfg.limit="$CHUNK"
 
 # Stage 1: run subject models on the static datasets
 python -m src.run_eval_pipeline \
diff --git a/scripts/static_benchmarks/finance_math_eval.sh b/scripts/static_benchmarks/finance_math_eval.sh
index 24b8e257..b4a40650 100644
--- a/scripts/static_benchmarks/finance_math_eval.sh
+++ b/scripts/static_benchmarks/finance_math_eval.sh
@@ -1,16 +1,23 @@
 #!/bin/bash
 #SBATCH --job-name=finance_math_eval
-#SBATCH --output=logs/finance_math_eval_%j.out
-#SBATCH --error=logs/finance_math_eval_%j.err
+#SBATCH --output=logs/finance_math_eval_%A_%a.out
+#SBATCH --error=logs/finance_math_eval_%A_%a.err
 #SBATCH --time=04:00:00
 #SBATCH --cpus-per-task=4
 #SBATCH --mem=16G
+#SBATCH --array=0-9
 
 set -euo pipefail
 
 cd /fs01/projects/DeepLesion/projects/new_ace/automated_capability_evaluation
 
-VALIDATION_TAG="_FINANCE_MATH_$(date +%Y%m%d_%H%M%S)"
+# Allow running via sbatch (with SLURM_ARRAY_TASK_ID) or directly (defaults to 0).
+: "${SLURM_ARRAY_TASK_ID:=0}"
+
+# FinanceMath validation has 121 non-table tasks after filtering.
+CHUNK=20
+OFFSET=$((SLURM_ARRAY_TASK_ID * CHUNK))
+VALIDATION_TAG="_FINANCE_MATH_${SLURM_ARRAY_TASK_ID}_$(date +%Y%m%d_%H%M%S)"
 
 # Stage 0_static: build datasets from yale-nlp/FinanceMath (validation split only)
 python -m src.run_eval_pipeline \
@@ -18,7 +25,8 @@ python -m src.run_eval_pipeline \
   validation_tag="$VALIDATION_TAG" \
   +static_benchmark_cfg.benchmark_id=yale-nlp/FinanceMath \
   +static_benchmark_cfg.split=validation \
-  +static_benchmark_cfg.limit=30
+  +static_benchmark_cfg.offset="$OFFSET" \
+  +static_benchmark_cfg.limit="$CHUNK"
 
 # Stage 1: run subject models on the static datasets
 python -m src.run_eval_pipeline \
diff --git a/scripts/static_benchmarks/finance_tasks_eval.sh b/scripts/static_benchmarks/finance_tasks_eval.sh
new file mode 100755
index 00000000..14f41f1e
--- /dev/null
+++ b/scripts/static_benchmarks/finance_tasks_eval.sh
@@ -0,0 +1,65 @@
+#!/bin/bash
+#SBATCH --job-name=finance_tasks_eval
+#SBATCH --output=logs/finance_tasks_eval_%j.out
+#SBATCH --error=logs/finance_tasks_eval_%j.err
+#SBATCH --time=04:00:00
+#SBATCH --cpus-per-task=4
+#SBATCH --mem=16G
+
+set -euo pipefail
+
+cd /fs01/projects/DeepLesion/projects/new_ace/automated_capability_evaluation
+
+VALIDATION_TAG="_FINANCE_TASKS_$(date +%Y%m%d_%H%M%S)"
+
+# Stage 0_static: build datasets from local finance_tasks.json
+python -m src.run_eval_pipeline \
+  stage=0_static \
+  validation_tag="$VALIDATION_TAG" \
+  +static_benchmark_cfg.benchmark_id=finance_tasks.json \
+  +static_benchmark_cfg.split=na \
+  +static_benchmark_cfg.domain=finance \
+  +static_benchmark_cfg.capability_id=finance_tasks \
+  +static_benchmark_cfg.capability_name="Finance Tasks"
+  # +static_benchmark_cfg.limit=30 \
+
+# Stage 1: run subject models on the static datasets
+python -m src.run_eval_pipeline \
+  stage=1 \
+  validation_tag="$VALIDATION_TAG" \
+  eval_tag="$VALIDATION_TAG"
+
+# Stage 2: aggregate scores
+python -m src.run_eval_pipeline \
+  stage=2 \
+  eval_tag="$VALIDATION_TAG"
+
+echo "Stage 0_static datasets: base_output/test_exp/eval/datasets/$VALIDATION_TAG"
+echo "Stage 1 results (Inspect logs): base_output/test_exp/eval/results/$VALIDATION_TAG"
+echo "Stage 2 scores: base_output/test_exp/eval/scores/$VALIDATION_TAG"
+
+# Optional: generate flattened JSONL views of Inspect logs for easier reading
+RESULTS_DIR="base_output/test_exp/eval/results/$VALIDATION_TAG"
+if [ -d "$RESULTS_DIR" ]; then
+  echo "Flattening Inspect logs under $RESULTS_DIR ..."
+  for model_dir in "$RESULTS_DIR"/*/; do
+    [ -d "$model_dir" ] || continue
+    model_name="$(basename "$model_dir")"
+    for area_dir in "$model_dir"*/; do
+      [ -d "$area_dir" ] || continue
+      for cap_dir in "$area_dir"*/; do
+        [ -d "$cap_dir" ] || continue
+        cap_name="$(basename "$cap_dir")"
+        log_file="$(ls "$cap_dir"/*_task_*.json 2>/dev/null | head -n 1 || true)"
+        if [ -n "$log_file" ]; then
+          out_file="$cap_dir/flat_${cap_name}.jsonl"
+          python scripts/flatten_inspect_logs.py \
+            --log_path "$log_file" \
+            --out_path "$out_file"
+          echo "  Wrote flattened log for $model_name/$cap_name to $out_file"
+        fi
+      done
+    done
+  done
+fi
+
diff --git a/scripts/static_benchmarks/submit_all_static_benchmarks.sh b/scripts/static_benchmarks/submit_all_static_benchmarks.sh
index 2469076a..415f0429 100755
--- a/scripts/static_benchmarks/submit_all_static_benchmarks.sh
+++ b/scripts/static_benchmarks/submit_all_static_benchmarks.sh
@@ -15,6 +15,8 @@ sbatch scripts/static_benchmarks/orca_math_eval.sh
 sbatch scripts/static_benchmarks/proofnet_eval.sh
 sbatch scripts/static_benchmarks/harp_eval.sh
 sbatch scripts/static_benchmarks/finance_math_eval.sh
+sbatch scripts/static_benchmarks/finance_tasks_eval.sh
+sbatch scripts/static_benchmarks/xfinbench_test_eval.sh
 sbatch scripts/static_benchmarks/bizbench_eval.sh
 sbatch scripts/static_benchmarks/omni_math_eval.sh
 sbatch scripts/static_benchmarks/minif2f_eval.sh
diff --git a/scripts/static_benchmarks/xfinbench_test_eval.sh b/scripts/static_benchmarks/xfinbench_test_eval.sh
new file mode 100755
index 00000000..62caf544
--- /dev/null
+++ b/scripts/static_benchmarks/xfinbench_test_eval.sh
@@ -0,0 +1,73 @@
+#!/bin/bash
+#SBATCH --job-name=xfinbench_test_eval
+#SBATCH --output=logs/xfinbench_test_eval_%A_%a.out
+#SBATCH --error=logs/xfinbench_test_eval_%A_%a.err
+#SBATCH --time=08:00:00
+#SBATCH --cpus-per-task=4
+#SBATCH --mem=16G
+#SBATCH --array=0-9
+
+set -euo pipefail
+
+cd /fs01/projects/DeepLesion/projects/new_ace/automated_capability_evaluation
+
+# Allow running via sbatch (with SLURM_ARRAY_TASK_ID) or directly (defaults to 0).
+: "${SLURM_ARRAY_TASK_ID:=0}"
+
+# 10 chunks over ~2828 filtered test examples → ~300 per chunk
+CHUNK=300
+OFFSET=$((SLURM_ARRAY_TASK_ID * CHUNK))
+VALIDATION_TAG="_XFINBENCH_TEST_${SLURM_ARRAY_TASK_ID}_$(date +%Y%m%d_%H%M%S)"
+
+# Stage 0_static: build datasets from Zhihan/XFinBench (test split, CSV-backed HF repo)
+python -m src.run_eval_pipeline \
+  stage=0_static \
+  validation_tag="$VALIDATION_TAG" \
+  +static_benchmark_cfg.benchmark_id=Zhihan/XFinBench \
+  +static_benchmark_cfg.split=test \
+  +static_benchmark_cfg.offset="$OFFSET" \
+  +static_benchmark_cfg.limit="$CHUNK" \
+  +static_benchmark_cfg.domain=finance \
+  +static_benchmark_cfg.capability_id=xfinbench_test \
+  +static_benchmark_cfg.capability_name="XFinBench Test"
+
+# Stage 1: run subject models on the static datasets
+python -m src.run_eval_pipeline \
+  stage=1 \
+  validation_tag="$VALIDATION_TAG" \
+  eval_tag="$VALIDATION_TAG"
+
+# Stage 2: aggregate scores
+python -m src.run_eval_pipeline \
+  stage=2 \
+  eval_tag="$VALIDATION_TAG"
+
+echo "Stage 0_static datasets (test split): base_output/test_exp/eval/datasets/$VALIDATION_TAG"
+echo "Stage 1 results (Inspect logs): base_output/test_exp/eval/results/$VALIDATION_TAG"
+echo "Stage 2 scores: base_output/test_exp/eval/scores/$VALIDATION_TAG"
+
+# Optional: generate flattened JSONL views of Inspect logs for easier reading
+RESULTS_DIR="base_output/test_exp/eval/results/$VALIDATION_TAG"
+if [ -d "$RESULTS_DIR" ]; then
+  echo "Flattening Inspect logs under $RESULTS_DIR ..."
+  for model_dir in "$RESULTS_DIR"/*/; do
+    [ -d "$model_dir" ] || continue
+    model_name="$(basename "$model_dir")"
+    for area_dir in "$model_dir"*/; do
+      [ -d "$area_dir" ] || continue
+      for cap_dir in "$area_dir"*/; do
+        [ -d "$cap_dir" ] || continue
+        cap_name="$(basename "$cap_dir")"
+        log_file="$(ls "$cap_dir"/*_task_*.json 2>/dev/null | head -n 1 || true)"
+        if [ -n "$log_file" ]; then
+          out_file="$cap_dir/flat_${cap_name}.jsonl"
+          python scripts/flatten_inspect_logs.py \
+            --log_path "$log_file" \
+            --out_path "$out_file"
+          echo "  Wrote flattened log for $model_name/$cap_name to $out_file"
+        fi
+      done
+    done
+  done
+fi
+
diff --git a/src/eval_stages/stage0_static_benchmarks.py b/src/eval_stages/stage0_static_benchmarks.py
index 7278b42f..c7c53cbe 100644
--- a/src/eval_stages/stage0_static_benchmarks.py
+++ b/src/eval_stages/stage0_static_benchmarks.py
@@ -52,6 +52,12 @@
 )
 from src.eval_stages.static_benchmarks.stateval import build_eval_datasets_from_stateval
 from src.eval_stages.static_benchmarks.wemath import build_eval_datasets_from_wemath
+from src.eval_stages.static_benchmarks.finance_tasks import (
+    build_eval_datasets_from_finance_tasks,
+)
+from src.eval_stages.static_benchmarks.xfinbench import (
+    build_eval_datasets_from_xfinbench,
+)
 from src.eval_stages.static_benchmarks.specs import StaticBenchmarkSpec
 from src.schemas.eval_io_utils import save_eval_config, save_eval_dataset
 from src.schemas.eval_schemas import EvalConfig, EvalDataset
@@ -114,6 +120,17 @@ def _build_datasets_from_spec(spec: StaticBenchmarkSpec) -> List[EvalDataset]:
         "stateval",
     }:
         return build_eval_datasets_from_stateval(spec)
+    if bid in {"Zhihan/XFinBench", "XFinBench", "xfinbench"}:
+        return build_eval_datasets_from_xfinbench(spec)
+    if bid in {
+        "finance_tasks",
+        "FinanceTasks",
+        "finance_tasks.json",
+        "local_finance_tasks",
+    } or bid.endswith(".json"):
+        # If a user points benchmark_id to a local JSON path, we ingest it here.
+        # This is intentionally permissive for local workflows.
+        return build_eval_datasets_from_finance_tasks(spec)
     raise ValueError(f"Unknown static benchmark_id: {spec.benchmark_id}")
 
 
@@ -136,6 +153,7 @@ def run_eval_stage0_static(cfg: DictConfig, validation_tag: str) -> None:
         benchmark_id=str(benchmark_id),
         split=str(static_cfg.get("split", "test")),
         limit=static_cfg.get("limit"),
+        offset=static_cfg.get("offset"),
         area_id=str(static_cfg.get("area_id", StaticBenchmarkSpec.area_id)),
         capability_id=static_cfg.get("capability_id"),
         capability_name=static_cfg.get("capability_name"),
diff --git a/src/eval_stages/stage1_eval_execution.py b/src/eval_stages/stage1_eval_execution.py
index a2aa08a1..a417c140 100644
--- a/src/eval_stages/stage1_eval_execution.py
+++ b/src/eval_stages/stage1_eval_execution.py
@@ -7,6 +7,7 @@
 """
 
 import logging
+import time
 from pathlib import Path
 from typing import Dict, List, Optional, Set
 
@@ -179,37 +180,74 @@ def _run_inspect_eval(
     subject_llm: str,
     judge_llm: Dict[str, str],
     output_dir: Path,
+    *,
+    max_attempts: int = 3,
 ) -> bool:
-    """Run a fresh Inspect eval for one capability/LLM pair."""
+    """Run an Inspect eval for one capability/LLM pair with auto-retry.
+
+    Why retry: providers occasionally drop connections mid-run (e.g. httpx
+    RemoteProtocolError: server disconnected without sending a response). When
+    that happens, Inspect often leaves a partial log that can be resumed via
+    `inspect_eval_retry`.
+    """
     # Format model names for Inspect (provider/model)
     judge_model = f"{judge_llm['provider']}/{judge_llm['name']}"
 
-    try:
-        # Create Inspect task
-        task = _create_inspect_task(dataset, judge_model)
+    expected_task_ids = {str(task["id"]) for task in dataset.tasks}
 
-        # Run evaluation
-        # Inspect saves logs to the specified directory
-        output_dir.mkdir(parents=True, exist_ok=True)
+    for attempt in range(1, max_attempts + 1):
+        try:
+            # Create Inspect task
+            task = _create_inspect_task(dataset, judge_model)
 
-        inspect_eval(
-            task,
-            model=subject_llm,
-            log_dir=str(output_dir),
-            log_format="json",
-        )
+            # Run evaluation
+            # Inspect saves logs to the specified directory
+            output_dir.mkdir(parents=True, exist_ok=True)
+
+            inspect_eval(
+                task,
+                model=subject_llm,
+                log_dir=str(output_dir),
+                log_format="json",
+            )
 
-        return True
+            return True
 
-    except Exception as e:
-        logger.error(
-            "Inspect evaluation failed for %s/%s with %s: %s",
-            dataset.area_id,
-            dataset.capability_id,
-            subject_llm,
-            e,
-        )
-        return False
+        except Exception as e:
+            logger.warning(
+                "Inspect eval attempt %d/%d failed for %s/%s with %s: %s",
+                attempt,
+                max_attempts,
+                dataset.area_id,
+                dataset.capability_id,
+                subject_llm,
+                e,
+            )
+
+            # Try to resume from a partial log if present.
+            retry_log = _find_retry_log(output_dir, expected_task_ids)
+            if retry_log is not None:
+                logger.info(
+                    "Attempting inspect_eval_retry from partial log: %s",
+                    retry_log.name,
+                )
+                if _run_inspect_retry(retry_log_path=retry_log, output_dir=output_dir):
+                    return True
+
+            if attempt < max_attempts:
+                sleep_s = min(2**attempt, 30)
+                logger.info("Retrying after %ds...", sleep_s)
+                time.sleep(sleep_s)
+                continue
+
+            logger.error(
+                "Inspect evaluation ultimately failed for %s/%s with %s after %d attempts",
+                dataset.area_id,
+                dataset.capability_id,
+                subject_llm,
+                max_attempts,
+            )
+            return False
 
 
 def _run_inspect_retry(
@@ -358,6 +396,7 @@ def run_eval_stage1(
                     subject_llm=subject_model,
                     judge_llm=judge_llm,
                     output_dir=output_dir,
+                    max_attempts=int(cfg.get("eval_cfg", {}).get("max_attempts", 3)),
                 )
 
             if success:
diff --git a/src/eval_stages/static_benchmarks/bizbench.py b/src/eval_stages/static_benchmarks/bizbench.py
index 68cb24e5..10b25aa2 100644
--- a/src/eval_stages/static_benchmarks/bizbench.py
+++ b/src/eval_stages/static_benchmarks/bizbench.py
@@ -69,18 +69,37 @@ def _build_input(question: str, context: Any, options: Any) -> str:
     return "\n\n".join(parts).strip()
 
 
-def _iter_bizbench_samples(split: str, limit: int | None) -> Iterable[Dict[str, Any]]:
+def _iter_bizbench_samples(
+    split: str,
+    offset: int | None,
+    limit: int | None,
+) -> Iterable[Dict[str, Any]]:
     ds = load_dataset("kensho/bizbench", split=split)
-    if limit is not None:
-        ds = ds.select(range(min(limit, len(ds))))
-    yield from ds
+    n = len(ds)
+
+    start = 0 if offset is None else max(0, int(offset))
+    if start >= n:
+        return iter(())
+
+    if limit is None:
+        end = n
+    else:
+        end = min(start + int(limit), n)
+
+    if start == 0 and end == n:
+        yield from ds
+        return
+
+    yield from ds.select(range(start, end))
 
 
 def build_eval_datasets_from_bizbench(spec: StaticBenchmarkSpec) -> List[EvalDataset]:
     """Convert BizBench into a single EvalDataset."""
     tasks: List[Dict[str, str]] = []
 
-    for idx, row in enumerate(_iter_bizbench_samples(spec.split, spec.limit)):
+    for local_idx, row in enumerate(
+        _iter_bizbench_samples(spec.split, spec.offset, spec.limit)
+    ):
         question = str(row.get("question", "")).strip()
         raw_answer = row.get("answer")
         answer = _normalize_answer(raw_answer)
@@ -89,7 +108,8 @@ def build_eval_datasets_from_bizbench(spec: StaticBenchmarkSpec) -> List[EvalDat
             continue
 
         inp = _build_input(question, row.get("context"), row.get("options"))
-        task_id = f"bizbench_{idx:05d}"
+        global_idx = (spec.offset or 0) + local_idx
+        task_id = f"bizbench_{global_idx:05d}"
         tasks.append({"id": task_id, "input": inp, "target": answer})
 
     if not tasks:
diff --git a/src/eval_stages/static_benchmarks/finance_math.py b/src/eval_stages/static_benchmarks/finance_math.py
index 84b31e1a..383247c9 100644
--- a/src/eval_stages/static_benchmarks/finance_math.py
+++ b/src/eval_stages/static_benchmarks/finance_math.py
@@ -63,13 +63,27 @@ def _build_input(question: str, tables: Any) -> str:
 
 def _iter_finance_math_samples(
     split: str,
+    offset: int | None,
     limit: int | None,
 ) -> Iterable[Dict[str, Any]]:
     """Yield rows from yale-nlp/FinanceMath in order."""
     ds = load_dataset("yale-nlp/FinanceMath", split=split)
-    if limit is not None:
-        ds = ds.select(range(min(limit, len(ds))))
-    yield from ds
+    n = len(ds)
+
+    start = 0 if offset is None else max(0, int(offset))
+    if start >= n:
+        return iter(())
+
+    if limit is None:
+        end = n
+    else:
+        end = min(start + int(limit), n)
+
+    if start == 0 and end == n:
+        yield from ds
+        return
+
+    yield from ds.select(range(start, end))
 
 
 def build_eval_datasets_from_finance_math(
@@ -85,17 +99,29 @@ def build_eval_datasets_from_finance_math(
     """
     tasks: List[Dict[str, str]] = []
 
-    for idx, row in enumerate(_iter_finance_math_samples(spec.split, spec.limit)):
+    for local_idx, row in enumerate(
+        _iter_finance_math_samples(spec.split, spec.offset, spec.limit)
+    ):
         question = str(row.get("question", "")).strip()
         tables = row.get("tables")
         raw_answer = row.get("ground_truth")
         answer = _normalize_answer(raw_answer)
 
+        # Skip table-based questions entirely.
+        # The dataset uses `tables` as a list; we only keep rows where it's empty.
+        if isinstance(tables, list) and len(tables) > 0:
+            continue
+        if tables not in (None, [], ""):
+            # Defensive: if tables is any non-empty structure/string, skip.
+            if str(tables).strip():
+                continue
+
         if not question or not answer:
             continue
 
-        inp = _build_input(question, tables)
-        task_id = f"finance_math_{idx:05d}"
+        inp = question
+        global_idx = (spec.offset or 0) + local_idx
+        task_id = f"finance_math_{global_idx:05d}"
         tasks.append({"id": task_id, "input": inp, "target": answer})
 
     if not tasks:
diff --git a/src/eval_stages/static_benchmarks/finance_tasks.py b/src/eval_stages/static_benchmarks/finance_tasks.py
new file mode 100644
index 00000000..00129795
--- /dev/null
+++ b/src/eval_stages/static_benchmarks/finance_tasks.py
@@ -0,0 +1,121 @@
+"""Adapter for a local finance task JSON export.
+
+This adapter ingests a local JSON file (e.g. `finance_tasks.json`) that follows
+the repo's task-generation export shape:
+
+- Top-level keys: `metadata`, `tasks`
+- Each task contains:
+  - `task_id` (str)
+  - `task_statement` (str) — includes options for multiple-choice tasks
+  - `generation_metadata.correct_answer` (str) — e.g. "A", "B", ...
+
+We map:
+- input: task_statement
+- target: correct_answer (fallbacks supported)
+"""
+
+from __future__ import annotations
+
+import json
+from pathlib import Path
+from typing import Any, Dict, List, Optional
+
+from src.eval_stages.prompts import DEFAULT_EVAL_PROMPT_TEMPLATE
+from src.eval_stages.static_benchmarks.specs import StaticBenchmarkSpec
+from src.schemas.eval_schemas import EvalDataset
+
+
+def _sanitize_text(text: str) -> str:
+    """Sanitize text so it is safe to JSON-encode and send to APIs."""
+    # Remove null bytes (can break downstream tooling / transports).
+    text = text.replace("\x00", "")
+    # Replace any invalid unicode sequences (e.g., unpaired surrogates) deterministically.
+    return text.encode("utf-8", errors="replace").decode("utf-8", errors="replace")
+
+
+def _resolve_json_path(benchmark_id: str) -> Path:
+    raw = benchmark_id.strip()
+    if raw.startswith("file://"):
+        raw = raw[len("file://") :]
+    candidate = Path(raw)
+    if candidate.exists():
+        return candidate
+
+    # Default: assume a repo-root file name was given as benchmark_id.
+    default = Path("finance_tasks.json")
+    if default.exists():
+        return default
+
+    # Fall back to relative path from CWD.
+    return candidate
+
+
+def _read_json(path: Path) -> Dict[str, Any]:
+    with path.open("r", encoding="utf-8") as f:
+        data = json.load(f)
+    if not isinstance(data, dict):
+        raise ValueError(f"Expected a JSON object in {path}, got {type(data).__name__}")
+    return data
+
+
+def _extract_target(task: Dict[str, Any]) -> str:
+    gen_md = task.get("generation_metadata")
+    if isinstance(gen_md, dict):
+        val = gen_md.get("correct_answer")
+        if val is not None:
+            return str(val).strip()
+
+    # Fallbacks for other possible exports.
+    for key in ("correct_answer", "answer", "target", "label"):
+        if key in task and task[key] is not None:
+            return str(task[key]).strip()
+
+    return ""
+
+
+def build_eval_datasets_from_finance_tasks(spec: StaticBenchmarkSpec) -> List[EvalDataset]:
+    """Convert a local finance_tasks JSON file into a single EvalDataset."""
+    json_path = _resolve_json_path(spec.benchmark_id)
+    payload = _read_json(json_path)
+    raw_tasks = payload.get("tasks", [])
+    if not isinstance(raw_tasks, list):
+        raise ValueError(
+            f"Expected `tasks` to be a list in {json_path}, got {type(raw_tasks).__name__}"
+        )
+
+    tasks: List[Dict[str, str]] = []
+    limit: Optional[int] = spec.limit
+    offset: int = max(0, int(spec.offset or 0))
+
+    for idx, row in enumerate(raw_tasks[offset:]):
+        if not isinstance(row, dict):
+            continue
+
+        task_id = str(row.get("task_id", "")).strip()
+        statement = _sanitize_text(str(row.get("task_statement", "")).strip())
+        target = _extract_target(row)
+
+        if not task_id:
+            global_idx = offset + len(tasks)
+            task_id = f"finance_tasks_{global_idx:05d}"
+        if not statement or not target:
+            continue
+
+        tasks.append({"id": task_id, "input": statement, "target": target})
+        if limit is not None and len(tasks) >= limit:
+            break
+
+    if not tasks:
+        return []
+
+    dataset = EvalDataset(
+        area_id=spec.area_id,
+        capability_id=str(spec.capability_id or "finance_tasks"),
+        capability_name=str(spec.capability_name or "Finance Tasks"),
+        domain=str(spec.domain or "finance"),
+        tasks=tasks,
+        num_tasks=len(tasks),
+        prompt_template=DEFAULT_EVAL_PROMPT_TEMPLATE,
+    )
+    return [dataset]
+
diff --git a/src/eval_stages/static_benchmarks/specs.py b/src/eval_stages/static_benchmarks/specs.py
index 6ccb9dc0..d387e462 100644
--- a/src/eval_stages/static_benchmarks/specs.py
+++ b/src/eval_stages/static_benchmarks/specs.py
@@ -26,11 +26,15 @@ class StaticBenchmarkSpec:
         Human-readable capability name; if omitted, adapters may derive it.
     domain
         Domain label for EvalDataset (e.g. "math", "external").
-    """
+    offset
+        Optional starting index for slicing the underlying dataset. Used
+        together with `limit` to support chunked / array-style evaluation.
+    """
 
     benchmark_id: str
     split: str = "test"
     limit: Optional[int] = None
+    offset: Optional[int] = None
     area_id: str = "static_benchmarks"
     capability_id: Optional[str] = None
     capability_name: Optional[str] = None
diff --git a/src/eval_stages/static_benchmarks/xfinbench.py b/src/eval_stages/static_benchmarks/xfinbench.py
new file mode 100644
index 00000000..de944bc7
--- /dev/null
+++ b/src/eval_stages/static_benchmarks/xfinbench.py
@@ -0,0 +1,161 @@
+"""Adapter for the Zhihan/XFinBench static benchmark.
+
+Dataset card: https://huggingface.co/datasets/Zhihan/XFinBench
+
+We load from the CSV files inside the repo:
+- validation_set.csv
+- test_set.csv
+
+Columns (per inspection):
+- id: string (e.g. "vali_0", "test_0")
+- task: string, question type ("calcu", "mcq", etc.)
+- question: str, problem text (often includes tables/LaTeX)
+- choice: optional str, options text for MCQs (may contain newlines)
+- ground_truth: label/answer; for MCQ it's a letter like "A", for others numeric
+- figure: optional, ignored
+- fin_capability: capability tag, ignored here
+- gold_fin_term_id: term id, ignored here
+
+We use:
+- input: question (+ choices if present)
+- target: ground_truth normalized to string
+"""
+
+from __future__ import annotations
+
+from typing import Any, Dict, Iterable, List, Optional
+
+from datasets import load_dataset
+
+from src.eval_stages.prompts import DEFAULT_EVAL_PROMPT_TEMPLATE
+from src.eval_stages.static_benchmarks.specs import StaticBenchmarkSpec
+from src.schemas.eval_schemas import EvalDataset
+
+
+def _normalize_target(task: str, val: Any) -> str:
+    """Normalize ground_truth to a string target."""
+    if val is None:
+        return ""
+
+    # Boolean validity questions: map to Yes / No.
+    if task == "bool":
+        # Accept numeric, bool, and string encodings.
+        if isinstance(val, (int, float)):
+            return "Yes" if float(val) != 0.0 else "No"
+        s = str(val).strip().lower()
+        if s in {"1", "1.0", "true", "yes"}:
+            return "Yes"
+        if s in {"0", "0.0", "false", "no"}:
+            return "No"
+        # Fallback: pass through.
+        return str(val).strip()
+
+    # For MCQ, dataset uses option letter like "A".
+    if task == "mcq":
+        return str(val).strip()
+
+    # For numeric / calculation tasks, just stringify (preserving decimals).
+    if isinstance(val, dict):
+        for key in ("ground_truth", "value", "answer"):
+            if key in val and val[key] is not None:
+                return str(val[key]).strip()
+        return str(val).strip()
+
+    return str(val).strip()
+
+
+def _build_input(task: str, question: str, choice: Any) -> str:
+    """Build model input from question plus optional choices."""
+    question = str(question or "").strip()
+
+    # For boolean tasks, explicitly request Yes/No.
+    if task == "bool":
+        if not question:
+            return ""
+        return f"Answer only 'Yes' or 'No'.\n\nStatement:\n{question}"
+
+    if not choice:
+        return question
+
+    choice_text = str(choice).strip()
+    if not choice_text:
+        return question
+
+    return f"{question}\n\nOptions:\n{choice_text}"
+
+
+def _iter_xfinbench_samples(
+    split: str,
+    offset: Optional[int],
+    limit: Optional[int],
+) -> Iterable[Dict[str, Any]]:
+    """Yield rows from Zhihan/XFinBench (CSV-backed) with offset/limit."""
+    ds_dict = load_dataset(
+        "Zhihan/XFinBench",
+        data_files={"validation": "validation_set.csv", "test": "test_set.csv"},
+    )
+    if split not in ds_dict:
+        raise ValueError(f"Unknown XFinBench split: {split}")
+
+    ds = ds_dict[split]
+    n = len(ds)
+
+    start = 0 if offset is None else max(0, int(offset))
+    if start >= n:
+        return iter(())
+
+    if limit is None:
+        end = n
+    else:
+        end = min(start + int(limit), n)
+
+    if start == 0 and end == n:
+        yield from ds
+        return
+
+    yield from ds.select(range(start, end))
+
+
+def build_eval_datasets_from_xfinbench(spec: StaticBenchmarkSpec) -> List[EvalDataset]:
+    """Convert XFinBench into a single EvalDataset."""
+    tasks: List[Dict[str, str]] = []
+    offset: int = max(0, int(spec.offset or 0))
+
+    for local_idx, row in enumerate(
+        _iter_xfinbench_samples(spec.split, spec.offset, spec.limit)
+    ):
+        question = str(row.get("question", "")).strip()
+        task_type = str(row.get("task", "")).strip()
+        choice = row.get("choice")
+        figure = row.get("figure")
+        raw_gt = row.get("ground_truth")
+        target = _normalize_target(task_type, raw_gt)
+
+        # Skip image-based questions (figure present) and table-heavy prompts.
+        if figure is not None:
+            continue
+        if "\\begin{table" in question or "\\begin{tabular" in question:
+            continue
+
+        if not question or not target:
+            continue
+
+        inp = _build_input(task_type, question, choice)
+        global_idx = offset + local_idx
+        task_id = f"xfinbench_{global_idx:05d}"
+        tasks.append({"id": task_id, "input": inp, "target": target})
+
+    if not tasks:
+        return []
+
+    dataset = EvalDataset(
+        area_id=spec.area_id,
+        capability_id="xfinbench",
+        capability_name="XFinBench",
+        domain="finance",
+        tasks=tasks,
+        num_tasks=len(tasks),
+        prompt_template=DEFAULT_EVAL_PROMPT_TEMPLATE,
+    )
+    return [dataset]
+

From 93cb82e6164789518735c484f8bdfd694de74111 Mon Sep 17 00:00:00 2001
From: Negiiiin <neginbaghbanzadeh@gmail.com>
Date: Thu, 2 Apr 2026 12:00:26 -0400
Subject: [PATCH 3/8] WIP

Made-with: Cursor
---
 .gitignore                                    |   17 +-
 scripts/static_benchmarks/bizbench_eval.sh    |   13 +-
 .../static_benchmarks/env_slurm_inspect.sh    |    7 +
 .../static_benchmarks/finance_math_eval.sh    |   11 +-
 .../static_benchmarks/finance_tasks_eval.sh   |    3 +
 scripts/static_benchmarks/hardmath_eval.sh    |    3 +
 scripts/static_benchmarks/harp_eval.sh        |    3 +
 scripts/static_benchmarks/math500_eval.sh     |    3 +
 scripts/static_benchmarks/minif2f_eval.sh     |    3 +
 scripts/static_benchmarks/omni_math_eval.sh   |    3 +
 scripts/static_benchmarks/orca_math_eval.sh   |    3 +
 scripts/static_benchmarks/proofnet_eval.sh    |    3 +
 .../seed_tasks_allbloom_eval.sh               |   68 +
 .../seed_tasks_no_create_eval.sh              |   68 +
 scripts/static_benchmarks/stateval_eval.sh    |    3 +
 scripts/static_benchmarks/wemath_eval.sh      |    3 +
 .../static_benchmarks/xfinbench_test_eval.sh  |    7 +-
 src/cfg/run_cfg.yaml                          |   89 +-
 src/eval_stages/__init__.py                   |    2 +
 src/eval_stages/stage0_static_benchmarks.py   |    1 +
 src/eval_stages/stage1_eval_execution.py      |   84 +-
 .../stage1_local_eval_execution.py            | 1106 +++++++++++++++++
 src/eval_stages/stage2_score_aggregation.py   |   86 +-
 .../static_benchmarks/finance_math.py         |   11 +-
 .../static_benchmarks/finance_tasks.py        |   18 +
 src/eval_stages/static_benchmarks/specs.py    |    5 +
 .../static_benchmarks/xfinbench.py            |    4 +-
 src/generate_embeddings.py                    |   59 +-
 src/run_eval_pipeline.py                      |   24 +-
 src/utils/data_utils.py                       |   30 +-
 src/utils/inspect_eval_utils.py               |   77 +-
 31 files changed, 1749 insertions(+), 68 deletions(-)
 create mode 100644 scripts/static_benchmarks/env_slurm_inspect.sh
 create mode 100755 scripts/static_benchmarks/seed_tasks_allbloom_eval.sh
 create mode 100755 scripts/static_benchmarks/seed_tasks_no_create_eval.sh
 create mode 100644 src/eval_stages/stage1_local_eval_execution.py

diff --git a/.gitignore b/.gitignore
index d4d61f2a..b171f569 100644
--- a/.gitignore
+++ b/.gitignore
@@ -143,13 +143,16 @@ logs/
 src/outputs/
 outputs/
 
-# Evaluation outputs/results (large, generated)
-base_output/
-
-# Slurm / batch logs (often checked in accidentally)
-*.out
-*.err
-
 # inspect result logs
 seed_datasets_inspect_logs/
 seed_tasks_results/
+
+# Generated experiment/evaluation artifacts
+base_output/
+base_output_tmp/
+
+# Local benchmark/task JSON exports
+finance_tasks.json
+seed_tasks.json
+task_4.json
+tasks_2.json
diff --git a/scripts/static_benchmarks/bizbench_eval.sh b/scripts/static_benchmarks/bizbench_eval.sh
index 4620f7e6..a86f59ab 100644
--- a/scripts/static_benchmarks/bizbench_eval.sh
+++ b/scripts/static_benchmarks/bizbench_eval.sh
@@ -2,22 +2,25 @@
 #SBATCH --job-name=bizbench_eval
 #SBATCH --output=logs/bizbench_eval_%A_%a.out
 #SBATCH --error=logs/bizbench_eval_%A_%a.err
-#SBATCH --time=04:00:00
+#SBATCH --time=08:00:00
 #SBATCH --cpus-per-task=4
 #SBATCH --mem=16G
-#SBATCH --array=0-9
+#SBATCH --array=0-50
 
 set -euo pipefail
 
-cd /fs01/projects/DeepLesion/projects/new_ace/automated_capability_evaluation
+cd /projects/DeepLesion/projects/new_ace/automated_capability_evaluation
+
+# shellcheck disable=SC1091
+source "scripts/static_benchmarks/env_slurm_inspect.sh"
 
 # Allow running either via sbatch (with SLURM_ARRAY_TASK_ID set)
 # or directly (default to a single chunk 0).
 : "${SLURM_ARRAY_TASK_ID:=0}"
 
-CHUNK=500
+CHUNK=100
 OFFSET=$((SLURM_ARRAY_TASK_ID * CHUNK))
-VALIDATION_TAG="_BIZBENCH_${SLURM_ARRAY_TASK_ID}_$(date +%Y%m%d_%H%M%S)"
+VALIDATION_TAG="_BIZBENCH_Commercial_${SLURM_ARRAY_TASK_ID}_SundayNight"
 
 # Stage 0_static: build datasets from kensho/bizbench
 python -m src.run_eval_pipeline \
diff --git a/scripts/static_benchmarks/env_slurm_inspect.sh b/scripts/static_benchmarks/env_slurm_inspect.sh
new file mode 100644
index 00000000..3ce2fd89
--- /dev/null
+++ b/scripts/static_benchmarks/env_slurm_inspect.sh
@@ -0,0 +1,7 @@
+# Sourced by *_eval.sh SLURM jobs. Puts platformdirs user_data / cache on local
+# scratch so Inspect's samplebuffer and logging are not on flaky NFS home mounts.
+if [ -n "${SLURM_TMPDIR:-}" ]; then
+  export XDG_DATA_HOME="${SLURM_TMPDIR}/inspect_xdg_data"
+  export XDG_CACHE_HOME="${SLURM_TMPDIR}/inspect_xdg_cache"
+  mkdir -p "$XDG_DATA_HOME" "$XDG_CACHE_HOME"
+fi
diff --git a/scripts/static_benchmarks/finance_math_eval.sh b/scripts/static_benchmarks/finance_math_eval.sh
index b4a40650..aaa8dd1c 100644
--- a/scripts/static_benchmarks/finance_math_eval.sh
+++ b/scripts/static_benchmarks/finance_math_eval.sh
@@ -5,19 +5,22 @@
 #SBATCH --time=04:00:00
 #SBATCH --cpus-per-task=4
 #SBATCH --mem=16G
-#SBATCH --array=0-9
+#SBATCH --array=0-60
 
 set -euo pipefail
 
-cd /fs01/projects/DeepLesion/projects/new_ace/automated_capability_evaluation
+cd /projects/DeepLesion/projects/new_ace/automated_capability_evaluation
+
+# shellcheck disable=SC1091
+source "scripts/static_benchmarks/env_slurm_inspect.sh"
 
 # Allow running via sbatch (with SLURM_ARRAY_TASK_ID) or directly (defaults to 0).
 : "${SLURM_ARRAY_TASK_ID:=0}"
 
 # FinanceMath validation has 121 non-table tasks after filtering.
-CHUNK=20
+CHUNK=50
 OFFSET=$((SLURM_ARRAY_TASK_ID * CHUNK))
-VALIDATION_TAG="_FINANCE_MATH_${SLURM_ARRAY_TASK_ID}_$(date +%Y%m%d_%H%M%S)"
+VALIDATION_TAG="_FINANCE_MATH_${SLURM_ARRAY_TASK_ID}_SundayNight"
 
 # Stage 0_static: build datasets from yale-nlp/FinanceMath (validation split only)
 python -m src.run_eval_pipeline \
diff --git a/scripts/static_benchmarks/finance_tasks_eval.sh b/scripts/static_benchmarks/finance_tasks_eval.sh
index 14f41f1e..b00e5e88 100755
--- a/scripts/static_benchmarks/finance_tasks_eval.sh
+++ b/scripts/static_benchmarks/finance_tasks_eval.sh
@@ -10,6 +10,9 @@ set -euo pipefail
 
 cd /fs01/projects/DeepLesion/projects/new_ace/automated_capability_evaluation
 
+# shellcheck disable=SC1091
+source "scripts/static_benchmarks/env_slurm_inspect.sh"
+
 VALIDATION_TAG="_FINANCE_TASKS_$(date +%Y%m%d_%H%M%S)"
 
 # Stage 0_static: build datasets from local finance_tasks.json
diff --git a/scripts/static_benchmarks/hardmath_eval.sh b/scripts/static_benchmarks/hardmath_eval.sh
index a309e089..864b37d5 100755
--- a/scripts/static_benchmarks/hardmath_eval.sh
+++ b/scripts/static_benchmarks/hardmath_eval.sh
@@ -10,6 +10,9 @@ set -euo pipefail
 
 cd /fs01/projects/DeepLesion/projects/new_ace/automated_capability_evaluation
 
+# shellcheck disable=SC1091
+source "scripts/static_benchmarks/env_slurm_inspect.sh"
+
 VALIDATION_TAG="_HARDMATH_$(date +%Y%m%d_%H%M%S)"
 
 python -m src.run_eval_pipeline \
diff --git a/scripts/static_benchmarks/harp_eval.sh b/scripts/static_benchmarks/harp_eval.sh
index 952cb81c..329aabf1 100644
--- a/scripts/static_benchmarks/harp_eval.sh
+++ b/scripts/static_benchmarks/harp_eval.sh
@@ -10,6 +10,9 @@ set -euo pipefail
 
 cd /fs01/projects/DeepLesion/projects/new_ace/automated_capability_evaluation
 
+# shellcheck disable=SC1091
+source "scripts/static_benchmarks/env_slurm_inspect.sh"
+
 VALIDATION_TAG="_HARP_$(date +%Y%m%d_%H%M%S)"
 
 # Stage 0_static: build datasets from aadityasingh/HARP (main JSONL split)
diff --git a/scripts/static_benchmarks/math500_eval.sh b/scripts/static_benchmarks/math500_eval.sh
index dfa33048..2245f0bc 100755
--- a/scripts/static_benchmarks/math500_eval.sh
+++ b/scripts/static_benchmarks/math500_eval.sh
@@ -10,6 +10,9 @@ set -euo pipefail
 
 cd /fs01/projects/DeepLesion/projects/new_ace/automated_capability_evaluation
 
+# shellcheck disable=SC1091
+source "scripts/static_benchmarks/env_slurm_inspect.sh"
+
 VALIDATION_TAG="_MATH500_$(date +%Y%m%d_%H%M%S)"
 
 # Stage 0_static: build datasets from HF MATH-500
diff --git a/scripts/static_benchmarks/minif2f_eval.sh b/scripts/static_benchmarks/minif2f_eval.sh
index 8e5bd38c..6985bb77 100644
--- a/scripts/static_benchmarks/minif2f_eval.sh
+++ b/scripts/static_benchmarks/minif2f_eval.sh
@@ -10,6 +10,9 @@ set -euo pipefail
 
 cd /fs01/projects/DeepLesion/projects/new_ace/automated_capability_evaluation
 
+# shellcheck disable=SC1091
+source "scripts/static_benchmarks/env_slurm_inspect.sh"
+
 VALIDATION_TAG="_MINIF2F_$(date +%Y%m%d_%H%M%S)"
 
 # Stage 0_static: build datasets from Tonic/MiniF2F (train split only)
diff --git a/scripts/static_benchmarks/omni_math_eval.sh b/scripts/static_benchmarks/omni_math_eval.sh
index 3cc4bc8a..d3b50898 100644
--- a/scripts/static_benchmarks/omni_math_eval.sh
+++ b/scripts/static_benchmarks/omni_math_eval.sh
@@ -10,6 +10,9 @@ set -euo pipefail
 
 cd /fs01/projects/DeepLesion/projects/new_ace/automated_capability_evaluation
 
+# shellcheck disable=SC1091
+source "scripts/static_benchmarks/env_slurm_inspect.sh"
+
 VALIDATION_TAG="_OMNI_MATH_$(date +%Y%m%d_%H%M%S)"
 
 # Stage 0_static: build datasets from KbsdJames/Omni-MATH (test split)
diff --git a/scripts/static_benchmarks/orca_math_eval.sh b/scripts/static_benchmarks/orca_math_eval.sh
index 4e914c9e..37096739 100644
--- a/scripts/static_benchmarks/orca_math_eval.sh
+++ b/scripts/static_benchmarks/orca_math_eval.sh
@@ -10,6 +10,9 @@ set -euo pipefail
 
 cd /fs01/projects/DeepLesion/projects/new_ace/automated_capability_evaluation
 
+# shellcheck disable=SC1091
+source "scripts/static_benchmarks/env_slurm_inspect.sh"
+
 VALIDATION_TAG="_ORCA_MATH_$(date +%Y%m%d_%H%M%S)"
 
 # Stage 0_static: build datasets from microsoft/orca-math-word-problems-200k (train split only)
diff --git a/scripts/static_benchmarks/proofnet_eval.sh b/scripts/static_benchmarks/proofnet_eval.sh
index d200e516..b68faeb2 100644
--- a/scripts/static_benchmarks/proofnet_eval.sh
+++ b/scripts/static_benchmarks/proofnet_eval.sh
@@ -10,6 +10,9 @@ set -euo pipefail
 
 cd /fs01/projects/DeepLesion/projects/new_ace/automated_capability_evaluation
 
+# shellcheck disable=SC1091
+source "scripts/static_benchmarks/env_slurm_inspect.sh"
+
 VALIDATION_TAG="_PROOFNET_$(date +%Y%m%d_%H%M%S)"
 
 # Stage 0_static: build datasets from hoskinson-center/proofnet (plain_text, validation)
diff --git a/scripts/static_benchmarks/seed_tasks_allbloom_eval.sh b/scripts/static_benchmarks/seed_tasks_allbloom_eval.sh
new file mode 100755
index 00000000..48e8d1ea
--- /dev/null
+++ b/scripts/static_benchmarks/seed_tasks_allbloom_eval.sh
@@ -0,0 +1,68 @@
+#!/bin/bash
+#SBATCH --job-name=seed_tasks_allbloom_eval
+#SBATCH --output=logs/seed_tasks_allbloom_eval_%j.out
+#SBATCH --error=logs/seed_tasks_allbloom_eval_%j.err
+#SBATCH --time=04:00:00
+#SBATCH --cpus-per-task=4
+#SBATCH --mem=16G
+
+set -euo pipefail
+
+cd /projects/DeepLesion/projects/new_ace/automated_capability_evaluation
+
+# shellcheck disable=SC1091
+source "scripts/static_benchmarks/env_slurm_inspect.sh"
+
+VALIDATION_TAG="_SEED_TASKS_ALLBLOOMS_date_TuesdayNight_GPT_OSS_120B"
+
+# Stage 0_static: build datasets from local seed_tasks.json
+python -m src.run_eval_pipeline \
+  stage=0_static \
+  validation_tag="$VALIDATION_TAG" \
+  +static_benchmark_cfg.benchmark_id=seed_tasks.json \
+  +static_benchmark_cfg.split=na \
+  +static_benchmark_cfg.domain=finance \
+  +static_benchmark_cfg.capability_id=seed_tasks_allblooms \
+  +static_benchmark_cfg.capability_name=SeedTasksAllBlooms \
+  +static_benchmark_cfg.exclude_bloom_create=false
+
+# Stage 1: run subject models on the static datasets
+python -m src.run_eval_pipeline \
+  stage=1 \
+  validation_tag="$VALIDATION_TAG" \
+  eval_tag="$VALIDATION_TAG"
+
+# Stage 2: aggregate scores
+python -m src.run_eval_pipeline \
+  stage=2 \
+  eval_tag="$VALIDATION_TAG"
+
+echo "Stage 0_static datasets: base_output/test_exp/eval/datasets/$VALIDATION_TAG"
+echo "Stage 1 results (Inspect logs): base_output/test_exp/eval/results/$VALIDATION_TAG"
+echo "Stage 2 scores: base_output/test_exp/eval/scores/$VALIDATION_TAG"
+
+# Optional: generate flattened JSONL views of Inspect logs for easier reading
+RESULTS_DIR="base_output/test_exp/eval/results/$VALIDATION_TAG"
+if [ -d "$RESULTS_DIR" ]; then
+  echo "Flattening Inspect logs under $RESULTS_DIR ..."
+  for model_dir in "$RESULTS_DIR"/*/; do
+    [ -d "$model_dir" ] || continue
+    model_name="$(basename "$model_dir")"
+    for area_dir in "$model_dir"*/; do
+      [ -d "$area_dir" ] || continue
+      for cap_dir in "$area_dir"*/; do
+        [ -d "$cap_dir" ] || continue
+        cap_name="$(basename "$cap_dir")"
+        log_file="$(ls "$cap_dir"/*_task_*.json 2>/dev/null | head -n 1 || true)"
+        if [ -n "$log_file" ]; then
+          out_file="$cap_dir/flat_${cap_name}.jsonl"
+          python scripts/flatten_inspect_logs.py \
+            --log_path "$log_file" \
+            --out_path "$out_file"
+          echo "  Wrote flattened log for $model_name/$cap_name to $out_file"
+        fi
+      done
+    done
+  done
+fi
+
diff --git a/scripts/static_benchmarks/seed_tasks_no_create_eval.sh b/scripts/static_benchmarks/seed_tasks_no_create_eval.sh
new file mode 100755
index 00000000..86ab9920
--- /dev/null
+++ b/scripts/static_benchmarks/seed_tasks_no_create_eval.sh
@@ -0,0 +1,68 @@
+#!/bin/bash
+#SBATCH --job-name=seed_tasks_no_create_eval
+#SBATCH --output=logs/seed_tasks_no_create_eval_%j.out
+#SBATCH --error=logs/seed_tasks_no_create_eval_%j.err
+#SBATCH --time=04:00:00
+#SBATCH --cpus-per-task=4
+#SBATCH --mem=16G
+
+set -euo pipefail
+
+cd /projects/DeepLesion/projects/new_ace/automated_capability_evaluation
+
+# shellcheck disable=SC1091
+source "scripts/static_benchmarks/env_slurm_inspect.sh"
+
+VALIDATION_TAG="_SEED_TASKS_NO_CREATE_$(date +%Y%m%d_%H%M%S)"
+
+# Stage 0_static: build datasets from local seed_tasks.json (exclude Create bloom level)
+python -m src.run_eval_pipeline \
+  stage=0_static \
+  validation_tag="$VALIDATION_TAG" \
+  +static_benchmark_cfg.benchmark_id=seed_tasks.json \
+  +static_benchmark_cfg.split=na \
+  +static_benchmark_cfg.domain=finance \
+  +static_benchmark_cfg.capability_id=seed_tasks_no_create \
+  +static_benchmark_cfg.capability_name=SeedTasksNoCreate \
+  +static_benchmark_cfg.exclude_bloom_create=true
+
+# Stage 1: run subject models on the static datasets
+python -m src.run_eval_pipeline \
+  stage=1 \
+  validation_tag="$VALIDATION_TAG" \
+  eval_tag="$VALIDATION_TAG"
+
+# Stage 2: aggregate scores
+python -m src.run_eval_pipeline \
+  stage=2 \
+  eval_tag="$VALIDATION_TAG"
+
+echo "Stage 0_static datasets: base_output/test_exp/eval/datasets/$VALIDATION_TAG"
+echo "Stage 1 results (Inspect logs): base_output/test_exp/eval/results/$VALIDATION_TAG"
+echo "Stage 2 scores: base_output/test_exp/eval/scores/$VALIDATION_TAG"
+
+# Optional: generate flattened JSONL views of Inspect logs for easier reading
+RESULTS_DIR="base_output/test_exp/eval/results/$VALIDATION_TAG"
+if [ -d "$RESULTS_DIR" ]; then
+  echo "Flattening Inspect logs under $RESULTS_DIR ..."
+  for model_dir in "$RESULTS_DIR"/*/; do
+    [ -d "$model_dir" ] || continue
+    model_name="$(basename "$model_dir")"
+    for area_dir in "$model_dir"*/; do
+      [ -d "$area_dir" ] || continue
+      for cap_dir in "$area_dir"*/; do
+        [ -d "$cap_dir" ] || continue
+        cap_name="$(basename "$cap_dir")"
+        log_file="$(ls "$cap_dir"/*_task_*.json 2>/dev/null | head -n 1 || true)"
+        if [ -n "$log_file" ]; then
+          out_file="$cap_dir/flat_${cap_name}.jsonl"
+          python scripts/flatten_inspect_logs.py \
+            --log_path "$log_file" \
+            --out_path "$out_file"
+          echo "  Wrote flattened log for $model_name/$cap_name to $out_file"
+        fi
+      done
+    done
+  done
+fi
+
diff --git a/scripts/static_benchmarks/stateval_eval.sh b/scripts/static_benchmarks/stateval_eval.sh
index 097fef77..d30f2178 100755
--- a/scripts/static_benchmarks/stateval_eval.sh
+++ b/scripts/static_benchmarks/stateval_eval.sh
@@ -10,6 +10,9 @@ set -euo pipefail
 
 cd /fs01/projects/DeepLesion/projects/new_ace/automated_capability_evaluation
 
+# shellcheck disable=SC1091
+source "scripts/static_benchmarks/env_slurm_inspect.sh"
+
 VALIDATION_TAG="_STATEVAL_$(date +%Y%m%d_%H%M%S)"
 
 python -m src.run_eval_pipeline \
diff --git a/scripts/static_benchmarks/wemath_eval.sh b/scripts/static_benchmarks/wemath_eval.sh
index eff18509..cfd8bfcb 100755
--- a/scripts/static_benchmarks/wemath_eval.sh
+++ b/scripts/static_benchmarks/wemath_eval.sh
@@ -10,6 +10,9 @@ set -euo pipefail
 
 cd /fs01/projects/DeepLesion/projects/new_ace/automated_capability_evaluation
 
+# shellcheck disable=SC1091
+source "scripts/static_benchmarks/env_slurm_inspect.sh"
+
 VALIDATION_TAG="_WEMATH_$(date +%Y%m%d_%H%M%S)"
 
 python -m src.run_eval_pipeline \
diff --git a/scripts/static_benchmarks/xfinbench_test_eval.sh b/scripts/static_benchmarks/xfinbench_test_eval.sh
index 62caf544..3cbc810b 100755
--- a/scripts/static_benchmarks/xfinbench_test_eval.sh
+++ b/scripts/static_benchmarks/xfinbench_test_eval.sh
@@ -9,7 +9,10 @@
 
 set -euo pipefail
 
-cd /fs01/projects/DeepLesion/projects/new_ace/automated_capability_evaluation
+cd /projects/DeepLesion/projects/new_ace/automated_capability_evaluation
+
+# shellcheck disable=SC1091
+source "scripts/static_benchmarks/env_slurm_inspect.sh"
 
 # Allow running via sbatch (with SLURM_ARRAY_TASK_ID) or directly (defaults to 0).
 : "${SLURM_ARRAY_TASK_ID:=0}"
@@ -17,7 +20,7 @@ cd /fs01/projects/DeepLesion/projects/new_ace/automated_capability_evaluation
 # 10 chunks over ~2828 filtered test examples → ~300 per chunk
 CHUNK=300
 OFFSET=$((SLURM_ARRAY_TASK_ID * CHUNK))
-VALIDATION_TAG="_XFINBENCH_TEST_${SLURM_ARRAY_TASK_ID}_$(date +%Y%m%d_%H%M%S)"
+VALIDATION_TAG="_XFINBENCH_TEST_${SLURM_ARRAY_TASK_ID}_SundayNight"
 
 # Stage 0_static: build datasets from Zhihan/XFinBench (test split, CSV-backed HF repo)
 python -m src.run_eval_pipeline \
diff --git a/src/cfg/run_cfg.yaml b/src/cfg/run_cfg.yaml
index d7de581b..2ee0861d 100644
--- a/src/cfg/run_cfg.yaml
+++ b/src/cfg/run_cfg.yaml
@@ -78,15 +78,98 @@ task_generation_cfg:
 eval_cfg:
   # LLMs to evaluate (required)
   subject_llms:
-    - name: gpt-4o
+    # - name: claude-haiku-4-5-20251001
+    #   provider: anthropic
+    # - name: claude-opus-4-6
+    #   provider: anthropic
+    # - name: gemini-2.5-flash-lite
+    #   provider: google
+    # - name: gemini-3.1-pro-preview
+    #   provider: google
+    # - name: gpt-4.1-mini
+    #   provider: openai
+    # - name: gpt-4.1
+    #   provider: openai
+    # - name: gpt-5.4
+    #   provider: openai
+
+    # - name: Qwen2.5-3B-Instruct
+    #   provider: openai
+    #   base_url: http://bn064:8458/v1
+
+    # - name: Qwen2.5-7B-Instruct
+    #   provider: openai
+    #   base_url: http://bn064:60530/v1
+
+    # - name: Qwen2.5-32B-Instruct
+    #   provider: openai
+    #   base_url: http://bn065:63335/v1
+
+    # - name: Meta-Llama-3.1-8B-Instruct
+    #   provider: openai
+    #   base_url: http://bn064:58891/v1
+
+
+    # - name: Qwen3-8B
+    #   provider: openai
+    #   base_url: http://bn067:45029/v1
+    - name: gpt-oss-120b
       provider: openai
-    - name: claude-3-sonnet
-      provider: anthropic
+      base_url: http://bn064:36884/v1
+    # - name: Qwen3-14B
+    #   provider: openai
+    #   base_url: http://bn075:50127/v1
+    # - name: Qwen3-32B
+    #   provider: openai
+    #   base_url: http://bn069:36543/v1
+
+    # - name: gemini-3-flash-preview
+    #   provider: google
+    # - name: gemini-3-pro-preview
+    #   provider: google
+    # - name: gpt-5-mini
+    #   provider: openai
+    # - name: gpt-5.2
+    #   provider: openai
+    # - name: claude-sonnet-4-6
+    #   provider: anthropic
+    # - name: claude-opus-4-6
+    #   provider: anthropic
+    # - name: gpt-4o
+    #   provider: openai
+    # - name: gpt-4.1-mini
+    #   provider: openai
+    # - name: gemini-2.5-flash
+    #   provider: google
+    # - name: claude-haiku-4-5-20251001
+    #   provider: anthropic
+    # - name: gemini-2.5-flash-lite
+    #   provider: google
+    # - name: gpt-3.5-turbo-0125
+    #   provider: openai
 
   # Judge LLM for scoring (required)
   judge_llm:
+    # name: Qwen2.5-7B-Instruct
+    # provider: hf_local
+    # model_path: /model-weights/Qwen2.5-7B-Instruct
+    # inference_backend: vllm
+    # trust_remote_code: true
+    # gpu_memory_utilization: 0.9
+    # tensor_parallel_size: 1
+    # batch_size: 64
+    # generation_cfg:
+    #   max_tokens: 8
+    #   temperature: 0.0
+
     name: gpt-4o-mini
     provider: openai
+    base_url: https://api.openai.com/v1
+    # For API judges, we interpret batch_size as "max concurrent async requests".
+    batch_size: 24
+    generation_cfg:
+      max_tokens: 8
+      temperature: 0.0
 
 # =============================================================================
 # HYDRA
diff --git a/src/eval_stages/__init__.py b/src/eval_stages/__init__.py
index 1a5ab861..5d22d8c3 100644
--- a/src/eval_stages/__init__.py
+++ b/src/eval_stages/__init__.py
@@ -8,6 +8,7 @@
 from src.eval_stages.stage0_setup_and_dataset import EvalSetupError, run_eval_stage0
 from src.eval_stages.stage0_static_benchmarks import run_eval_stage0_static
 from src.eval_stages.stage1_eval_execution import run_eval_stage1
+from src.eval_stages.stage1_local_eval_execution import run_eval_stage1_local
 from src.eval_stages.stage2_score_aggregation import run_eval_stage2
 
 
@@ -15,6 +16,7 @@
     "run_eval_stage0",
     "run_eval_stage0_static",
     "run_eval_stage1",
+    "run_eval_stage1_local",
     "run_eval_stage2",
     "EvalSetupError",
 ]
diff --git a/src/eval_stages/stage0_static_benchmarks.py b/src/eval_stages/stage0_static_benchmarks.py
index c7c53cbe..e9b9a597 100644
--- a/src/eval_stages/stage0_static_benchmarks.py
+++ b/src/eval_stages/stage0_static_benchmarks.py
@@ -158,6 +158,7 @@ def run_eval_stage0_static(cfg: DictConfig, validation_tag: str) -> None:
         capability_id=static_cfg.get("capability_id"),
         capability_name=static_cfg.get("capability_name"),
         domain=str(static_cfg.get("domain", StaticBenchmarkSpec.domain)),
+        exclude_bloom_create=static_cfg.get("exclude_bloom_create", True),
     )
 
     logger.info(
diff --git a/src/eval_stages/stage1_eval_execution.py b/src/eval_stages/stage1_eval_execution.py
index a417c140..0ac5e35b 100644
--- a/src/eval_stages/stage1_eval_execution.py
+++ b/src/eval_stages/stage1_eval_execution.py
@@ -7,18 +7,21 @@
 """
 
 import logging
+import os
 import time
 from pathlib import Path
-from typing import Dict, List, Optional, Set
+from typing import Any, Dict, List, Optional, Set, Union
 
 from inspect_ai import Task
 from inspect_ai import eval as inspect_eval
 from inspect_ai import eval_retry as inspect_eval_retry
 from inspect_ai.dataset import MemoryDataset, Sample
 from inspect_ai.log import read_eval_log
+from inspect_ai.model import Model as InspectModel
+from inspect_ai.model import get_model
 from inspect_ai.scorer import model_graded_fact
 from inspect_ai.solver import generate
-from omegaconf import DictConfig
+from omegaconf import DictConfig, OmegaConf
 
 from src.schemas.eval_io_utils import (
     load_eval_config,
@@ -27,12 +30,32 @@
 )
 from src.schemas.eval_schemas import EvalDataset
 from src.schemas.metadata_schemas import PipelineMetadata
+from src.utils.constants import DEFAULT_OPENAI_BASE_URL
 from src.utils.timestamp_utils import iso_timestamp, timestamp_tag
 
 
 logger = logging.getLogger(__name__)
 
 
+def _inspect_judge_model(judge_llm: Dict[str, Any]) -> Union[str, InspectModel]:
+    """Resolve judge for Inspect so it can use a different API base than the subject."""
+    provider = str(judge_llm.get("provider", "openai"))
+    name = str(judge_llm["name"])
+    uri = f"{provider}/{name}"
+    if provider == "openai":
+        base_url = judge_llm.get("base_url") or os.environ.get(
+            "OPENAI_JUDGE_BASE_URL"
+        ) or DEFAULT_OPENAI_BASE_URL
+        api_key = os.environ.get("OPENAI_API_KEY")
+        logger.info(
+            "Inspect judge model: %s (base_url=%s)",
+            uri,
+            base_url,
+        )
+        return get_model(uri, base_url=base_url, api_key=api_key)
+    return uri
+
+
 def _find_datasets(datasets_dir: Path) -> List[Path]:
     """Return all Stage 0 dataset files."""
     if not datasets_dir.exists():
@@ -151,7 +174,7 @@ def _find_retry_log(
 
 def _create_inspect_task(
     dataset: EvalDataset,
-    judge_model: str,
+    judge_model: Union[str, InspectModel],
 ) -> "Task":
     """Build an Inspect task for one capability dataset."""
     # Create Inspect samples from our dataset
@@ -178,10 +201,12 @@ def _create_inspect_task(
 def _run_inspect_eval(
     dataset: EvalDataset,
     subject_llm: str,
-    judge_llm: Dict[str, str],
+    judge_llm: Dict[str, Any],
+    subject_llm_config: Optional[Dict[str, Any]],
     output_dir: Path,
     *,
     max_attempts: int = 3,
+    max_tasks: Optional[int] = None,
 ) -> bool:
     """Run an Inspect eval for one capability/LLM pair with auto-retry.
 
@@ -190,8 +215,19 @@ def _run_inspect_eval(
     that happens, Inspect often leaves a partial log that can be resumed via
     `inspect_eval_retry`.
     """
-    # Format model names for Inspect (provider/model)
-    judge_model = f"{judge_llm['provider']}/{judge_llm['name']}"
+    judge_model = _inspect_judge_model(judge_llm)
+
+    subject_base_url: Optional[str] = None
+    if subject_llm_config:
+        subject_base_url = subject_llm_config.get("base_url") or os.environ.get(
+            "OPENAI_SUBJECT_BASE_URL"
+        )
+    if subject_base_url:
+        logger.info(
+            "Inspect subject model: %s (model_base_url=%s)",
+            subject_llm,
+            subject_base_url,
+        )
 
     expected_task_ids = {str(task["id"]) for task in dataset.tasks}
 
@@ -204,12 +240,16 @@ def _run_inspect_eval(
             # Inspect saves logs to the specified directory
             output_dir.mkdir(parents=True, exist_ok=True)
 
-            inspect_eval(
-                task,
-                model=subject_llm,
-                log_dir=str(output_dir),
-                log_format="json",
-            )
+            inspect_kwargs: Dict[str, Any] = {
+                "tasks": task,
+                "model": subject_llm,
+                "log_dir": str(output_dir),
+                "log_format": "json",
+                "max_tasks": max_tasks,
+            }
+            if subject_base_url:
+                inspect_kwargs["model_base_url"] = subject_base_url
+            inspect_eval(**inspect_kwargs)
 
             return True
 
@@ -330,6 +370,12 @@ def run_eval_stage1(
     # Run evaluations
     subject_llms = eval_config.subject_llms
     judge_llm = eval_config.judge_llm
+    # Concurrency for Inspect eval. Higher is faster for remote endpoints.
+    inspect_max_tasks = None
+    try:
+        inspect_max_tasks = int(cfg.get("eval_cfg", {}).get("inspect_max_tasks"))
+    except Exception:
+        inspect_max_tasks = None
 
     num_completed_this_run = 0
     num_skipped_completed = 0
@@ -391,12 +437,24 @@ def run_eval_stage1(
                     subject_model,
                 )
 
+                judge_plain = (
+                    OmegaConf.to_container(judge_llm, resolve=True)
+                    if OmegaConf.is_config(judge_llm)
+                    else dict(judge_llm)
+                )
+                subject_plain = (
+                    OmegaConf.to_container(llm_config, resolve=True)
+                    if OmegaConf.is_config(llm_config)
+                    else dict(llm_config)
+                )
                 success = _run_inspect_eval(
                     dataset=dataset,
                     subject_llm=subject_model,
-                    judge_llm=judge_llm,
+                    judge_llm=judge_plain,
+                    subject_llm_config=subject_plain,
                     output_dir=output_dir,
                     max_attempts=int(cfg.get("eval_cfg", {}).get("max_attempts", 3)),
+                    max_tasks=inspect_max_tasks,
                 )
 
             if success:
diff --git a/src/eval_stages/stage1_local_eval_execution.py b/src/eval_stages/stage1_local_eval_execution.py
new file mode 100644
index 00000000..bfc04162
--- /dev/null
+++ b/src/eval_stages/stage1_local_eval_execution.py
@@ -0,0 +1,1106 @@
+"""Eval Stage 1_local: direct evaluation without Inspect.
+
+This stage runs subject models directly, including local HuggingFace models
+loaded from disk via `provider: hf_local`. Local HF models can run through
+`transformers` or `vllm`, then each response is judged and written to the final
+`flat_<capability>.jsonl` output expected by downstream workflows.
+"""
+
+from __future__ import annotations
+
+import json
+import logging
+import os
+import gc
+import asyncio
+from pathlib import Path
+from typing import Any, Dict, Iterator, List, Optional, Set, Tuple
+
+from omegaconf import DictConfig
+import torch
+from tqdm.auto import tqdm
+import re
+
+from src.model import Model
+from src.schemas.eval_io_utils import load_eval_config, load_eval_dataset, save_eval_config
+from src.schemas.eval_schemas import EvalDataset
+from src.schemas.metadata_schemas import PipelineMetadata
+from src.utils.inspect_eval_utils import LLM_JUDGE_PROMPT, parse_submission
+from src.utils.timestamp_utils import iso_timestamp, timestamp_tag
+
+logger = logging.getLogger(__name__)
+
+
+def _find_datasets(datasets_dir: Path) -> List[Path]:
+    """Collect every ``dataset.json`` below *datasets_dir*, sorted; [] if the dir is missing."""
+    if datasets_dir.exists():
+        return sorted(datasets_dir.rglob("dataset.json"))
+    return []
+
+
+def _score_value_to_float(value: object) -> Optional[float]:
+    """Map a grade letter ("C"/"I") or a numeric value/string to a float, else None."""
+    if isinstance(value, str):
+        letter_scores = {"C": 1.0, "I": 0.0}
+        letter = value.strip().upper()
+        if letter in letter_scores:
+            return letter_scores[letter]
+        try:
+            return float(value)
+        except ValueError:
+            return None
+
+    if isinstance(value, (int, float)):
+        return float(value)
+
+    # Anything that is neither a string nor a number has no score.
+    return None
+
+
+def _flat_result_path(output_dir: Path, capability_id: str) -> Path:
+    return output_dir.joinpath(f"flat_{capability_id}.jsonl")  # per-capability flat results file
+
+
+def _read_flat_rows(flat_path: Path) -> List[Dict[str, Any]]:
+    """Read non-summary rows from a flat jsonl file, skipping blank or malformed lines."""
+    if not flat_path.exists():
+        return []
+
+    rows: List[Dict[str, Any]] = []
+    with open(flat_path, "r", encoding="utf-8") as f:
+        for line_no, line in enumerate(f, start=1):
+            # Line 1 is always treated as the summary record and never returned.
+            if line_no == 1 or not line.strip():
+                continue
+            try:
+                rows.append(json.loads(line))
+            except json.JSONDecodeError:  # e.g. a line truncated by an interrupted write
+                logger.warning("Skipping malformed row at %s:%d", flat_path, line_no)
+    return rows
+
+
+def _check_flat_completed(flat_path: Path, expected_task_ids: Set[str]) -> bool:
+    """Return True when the flat file's row IDs are exactly ``expected_task_ids``."""
+    if not flat_path.exists() or not expected_task_ids:
+        return False  # a missing file or an empty expectation never counts as complete
+    rows = _read_flat_rows(flat_path)
+    row_ids = {str(row.get("id", "")) for row in rows if row.get("id") is not None}
+    return row_ids == expected_task_ids
+
+
+def _write_flat_results(output_path: Path, rows: List[Dict[str, Any]]) -> None:
+    """Write a summary line, then one JSON line per row (schema of flatten_inspect_logs.py)."""
+    output_path.parent.mkdir(parents=True, exist_ok=True)
+
+    num_samples = len(rows)
+    num_correct = sum(1 for row in rows if row.get("grade") == "C")
+    num_incorrect = sum(1 for row in rows if row.get("grade") == "I")
+    accuracy = (num_correct / num_samples) if num_samples else 0.0
+    f1 = accuracy  # NOTE(review): f1 simply mirrors accuracy here — confirm this is intended
+
+    with open(output_path, "w", encoding="utf-8") as f:
+        summary = {
+            "summary": True,
+            "num_samples": num_samples,
+            "num_correct": num_correct,
+            "num_incorrect": num_incorrect,
+            "accuracy": accuracy,
+            "f1": f1,
+        }
+        f.write(json.dumps(summary, ensure_ascii=False) + "\n")
+        for row in rows:
+            f.write(json.dumps(row, ensure_ascii=False) + "\n")
+
+
+def _format_prompt(dataset: EvalDataset, task: Dict[str, str]) -> str:
+    """Render the Stage 0 prompt template for a task and append answer-format instructions."""
+    template = dataset.prompt_template or "{input}"
+    try:
+        prompt = template.format(input=task["input"])
+    except Exception:  # noqa: BLE001
+        prompt = str(task["input"])  # template could not be formatted; use the raw input
+
+    is_mcq = bool(re.search(r"(?im)^\s*options\s*:\s*$", str(task.get("input", ""))))
+    if is_mcq:  # the input has a line consisting only of "Options:" (case-insensitive)
+        answer_instruction = (
+            "\n\nReason briefly and do not repeat yourself. Stop immediately after the final "
+            "answer line.\n\nThis is a multiple-choice question. On the last line, return ONLY "
+            "the option letter in machine-readable form as `ANSWER: <LETTER>` "
+            "(e.g., `ANSWER: B`). Do NOT return a number, currency amount, or explanation "
+            "on the final answer line."
+        )
+    else:
+        answer_instruction = (
+            "\n\nReason briefly and do not repeat yourself. Stop immediately after the final "
+            "answer line.\n\nReturn your final answer in a machine-readable form on the last "
+            "line as `ANSWER: <final answer>`."
+        )
+    return prompt + answer_instruction
+
+
+def _build_model(model_config: Dict[str, Any]) -> Model:
+    """Create a repo ``Model`` from one subject/judge config mapping."""
+    # name/provider are passed explicitly below; generation_cfg is deliberately not forwarded.
+    reserved = {"name", "provider", "generation_cfg"}
+    extra_kwargs: Dict[str, Any] = {}
+    for option, setting in model_config.items():
+        if option not in reserved:
+            extra_kwargs[option] = setting
+
+    name = str(model_config["name"])
+    provider = str(model_config.get("provider", "openai"))
+    return Model(model_name=name, model_provider=provider, **extra_kwargs)
+
+
+def _is_hf_local_provider(provider: str) -> bool:
+    """Tell whether *provider* names a HuggingFace model that is loaded locally."""
+    return provider in ("hf_local", "local_hf", "transformers")
+
+
+def _uses_vllm_backend(model_config: Dict[str, Any]) -> bool:
+    """Tell whether a config is a local HF provider with ``inference_backend`` set to vllm."""
+    provider = str(model_config.get("provider", ""))
+    if not _is_hf_local_provider(provider):
+        return False
+    return str(model_config.get("inference_backend", "transformers")).lower() == "vllm"
+
+
+def _build_messages(sys_prompt: str, user_prompt: str) -> List[Dict[str, str]]:
+    """Return chat messages: an optional system turn (if non-blank) followed by the user turn."""
+    user_turn = {"role": "user", "content": user_prompt}
+    if not sys_prompt.strip():
+        return [user_turn]
+    system_turn = {"role": "system", "content": sys_prompt}
+    return [system_turn, user_turn]
+
+
+def _render_text_prompt(tokenizer: Any, *, sys_prompt: str, user_prompt: str) -> str:
+    """Render one prompt string, via the tokenizer's chat template when it has one."""
+    messages = _build_messages(sys_prompt, user_prompt)
+    if tokenizer is not None and hasattr(tokenizer, "apply_chat_template"):
+        return tokenizer.apply_chat_template(
+            messages,
+            tokenize=False,
+            add_generation_prompt=True,
+        )
+    if sys_prompt.strip():  # no chat template: join system and user text with a blank line
+        return f"{sys_prompt.strip()}\n\n{user_prompt}".strip()
+    return user_prompt
+
+
+def _load_hf_local_model(model_config: Dict[str, Any]) -> Tuple[Any, Any]:
+    """Load a local HuggingFace causal LM; returns ``(tokenizer, model)`` with model in eval mode."""
+    try:
+        from transformers import AutoModelForCausalLM, AutoTokenizer
+    except Exception as exc:  # noqa: BLE001
+        raise RuntimeError(
+            "transformers is required for provider=hf_local in stage=1_local"
+        ) from exc
+
+    model_path = model_config.get("model_path")
+    if not model_path:
+        raise ValueError(
+            "provider=hf_local requires `model_path` in subject_llms config"
+        )
+
+    trust_remote_code = bool(model_config.get("trust_remote_code", True))  # NOTE: on by default
+    tokenizer = AutoTokenizer.from_pretrained(
+        model_path,
+        trust_remote_code=trust_remote_code,
+    )
+
+    if tokenizer.pad_token is None and tokenizer.eos_token is not None:
+        tokenizer.pad_token = tokenizer.eos_token
+    # Decoder-only models must be left-padded so batched generation continues from real tokens.
+    tokenizer.padding_side = "left"
+
+    if torch.cuda.is_available():
+        torch_dtype = torch.bfloat16
+        device_map = model_config.get("device_map", "auto")
+    else:
+        torch_dtype = torch.float32
+        device_map = model_config.get("device_map", None)
+
+    model = AutoModelForCausalLM.from_pretrained(
+        model_path,
+        trust_remote_code=trust_remote_code,
+        torch_dtype=torch_dtype,
+        device_map=device_map,
+    )
+    model.eval()
+    return tokenizer, model
+
+
+def _load_vllm_model(model_config: Dict[str, Any]) -> Any:
+    """Create a vLLM ``LLM`` engine for the checkpoint at ``model_config["model_path"]``."""
+    try:
+        from transformers import PreTrainedTokenizerBase
+    except Exception as exc:  # noqa: BLE001
+        raise RuntimeError(
+            "transformers is required for inference_backend=vllm in stage=1_local"
+        ) from exc
+
+    # vLLM 0.8.x still expects this tokenizer property, but it is missing in
+    # newer transformers builds used by Qwen tokenizers in this environment.
+    if not hasattr(PreTrainedTokenizerBase, "all_special_tokens_extended"):
+        PreTrainedTokenizerBase.all_special_tokens_extended = property(  # type: ignore[attr-defined]
+            lambda self: list(self.all_special_tokens)
+        )
+
+    # vLLM can crash when CUDA is initialized from forked worker processes.
+    # Default to the safer spawn mode unless the user explicitly overrides it.
+    os.environ.setdefault("VLLM_WORKER_MULTIPROC_METHOD", "spawn")
+
+    try:
+        from vllm import LLM
+    except Exception as exc:  # noqa: BLE001
+        raise RuntimeError(
+            "vllm is required for inference_backend=vllm in stage=1_local"
+        ) from exc
+
+    model_path = model_config.get("model_path")
+    if not model_path:
+        raise ValueError(
+            "inference_backend=vllm requires `model_path` in subject_llms config"
+        )
+
+    llm_kwargs: Dict[str, Any] = {
+        "model": model_path,
+        "tokenizer": model_path,  # tokenizer is read from the same path as the model
+        "trust_remote_code": bool(model_config.get("trust_remote_code", True)),
+        "tensor_parallel_size": int(model_config.get("tensor_parallel_size", 1)),
+        "gpu_memory_utilization": float(
+            model_config.get("gpu_memory_utilization", 0.9)
+        ),
+        "dtype": model_config.get("dtype", "auto"),
+    }
+
+    if "max_model_len" in model_config:  # optional overrides are forwarded only when configured
+        llm_kwargs["max_model_len"] = int(model_config["max_model_len"])
+    if "enforce_eager" in model_config:
+        llm_kwargs["enforce_eager"] = bool(model_config["enforce_eager"])
+
+    return LLM(**llm_kwargs)
+
+
+def _normalize_text(text: str) -> str:
+    return " ".join(text.split())  # trim the ends and collapse whitespace runs to one space
+
+
+def _last_sentence(text: str) -> str:
+    """
+    Return the final non-empty line of *text*, stripped of surrounding whitespace.
+
+    A trailing line that is only a LaTeX math delimiter (`$$` or `$`) is skipped
+    in favour of the line before it; empty or whitespace-only input gives "".
+    Lines, not punctuation, delimit the "sentence" here.
+    """
+    if not text:
+        return ""
+    stripped = (candidate.strip() for candidate in str(text).splitlines())
+    non_empty = [candidate for candidate in stripped if candidate]
+    if not non_empty:
+        return ""
+    # A bare math delimiter on the last line carries no answer; use the line before it.
+    if len(non_empty) >= 2 and non_empty[-1] in ("$$", "$"):
+        return non_empty[-2]
+    return non_empty[-1]
+
+
+def _parse_mcq_options(question: str) -> Dict[str, str]:
+    """Parse MCQ options listed after an ``Options:`` line into {letter: option_text}."""
+    if not question:
+        return {}
+    lines = question.splitlines()
+    in_options = False
+    options: Dict[str, str] = {}
+    for line in lines:
+        if re.match(r"(?im)^\s*options\s*:\s*$", line):
+            in_options = True
+            continue
+        if not in_options:
+            continue
+        m = re.match(r"^\s*([A-Z])\s*[.)]\s*(.+?)\s*$", line.strip())
+        if not m:
+            # Stop at the first blank line once an option was seen; skip other non-option text.
+            if options and not line.strip():
+                break
+            continue
+        options[m.group(1).upper()] = m.group(2).strip()
+    return options
+
+
+def _extract_number(text: str) -> Optional[float]:
+    """Extract the first number in text (handles commas, currency, and bare `.5` decimals)."""
+    if not text:
+        return None
+    m = re.search(r"[-+]?(?:\d[\d,]*(?:\.\d+)?|\.\d+)", text)
+    if not m:
+        return None
+    try:
+        return float(m.group(0).replace(",", ""))
+    except ValueError:
+        return None
+
+
+def _map_numeric_answer_to_option_letter(
+    *,
+    submission: str,
+    question: str,
+    target: str,
+    rel_tol: float = 1e-3,
+) -> Optional[str]:
+    """If target is a letter MCQ, map a numeric submission to the closest option within rel_tol."""
+    target_letter = target.strip().upper()
+    if not re.fullmatch(r"[A-Z]", target_letter):
+        return None
+
+    options = _parse_mcq_options(question)
+    if not options:
+        return None
+
+    sub_val = _extract_number(submission)
+    if sub_val is None:
+        return None
+
+    best_letter: Optional[str] = None
+    best_err = rel_tol  # until a match is found this is the acceptance threshold
+    for letter, opt_text in options.items():
+        opt_val = _extract_number(opt_text)
+        if opt_val is None:
+            continue
+        err = abs(sub_val - opt_val) / max(1.0, abs(opt_val))  # relative; absolute below 1
+        if err <= best_err and (best_letter is None or err < best_err):
+            best_letter, best_err = letter, err  # ties keep the earlier option
+    return best_letter
+
+
+def _generate_with_hf_local(
+    tokenizer: Any,
+    model: Any,
+    *,
+    sys_prompt: str,
+    user_prompt: str,
+    generation_config: Dict[str, Any],
+) -> str:
+    """Generate one response with a local HF causal LM; returns the stripped completion."""
+    max_new_tokens = int(generation_config.get("max_tokens", 512))
+    temperature = float(generation_config.get("temperature", 0.0) or 0.0)
+    top_p = float(generation_config.get("top_p", 1.0) or 1.0)  # None/0 fall back to 1.0
+    repetition_penalty = float(generation_config.get("repetition_penalty", 1.0) or 1.0)
+    do_sample = temperature > 0  # sampling is enabled only for a positive temperature
+
+    prompt = _render_text_prompt(
+        tokenizer,
+        sys_prompt=sys_prompt,
+        user_prompt=user_prompt,
+    )
+    encoded = tokenizer(prompt, return_tensors="pt")
+    input_ids = encoded["input_ids"]
+    attention_mask = encoded.get("attention_mask", torch.ones_like(input_ids))
+
+    model_device = next(model.parameters()).device  # device of the first model parameter
+    input_ids = input_ids.to(model_device)
+    attention_mask = attention_mask.to(model_device)
+
+    generate_kwargs = {
+        "input_ids": input_ids,
+        "attention_mask": attention_mask,
+        "max_new_tokens": max_new_tokens,
+        "do_sample": do_sample,
+        "pad_token_id": tokenizer.pad_token_id,
+        "eos_token_id": tokenizer.eos_token_id,
+        "repetition_penalty": repetition_penalty,
+    }
+    if do_sample:  # temperature/top_p are only passed when sampling
+        generate_kwargs["temperature"] = temperature
+        generate_kwargs["top_p"] = top_p
+
+    with torch.inference_mode():
+        generated = model.generate(**generate_kwargs)
+
+    generated_tokens = generated[0][input_ids.shape[-1] :]  # keep tokens after the prompt length
+    output_text = tokenizer.decode(generated_tokens, skip_special_tokens=True)
+    return output_text.strip()
+
+
+def _generate_batch_with_hf_local(
+    tokenizer: Any,
+    model: Any,
+    *,
+    prompts: List[str],
+    generation_config: Dict[str, Any],
+) -> List[str]:
+    """Generate one completion per prompt in a single padded ``model.generate`` call."""
+    if not prompts:
+        return []
+
+    max_new_tokens = int(generation_config.get("max_tokens", 512))
+    temperature = float(generation_config.get("temperature", 0.0) or 0.0)
+    top_p = float(generation_config.get("top_p", 1.0) or 1.0)  # None/0 fall back to 1.0
+    repetition_penalty = float(generation_config.get("repetition_penalty", 1.0) or 1.0)
+    do_sample = temperature > 0  # sampling is enabled only for a positive temperature
+
+    encoded = tokenizer(prompts, return_tensors="pt", padding=True)
+    input_ids = encoded["input_ids"]
+    attention_mask = encoded.get("attention_mask", torch.ones_like(input_ids))
+
+    model_device = next(model.parameters()).device  # device of the first model parameter
+    input_ids = input_ids.to(model_device)
+    attention_mask = attention_mask.to(model_device)
+
+    generate_kwargs = {
+        "input_ids": input_ids,
+        "attention_mask": attention_mask,
+        "max_new_tokens": max_new_tokens,
+        "do_sample": do_sample,
+        "pad_token_id": tokenizer.pad_token_id,
+        "eos_token_id": tokenizer.eos_token_id,
+        "repetition_penalty": repetition_penalty,
+    }
+    if do_sample:  # temperature/top_p are only passed when sampling
+        generate_kwargs["temperature"] = temperature
+        generate_kwargs["top_p"] = top_p
+
+    with torch.inference_mode():
+        generated = model.generate(**generate_kwargs)
+
+    # NOTE(review): slicing at the padded width assumes left padding — confirm padding_side.
+    prompt_token_count = input_ids.shape[-1]
+    generated_texts: List[str] = []
+    for row_tokens in generated:
+        generated_tokens = row_tokens[prompt_token_count:]
+        output_text = tokenizer.decode(generated_tokens, skip_special_tokens=True)
+        generated_texts.append(output_text.strip())
+    return generated_texts
+
+
+def _generate_batch_with_vllm(
+    llm: Any,
+    *,
+    prompts: List[str],
+    generation_config: Dict[str, Any],
+) -> List[str]:
+    """Generate one completion per prompt with vLLM; a prompt with no outputs yields ""."""
+    try:
+        from vllm import SamplingParams
+    except Exception as exc:  # noqa: BLE001
+        raise RuntimeError(
+            "vllm is required for inference_backend=vllm in stage=1_local"
+        ) from exc
+
+    sampling_params = SamplingParams(
+        max_tokens=int(generation_config.get("max_tokens", 512)),
+        temperature=float(generation_config.get("temperature", 0.0) or 0.0),
+        top_p=float(generation_config.get("top_p", 1.0) or 1.0),  # None/0 fall back to 1.0
+        repetition_penalty=float(
+            generation_config.get("repetition_penalty", 1.0) or 1.0
+        ),
+    )
+    outputs = llm.generate(prompts, sampling_params)
+
+    generated_texts: List[str] = []
+    for output in outputs:
+        if output.outputs:
+            generated_texts.append((output.outputs[0].text or "").strip())  # first candidate only
+        else:
+            generated_texts.append("")
+    return generated_texts
+
+
+def _batched(
+    items: List[Dict[str, Any]], batch_size: int
+) -> Iterator[List[Dict[str, Any]]]:
+    """Yield consecutive slices of *items*, each at most *batch_size* long."""
+    if batch_size <= 0:
+        raise ValueError("batch_size must be positive")
+    starts = range(0, len(items), batch_size)
+    yield from (items[start : start + batch_size] for start in starts)
+
+
+def _build_judge_prompt(submission: str, target: str) -> str:
+    """Fill ``LLM_JUDGE_PROMPT`` with one submission/target pair and return the prompt text."""
+    return LLM_JUDGE_PROMPT.format(submission=submission, target=target)
+
+
+def _judge_outputs_to_grades(outputs: List[str]) -> List[str]:
+    """Grade each judge reply: "C" if it starts with "yes" (case-insensitive), else "I"."""
+    grades: List[str] = []
+    for verdict in outputs:
+        grades.append("C" if (verdict or "").strip().lower().startswith("yes") else "I")
+    return grades
+
+
+def _score_existing_row_ids(
+    flat_path: Path, expected_task_ids: Set[str]
+) -> Dict[str, Dict[str, Any]]:
+    """Map task ID -> previously graded (C/I) row, restricted to ``expected_task_ids``."""
+    row_by_id: Dict[str, Dict[str, Any]] = {}
+    for row in _read_flat_rows(flat_path):
+        row_id = str(row.get("id", ""))
+        if (
+            row_id
+            and row_id in expected_task_ids
+            and row.get("grade") in {"C", "I"}  # rows without a final grade are not reused
+            and row_id not in row_by_id  # the first row seen for an ID wins
+        ):
+            row_by_id[row_id] = row
+    return row_by_id
+
+
+def _ordered_rows(
+    tasks: List[Dict[str, Any]], row_by_id: Dict[str, Dict[str, Any]]
+) -> List[Dict[str, Any]]:
+    """Return the scored rows arranged in dataset task order.
+
+    Tasks whose stringified ID has no entry in *row_by_id* are left out.
+    """
+    task_ids = (str(task["id"]) for task in tasks)
+    return [row_by_id[task_id] for task_id in task_ids if task_id in row_by_id]
+
+
+def _judge_batch(
+    rows: List[Dict[str, Any]],
+    *,
+    judge_generation_cfg: Dict[str, Any],
+    judge_model: Optional[Model] = None,
+    judge_tokenizer: Any = None,
+    judge_hf_model: Any = None,
+    judge_vllm_model: Any = None,
+    judge_vllm_tokenizer: Any = None,
+    max_concurrent_requests: int = 8,
+) -> List[Dict[str, Any]]:
+    """Grade rows as C/I, using exact match first and a vLLM, HF, or API judge otherwise."""
+    if not rows:
+        return []
+
+    scored_rows: List[Optional[Dict[str, Any]]] = [None] * len(rows)
+    unresolved_indices: List[int] = []
+    unresolved_prompts: List[str] = []
+    unresolved_task_ids: List[str] = []
+
+    for index, row in enumerate(rows):
+        raw_output = str(row["model_output"])
+        parsed_submission = parse_submission(raw_output) or raw_output
+        judge_submission = _last_sentence(raw_output) or parsed_submission
+        target = str(row["ground_truth"])
+        # If this is an MCQ with a letter target, allow mapping a numeric final answer
+        # back to an option letter based on the question's options.
+        mapped_letter = _map_numeric_answer_to_option_letter(
+            submission=parsed_submission,
+            question=str(row.get("question", "")),
+            target=target,
+        )
+        if mapped_letter is not None:
+            parsed_submission = mapped_letter
+        if _normalize_text(parsed_submission).lower() == _normalize_text(target).lower():
+            scored_rows[index] = {**row, "grade": "C"}  # exact match: no judge call needed
+            continue
+        unresolved_indices.append(index)
+        unresolved_task_ids.append(str(row.get("id", "")))
+        # Give the judge only the model's final fragment to reduce noise.
+        judge_prompt = _build_judge_prompt(judge_submission, target)
+        unresolved_prompts.append(judge_prompt)
+        logger.info(
+            "Judge input | task_id=%s\nSubmission used for judge:\n%s\nTarget:\n%s\nFull judge prompt:\n%s",
+            str(row.get("id", "")),
+            judge_submission,
+            target,
+            judge_prompt,
+        )
+
+    if unresolved_prompts:
+        if judge_vllm_model is not None:
+            prompts = [
+                _render_text_prompt(
+                    judge_vllm_tokenizer,
+                    sys_prompt="You are a careful, non-pedantic grading assistant.",
+                    user_prompt=prompt,
+                )
+                for prompt in unresolved_prompts
+            ]
+            judge_outputs = _generate_batch_with_vllm(
+                judge_vllm_model,
+                prompts=prompts,
+                generation_config=judge_generation_cfg,
+            )
+        elif judge_hf_model is not None:
+            prompts = [
+                _render_text_prompt(
+                    judge_tokenizer,
+                    sys_prompt="You are a careful, non-pedantic grading assistant.",
+                    user_prompt=prompt,
+                )
+                for prompt in unresolved_prompts
+            ]
+            judge_outputs = _generate_batch_with_hf_local(
+                judge_tokenizer,
+                judge_hf_model,
+                prompts=prompts,
+                generation_config=judge_generation_cfg,
+            )
+        else:
+            if judge_model is None:
+                raise ValueError("judge_model is required when no local judge backend is set")
+            async def _run_async_judge(prompts: List[str]) -> List[str]:
+                sem = asyncio.Semaphore(max(1, int(max_concurrent_requests)))  # caps in-flight calls
+
+                async def _one(p: str) -> str:
+                    async with sem:
+                        txt, _ = await judge_model.async_generate(
+                            sys_prompt="You are a careful, non-pedantic grading assistant.",
+                            user_prompt=p,
+                            generation_config=judge_generation_cfg,
+                        )
+                        return txt or ""
+
+                return list(await asyncio.gather(*(_one(p) for p in prompts)))
+
+            try:
+                judge_outputs = asyncio.run(_run_async_judge(unresolved_prompts))
+            except Exception:
+                logger.warning("Async judging failed; retrying synchronously", exc_info=True)
+                judge_outputs = []
+                for prompt in unresolved_prompts:
+                    judge_text, _ = judge_model.generate(
+                        sys_prompt="You are a careful, non-pedantic grading assistant.",
+                        user_prompt=prompt,
+                        generation_config=judge_generation_cfg,
+                    )
+                    judge_outputs.append(judge_text or "")
+
+        for index, grade in zip(
+            unresolved_indices,
+            _judge_outputs_to_grades(judge_outputs),
+            strict=True,
+        ):
+            scored_rows[index] = {**rows[index], "grade": grade}
+        for task_id, judge_output in zip(unresolved_task_ids, judge_outputs, strict=True):
+            logger.info(
+                "Judge output | task_id=%s | raw_output=%s",
+                task_id,
+                (judge_output or "").strip(),
+            )
+
+    return [row for row in scored_rows if row is not None]
+
+
+def _judge_submission(
+    submission: str,
+    target: str,
+    judge_model: Model,
+    judge_generation_cfg: Dict[str, Any],
+) -> str:
+    """Return the C/I grade for one submission: exact match first, then the judge model."""
+    parsed_submission = parse_submission(submission) or submission
+    if _normalize_text(parsed_submission).lower() == _normalize_text(target).lower():
+        return "C"
+
+    prompt = LLM_JUDGE_PROMPT.format(submission=parsed_submission, target=target)
+    judge_text, _ = judge_model.generate(
+        sys_prompt="You are a strict grading assistant.",  # NOTE(review): differs from _judge_batch
+        user_prompt=prompt,
+        generation_config=judge_generation_cfg,
+    )
+    if judge_text and judge_text.strip().lower().startswith("yes"):  # anything else grades "I"
+        return "C"
+    return "I"
+
+
+def run_eval_stage1_local(
+    cfg: DictConfig,
+    validation_tag: str,
+    eval_tag: Optional[str] = None,
+) -> str:
+    """Run local/direct Stage 1 evals and return eval_tag."""
+    exp_id = cfg.exp_cfg.exp_id
+    output_base_dir = Path(cfg.global_cfg.output_dir)
+    experiment_dir = output_base_dir / exp_id
+
+    datasets_dir = experiment_dir / "eval" / "datasets" / validation_tag
+    eval_config_path = datasets_dir / "eval_config.json"
+    if not eval_config_path.exists():
+        raise ValueError(
+            f"eval_config.json not found at {eval_config_path}. Run Stage 0 first."
+        )
+    eval_config, _ = load_eval_config(eval_config_path)
+
+    is_resume = eval_tag is not None
+    if eval_tag is None:
+        eval_tag = timestamp_tag()
+
+    logger.info(
+        "Eval Stage 1_local: Running direct evaluations (eval_tag=%s, resume=%s)",
+        eval_tag,
+        is_resume,
+    )
+
+    dataset_paths = _find_datasets(datasets_dir)
+    logger.info("Found %d datasets", len(dataset_paths))
+    if not dataset_paths:
+        raise ValueError(f"No datasets found in {datasets_dir}. Run Stage 0 first.")
+
+    datasets = [load_eval_dataset(p) for p in dataset_paths]
+
+    eval_dir = experiment_dir / "eval" / "results" / eval_tag
+    results_dir = eval_dir
+
+    eval_config.eval_tag = eval_tag
+    metadata = PipelineMetadata(
+        experiment_id=exp_id,
+        output_base_dir=str(output_base_dir),
+        timestamp=iso_timestamp(),
+        input_stage_tag=validation_tag,
+        output_stage_tag=eval_tag,
+        resume=is_resume,
+    )
+    results_config_path = eval_dir / "eval_config.json"
+    save_eval_config(eval_config, metadata, results_config_path)
+    logger.info("Saved eval_config.json to %s", results_config_path)
+
+    subject_llms = eval_config.subject_llms
+    judge_llm_cfg = dict(eval_config.judge_llm)
+    judge_generation_cfg = dict(judge_llm_cfg.get("generation_cfg", {}))
+    if "max_tokens" not in judge_generation_cfg:
+        judge_generation_cfg["max_tokens"] = 16
+    if "temperature" not in judge_generation_cfg:
+        judge_generation_cfg["temperature"] = 0
+    judge_provider = str(judge_llm_cfg.get("provider", "openai"))
+    judge_batch_size = int(judge_llm_cfg.get("batch_size", 32))
+    judge_using_vllm = _uses_vllm_backend(judge_llm_cfg)
+    judge_model: Optional[Model] = None
+    judge_tokenizer: Any = None
+    judge_hf_model: Any = None
+    # IMPORTANT: if judge is vLLM, we load it lazily per combination to avoid
+    # having subject-vLLM and judge-vLLM resident at the same time.
+    judge_vllm_model: Any = None
+    judge_vllm_tokenizer: Any = None
+    if _is_hf_local_provider(judge_provider) and not judge_using_vllm:
+        logger.info("Loading local HF judge %s", judge_llm_cfg["name"])
+        judge_tokenizer, judge_hf_model = _load_hf_local_model(judge_llm_cfg)
+    elif not judge_using_vllm:
+        judge_model = _build_model(judge_llm_cfg)
+
+    model_instances: Dict[Tuple[str, str], Model] = {}
+    hf_model_instances: Dict[Tuple[str, str], Tuple[Any, Any]] = {}
+    vllm_model_instances: Dict[Tuple[str, str], Any] = {}
+
+    num_completed_this_run = 0
+    num_skipped_completed = 0
+    num_failed = 0
+    num_incomplete = 0
+    total_combinations = len(datasets) * len(subject_llms)
+
+    combination_index = 0
+    for dataset in datasets:
+        expected_task_ids = {str(task["id"]) for task in dataset.tasks}
+        for llm_config in subject_llms:
+            combination_index += 1
+            llm_name = str(llm_config["name"])
+            llm_provider = str(llm_config.get("provider", "openai"))
+            using_vllm = _uses_vllm_backend(dict(llm_config))
+            logger.info(
+                "Combination %d/%d: Evaluating %s/%s with %s/%s%s",
+                combination_index,
+                total_combinations,
+                dataset.area_id,
+                dataset.capability_id,
+                llm_provider,
+                llm_name,
+                " [vllm]" if using_vllm else "",
+            )
+
+            output_dir = results_dir / llm_name / dataset.area_id / dataset.capability_id
+            flat_path = _flat_result_path(output_dir, dataset.capability_id)
+
+            if _check_flat_completed(flat_path, expected_task_ids):
+                logger.info(
+                    "  Skipping %s/%s with %s (already completed)",
+                    dataset.area_id,
+                    dataset.capability_id,
+                    llm_name,
+                )
+                num_skipped_completed += 1
+                continue
+
+            model_key = (llm_provider, llm_name)
+            subject_generation_cfg = dict(llm_config.get("generation_cfg", {}))
+            total_tasks = len(dataset.tasks)
+            batch_size = int(llm_config.get("batch_size", 16))
+            row_by_id = _score_existing_row_ids(flat_path, expected_task_ids)
+            pending_tasks = [
+                task for task in dataset.tasks if str(task["id"]) not in row_by_id
+            ]
+
+            if row_by_id:
+                logger.info(
+                    "  Resuming %s/%s with %d/%d tasks already scored",
+                    dataset.area_id,
+                    dataset.capability_id,
+                    len(row_by_id),
+                    total_tasks,
+                )
+                _write_flat_results(flat_path, _ordered_rows(dataset.tasks, row_by_id))
+
+            if not pending_tasks:
+                logger.info(
+                    "  Skipping %s/%s with %s (all tasks already scored)",
+                    dataset.area_id,
+                    dataset.capability_id,
+                    llm_name,
+                )
+                num_skipped_completed += 1
+                continue
+
+            if using_vllm:
+                if model_key not in vllm_model_instances:
+                    logger.info("  Loading vLLM engine for %s", llm_name)
+                    vllm_model_instances[model_key] = _load_vllm_model(dict(llm_config))
+                vllm_model = vllm_model_instances[model_key]
+            elif _is_hf_local_provider(llm_provider):
+                if model_key not in hf_model_instances:
+                    hf_model_instances[model_key] = _load_hf_local_model(
+                        dict(llm_config)
+                    )
+                tokenizer, hf_model = hf_model_instances[model_key]
+            else:
+                if model_key not in model_instances:
+                    model_instances[model_key] = _build_model(dict(llm_config))
+                subject_model = model_instances[model_key]
+
+            success = True
+            failed_task_id = None
+            try:
+                logger.info(
+                    "  Processing %d pending tasks (subject_batch_size=%d, judge_batch_size=%d)",
+                    len(pending_tasks),
+                    batch_size,
+                    judge_batch_size,
+                )
+                subject_tokenizer = None
+                if using_vllm and hasattr(vllm_model, "get_tokenizer"):
+                    subject_tokenizer = vllm_model.get_tokenizer()
+
+                # If BOTH subject and judge are vLLM, avoid dual-engine residency:
+                # - If they point to the same model_path, reuse the subject engine for judging.
+                # - Otherwise, generate everything first, free subject engine, then start judge.
+                judge_needs_serialization = bool(judge_using_vllm and using_vllm)
+                can_reuse_subject_as_judge = False
+                if judge_needs_serialization:
+                    subj_path = str(dict(llm_config).get("model_path", ""))
+                    judge_path = str(judge_llm_cfg.get("model_path", ""))
+                    can_reuse_subject_as_judge = bool(subj_path and judge_path and subj_path == judge_path)
+
+                if judge_needs_serialization and not can_reuse_subject_as_judge:
+                    # Phase A: generate all pending outputs (no judging yet)
+                    all_generated: List[Dict[str, Any]] = []
+                    with tqdm(
+                        total=len(pending_tasks),
+                        desc=f"Generate {llm_name}/{dataset.capability_id}",
+                        dynamic_ncols=True,
+                    ) as gen_bar:
+                        for task_batch in _batched(pending_tasks, batch_size):
+                            failed_task_id = task_batch[0].get("id")
+                            prompts = [
+                                _render_text_prompt(
+                                    subject_tokenizer,
+                                    sys_prompt="",
+                                    user_prompt=_format_prompt(dataset, task),
+                                )
+                                for task in task_batch
+                            ]
+                            generated_texts = _generate_batch_with_vllm(
+                                vllm_model,
+                                prompts=prompts,
+                                generation_config=subject_generation_cfg,
+                            )
+                            for task, generated_text in zip(task_batch, generated_texts, strict=True):
+                                all_generated.append(
+                                    {
+                                        "id": task["id"],
+                                        "question": task["input"],
+                                        "ground_truth": task["target"],
+                                        "model_output": generated_text,
+                                    }
+                                )
+                            gen_bar.update(len(task_batch))
+
+                    # Tear down subject vLLM before starting judge vLLM
+                    try:
+                        del vllm_model_instances[model_key]
+                    except Exception:  # noqa: BLE001
+                        pass
+                    del vllm_model
+                    gc.collect()
+                    if torch.cuda.is_available():
+                        torch.cuda.empty_cache()
+
+                    # Phase B: start judge vLLM and judge in batches
+                    logger.info("  Loading vLLM judge (after subject generation teardown)")
+                    judge_vllm_model = _load_vllm_model(judge_llm_cfg)
+                    judge_vllm_tokenizer = (
+                        judge_vllm_model.get_tokenizer()
+                        if hasattr(judge_vllm_model, "get_tokenizer")
+                        else None
+                    )
+                    with tqdm(
+                        total=len(all_generated),
+                        desc=f"Judge {llm_name}/{dataset.capability_id}",
+                        dynamic_ncols=True,
+                    ) as judge_bar:
+                        for judge_batch in _batched(all_generated, judge_batch_size):
+                            failed_task_id = judge_batch[0].get("id")
+                            scored_batch = _judge_batch(
+                                judge_batch,
+                                judge_generation_cfg=judge_generation_cfg,
+                                judge_model=judge_model,
+                                judge_tokenizer=judge_tokenizer,
+                                judge_hf_model=judge_hf_model,
+                                judge_vllm_model=judge_vllm_model,
+                                judge_vllm_tokenizer=judge_vllm_tokenizer,
+                                max_concurrent_requests=judge_batch_size,
+                            )
+                            for scored_row in scored_batch:
+                                row_by_id[str(scored_row["id"])] = scored_row
+                            _write_flat_results(flat_path, _ordered_rows(dataset.tasks, row_by_id))
+                            judge_bar.update(len(judge_batch))
+
+                    # Tear down judge vLLM too
+                    del judge_vllm_model
+                    judge_vllm_model = None
+                    judge_vllm_tokenizer = None
+                    gc.collect()
+                    if torch.cuda.is_available():
+                        torch.cuda.empty_cache()
+                else:
+                    # Default fast path: generate + judge streaming (can reuse subject engine as judge if same model)
+                    if judge_using_vllm and using_vllm and can_reuse_subject_as_judge:
+                        judge_vllm_model = vllm_model
+                        judge_vllm_tokenizer = subject_tokenizer
+                    elif judge_using_vllm and judge_vllm_model is None:
+                        logger.info("  Loading vLLM judge %s", judge_llm_cfg["name"])
+                        judge_vllm_model = _load_vllm_model(judge_llm_cfg)
+                        judge_vllm_tokenizer = (
+                            judge_vllm_model.get_tokenizer()
+                            if hasattr(judge_vllm_model, "get_tokenizer")
+                            else None
+                        )
+
+                    with tqdm(
+                        total=total_tasks,
+                        initial=len(row_by_id),
+                        desc=f"Eval {llm_name}/{dataset.capability_id}",
+                        dynamic_ncols=True,
+                    ) as eval_bar:
+                        for task_batch in _batched(pending_tasks, batch_size):
+                            failed_task_id = task_batch[0].get("id")
+                            if using_vllm:
+                                prompts = [
+                                    _render_text_prompt(
+                                        subject_tokenizer,
+                                        sys_prompt="",
+                                        user_prompt=_format_prompt(dataset, task),
+                                    )
+                                    for task in task_batch
+                                ]
+                                generated_texts = _generate_batch_with_vllm(
+                                    vllm_model,
+                                    prompts=prompts,
+                                    generation_config=subject_generation_cfg,
+                                )
+                            elif _is_hf_local_provider(llm_provider):
+                                prompts = [
+                                    _render_text_prompt(
+                                        tokenizer,
+                                        sys_prompt="",
+                                        user_prompt=_format_prompt(dataset, task),
+                                    )
+                                    for task in task_batch
+                                ]
+                                generated_texts = _generate_batch_with_hf_local(
+                                    tokenizer,
+                                    hf_model,
+                                    prompts=prompts,
+                                    generation_config=subject_generation_cfg,
+                                )
+                            else:
+                                generated_texts = []
+                                for task in task_batch:
+                                    failed_task_id = task.get("id")
+                                    prompt = _format_prompt(dataset, task)
+                                    generated_text, _ = subject_model.generate(
+                                        sys_prompt="",
+                                        user_prompt=prompt,
+                                        generation_config=subject_generation_cfg,
+                                    )
+                                    generated_texts.append(generated_text or "")
+
+                            generated_rows = [
+                                {
+                                    "id": task["id"],
+                                    "question": task["input"],
+                                    "ground_truth": task["target"],
+                                    "model_output": generated_text,
+                                }
+                                for task, generated_text in zip(
+                                    task_batch, generated_texts, strict=True
+                                )
+                            ]
+
+                            for jb in _batched(generated_rows, judge_batch_size):
+                                failed_task_id = jb[0].get("id")
+                                scored_batch = _judge_batch(
+                                    jb,
+                                    judge_generation_cfg=judge_generation_cfg,
+                                    judge_model=judge_model,
+                                    judge_tokenizer=judge_tokenizer,
+                                    judge_hf_model=judge_hf_model,
+                                    judge_vllm_model=judge_vllm_model,
+                                    judge_vllm_tokenizer=judge_vllm_tokenizer,
+                                    max_concurrent_requests=judge_batch_size,
+                                )
+                                for scored_row in scored_batch:
+                                    row_by_id[str(scored_row["id"])] = scored_row
+
+                            _write_flat_results(flat_path, _ordered_rows(dataset.tasks, row_by_id))
+                            eval_bar.update(len(task_batch))
+            except Exception as exc:  # noqa: BLE001
+                logger.error(
+                    "  Direct evaluation failed for %s/%s task %s with %s/%s: %s",
+                    dataset.area_id,
+                    dataset.capability_id,
+                    failed_task_id,
+                    llm_provider,
+                    llm_name,
+                    exc,
+                )
+                success = False
+
+            rows = _ordered_rows(dataset.tasks, row_by_id)
+            _write_flat_results(flat_path, rows)
+
+            if success:
+                if _check_flat_completed(flat_path, expected_task_ids):
+                    num_completed_this_run += 1
+                else:
+                    logger.warning(
+                        "  Incomplete flat output for %s/%s with %s "
+                        "(task IDs mismatch: missing or extra scored tasks)",
+                        dataset.area_id,
+                        dataset.capability_id,
+                        llm_name,
+                    )
+                    num_incomplete += 1
+            else:
+                num_failed += 1
+
+    logger.info(
+        "Eval Stage 1_local summary: completed_this_run=%d skipped_completed=%d "
+        "failed=%d incomplete=%d total=%d",
+        num_completed_this_run,
+        num_skipped_completed,
+        num_failed,
+        num_incomplete,
+        total_combinations,
+    )
+
+    return eval_tag
diff --git a/src/eval_stages/stage2_score_aggregation.py b/src/eval_stages/stage2_score_aggregation.py
index e855dc17..6c7aca79 100644
--- a/src/eval_stages/stage2_score_aggregation.py
+++ b/src/eval_stages/stage2_score_aggregation.py
@@ -1,13 +1,16 @@
 """Eval Stage 2: Score Aggregation.
 
-This stage computes final capability scores from raw Inspect results.
-No LLM calls, just aggregation of results from Stage 1.
+This stage computes final capability scores from Stage 1 outputs.
+No LLM calls, just aggregation of results from either:
+- Inspect JSON logs (`stage=1`)
+- Direct flat JSONL outputs (`stage=1_local`)
 
 See: https://inspect.aisi.org.uk/
 """
 
 import logging
 import math
+import json
 from pathlib import Path
 from typing import Any, Dict, List, Optional, Set, Tuple
 
@@ -46,6 +49,11 @@ def _find_inspect_logs(result_dir: Path) -> List[Path]:
     return sorted(result_dir.glob("*.json"))
 
 
+def _find_flat_files(result_dir: Path) -> List[Path]:
+    """Return the sorted ``flat_*.jsonl`` paths directly inside ``result_dir``."""
+    return sorted(result_dir.glob("flat_*.jsonl"))
+
+
 def _compute_stats(scores: List[float]) -> Dict[str, Any]:
     """Compute mean, standard error, and sample count."""
     if not scores:
@@ -160,6 +168,72 @@ def _parse_inspect_logs(
     return stats
 
 
+def _parse_flat_jsonl(
+    result_dir: Path,
+    expected_task_ids: Set[str],
+) -> Dict[str, Any]:
+    """Aggregate scores from the best ``flat_*.jsonl`` file in ``result_dir``.
+
+    The first line of each file is always skipped (treated as a summary
+    header). Rows without an ``id`` or a convertible ``grade`` are ignored,
+    and each task ID counts once (last row wins) so duplicated rows cannot
+    inflate the sample count. A file that fails to parse is skipped entirely.
+    Files rank by: exact ID match, expected tasks scored, mtime, name.
+
+    Returns the ``_compute_stats`` result plus an ``exact_match`` flag, or
+    zeroed stats with ``exact_match=False`` when no file could be parsed.
+    """
+    flat_files = _find_flat_files(result_dir)
+    if not flat_files:
+        logger.warning("No flat JSONL files found in %s", result_dir)
+        return {"mean": 0.0, "std_err": 0.0, "num_tasks": 0, "exact_match": False}
+
+    flat_scores: List[Tuple[Path, List[float], Set[str]]] = []
+    for flat_file in flat_files:
+        score_by_id: Dict[str, float] = {}
+        try:
+            with open(flat_file, "r", encoding="utf-8") as f:
+                next(f, None)  # Skip the summary line; no-op on an empty file.
+                for line in f:
+                    if not line.strip():
+                        continue
+                    row = json.loads(line)
+                    task_id = str(row.get("id", ""))
+                    if not task_id:
+                        continue
+                    score_value = _score_value_to_float(row.get("grade"))
+                    if score_value is not None:
+                        score_by_id[task_id] = score_value
+        except Exception as e:  # noqa: BLE001
+            logger.warning("Failed to parse flat file %s: %s", flat_file, e)
+            continue
+        scores = [s for tid, s in score_by_id.items() if tid in expected_task_ids]
+        flat_scores.append((flat_file, scores, set(score_by_id)))
+
+    if not flat_scores:
+        return {"mean": 0.0, "std_err": 0.0, "num_tasks": 0, "exact_match": False}
+
+    selected_file, selected_scores, selected_ids = max(
+        flat_scores,
+        key=lambda x: (
+            x[2] == expected_task_ids,
+            len(x[1]),
+            x[0].stat().st_mtime,
+            x[0].name,
+        ),
+    )
+    if len(flat_scores) > 1:
+        logger.info(
+            "Multiple flat files found in %s; selected %s with %d scored samples",
+            result_dir,
+            selected_file.name,
+            len(selected_scores),
+        )
+    stats = _compute_stats(selected_scores)
+    stats["exact_match"] = selected_ids == expected_task_ids
+    return stats
+
+
 def run_eval_stage2(
     cfg: DictConfig,
     eval_tag: str,
@@ -230,8 +304,12 @@ def run_eval_stage2(
 
             expected_task_ids = {str(task["id"]) for task in cap_dataset.tasks}
 
-            # Parse Inspect logs
-            parsed = _parse_inspect_logs(result_dir, expected_task_ids)
+            # Prefer Inspect logs when present; otherwise fall back to direct
+            # flat jsonl outputs from stage=1_local.
+            if _find_inspect_logs(result_dir):
+                parsed = _parse_inspect_logs(result_dir, expected_task_ids)
+            else:
+                parsed = _parse_flat_jsonl(result_dir, expected_task_ids)
 
             if parsed["num_tasks"] < cap_dataset.num_tasks:
                 logger.warning(
diff --git a/src/eval_stages/static_benchmarks/finance_math.py b/src/eval_stages/static_benchmarks/finance_math.py
index 383247c9..3529aae6 100644
--- a/src/eval_stages/static_benchmarks/finance_math.py
+++ b/src/eval_stages/static_benchmarks/finance_math.py
@@ -107,19 +107,10 @@ def build_eval_datasets_from_finance_math(
         raw_answer = row.get("ground_truth")
         answer = _normalize_answer(raw_answer)
 
-        # Skip table-based questions entirely.
-        # The dataset uses `tables` as a list; we only keep rows where it's empty.
-        if isinstance(tables, list) and len(tables) > 0:
-            continue
-        if tables not in (None, [], ""):
-            # Defensive: if tables is any non-empty structure/string, skip.
-            if str(tables).strip():
-                continue
-
         if not question or not answer:
             continue
 
-        inp = question
+        inp = _build_input(question, tables)
         global_idx = (spec.offset or 0) + local_idx
         task_id = f"finance_math_{global_idx:05d}"
         tasks.append({"id": task_id, "input": inp, "target": answer})
diff --git a/src/eval_stages/static_benchmarks/finance_tasks.py b/src/eval_stages/static_benchmarks/finance_tasks.py
index 00129795..6391be41 100644
--- a/src/eval_stages/static_benchmarks/finance_tasks.py
+++ b/src/eval_stages/static_benchmarks/finance_tasks.py
@@ -24,6 +24,11 @@
 from src.eval_stages.static_benchmarks.specs import StaticBenchmarkSpec
 from src.schemas.eval_schemas import EvalDataset
 
+EXCLUDED_BLOOM_LEVEL = (
+    "Create - Combine elements to form a new pattern, structure, or product. "
+    "Example verbs: design, compose, formulate, generate."
+)
+
 
 def _sanitize_text(text: str) -> str:
     """Sanitize text so it is safe to JSON-encode and send to APIs."""
@@ -87,9 +92,22 @@ def build_eval_datasets_from_finance_tasks(spec: StaticBenchmarkSpec) -> List[Ev
     limit: Optional[int] = spec.limit
     offset: int = max(0, int(spec.offset or 0))
 
+    # OmegaConf CLI overrides can pass boolean-like values as strings.
+    exclude_create = spec.exclude_bloom_create
+    if isinstance(exclude_create, str):
+        exclude_create = exclude_create.strip().lower() in {
+            "1",
+            "true",
+            "yes",
+            "y",
+            "on",
+        }
+
     for idx, row in enumerate(raw_tasks[offset:]):
         if not isinstance(row, dict):
             continue
+        if exclude_create and str(row.get("bloom_level", "")).strip() == EXCLUDED_BLOOM_LEVEL:
+            continue
 
         task_id = str(row.get("task_id", "")).strip()
         statement = _sanitize_text(str(row.get("task_statement", "")).strip())
diff --git a/src/eval_stages/static_benchmarks/specs.py b/src/eval_stages/static_benchmarks/specs.py
index d387e462..c4fb3631 100644
--- a/src/eval_stages/static_benchmarks/specs.py
+++ b/src/eval_stages/static_benchmarks/specs.py
@@ -40,3 +40,8 @@ class StaticBenchmarkSpec:
     capability_name: Optional[str] = None
     domain: str = "external"
 
+    # Controls the filtering behavior for local "finance tasks" JSON adapters
+    # (e.g. finance_tasks.json / seed_tasks.json).
+    # When true, removes Bloom level "Create - Combine elements..." tasks.
+    exclude_bloom_create: bool = True
+
diff --git a/src/eval_stages/static_benchmarks/xfinbench.py b/src/eval_stages/static_benchmarks/xfinbench.py
index de944bc7..299fc59b 100644
--- a/src/eval_stages/static_benchmarks/xfinbench.py
+++ b/src/eval_stages/static_benchmarks/xfinbench.py
@@ -131,11 +131,9 @@ def build_eval_datasets_from_xfinbench(spec: StaticBenchmarkSpec) -> List[EvalDa
         raw_gt = row.get("ground_truth")
         target = _normalize_target(task_type, raw_gt)
 
-        # Skip image-based questions (figure present) and table-heavy prompts.
+        # Skip image-based questions (figure present).
         if figure is not None:
             continue
-        if "\\begin{table" in question or "\\begin{tabular" in question:
-            continue
 
         if not question or not target:
             continue
diff --git a/src/generate_embeddings.py b/src/generate_embeddings.py
index bfb23b8c..c42c116d 100644
--- a/src/generate_embeddings.py
+++ b/src/generate_embeddings.py
@@ -63,6 +63,8 @@ def _load_embedding_model(
     def generate_embeddings(
         self,
         texts: list[str],
+        max_tokens_per_request: int = 250_000,
+        max_batch_size: int = 128,
     ) -> List[torch.Tensor]:
         """
         Generate and optionally reduce embeddings for a list of texts.
@@ -76,8 +78,61 @@ def generate_embeddings(
             List[torch.Tensor]: A list of embeddings, where each embedding
                                 is a torch.Tensor.
         """
-        output_float_list = self.embedding_model.embed_documents(texts)
-        return [torch.tensor(vec) for vec in output_float_list]
+        if not texts:
+            return []
+
+        # The OpenAI embeddings endpoint limits the TOTAL tokens per request
+        # (sum across all input documents). Previously we embedded `texts`
+        # in one shot, which can exceed the limit.
+        try:
+            import tiktoken  # type: ignore
+
+            try:
+                tokenizer = tiktoken.encoding_for_model(self.embedding_model_name.value)
+            except Exception:  # noqa: BLE001
+                tokenizer = tiktoken.get_encoding("cl100k_base")
+
+            def _token_len(s: str) -> int:
+                # NOTE: this is an estimate; still far safer than sending
+                # an unbounded list.
+                return len(tokenizer.encode(s))
+
+        except Exception:  # noqa: BLE001
+            # Fallback heuristic if tiktoken isn't available.
+            # Typical English token ~ 4 chars, but we keep a lower bound of 1.
+            def _token_len(s: str) -> int:
+                return max(1, len(s) // 4)
+
+        # Build batches under the token budget.
+        batches: list[list[str]] = []
+        current_batch: list[str] = []
+        current_tokens = 0
+
+        for t in texts:
+            t_tokens = _token_len(t)
+            if (
+                current_batch
+                and (
+                    current_tokens + t_tokens > max_tokens_per_request
+                    or len(current_batch) >= max_batch_size
+                )
+            ):
+                batches.append(current_batch)
+                current_batch = [t]
+                current_tokens = t_tokens
+            else:
+                current_batch.append(t)
+                current_tokens += t_tokens
+
+        if current_batch:
+            batches.append(current_batch)
+
+        output_embeddings: List[torch.Tensor] = []
+        for batch in batches:
+            output_float_list = self.embedding_model.embed_documents(batch)
+            output_embeddings.extend([torch.tensor(vec) for vec in output_float_list])
+
+        return output_embeddings
 
 
 def filter_embeddings(
diff --git a/src/run_eval_pipeline.py b/src/run_eval_pipeline.py
index c1212189..8c01c04e 100644
--- a/src/run_eval_pipeline.py
+++ b/src/run_eval_pipeline.py
@@ -2,7 +2,8 @@
 
 This module orchestrates the evaluation pipeline:
 - Stage 0: Setup and Dataset Preparation
-- Stage 1: Evaluation Execution (runs subject LLMs, creates eval_tag)
+- Stage 1: Evaluation Execution (Inspect-based)
+- Stage 1_local: Evaluation Execution without Inspect
 - Stage 2: Score Aggregation
 
 Usage:
@@ -27,6 +28,7 @@
     run_eval_stage0,
     run_eval_stage0_static,
     run_eval_stage1,
+    run_eval_stage1_local,
     run_eval_stage2,
 )
 
@@ -119,6 +121,21 @@ def main(cfg: DictConfig) -> None:
         except ValueError as e:
             logger.error("Stage 1 failed: %s", e)
 
+    elif stage in {"1_local", "local1", "stage1_local"}:
+        if not validation_tag:
+            logger.error("validation_tag is required for stage 1_local")
+            logger.error(
+                "Usage: python -m src.run_eval_pipeline stage=1_local "
+                "validation_tag=_YYYYMMDD_HHMMSS"
+            )
+            return
+
+        try:
+            eval_tag = run_eval_stage1_local(cfg, validation_tag, eval_tag)
+            logger.info("Eval Stage 1_local complete. eval_tag=%s", eval_tag)
+        except ValueError as e:
+            logger.error("Stage 1_local failed: %s", e)
+
     elif stage == 2:
         if not eval_tag:
             logger.error("eval_tag is required for stage 2")
@@ -150,7 +167,10 @@ def main(cfg: DictConfig) -> None:
             logger.error("Stage 0_static failed: %s", e)
 
     else:
-        logger.error("Invalid stage: %s. Use 'all', 0, 1, or 2", stage)
+        logger.error(
+            "Invalid stage: %s. Use 'all', 0, 1, '1_local', 2, or '0_static'",
+            stage,
+        )
 
 
 if __name__ == "__main__":
diff --git a/src/utils/data_utils.py b/src/utils/data_utils.py
index 2d936f92..12cd42bc 100644
--- a/src/utils/data_utils.py
+++ b/src/utils/data_utils.py
@@ -14,10 +14,24 @@
     Dataset,
     load_dataset,  # noqa: D100
 )
-from google.cloud import storage
+try:
+    # Optional dependency: only required when using `gs://...` paths.
+    from google.cloud import storage  # type: ignore
+except Exception:  # noqa: BLE001
+    storage = None
 from omegaconf import DictConfig
 
 
+def _require_gcs_storage() -> Any:
+    """Return the ``google.cloud.storage`` module, or raise ImportError if absent."""
+    if storage is not None:
+        return storage
+    raise ImportError(
+        "google-cloud-storage is required for `gs://...` paths. "
+        "Install it with: pip install google-cloud-storage"
+    )
+
+
 def load_data(
     dataset_name: str,
     split: str,
@@ -62,7 +76,7 @@ def read_json_file(file_path: str) -> Any:
     """
     if file_path.startswith("gs://"):
         # Read from GCP bucket
-        client = storage.Client()
+        client = _require_gcs_storage().Client()
         bucket_name, blob_name = file_path[5:].split("/", 1)
         bucket = client.bucket(bucket_name)
         blob = bucket.blob(blob_name)
@@ -90,7 +104,7 @@ def write_json_file(file_path: str, data: Dict[Any, Any]) -> None:
     """
     if file_path.startswith("gs://"):
         # Write to GCP bucket
-        client = storage.Client()
+        client = _require_gcs_storage().Client()
         bucket_name, blob_name = file_path[5:].split("/", 1)
         bucket = client.bucket(bucket_name)
         blob = bucket.blob(blob_name)
@@ -119,7 +133,7 @@ def list_dir(path: str) -> list[str]:
     """
     if path.startswith("gs://"):
         # List contents from GCP bucket
-        client = storage.Client()
+        client = _require_gcs_storage().Client()
         bucket_name, prefix = path[5:].split("/", 1)
         bucket = client.bucket(bucket_name)
         blobs = bucket.list_blobs(prefix=prefix)
@@ -148,7 +162,7 @@ def copy_file(src: str, dest: str) -> None:
     """
     if src.startswith("gs://") and dest.startswith("gs://"):
         # Copy file within GCP buckets
-        client = storage.Client()
+        client = _require_gcs_storage().Client()
         src_bucket_name, src_blob_name = src[5:].split("/", 1)
         dest_bucket_name, dest_blob_name = dest[5:].split("/", 1)
 
@@ -161,7 +175,7 @@ def copy_file(src: str, dest: str) -> None:
         dest_blob.rewrite(src_blob)
     elif src.startswith("gs://"):
         # Copy file from GCP bucket to local
-        client = storage.Client()
+        client = _require_gcs_storage().Client()
         bucket_name, blob_name = src[5:].split("/", 1)
         bucket = client.bucket(bucket_name)
         blob = bucket.blob(blob_name)
@@ -170,7 +184,7 @@ def copy_file(src: str, dest: str) -> None:
         blob.download_to_filename(dest)
     elif dest.startswith("gs://"):
         # Copy file from local to GCP bucket
-        client = storage.Client()
+        client = _require_gcs_storage().Client()
         bucket_name, blob_name = dest[5:].split("/", 1)
         bucket = client.bucket(bucket_name)
         blob = bucket.blob(blob_name)
@@ -199,7 +213,7 @@ def path_exists(path: str) -> bool:
     """
     if path.startswith("gs://"):
         # Check existence in GCP bucket
-        client = storage.Client()
+        client = _require_gcs_storage().Client()
         bucket_name, blob_name = path[5:].split("/", 1)
         bucket = client.bucket(bucket_name)
         blob = bucket.blob(blob_name)
diff --git a/src/utils/inspect_eval_utils.py b/src/utils/inspect_eval_utils.py
index b0d81d57..965d1b84 100644
--- a/src/utils/inspect_eval_utils.py
+++ b/src/utils/inspect_eval_utils.py
@@ -19,7 +19,7 @@
 INSPECT_JUDGE_LLM_GEN_CONFIG: Dict[str, Any] = {}
 
 LLM_JUDGE_PROMPT = """
-Look at the submission and the target string for a given task and judge whether they are equivalent or not. The submission should include all aspects present in the target. Respond with only "Yes" or "No" (without quotes). Do not include a rationale.
+Look at the submission and the target string for a given task and judge whether they are equivalent or not. If they are semantically equivalent, respond with "Yes". Respond with only "Yes" or "No" (without quotes). Do not include a rationale.
 
 Submission: {submission}
 Target: {target}
@@ -30,7 +30,7 @@
 # https://github.com/UKGovernmentBEIS/inspect_ai/blob/main/src/inspect_ai/_util/pattern.py#L3
 def parse_submission(submission: str) -> str:
     """
-    Parse the submission string to extract the answer based on the "ANSWER" keyword.
+    Parse the submission string to extract the final answer.
 
     This function is used in the capability class score method.
 
@@ -43,9 +43,76 @@ def parse_submission(submission: str) -> str:
         str: The extracted answer from the submission, or an empty string
             if no match is found.
     """
-    answer_pattern = r"ANSWER\s*:\s*([^\n]+)"
-    match = re.search(answer_pattern, submission)
-    return match.group(1) if match else ""
+    if not submission:
+        return ""
+
+    def _maybe_extract_option_letter(text: str) -> str | None:
+        """
+        Normalize common answer formats (e.g., "ANSWER: B", "Option B", "\\boxed{ANSWER: B}")
+        into a single letter like "B".
+
+        Returns:
+            The upper-cased option letter, or None when ``text`` does not look
+            like a single-letter multiple-choice answer.
+        """
+        if not text:
+            return None
+        s = text.strip()
+        # Normalize common LaTeX wrappers like "\text{ANSWER: B}" -> "ANSWER: B".
+        s = re.sub(r"\\text\s*\{([^}]*)\}", r"\1", s, flags=re.IGNORECASE)
+        s = s.replace("{", "").replace("}", "")
+        # Handle cases like "ANSWER: B" anywhere in the text. The trailing \b keeps
+        # ordinary words ("answer is ...") from matching on their first letter.
+        m = re.search(
+            r"(?im)(?:final\s+answer|correct\s+option|answer)\s*[:\-]?\s*([A-Z])\b",
+            s,
+        )
+        if m:
+            return m.group(1).upper()
+        # Handle text that consists solely of "Option B" / "Choice: B)".
+        m = re.fullmatch(
+            r"(?im)\s*(?:(?:option|choice))\s*[:\-]?\s*([A-Z])\s*[\s.)]*",
+            s,
+        )
+        if m:
+            return m.group(1).upper()
+        # Sometimes the boxed content is already just "B".
+        m = re.fullmatch(r"(?im)\s*([A-Z])\s*", s)
+        if m:
+            return m.group(1).upper()
+        return None
+
+    patterns = [
+        r"(?im)^\s*(?:final\s+answer|answer)\s*:\s*(.+?)\s*$",
+        r"(?im)^\s*(?:the\s+correct\s+option|correct\s+option)\s+is\s*[:\-]?\s*(.+?)\s*$",
+        r"(?im)^\s*(?:option|choice)\s*[:\-]?\s*([A-Z])\s*$",
+        r"\\boxed\{([^}]+)\}",
+    ]
+
+    for pattern in patterns:
+        matches = re.findall(pattern, submission)
+        if matches:
+            extracted = matches[-1].strip()
+            if extracted:
+                normalized_letter = _maybe_extract_option_letter(extracted)
+                if normalized_letter is not None:
+                    return normalized_letter
+                return extracted
+
+    lines = [line.strip() for line in submission.splitlines() if line.strip()]
+    if not lines:
+        return ""
+
+    for line in reversed(lines[-5:]):
+        option_match = re.fullmatch(
+            r"(?:option|choice)?\s*[:\-]?\s*([A-Z])(?:[.)])?",
+            line,
+            flags=re.IGNORECASE,
+        )
+        if option_match:
+            return option_match.group(1).upper()
+
+    return ""
 
 
 async def evaluate_with_llm_judge(

From a294dd497086f337551dc1d92b53f0959ea0838d Mon Sep 17 00:00:00 2001
From: Negiiiin <neginbaghbanzadeh@gmail.com>
Date: Tue, 14 Apr 2026 12:45:51 -0400
Subject: [PATCH 4/8] updated scripts

---
 .gitignore                                    |   6 +
 .../bizbench_local_array_eval.sh              |  86 +++++
 .../classify_static_benchmark_topics_vllm.py  | 336 ++++++++++++++++++
 .../static_benchmarks/env_slurm_inspect.sh    |  17 +-
 .../finance_book1_book2_local_array_eval.sh   | 142 ++++++++
 .../finance_book3_book4_local_array_eval.sh   | 142 ++++++++
 .../finance_book5_book6_local_array_eval.sh   | 142 ++++++++
 .../finance_math_local_array_eval.sh          |  86 +++++
 scripts/static_benchmarks/hardmath_eval.sh    |  59 ---
 scripts/static_benchmarks/harp_eval.sh        |  65 ----
 scripts/static_benchmarks/math500_eval.sh     |  64 ----
 scripts/static_benchmarks/minif2f_eval.sh     |  64 ----
 scripts/static_benchmarks/omni_math_eval.sh   |  64 ----
 scripts/static_benchmarks/orca_math_eval.sh   |  64 ----
 scripts/static_benchmarks/proofnet_eval.sh    |  64 ----
 .../run_topic_classification_qwen3_32b.sh     |  39 ++
 .../seed_tasks_allbloom_eval.sh               |  68 ----
 .../seed_tasks_no_create_eval.sh              |  68 ----
 scripts/static_benchmarks/stateval_eval.sh    |  60 ----
 .../submit_all_static_benchmarks.sh           |   9 -
 scripts/static_benchmarks/wemath_eval.sh      |  60 ----
 .../xfinbench_test_local_array_eval.sh        |  76 ++++
 src/cfg/run_cfg.yaml                          | 147 ++++----
 src/eval_stages/stage0_static_benchmarks.py   |  54 ---
 .../stage1_local_eval_execution.py            | 241 +++++++------
 src/eval_stages/static_benchmarks/hardmath.py | 119 -------
 src/eval_stages/static_benchmarks/harp.py     |  88 -----
 src/eval_stages/static_benchmarks/math500.py  |  78 ----
 src/eval_stages/static_benchmarks/minif2f.py  |  79 ----
 .../static_benchmarks/omni_math.py            |  78 ----
 .../static_benchmarks/orca_math.py            |  62 ----
 src/eval_stages/static_benchmarks/proofnet.py |  86 -----
 src/eval_stages/static_benchmarks/stateval.py |  49 ---
 .../stateval_foundational.py                  | 219 ------------
 .../static_benchmarks/stateval_research.py    |  96 -----
 src/eval_stages/static_benchmarks/wemath.py   |  95 -----
 36 files changed, 1259 insertions(+), 2013 deletions(-)
 create mode 100644 scripts/static_benchmarks/bizbench_local_array_eval.sh
 create mode 100644 scripts/static_benchmarks/classify_static_benchmark_topics_vllm.py
 create mode 100644 scripts/static_benchmarks/finance_book1_book2_local_array_eval.sh
 create mode 100644 scripts/static_benchmarks/finance_book3_book4_local_array_eval.sh
 create mode 100644 scripts/static_benchmarks/finance_book5_book6_local_array_eval.sh
 create mode 100644 scripts/static_benchmarks/finance_math_local_array_eval.sh
 delete mode 100755 scripts/static_benchmarks/hardmath_eval.sh
 delete mode 100644 scripts/static_benchmarks/harp_eval.sh
 delete mode 100755 scripts/static_benchmarks/math500_eval.sh
 delete mode 100644 scripts/static_benchmarks/minif2f_eval.sh
 delete mode 100644 scripts/static_benchmarks/omni_math_eval.sh
 delete mode 100644 scripts/static_benchmarks/orca_math_eval.sh
 delete mode 100644 scripts/static_benchmarks/proofnet_eval.sh
 create mode 100644 scripts/static_benchmarks/run_topic_classification_qwen3_32b.sh
 delete mode 100755 scripts/static_benchmarks/seed_tasks_allbloom_eval.sh
 delete mode 100755 scripts/static_benchmarks/seed_tasks_no_create_eval.sh
 delete mode 100755 scripts/static_benchmarks/stateval_eval.sh
 delete mode 100755 scripts/static_benchmarks/wemath_eval.sh
 create mode 100644 scripts/static_benchmarks/xfinbench_test_local_array_eval.sh
 delete mode 100644 src/eval_stages/static_benchmarks/hardmath.py
 delete mode 100644 src/eval_stages/static_benchmarks/harp.py
 delete mode 100644 src/eval_stages/static_benchmarks/math500.py
 delete mode 100644 src/eval_stages/static_benchmarks/minif2f.py
 delete mode 100644 src/eval_stages/static_benchmarks/omni_math.py
 delete mode 100644 src/eval_stages/static_benchmarks/orca_math.py
 delete mode 100644 src/eval_stages/static_benchmarks/proofnet.py
 delete mode 100644 src/eval_stages/static_benchmarks/stateval.py
 delete mode 100644 src/eval_stages/static_benchmarks/stateval_foundational.py
 delete mode 100644 src/eval_stages/static_benchmarks/stateval_research.py
 delete mode 100644 src/eval_stages/static_benchmarks/wemath.py

diff --git a/.gitignore b/.gitignore
index b171f569..b2a1260e 100644
--- a/.gitignore
+++ b/.gitignore
@@ -150,6 +150,12 @@ seed_tasks_results/
 # Generated experiment/evaluation artifacts
 base_output/
 base_output_tmp/
+base_output_tmp_2/
+logs_tmp/
+Finance_Book1_Book2/
+Finance_Book3_Book4/
+Finance_Book5_Book6/
+topic.csv
 
 # Local benchmark/task JSON exports
 finance_tasks.json
diff --git a/scripts/static_benchmarks/bizbench_local_array_eval.sh b/scripts/static_benchmarks/bizbench_local_array_eval.sh
new file mode 100644
index 00000000..d8cbd04b
--- /dev/null
+++ b/scripts/static_benchmarks/bizbench_local_array_eval.sh
@@ -0,0 +1,86 @@
+#!/bin/bash
+#SBATCH --job-name=gemma_bizbench_local_array
+#SBATCH --output=/projects/DeepLesion/projects/new_ace/automated_capability_evaluation/logs/bizbench_local_array_%A_%a.out
+#SBATCH --error=/projects/DeepLesion/projects/new_ace/automated_capability_evaluation/logs/bizbench_local_array_%A_%a.err
+#SBATCH --time=08:00:00
+#SBATCH --cpus-per-task=8
+#SBATCH --mem=64G
+#SBATCH --gres=gpu:a100:1
+#SBATCH --array=0-7%8
+
+set -euo pipefail
+
+cd /projects/DeepLesion/projects/new_ace/automated_capability_evaluation
+
+# shellcheck disable=SC1091
+source /projects/DeepLesion/py311_env/bin/activate
+
+# shellcheck disable=SC1091
+source "scripts/static_benchmarks/env_slurm_inspect.sh"
+
+# Allow direct execution without sbatch by defaulting to shard 0.
+: "${SLURM_ARRAY_TASK_ID:=0}"
+
+NUM_SHARDS=8
+
+# Count only rows that survive adapter filtering.
+TOTAL=$(
+python - <<'PY'
+from datasets import load_dataset
+
+ds = load_dataset("kensho/bizbench", split="test")
+
+def is_valid(row):
+    question = str(row.get("question", "")).strip()
+    answer = row.get("answer")
+    if answer is None:
+        answer_text = ""
+    elif isinstance(answer, dict):
+        for key in ("answer", "label", "text", "value"):
+            if key in answer and answer[key] is not None:
+                answer_text = str(answer[key]).strip()
+                break
+        else:
+            answer_text = str(answer).strip()
+    else:
+        answer_text = str(answer).strip()
+    return bool(question and answer_text)
+
+print(sum(1 for row in ds if is_valid(row)))
+PY
+)
+
+CHUNK=$(((TOTAL + NUM_SHARDS - 1) / NUM_SHARDS))
+OFFSET=$((SLURM_ARRAY_TASK_ID * CHUNK))
+TAG="_BIZBENCH_TEST_GEMMA_3"
+
+if [ "$OFFSET" -ge "$TOTAL" ]; then
+  echo "No work for shard ${SLURM_ARRAY_TASK_ID} (OFFSET=$OFFSET >= TOTAL=$TOTAL). Exiting."
+  exit 0
+fi
+
+echo "TOTAL=$TOTAL NUM_SHARDS=$NUM_SHARDS CHUNK=$CHUNK OFFSET=$OFFSET TAG=$TAG"
+
+# Stage 0_static: build dataset shard from BizBench test split.
+python -m src.run_eval_pipeline \
+  stage=0_static \
+  validation_tag="$TAG" \
+  +static_benchmark_cfg.benchmark_id=kensho/bizbench \
+  +static_benchmark_cfg.split=test \
+  +static_benchmark_cfg.offset="$OFFSET" \
+  +static_benchmark_cfg.limit="$CHUNK"
+
+# Stage 1_local: evaluate local subject model(s) from run_cfg.yaml.
+python -m src.run_eval_pipeline \
+  stage=1_local \
+  validation_tag="$TAG" \
+  eval_tag="$TAG"
+
+# Stage 2: aggregate per-shard scores.
+python -m src.run_eval_pipeline \
+  stage=2 \
+  eval_tag="$TAG"
+
+echo "Stage 0_static datasets: base_output/test_exp/eval/datasets/$TAG"
+echo "Stage 1_local results:  base_output/test_exp/eval/results/$TAG"
+echo "Stage 2 scores:         base_output/test_exp/eval/scores/$TAG"
diff --git a/scripts/static_benchmarks/classify_static_benchmark_topics_vllm.py b/scripts/static_benchmarks/classify_static_benchmark_topics_vllm.py
new file mode 100644
index 00000000..84d3b699
--- /dev/null
+++ b/scripts/static_benchmarks/classify_static_benchmark_topics_vllm.py
@@ -0,0 +1,336 @@
+#!/usr/bin/env python3
+"""Classify static benchmark questions into finance capabilities with vLLM.
+
+This script:
+1) Loads one static benchmark through existing Stage-0 adapters.
+2) Reads `topic.csv` and builds a high-level-area -> capabilities taxonomy.
+3) Prompts a local model (e.g., Qwen3-32B) to return only one capability.
+4) Saves outputs incrementally after every processed batch.
+"""
+
+from __future__ import annotations
+
+import argparse
+import csv
+import json
+import os
+import re
+from dataclasses import dataclass
+from pathlib import Path
+from typing import Dict, Iterable, List, Sequence, Set
+
+from src.eval_stages.stage0_static_benchmarks import _build_datasets_from_spec
+from src.eval_stages.static_benchmarks.specs import StaticBenchmarkSpec
+
+
+@dataclass(frozen=True)
+class Taxonomy:
+    """Prompt-ready taxonomy container."""
+
+    high_level_areas: List[str]
+    capabilities: List[str]
+    area_to_capabilities: Dict[str, List[str]]
+    prompt_block: str
+
+
+def _read_topic_taxonomy(topic_csv_path: Path) -> Taxonomy:
+    """Read high-level areas + capabilities from topic.csv."""
+    area_to_caps_set: Dict[str, Set[str]] = {}
+
+    # Be robust to UTF-8 BOM and leading blank/comment lines before header.
+    with topic_csv_path.open("r", encoding="utf-8-sig", newline="") as f:
+        rows = list(csv.reader(f))
+
+    required = {"High Level Area", "Capability"}
+    header_idx = None
+    header: List[str] = []
+    for idx, row in enumerate(rows):
+        normalized = [str(cell).strip() for cell in row]
+        if not any(normalized):
+            continue
+        if required.issubset(set(normalized)):
+            header_idx = idx
+            header = normalized
+            break
+
+    if header_idx is None:
+        raise ValueError(
+            f"Missing expected columns in {topic_csv_path}: {sorted(required)}"
+        )
+
+    for row in rows[header_idx + 1 :]:
+        if not row or not any(str(cell).strip() for cell in row):
+            continue
+        rec = {
+            header[i]: str(row[i]).strip() if i < len(row) else ""
+            for i in range(len(header))
+        }
+        area = rec.get("High Level Area", "").strip()
+        capability = rec.get("Capability", "").strip()
+        if not area or not capability:
+            continue
+        area_to_caps_set.setdefault(area, set()).add(capability)
+
+    if not area_to_caps_set:
+        raise ValueError(f"No usable area/capability rows found in {topic_csv_path}")
+
+    area_to_capabilities: Dict[str, List[str]] = {
+        area: sorted(caps) for area, caps in sorted(area_to_caps_set.items())
+    }
+    high_level_areas = list(area_to_capabilities.keys())
+    capabilities = sorted(
+        {cap for caps in area_to_capabilities.values() for cap in caps}
+    )
+
+    lines: List[str] = []
+    lines.append("High-level areas and their capabilities:")
+    for area in high_level_areas:
+        lines.append(f"- {area}:")
+        for cap in area_to_capabilities[area]:
+            lines.append(f"  - {cap}")
+    prompt_block = "\n".join(lines)
+
+    return Taxonomy(
+        high_level_areas=high_level_areas,
+        capabilities=capabilities,
+        area_to_capabilities=area_to_capabilities,
+        prompt_block=prompt_block,
+    )
+
+
+def _load_tasks_from_static_benchmark(
+    *,
+    benchmark_id: str,
+    split: str,
+    offset: int | None,
+    limit: int | None,
+) -> List[Dict[str, str]]:
+    """Load tasks via the same adapters used by stage=0_static."""
+    spec = StaticBenchmarkSpec(
+        benchmark_id=benchmark_id,
+        split=split,
+        offset=offset,
+        limit=limit,
+        area_id="static_benchmarks",
+        domain="finance",
+    )
+    datasets = _build_datasets_from_spec(spec)
+
+    tasks: List[Dict[str, str]] = []
+    for ds in datasets:
+        for task in ds.tasks:
+            tid = str(task.get("id", "")).strip()
+            q = str(task.get("input", "")).strip()
+            if tid and q:
+                tasks.append({"id": tid, "question": q})
+    return tasks
+
+
+def _build_prompt(question: str, taxonomy: Taxonomy) -> str:
+    """Build short-thinking classification prompt."""
+    return (
+        "You are classifying a finance question into ONE capability.\n"
+        "Think very briefly.\n"
+        "Choose exactly one capability from the provided list.\n"
+        "Do not explain.\n"
+        "Return exactly one line in this format:\n"
+        "CAPABILITY: <exact capability name>\n\n"
+        f"{taxonomy.prompt_block}\n\n"
+        "Question:\n"
+        f"{question}\n\n"
+        "Only output the final capability line."
+    )
+
+
+def _extract_capability(raw_text: str, allowed_capabilities: Sequence[str]) -> str:
+    """Map raw model output to one allowed capability, or "" if none matches.
+
+    Tries a "CAPABILITY: <name>" line first, then the whole remaining text,
+    then a substring search. All comparisons ignore case and whitespace runs.
+    """
+    def _norm(value: str) -> str:
+        return re.sub(r"\s+", " ", value).strip().lower()
+
+    allowed_map = {_norm(cap): cap for cap in allowed_capabilities}
+    text = (raw_text or "").strip()
+    m = re.search(r"(?im)^\s*CAPABILITY\s*:\s*(.+?)\s*$", text)
+    if m:
+        text = m.group(1)  # only the declared value is considered from here on
+    text_norm = _norm(text)
+    if text_norm in allowed_map:
+        return allowed_map[text_norm]
+    # Fallback: find capability mention in output.
+    # Longest-first reduces accidental partial matches.
+    for key in sorted(allowed_map, key=len, reverse=True):
+        if key in text_norm:
+            return allowed_map[key]
+    return ""
+
+
+def _batched(items: Sequence[Dict[str, str]], batch_size: int) -> Iterable[List[Dict[str, str]]]:
+    if batch_size <= 0:
+        raise ValueError("batch_size must be positive")
+    for i in range(0, len(items), batch_size):
+        yield list(items[i : i + batch_size])
+
+
+def _load_done_ids(output_jsonl: Path) -> Set[str]:
+    """Read already-processed task IDs for resume support."""
+    done: Set[str] = set()
+    if not output_jsonl.exists():
+        return done
+    with output_jsonl.open("r", encoding="utf-8") as f:
+        for line in f:
+            line = line.strip()
+            if not line:
+                continue
+            try:
+                row = json.loads(line)
+            except json.JSONDecodeError:
+                continue
+            tid = str(row.get("id", "")).strip()
+            if tid:
+                done.add(tid)
+    return done
+
+
+def run(args: argparse.Namespace) -> None:
+    """Classify pending benchmark tasks with vLLM, appending one JSONL row per task."""
+    os.environ.setdefault("VLLM_WORKER_MULTIPROC_METHOD", "spawn")  # safer default on many clusters
+
+    try:
+        from transformers import PreTrainedTokenizerBase
+        from vllm import LLM, SamplingParams
+    except Exception as exc:  # noqa: BLE001
+        raise RuntimeError("This script requires transformers and vllm.") from exc
+
+    # Compatibility shim for some tokenizer/vLLM combinations.
+    if not hasattr(PreTrainedTokenizerBase, "all_special_tokens_extended"):
+        PreTrainedTokenizerBase.all_special_tokens_extended = property(  # type: ignore[attr-defined]
+            lambda self: list(self.all_special_tokens)
+        )
+
+    topic_csv_path = Path(args.topic_csv).resolve()
+    output_jsonl = Path(args.output_jsonl).resolve()
+    output_jsonl.parent.mkdir(parents=True, exist_ok=True)
+
+    taxonomy = _read_topic_taxonomy(topic_csv_path)
+    tasks = _load_tasks_from_static_benchmark(
+        benchmark_id=args.benchmark_id,
+        split=args.split,
+        offset=args.offset,
+        limit=args.limit,
+    )
+    if not tasks:
+        raise ValueError("No tasks loaded from benchmark.")
+
+    done_ids = _load_done_ids(output_jsonl) if args.resume else set()
+    pending_tasks = [task for task in tasks if task["id"] not in done_ids]
+
+    print(
+        f"Loaded {len(tasks)} tasks; pending={len(pending_tasks)}; "
+        f"already_done={len(done_ids)}"
+    )
+    print(f"Model: {args.model_path}")
+    print(f"Benchmark: {args.benchmark_id} (split={args.split})")
+    print(f"Output: {output_jsonl}")
+
+    if not pending_tasks:
+        print("Nothing to do.")
+        return
+
+    llm = LLM(
+        model=args.model_path,
+        tokenizer=args.model_path,
+        trust_remote_code=args.trust_remote_code,
+        tensor_parallel_size=args.tensor_parallel_size,
+        gpu_memory_utilization=args.gpu_memory_utilization,
+        dtype=args.dtype,
+        max_model_len=args.max_model_len,
+    )
+    tokenizer = llm.get_tokenizer() if hasattr(llm, "get_tokenizer") else None
+
+    sampling = SamplingParams(
+        temperature=0.0,
+        top_p=1.0,
+        max_tokens=args.max_tokens,
+        repetition_penalty=1.0,
+    )
+
+    total = len(pending_tasks)
+    processed = 0
+    with output_jsonl.open("a", encoding="utf-8") as out_f:
+        for batch_idx, batch in enumerate(_batched(pending_tasks, args.batch_size), start=1):
+            prompts: List[str] = []
+            for row in batch:
+                user_prompt = _build_prompt(row["question"], taxonomy)
+                if tokenizer is not None and hasattr(tokenizer, "apply_chat_template"):
+                    text_prompt = tokenizer.apply_chat_template(
+                        [{"role": "user", "content": user_prompt}],
+                        tokenize=False,
+                        add_generation_prompt=True,
+                    )
+                else:
+                    text_prompt = user_prompt
+                prompts.append(text_prompt)
+
+            outputs = llm.generate(prompts, sampling)
+            for row, output in zip(batch, outputs, strict=True):
+                raw = output.outputs[0].text.strip() if output.outputs else ""
+                predicted = _extract_capability(raw, taxonomy.capabilities)
+                record = {
+                    "id": row["id"],
+                    "question": row["question"],
+                    "predicted_capability": predicted,
+                    "model_output_raw": raw,
+                    "model_name": Path(args.model_path).name,  # derived from --model-path
+                    "benchmark_id": args.benchmark_id,
+                    "split": args.split,
+                }
+                out_f.write(json.dumps(record, ensure_ascii=False) + "\n")
+            out_f.flush()
+
+            processed += len(batch)
+            print(
+                f"[batch {batch_idx}] wrote {len(batch)} rows | "
+                f"progress {processed}/{total}"
+            )
+
+    print("Done.")
+
+
+def build_arg_parser() -> argparse.ArgumentParser:
+    p = argparse.ArgumentParser(
+        description="Classify one static benchmark into finance capabilities with vLLM."
+    )
+    p.add_argument("--benchmark-id", required=True, help="Static benchmark ID or local JSON path.")
+    p.add_argument("--split", default="test", help="Benchmark split (default: test).")
+    p.add_argument("--offset", type=int, default=None, help="Optional benchmark offset.")
+    p.add_argument("--limit", type=int, default=None, help="Optional benchmark limit.")
+    p.add_argument(
+        "--topic-csv",
+        default="topic.csv",
+        help="Path to topic.csv with High Level Area and Capability columns.",
+    )
+    p.add_argument(
+        "--output-jsonl",
+        required=True,
+        help="Where to append classification rows (JSONL).",
+    )
+    p.add_argument("--resume", action="store_true", help="Skip IDs already in output JSONL.")
+
+    # Model / vLLM args
+    p.add_argument("--model-path", default="/model-weights/Qwen3-32B")
+    p.add_argument("--batch-size", type=int, default=16)
+    p.add_argument("--max-tokens", type=int, default=32)
+    p.add_argument("--trust-remote-code", action="store_true")
+    p.add_argument("--tensor-parallel-size", type=int, default=1)
+    p.add_argument("--gpu-memory-utilization", type=float, default=0.9)
+    p.add_argument("--dtype", default="auto")
+    p.add_argument("--max-model-len", type=int, default=8192)
+    return p
+
+
+if __name__ == "__main__":
+    parser = build_arg_parser()
+    run(parser.parse_args())
diff --git a/scripts/static_benchmarks/env_slurm_inspect.sh b/scripts/static_benchmarks/env_slurm_inspect.sh
index 3ce2fd89..65bd73c7 100644
--- a/scripts/static_benchmarks/env_slurm_inspect.sh
+++ b/scripts/static_benchmarks/env_slurm_inspect.sh
@@ -1,7 +1,16 @@
 # Sourced by *_eval.sh SLURM jobs. Puts platformdirs user_data / cache on local
 # scratch so Inspect's samplebuffer and logging are not on flaky NFS home mounts.
-if [ -n "${SLURM_TMPDIR:-}" ]; then
-  export XDG_DATA_HOME="${SLURM_TMPDIR}/inspect_xdg_data"
-  export XDG_CACHE_HOME="${SLURM_TMPDIR}/inspect_xdg_cache"
-  mkdir -p "$XDG_DATA_HOME" "$XDG_CACHE_HOME"
+# Avoid /tmp — it is often shared and fills up when vLLM writes torch compile caches.
+_SCRATCH="${SLURM_TMPDIR:-}"
+if [ -z "$_SCRATCH" ] || [ "$_SCRATCH" = "/tmp" ]; then
+  _SCRATCH="/projects/DeepLesion/tmp_cache"
 fi
+export XDG_DATA_HOME="${_SCRATCH}/inspect_xdg_data"
+export XDG_CACHE_HOME="${_SCRATCH}/inspect_xdg_cache"
+export TMPDIR="${_SCRATCH}"
+_CACHE_BASE="${_SCRATCH}/job_${SLURM_JOB_ID:-local}"
+export TORCHINDUCTOR_CACHE_DIR="${_CACHE_BASE}/torchinductor_${USER:-user}"
+export TRITON_CACHE_DIR="${_CACHE_BASE}/triton_${USER:-user}"
+mkdir -p "$XDG_DATA_HOME" "$XDG_CACHE_HOME" "$TORCHINDUCTOR_CACHE_DIR" "$TRITON_CACHE_DIR"
+unset _SCRATCH
+unset _CACHE_BASE
diff --git a/scripts/static_benchmarks/finance_book1_book2_local_array_eval.sh b/scripts/static_benchmarks/finance_book1_book2_local_array_eval.sh
new file mode 100644
index 00000000..3d5c0df0
--- /dev/null
+++ b/scripts/static_benchmarks/finance_book1_book2_local_array_eval.sh
@@ -0,0 +1,142 @@
+#!/bin/bash
+#SBATCH --job-name=gemma_book1_book2_local_array
+#SBATCH --output=/projects/DeepLesion/projects/new_ace/automated_capability_evaluation/logs/gemma_book1_book2_local_array_%A_%a.out
+#SBATCH --error=/projects/DeepLesion/projects/new_ace/automated_capability_evaluation/logs/gemma_book1_book2_local_array_%A_%a.err
+#SBATCH --time=24:00:00
+#SBATCH --cpus-per-task=8
+#SBATCH --mem=64G
+#SBATCH --gres=gpu:a100:1
+#SBATCH --array=0-7%8
+
+set -euo pipefail
+
+cd /projects/DeepLesion/projects/new_ace/automated_capability_evaluation
+
+# shellcheck disable=SC1091
+source /projects/DeepLesion/py311_env/bin/activate
+
+# shellcheck disable=SC1091
+source "scripts/static_benchmarks/env_slurm_inspect.sh"
+
+# Allow direct execution without sbatch by defaulting to shard 0.
+: "${SLURM_ARRAY_TASK_ID:=0}"
+
+NUM_SHARDS=8
+ROOT_DIR="${ROOT_DIR:-/projects/DeepLesion/projects/new_ace/automated_capability_evaluation/Finance_Book1_Book2}"
+
+export ROOT_DIR NUM_SHARDS SLURM_ARRAY_TASK_ID
+
+mapfile -t ASSIGNED_FILES < <(
+python - <<'PY'
+from pathlib import Path
+import os
+
+root = Path(os.environ["ROOT_DIR"])
+num_shards = int(os.environ["NUM_SHARDS"])
+shard_id = int(os.environ["SLURM_ARRAY_TASK_ID"])
+
+files = sorted(root.glob("**/tasks.json"))
+for idx, path in enumerate(files):
+    if idx % num_shards == shard_id:
+        print(path)
+PY
+)
+
+TOTAL_FILES=$(
+python - <<'PY'
+from pathlib import Path
+import os
+
+root = Path(os.environ["ROOT_DIR"])
+print(len(sorted(root.glob("**/tasks.json"))))
+PY
+)
+
+SHARD_FILES="${#ASSIGNED_FILES[@]}"
+echo "ROOT_DIR=$ROOT_DIR TOTAL_FILES=$TOTAL_FILES NUM_SHARDS=$NUM_SHARDS SHARD=$SLURM_ARRAY_TASK_ID ASSIGNED_FILES=$SHARD_FILES"
+
+if [ "$SHARD_FILES" -eq 0 ]; then
+  echo "No tasks.json files assigned to shard ${SLURM_ARRAY_TASK_ID}. Exiting."
+  exit 0
+fi
+
+for TASKS_JSON in "${ASSIGNED_FILES[@]}"; do
+  export TASKS_JSON
+
+  mapfile -t META < <(
+  python - <<'PY'
+import json
+import os
+import re
+from pathlib import Path
+
+path = Path(os.environ["TASKS_JSON"])
+payload = json.loads(path.read_text(encoding="utf-8"))
+tasks = payload.get("tasks", [])
+first = tasks[0] if tasks and isinstance(tasks[0], dict) else {}
+
+def clean(value: str, fallback: str) -> str:
+    value = str(value or "").strip()
+    if not value:
+        value = fallback
+    return value
+
+def slug(value: str, fallback: str) -> str:
+    value = clean(value, fallback)
+    return re.sub(r"[^a-zA-Z0-9]+", "_", value).strip("_").lower() or fallback
+
+area_dir = path.parent.parent.name
+cap_dir = path.parent.name
+
+area_id = clean(first.get("area_id"), area_dir)
+capability_id = clean(first.get("capability_id"), cap_dir)
+area_name = clean(first.get("area_name"), area_id)
+capability_name = clean(first.get("capability_name"), capability_id)
+tag_suffix = slug(path.parent.relative_to(Path(os.environ["ROOT_DIR"])).as_posix(), capability_id)
+
+print(area_id)
+print(capability_id)
+print(area_name)
+print(capability_name)
+print(tag_suffix)
+PY
+  )
+
+  AREA_ID="${META[0]}"
+  CAPABILITY_ID="${META[1]}"
+  AREA_NAME="${META[2]}"
+  CAPABILITY_NAME="${META[3]}"
+  TAG_SUFFIX="${META[4]}"
+  TAG="_FINANCE_BOOK1_BOOK2_GEMMA_3_${TAG_SUFFIX}"
+
+  echo "Evaluating $TASKS_JSON"
+  echo "  AREA_ID=$AREA_ID"
+  echo "  CAPABILITY_ID=$CAPABILITY_ID"
+  echo "  TAG=$TAG"
+
+  # Stage 0_static: ingest one local tasks.json export using the local JSON adapter.
+  python -m src.run_eval_pipeline \
+    stage=0_static \
+    validation_tag="$TAG" \
+    +static_benchmark_cfg.benchmark_id="$TASKS_JSON" \
+    +static_benchmark_cfg.area_id="$AREA_ID" \
+    +static_benchmark_cfg.capability_id="$CAPABILITY_ID" \
+    +static_benchmark_cfg.capability_name="$CAPABILITY_NAME" \
+    +static_benchmark_cfg.domain=finance
+
+  # Stage 1_local: evaluate local subject model(s) from run_cfg.yaml.
+  python -m src.run_eval_pipeline \
+    stage=1_local \
+    validation_tag="$TAG" \
+    eval_tag="$TAG"
+
+  # Stage 2: aggregate scores for this tasks.json bundle.
+  python -m src.run_eval_pipeline \
+    stage=2 \
+    eval_tag="$TAG"
+
+  echo "Finished $TASKS_JSON"
+  echo "  Stage 0_static datasets: base_output/test_exp/eval/datasets/$TAG"
+  echo "  Stage 1_local results:  base_output/test_exp/eval/results/$TAG"
+  echo "  Stage 2 scores:         base_output/test_exp/eval/scores/$TAG"
+done
diff --git a/scripts/static_benchmarks/finance_book3_book4_local_array_eval.sh b/scripts/static_benchmarks/finance_book3_book4_local_array_eval.sh
new file mode 100644
index 00000000..af7ae9e9
--- /dev/null
+++ b/scripts/static_benchmarks/finance_book3_book4_local_array_eval.sh
@@ -0,0 +1,142 @@
+#!/bin/bash
+#SBATCH --job-name=gemma_book3_book4_local_array
+#SBATCH --output=/projects/DeepLesion/projects/new_ace/automated_capability_evaluation/logs/gemma_book3_book4_local_array_%A_%a.out
+#SBATCH --error=/projects/DeepLesion/projects/new_ace/automated_capability_evaluation/logs/gemma_book3_book4_local_array_%A_%a.err
+#SBATCH --time=24:00:00
+#SBATCH --cpus-per-task=8
+#SBATCH --mem=64G
+#SBATCH --gres=gpu:a100:1
+#SBATCH --array=0-7%8
+
+set -euo pipefail
+
+cd /projects/DeepLesion/projects/new_ace/automated_capability_evaluation
+
+# shellcheck disable=SC1091
+source /projects/DeepLesion/py311_env/bin/activate
+
+# shellcheck disable=SC1091
+source "scripts/static_benchmarks/env_slurm_inspect.sh"
+
+# Allow direct execution without sbatch by defaulting to shard 0.
+: "${SLURM_ARRAY_TASK_ID:=0}"
+
+NUM_SHARDS=8
+ROOT_DIR="${ROOT_DIR:-/projects/DeepLesion/projects/new_ace/automated_capability_evaluation/Finance_Book3_Book4}"
+
+export ROOT_DIR NUM_SHARDS SLURM_ARRAY_TASK_ID
+
+mapfile -t ASSIGNED_FILES < <(
+python - <<'PY'
+from pathlib import Path
+import os
+
+root = Path(os.environ["ROOT_DIR"])
+num_shards = int(os.environ["NUM_SHARDS"])
+shard_id = int(os.environ["SLURM_ARRAY_TASK_ID"])
+
+files = sorted(root.glob("**/tasks.json"))
+for idx, path in enumerate(files):
+    if idx % num_shards == shard_id:
+        print(path)
+PY
+)
+
+TOTAL_FILES=$(
+python - <<'PY'
+from pathlib import Path
+import os
+
+root = Path(os.environ["ROOT_DIR"])
+print(len(sorted(root.glob("**/tasks.json"))))
+PY
+)
+
+SHARD_FILES="${#ASSIGNED_FILES[@]}"
+echo "ROOT_DIR=$ROOT_DIR TOTAL_FILES=$TOTAL_FILES NUM_SHARDS=$NUM_SHARDS SHARD=$SLURM_ARRAY_TASK_ID ASSIGNED_FILES=$SHARD_FILES"
+
+if [ "$SHARD_FILES" -eq 0 ]; then
+  echo "No tasks.json files assigned to shard ${SLURM_ARRAY_TASK_ID}. Exiting."
+  exit 0
+fi
+
+for TASKS_JSON in "${ASSIGNED_FILES[@]}"; do
+  export TASKS_JSON
+
+  mapfile -t META < <(
+  python - <<'PY'
+import json
+import os
+import re
+from pathlib import Path
+
+path = Path(os.environ["TASKS_JSON"])
+payload = json.loads(path.read_text(encoding="utf-8"))
+tasks = payload.get("tasks", [])
+first = tasks[0] if tasks and isinstance(tasks[0], dict) else {}
+
+def clean(value: str, fallback: str) -> str:
+    value = str(value or "").strip()
+    if not value:
+        value = fallback
+    return value
+
+def slug(value: str, fallback: str) -> str:
+    value = clean(value, fallback)
+    return re.sub(r"[^a-zA-Z0-9]+", "_", value).strip("_").lower() or fallback
+
+area_dir = path.parent.parent.name
+cap_dir = path.parent.name
+
+area_id = clean(first.get("area_id"), area_dir)
+capability_id = clean(first.get("capability_id"), cap_dir)
+area_name = clean(first.get("area_name"), area_id)
+capability_name = clean(first.get("capability_name"), capability_id)
+tag_suffix = slug(path.parent.relative_to(Path(os.environ["ROOT_DIR"])).as_posix(), capability_id)
+
+print(area_id)
+print(capability_id)
+print(area_name)
+print(capability_name)
+print(tag_suffix)
+PY
+  )
+
+  AREA_ID="${META[0]}"
+  CAPABILITY_ID="${META[1]}"
+  AREA_NAME="${META[2]}"
+  CAPABILITY_NAME="${META[3]}"
+  TAG_SUFFIX="${META[4]}"
+  TAG="_FINANCE_BOOK3_BOOK4_GEMMA_3_${TAG_SUFFIX}"
+
+  echo "Evaluating $TASKS_JSON"
+  echo "  AREA_ID=$AREA_ID"
+  echo "  CAPABILITY_ID=$CAPABILITY_ID"
+  echo "  TAG=$TAG"
+
+  # Stage 0_static: ingest one local tasks.json export using the local JSON adapter.
+  python -m src.run_eval_pipeline \
+    stage=0_static \
+    validation_tag="$TAG" \
+    +static_benchmark_cfg.benchmark_id="$TASKS_JSON" \
+    +static_benchmark_cfg.area_id="$AREA_ID" \
+    +static_benchmark_cfg.capability_id="$CAPABILITY_ID" \
+    +static_benchmark_cfg.capability_name="$CAPABILITY_NAME" \
+    +static_benchmark_cfg.domain=finance
+
+  # Stage 1_local: evaluate local subject model(s) from run_cfg.yaml.
+  python -m src.run_eval_pipeline \
+    stage=1_local \
+    validation_tag="$TAG" \
+    eval_tag="$TAG"
+
+  # Stage 2: aggregate scores for this tasks.json bundle.
+  python -m src.run_eval_pipeline \
+    stage=2 \
+    eval_tag="$TAG"
+
+  echo "Finished $TASKS_JSON"
+  echo "  Stage 0_static datasets: base_output/test_exp/eval/datasets/$TAG"
+  echo "  Stage 1_local results:  base_output/test_exp/eval/results/$TAG"
+  echo "  Stage 2 scores:         base_output/test_exp/eval/scores/$TAG"
+done
diff --git a/scripts/static_benchmarks/finance_book5_book6_local_array_eval.sh b/scripts/static_benchmarks/finance_book5_book6_local_array_eval.sh
new file mode 100644
index 00000000..3e2126e5
--- /dev/null
+++ b/scripts/static_benchmarks/finance_book5_book6_local_array_eval.sh
@@ -0,0 +1,169 @@
+#!/bin/bash
+#SBATCH --job-name=gemma_book5_book6_local_array
+#SBATCH --output=/projects/DeepLesion/projects/new_ace/automated_capability_evaluation/logs/gemma_book5_book6_local_array_%A_%a.out
+#SBATCH --error=/projects/DeepLesion/projects/new_ace/automated_capability_evaluation/logs/gemma_book5_book6_local_array_%A_%a.err
+#SBATCH --time=24:00:00
+#SBATCH --cpus-per-task=8
+#SBATCH --mem=64G
+#SBATCH --gres=gpu:a100:1
+#SBATCH --array=0-7%8
+
+# Slurm array job: every tasks.json found under ROOT_DIR is assigned to one of
+# NUM_SHARDS array tasks, and each task runs pipeline stages 0_static, 1_local
+# and 2 on its files. Keep NUM_SHARDS in sync with the --array range above.
+
+set -euo pipefail
+
+cd /projects/DeepLesion/projects/new_ace/automated_capability_evaluation
+
+# shellcheck disable=SC1091
+source /projects/DeepLesion/py311_env/bin/activate
+
+# shellcheck disable=SC1091
+source "scripts/static_benchmarks/env_slurm_inspect.sh"
+
+# Allow direct execution without sbatch by defaulting to shard 0.
+: "${SLURM_ARRAY_TASK_ID:=0}"
+
+NUM_SHARDS=8
+ROOT_DIR="${ROOT_DIR:-/projects/DeepLesion/projects/new_ace/automated_capability_evaluation/Finance_Book5_Book6}"
+
+# A missing root would glob to nothing and let every shard exit 0 silently.
+if [ ! -d "$ROOT_DIR" ]; then
+  echo "ROOT_DIR is not a directory: $ROOT_DIR" >&2
+  exit 1
+fi
+
+export ROOT_DIR NUM_SHARDS SLURM_ARRAY_TASK_ID
+
+# Single discovery pass. Line 1 of the output is the total number of files;
+# the remaining lines are this shard's files (round-robin over the sorted
+# list). A failure inside process substitution does not trip `set -e`, so the
+# count line also serves as the success marker checked below.
+mapfile -t DISCOVERY < <(
+python - <<'PY'
+from pathlib import Path
+import os
+
+root = Path(os.environ["ROOT_DIR"])
+num_shards = int(os.environ["NUM_SHARDS"])
+shard_id = int(os.environ["SLURM_ARRAY_TASK_ID"])
+
+files = sorted(root.glob("**/tasks.json"))
+print(len(files))
+for idx, path in enumerate(files):
+    if idx % num_shards == shard_id:
+        print(path)
+PY
+)
+
+if [ "${#DISCOVERY[@]}" -eq 0 ]; then
+  echo "Failed to enumerate tasks.json files under $ROOT_DIR" >&2
+  exit 1
+fi
+
+TOTAL_FILES="${DISCOVERY[0]}"
+ASSIGNED_FILES=("${DISCOVERY[@]:1}")
+SHARD_FILES="${#ASSIGNED_FILES[@]}"
+echo "ROOT_DIR=$ROOT_DIR TOTAL_FILES=$TOTAL_FILES NUM_SHARDS=$NUM_SHARDS SHARD=$SLURM_ARRAY_TASK_ID ASSIGNED_FILES=$SHARD_FILES"
+
+if [ "$SHARD_FILES" -eq 0 ]; then
+  echo "No tasks.json files assigned to shard ${SLURM_ARRAY_TASK_ID}. Exiting."
+  exit 0
+fi
+
+for TASKS_JSON in "${ASSIGNED_FILES[@]}"; do
+  export TASKS_JSON
+
+  # Read identifiers from the first task entry, falling back to directory
+  # names. Emits exactly five lines: area_id, capability_id, area_name,
+  # capability_name, tag_suffix.
+  mapfile -t META < <(
+  python - <<'PY'
+import json
+import os
+import re
+from pathlib import Path
+
+path = Path(os.environ["TASKS_JSON"])
+payload = json.loads(path.read_text(encoding="utf-8"))
+tasks = payload.get("tasks", []) if isinstance(payload, dict) else []
+if not isinstance(tasks, list):
+    tasks = []
+first = tasks[0] if tasks and isinstance(tasks[0], dict) else {}
+
+def clean(value: object, fallback: str) -> str:
+    """Return value as a stripped single-line string, or fallback if empty."""
+    # Line breaks are flattened because the shell reads one field per line.
+    text = " ".join(str(value or "").splitlines()).strip()
+    return text or fallback
+
+def slug(value: object, fallback: str) -> str:
+    """Lower-case value with non-alphanumeric runs collapsed to underscores."""
+    text = clean(value, fallback)
+    return re.sub(r"[^a-zA-Z0-9]+", "_", text).strip("_").lower() or fallback
+
+area_dir = path.parent.parent.name
+cap_dir = path.parent.name
+
+area_id = clean(first.get("area_id"), area_dir)
+capability_id = clean(first.get("capability_id"), cap_dir)
+area_name = clean(first.get("area_name"), area_id)
+capability_name = clean(first.get("capability_name"), capability_id)
+tag_suffix = slug(path.parent.relative_to(Path(os.environ["ROOT_DIR"])).as_posix(), capability_id)
+
+print(area_id)
+print(capability_id)
+print(area_name)
+print(capability_name)
+print(tag_suffix)
+PY
+  )
+
+  # Process-substitution failures bypass `set -e`; check the field count here
+  # rather than hitting an opaque unbound-variable error on the reads below.
+  if [ "${#META[@]}" -ne 5 ]; then
+    echo "Failed to read metadata from $TASKS_JSON (got ${#META[@]} of 5 fields)" >&2
+    exit 1
+  fi
+
+  AREA_ID="${META[0]}"
+  CAPABILITY_ID="${META[1]}"
+  AREA_NAME="${META[2]}"
+  CAPABILITY_NAME="${META[3]}"
+  TAG_SUFFIX="${META[4]}"
+  TAG="_FINANCE_BOOK5_BOOK6_GEMMA_3_${TAG_SUFFIX}"
+
+  echo "Evaluating $TASKS_JSON"
+  echo "  AREA_ID=$AREA_ID"
+  echo "  AREA_NAME=$AREA_NAME"
+  echo "  CAPABILITY_ID=$CAPABILITY_ID"
+  echo "  CAPABILITY_NAME=$CAPABILITY_NAME"
+  echo "  TAG=$TAG"
+
+  # Stage 0_static: ingest one local tasks.json export using the local JSON adapter.
+  python -m src.run_eval_pipeline \
+    stage=0_static \
+    validation_tag="$TAG" \
+    +static_benchmark_cfg.benchmark_id="$TASKS_JSON" \
+    +static_benchmark_cfg.area_id="$AREA_ID" \
+    +static_benchmark_cfg.capability_id="$CAPABILITY_ID" \
+    +static_benchmark_cfg.capability_name="$CAPABILITY_NAME" \
+    +static_benchmark_cfg.domain=finance
+
+  # Stage 1_local: evaluate local subject model(s) from run_cfg.yaml.
+  python -m src.run_eval_pipeline \
+    stage=1_local \
+    validation_tag="$TAG" \
+    eval_tag="$TAG"
+
+  # Stage 2: aggregate scores for this tasks.json bundle.
+  python -m src.run_eval_pipeline \
+    stage=2 \
+    eval_tag="$TAG"
+
+  echo "Finished $TASKS_JSON"
+  echo "  Stage 0_static datasets: base_output/test_exp/eval/datasets/$TAG"
+  echo "  Stage 1_local results:  base_output/test_exp/eval/results/$TAG"
+  echo "  Stage 2 scores:         base_output/test_exp/eval/scores/$TAG"
+done
diff --git a/scripts/static_benchmarks/finance_math_local_array_eval.sh b/scripts/static_benchmarks/finance_math_local_array_eval.sh
new file mode 100644
index 00000000..7b57a561
--- /dev/null
+++ b/scripts/static_benchmarks/finance_math_local_array_eval.sh
@@ -0,0 +1,103 @@
+#!/bin/bash
+#SBATCH --job-name=gemma_finance_math_local_array
+#SBATCH --output=/projects/DeepLesion/projects/new_ace/automated_capability_evaluation/logs/finance_math_local_array_%A_%a.out
+#SBATCH --error=/projects/DeepLesion/projects/new_ace/automated_capability_evaluation/logs/finance_math_local_array_%A_%a.err
+#SBATCH --time=08:00:00
+#SBATCH --cpus-per-task=8
+#SBATCH --mem=64G
+#SBATCH --gres=gpu:a100:1
+#SBATCH --array=0-7%8
+
+# Slurm array job: split the FinanceMath validation split into NUM_SHARDS
+# contiguous chunks and run pipeline stages 0_static, 1_local and 2 on the
+# chunk owned by this array task. Keep NUM_SHARDS in sync with --array above.
+
+set -euo pipefail
+
+cd /projects/DeepLesion/projects/new_ace/automated_capability_evaluation
+
+# shellcheck disable=SC1091
+source /projects/DeepLesion/py311_env/bin/activate
+
+# shellcheck disable=SC1091
+source "scripts/static_benchmarks/env_slurm_inspect.sh"
+
+# Allow direct execution without sbatch by defaulting to shard 0.
+: "${SLURM_ARRAY_TASK_ID:=0}"
+
+NUM_SHARDS=8
+
+# Count only rows that survive adapter filtering. is_valid below is a copy of
+# that filter and has to be kept in step with the adapter (NOTE(review): the
+# adapter is not visible from this script; confirm they match). `tail -n 1`
+# keeps only the count in case anything else gets printed to stdout during the
+# dataset load; `pipefail` still propagates a Python failure.
+TOTAL=$(
+python - <<'PY' | tail -n 1
+from datasets import load_dataset
+
+ds = load_dataset("yale-nlp/FinanceMath", split="validation")
+
+def is_valid(row):
+    question = str(row.get("question", "")).strip()
+    answer = row.get("ground_truth")
+    if answer is None:
+        answer_text = ""
+    elif isinstance(answer, dict):
+        for key in ("ground_truth", "value", "answer"):
+            if key in answer and answer[key] is not None:
+                answer_text = str(answer[key]).strip()
+                break
+        else:
+            answer_text = str(answer).strip()
+    else:
+        answer_text = str(answer).strip()
+    return bool(question and answer_text)
+
+print(sum(1 for row in ds if is_valid(row)))
+PY
+)
+
+# TOTAL feeds shell arithmetic below, so insist on a plain non-negative integer.
+if ! [[ "$TOTAL" =~ ^[0-9]+$ ]]; then
+  echo "Could not determine FinanceMath row count (got: '$TOTAL')" >&2
+  exit 1
+fi
+
+CHUNK=$(((TOTAL + NUM_SHARDS - 1) / NUM_SHARDS))
+OFFSET=$((SLURM_ARRAY_TASK_ID * CHUNK))
+# NOTE(review): every shard uses this same tag, so all array tasks target the
+# same dataset/results/scores directories. Confirm the pipeline keeps shards
+# apart under one tag; otherwise add a per-shard suffix.
+TAG="_FINANCE_MATH_VALIDATION_GEMMA_3"
+
+if [ "$OFFSET" -ge "$TOTAL" ]; then
+  echo "No work for shard ${SLURM_ARRAY_TASK_ID} (OFFSET=$OFFSET >= TOTAL=$TOTAL). Exiting."
+  exit 0
+fi
+
+echo "TOTAL=$TOTAL NUM_SHARDS=$NUM_SHARDS CHUNK=$CHUNK OFFSET=$OFFSET TAG=$TAG"
+
+# Stage 0_static: build dataset shard from FinanceMath validation split.
+python -m src.run_eval_pipeline \
+  stage=0_static \
+  validation_tag="$TAG" \
+  +static_benchmark_cfg.benchmark_id=yale-nlp/FinanceMath \
+  +static_benchmark_cfg.split=validation \
+  +static_benchmark_cfg.offset="$OFFSET" \
+  +static_benchmark_cfg.limit="$CHUNK"
+
+# Stage 1_local: evaluate local subject model(s) from run_cfg.yaml.
+python -m src.run_eval_pipeline \
+  stage=1_local \
+  validation_tag="$TAG" \
+  eval_tag="$TAG"
+
+# Stage 2: aggregate per-shard scores.
+python -m src.run_eval_pipeline \
+  stage=2 \
+  eval_tag="$TAG"
+
+echo "Stage 0_static datasets: base_output/test_exp/eval/datasets/$TAG"
+echo "Stage 1_local results:  base_output/test_exp/eval/results/$TAG"
+echo "Stage 2 scores:         base_output/test_exp/eval/scores/$TAG"
diff --git a/scripts/static_benchmarks/hardmath_eval.sh b/scripts/static_benchmarks/hardmath_eval.sh
deleted file mode 100755
index 864b37d5..00000000
--- a/scripts/static_benchmarks/hardmath_eval.sh
+++ /dev/null
@@ -1,59 +0,0 @@
-#!/bin/bash
-#SBATCH --job-name=hardmath_eval
-#SBATCH --output=logs/hardmath_eval_%j.out
-#SBATCH --error=logs/hardmath_eval_%j.err
-#SBATCH --time=04:00:00
-#SBATCH --cpus-per-task=4
-#SBATCH --mem=16G
-
-set -euo pipefail
-
-cd /fs01/projects/DeepLesion/projects/new_ace/automated_capability_evaluation
-
-# shellcheck disable=SC1091
-source "scripts/static_benchmarks/env_slurm_inspect.sh"
-
-VALIDATION_TAG="_HARDMATH_$(date +%Y%m%d_%H%M%S)"
-
-python -m src.run_eval_pipeline \
-  stage=0_static \
-  validation_tag="$VALIDATION_TAG" \
-  +static_benchmark_cfg.benchmark_id=HARDMath \
-  +static_benchmark_cfg.limit=500
-
-python -m src.run_eval_pipeline \
-  stage=1 \
-  validation_tag="$VALIDATION_TAG" \
-  eval_tag="$VALIDATION_TAG"
-
-python -m src.run_eval_pipeline \
-  stage=2 \
-  eval_tag="$VALIDATION_TAG"
-
-echo "Stage 0_static datasets: base_output/test_exp/eval/datasets/$VALIDATION_TAG"
-echo "Stage 1 results (Inspect logs): base_output/test_exp/eval/results/$VALIDATION_TAG"
-echo "Stage 2 scores: base_output/test_exp/eval/scores/$VALIDATION_TAG"
-
-RESULTS_DIR="base_output/test_exp/eval/results/$VALIDATION_TAG"
-if [ -d "$RESULTS_DIR" ]; then
-  echo "Flattening Inspect logs under $RESULTS_DIR ..."
-  for model_dir in "$RESULTS_DIR"/*/; do
-    [ -d "$model_dir" ] || continue
-    model_name="$(basename "$model_dir")"
-    for area_dir in "$model_dir"*/; do
-      [ -d "$area_dir" ] || continue
-      for cap_dir in "$area_dir"*/; do
-        [ -d "$cap_dir" ] || continue
-        cap_name="$(basename "$cap_dir")"
-        log_file="$(ls "$cap_dir"/*_task_*.json 2>/dev/null | head -n 1 || true)"
-        if [ -n "$log_file" ]; then
-          out_file="$cap_dir/flat_${cap_name}.jsonl"
-          python scripts/flatten_inspect_logs.py \
-            --log_path "$log_file" \
-            --out_path "$out_file"
-          echo "  Wrote flattened log for $model_name/$cap_name to $out_file"
-        fi
-      done
-    done
-  done
-fi
diff --git a/scripts/static_benchmarks/harp_eval.sh b/scripts/static_benchmarks/harp_eval.sh
deleted file mode 100644
index 329aabf1..00000000
--- a/scripts/static_benchmarks/harp_eval.sh
+++ /dev/null
@@ -1,65 +0,0 @@
-#!/bin/bash
-#SBATCH --job-name=harp_eval
-#SBATCH --output=logs/harp_eval_%j.out
-#SBATCH --error=logs/harp_eval_%j.err
-#SBATCH --time=04:00:00
-#SBATCH --cpus-per-task=4
-#SBATCH --mem=16G
-
-set -euo pipefail
-
-cd /fs01/projects/DeepLesion/projects/new_ace/automated_capability_evaluation
-
-# shellcheck disable=SC1091
-source "scripts/static_benchmarks/env_slurm_inspect.sh"
-
-VALIDATION_TAG="_HARP_$(date +%Y%m%d_%H%M%S)"
-
-# Stage 0_static: build datasets from aadityasingh/HARP (main JSONL split)
-python -m src.run_eval_pipeline \
-  stage=0_static \
-  validation_tag="$VALIDATION_TAG" \
-  +static_benchmark_cfg.benchmark_id=aadityasingh/HARP \
-  +static_benchmark_cfg.split=train \
-  +static_benchmark_cfg.limit=30
-
-# Stage 1: run subject models on the static datasets
-python -m src.run_eval_pipeline \
-  stage=1 \
-  validation_tag="$VALIDATION_TAG" \
-  eval_tag="$VALIDATION_TAG"
-
-# Stage 2: aggregate scores
-python -m src.run_eval_pipeline \
-  stage=2 \
-  eval_tag="$VALIDATION_TAG"
-
-echo "Stage 0_static datasets: base_output/test_exp/eval/datasets/$VALIDATION_TAG"
-echo "Stage 1 results (Inspect logs): base_output/test_exp/eval/results/$VALIDATION_TAG"
-echo "Stage 2 scores: base_output/test_exp/eval/scores/$VALIDATION_TAG"
-
-# Optional: generate flattened JSONL views of Inspect logs for easier reading
-RESULTS_DIR="base_output/test_exp/eval/results/$VALIDATION_TAG"
-if [ -d "$RESULTS_DIR" ]; then
-  echo "Flattening Inspect logs under $RESULTS_DIR ..."
-  for model_dir in "$RESULTS_DIR"/*/; do
-    [ -d "$model_dir" ] || continue
-    model_name="$(basename "$model_dir")"
-    for area_dir in "$model_dir"*/; do
-      [ -d "$area_dir" ] || continue
-      for cap_dir in "$area_dir"*/; do
-        [ -d "$cap_dir" ] || continue
-        cap_name="$(basename "$cap_dir")"
-        log_file="$(ls "$cap_dir"/*_task_*.json 2>/dev/null | head -n 1 || true)"
-        if [ -n "$log_file" ]; then
-          out_file="$cap_dir/flat_${cap_name}.jsonl"
-          python scripts/flatten_inspect_logs.py \
-            --log_path "$log_file" \
-            --out_path "$out_file"
-          echo "  Wrote flattened log for $model_name/$cap_name to $out_file"
-        fi
-      done
-    done
-  done
-fi
-
diff --git a/scripts/static_benchmarks/math500_eval.sh b/scripts/static_benchmarks/math500_eval.sh
deleted file mode 100755
index 2245f0bc..00000000
--- a/scripts/static_benchmarks/math500_eval.sh
+++ /dev/null
@@ -1,64 +0,0 @@
-#!/bin/bash
-#SBATCH --job-name=math500_eval
-#SBATCH --output=logs/math500_eval_%j.out
-#SBATCH --error=logs/math500_eval_%j.err
-#SBATCH --time=02:00:00
-#SBATCH --cpus-per-task=4
-#SBATCH --mem=16G
-
-set -euo pipefail
-
-cd /fs01/projects/DeepLesion/projects/new_ace/automated_capability_evaluation
-
-# shellcheck disable=SC1091
-source "scripts/static_benchmarks/env_slurm_inspect.sh"
-
-VALIDATION_TAG="_MATH500_$(date +%Y%m%d_%H%M%S)"
-
-# Stage 0_static: build datasets from HF MATH-500
-python -m src.run_eval_pipeline \
-  stage=0_static \
-  validation_tag="$VALIDATION_TAG" \
-  +static_benchmark_cfg.benchmark_id=HuggingFaceH4/MATH-500 \
-  +static_benchmark_cfg.split=test \
-  +static_benchmark_cfg.limit=30
-
-# Stage 1: run subject models on the static datasets
-python -m src.run_eval_pipeline \
-  stage=1 \
-  validation_tag="$VALIDATION_TAG" \
-  eval_tag="$VALIDATION_TAG"
-
-# Stage 2: aggregate scores
-python -m src.run_eval_pipeline \
-  stage=2 \
-  eval_tag="$VALIDATION_TAG"
-
-echo "Stage 0_static datasets: base_output/test_exp/eval/datasets/$VALIDATION_TAG"
-echo "Stage 1 results (Inspect logs): base_output/test_exp/eval/results/$VALIDATION_TAG"
-echo "Stage 2 scores: base_output/test_exp/eval/scores/$VALIDATION_TAG"
-
-# Optional: generate flattened JSONL views of Inspect logs for easier reading
-RESULTS_DIR="base_output/test_exp/eval/results/$VALIDATION_TAG"
-if [ -d "$RESULTS_DIR" ]; then
-  echo "Flattening Inspect logs under $RESULTS_DIR ..."
-  for model_dir in "$RESULTS_DIR"/*/; do
-    [ -d "$model_dir" ] || continue
-    model_name="$(basename "$model_dir")"
-    for area_dir in "$model_dir"*/; do
-      [ -d "$area_dir" ] || continue
-      for cap_dir in "$area_dir"*/; do
-        [ -d "$cap_dir" ] || continue
-        cap_name="$(basename "$cap_dir")"
-        log_file="$(ls "$cap_dir"/*_task_*.json 2>/dev/null | head -n 1 || true)"
-        if [ -n "$log_file" ]; then
-          out_file="$cap_dir/flat_${cap_name}.jsonl"
-          python scripts/flatten_inspect_logs.py \
-            --log_path "$log_file" \
-            --out_path "$out_file"
-          echo "  Wrote flattened log for $model_name/$cap_name to $out_file"
-        fi
-      done
-    done
-  done
-fi
diff --git a/scripts/static_benchmarks/minif2f_eval.sh b/scripts/static_benchmarks/minif2f_eval.sh
deleted file mode 100644
index 6985bb77..00000000
--- a/scripts/static_benchmarks/minif2f_eval.sh
+++ /dev/null
@@ -1,64 +0,0 @@
-#!/bin/bash
-#SBATCH --job-name=minif2f_eval
-#SBATCH --output=logs/minif2f_eval_%j.out
-#SBATCH --error=logs/minif2f_eval_%j.err
-#SBATCH --time=04:00:00
-#SBATCH --cpus-per-task=4
-#SBATCH --mem=16G
-
-set -euo pipefail
-
-cd /fs01/projects/DeepLesion/projects/new_ace/automated_capability_evaluation
-
-# shellcheck disable=SC1091
-source "scripts/static_benchmarks/env_slurm_inspect.sh"
-
-VALIDATION_TAG="_MINIF2F_$(date +%Y%m%d_%H%M%S)"
-
-# Stage 0_static: build datasets from Tonic/MiniF2F (train split only)
-python -m src.run_eval_pipeline \
-  stage=0_static \
-  validation_tag="$VALIDATION_TAG" \
-  +static_benchmark_cfg.benchmark_id=Tonic/MiniF2F \
-  +static_benchmark_cfg.split=train \
-  +static_benchmark_cfg.limit=30
-
-# Stage 1: run subject models on the static datasets
-python -m src.run_eval_pipeline \
-  stage=1 \
-  validation_tag="$VALIDATION_TAG" \
-  eval_tag="$VALIDATION_TAG"
-
-# Stage 2: aggregate scores
-python -m src.run_eval_pipeline \
-  stage=2 \
-  eval_tag="$VALIDATION_TAG"
-
-echo "Stage 0_static datasets: base_output/test_exp/eval/datasets/$VALIDATION_TAG"
-echo "Stage 1 results (Inspect logs): base_output/test_exp/eval/results/$VALIDATION_TAG"
-echo "Stage 2 scores: base_output/test_exp/eval/scores/$VALIDATION_TAG"
-
-# Optional: generate flattened JSONL views of Inspect logs for easier reading
-RESULTS_DIR="base_output/test_exp/eval/results/$VALIDATION_TAG"
-if [ -d "$RESULTS_DIR" ]; then
-  echo "Flattening Inspect logs under $RESULTS_DIR ..."
-  for model_dir in "$RESULTS_DIR"/*/; do
-    [ -d "$model_dir" ] || continue
-    model_name="$(basename "$model_dir")"
-    for area_dir in "$model_dir"*/; do
-      [ -d "$area_dir" ] || continue
-      for cap_dir in "$area_dir"*/; do
-        [ -d "$cap_dir" ] || continue
-        cap_name="$(basename "$cap_dir")"
-        log_file="$(ls "$cap_dir"/*_task_*.json 2>/dev/null | head -n 1 || true)"
-        if [ -n "$log_file" ]; then
-          out_file="$cap_dir/flat_${cap_name}.jsonl"
-          python scripts/flatten_inspect_logs.py \
-            --log_path "$log_file" \
-            --out_path "$out_file"
-          echo "  Wrote flattened log for $model_name/$cap_name to $out_file"
-        fi
-      done
-    done
-  done
-fi
diff --git a/scripts/static_benchmarks/omni_math_eval.sh b/scripts/static_benchmarks/omni_math_eval.sh
deleted file mode 100644
index d3b50898..00000000
--- a/scripts/static_benchmarks/omni_math_eval.sh
+++ /dev/null
@@ -1,64 +0,0 @@
-#!/bin/bash
-#SBATCH --job-name=omni_math_eval
-#SBATCH --output=logs/omni_math_eval_%j.out
-#SBATCH --error=logs/omni_math_eval_%j.err
-#SBATCH --time=04:00:00
-#SBATCH --cpus-per-task=4
-#SBATCH --mem=16G
-
-set -euo pipefail
-
-cd /fs01/projects/DeepLesion/projects/new_ace/automated_capability_evaluation
-
-# shellcheck disable=SC1091
-source "scripts/static_benchmarks/env_slurm_inspect.sh"
-
-VALIDATION_TAG="_OMNI_MATH_$(date +%Y%m%d_%H%M%S)"
-
-# Stage 0_static: build datasets from KbsdJames/Omni-MATH (test split)
-python -m src.run_eval_pipeline \
-  stage=0_static \
-  validation_tag="$VALIDATION_TAG" \
-  +static_benchmark_cfg.benchmark_id=KbsdJames/Omni-MATH \
-  +static_benchmark_cfg.split=test \
-  +static_benchmark_cfg.limit=30
-
-# Stage 1: run subject models on the static datasets
-python -m src.run_eval_pipeline \
-  stage=1 \
-  validation_tag="$VALIDATION_TAG" \
-  eval_tag="$VALIDATION_TAG"
-
-# Stage 2: aggregate scores
-python -m src.run_eval_pipeline \
-  stage=2 \
-  eval_tag="$VALIDATION_TAG"
-
-echo "Stage 0_static datasets: base_output/test_exp/eval/datasets/$VALIDATION_TAG"
-echo "Stage 1 results (Inspect logs): base_output/test_exp/eval/results/$VALIDATION_TAG"
-echo "Stage 2 scores: base_output/test_exp/eval/scores/$VALIDATION_TAG"
-
-# Optional: generate flattened JSONL views of Inspect logs for easier reading
-RESULTS_DIR="base_output/test_exp/eval/results/$VALIDATION_TAG"
-if [ -d "$RESULTS_DIR" ]; then
-  echo "Flattening Inspect logs under $RESULTS_DIR ..."
-  for model_dir in "$RESULTS_DIR"/*/; do
-    [ -d "$model_dir" ] || continue
-    model_name="$(basename "$model_dir")"
-    for area_dir in "$model_dir"*/; do
-      [ -d "$area_dir" ] || continue
-      for cap_dir in "$area_dir"*/; do
-        [ -d "$cap_dir" ] || continue
-        cap_name="$(basename "$cap_dir")"
-        log_file="$(ls "$cap_dir"/*_task_*.json 2>/dev/null | head -n 1 || true)"
-        if [ -n "$log_file" ]; then
-          out_file="$cap_dir/flat_${cap_name}.jsonl"
-          python scripts/flatten_inspect_logs.py \
-            --log_path "$log_file" \
-            --out_path "$out_file"
-          echo "  Wrote flattened log for $model_name/$cap_name to $out_file"
-        fi
-      done
-    done
-  done
-fi
diff --git a/scripts/static_benchmarks/orca_math_eval.sh b/scripts/static_benchmarks/orca_math_eval.sh
deleted file mode 100644
index 37096739..00000000
--- a/scripts/static_benchmarks/orca_math_eval.sh
+++ /dev/null
@@ -1,64 +0,0 @@
-#!/bin/bash
-#SBATCH --job-name=orca_math_eval
-#SBATCH --output=logs/orca_math_eval_%j.out
-#SBATCH --error=logs/orca_math_eval_%j.err
-#SBATCH --time=04:00:00
-#SBATCH --cpus-per-task=4
-#SBATCH --mem=16G
-
-set -euo pipefail
-
-cd /fs01/projects/DeepLesion/projects/new_ace/automated_capability_evaluation
-
-# shellcheck disable=SC1091
-source "scripts/static_benchmarks/env_slurm_inspect.sh"
-
-VALIDATION_TAG="_ORCA_MATH_$(date +%Y%m%d_%H%M%S)"
-
-# Stage 0_static: build datasets from microsoft/orca-math-word-problems-200k (train split only)
-python -m src.run_eval_pipeline \
-  stage=0_static \
-  validation_tag="$VALIDATION_TAG" \
-  +static_benchmark_cfg.benchmark_id=microsoft/orca-math-word-problems-200k \
-  +static_benchmark_cfg.split=train \
-  +static_benchmark_cfg.limit=30
-
-# Stage 1: run subject models on the static datasets
-python -m src.run_eval_pipeline \
-  stage=1 \
-  validation_tag="$VALIDATION_TAG" \
-  eval_tag="$VALIDATION_TAG"
-
-# Stage 2: aggregate scores
-python -m src.run_eval_pipeline \
-  stage=2 \
-  eval_tag="$VALIDATION_TAG"
-
-echo "Stage 0_static datasets: base_output/test_exp/eval/datasets/$VALIDATION_TAG"
-echo "Stage 1 results (Inspect logs): base_output/test_exp/eval/results/$VALIDATION_TAG"
-echo "Stage 2 scores: base_output/test_exp/eval/scores/$VALIDATION_TAG"
-
-# Optional: generate flattened JSONL views of Inspect logs for easier reading
-RESULTS_DIR="base_output/test_exp/eval/results/$VALIDATION_TAG"
-if [ -d "$RESULTS_DIR" ]; then
-  echo "Flattening Inspect logs under $RESULTS_DIR ..."
-  for model_dir in "$RESULTS_DIR"/*/; do
-    [ -d "$model_dir" ] || continue
-    model_name="$(basename "$model_dir")"
-    for area_dir in "$model_dir"*/; do
-      [ -d "$area_dir" ] || continue
-      for cap_dir in "$area_dir"*/; do
-        [ -d "$cap_dir" ] || continue
-        cap_name="$(basename "$cap_dir")"
-        log_file="$(ls "$cap_dir"/*_task_*.json 2>/dev/null | head -n 1 || true)"
-        if [ -n "$log_file" ]; then
-          out_file="$cap_dir/flat_${cap_name}.jsonl"
-          python scripts/flatten_inspect_logs.py \
-            --log_path "$log_file" \
-            --out_path "$out_file"
-          echo "  Wrote flattened log for $model_name/$cap_name to $out_file"
-        fi
-      done
-    done
-  done
-fi
diff --git a/scripts/static_benchmarks/proofnet_eval.sh b/scripts/static_benchmarks/proofnet_eval.sh
deleted file mode 100644
index b68faeb2..00000000
--- a/scripts/static_benchmarks/proofnet_eval.sh
+++ /dev/null
@@ -1,64 +0,0 @@
-#!/bin/bash
-#SBATCH --job-name=proofnet_eval
-#SBATCH --output=logs/proofnet_eval_%j.out
-#SBATCH --error=logs/proofnet_eval_%j.err
-#SBATCH --time=04:00:00
-#SBATCH --cpus-per-task=4
-#SBATCH --mem=16G
-
-set -euo pipefail
-
-cd /fs01/projects/DeepLesion/projects/new_ace/automated_capability_evaluation
-
-# shellcheck disable=SC1091
-source "scripts/static_benchmarks/env_slurm_inspect.sh"
-
-VALIDATION_TAG="_PROOFNET_$(date +%Y%m%d_%H%M%S)"
-
-# Stage 0_static: build datasets from hoskinson-center/proofnet (plain_text, validation)
-python -m src.run_eval_pipeline \
-  stage=0_static \
-  validation_tag="$VALIDATION_TAG" \
-  +static_benchmark_cfg.benchmark_id=hoskinson-center/proofnet \
-  +static_benchmark_cfg.split=validation \
-  +static_benchmark_cfg.limit=30
-
-# Stage 1: run subject models on the static datasets
-python -m src.run_eval_pipeline \
-  stage=1 \
-  validation_tag="$VALIDATION_TAG" \
-  eval_tag="$VALIDATION_TAG"
-
-# Stage 2: aggregate scores
-python -m src.run_eval_pipeline \
-  stage=2 \
-  eval_tag="$VALIDATION_TAG"
-
-echo "Stage 0_static datasets: base_output/test_exp/eval/datasets/$VALIDATION_TAG"
-echo "Stage 1 results (Inspect logs): base_output/test_exp/eval/results/$VALIDATION_TAG"
-echo "Stage 2 scores: base_output/test_exp/eval/scores/$VALIDATION_TAG"
-
-# Optional: generate flattened JSONL views of Inspect logs for easier reading
-RESULTS_DIR="base_output/test_exp/eval/results/$VALIDATION_TAG"
-if [ -d "$RESULTS_DIR" ]; then
-  echo "Flattening Inspect logs under $RESULTS_DIR ..."
-  for model_dir in "$RESULTS_DIR"/*/; do
-    [ -d "$model_dir" ] || continue
-    model_name="$(basename "$model_dir")"
-    for area_dir in "$model_dir"*/; do
-      [ -d "$area_dir" ] || continue
-      for cap_dir in "$area_dir"*/; do
-        [ -d "$cap_dir" ] || continue
-        cap_name="$(basename "$cap_dir")"
-        log_file="$(ls "$cap_dir"/*_task_*.json 2>/dev/null | head -n 1 || true)"
-        if [ -n "$log_file" ]; then
-          out_file="$cap_dir/flat_${cap_name}.jsonl"
-          python scripts/flatten_inspect_logs.py \
-            --log_path "$log_file" \
-            --out_path "$out_file"
-          echo "  Wrote flattened log for $model_name/$cap_name to $out_file"
-        fi
-      done
-    done
-  done
-fi
diff --git a/scripts/static_benchmarks/run_topic_classification_qwen3_32b.sh b/scripts/static_benchmarks/run_topic_classification_qwen3_32b.sh
new file mode 100644
index 00000000..08911670
--- /dev/null
+++ b/scripts/static_benchmarks/run_topic_classification_qwen3_32b.sh
@@ -0,0 +1,47 @@
+#!/bin/bash
+#SBATCH --job-name=topic_cls_qwen3_32b
+#SBATCH --output=logs/topic_cls_qwen3_32b_%j.out
+#SBATCH --error=logs/topic_cls_qwen3_32b_%j.err
+#SBATCH --time=12:00:00
+#SBATCH --cpus-per-task=8
+#SBATCH --mem=64G
+#SBATCH --gres=gpu:a100:1
+
+# Runs scripts/static_benchmarks/classify_static_benchmark_topics_vllm.py on
+# one benchmark split. Each setting below can be overridden through the
+# environment variable of the same name.
+
+set -euo pipefail
+
+cd /projects/DeepLesion/projects/new_ace/automated_capability_evaluation
+
+# shellcheck disable=SC1091
+source /projects/DeepLesion/py311_env/bin/activate
+
+# shellcheck disable=SC1091
+source "scripts/static_benchmarks/env_slurm_inspect.sh"
+
+# Example defaults: classify XFinBench test split.
+BENCHMARK_ID="${BENCHMARK_ID:-Zhihan/XFinBench}"
+SPLIT="${SPLIT:-test}"
+TOPIC_CSV="${TOPIC_CSV:-topic.csv}"
+OUTPUT_JSONL="${OUTPUT_JSONL:-base_output/topic_classification/xfinbench_qwen3_32b.jsonl}"
+MODEL_PATH="${MODEL_PATH:-/model-weights/Qwen3-32B}"
+BATCH_SIZE="${BATCH_SIZE:-16}"
+MAX_TOKENS="${MAX_TOKENS:-32}"
+
+# Make sure the output directory exists before the classifier writes to it.
+mkdir -p "$(dirname "$OUTPUT_JSONL")"
+
+python scripts/static_benchmarks/classify_static_benchmark_topics_vllm.py \
+  --benchmark-id "$BENCHMARK_ID" \
+  --split "$SPLIT" \
+  --topic-csv "$TOPIC_CSV" \
+  --output-jsonl "$OUTPUT_JSONL" \
+  --resume \
+  --model-path "$MODEL_PATH" \
+  --batch-size "$BATCH_SIZE" \
+  --max-tokens "$MAX_TOKENS" \
+  --trust-remote-code
+
+echo "Saved classifications to $OUTPUT_JSONL"
diff --git a/scripts/static_benchmarks/seed_tasks_allbloom_eval.sh b/scripts/static_benchmarks/seed_tasks_allbloom_eval.sh
deleted file mode 100755
index 48e8d1ea..00000000
--- a/scripts/static_benchmarks/seed_tasks_allbloom_eval.sh
+++ /dev/null
@@ -1,68 +0,0 @@
-#!/bin/bash
-#SBATCH --job-name=seed_tasks_allbloom_eval
-#SBATCH --output=logs/seed_tasks_allbloom_eval_%j.out
-#SBATCH --error=logs/seed_tasks_allbloom_eval_%j.err
-#SBATCH --time=04:00:00
-#SBATCH --cpus-per-task=4
-#SBATCH --mem=16G
-
-set -euo pipefail
-
-cd /projects/DeepLesion/projects/new_ace/automated_capability_evaluation
-
-# shellcheck disable=SC1091
-source "scripts/static_benchmarks/env_slurm_inspect.sh"
-
-VALIDATION_TAG="_SEED_TASKS_ALLBLOOMS_date_TuesdayNight_GPT_OSS_120B"
-
-# Stage 0_static: build datasets from local seed_tasks.json
-python -m src.run_eval_pipeline \
-  stage=0_static \
-  validation_tag="$VALIDATION_TAG" \
-  +static_benchmark_cfg.benchmark_id=seed_tasks.json \
-  +static_benchmark_cfg.split=na \
-  +static_benchmark_cfg.domain=finance \
-  +static_benchmark_cfg.capability_id=seed_tasks_allblooms \
-  +static_benchmark_cfg.capability_name=SeedTasksAllBlooms \
-  +static_benchmark_cfg.exclude_bloom_create=false
-
-# Stage 1: run subject models on the static datasets
-python -m src.run_eval_pipeline \
-  stage=1 \
-  validation_tag="$VALIDATION_TAG" \
-  eval_tag="$VALIDATION_TAG"
-
-# Stage 2: aggregate scores
-python -m src.run_eval_pipeline \
-  stage=2 \
-  eval_tag="$VALIDATION_TAG"
-
-echo "Stage 0_static datasets: base_output/test_exp/eval/datasets/$VALIDATION_TAG"
-echo "Stage 1 results (Inspect logs): base_output/test_exp/eval/results/$VALIDATION_TAG"
-echo "Stage 2 scores: base_output/test_exp/eval/scores/$VALIDATION_TAG"
-
-# Optional: generate flattened JSONL views of Inspect logs for easier reading
-RESULTS_DIR="base_output/test_exp/eval/results/$VALIDATION_TAG"
-if [ -d "$RESULTS_DIR" ]; then
-  echo "Flattening Inspect logs under $RESULTS_DIR ..."
-  for model_dir in "$RESULTS_DIR"/*/; do
-    [ -d "$model_dir" ] || continue
-    model_name="$(basename "$model_dir")"
-    for area_dir in "$model_dir"*/; do
-      [ -d "$area_dir" ] || continue
-      for cap_dir in "$area_dir"*/; do
-        [ -d "$cap_dir" ] || continue
-        cap_name="$(basename "$cap_dir")"
-        log_file="$(ls "$cap_dir"/*_task_*.json 2>/dev/null | head -n 1 || true)"
-        if [ -n "$log_file" ]; then
-          out_file="$cap_dir/flat_${cap_name}.jsonl"
-          python scripts/flatten_inspect_logs.py \
-            --log_path "$log_file" \
-            --out_path "$out_file"
-          echo "  Wrote flattened log for $model_name/$cap_name to $out_file"
-        fi
-      done
-    done
-  done
-fi
-
diff --git a/scripts/static_benchmarks/seed_tasks_no_create_eval.sh b/scripts/static_benchmarks/seed_tasks_no_create_eval.sh
deleted file mode 100755
index 86ab9920..00000000
--- a/scripts/static_benchmarks/seed_tasks_no_create_eval.sh
+++ /dev/null
@@ -1,68 +0,0 @@
-#!/bin/bash
-#SBATCH --job-name=seed_tasks_no_create_eval
-#SBATCH --output=logs/seed_tasks_no_create_eval_%j.out
-#SBATCH --error=logs/seed_tasks_no_create_eval_%j.err
-#SBATCH --time=04:00:00
-#SBATCH --cpus-per-task=4
-#SBATCH --mem=16G
-
-set -euo pipefail
-
-cd /projects/DeepLesion/projects/new_ace/automated_capability_evaluation
-
-# shellcheck disable=SC1091
-source "scripts/static_benchmarks/env_slurm_inspect.sh"
-
-VALIDATION_TAG="_SEED_TASKS_NO_CREATE_$(date_TuesdayNight)"
-
-# Stage 0_static: build datasets from local seed_tasks.json (exclude Create bloom level)
-python -m src.run_eval_pipeline \
-  stage=0_static \
-  validation_tag="$VALIDATION_TAG" \
-  +static_benchmark_cfg.benchmark_id=seed_tasks.json \
-  +static_benchmark_cfg.split=na \
-  +static_benchmark_cfg.domain=finance \
-  +static_benchmark_cfg.capability_id=seed_tasks_no_create \
-  +static_benchmark_cfg.capability_name=SeedTasksNoCreate \
-  +static_benchmark_cfg.exclude_bloom_create=true
-
-# Stage 1: run subject models on the static datasets
-python -m src.run_eval_pipeline \
-  stage=1 \
-  validation_tag="$VALIDATION_TAG" \
-  eval_tag="$VALIDATION_TAG"
-
-# Stage 2: aggregate scores
-python -m src.run_eval_pipeline \
-  stage=2 \
-  eval_tag="$VALIDATION_TAG"
-
-echo "Stage 0_static datasets: base_output/test_exp/eval/datasets/$VALIDATION_TAG"
-echo "Stage 1 results (Inspect logs): base_output/test_exp/eval/results/$VALIDATION_TAG"
-echo "Stage 2 scores: base_output/test_exp/eval/scores/$VALIDATION_TAG"
-
-# Optional: generate flattened JSONL views of Inspect logs for easier reading
-RESULTS_DIR="base_output/test_exp/eval/results/$VALIDATION_TAG"
-if [ -d "$RESULTS_DIR" ]; then
-  echo "Flattening Inspect logs under $RESULTS_DIR ..."
-  for model_dir in "$RESULTS_DIR"/*/; do
-    [ -d "$model_dir" ] || continue
-    model_name="$(basename "$model_dir")"
-    for area_dir in "$model_dir"*/; do
-      [ -d "$area_dir" ] || continue
-      for cap_dir in "$area_dir"*/; do
-        [ -d "$cap_dir" ] || continue
-        cap_name="$(basename "$cap_dir")"
-        log_file="$(ls "$cap_dir"/*_task_*.json 2>/dev/null | head -n 1 || true)"
-        if [ -n "$log_file" ]; then
-          out_file="$cap_dir/flat_${cap_name}.jsonl"
-          python scripts/flatten_inspect_logs.py \
-            --log_path "$log_file" \
-            --out_path "$out_file"
-          echo "  Wrote flattened log for $model_name/$cap_name to $out_file"
-        fi
-      done
-    done
-  done
-fi
-
diff --git a/scripts/static_benchmarks/stateval_eval.sh b/scripts/static_benchmarks/stateval_eval.sh
deleted file mode 100755
index d30f2178..00000000
--- a/scripts/static_benchmarks/stateval_eval.sh
+++ /dev/null
@@ -1,60 +0,0 @@
-#!/bin/bash
-#SBATCH --job-name=stateval_eval
-#SBATCH --output=logs/stateval_eval_%j.out
-#SBATCH --error=logs/stateval_eval_%j.err
-#SBATCH --time=06:00:00
-#SBATCH --cpus-per-task=4
-#SBATCH --mem=32G
-
-set -euo pipefail
-
-cd /fs01/projects/DeepLesion/projects/new_ace/automated_capability_evaluation
-
-# shellcheck disable=SC1091
-source "scripts/static_benchmarks/env_slurm_inspect.sh"
-
-VALIDATION_TAG="_STATEVAL_$(date +%Y%m%d_%H%M%S)"
-
-python -m src.run_eval_pipeline \
-  stage=0_static \
-  validation_tag="$VALIDATION_TAG" \
-  +static_benchmark_cfg.benchmark_id=StatEval \
-  +static_benchmark_cfg.split=train \
-  +static_benchmark_cfg.limit=30
-
-python -m src.run_eval_pipeline \
-  stage=1 \
-  validation_tag="$VALIDATION_TAG" \
-  eval_tag="$VALIDATION_TAG"
-
-python -m src.run_eval_pipeline \
-  stage=2 \
-  eval_tag="$VALIDATION_TAG"
-
-echo "Stage 0_static datasets: base_output/test_exp/eval/datasets/$VALIDATION_TAG"
-echo "Stage 1 results (Inspect logs): base_output/test_exp/eval/results/$VALIDATION_TAG"
-echo "Stage 2 scores: base_output/test_exp/eval/scores/$VALIDATION_TAG"
-
-RESULTS_DIR="base_output/test_exp/eval/results/$VALIDATION_TAG"
-if [ -d "$RESULTS_DIR" ]; then
-  echo "Flattening Inspect logs under $RESULTS_DIR ..."
-  for model_dir in "$RESULTS_DIR"/*/; do
-    [ -d "$model_dir" ] || continue
-    model_name="$(basename "$model_dir")"
-    for area_dir in "$model_dir"*/; do
-      [ -d "$area_dir" ] || continue
-      for cap_dir in "$area_dir"*/; do
-        [ -d "$cap_dir" ] || continue
-        cap_name="$(basename "$cap_dir")"
-        log_file="$(ls "$cap_dir"/*_task_*.json 2>/dev/null | head -n 1 || true)"
-        if [ -n "$log_file" ]; then
-          out_file="$cap_dir/flat_${cap_name}.jsonl"
-          python scripts/flatten_inspect_logs.py \
-            --log_path "$log_file" \
-            --out_path "$out_file"
-          echo "  Wrote flattened log for $model_name/$cap_name to $out_file"
-        fi
-      done
-    done
-  done
-fi
diff --git a/scripts/static_benchmarks/submit_all_static_benchmarks.sh b/scripts/static_benchmarks/submit_all_static_benchmarks.sh
index 415f0429..328d7da8 100755
--- a/scripts/static_benchmarks/submit_all_static_benchmarks.sh
+++ b/scripts/static_benchmarks/submit_all_static_benchmarks.sh
@@ -7,16 +7,7 @@ cd /fs01/projects/DeepLesion/projects/new_ace/automated_capability_evaluation
 # Ensure scripts are executable
 chmod +x scripts/static_benchmarks/*_eval.sh || true
 
-sbatch scripts/static_benchmarks/math500_eval.sh
-sbatch scripts/static_benchmarks/hardmath_eval.sh
-sbatch scripts/static_benchmarks/wemath_eval.sh
-sbatch scripts/static_benchmarks/stateval_eval.sh
-sbatch scripts/static_benchmarks/orca_math_eval.sh
-sbatch scripts/static_benchmarks/proofnet_eval.sh
-sbatch scripts/static_benchmarks/harp_eval.sh
 sbatch scripts/static_benchmarks/finance_math_eval.sh
 sbatch scripts/static_benchmarks/finance_tasks_eval.sh
 sbatch scripts/static_benchmarks/xfinbench_eval.sh
 sbatch scripts/static_benchmarks/bizbench_eval.sh
-sbatch scripts/static_benchmarks/omni_math_eval.sh
-sbatch scripts/static_benchmarks/minif2f_eval.sh
diff --git a/scripts/static_benchmarks/wemath_eval.sh b/scripts/static_benchmarks/wemath_eval.sh
deleted file mode 100755
index cfd8bfcb..00000000
--- a/scripts/static_benchmarks/wemath_eval.sh
+++ /dev/null
@@ -1,60 +0,0 @@
-#!/bin/bash
-#SBATCH --job-name=wemath_eval
-#SBATCH --output=logs/wemath_eval_%j.out
-#SBATCH --error=logs/wemath_eval_%j.err
-#SBATCH --time=04:00:00
-#SBATCH --cpus-per-task=4
-#SBATCH --mem=16G
-
-set -euo pipefail
-
-cd /fs01/projects/DeepLesion/projects/new_ace/automated_capability_evaluation
-
-# shellcheck disable=SC1091
-source "scripts/static_benchmarks/env_slurm_inspect.sh"
-
-VALIDATION_TAG="_WEMATH_$(date +%Y%m%d_%H%M%S)"
-
-python -m src.run_eval_pipeline \
-  stage=0_static \
-  validation_tag="$VALIDATION_TAG" \
-  +static_benchmark_cfg.benchmark_id=We-Math/We-Math \
-  +static_benchmark_cfg.split=testmini \
-  +static_benchmark_cfg.limit=30
-
-python -m src.run_eval_pipeline \
-  stage=1 \
-  validation_tag="$VALIDATION_TAG" \
-  eval_tag="$VALIDATION_TAG"
-
-python -m src.run_eval_pipeline \
-  stage=2 \
-  eval_tag="$VALIDATION_TAG"
-
-echo "Stage 0_static datasets: base_output/test_exp/eval/datasets/$VALIDATION_TAG"
-echo "Stage 1 results (Inspect logs): base_output/test_exp/eval/results/$VALIDATION_TAG"
-echo "Stage 2 scores: base_output/test_exp/eval/scores/$VALIDATION_TAG"
-
-RESULTS_DIR="base_output/test_exp/eval/results/$VALIDATION_TAG"
-if [ -d "$RESULTS_DIR" ]; then
-  echo "Flattening Inspect logs under $RESULTS_DIR ..."
-  for model_dir in "$RESULTS_DIR"/*/; do
-    [ -d "$model_dir" ] || continue
-    model_name="$(basename "$model_dir")"
-    for area_dir in "$model_dir"*/; do
-      [ -d "$area_dir" ] || continue
-      for cap_dir in "$area_dir"*/; do
-        [ -d "$cap_dir" ] || continue
-        cap_name="$(basename "$cap_dir")"
-        log_file="$(ls "$cap_dir"/*_task_*.json 2>/dev/null | head -n 1 || true)"
-        if [ -n "$log_file" ]; then
-          out_file="$cap_dir/flat_${cap_name}.jsonl"
-          python scripts/flatten_inspect_logs.py \
-            --log_path "$log_file" \
-            --out_path "$out_file"
-          echo "  Wrote flattened log for $model_name/$cap_name to $out_file"
-        fi
-      done
-    done
-  done
-fi
diff --git a/scripts/static_benchmarks/xfinbench_test_local_array_eval.sh b/scripts/static_benchmarks/xfinbench_test_local_array_eval.sh
new file mode 100644
index 00000000..7cc9edf2
--- /dev/null
+++ b/scripts/static_benchmarks/xfinbench_test_local_array_eval.sh
@@ -0,0 +1,76 @@
+#!/bin/bash
+#SBATCH --job-name=gemma_xfinbench_test_local_array
+#SBATCH --output=/projects/DeepLesion/projects/new_ace/automated_capability_evaluation/logs/xfinbench_test_local_array_%A_%a.out
+#SBATCH --error=/projects/DeepLesion/projects/new_ace/automated_capability_evaluation/logs/xfinbench_test_local_array_%A_%a.err
+#SBATCH --time=24:00:00
+#SBATCH --cpus-per-task=8
+#SBATCH --mem=64G
+#SBATCH --gres=gpu:a100:1
+#SBATCH --array=0-7%8
+
+set -euo pipefail
+
+cd /projects/DeepLesion/projects/new_ace/automated_capability_evaluation
+
+# shellcheck disable=SC1091
+source /projects/DeepLesion/py311_env/bin/activate
+
+# shellcheck disable=SC1091
+source "scripts/static_benchmarks/env_slurm_inspect.sh"
+
+# Allow direct execution without sbatch by defaulting to shard 0.
+: "${SLURM_ARRAY_TASK_ID:=0}"
+# Keep in sync with the --array range above: task IDs 0..NUM_SHARDS-1 together cover every row.
+NUM_SHARDS=8
+
+# Count only text-only XFinBench test rows since the adapter skips rows with figures.
+TOTAL=$(
+python - <<'PY'
+from datasets import load_dataset
+
+ds = load_dataset(
+    "Zhihan/XFinBench",
+    data_files={"validation": "validation_set.csv", "test": "test_set.csv"},
+)["test"]
+
+print(sum(1 for row in ds if row.get("figure") is None))
+PY
+)
+# Ceiling division: NUM_SHARDS chunks of CHUNK rows are enough to cover all TOTAL rows.
+CHUNK=$(((TOTAL + NUM_SHARDS - 1) / NUM_SHARDS))
+OFFSET=$((SLURM_ARRAY_TASK_ID * CHUNK))
+TAG="_XFINBENCH_TEST_GEMMA_3"
+# NOTE(review): the TAG above is the same for every array task, as is capability_id below; confirm shards do not overwrite each other's outputs.
+if [ "$OFFSET" -ge "$TOTAL" ]; then
+  echo "No work for shard ${SLURM_ARRAY_TASK_ID} (OFFSET=$OFFSET >= TOTAL=$TOTAL). Exiting."
+  exit 0
+fi
+
+echo "TOTAL=$TOTAL NUM_SHARDS=$NUM_SHARDS CHUNK=$CHUNK OFFSET=$OFFSET TAG=$TAG"
+
+# Stage 0_static: build dataset shard from the XFinBench test split.
+python -m src.run_eval_pipeline \
+  stage=0_static \
+  validation_tag="$TAG" \
+  +static_benchmark_cfg.benchmark_id=Zhihan/XFinBench \
+  +static_benchmark_cfg.split=test \
+  +static_benchmark_cfg.offset="$OFFSET" \
+  +static_benchmark_cfg.limit="$CHUNK" \
+  +static_benchmark_cfg.domain=finance \
+  +static_benchmark_cfg.capability_id=xfinbench_test \
+  +static_benchmark_cfg.capability_name=XFinBenchTest
+
+# Stage 1_local: evaluate local subject model(s) from run_cfg.yaml.
+python -m src.run_eval_pipeline \
+  stage=1_local \
+  validation_tag="$TAG" \
+  eval_tag="$TAG"
+
+# Stage 2: aggregate per-shard scores.
+python -m src.run_eval_pipeline \
+  stage=2 \
+  eval_tag="$TAG"
+
+echo "Stage 0_static datasets: base_output/test_exp/eval/datasets/$TAG"
+echo "Stage 1_local results:  base_output/test_exp/eval/results/$TAG"
+echo "Stage 2 scores:         base_output/test_exp/eval/scores/$TAG"
diff --git a/src/cfg/run_cfg.yaml b/src/cfg/run_cfg.yaml
index 2ee0861d..14433b8c 100644
--- a/src/cfg/run_cfg.yaml
+++ b/src/cfg/run_cfg.yaml
@@ -78,90 +78,75 @@ task_generation_cfg:
 eval_cfg:
   # LLMs to evaluate (required)
   subject_llms:
-    # - name: claude-haiku-4-5-20251001
-    #   provider: anthropic
-    # - name: claude-opus-4-6
-    #   provider: anthropic
-    # - name: gemini-2.5-flash-lite
-    #   provider: google
-    # - name: gemini-3.1-pro-preview
-    #   provider: google
-    # - name: gpt-4.1-mini
-    #   provider: openai
-    # - name: gpt-4.1
-    #   provider: openai
-    # - name: gpt-5.4
-    #   provider: openai
-
-    # - name: Qwen2.5-3B-Instruct
-    #   provider: openai
-    #   base_url: http://bn064:8458/v1
-
-    # - name: Qwen2.5-7B-Instruct
-    #   provider: openai
-    #   base_url: http://bn064:60530/v1
-
-    # - name: Qwen2.5-32B-Instruct
-    #   provider: openai
-    #   base_url: http://bn065:63335/v1
-
-    # - name: Meta-Llama-3.1-8B-Instruct
-    #   provider: openai
-    #   base_url: http://bn064:58891/v1
-
-
-    # - name: Qwen3-8B
-    #   provider: openai
-    #   base_url: http://bn067:45029/v1
-    - name: gpt-oss-120b
-      provider: openai
-      base_url: http://bn064:36884/v1
-    # - name: Qwen3-14B
-    #   provider: openai
-    #   base_url: http://bn075:50127/v1
-    # - name: Qwen3-32B
-    #   provider: openai
-    #   base_url: http://bn069:36543/v1
-
-    # - name: gemini-3-flash-preview
-    #   provider: google
-    # - name: gemini-3-pro-preview
-    #   provider: google
-    # - name: gpt-5-mini
-    #   provider: openai
-    # - name: gpt-5.2
-    #   provider: openai
-    # - name: claude-sonnet-4-6
-    #   provider: anthropic
-    # - name: claude-opus-4-6
-    #   provider: anthropic
-    # - name: gpt-4o
-    #   provider: openai
-    # - name: gpt-4.1-mini
-    #   provider: openai
-    # - name: gemini-2.5-flash
-    #   provider: google
-    # - name: claude-haiku-4-5-20251001
-    #   provider: anthropic
-    # - name: gemini-2.5-flash-lite
-    #   provider: google
-    # - name: gpt-3.5-turbo-0125
-    #   provider: openai
+    - name: gemma-3-12b-it
+      provider: hf_local
+      model_path: /model-weights/gemma-3-12b-it
+      inference_backend: vllm
+      trust_remote_code: true
+      gpu_memory_utilization: 0.9
+      tensor_parallel_size: 1
+      batch_size: 8
+      generation_cfg:
+        temperature: 0.0
+        max_tokens: 8192
+    - name: gemma-3-27b-it
+      provider: hf_local
+      model_path: /model-weights/gemma-3-27b-it
+      inference_backend: vllm
+      trust_remote_code: true
+      gpu_memory_utilization: 0.9
+      tensor_parallel_size: 1
+      batch_size: 4
+      generation_cfg:
+        temperature: 0.0
+        max_tokens: 8192
+    - name: qwen-3-32b
+      provider: hf_local
+      model_path: /model-weights/qwen-3-32b
+      inference_backend: vllm
+      trust_remote_code: true
+      gpu_memory_utilization: 0.9
+      tensor_parallel_size: 1
+      batch_size: 4
+      generation_cfg:
+        temperature: 0.0
+        max_tokens: 8192
+    - name: qwen-3-8b
+      provider: hf_local
+      model_path: /model-weights/qwen-3-8b
+      inference_backend: vllm
+      trust_remote_code: true
+      gpu_memory_utilization: 0.9
+      tensor_parallel_size: 1
+      batch_size: 8
+      generation_cfg:
+        temperature: 0.0
+        max_tokens: 8192
+    - name: deepseek-r1-distill-qwen-32b
+      provider: hf_local
+      model_path: /projects/DeepLesion/model_weights/deepseek-r1-distill-qwen-32b
+      inference_backend: vllm
+      trust_remote_code: true
+      gpu_memory_utilization: 0.9
+      tensor_parallel_size: 1
+      batch_size: 4
+      generation_cfg:
+        temperature: 0.0
+        max_tokens: 8192
+    - name: deepseek-r1-distill-qwen-14b
+      provider: hf_local
+      model_path: /projects/DeepLesion/model_weights/deepseek-r1-distill-qwen-14b
+      inference_backend: vllm
+      trust_remote_code: true
+      gpu_memory_utilization: 0.9
+      tensor_parallel_size: 1
+      batch_size: 8
+      generation_cfg:
+        temperature: 0.0
+        max_tokens: 8192
 
   # Judge LLM for scoring (required)
   judge_llm:
-    # name: Qwen2.5-7B-Instruct
-    # provider: hf_local
-    # model_path: /model-weights/Qwen2.5-7B-Instruct
-    # inference_backend: vllm
-    # trust_remote_code: true
-    # gpu_memory_utilization: 0.9
-    # tensor_parallel_size: 1
-    # batch_size: 64
-    # generation_cfg:
-    #   max_tokens: 8
-    #   temperature: 0.0
-
     name: gpt-4o-mini
     provider: openai
     base_url: https://api.openai.com/v1
diff --git a/src/eval_stages/stage0_static_benchmarks.py b/src/eval_stages/stage0_static_benchmarks.py
index e9b9a597..864b4461 100644
--- a/src/eval_stages/stage0_static_benchmarks.py
+++ b/src/eval_stages/stage0_static_benchmarks.py
@@ -22,36 +22,15 @@
 
 from omegaconf import DictConfig, OmegaConf
 
-from src.eval_stages.static_benchmarks.hardmath import (
-    build_eval_datasets_from_hardmath,
-)
-from src.eval_stages.static_benchmarks.math500 import build_eval_datasets_from_math500
 from src.eval_stages.static_benchmarks.mathvista import (
     build_eval_datasets_from_mathvista,
 )
-from src.eval_stages.static_benchmarks.orca_math import (
-    build_eval_datasets_from_orca_math,
-)
-from src.eval_stages.static_benchmarks.minif2f import (
-    build_eval_datasets_from_minif2f,
-)
-from src.eval_stages.static_benchmarks.omni_math import (
-    build_eval_datasets_from_omni_math,
-)
-from src.eval_stages.static_benchmarks.harp import (
-    build_eval_datasets_from_harp,
-)
 from src.eval_stages.static_benchmarks.finance_math import (
     build_eval_datasets_from_finance_math,
 )
 from src.eval_stages.static_benchmarks.bizbench import (
     build_eval_datasets_from_bizbench,
 )
-from src.eval_stages.static_benchmarks.proofnet import (
-    build_eval_datasets_from_proofnet,
-)
-from src.eval_stages.static_benchmarks.stateval import build_eval_datasets_from_stateval
-from src.eval_stages.static_benchmarks.wemath import build_eval_datasets_from_wemath
 from src.eval_stages.static_benchmarks.finance_tasks import (
     build_eval_datasets_from_finance_tasks,
 )
@@ -81,45 +60,12 @@ def _build_datasets_from_spec(spec: StaticBenchmarkSpec) -> List[EvalDataset]:
     produce multiple capabilities if desired.
     """
     bid = spec.benchmark_id.strip()
-    if bid in {"HuggingFaceH4/MATH-500", "math500", "MATH-500"}:
-        return build_eval_datasets_from_math500(spec)
-    if bid in {"HARDMath", "hardmath", "HARDMATH"}:
-        return build_eval_datasets_from_hardmath(spec)
-    if bid in {"We-Math/We-Math", "We-Math", "wemath", "WE-MATH"}:
-        return build_eval_datasets_from_wemath(spec)
     if bid in {"AI4Math/MathVista", "MathVista", "mathvista"}:
         return build_eval_datasets_from_mathvista(spec)
-    if bid in {
-        "microsoft/orca-math-word-problems-200k",
-        "orca-math-word-problems-200k",
-        "orca_math",
-        "OrcaMath",
-    }:
-        return build_eval_datasets_from_orca_math(spec)
-    if bid in {
-        "hoskinson-center/proofnet",
-        "proofnet",
-        "ProofNet",
-    }:
-        return build_eval_datasets_from_proofnet(spec)
-    if bid in {"Tonic/MiniF2F", "MiniF2F", "minif2f"}:
-        return build_eval_datasets_from_minif2f(spec)
-    if bid in {"KbsdJames/Omni-MATH", "Omni-MATH", "omni_math"}:
-        return build_eval_datasets_from_omni_math(spec)
-    if bid in {"aadityasingh/HARP", "HARP", "harp"}:
-        return build_eval_datasets_from_harp(spec)
     if bid in {"yale-nlp/FinanceMath", "FinanceMath", "finance_math"}:
         return build_eval_datasets_from_finance_math(spec)
     if bid in {"kensho/bizbench", "BizBench", "bizbench"}:
         return build_eval_datasets_from_bizbench(spec)
-    if bid in {
-        "0v01111/StatEval-Foundational-knowledge",
-        "StatEval-Foundational-knowledge",
-        "stateval_foundational",
-        "StatEval",
-        "stateval",
-    }:
-        return build_eval_datasets_from_stateval(spec)
     if bid in {"Zhihan/XFinBench", "XFinBench", "xfinbench"}:
         return build_eval_datasets_from_xfinbench(spec)
     if bid in {
diff --git a/src/eval_stages/stage1_local_eval_execution.py b/src/eval_stages/stage1_local_eval_execution.py
index bfc04162..ee024ad9 100644
--- a/src/eval_stages/stage1_local_eval_execution.py
+++ b/src/eval_stages/stage1_local_eval_execution.py
@@ -13,6 +13,7 @@
 import os
 import gc
 import asyncio
+import time
 from pathlib import Path
 from typing import Any, Dict, Iterator, List, Optional, Set, Tuple
 
@@ -38,25 +39,6 @@ def _find_datasets(datasets_dir: Path) -> List[Path]:
     return sorted(datasets_dir.rglob("dataset.json"))
 
 
-def _score_value_to_float(value: object) -> Optional[float]:
-    """Convert letter/number score to float when possible."""
-    if isinstance(value, (int, float)):
-        return float(value)
-
-    if isinstance(value, str):
-        upper = value.strip().upper()
-        if upper == "C":
-            return 1.0
-        if upper == "I":
-            return 0.0
-        try:
-            return float(value)
-        except ValueError:
-            return None
-
-    return None
-
-
 def _flat_result_path(output_dir: Path, capability_id: str) -> Path:
     return output_dir / f"flat_{capability_id}.jsonl"
 
@@ -283,6 +265,75 @@ def _load_vllm_model(model_config: Dict[str, Any]) -> Any:
     return LLM(**llm_kwargs)
 
 
+def _wait_for_vllm_startup_memory(
+    gpu_memory_utilization: float, timeout_seconds: float = 90.0
+) -> None:
+    """Wait until the target share of GPU memory is free; warn after ``timeout_seconds``."""
+    if not torch.cuda.is_available():
+        return
+    # Clamp to [0, 1] in case config has bad values; the wait threshold below is
+    # this fraction of the total memory reported by torch.cuda.mem_get_info().
+    target_util = min(max(float(gpu_memory_utilization), 0.0), 1.0)
+    deadline = time.monotonic() + timeout_seconds
+    required_gib = None
+    latest_free_gib = None
+    total_gib = None
+    # Poll every 2 s until the deadline; the values above stay None if the loop never runs.
+    while time.monotonic() < deadline:
+        free_bytes, total_bytes = torch.cuda.mem_get_info()
+        latest_free_gib = free_bytes / (1024**3)
+        total_gib = total_bytes / (1024**3)
+        required_gib = target_util * total_gib
+        if latest_free_gib >= required_gib:
+            return
+        time.sleep(2.0)
+    # Timed out: log the last reading and return so the caller proceeds anyway.
+    if required_gib is not None and latest_free_gib is not None and total_gib is not None:
+        logger.warning(
+            (
+                "Proceeding with vLLM load before target free GPU memory recovered "
+                "(free=%.2f GiB, total=%.2f GiB, required=%.2f GiB, utilization=%.2f)."
+            ),
+            latest_free_gib,
+            total_gib,
+            required_gib,
+            target_util,
+        )
+
+
+def _teardown_vllm_engine(vllm_engine: Any, model_name: str) -> None:
+    """Shut down a vLLM ``LLM`` instance and free its GPU memory.
+
+    The ``LLM`` object holds ``llm_engine`` (an ``LLMEngine``), which in turn
+    holds ``engine_core`` (a ``SyncMPClient`` / ``MPClient``).  The EngineCore
+    runs in a **separate process** that owns the actual GPU tensors, so we must
+    call ``engine_core.shutdown()`` to terminate that process — ``del`` alone
+    is not enough.  Failures during shutdown are logged as warnings, not raised;
+    ``model_name`` is used only in those log messages.
+    """
+    # 1. Graceful shutdown via the engine_core subprocess manager (best effort).
+    llm_engine = engine_core = None
+    try:
+        llm_engine = getattr(vllm_engine, "llm_engine", None)
+        engine_core = getattr(llm_engine, "engine_core", None)
+        if engine_core is not None and hasattr(engine_core, "shutdown"):
+            logger.info("  Calling engine_core.shutdown() for %s", model_name)
+            engine_core.shutdown()
+    except Exception as exc:  # noqa: BLE001
+        logger.warning("engine_core.shutdown() failed for %s: %s", model_name, exc)
+
+    # 2. Drop every reference this frame holds (the argument plus both handles
+    #    fetched above) so the GC can collect any remaining C++ handles.  The
+    #    caller must drop its own reference too; ``del`` only unbinds local names.
+    del vllm_engine, llm_engine, engine_core
+    gc.collect()
+    if torch.cuda.is_available():
+        torch.cuda.empty_cache()
+
+    # 3. Give the EngineCore subprocess time to exit and release GPU memory.
+    time.sleep(5.0)
+
+
 def _normalize_text(text: str) -> str:
     return " ".join(text.strip().split())
 
@@ -375,55 +426,6 @@ def _map_numeric_answer_to_option_letter(
     return best_letter
 
 
-def _generate_with_hf_local(
-    tokenizer: Any,
-    model: Any,
-    *,
-    sys_prompt: str,
-    user_prompt: str,
-    generation_config: Dict[str, Any],
-) -> str:
-    """Generate a response with a local HF causal LM."""
-    max_new_tokens = int(generation_config.get("max_tokens", 512))
-    temperature = float(generation_config.get("temperature", 0.0) or 0.0)
-    top_p = float(generation_config.get("top_p", 1.0) or 1.0)
-    repetition_penalty = float(generation_config.get("repetition_penalty", 1.0) or 1.0)
-    do_sample = temperature > 0
-
-    prompt = _render_text_prompt(
-        tokenizer,
-        sys_prompt=sys_prompt,
-        user_prompt=user_prompt,
-    )
-    encoded = tokenizer(prompt, return_tensors="pt")
-    input_ids = encoded["input_ids"]
-    attention_mask = encoded.get("attention_mask", torch.ones_like(input_ids))
-
-    model_device = next(model.parameters()).device
-    input_ids = input_ids.to(model_device)
-    attention_mask = attention_mask.to(model_device)
-
-    generate_kwargs = {
-        "input_ids": input_ids,
-        "attention_mask": attention_mask,
-        "max_new_tokens": max_new_tokens,
-        "do_sample": do_sample,
-        "pad_token_id": tokenizer.pad_token_id,
-        "eos_token_id": tokenizer.eos_token_id,
-        "repetition_penalty": repetition_penalty,
-    }
-    if do_sample:
-        generate_kwargs["temperature"] = temperature
-        generate_kwargs["top_p"] = top_p
-
-    with torch.inference_mode():
-        generated = model.generate(**generate_kwargs)
-
-    generated_tokens = generated[0][input_ids.shape[-1] :]
-    output_text = tokenizer.decode(generated_tokens, skip_special_tokens=True)
-    return output_text.strip()
-
-
 def _generate_batch_with_hf_local(
     tokenizer: Any,
     model: Any,
@@ -558,6 +560,30 @@ def _ordered_rows(
     ]
 
 
+def _log_running_performance(
+    *,
+    llm_name: str,
+    capability_id: str,
+    row_by_id: Dict[str, Dict[str, Any]],
+    total_tasks: int,
+) -> None:
+    """Log progress and running accuracy (fraction of recorded rows graded "C")."""
+    done = len(row_by_id)  # number of rows recorded so far
+    if done == 0:  # nothing recorded yet: report 0.0 instead of dividing by zero
+        acc = 0.0
+    else:
+        correct = sum(1 for row in row_by_id.values() if row.get("grade") == "C")
+        acc = correct / done  # rows without a "grade" key count as not correct
+    logger.info(
+        "  Progress %s/%s: %d/%d scored | running_accuracy=%.4f",
+        llm_name,
+        capability_id,
+        done,
+        total_tasks,
+        acc,
+    )
+
+
 def _judge_batch(
     rows: List[Dict[str, Any]],
     *,
@@ -600,13 +626,6 @@ def _judge_batch(
         # Give the judge only the model's final fragment to reduce noise.
         judge_prompt = _build_judge_prompt(judge_submission, target)
         unresolved_prompts.append(judge_prompt)
-        logger.info(
-            "Judge input | task_id=%s\nSubmission used for judge:\n%s\nTarget:\n%s\nFull judge prompt:\n%s",
-            str(row.get("id", "")),
-            judge_submission,
-            target,
-            judge_prompt,
-        )
 
     if unresolved_prompts:
         if judge_vllm_model is not None:
@@ -674,38 +693,9 @@ async def _one(p: str) -> str:
             strict=True,
         ):
             scored_rows[index] = {**rows[index], "grade": grade}
-        for task_id, judge_output in zip(unresolved_task_ids, judge_outputs, strict=True):
-            logger.info(
-                "Judge output | task_id=%s | raw_output=%s",
-                task_id,
-                (judge_output or "").strip(),
-            )
-
     return [row for row in scored_rows if row is not None]
 
 
-def _judge_submission(
-    submission: str,
-    target: str,
-    judge_model: Model,
-    judge_generation_cfg: Dict[str, Any],
-) -> str:
-    """Return C/I grade for a subject submission."""
-    parsed_submission = parse_submission(submission) or submission
-    if _normalize_text(parsed_submission).lower() == _normalize_text(target).lower():
-        return "C"
-
-    prompt = LLM_JUDGE_PROMPT.format(submission=parsed_submission, target=target)
-    judge_text, _ = judge_model.generate(
-        sys_prompt="You are a strict grading assistant.",
-        user_prompt=prompt,
-        generation_config=judge_generation_cfg,
-    )
-    if judge_text and judge_text.strip().lower().startswith("yes"):
-        return "C"
-    return "I"
-
-
 def run_eval_stage1_local(
     cfg: DictConfig,
     validation_tag: str,
@@ -853,6 +843,18 @@ def run_eval_stage1_local(
 
             if using_vllm:
                 if model_key not in vllm_model_instances:
+                    for old_key in list(vllm_model_instances):
+                        if old_key != model_key:
+                            logger.info(
+                                "  Tearing down previous vLLM engine %s before loading %s",
+                                old_key[1], llm_name,
+                            )
+                            old_engine = vllm_model_instances.pop(old_key, None)
+                            if old_engine is not None:
+                                _teardown_vllm_engine(old_engine, old_key[1])
+                    _wait_for_vllm_startup_memory(
+                        float(llm_config.get("gpu_memory_utilization", 0.9))
+                    )
                     logger.info("  Loading vLLM engine for %s", llm_name)
                     vllm_model_instances[model_key] = _load_vllm_model(dict(llm_config))
                 vllm_model = vllm_model_instances[model_key]
@@ -925,14 +927,12 @@ def run_eval_stage1_local(
                             gen_bar.update(len(task_batch))
 
                     # Tear down subject vLLM before starting judge vLLM
-                    try:
-                        del vllm_model_instances[model_key]
-                    except Exception:  # noqa: BLE001
-                        pass
-                    del vllm_model
-                    gc.collect()
-                    if torch.cuda.is_available():
-                        torch.cuda.empty_cache()
+                    subject_engine = vllm_model_instances.pop(model_key, None)
+                    if subject_engine is not None:
+                        _teardown_vllm_engine(subject_engine, llm_name)
+                    _wait_for_vllm_startup_memory(
+                        float(judge_llm_cfg.get("gpu_memory_utilization", 0.9))
+                    )
 
                     # Phase B: start judge vLLM and judge in batches
                     logger.info("  Loading vLLM judge (after subject generation teardown)")
@@ -962,15 +962,18 @@ def run_eval_stage1_local(
                             for scored_row in scored_batch:
                                 row_by_id[str(scored_row["id"])] = scored_row
                             _write_flat_results(flat_path, _ordered_rows(dataset.tasks, row_by_id))
+                            _log_running_performance(
+                                llm_name=llm_name,
+                                capability_id=dataset.capability_id,
+                                row_by_id=row_by_id,
+                                total_tasks=total_tasks,
+                            )
                             judge_bar.update(len(judge_batch))
 
                     # Tear down judge vLLM too
-                    del judge_vllm_model
+                    _teardown_vllm_engine(judge_vllm_model, str(judge_llm_cfg.get("name", "judge")))
                     judge_vllm_model = None
                     judge_vllm_tokenizer = None
-                    gc.collect()
-                    if torch.cuda.is_available():
-                        torch.cuda.empty_cache()
                 else:
                     # Default fast path: generate + judge streaming (can reuse subject engine as judge if same model)
                     if judge_using_vllm and using_vllm and can_reuse_subject_as_judge:
@@ -1062,6 +1065,12 @@ def run_eval_stage1_local(
                                     row_by_id[str(scored_row["id"])] = scored_row
 
                             _write_flat_results(flat_path, _ordered_rows(dataset.tasks, row_by_id))
+                            _log_running_performance(
+                                llm_name=llm_name,
+                                capability_id=dataset.capability_id,
+                                row_by_id=row_by_id,
+                                total_tasks=total_tasks,
+                            )
                             eval_bar.update(len(task_batch))
             except Exception as exc:  # noqa: BLE001
                 logger.error(
diff --git a/src/eval_stages/static_benchmarks/hardmath.py b/src/eval_stages/static_benchmarks/hardmath.py
deleted file mode 100644
index 5443cda1..00000000
--- a/src/eval_stages/static_benchmarks/hardmath.py
+++ /dev/null
@@ -1,119 +0,0 @@
-"""Adapter for the HARDMath static benchmark.
-
-Source JSON:
-- https://github.com/sarahmart/HARDMath/blob/main/data/HARDMath.json
-"""
-
-from __future__ import annotations
-
-import json
-import re
-from typing import Any, Dict, Iterable, List
-from urllib.request import urlopen
-
-from src.eval_stages.prompts import DEFAULT_EVAL_PROMPT_TEMPLATE
-from src.eval_stages.static_benchmarks.specs import StaticBenchmarkSpec
-from src.schemas.eval_schemas import EvalDataset
-
-
-HARDMATH_URL = (
-    "https://raw.githubusercontent.com/sarahmart/HARDMath/main/data/HARDMath.json"
-)
-
-
-def _slugify(text: str) -> str:
-    """Convert arbitrary strings into safe directory-friendly IDs."""
-    cleaned = re.sub(r"[^a-zA-Z0-9]+", "_", text.strip()).strip("_").lower()
-    return cleaned or "unknown"
-
-
-def _iter_hardmath_samples(limit: int | None) -> Iterable[Dict[str, Any]]:
-    """Yield rows from HARDMath JSON in a stable order."""
-    with urlopen(HARDMATH_URL) as f:
-        data = json.load(f)
-
-    # The JSON is a dict keyed by string indices ("0", "1", ...).
-    # We iterate in key-sorted order for reproducibility.
-    items = [data[k] for k in sorted(data.keys(), key=lambda x: int(x))]
-
-    if limit is not None:
-        items = items[: max(0, min(limit, len(items)))]
-
-    yield from items
-
-
-def build_eval_datasets_from_hardmath(spec: StaticBenchmarkSpec) -> List[EvalDataset]:
-    """Convert HARDMath into one EvalDataset per question_type.
-
-    We treat:
-    - domain: always "math"
-    - area_id: taken from spec.area_id (e.g. "math" or "static_benchmarks")
-    - capability_id / capability_name: derived from the "question_type" field
-      (e.g., "integral", "ODE", "polynomial_roots_corrections", ...).
-    """
-    by_qtype: Dict[str, List[Dict[str, str]]] = {}
-
-    for idx, row in enumerate(_iter_hardmath_samples(spec.limit)):
-        question = str(row.get("question", "")).strip()
-        # Use the curated LaTeX-like final answer field.
-        answer = str(row.get("answer_val", "")).strip()
-        if not question or not answer:
-            continue
-
-        qtype = str(row.get("question_type", "")).strip() or "unknown"
-        answer_type = str(row.get("answer_type", "")).strip()
-        precision = row.get("precision")
-
-        # Enrich the input with answer-type instructions so the subject model
-        # knows what form is expected.
-        extra_lines = []
-        if answer_type:
-            if answer_type == "list":
-                extra_lines.append(
-                    "Answer format: provide a Python-style list of expressions or numbers, in the order requested."
-                )
-            elif answer_type in {"integer", "float"}:
-                extra_lines.append(
-                    f"Answer format: a single {answer_type} value (no explanation in the final line)."
-                )
-            elif answer_type == "math_expression":
-                extra_lines.append(
-                    "Answer format: a single closed-form mathematical expression."
-                )
-            else:
-                extra_lines.append(f"Answer format: {answer_type}.")
-
-        if isinstance(precision, (int, float)):
-            extra_lines.append(
-                f"If the answer is numeric, round to {int(precision)} decimal places."
-            )
-
-        if extra_lines:
-            input_text = question + "\n\n" + "\n".join(extra_lines)
-        else:
-            input_text = question
-
-        task_id = f"hardmath_{idx:04d}"
-
-        by_qtype.setdefault(qtype, []).append(
-            {"id": task_id, "input": input_text, "target": answer}
-        )
-
-    datasets: List[EvalDataset] = []
-    for qtype, tasks in sorted(by_qtype.items()):
-        capability_id = _slugify(qtype)
-        capability_name = qtype
-
-        dataset = EvalDataset(
-            area_id=spec.area_id,
-            capability_id=capability_id,
-            capability_name=capability_name,
-            domain="math",
-            tasks=tasks,
-            num_tasks=len(tasks),
-            prompt_template=DEFAULT_EVAL_PROMPT_TEMPLATE,
-        )
-        datasets.append(dataset)
-
-    return datasets
-
diff --git a/src/eval_stages/static_benchmarks/harp.py b/src/eval_stages/static_benchmarks/harp.py
deleted file mode 100644
index 4d5d2161..00000000
--- a/src/eval_stages/static_benchmarks/harp.py
+++ /dev/null
@@ -1,88 +0,0 @@
-"""Adapter for the HARP static benchmark.
-
-Repository: https://github.com/aadityasingh/HARP
-
-We use the main split HARP.jsonl (short-answer questions).
-
-Fields (from README):
-- problem: problem text
-- answer: ground truth answer
-We ignore other metadata fields.
-"""
-
-from __future__ import annotations
-
-from typing import Any, Dict, Iterable, List
-
-from datasets import load_dataset
-
-from src.eval_stages.prompts import DEFAULT_EVAL_PROMPT_TEMPLATE
-from src.eval_stages.static_benchmarks.specs import StaticBenchmarkSpec
-from src.schemas.eval_schemas import EvalDataset
-
-
-def _normalize_answer(val: Any) -> str:
-    """Normalize answer to string (dataset may have mixed types)."""
-    if val is None:
-        return ""
-    if isinstance(val, dict):
-        for key in ("label", "text", "value", "answer"):
-            if key in val and val[key] is not None:
-                return str(val[key]).strip()
-        return str(val).strip()
-    return str(val).strip()
-
-
-def _iter_harp_samples(
-    split: str,
-    limit: int | None,
-) -> Iterable[Dict[str, Any]]:
-    """Yield rows from HARP.jsonl (single split).
-
-    The dataset is hosted in the GitHub repo as a JSONL file in a zip archive.
-    We load it via datasets.load_dataset with a remote URL.
-    """
-    # Main short-answer split; datasets can read compressed JSONL directly.
-    data_files = "https://github.com/aadityasingh/HARP/raw/main/HARP.jsonl.zip"
-    ds = load_dataset("json", data_files=data_files, split="train")
-
-    if limit is not None:
-        ds = ds.select(range(min(limit, len(ds))))
-    yield from ds
-
-
-def build_eval_datasets_from_harp(spec: StaticBenchmarkSpec) -> List[EvalDataset]:
-    """Convert HARP into a single EvalDataset.
-
-    - input: problem (competition math problem text)
-    - target: answer (normalized to string)
-    - domain: math
-    - capability_id: harp
-    Rows with empty problem or answer are skipped.
-    """
-    tasks: List[Dict[str, str]] = []
-
-    for idx, row in enumerate(_iter_harp_samples(spec.split, spec.limit)):
-        problem = str(row.get("problem", "")).strip()
-        raw_answer = row.get("answer")
-        answer = _normalize_answer(raw_answer)
-        if not problem or not answer:
-            continue
-
-        task_id = f"harp_{idx:05d}"
-        tasks.append({"id": task_id, "input": problem, "target": answer})
-
-    if not tasks:
-        return []
-
-    dataset = EvalDataset(
-        area_id=spec.area_id,
-        capability_id="harp",
-        capability_name="HARP",
-        domain="math",
-        tasks=tasks,
-        num_tasks=len(tasks),
-        prompt_template=DEFAULT_EVAL_PROMPT_TEMPLATE,
-    )
-    return [dataset]
-
diff --git a/src/eval_stages/static_benchmarks/math500.py b/src/eval_stages/static_benchmarks/math500.py
deleted file mode 100644
index f8303925..00000000
--- a/src/eval_stages/static_benchmarks/math500.py
+++ /dev/null
@@ -1,78 +0,0 @@
-"""Adapter for the HuggingFaceH4/MATH-500 static benchmark.
-
-Dataset card: https://huggingface.co/datasets/HuggingFaceH4/MATH-500
-"""
-
-from __future__ import annotations
-
-import re
-from typing import Any, Dict, Iterable, List
-
-from datasets import load_dataset
-
-from src.eval_stages.prompts import DEFAULT_EVAL_PROMPT_TEMPLATE
-from src.schemas.eval_schemas import EvalDataset
-from src.eval_stages.static_benchmarks.specs import StaticBenchmarkSpec
-
-
-def _slugify(text: str) -> str:
-    """Convert arbitrary strings into safe directory-friendly IDs."""
-    cleaned = re.sub(r"[^a-zA-Z0-9]+", "_", text.strip()).strip("_").lower()
-    return cleaned or "unknown"
-
-
-def _iter_math500_samples(
-    split: str,
-    limit: int | None,
-) -> Iterable[Dict[str, Any]]:
-    """Yield rows from HuggingFaceH4/MATH-500 in a stable order."""
-    ds = load_dataset("HuggingFaceH4/MATH-500", split=split)
-    if limit is not None:
-        ds = ds.select(range(min(limit, len(ds))))
-    yield from ds
-
-
-def build_eval_datasets_from_math500(spec: StaticBenchmarkSpec) -> List[EvalDataset]:
-    """Convert HF MATH-500 into one EvalDataset per subject.
-
-    We treat:
-    - domain: always "math"
-    - area_id: taken from spec.area_id (e.g. "math" or "static_benchmarks")
-    - capability_id / capability_name: derived from the dataset "subject" column
-      (Prealgebra, Algebra, Geometry, ...).
-    """
-    by_subject: Dict[str, List[Dict[str, str]]] = {}
-
-    for idx, row in enumerate(_iter_math500_samples(spec.split, spec.limit)):
-        problem = str(row.get("problem", "")).strip()
-        answer = str(row.get("answer", "")).strip()
-        unique_id = row.get("unique_id")
-        task_id = str(unique_id).strip() if unique_id else f"math500_{idx:04d}"
-        subject = str(row.get("subject", "")).strip() or "unknown"
-
-        if not problem:
-            continue
-
-        by_subject.setdefault(subject, []).append(
-            {"id": task_id, "input": problem, "target": answer}
-        )
-
-    datasets: List[EvalDataset] = []
-    for subject, tasks in sorted(by_subject.items()):
-        capability_id = _slugify(subject)
-        capability_name = subject
-
-        dataset = EvalDataset(
-            area_id=spec.area_id,
-            capability_id=capability_id,
-            capability_name=capability_name,
-            domain="math",
-            tasks=tasks,
-            num_tasks=len(tasks),
-            prompt_template=DEFAULT_EVAL_PROMPT_TEMPLATE,
-        )
-        datasets.append(dataset)
-
-    return datasets
-
-
diff --git a/src/eval_stages/static_benchmarks/minif2f.py b/src/eval_stages/static_benchmarks/minif2f.py
deleted file mode 100644
index 97f0af14..00000000
--- a/src/eval_stages/static_benchmarks/minif2f.py
+++ /dev/null
@@ -1,79 +0,0 @@
-"""Adapter for the Tonic/MiniF2F static benchmark.
-
-Dataset card: https://huggingface.co/datasets/Tonic/MiniF2F
-
-MiniF2F contains mathematical problems with informal statements (LaTeX) and
-formal Lean statements. Single split: train (488 rows).
-
-Columns: name, split, informal_prefix, formal_statement, goal, header.
-We use informal_prefix as input and formal_statement as target (autoformalization).
-"""
-
-from __future__ import annotations
-
-import re
-from typing import Any, Dict, Iterable, List
-
-from datasets import load_dataset
-
-from src.eval_stages.prompts import DEFAULT_EVAL_PROMPT_TEMPLATE
-from src.eval_stages.static_benchmarks.specs import StaticBenchmarkSpec
-from src.schemas.eval_schemas import EvalDataset
-
-
-def _slugify(text: str) -> str:
-    """Convert arbitrary strings into safe directory-friendly IDs."""
-    cleaned = re.sub(r"[^a-zA-Z0-9]+", "_", text.strip()).strip("_").lower()
-    return cleaned or "unknown"
-
-
-def _iter_minif2f_samples(
-    split: str,
-    limit: int | None,
-) -> Iterable[Dict[str, Any]]:
-    """Yield rows from Tonic/MiniF2F in order."""
-    ds = load_dataset("Tonic/MiniF2F", split=split)
-    if limit is not None:
-        ds = ds.select(range(min(limit, len(ds))))
-    yield from ds
-
-
-def build_eval_datasets_from_minif2f(spec: StaticBenchmarkSpec) -> List[EvalDataset]:
-    """Convert MiniF2F into a single EvalDataset.
-
-    - input: informal_prefix (informal mathematical statement in LaTeX)
-    - target: formal_statement (formal theorem in Lean)
-    - domain: math
-    - capability_id: minif2f
-    Rows with empty informal_prefix or formal_statement are skipped.
-    """
-    tasks: List[Dict[str, str]] = []
-    id_counts: Dict[str, int] = {}
-
-    for idx, row in enumerate(_iter_minif2f_samples(spec.split, spec.limit)):
-        informal = str(row.get("informal_prefix", "")).strip()
-        formal = str(row.get("formal_statement", "")).strip()
-        if not informal or not formal:
-            continue
-
-        raw_id = row.get("name")
-        base_id = _slugify(str(raw_id).strip()) if raw_id else f"minif2f_{idx:04d}"
-        cnt = id_counts.get(base_id, 0)
-        id_counts[base_id] = cnt + 1
-        task_id = base_id if cnt == 0 else f"{base_id}_{cnt}"
-
-        tasks.append({"id": task_id, "input": informal, "target": formal})
-
-    if not tasks:
-        return []
-
-    dataset = EvalDataset(
-        area_id=spec.area_id,
-        capability_id="minif2f",
-        capability_name="MiniF2F",
-        domain="math",
-        tasks=tasks,
-        num_tasks=len(tasks),
-        prompt_template=DEFAULT_EVAL_PROMPT_TEMPLATE,
-    )
-    return [dataset]
diff --git a/src/eval_stages/static_benchmarks/omni_math.py b/src/eval_stages/static_benchmarks/omni_math.py
deleted file mode 100644
index 786e40a3..00000000
--- a/src/eval_stages/static_benchmarks/omni_math.py
+++ /dev/null
@@ -1,78 +0,0 @@
-"""Adapter for the KbsdJames/Omni-MATH static benchmark.
-
-Dataset card: https://huggingface.co/datasets/KbsdJames/Omni-MATH
-
-Omni-MATH is an Olympiad-level math benchmark (~4.4k problems). Single split: test.
-
-Columns: domain, difficulty, problem, solution, answer, source.
-We use problem as input and answer as target.
-"""
-
-from __future__ import annotations
-
-from typing import Any, Dict, Iterable, List
-
-from datasets import load_dataset
-
-from src.eval_stages.prompts import DEFAULT_EVAL_PROMPT_TEMPLATE
-from src.eval_stages.static_benchmarks.specs import StaticBenchmarkSpec
-from src.schemas.eval_schemas import EvalDataset
-
-
-def _normalize_answer(val: Any) -> str:
-    """Normalize answer to string (dataset may have mixed types)."""
-    if val is None:
-        return ""
-    if isinstance(val, dict):
-        for key in ("label", "text", "value"):
-            if key in val and val[key] is not None:
-                return str(val[key]).strip()
-        return str(val).strip()
-    return str(val).strip()
-
-
-def _iter_omni_math_samples(
-    split: str,
-    limit: int | None,
-) -> Iterable[Dict[str, Any]]:
-    """Yield rows from KbsdJames/Omni-MATH in order."""
-    ds = load_dataset("KbsdJames/Omni-MATH", split=split)
-    if limit is not None:
-        ds = ds.select(range(min(limit, len(ds))))
-    yield from ds
-
-
-def build_eval_datasets_from_omni_math(spec: StaticBenchmarkSpec) -> List[EvalDataset]:
-    """Convert Omni-MATH into a single EvalDataset.
-
-    - input: problem (Olympiad-level math problem text)
-    - target: answer (normalized to string)
-    - domain: math
-    - capability_id: omni_math
-    Rows with empty problem or answer are skipped.
-    """
-    tasks: List[Dict[str, str]] = []
-
-    for idx, row in enumerate(_iter_omni_math_samples(spec.split, spec.limit)):
-        problem = str(row.get("problem", "")).strip()
-        raw_answer = row.get("answer")
-        answer = _normalize_answer(raw_answer)
-        if not problem or not answer:
-            continue
-
-        task_id = f"omni_math_{idx:05d}"
-        tasks.append({"id": task_id, "input": problem, "target": answer})
-
-    if not tasks:
-        return []
-
-    dataset = EvalDataset(
-        area_id=spec.area_id,
-        capability_id="omni_math",
-        capability_name="Omni-MATH",
-        domain="math",
-        tasks=tasks,
-        num_tasks=len(tasks),
-        prompt_template=DEFAULT_EVAL_PROMPT_TEMPLATE,
-    )
-    return [dataset]
diff --git a/src/eval_stages/static_benchmarks/orca_math.py b/src/eval_stages/static_benchmarks/orca_math.py
deleted file mode 100644
index cb259369..00000000
--- a/src/eval_stages/static_benchmarks/orca_math.py
+++ /dev/null
@@ -1,62 +0,0 @@
-"""Adapter for the microsoft/orca-math-word-problems-200k static benchmark.
-
-Dataset card: https://huggingface.co/datasets/microsoft/orca-math-word-problems-200k
-
-Columns: question (math word problem), answer (step-by-step solution).
-Single split: train. Use +static_benchmark_cfg.split=train when running Stage 0.
-"""
-
-from __future__ import annotations
-
-from typing import Any, Dict, Iterable, List
-
-from datasets import load_dataset
-
-from src.eval_stages.prompts import DEFAULT_EVAL_PROMPT_TEMPLATE
-from src.eval_stages.static_benchmarks.specs import StaticBenchmarkSpec
-from src.schemas.eval_schemas import EvalDataset
-
-
-def _iter_orca_math_samples(
-    split: str,
-    limit: int | None,
-) -> Iterable[Dict[str, Any]]:
-    """Yield rows from microsoft/orca-math-word-problems-200k in order."""
-    ds = load_dataset("microsoft/orca-math-word-problems-200k", split=split)
-    if limit is not None:
-        ds = ds.select(range(min(limit, len(ds))))
-    yield from ds
-
-
-def build_eval_datasets_from_orca_math(spec: StaticBenchmarkSpec) -> List[EvalDataset]:
-    """Convert Orca Math Word Problems into a single EvalDataset.
-
-    - input: question (math word problem text)
-    - target: answer (step-by-step solution from the dataset)
-    - domain: math
-    - capability_id: orca_math_word_problems
-    """
-    tasks: List[Dict[str, str]] = []
-
-    for idx, row in enumerate(_iter_orca_math_samples(spec.split, spec.limit)):
-        question = str(row.get("question", "")).strip()
-        answer = str(row.get("answer", "")).strip()
-        if not question or not answer:
-            continue
-
-        task_id = f"orca_math_{idx:06d}"
-        tasks.append({"id": task_id, "input": question, "target": answer})
-
-    if not tasks:
-        return []
-
-    dataset = EvalDataset(
-        area_id=spec.area_id,
-        capability_id="orca_math_word_problems",
-        capability_name="Orca Math Word Problems",
-        domain="math",
-        tasks=tasks,
-        num_tasks=len(tasks),
-        prompt_template=DEFAULT_EVAL_PROMPT_TEMPLATE,
-    )
-    return [dataset]
diff --git a/src/eval_stages/static_benchmarks/proofnet.py b/src/eval_stages/static_benchmarks/proofnet.py
deleted file mode 100644
index bf28642f..00000000
--- a/src/eval_stages/static_benchmarks/proofnet.py
+++ /dev/null
@@ -1,86 +0,0 @@
-"""Adapter for the hoskinson-center/proofnet static benchmark.
-
-Dataset card: https://huggingface.co/datasets/hoskinson-center/proofnet
-
-ProofNet is a benchmark for autoformalization and formal proving of undergraduate
-mathematics. Uses the "plain_text" config. Splits: validation (185), test (186).
-
-Columns: id, nl_statement (natural language theorem), nl_proof (natural language
-proof in LaTeX), formal_statement (Lean 3), src_header.
-We use nl_statement as input and nl_proof as target.
-"""
-
-from __future__ import annotations
-
-import re
-from typing import Any, Dict, Iterable, List
-
-from datasets import load_dataset
-
-from src.eval_stages.prompts import DEFAULT_EVAL_PROMPT_TEMPLATE
-from src.eval_stages.static_benchmarks.specs import StaticBenchmarkSpec
-from src.schemas.eval_schemas import EvalDataset
-
-
-def _slugify(text: str) -> str:
-    """Convert arbitrary strings into safe directory-friendly IDs."""
-    cleaned = re.sub(r"[^a-zA-Z0-9]+", "_", text.strip()).strip("_").lower()
-    return cleaned or "unknown"
-
-
-def _iter_proofnet_samples(
-    split: str,
-    limit: int | None,
-) -> Iterable[Dict[str, Any]]:
-    """Yield rows from hoskinson-center/proofnet plain_text in order."""
-    ds = load_dataset(
-        "hoskinson-center/proofnet",
-        "plain_text",
-        split=split,
-    )
-    if limit is not None:
-        ds = ds.select(range(min(limit, len(ds))))
-    yield from ds
-
-
-def build_eval_datasets_from_proofnet(spec: StaticBenchmarkSpec) -> List[EvalDataset]:
-    """Convert ProofNet into a single EvalDataset.
-
-    - input: nl_statement (natural language theorem statement)
-    - target: nl_proof (natural language proof)
-    - domain: math
-    - capability_id: proofnet
-    Rows with empty nl_proof are skipped.
-    """
-    tasks: List[Dict[str, str]] = []
-    id_counts: Dict[str, int] = {}
-
-    for idx, row in enumerate(_iter_proofnet_samples(spec.split, spec.limit)):
-        nl_statement = str(row.get("nl_statement", "")).strip()
-        nl_proof = str(row.get("nl_proof", "")).strip()
-        if not nl_statement or not nl_proof:
-            continue
-
-        raw_id = row.get("id")
-        base_id = _slugify(str(raw_id).strip()) if raw_id else f"proofnet_{idx:04d}"
-        cnt = id_counts.get(base_id, 0)
-        id_counts[base_id] = cnt + 1
-        task_id = base_id if cnt == 0 else f"{base_id}_{cnt}"
-
-        tasks.append(
-            {"id": task_id, "input": nl_statement, "target": nl_proof}
-        )
-
-    if not tasks:
-        return []
-
-    dataset = EvalDataset(
-        area_id=spec.area_id,
-        capability_id="proofnet",
-        capability_name="ProofNet",
-        domain="math",
-        tasks=tasks,
-        num_tasks=len(tasks),
-        prompt_template=DEFAULT_EVAL_PROMPT_TEMPLATE,
-    )
-    return [dataset]
diff --git a/src/eval_stages/static_benchmarks/stateval.py b/src/eval_stages/static_benchmarks/stateval.py
deleted file mode 100644
index 8510335d..00000000
--- a/src/eval_stages/static_benchmarks/stateval.py
+++ /dev/null
@@ -1,49 +0,0 @@
-"""Unified StatEval benchmark: one benchmark, domain math, two areas.
-
-StatEval has two subsets:
-- Foundational Knowledge Dataset (0v01111/StatEval-Foundational-knowledge)
-- Statistical Research Dataset (0v01111/StatEval-Statistical-Research)
-
-This module exposes a single benchmark_id (StatEval / stateval) that loads
-both and produces two capabilities under one area "stateval" with domain "math":
-- foundational_knowledge
-- statistical_research
-"""
-
-from __future__ import annotations
-
-from typing import List
-
-from src.eval_stages.static_benchmarks.specs import StaticBenchmarkSpec
-from src.eval_stages.static_benchmarks.stateval_foundational import (
-    build_eval_datasets_from_stateval_foundational,
-)
-from src.eval_stages.static_benchmarks.stateval_research import (
-    build_eval_datasets_from_stateval_research,
-)
-from src.schemas.eval_schemas import EvalDataset
-
-STATEVAL_AREA_ID = "stateval"
-STATEVAL_DOMAIN = "math"
-
-
-def build_eval_datasets_from_stateval(spec: StaticBenchmarkSpec) -> List[EvalDataset]:
-    """Build EvalDatasets for both StatEval subsets with domain=math, area=stateval.
-
-    Returns two datasets: Foundational Knowledge and Statistical Research.
-    Uses the same split and limit from spec for each subset.
-    """
-    stateval_spec = StaticBenchmarkSpec(
-        benchmark_id=spec.benchmark_id,
-        split=spec.split,
-        limit=spec.limit,
-        area_id=STATEVAL_AREA_ID,
-        capability_id=spec.capability_id,
-        capability_name=spec.capability_name,
-        domain=STATEVAL_DOMAIN,
-    )
-
-    foundational = build_eval_datasets_from_stateval_foundational(stateval_spec)
-    research = build_eval_datasets_from_stateval_research(stateval_spec)
-
-    return foundational + research
diff --git a/src/eval_stages/static_benchmarks/stateval_foundational.py b/src/eval_stages/static_benchmarks/stateval_foundational.py
deleted file mode 100644
index 6e076877..00000000
--- a/src/eval_stages/static_benchmarks/stateval_foundational.py
+++ /dev/null
@@ -1,219 +0,0 @@
-"""Adapter for the StatEval Foundational Knowledge static benchmark.
-
-Primary dataset:
-- https://huggingface.co/datasets/0v01111/StatEval-Foundational-knowledge
-
-The upstream JSON has mixed types for the "answer" column (string vs object),
-which breaks the default HuggingFace loader. We try load_dataset first, then
-fall back to downloading raw JSON and parsing with per-row normalization.
-"""
-
-from __future__ import annotations
-
-import json
-import re
-from pathlib import Path
-from typing import Any, Dict, Iterable, List
-
-from datasets import load_dataset
-from datasets.exceptions import DatasetGenerationError
-from huggingface_hub import hf_hub_download, list_repo_files
-
-from src.eval_stages.prompts import DEFAULT_EVAL_PROMPT_TEMPLATE
-from src.eval_stages.static_benchmarks.specs import StaticBenchmarkSpec
-from src.schemas.eval_schemas import EvalDataset
-
-REPO_ID = "0v01111/StatEval-Foundational-knowledge"
-
-
-def _slugify(text: str) -> str:
-    """Convert arbitrary strings into safe directory-friendly IDs."""
-    cleaned = re.sub(r"[^a-zA-Z0-9]+", "_", text.strip()).strip("_").lower()
-    return cleaned or "unknown"
-
-
-def _normalize_answer_in_row(row: Dict[str, Any]) -> None:
-    """In-place: ensure row['answer'] is a string (upstream has mixed types)."""
-    a = row.get("answer")
-    if a is None:
-        row["answer"] = ""
-        return
-    if isinstance(a, dict):
-        for key in ("label", "text", "final", "value"):
-            if key in a and a[key] is not None:
-                row["answer"] = str(a[key]).strip()
-                return
-        row["answer"] = str(a).strip()
-        return
-    row["answer"] = str(a).strip()
-
-
-def _load_foundational_raw(split: str, limit: int | None) -> Iterable[Dict[str, Any]]:
-    """Load repo JSON manually and yield rows with normalized 'answer'.
-
-    Used when load_dataset fails due to mixed column types in the upstream data.
-    """
-    files = list_repo_files(REPO_ID, repo_type="dataset")
-    # Prefer the canonical Foundational-knowledge.jsonl if present.
-    filename = None
-    for cand in files:
-        if "Foundational-knowledge" in cand:
-            filename = cand
-            break
-    if filename is None:
-        # Fallback: any JSON/JSONL file that mentions the split, else any JSON/JSONL.
-        json_files = [
-            f for f in files if (f.endswith(".json") or f.endswith(".jsonl")) and split in f
-        ]
-        if not json_files:
-            json_files = [f for f in files if f.endswith(".json") or f.endswith(".jsonl")]
-        if not json_files:
-            return
-        filename = json_files[0]
-
-    path = hf_hub_download(
-        repo_id=REPO_ID,
-        filename=filename,
-        repo_type="dataset",
-    )
-    text = Path(path).read_text(encoding="utf-8", errors="replace")
-    rows: List[Dict[str, Any]] = []
-    try:
-        data = json.loads(text)
-        if isinstance(data, list):
-            rows = data
-        elif isinstance(data, dict) and "data" in data:
-            rows = data["data"]
-        elif isinstance(data, dict) and data:
-            keys = sorted(
-                data.keys(),
-                key=lambda k: int(k) if isinstance(k, str) and k.isdigit() else k,
-            )
-            rows = [data[k] for k in keys if isinstance(data[k], dict)]
-    except (json.JSONDecodeError, TypeError, ValueError):
-        for line in text.splitlines():
-            line = line.strip()
-            if not line:
-                continue
-            try:
-                obj = json.loads(line)
-                if isinstance(obj, dict):
-                    rows.append(obj)
-                elif isinstance(obj, list):
-                    rows.extend(obj)
-            except json.JSONDecodeError:
-                continue
-    for i, row in enumerate(rows):
-        if limit is not None and i >= limit:
-            break
-        if not isinstance(row, dict):
-            continue
-        _normalize_answer_in_row(row)
-        yield row
-
-
-def _iter_stateval_foundational_samples(
-    split: str,
-    limit: int | None,
-) -> Iterable[Dict[str, Any]]:
-    """Yield rows from StatEval-Foundational-knowledge in a stable order.
-
-    Always loads raw JSON from the Hub and normalizes 'answer', avoiding
-    mixed-type issues in the official JSONL that break load_dataset.
-    """
-    yield from _load_foundational_raw(split, limit)
-
-
-def _extract_question_and_options(row: Dict[str, Any]) -> str:
-    """Build a human-readable question string from a StatEval row.
-
-    We try several likely field names for the question stem and options to
-    make the adapter robust to minor schema variations.
-    """
-    stem = (
-        str(
-            row.get("question")
-            or row.get("prompt")
-            or row.get("task")
-            or row.get("problem")
-            or ""
-        ).strip()
-    )
-
-    options = row.get("options") or row.get("choices") or row.get("mc_options")
-    options_text = ""
-    if isinstance(options, list) and options:
-        labeled = []
-        for i, opt in enumerate(options):
-            label = chr(ord("A") + i)
-            labeled.append(f"{label}. {str(opt).strip()}")
-        options_text = " ".join(labeled)
-    elif isinstance(options, str) and options.strip():
-        options_text = options.strip()
-
-    if options_text:
-        return f"{stem}\n\nOptions: {options_text}"
-    return stem
-
-
-def _extract_answer(row: Dict[str, Any]) -> str:
-    """Extract a compact target answer string from a StatEval row.
-
-    The 'answer' field can be heterogeneous (string or object). We normalize
-    it into a single string that can be used as target text.
-    """
-    answer = row.get("answer")
-
-    # If answer is a mapping, look for common keys first.
-    if isinstance(answer, dict):
-        for key in ("label", "text", "final", "value"):
-            if key in answer and answer[key] is not None:
-                return str(answer[key]).strip()
-        # Fallback: stringify the whole object.
-        return str(answer).strip()
-
-    if answer is None:
-        return ""
-
-    return str(answer).strip()
-
-
-def build_eval_datasets_from_stateval_foundational(
-    spec: StaticBenchmarkSpec,
-) -> List[EvalDataset]:
-    """Convert StatEval Foundational Knowledge into a single EvalDataset.
-
-    All tasks are grouped under one capability "foundational_knowledge".
-    Uses spec.area_id and spec.domain (caller sets e.g. area_id=stateval, domain=math).
-    """
-    tasks: List[Dict[str, str]] = []
-
-    for idx, row in enumerate(
-        _iter_stateval_foundational_samples(spec.split, spec.limit)
-    ):
-        input_text = _extract_question_and_options(row)
-        target = _extract_answer(row)
-        if not input_text or not target:
-            continue
-
-        task_id = (
-            str(row.get("id") or row.get("ID") or "").strip()
-            or f"stateval_fk_{idx:05d}"
-        )
-        tasks.append({"id": task_id, "input": input_text, "target": target})
-
-    if not tasks:
-        return []
-
-    domain = (spec.domain or "math").strip().lower() or "math"
-    dataset = EvalDataset(
-        area_id=spec.area_id,
-        capability_id="foundational_knowledge",
-        capability_name="Foundational Knowledge",
-        domain=domain,
-        tasks=tasks,
-        num_tasks=len(tasks),
-        prompt_template=DEFAULT_EVAL_PROMPT_TEMPLATE,
-    )
-    return [dataset]
-
diff --git a/src/eval_stages/static_benchmarks/stateval_research.py b/src/eval_stages/static_benchmarks/stateval_research.py
deleted file mode 100644
index 535b1ff8..00000000
--- a/src/eval_stages/static_benchmarks/stateval_research.py
+++ /dev/null
@@ -1,96 +0,0 @@
-"""Adapter for the StatEval Statistical Research static benchmark.
-
-Dataset: https://huggingface.co/datasets/0v01111/StatEval-Statistical-Research
-
-Research-level, proof-based tasks from papers. Exposed as a single
-capability "statistical_research" for the unified StatEval benchmark.
-"""
-
-from __future__ import annotations
-
-from typing import Any, Dict, Iterable, List
-
-from datasets import load_dataset
-
-from src.eval_stages.prompts import DEFAULT_EVAL_PROMPT_TEMPLATE
-from src.eval_stages.static_benchmarks.specs import StaticBenchmarkSpec
-from src.schemas.eval_schemas import EvalDataset
-
-
-def _iter_stateval_research_samples(
-    split: str,
-    limit: int | None,
-) -> Iterable[Dict[str, Any]]:
-    """Yield rows from StatEval-Statistical-Research in stable order."""
-    ds = load_dataset("0v01111/StatEval-Statistical-Research", split=split)
-    if limit is not None:
-        ds = ds.select(range(min(limit, len(ds))))
-    yield from ds
-
-
-def _extract_input(row: Dict[str, Any]) -> str:
-    """Build question/prompt text from a research row."""
-    stem = (
-        str(
-            row.get("question")
-            or row.get("prompt")
-            or row.get("task")
-            or row.get("problem")
-            or row.get("context")
-            or ""
-        ).strip()
-    )
-    return stem or ""
-
-
-def _extract_answer(row: Dict[str, Any]) -> str:
-    """Extract target answer from a research row."""
-    answer = row.get("answer")
-    if isinstance(answer, dict):
-        for key in ("label", "text", "final", "value", "solution"):
-            if key in answer and answer[key] is not None:
-                return str(answer[key]).strip()
-        return str(answer).strip()
-    if answer is None:
-        return ""
-    return str(answer).strip()
-
-
-def build_eval_datasets_from_stateval_research(
-    spec: StaticBenchmarkSpec,
-) -> List[EvalDataset]:
-    """Convert StatEval Statistical Research into a single EvalDataset.
-
-    All tasks under one capability "statistical_research".
-    Uses spec.area_id and spec.domain (e.g. area_id=stateval, domain=math).
-    """
-    tasks: List[Dict[str, str]] = []
-
-    for idx, row in enumerate(
-        _iter_stateval_research_samples(spec.split, spec.limit)
-    ):
-        input_text = _extract_input(row)
-        target = _extract_answer(row)
-        if not input_text or not target:
-            continue
-
-        task_id = (
-            str(row.get("id") or row.get("ID") or "").strip()
-            or f"stateval_research_{idx:05d}"
-        )
-        tasks.append({"id": task_id, "input": input_text, "target": target})
-
-    if not tasks:
-        return []
-
-    domain = (spec.domain or "math").strip().lower() or "math"
-    dataset = EvalDataset(
-        area_id=spec.area_id,
-        capability_id="statistical_research",
-        capability_name="Statistical Research",
-        domain=domain,
-        tasks=tasks,
-        num_tasks=len(tasks),
-        prompt_template=DEFAULT_EVAL_PROMPT_TEMPLATE,
-    )
-    return [dataset]
diff --git a/src/eval_stages/static_benchmarks/wemath.py b/src/eval_stages/static_benchmarks/wemath.py
deleted file mode 100644
index 28f6d0de..00000000
--- a/src/eval_stages/static_benchmarks/wemath.py
+++ /dev/null
@@ -1,95 +0,0 @@
-"""Adapter for the We-Math/We-Math static benchmark.
-
-Dataset card: https://huggingface.co/datasets/We-Math/We-Math
-"""
-
-from __future__ import annotations
-
-import re
-from typing import Any, Dict, Iterable, List
-
-from datasets import load_dataset
-
-from src.eval_stages.prompts import DEFAULT_EVAL_PROMPT_TEMPLATE
-from src.eval_stages.static_benchmarks.specs import StaticBenchmarkSpec
-from src.schemas.eval_schemas import EvalDataset
-
-
-def _slugify(text: str) -> str:
-    """Convert arbitrary strings into safe directory-friendly IDs."""
-    cleaned = re.sub(r"[^a-zA-Z0-9]+", "_", text.strip()).strip("_").lower()
-    return cleaned or "unknown"
-
-
-def _iter_wemath_samples(
-    split: str,
-    limit: int | None,
-) -> Iterable[Dict[str, Any]]:
-    """Yield rows from We-Math/We-Math in a stable order."""
-    # The public config exposes a "testmini" split; callers should pass
-    # static_benchmark_cfg.split=testmini in Hydra.
-    ds = load_dataset("We-Math/We-Math", split=split)
-    if limit is not None:
-        ds = ds.select(range(min(limit, len(ds))))
-    yield from ds
-
-
-def build_eval_datasets_from_wemath(spec: StaticBenchmarkSpec) -> List[EvalDataset]:
-    """Convert We-Math into EvalDatasets grouped by knowledge concept.
-
-    We treat:
-    - domain: always "math"
-    - area_id: taken from spec.area_id (e.g. "math" or "static_benchmarks")
-    - capability_id / capability_name: derived from the "knowledge_concept"
-      column (e.g., "Properties and Understanding of Squares").
-
-    Each task:
-    - input: question text plus options as a single string
-    - target: the correct option letter from the "answer" column
-    """
-    by_concept: Dict[str, List[Dict[str, str]]] = {}
-    id_counts: Dict[str, int] = {}
-
-    for idx, row in enumerate(_iter_wemath_samples(spec.split, spec.limit)):
-        # Skip questions that have an image; this pipeline is text-only and does not pass images.
-        if row.get("image") is not None:
-            continue
-
-        question = str(row.get("question", "")).strip()
-        options = str(row.get("option", "")).strip()
-        answer = str(row.get("answer", "")).strip()
-        if not question or not options or not answer:
-            continue
-
-        concept = str(row.get("knowledge_concept", "")).strip() or "unknown"
-        base_id = str(row.get("ID", "")).strip() or f"wemath_{idx:04d}"
-        # Ensure uniqueness of task ids since Inspect requires unique ids.
-        cnt = id_counts.get(base_id, 0)
-        id_counts[base_id] = cnt + 1
-        task_id = base_id if cnt == 0 else f"{base_id}_{cnt}"
-
-        # Pack question and options into a single prompt input.
-        input_text = f"{question}\n\nOptions: {options}"
-
-        by_concept.setdefault(concept, []).append(
-            {"id": task_id, "input": input_text, "target": answer}
-        )
-
-    datasets: List[EvalDataset] = []
-    for concept, tasks in sorted(by_concept.items()):
-        capability_id = _slugify(concept)
-        capability_name = concept
-
-        dataset = EvalDataset(
-            area_id=spec.area_id,
-            capability_id=capability_id,
-            capability_name=capability_name,
-            domain="math",
-            tasks=tasks,
-            num_tasks=len(tasks),
-            prompt_template=DEFAULT_EVAL_PROMPT_TEMPLATE,
-        )
-        datasets.append(dataset)
-
-    return datasets
-

From 287dbb225304c7d70baf43b74e44b4830931df89 Mon Sep 17 00:00:00 2001
From: Negiiiin <neginbaghbanzadeh@gmail.com>
Date: Tue, 14 Apr 2026 15:15:22 -0400
Subject: [PATCH 5/8] Removed unnecessary logic from evaluation step 1

---
 src/cfg/run_cfg.yaml                          |   4 +-
 src/eval_stages/stage0_static_benchmarks.py   |   5 -
 .../stage1_local_eval_execution.py            | 426 +++---------------
 .../static_benchmarks/mathvista.py            |  99 ----
 4 files changed, 75 insertions(+), 459 deletions(-)
 delete mode 100644 src/eval_stages/static_benchmarks/mathvista.py

diff --git a/src/cfg/run_cfg.yaml b/src/cfg/run_cfg.yaml
index 14433b8c..033fcb7c 100644
--- a/src/cfg/run_cfg.yaml
+++ b/src/cfg/run_cfg.yaml
@@ -102,7 +102,7 @@ eval_cfg:
         max_tokens: 8192
     - name: qwen-3-32b
       provider: hf_local
-      model_path: /model-weights/qwen-3-32b
+      model_path: /model-weights/Qwen3-32B/
       inference_backend: vllm
       trust_remote_code: true
       gpu_memory_utilization: 0.9
@@ -113,7 +113,7 @@ eval_cfg:
         max_tokens: 8192
     - name: qwen-3-8b
       provider: hf_local
-      model_path: /model-weights/qwen-3-8b
+      model_path: /model-weights/Qwen3-8B/
       inference_backend: vllm
       trust_remote_code: true
       gpu_memory_utilization: 0.9
diff --git a/src/eval_stages/stage0_static_benchmarks.py b/src/eval_stages/stage0_static_benchmarks.py
index 864b4461..1cc9a355 100644
--- a/src/eval_stages/stage0_static_benchmarks.py
+++ b/src/eval_stages/stage0_static_benchmarks.py
@@ -22,9 +22,6 @@
 
 from omegaconf import DictConfig, OmegaConf
 
-from src.eval_stages.static_benchmarks.mathvista import (
-    build_eval_datasets_from_mathvista,
-)
 from src.eval_stages.static_benchmarks.finance_math import (
     build_eval_datasets_from_finance_math,
 )
@@ -60,8 +57,6 @@ def _build_datasets_from_spec(spec: StaticBenchmarkSpec) -> List[EvalDataset]:
     produce multiple capabilities if desired.
     """
     bid = spec.benchmark_id.strip()
-    if bid in {"AI4Math/MathVista", "MathVista", "mathvista"}:
-        return build_eval_datasets_from_mathvista(spec)
     if bid in {"yale-nlp/FinanceMath", "FinanceMath", "finance_math"}:
         return build_eval_datasets_from_finance_math(spec)
     if bid in {"kensho/bizbench", "BizBench", "bizbench"}:
diff --git a/src/eval_stages/stage1_local_eval_execution.py b/src/eval_stages/stage1_local_eval_execution.py
index ee024ad9..71657608 100644
--- a/src/eval_stages/stage1_local_eval_execution.py
+++ b/src/eval_stages/stage1_local_eval_execution.py
@@ -1,9 +1,9 @@
 """Eval Stage 1_local: direct evaluation without Inspect.
 
 This stage runs subject models directly, including local HuggingFace models
-loaded from disk via `provider: hf_local`. Local HF models can run through
-`transformers` or `vllm`, then each response is judged and written to the final
-`flat_<capability>.jsonl` output expected by downstream workflows.
+loaded from disk via `provider: hf_local` using vLLM. Each response is judged
+and written to the final `flat_<capability>.jsonl` output expected by downstream
+workflows.
 """
 
 from __future__ import annotations
@@ -140,11 +140,11 @@ def _is_hf_local_provider(provider: str) -> bool:
 
 
 def _uses_vllm_backend(model_config: Dict[str, Any]) -> bool:
-    """Return True when a local HF model should run via vLLM."""
-    backend = str(model_config.get("inference_backend", "transformers")).lower()
-    return _is_hf_local_provider(str(model_config.get("provider", ""))) and (
-        backend == "vllm"
-    )
+    """Return True when a local HF model should run via vLLM.
+
+    All local HF models use vLLM exclusively.
+    """
+    return _is_hf_local_provider(str(model_config.get("provider", "")))
 
 
 def _build_messages(sys_prompt: str, user_prompt: str) -> List[Dict[str, str]]:
@@ -170,49 +170,6 @@ def _render_text_prompt(tokenizer: Any, *, sys_prompt: str, user_prompt: str) ->
     return user_prompt
 
 
-def _load_hf_local_model(
-    model_config: Dict[str, Any],
-) -> Tuple[Any, Any]:
-    """Load a local HuggingFace causal LM and tokenizer."""
-    try:
-        from transformers import AutoModelForCausalLM, AutoTokenizer
-    except Exception as exc:  # noqa: BLE001
-        raise RuntimeError(
-            "transformers is required for provider=hf_local in stage=1_local"
-        ) from exc
-
-    model_path = model_config.get("model_path")
-    if not model_path:
-        raise ValueError(
-            "provider=hf_local requires `model_path` in subject_llms config"
-        )
-
-    trust_remote_code = bool(model_config.get("trust_remote_code", True))
-    tokenizer = AutoTokenizer.from_pretrained(
-        model_path,
-        trust_remote_code=trust_remote_code,
-    )
-
-    if tokenizer.pad_token is None and tokenizer.eos_token is not None:
-        tokenizer.pad_token = tokenizer.eos_token
-
-    if torch.cuda.is_available():
-        torch_dtype = torch.bfloat16
-        device_map = model_config.get("device_map", "auto")
-    else:
-        torch_dtype = torch.float32
-        device_map = model_config.get("device_map", None)
-
-    model = AutoModelForCausalLM.from_pretrained(
-        model_path,
-        trust_remote_code=trust_remote_code,
-        torch_dtype=torch_dtype,
-        device_map=device_map,
-    )
-    model.eval()
-    return tokenizer, model
-
-
 def _load_vllm_model(model_config: Dict[str, Any]) -> Any:
     """Load a local vLLM engine from disk."""
     try:
@@ -426,56 +383,6 @@ def _map_numeric_answer_to_option_letter(
     return best_letter
 
 
-def _generate_batch_with_hf_local(
-    tokenizer: Any,
-    model: Any,
-    *,
-    prompts: List[str],
-    generation_config: Dict[str, Any],
-) -> List[str]:
-    """Generate a batch of responses with a local HF causal LM."""
-    if not prompts:
-        return []
-
-    max_new_tokens = int(generation_config.get("max_tokens", 512))
-    temperature = float(generation_config.get("temperature", 0.0) or 0.0)
-    top_p = float(generation_config.get("top_p", 1.0) or 1.0)
-    repetition_penalty = float(generation_config.get("repetition_penalty", 1.0) or 1.0)
-    do_sample = temperature > 0
-
-    encoded = tokenizer(prompts, return_tensors="pt", padding=True)
-    input_ids = encoded["input_ids"]
-    attention_mask = encoded.get("attention_mask", torch.ones_like(input_ids))
-
-    model_device = next(model.parameters()).device
-    input_ids = input_ids.to(model_device)
-    attention_mask = attention_mask.to(model_device)
-
-    generate_kwargs = {
-        "input_ids": input_ids,
-        "attention_mask": attention_mask,
-        "max_new_tokens": max_new_tokens,
-        "do_sample": do_sample,
-        "pad_token_id": tokenizer.pad_token_id,
-        "eos_token_id": tokenizer.eos_token_id,
-        "repetition_penalty": repetition_penalty,
-    }
-    if do_sample:
-        generate_kwargs["temperature"] = temperature
-        generate_kwargs["top_p"] = top_p
-
-    with torch.inference_mode():
-        generated = model.generate(**generate_kwargs)
-
-    prompt_token_count = input_ids.shape[-1]
-    generated_texts: List[str] = []
-    for row_tokens in generated:
-        generated_tokens = row_tokens[prompt_token_count:]
-        output_text = tokenizer.decode(generated_tokens, skip_special_tokens=True)
-        generated_texts.append(output_text.strip())
-    return generated_texts
-
-
 def _generate_batch_with_vllm(
     llm: Any,
     *,
@@ -587,12 +494,8 @@ def _log_running_performance(
 def _judge_batch(
     rows: List[Dict[str, Any]],
     *,
+    judge_model: Model,
     judge_generation_cfg: Dict[str, Any],
-    judge_model: Optional[Model] = None,
-    judge_tokenizer: Any = None,
-    judge_hf_model: Any = None,
-    judge_vllm_model: Any = None,
-    judge_vllm_tokenizer: Any = None,
     max_concurrent_requests: int = 8,
 ) -> List[Dict[str, Any]]:
     """Judge a batch of rows, using exact-match shortcuts when possible."""
@@ -602,15 +505,12 @@ def _judge_batch(
     scored_rows: List[Optional[Dict[str, Any]]] = [None] * len(rows)
     unresolved_indices: List[int] = []
     unresolved_prompts: List[str] = []
-    unresolved_task_ids: List[str] = []
 
     for index, row in enumerate(rows):
         raw_output = str(row["model_output"])
         parsed_submission = parse_submission(raw_output) or raw_output
         judge_submission = _last_sentence(raw_output) or parsed_submission
         target = str(row["ground_truth"])
-        # If this is an MCQ with a letter target, allow mapping a numeric final answer
-        # back to an option letter based on the question's options.
         mapped_letter = _map_numeric_answer_to_option_letter(
             submission=parsed_submission,
             question=str(row.get("question", "")),
@@ -622,70 +522,35 @@ def _judge_batch(
             scored_rows[index] = {**row, "grade": "C"}
             continue
         unresolved_indices.append(index)
-        unresolved_task_ids.append(str(row.get("id", "")))
-        # Give the judge only the model's final fragment to reduce noise.
         judge_prompt = _build_judge_prompt(judge_submission, target)
         unresolved_prompts.append(judge_prompt)
 
     if unresolved_prompts:
-        if judge_vllm_model is not None:
-            prompts = [
-                _render_text_prompt(
-                    judge_vllm_tokenizer,
-                    sys_prompt="You are a careful, non-pedantic grading assistant.",
-                    user_prompt=prompt,
-                )
-                for prompt in unresolved_prompts
-            ]
-            judge_outputs = _generate_batch_with_vllm(
-                judge_vllm_model,
-                prompts=prompts,
-                generation_config=judge_generation_cfg,
-            )
-        elif judge_hf_model is not None:
-            prompts = [
-                _render_text_prompt(
-                    judge_tokenizer,
-                    sys_prompt="You are a careful, non-pedantic grading assistant.",
-                    user_prompt=prompt,
-                )
-                for prompt in unresolved_prompts
-            ]
-            judge_outputs = _generate_batch_with_hf_local(
-                judge_tokenizer,
-                judge_hf_model,
-                prompts=prompts,
-                generation_config=judge_generation_cfg,
-            )
-        else:
-            if judge_model is None:
-                raise ValueError("judge_model is required when no local judge backend is set")
-            async def _run_async_judge(prompts: List[str]) -> List[str]:
-                sem = asyncio.Semaphore(max(1, int(max_concurrent_requests)))
-
-                async def _one(p: str) -> str:
-                    async with sem:
-                        txt, _ = await judge_model.async_generate(
-                            sys_prompt="You are a careful, non-pedantic grading assistant.",
-                            user_prompt=p,
-                            generation_config=judge_generation_cfg,
-                        )
-                        return txt or ""
+        async def _run_async_judge(prompts: List[str]) -> List[str]:
+            sem = asyncio.Semaphore(max(1, int(max_concurrent_requests)))
 
-                return list(await asyncio.gather(*(_one(p) for p in prompts)))
-
-            try:
-                judge_outputs = asyncio.run(_run_async_judge(unresolved_prompts))
-            except Exception:
-                # Fallback to synchronous calls if async event loop issues occur.
-                judge_outputs = []
-                for prompt in unresolved_prompts:
-                    judge_text, _ = judge_model.generate(
+            async def _one(p: str) -> str:
+                async with sem:
+                    txt, _ = await judge_model.async_generate(
                         sys_prompt="You are a careful, non-pedantic grading assistant.",
-                        user_prompt=prompt,
+                        user_prompt=p,
                         generation_config=judge_generation_cfg,
                     )
-                    judge_outputs.append(judge_text or "")
+                    return txt or ""
+
+            return list(await asyncio.gather(*(_one(p) for p in prompts)))
+
+        try:
+            judge_outputs = asyncio.run(_run_async_judge(unresolved_prompts))
+        except Exception:
+            judge_outputs = []
+            for prompt in unresolved_prompts:
+                judge_text, _ = judge_model.generate(
+                    sys_prompt="You are a careful, non-pedantic grading assistant.",
+                    user_prompt=prompt,
+                    generation_config=judge_generation_cfg,
+                )
+                judge_outputs.append(judge_text or "")
 
         for index, grade in zip(
             unresolved_indices,
@@ -754,24 +619,10 @@ def run_eval_stage1_local(
         judge_generation_cfg["max_tokens"] = 16
     if "temperature" not in judge_generation_cfg:
         judge_generation_cfg["temperature"] = 0
-    judge_provider = str(judge_llm_cfg.get("provider", "openai"))
     judge_batch_size = int(judge_llm_cfg.get("batch_size", 32))
-    judge_using_vllm = _uses_vllm_backend(judge_llm_cfg)
-    judge_model: Optional[Model] = None
-    judge_tokenizer: Any = None
-    judge_hf_model: Any = None
-    # IMPORTANT: if judge is vLLM, we load it lazily per combination to avoid
-    # having subject-vLLM and judge-vLLM resident at the same time.
-    judge_vllm_model: Any = None
-    judge_vllm_tokenizer: Any = None
-    if _is_hf_local_provider(judge_provider) and not judge_using_vllm:
-        logger.info("Loading local HF judge %s", judge_llm_cfg["name"])
-        judge_tokenizer, judge_hf_model = _load_hf_local_model(judge_llm_cfg)
-    elif not judge_using_vllm:
-        judge_model = _build_model(judge_llm_cfg)
+    judge_model = _build_model(judge_llm_cfg)
 
     model_instances: Dict[Tuple[str, str], Model] = {}
-    hf_model_instances: Dict[Tuple[str, str], Tuple[Any, Any]] = {}
     vllm_model_instances: Dict[Tuple[str, str], Any] = {}
 
     num_completed_this_run = 0
@@ -858,12 +709,6 @@ def run_eval_stage1_local(
                     logger.info("  Loading vLLM engine for %s", llm_name)
                     vllm_model_instances[model_key] = _load_vllm_model(dict(llm_config))
                 vllm_model = vllm_model_instances[model_key]
-            elif _is_hf_local_provider(llm_provider):
-                if model_key not in hf_model_instances:
-                    hf_model_instances[model_key] = _load_hf_local_model(
-                        dict(llm_config)
-                    )
-                tokenizer, hf_model = hf_model_instances[model_key]
             else:
                 if model_key not in model_instances:
                     model_instances[model_key] = _build_model(dict(llm_config))
@@ -882,26 +727,15 @@ def run_eval_stage1_local(
                 if using_vllm and hasattr(vllm_model, "get_tokenizer"):
                     subject_tokenizer = vllm_model.get_tokenizer()
 
-                # If BOTH subject and judge are vLLM, avoid dual-engine residency:
-                # - If they point to the same model_path, reuse the subject engine for judging.
-                # - Otherwise, generate everything first, free subject engine, then start judge.
-                judge_needs_serialization = bool(judge_using_vllm and using_vllm)
-                can_reuse_subject_as_judge = False
-                if judge_needs_serialization:
-                    subj_path = str(dict(llm_config).get("model_path", ""))
-                    judge_path = str(judge_llm_cfg.get("model_path", ""))
-                    can_reuse_subject_as_judge = bool(subj_path and judge_path and subj_path == judge_path)
-
-                if judge_needs_serialization and not can_reuse_subject_as_judge:
-                    # Phase A: generate all pending outputs (no judging yet)
-                    all_generated: List[Dict[str, Any]] = []
-                    with tqdm(
-                        total=len(pending_tasks),
-                        desc=f"Generate {llm_name}/{dataset.capability_id}",
-                        dynamic_ncols=True,
-                    ) as gen_bar:
-                        for task_batch in _batched(pending_tasks, batch_size):
-                            failed_task_id = task_batch[0].get("id")
+                with tqdm(
+                    total=total_tasks,
+                    initial=len(row_by_id),
+                    desc=f"Eval {llm_name}/{dataset.capability_id}",
+                    dynamic_ncols=True,
+                ) as eval_bar:
+                    for task_batch in _batched(pending_tasks, batch_size):
+                        failed_task_id = task_batch[0].get("id")
+                        if using_vllm:
                             prompts = [
                                 _render_text_prompt(
                                     subject_tokenizer,
@@ -915,163 +749,49 @@ def run_eval_stage1_local(
                                 prompts=prompts,
                                 generation_config=subject_generation_cfg,
                             )
-                            for task, generated_text in zip(task_batch, generated_texts, strict=True):
-                                all_generated.append(
-                                    {
-                                        "id": task["id"],
-                                        "question": task["input"],
-                                        "ground_truth": task["target"],
-                                        "model_output": generated_text,
-                                    }
+                        else:
+                            generated_texts = []
+                            for task in task_batch:
+                                failed_task_id = task.get("id")
+                                prompt = _format_prompt(dataset, task)
+                                generated_text, _ = subject_model.generate(
+                                    sys_prompt="",
+                                    user_prompt=prompt,
+                                    generation_config=subject_generation_cfg,
                                 )
-                            gen_bar.update(len(task_batch))
-
-                    # Tear down subject vLLM before starting judge vLLM
-                    subject_engine = vllm_model_instances.pop(model_key, None)
-                    if subject_engine is not None:
-                        _teardown_vllm_engine(subject_engine, llm_name)
-                    _wait_for_vllm_startup_memory(
-                        float(judge_llm_cfg.get("gpu_memory_utilization", 0.9))
-                    )
+                                generated_texts.append(generated_text or "")
+
+                        generated_rows = [
+                            {
+                                "id": task["id"],
+                                "question": task["input"],
+                                "ground_truth": task["target"],
+                                "model_output": generated_text,
+                            }
+                            for task, generated_text in zip(
+                                task_batch, generated_texts, strict=True
+                            )
+                        ]
 
-                    # Phase B: start judge vLLM and judge in batches
-                    logger.info("  Loading vLLM judge (after subject generation teardown)")
-                    judge_vllm_model = _load_vllm_model(judge_llm_cfg)
-                    judge_vllm_tokenizer = (
-                        judge_vllm_model.get_tokenizer()
-                        if hasattr(judge_vllm_model, "get_tokenizer")
-                        else None
-                    )
-                    with tqdm(
-                        total=len(all_generated),
-                        desc=f"Judge {llm_name}/{dataset.capability_id}",
-                        dynamic_ncols=True,
-                    ) as judge_bar:
-                        for judge_batch in _batched(all_generated, judge_batch_size):
-                            failed_task_id = judge_batch[0].get("id")
+                        for jb in _batched(generated_rows, judge_batch_size):
+                            failed_task_id = jb[0].get("id")
                             scored_batch = _judge_batch(
-                                judge_batch,
-                                judge_generation_cfg=judge_generation_cfg,
+                                jb,
                                 judge_model=judge_model,
-                                judge_tokenizer=judge_tokenizer,
-                                judge_hf_model=judge_hf_model,
-                                judge_vllm_model=judge_vllm_model,
-                                judge_vllm_tokenizer=judge_vllm_tokenizer,
+                                judge_generation_cfg=judge_generation_cfg,
                                 max_concurrent_requests=judge_batch_size,
                             )
                             for scored_row in scored_batch:
                                 row_by_id[str(scored_row["id"])] = scored_row
-                            _write_flat_results(flat_path, _ordered_rows(dataset.tasks, row_by_id))
-                            _log_running_performance(
-                                llm_name=llm_name,
-                                capability_id=dataset.capability_id,
-                                row_by_id=row_by_id,
-                                total_tasks=total_tasks,
-                            )
-                            judge_bar.update(len(judge_batch))
 
-                    # Tear down judge vLLM too
-                    _teardown_vllm_engine(judge_vllm_model, str(judge_llm_cfg.get("name", "judge")))
-                    judge_vllm_model = None
-                    judge_vllm_tokenizer = None
-                else:
-                    # Default fast path: generate + judge streaming (can reuse subject engine as judge if same model)
-                    if judge_using_vllm and using_vllm and can_reuse_subject_as_judge:
-                        judge_vllm_model = vllm_model
-                        judge_vllm_tokenizer = subject_tokenizer
-                    elif judge_using_vllm and judge_vllm_model is None:
-                        logger.info("  Loading vLLM judge %s", judge_llm_cfg["name"])
-                        judge_vllm_model = _load_vllm_model(judge_llm_cfg)
-                        judge_vllm_tokenizer = (
-                            judge_vllm_model.get_tokenizer()
-                            if hasattr(judge_vllm_model, "get_tokenizer")
-                            else None
+                        _write_flat_results(flat_path, _ordered_rows(dataset.tasks, row_by_id))
+                        _log_running_performance(
+                            llm_name=llm_name,
+                            capability_id=dataset.capability_id,
+                            row_by_id=row_by_id,
+                            total_tasks=total_tasks,
                         )
-
-                    with tqdm(
-                        total=total_tasks,
-                        initial=len(row_by_id),
-                        desc=f"Eval {llm_name}/{dataset.capability_id}",
-                        dynamic_ncols=True,
-                    ) as eval_bar:
-                        for task_batch in _batched(pending_tasks, batch_size):
-                            failed_task_id = task_batch[0].get("id")
-                            if using_vllm:
-                                prompts = [
-                                    _render_text_prompt(
-                                        subject_tokenizer,
-                                        sys_prompt="",
-                                        user_prompt=_format_prompt(dataset, task),
-                                    )
-                                    for task in task_batch
-                                ]
-                                generated_texts = _generate_batch_with_vllm(
-                                    vllm_model,
-                                    prompts=prompts,
-                                    generation_config=subject_generation_cfg,
-                                )
-                            elif _is_hf_local_provider(llm_provider):
-                                prompts = [
-                                    _render_text_prompt(
-                                        tokenizer,
-                                        sys_prompt="",
-                                        user_prompt=_format_prompt(dataset, task),
-                                    )
-                                    for task in task_batch
-                                ]
-                                generated_texts = _generate_batch_with_hf_local(
-                                    tokenizer,
-                                    hf_model,
-                                    prompts=prompts,
-                                    generation_config=subject_generation_cfg,
-                                )
-                            else:
-                                generated_texts = []
-                                for task in task_batch:
-                                    failed_task_id = task.get("id")
-                                    prompt = _format_prompt(dataset, task)
-                                    generated_text, _ = subject_model.generate(
-                                        sys_prompt="",
-                                        user_prompt=prompt,
-                                        generation_config=subject_generation_cfg,
-                                    )
-                                    generated_texts.append(generated_text or "")
-
-                            generated_rows = [
-                                {
-                                    "id": task["id"],
-                                    "question": task["input"],
-                                    "ground_truth": task["target"],
-                                    "model_output": generated_text,
-                                }
-                                for task, generated_text in zip(
-                                    task_batch, generated_texts, strict=True
-                                )
-                            ]
-
-                            for jb in _batched(generated_rows, judge_batch_size):
-                                failed_task_id = jb[0].get("id")
-                                scored_batch = _judge_batch(
-                                    jb,
-                                    judge_generation_cfg=judge_generation_cfg,
-                                    judge_model=judge_model,
-                                    judge_tokenizer=judge_tokenizer,
-                                    judge_hf_model=judge_hf_model,
-                                    judge_vllm_model=judge_vllm_model,
-                                    judge_vllm_tokenizer=judge_vllm_tokenizer,
-                                    max_concurrent_requests=judge_batch_size,
-                                )
-                                for scored_row in scored_batch:
-                                    row_by_id[str(scored_row["id"])] = scored_row
-
-                            _write_flat_results(flat_path, _ordered_rows(dataset.tasks, row_by_id))
-                            _log_running_performance(
-                                llm_name=llm_name,
-                                capability_id=dataset.capability_id,
-                                row_by_id=row_by_id,
-                                total_tasks=total_tasks,
-                            )
-                            eval_bar.update(len(task_batch))
+                        eval_bar.update(len(task_batch))
             except Exception as exc:  # noqa: BLE001
                 logger.error(
                     "  Direct evaluation failed for %s/%s task %s with %s/%s: %s",
diff --git a/src/eval_stages/static_benchmarks/mathvista.py b/src/eval_stages/static_benchmarks/mathvista.py
deleted file mode 100644
index 48f2ff92..00000000
--- a/src/eval_stages/static_benchmarks/mathvista.py
+++ /dev/null
@@ -1,99 +0,0 @@
-"""Adapter for the AI4Math/MathVista static benchmark.
-
-Dataset card: https://huggingface.co/datasets/AI4Math/MathVista
-
-This adapter focuses on the labeled ``testmini`` split, which provides
-answers for 1,000 examples. The \"test\" split does not expose labels.
-"""
-
-from __future__ import annotations
-
-import re
-from typing import Any, Dict, Iterable, List
-
-from datasets import load_dataset
-
-from src.eval_stages.prompts import DEFAULT_EVAL_PROMPT_TEMPLATE
-from src.eval_stages.static_benchmarks.specs import StaticBenchmarkSpec
-from src.schemas.eval_schemas import EvalDataset
-
-
-def _slugify(text: str) -> str:
-    """Convert arbitrary strings into safe directory-friendly IDs."""
-    cleaned = re.sub(r"[^a-zA-Z0-9]+", "_", text.strip()).strip("_").lower()
-    return cleaned or "unknown"
-
-
-def _iter_mathvista_samples(
-    split: str,
-    limit: int | None,
-) -> Iterable[Dict[str, Any]]:
-    """Yield rows from AI4Math/MathVista in a stable order."""
-    ds = load_dataset("AI4Math/MathVista", split=split)
-    if limit is not None:
-        ds = ds.select(range(min(limit, len(ds))))
-    yield from ds
-
-
-def build_eval_datasets_from_mathvista(spec: StaticBenchmarkSpec) -> List[EvalDataset]:
-    """Convert MathVista into a single EvalDataset.
-
-    We treat:
-    - domain: always "math"
-    - area_id: taken from spec.area_id (e.g. "math" or "static_benchmarks")
-    - capability_id / capability_name: a single capability "mathvista"
-      covering all tasks in the chosen split (typically ``testmini``).
-    """
-    tasks: List[Dict[str, str]] = []
-
-    for idx, row in enumerate(_iter_mathvista_samples(spec.split, spec.limit)):
-        # Prefer the curated query prompt if present.
-        query = str(row.get("query", "")).strip()
-        question = str(row.get("question", "")).strip()
-        image_path = str(row.get("image", "")).strip()
-        choices = row.get("choices")
-
-        if query:
-            input_text = query
-        else:
-            parts: List[str] = []
-            if question:
-                parts.append(question)
-            if isinstance(choices, list) and choices:
-                labeled: List[str] = []
-                for i, opt in enumerate(choices):
-                    label = chr(ord("A") + i)
-                    labeled.append(f"{label}. {str(opt).strip()}")
-                parts.append("Options: " + " ".join(labeled))
-            input_text = "\n\n".join(parts).strip()
-
-        if image_path:
-            input_text = f"{input_text}\n\n[Image path: {image_path}]".strip()
-
-        answer = str(row.get("answer", "")).strip()
-
-        if not input_text or not answer:
-            continue
-
-        pid = str(row.get("pid", "")).strip()
-        task_id = pid or f"mathvista_{idx:04d}"
-
-        tasks.append({"id": task_id, "input": input_text, "target": answer})
-
-    if not tasks:
-        return []
-
-    capability_id = "mathvista"
-    capability_name = "MathVista"
-
-    dataset = EvalDataset(
-        area_id=spec.area_id,
-        capability_id=capability_id,
-        capability_name=capability_name,
-        domain="math",
-        tasks=tasks,
-        num_tasks=len(tasks),
-        prompt_template=DEFAULT_EVAL_PROMPT_TEMPLATE,
-    )
-    return [dataset]
-

From b8c479e577f265e8bf2ceede731ff7439cd87067 Mon Sep 17 00:00:00 2001
From: Negiiiin <neginbaghbanzadeh@gmail.com>
Date: Wed, 15 Apr 2026 15:56:09 -0400
Subject: [PATCH 6/8] Drop "do not repeat yourself" from the answer instruction

---
 src/eval_stages/stage1_local_eval_execution.py | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/src/eval_stages/stage1_local_eval_execution.py b/src/eval_stages/stage1_local_eval_execution.py
index 71657608..c46dc2f3 100644
--- a/src/eval_stages/stage1_local_eval_execution.py
+++ b/src/eval_stages/stage1_local_eval_execution.py
@@ -105,7 +105,7 @@ def _format_prompt(dataset: EvalDataset, task: Dict[str, str]) -> str:
     is_mcq = bool(re.search(r"(?im)^\s*options\s*:\s*$", str(task.get("input", ""))))
     if is_mcq:
         answer_instruction = (
-            "\n\nReason briefly and do not repeat yourself. Stop immediately after the final "
+            "\n\nReason briefly. Stop immediately after the final "
             "answer line.\n\nThis is a multiple-choice question. On the last line, return ONLY "
             "the option letter in machine-readable form as `ANSWER: <LETTER>` "
             "(e.g., `ANSWER: B`). Do NOT return a number, currency amount, or explanation "
@@ -113,7 +113,7 @@ def _format_prompt(dataset: EvalDataset, task: Dict[str, str]) -> str:
         )
     else:
         answer_instruction = (
-            "\n\nReason briefly and do not repeat yourself. Stop immediately after the final "
+            "\n\nReason briefly. Stop immediately after the final "
             "answer line.\n\nReturn your final answer in a machine-readable form on the last "
             "line as `ANSWER: <final answer>`."
         )

From 4543b0feeb7bc913be3524c3a0ff26e28a2d91c6 Mon Sep 17 00:00:00 2001
From: Negiiiin <neginbaghbanzadeh@gmail.com>
Date: Wed, 15 Apr 2026 16:39:20 -0400
Subject: [PATCH 7/8] Drop redundant f1 summary field and unify answer instruction

---
 .../stage1_local_eval_execution.py            | 24 +++++--------------
 1 file changed, 6 insertions(+), 18 deletions(-)

diff --git a/src/eval_stages/stage1_local_eval_execution.py b/src/eval_stages/stage1_local_eval_execution.py
index c46dc2f3..4d7b81e6 100644
--- a/src/eval_stages/stage1_local_eval_execution.py
+++ b/src/eval_stages/stage1_local_eval_execution.py
@@ -78,8 +78,6 @@ def _write_flat_results(output_path: Path, rows: List[Dict[str, Any]]) -> None:
     num_correct = sum(1 for row in rows if row.get("grade") == "C")
     num_incorrect = sum(1 for row in rows if row.get("grade") == "I")
     accuracy = (num_correct / num_samples) if num_samples else 0.0
-    f1 = accuracy
-
     with open(output_path, "w", encoding="utf-8") as f:
         summary = {
             "summary": True,
@@ -87,7 +85,6 @@ def _write_flat_results(output_path: Path, rows: List[Dict[str, Any]]) -> None:
             "num_correct": num_correct,
             "num_incorrect": num_incorrect,
             "accuracy": accuracy,
-            "f1": f1,
         }
         f.write(json.dumps(summary, ensure_ascii=False) + "\n")
         for row in rows:
@@ -102,21 +99,12 @@ def _format_prompt(dataset: EvalDataset, task: Dict[str, str]) -> str:
     except Exception:  # noqa: BLE001
         prompt = str(task["input"])
 
-    is_mcq = bool(re.search(r"(?im)^\s*options\s*:\s*$", str(task.get("input", ""))))
-    if is_mcq:
-        answer_instruction = (
-            "\n\nReason briefly. Stop immediately after the final "
-            "answer line.\n\nThis is a multiple-choice question. On the last line, return ONLY "
-            "the option letter in machine-readable form as `ANSWER: <LETTER>` "
-            "(e.g., `ANSWER: B`). Do NOT return a number, currency amount, or explanation "
-            "on the final answer line."
-        )
-    else:
-        answer_instruction = (
-            "\n\nReason briefly. Stop immediately after the final "
-            "answer line.\n\nReturn your final answer in a machine-readable form on the last "
-            "line as `ANSWER: <final answer>`."
-        )
+    answer_instruction = (
+        "\n\nReason briefly. Stop immediately after the final answer line.\n\n"
+        "On the last line, return your answer in machine-readable form as "
+        "`ANSWER: <final answer>`. If this is multiple-choice, return only the option letter "
+        "(e.g., `ANSWER: B`)."
+    )
     return prompt + answer_instruction
 
 

From 6f73c9dacfd1abd9072b495be301e9ca2ab09bb0 Mon Sep 17 00:00:00 2001
From: Negiiiin <neginbaghbanzadeh@gmail.com>
Date: Thu, 16 Apr 2026 10:53:20 -0400
Subject: [PATCH 8/8] Restrict BizBench adapter to the FinKnow subset by default

---
 .../bizbench_local_array_eval.sh              | 10 +++--
 src/eval_stages/stage0_static_benchmarks.py   |  1 +
 src/eval_stages/static_benchmarks/bizbench.py | 44 ++++++++++++++-----
 src/eval_stages/static_benchmarks/specs.py    |  4 ++
 4 files changed, 43 insertions(+), 16 deletions(-)

diff --git a/scripts/static_benchmarks/bizbench_local_array_eval.sh b/scripts/static_benchmarks/bizbench_local_array_eval.sh
index d8cbd04b..3da39417 100644
--- a/scripts/static_benchmarks/bizbench_local_array_eval.sh
+++ b/scripts/static_benchmarks/bizbench_local_array_eval.sh
@@ -2,10 +2,10 @@
 #SBATCH --job-name=gemma_bizbench_local_array
 #SBATCH --output=/projects/DeepLesion/projects/new_ace/automated_capability_evaluation/logs/bizbench_local_array_%A_%a.out
 #SBATCH --error=/projects/DeepLesion/projects/new_ace/automated_capability_evaluation/logs/bizbench_local_array_%A_%a.err
-#SBATCH --time=08:00:00
+#SBATCH --time=06:00:00
 #SBATCH --cpus-per-task=8
 #SBATCH --mem=64G
-#SBATCH --gres=gpu:a100:1
+#SBATCH --gres=gpu:a40:1
 #SBATCH --array=0-7%8
 
 set -euo pipefail
@@ -23,7 +23,7 @@ source "scripts/static_benchmarks/env_slurm_inspect.sh"
 
 NUM_SHARDS=8
 
-# Count only rows that survive adapter filtering.
+# Count only FinKnow rows that survive adapter filtering.
 TOTAL=$(
 python - <<'PY'
 from datasets import load_dataset
@@ -32,6 +32,7 @@ ds = load_dataset("kensho/bizbench", split="test")
 
 def is_valid(row):
     question = str(row.get("question", "")).strip()
+    task = str(row.get("task", "") or "").lower()
     answer = row.get("answer")
     if answer is None:
         answer_text = ""
@@ -44,7 +45,8 @@ def is_valid(row):
             answer_text = str(answer).strip()
     else:
         answer_text = str(answer).strip()
-    return bool(question and answer_text)
+    # Mirrors the adapter default (`finknow_only=True`); if that is overridden, this count no longer matches.
+    return bool("finknow" in task and question and answer_text)
 
 print(sum(1 for row in ds if is_valid(row)))
 PY
diff --git a/src/eval_stages/stage0_static_benchmarks.py b/src/eval_stages/stage0_static_benchmarks.py
index 1cc9a355..be74f99f 100644
--- a/src/eval_stages/stage0_static_benchmarks.py
+++ b/src/eval_stages/stage0_static_benchmarks.py
@@ -100,6 +100,7 @@ def run_eval_stage0_static(cfg: DictConfig, validation_tag: str) -> None:
         capability_name=static_cfg.get("capability_name"),
         domain=str(static_cfg.get("domain", StaticBenchmarkSpec.domain)),
         exclude_bloom_create=static_cfg.get("exclude_bloom_create", True),
+        finknow_only=static_cfg.get("finknow_only", True),
     )
 
     logger.info(
diff --git a/src/eval_stages/static_benchmarks/bizbench.py b/src/eval_stages/static_benchmarks/bizbench.py
index 10b25aa2..0a93a2ec 100644
--- a/src/eval_stages/static_benchmarks/bizbench.py
+++ b/src/eval_stages/static_benchmarks/bizbench.py
@@ -73,24 +73,39 @@ def _iter_bizbench_samples(
     split: str,
     offset: int | None,
     limit: int | None,
+    *,
+    finknow_only: bool,
 ) -> Iterable[Dict[str, Any]]:
     ds = load_dataset("kensho/bizbench", split=split)
     n = len(ds)
 
     start = 0 if offset is None else max(0, int(offset))
-    if start >= n:
-        return iter(())
+    end = None if limit is None else start + int(limit)
 
-    if limit is None:
-        end = n
-    else:
-        end = min(start + int(limit), n)
+    # Apply filtering first, then slice by (offset, limit) over the filtered stream.
+    # Offsets derived from the filtered row count (as the sharding script does) then index this same stream.
+    kept_rank = 0
+    for dataset_idx, row in enumerate(ds):
+        if finknow_only:
+            task_val = str(row.get("task", "") or "")
+            if "finknow" not in task_val.lower():
+                continue
 
-    if start == 0 and end == n:
-        yield from ds
-        return
+        question = str(row.get("question", "")).strip()
+        answer_norm = _normalize_answer(row.get("answer"))
+        if not question or not answer_norm:
+            continue
+
+        if kept_rank < start:
+            kept_rank += 1
+            continue
+        if end is not None and kept_rank >= end:
+            break
 
-    yield from ds.select(range(start, end))
+        row = dict(row)
+        row["_global_idx"] = dataset_idx
+        yield row
+        kept_rank += 1
 
 
 def build_eval_datasets_from_bizbench(spec: StaticBenchmarkSpec) -> List[EvalDataset]:
@@ -98,7 +113,12 @@ def build_eval_datasets_from_bizbench(spec: StaticBenchmarkSpec) -> List[EvalDat
     tasks: List[Dict[str, str]] = []
 
     for local_idx, row in enumerate(
-        _iter_bizbench_samples(spec.split, spec.offset, spec.limit)
+        _iter_bizbench_samples(
+            spec.split,
+            spec.offset,
+            spec.limit,
+            finknow_only=spec.finknow_only,
+        )
     ):
         question = str(row.get("question", "")).strip()
         raw_answer = row.get("answer")
@@ -108,7 +128,7 @@ def build_eval_datasets_from_bizbench(spec: StaticBenchmarkSpec) -> List[EvalDat
             continue
 
         inp = _build_input(question, row.get("context"), row.get("options"))
-        global_idx = (spec.offset or 0) + local_idx
+        global_idx = int(row.get("_global_idx", local_idx))
         task_id = f"bizbench_{global_idx:05d}"
         tasks.append({"id": task_id, "input": inp, "target": answer})
 
diff --git a/src/eval_stages/static_benchmarks/specs.py b/src/eval_stages/static_benchmarks/specs.py
index c4fb3631..64f52d99 100644
--- a/src/eval_stages/static_benchmarks/specs.py
+++ b/src/eval_stages/static_benchmarks/specs.py
@@ -45,3 +45,7 @@ class StaticBenchmarkSpec:
     # When true, removes Bloom level "Create - Combine elements..." tasks.
     exclude_bloom_create: bool = True
 
+    # Controls whether BizBench ingestion should keep only the FinKnow subset.
+    # When true, keeps only rows whose `task` field contains "finknow" (case-insensitive).
+    finknow_only: bool = True
+