VectorInstitute · Negiiiin · Mar 17, 2026 · Mar 19, 2026 · Apr 2, 2026 · Apr 14, 2026
diff --git a/.gitignore b/.gitignore
@@ -146,3 +146,19 @@ outputs/
 # inspect result logs
 seed_datasets_inspect_logs/
 seed_tasks_results/
+
+# Generated experiment/evaluation artifacts
+base_output/
+base_output_tmp/
+base_output_tmp_2/
+logs_tmp/
+Finance_Book1_Book2/
+Finance_Book3_Book4/
+Finance_Book5_Book6/
+topic.csv
+
+# Local benchmark/task JSON exports
+finance_tasks.json
+seed_tasks.json
+task_4.json
+tasks_2.json
diff --git a/scripts/flatten_inspect_logs.py b/scripts/flatten_inspect_logs.py
@@ -0,0 +1,108 @@
+"""Utility to flatten Inspect JSON logs into a simple, readable format.
+
+Given an Inspect eval log file (one of the large JSON files under
+base_output/<exp_id>/eval/results/<eval_tag>/<model>/<area>/<capability>/),
+this script writes out a JSONL file with, per row:
+
+- id:          sample id
+- question:    original input
+- ground_truth: target string
+- model_output: subject model's answer text
+- grade:       judge letter grade (if present, e.g. "C" or "I")
+
+Usage (each path is kept on one line; splitting a path across a shell line
+continuation would break it into separate arguments):
+    LOG_DIR=base_output/test_exp/eval/results/_20260316_031445/gpt-5-nano/static_benchmarks/integral
+    python scripts/flatten_inspect_logs.py \\
+        --log_path "$LOG_DIR/2026-03-15T23-14-46-04-00_task_mZxA3jKBseS2smuk4ppcxN.json" \\
+        --out_path "$LOG_DIR/flat_integral.jsonl"
+
+The first line of the JSONL file is a summary object with:
+- num_samples
+- num_correct
+- num_incorrect
+- accuracy
+- f1 (currently reported as equal to accuracy; "C" counts as correct and
+  "I" as incorrect, and accuracy is num_correct / num_samples)
+"""
+
+from __future__ import annotations
+
+import argparse
+import json
+from pathlib import Path
+from typing import Any, Dict, List
+
+
+def _content_to_text(content: Any) -> Any:
+    """Collapse Inspect message content into plain text.
+
+    Message content in an Inspect log is either a plain string or a list of
+    typed content parts. For a list, the ``text`` of every part whose
+    ``type`` is ``"text"`` is joined with newlines; any other value (string
+    or ``None``) is returned unchanged.
+    """
+    if not isinstance(content, list):
+        return content
+    return "\n".join(
+        str(part.get("text") or "")
+        for part in content
+        if isinstance(part, dict) and part.get("type") == "text"
+    )
+
+
+def flatten_inspect_log(
+    log_path: Path, scorer_name: str = "model_graded_fact"
+) -> List[Dict[str, Any]]:
+    """Flatten an Inspect eval log into one small dict per sample.
+
+    Args:
+        log_path: Path to an Inspect eval log stored as one JSON document.
+        scorer_name: Key under each sample's ``scores`` mapping whose
+            ``value`` is reported as the grade.
+
+    Returns:
+        One dict per sample with the keys ``id``, ``question``,
+        ``ground_truth``, ``model_output`` and ``grade``. A field that is
+        missing from the log is reported as ``None``.
+    """
+    data = json.loads(log_path.read_text(encoding="utf-8"))
+
+    # `or []` also covers an explicit `"samples": null` in the log.
+    samples = data.get("samples") or []
+    flattened: List[Dict[str, Any]] = []
+
+    for s in samples:
+        # Only the first choice is reported; every level may be absent or null.
+        output = s.get("output") or {}
+        choices = output.get("choices") or []
+        message = ((choices[0] or {}).get("message") or {}) if choices else {}
+
+        scores = s.get("scores") or {}
+        score = scores.get(scorer_name) or {}
+
+        flattened.append(
+            {
+                "id": s.get("id"),
+                "question": s.get("input"),
+                "ground_truth": s.get("target"),
+                "model_output": _content_to_text(message.get("content")),
+                "grade": score.get("value"),
+            }
+        )
+
+    return flattened
+
+
+def _build_summary(rows: List[Dict[str, Any]]) -> Dict[str, Any]:
+    """Compute the summary record written as the first JSONL line."""
+    num_samples = len(rows)
+    num_correct = sum(1 for r in rows if r.get("grade") == "C")
+    num_incorrect = sum(1 for r in rows if r.get("grade") == "I")
+    # Samples with any other grade (or none) stay in the denominator.
+    accuracy = (num_correct / num_samples) if num_samples else 0.0
+    return {
+        "summary": True,
+        "num_samples": num_samples,
+        "num_correct": num_correct,
+        "num_incorrect": num_incorrect,
+        "accuracy": accuracy,
+        # In this binary setting with grades only, F1 is treated as equal
+        # to accuracy.
+        "f1": accuracy,
+    }
+
+
+def main() -> None:
+    """Flatten one Inspect log into a JSONL file (CLI entry point).
+
+    Reads ``--log_path``, then writes ``--out_path`` with a summary object
+    on the first line followed by one JSON object per sample. Missing
+    parent directories of ``--out_path`` are created.
+    """
+    parser = argparse.ArgumentParser(
+        description="Flatten an Inspect JSON eval log into a readable JSONL file."
+    )
+    parser.add_argument(
+        "--log_path",
+        type=Path,
+        required=True,
+        help="Inspect eval log (JSON) to read.",
+    )
+    parser.add_argument(
+        "--out_path",
+        type=Path,
+        required=True,
+        help="JSONL file to write; parent directories are created if needed.",
+    )
+    args = parser.parse_args()
+
+    # Parse the log before touching the filesystem so that a missing or
+    # malformed input does not leave empty output directories behind.
+    rows = flatten_inspect_log(args.log_path)
+
+    args.out_path.parent.mkdir(parents=True, exist_ok=True)
+    with args.out_path.open("w", encoding="utf-8") as f:
+        for record in [_build_summary(rows), *rows]:
+            f.write(json.dumps(record, ensure_ascii=False) + "\n")
+
+
+# Run the CLI only when this file is executed as a script, not when imported.
+if __name__ == "__main__":
+    main()
+
diff --git a/scripts/static_benchmarks/bizbench_eval.sh b/scripts/static_benchmarks/bizbench_eval.sh
@@ -0,0 +1,73 @@
+#!/bin/bash
+#SBATCH --job-name=bizbench_eval
+#SBATCH --output=logs/bizbench_eval_%A_%a.out
+#SBATCH --error=logs/bizbench_eval_%A_%a.err
+#SBATCH --time=08:00:00
+#SBATCH --cpus-per-task=4
+#SBATCH --mem=16G
+#SBATCH --array=0-50
+
+# SLURM array job: every array task runs pipeline stages 0_static, 1 and 2 on
+# kensho/bizbench for its own OFFSET/CHUNK pair and its own validation tag.
+
+# Abort on the first failing command, unset variable or failing pipeline.
+set -euo pipefail
+
+# NOTE(review): cluster-specific absolute path. Every relative path used by
+# the commands below is resolved from this checkout.
+cd /projects/DeepLesion/projects/new_ace/automated_capability_evaluation
+
+# shellcheck disable=SC1091
+source "scripts/static_benchmarks/env_slurm_inspect.sh"
+
+# Allow running either via sbatch (with SLURM_ARRAY_TASK_ID set)
+# or directly (default to a single chunk 0).
+: "${SLURM_ARRAY_TASK_ID:=0}"
+
+# Task i passes offset i*CHUNK and limit CHUNK to stage 0_static; with
+# --array=0-50 the offsets run from 0 to 5000.
+# Presumably the pipeline treats offset/limit as a row window into the split;
+# confirm in src.run_eval_pipeline.
+CHUNK=100
+OFFSET=$((SLURM_ARRAY_TASK_ID * CHUNK))
+# The tag embeds the array task id, so every task uses a distinct tag.
+VALIDATION_TAG="_BIZBENCH_Commercial_${SLURM_ARRAY_TASK_ID}_SundayNight"
+
+# Stage 0_static: build datasets from kensho/bizbench
+python -m src.run_eval_pipeline \
+  stage=0_static \
+  validation_tag="$VALIDATION_TAG" \
+  +static_benchmark_cfg.benchmark_id=kensho/bizbench \
+  +static_benchmark_cfg.split=test \
+  +static_benchmark_cfg.offset="$OFFSET" \
+  +static_benchmark_cfg.limit="$CHUNK"
+
+# Stage 1: run subject models on the static datasets
+python -m src.run_eval_pipeline \
+  stage=1 \
+  validation_tag="$VALIDATION_TAG" \
+  eval_tag="$VALIDATION_TAG"
+
+# Stage 2: aggregate scores
+python -m src.run_eval_pipeline \
+  stage=2 \
+  eval_tag="$VALIDATION_TAG"
+
+# NOTE(review): `test_exp` is hard-coded in these paths; presumably it matches
+# the experiment id configured for the pipeline. Confirm.
+echo "Stage 0_static datasets: base_output/test_exp/eval/datasets/$VALIDATION_TAG"
+echo "Stage 1 results (Inspect logs): base_output/test_exp/eval/results/$VALIDATION_TAG"
+echo "Stage 2 scores: base_output/test_exp/eval/scores/$VALIDATION_TAG"
+
+# Optional: generate flattened JSONL views of Inspect logs for easier reading.
+# Walks $RESULTS_DIR/<model>/<area>/<capability>/ and flattens the first
+# *_task_*.json log found in each capability directory.
+RESULTS_DIR="base_output/test_exp/eval/results/$VALIDATION_TAG"
+if [ -d "$RESULTS_DIR" ]; then
+  echo "Flattening Inspect logs under $RESULTS_DIR ..."
+  for model_dir in "$RESULTS_DIR"/*/; do
+    [ -d "$model_dir" ] || continue
+    model_name="$(basename "$model_dir")"
+    for area_dir in "$model_dir"*/; do
+      [ -d "$area_dir" ] || continue
+      for cap_dir in "$area_dir"*/; do
+        [ -d "$cap_dir" ] || continue
+        # Drop the trailing slash left by the */ glob so joined paths have no "//".
+        cap_dir="${cap_dir%/}"
+        cap_name="$(basename "$cap_dir")"
+        # Take the first match straight from the (sorted) glob instead of
+        # parsing `ls` output. An unmatched glob stays literal, hence the -f test.
+        log_file=""
+        for candidate in "$cap_dir"/*_task_*.json; do
+          if [ -f "$candidate" ]; then
+            log_file="$candidate"
+            break
+          fi
+        done
+        if [ -n "$log_file" ]; then
+          out_file="$cap_dir/flat_${cap_name}.jsonl"
+          python scripts/flatten_inspect_logs.py \
+            --log_path "$log_file" \
+            --out_path "$out_file"
+          echo "  Wrote flattened log for $model_name/$cap_name to $out_file"
+        fi
+      done
+    done
+  done
+fi
+
diff --git a/scripts/static_benchmarks/bizbench_local_array_eval.sh b/scripts/static_benchmarks/bizbench_local_array_eval.sh
@@ -0,0 +1,88 @@
+#!/bin/bash
+#SBATCH --job-name=gemma_bizbench_local_array
+#SBATCH --output=/projects/DeepLesion/projects/new_ace/automated_capability_evaluation/logs/bizbench_local_array_%A_%a.out
+#SBATCH --error=/projects/DeepLesion/projects/new_ace/automated_capability_evaluation/logs/bizbench_local_array_%A_%a.err
+#SBATCH --time=06:00:00
+#SBATCH --cpus-per-task=8
+#SBATCH --mem=64G
+#SBATCH --gres=gpu:a40:1
+#SBATCH --array=0-7%8
+
+# SLURM array job: each array task evaluates one shard of kensho/bizbench
+# through pipeline stages 0_static, 1_local and 2.
+
+# Abort on the first failing command, unset variable or failing pipeline.
+set -euo pipefail
+
+# NOTE(review): cluster-specific absolute path. The relative script path
+# sourced below is resolved from this checkout.
+cd /projects/DeepLesion/projects/new_ace/automated_capability_evaluation
+
+# Activate the Python virtual environment used by the pipeline.
+# shellcheck disable=SC1091
+source /projects/DeepLesion/py311_env/bin/activate
+
+# Presumably exports the environment the Inspect runs need; the file is not
+# part of this change, so confirm its contents.
+# shellcheck disable=SC1091
+source "scripts/static_benchmarks/env_slurm_inspect.sh"
+
+# Allow direct execution without sbatch by defaulting to shard 0.
+: "${SLURM_ARRAY_TASK_ID:=0}"
+
+# Must equal the number of indices in the --array directive above (0-7).
+NUM_SHARDS=8
+
+# Count only FinKnow rows that survive adapter filtering.
+TOTAL=$(
+python - <<'PY'
+from datasets import load_dataset
+
+ANSWER_KEYS = ("answer", "label", "text", "value")
+
+
+def answer_text(answer):
+    """Return the stripped answer string ('' when the answer is None)."""
+    if answer is None:
+        return ""
+    if isinstance(answer, dict):
+        # The first listed key that holds a non-None value wins.
+        for key in ANSWER_KEYS:
+            if key in answer and answer[key] is not None:
+                return str(answer[key]).strip()
+    # Non-dict answers, and dicts without a usable key, are stringified whole.
+    return str(answer).strip()
+
+
+def is_valid(row):
+    """True when the task mentions 'finknow' and question and answer are non-empty."""
+    task = str(row.get("task", "") or "").lower()
+    if "finknow" not in task:
+        return False
+    question = str(row.get("question", "")).strip()
+    return bool(question and answer_text(row.get("answer")))
+
+
+rows = load_dataset("kensho/bizbench", split="test")
+# Adapter default is `finknow_only=true`, so we shard based on the same subset.
+print(sum(map(is_valid, rows)))
+PY
+)
+
+# Ceiling division: NUM_SHARDS chunks of CHUNK rows cover all TOTAL rows.
+# NOTE(review): TOTAL counts rows after the FinKnow filter, so OFFSET/CHUNK
+# only line up if the pipeline applies offset/limit after the same filter.
+# Confirm against the adapter.
+CHUNK=$(((TOTAL + NUM_SHARDS - 1) / NUM_SHARDS))
+OFFSET=$((SLURM_ARRAY_TASK_ID * CHUNK))
+# NOTE(review): this tag does not include SLURM_ARRAY_TASK_ID, so every shard
+# passes the same validation_tag/eval_tag to the pipeline. Confirm that the
+# pipeline keeps concurrent shards from overwriting each other's outputs.
+TAG="_BIZBENCH_TEST_GEMMA_3"
+
+# Rounding CHUNK up can leave trailing shards with no rows (also when TOTAL=0).
+if [ "$OFFSET" -ge "$TOTAL" ]; then
+  echo "No work for shard ${SLURM_ARRAY_TASK_ID} (OFFSET=$OFFSET >= TOTAL=$TOTAL). Exiting."
+  exit 0
+fi
+
+echo "TOTAL=$TOTAL NUM_SHARDS=$NUM_SHARDS CHUNK=$CHUNK OFFSET=$OFFSET TAG=$TAG"
+
+# Stage 0_static: build dataset shard from BizBench test split.
+python -m src.run_eval_pipeline \
+  stage=0_static \
+  validation_tag="$TAG" \
+  +static_benchmark_cfg.benchmark_id=kensho/bizbench \
+  +static_benchmark_cfg.split=test \
+  +static_benchmark_cfg.offset="$OFFSET" \
+  +static_benchmark_cfg.limit="$CHUNK"
+
+# Stage 1_local: evaluate local subject model(s) from run_cfg.yaml.
+python -m src.run_eval_pipeline \
+  stage=1_local \
+  validation_tag="$TAG" \
+  eval_tag="$TAG"
+
+# Stage 2: aggregate per-shard scores.
+python -m src.run_eval_pipeline \
+  stage=2 \
+  eval_tag="$TAG"
+
+# NOTE(review): `test_exp` is hard-coded in these paths; presumably it matches
+# the experiment id configured for the pipeline. Confirm.
+echo "Stage 0_static datasets: base_output/test_exp/eval/datasets/$TAG"
+echo "Stage 1_local results:  base_output/test_exp/eval/results/$TAG"
+echo "Stage 2 scores:         base_output/test_exp/eval/scores/$TAG"