Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
16 changes: 16 additions & 0 deletions .gitignore
Original file line number Diff line number Diff line change
Expand Up @@ -146,3 +146,19 @@ outputs/
# inspect result logs
seed_datasets_inspect_logs/
seed_tasks_results/

# Generated experiment/evaluation artifacts
base_output/
base_output_tmp/
base_output_tmp_2/
logs_tmp/
Finance_Book1_Book2/
Finance_Book3_Book4/
Finance_Book5_Book6/
topic.csv

# Local benchmark/task JSON exports
finance_tasks.json
seed_tasks.json
task_4.json
tasks_2.json
108 changes: 108 additions & 0 deletions scripts/flatten_inspect_logs.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,108 @@
"""Utility to flatten Inspect JSON logs into a simple, readable format.

Given an Inspect eval log file (one of the large JSON files under
base_output/<exp_id>/eval/results/<eval_tag>/<model>/<area>/<capability>/),
this script writes out a JSONL file with, per row:

- id: sample id
- question: original input
- ground_truth: target string
- model_output: subject model's answer text
- grade: judge letter grade (if present, e.g. \"C\" or \"I\")

Usage:
python scripts/flatten_inspect_logs.py \\
--log_path base_output/test_exp/eval/results/_20260316_031445/\\
gpt-5-nano/static_benchmarks/integral/\\
2026-03-15T23-14-46-04-00_task_mZxA3jKBseS2smuk4ppcxN.json \\
--out_path base_output/test_exp/eval/results/_20260316_031445/\\
gpt-5-nano/static_benchmarks/integral/flat_integral.jsonl

The first line of the JSONL file is a summary object with:
- num_samples
- num_correct
- num_incorrect
- accuracy
- f1 (currently reported as equal to accuracy; "C" counts as correct, "I" as incorrect)
"""

from __future__ import annotations

import argparse
import json
from pathlib import Path
from typing import Any, Dict, List


def _message_text(content: Any) -> Any:
    """Return the plain answer text of an assistant message ``content`` value.

    Strings (and any other non-list value, including ``None``) are returned
    unchanged. A list is treated as structured content parts: the ``text`` of
    every ``{"type": "text", ...}`` part is joined with newlines, so reasoning
    or other non-text parts do not leak into the flattened answer. A list
    without any text part is returned unchanged so that no data is dropped.
    """
    if not isinstance(content, list):
        return content
    texts = [
        part["text"]
        for part in content
        if isinstance(part, dict)
        and part.get("type") == "text"
        and isinstance(part.get("text"), str)
    ]
    return "\n".join(texts) if texts else content


def flatten_inspect_log(log_path: Path) -> List[Dict[str, Any]]:
    """Flatten an Inspect eval log into one small dict per sample.

    Args:
        log_path: Path to a JSON log file whose top-level object may carry a
            ``samples`` list.

    Returns:
        One dict per sample, in log order, with the keys ``id``, ``question``
        (the sample ``input``), ``ground_truth`` (the sample ``target``),
        ``model_output`` (message content of the first output choice, or
        ``None`` when there is none) and ``grade`` (the ``value`` of the
        ``model_graded_fact`` score, or ``None`` when that score is absent).
        A missing or ``null`` ``samples`` entry yields an empty list.

    Raises:
        OSError: If the file cannot be read.
        json.JSONDecodeError: If the file is not valid JSON.
    """
    data = json.loads(log_path.read_text(encoding="utf-8"))

    # `or []` also covers an explicit `"samples": null`, which a plain
    # `.get("samples", [])` would hand to the loop as None.
    samples = data.get("samples") or []
    flattened: List[Dict[str, Any]] = []

    for s in samples:
        # Every nesting level may be missing or null, hence the `or {}` chain.
        output = s.get("output") or {}
        choices = output.get("choices") or []
        message = ((choices[0] or {}).get("message") or {}) if choices else {}

        scores = s.get("scores") or {}
        fact = scores.get("model_graded_fact") or {}

        flattened.append(
            {
                "id": s.get("id"),
                "question": s.get("input"),
                "ground_truth": s.get("target"),
                "model_output": _message_text(message.get("content")),
                "grade": fact.get("value"),
            }
        )

    return flattened


def main() -> None:
    """Command-line entry point: flatten one Inspect log into a JSONL file.

    Reads the required ``--log_path`` and ``--out_path`` options, creates the
    output file's parent directory if needed, flattens the log, and writes a
    summary object followed by one JSON object per sample.

    ``accuracy`` is ``num_correct / num_samples`` where ``num_samples`` counts
    every flattened row (graded or not); it is ``0.0`` for an empty log.
    """
    cli = argparse.ArgumentParser()
    for option in ("--log_path", "--out_path"):
        cli.add_argument(option, type=str, required=True)
    opts = cli.parse_args()

    source = Path(opts.log_path)
    destination = Path(opts.out_path)
    destination.parent.mkdir(parents=True, exist_ok=True)

    records = flatten_inspect_log(source)

    total = len(records)
    correct = sum(1 for rec in records if rec.get("grade") == "C")
    incorrect = sum(1 for rec in records if rec.get("grade") == "I")
    accuracy = 0.0 if not total else correct / total

    header = {
        "summary": True,
        "num_samples": total,
        "num_correct": correct,
        "num_incorrect": incorrect,
        "accuracy": accuracy,
        # Only per-sample grades are available here, so F1 is reported as
        # equal to accuracy.
        "f1": accuracy,
    }

    with destination.open("w", encoding="utf-8") as sink:
        # Summary first, then the samples in log order, one object per line.
        sink.writelines(
            json.dumps(entry, ensure_ascii=False) + "\n"
            for entry in (header, *records)
        )


# Run the CLI only when executed as a script, not when imported.
if __name__ == "__main__":
    main()

73 changes: 73 additions & 0 deletions scripts/static_benchmarks/bizbench_eval.sh
Original file line number Diff line number Diff line change
@@ -0,0 +1,73 @@
#!/bin/bash
#SBATCH --job-name=bizbench_eval
#SBATCH --output=logs/bizbench_eval_%A_%a.out
#SBATCH --error=logs/bizbench_eval_%A_%a.err
#SBATCH --time=08:00:00
#SBATCH --cpus-per-task=4
#SBATCH --mem=16G
#SBATCH --array=0-50

# Slurm array job: each task evaluates one CHUNK-sized slice of the
# kensho/bizbench test split (stage 0_static -> stage 1 -> stage 2) and then
# writes a flattened JSONL view of the Inspect logs it produced.
# With --array=0-50 and CHUNK=100 the array requests offsets 0..5000.

set -euo pipefail

cd /projects/DeepLesion/projects/new_ace/automated_capability_evaluation

# shellcheck disable=SC1091
source "scripts/static_benchmarks/env_slurm_inspect.sh"

# Allow running either via sbatch (with SLURM_ARRAY_TASK_ID set)
# or directly (default to a single chunk 0).
: "${SLURM_ARRAY_TASK_ID:=0}"

CHUNK=100
OFFSET=$((SLURM_ARRAY_TASK_ID * CHUNK))
# The tag embeds the array task id, so every task gets its own output dirs.
VALIDATION_TAG="_BIZBENCH_Commercial_${SLURM_ARRAY_TASK_ID}_SundayNight"

# Stage 0_static: build datasets from kensho/bizbench
python -m src.run_eval_pipeline \
    stage=0_static \
    validation_tag="$VALIDATION_TAG" \
    +static_benchmark_cfg.benchmark_id=kensho/bizbench \
    +static_benchmark_cfg.split=test \
    +static_benchmark_cfg.offset="$OFFSET" \
    +static_benchmark_cfg.limit="$CHUNK"

# Stage 1: run subject models on the static datasets
python -m src.run_eval_pipeline \
    stage=1 \
    validation_tag="$VALIDATION_TAG" \
    eval_tag="$VALIDATION_TAG"

# Stage 2: aggregate scores
python -m src.run_eval_pipeline \
    stage=2 \
    eval_tag="$VALIDATION_TAG"

echo "Stage 0_static datasets: base_output/test_exp/eval/datasets/$VALIDATION_TAG"
echo "Stage 1 results (Inspect logs): base_output/test_exp/eval/results/$VALIDATION_TAG"
echo "Stage 2 scores: base_output/test_exp/eval/scores/$VALIDATION_TAG"

# Optional: generate flattened JSONL views of Inspect logs for easier reading.
# Directory layout walked below: <results>/<model>/<area>/<capability>/.
RESULTS_DIR="base_output/test_exp/eval/results/$VALIDATION_TAG"
if [ -d "$RESULTS_DIR" ]; then
    echo "Flattening Inspect logs under $RESULTS_DIR ..."
    for model_dir in "$RESULTS_DIR"/*/; do
        [ -d "$model_dir" ] || continue
        model_name="$(basename "$model_dir")"
        for area_dir in "$model_dir"*/; do
            [ -d "$area_dir" ] || continue
            for cap_dir in "$area_dir"*/; do
                [ -d "$cap_dir" ] || continue
                cap_name="$(basename "$cap_dir")"

                # Take the first matching log via the glob itself instead of
                # parsing `ls` output. An unmatched glob stays literal and is
                # rejected by the -f test, so no `|| true` is needed under
                # `set -o pipefail`.
                log_file=""
                for candidate in "$cap_dir"*_task_*.json; do
                    if [ -f "$candidate" ]; then
                        log_file="$candidate"
                        break
                    fi
                done

                if [ -n "$log_file" ]; then
                    # cap_dir already ends in "/", so do not add another one.
                    out_file="${cap_dir}flat_${cap_name}.jsonl"
                    python scripts/flatten_inspect_logs.py \
                        --log_path "$log_file" \
                        --out_path "$out_file"
                    echo "  Wrote flattened log for $model_name/$cap_name to $out_file"
                fi
            done
        done
    done
fi

88 changes: 88 additions & 0 deletions scripts/static_benchmarks/bizbench_local_array_eval.sh
Original file line number Diff line number Diff line change
@@ -0,0 +1,88 @@
#!/bin/bash
#SBATCH --job-name=gemma_bizbench_local_array
#SBATCH --output=/projects/DeepLesion/projects/new_ace/automated_capability_evaluation/logs/bizbench_local_array_%A_%a.out
#SBATCH --error=/projects/DeepLesion/projects/new_ace/automated_capability_evaluation/logs/bizbench_local_array_%A_%a.err
#SBATCH --time=06:00:00
#SBATCH --cpus-per-task=8
#SBATCH --mem=64G
#SBATCH --gres=gpu:a40:1
#SBATCH --array=0-7%8

# Slurm array job: split the valid FinKnow rows of the kensho/bizbench test
# split into NUM_SHARDS equal chunks and run stage 0_static -> 1_local -> 2
# on the chunk selected by SLURM_ARRAY_TASK_ID.

set -euo pipefail

cd /projects/DeepLesion/projects/new_ace/automated_capability_evaluation

# shellcheck disable=SC1091
source /projects/DeepLesion/py311_env/bin/activate

# shellcheck disable=SC1091
source "scripts/static_benchmarks/env_slurm_inspect.sh"

# Allow direct execution without sbatch by defaulting to shard 0.
: "${SLURM_ARRAY_TASK_ID:=0}"

# Keep in sync with the --array range above (0-7 => 8 shards).
NUM_SHARDS=8

# NOTE(review): TAG does not include the shard id, so all array tasks use the
# same tag - confirm the pipeline keeps per-shard outputs separate.
TAG="_BIZBENCH_TEST_GEMMA_3"

# Print the number of FinKnow rows that survive adapter filtering.
count_valid_rows() {
    python - <<'PY'
from datasets import load_dataset

ANSWER_KEYS = ("answer", "label", "text", "value")


def extract_answer_text(answer):
    # None -> "", dict -> first non-None known key, anything else -> str().
    if answer is None:
        return ""
    if isinstance(answer, dict):
        for key in ANSWER_KEYS:
            if answer.get(key) is not None:
                return str(answer[key]).strip()
    return str(answer).strip()


def is_valid(row):
    # Adapter default is `finknow_only=true`, so we shard based on the same subset.
    if "finknow" not in str(row.get("task", "") or "").lower():
        return False
    if not str(row.get("question", "")).strip():
        return False
    return bool(extract_answer_text(row.get("answer")))


rows = load_dataset("kensho/bizbench", split="test")
print(sum(1 for row in rows if is_valid(row)))
PY
}

# Thin wrapper so each stage below only lists its overrides.
run_pipeline() {
    python -m src.run_eval_pipeline "$@"
}

TOTAL="$(count_valid_rows)"

# Ceiling division so the last shard picks up any remainder.
CHUNK=$(( (TOTAL + NUM_SHARDS - 1) / NUM_SHARDS ))
OFFSET=$(( SLURM_ARRAY_TASK_ID * CHUNK ))

if [ "$OFFSET" -ge "$TOTAL" ]; then
    echo "No work for shard ${SLURM_ARRAY_TASK_ID} (OFFSET=$OFFSET >= TOTAL=$TOTAL). Exiting."
    exit 0
fi

echo "TOTAL=$TOTAL NUM_SHARDS=$NUM_SHARDS CHUNK=$CHUNK OFFSET=$OFFSET TAG=$TAG"

# Stage 0_static: build dataset shard from BizBench test split.
run_pipeline \
    stage=0_static \
    validation_tag="$TAG" \
    +static_benchmark_cfg.benchmark_id=kensho/bizbench \
    +static_benchmark_cfg.split=test \
    +static_benchmark_cfg.offset="$OFFSET" \
    +static_benchmark_cfg.limit="$CHUNK"

# Stage 1_local: evaluate local subject model(s) from run_cfg.yaml.
run_pipeline \
    stage=1_local \
    validation_tag="$TAG" \
    eval_tag="$TAG"

# Stage 2: aggregate per-shard scores.
run_pipeline \
    stage=2 \
    eval_tag="$TAG"

echo "Stage 0_static datasets: base_output/test_exp/eval/datasets/$TAG"
echo "Stage 1_local results: base_output/test_exp/eval/results/$TAG"
echo "Stage 2 scores: base_output/test_exp/eval/scores/$TAG"
Loading