Future-House · geemi725 · Jun 12, 2025 · Jun 6, 2025 · Jun 6, 2025 · Jun 11, 2025
diff --git a/.github/workflows/tests.yml b/.github/workflows/tests.yml
@@ -6,33 +6,19 @@ on:
   pull_request:
 
 jobs:
-  lint:
+  pre-commit:
     runs-on: ubuntu-latest
-    strategy:
-      matrix:
-        python-version: ["3.12"]
-
     steps:
       - name: Check out Git repository
         uses: actions/checkout@v4
-      - name: Set up Python ${{ matrix.python-version }}
+
+      - name: Set up Python
         uses: actions/setup-python@v5
         with:
-          python-version: ${{ matrix.python-version }}
-      - name: Install dependencies
-        run: |
-          python -m pip install --upgrade pip
-          pip install ruff pytest numpy setuptools>=66 wheel>=0.36 build
-          if [ -f requirements.txt ]; then pip install -r requirements.txt; fi
-          if [ -f pyproject.toml ]; then pip install -e .; fi
-
-      - name: Run Lint
-        run: |
-          # Check for linting issues
-          ruff check .
-          # Check for formatting issues (will fail if code needs formatting)
-          ruff format --check .
+          python-version: "3.12"
 
+      - name: Run pre-commit
+        uses: pre-commit/action@v3.0.1
   test:
     runs-on: ubuntu-latest
     strategy:
@@ -50,7 +36,7 @@ jobs:
       - name: Install dependencies
         run: |
           python -m pip install --upgrade pip
-          pip install ruff pytest numpy setuptools>=66 wheel>=0.36 build
+          pip install ruff pytest pytest-asyncio numpy "setuptools>=66" "wheel>=0.36" build
           if [ -f requirements.txt ]; then pip install -r requirements.txt; fi
           if [ -f pyproject.toml ]; then pip install -e .; fi
 

diff --git a/.pre-commit-config.yaml b/.pre-commit-config.yaml
@@ -42,6 +42,7 @@ repos:
     hooks:
       - id: codespell
         additional_dependencies: [".[toml]"]
+        args: ["--skip=.git"]
   - repo: https://github.com/henryiii/validate-pyproject-schema-store
     rev: 2025.02.24
     hooks:

diff --git a/README.md b/README.md
@@ -140,7 +140,7 @@ The script will save the evaluation dataframe as a CSV file in the `bixbench_res
 
 ## Zero-shot Evaluations & Grading
 
-You can run zero-shot evaluations using the `run_zeroshot_evals.py` script and then automatically grade the responses using the `grade_outputs.py` script. This code:
+You can run zero-shot evaluations using the `generate_zeroshot_evals.py` script and then grade the responses using the `grade_outputs.py` script. This two-step workflow:
 
 1. Loads the BixBench dataset from Hugging Face
 2. Evaluates the LLM on the dataset, outputting a CSV file with the results
@@ -149,6 +149,23 @@ You can run zero-shot evaluations using the `run_zeroshot_evals.py` script and t
 
 The scripts can be configured to run with open-ended questions, multiple-choice questions (with or without a refusal option), different models, and different temperatures. To explore the different options, run the scripts with the `--help` flag.
 
+**Example: Generate zero-shot answers in the MCQ setting with the refusal option (offered in addition to the original distractors)**
+
+```bash
+python generate_zeroshot_evals.py \
+        --answer-mode "mcq" \
+        --model "gpt-4o" \
+        --with-refusal
+```
+
+**Example: Grade the zero-shot answers from the previous step**
+
+```bash
+python grade_outputs.py \
+        --input-file path/to/zeroshot.csv \
+        --answer-mode "mcq"
+```
+
 ## Replicating the BixBench Paper Results
 
 To replicate the BixBench paper results for agentic evaluations, you can download the raw data from 2,120 trajectories and its respective postprocessed evaluation dataframe:

diff --git a/bixbench/__init__.py b/bixbench/__init__.py
@@ -1,23 +1,31 @@
-from .graders import compute_metrics, grade_mcq_answer, grade_open_ended_answer
+from .graders import GradeAnswer, MCQGrader, OpenEndedGrader
 from .prompts import (
     MCQ_PROMPT_TEMPLATE_WITH_REFUSAL,
     MCQ_PROMPT_TEMPLATE_WITHOUT_REFUSAL,
     OPEN_ENDED_PROMPT_TEMPLATE,
 )
-from .utils import AgentInput, EvalMode, LLMConfig, parse_response, randomize_choices
+from .utils import (
+    AnswerMode,
+    LLMConfig,
+    Query,
+    compute_metrics,
+    parse_response,
+    randomize_choices,
+)
 from .zero_shot import ZeroshotBaseline
 
 __all__ = [
     "MCQ_PROMPT_TEMPLATE_WITHOUT_REFUSAL",
     "MCQ_PROMPT_TEMPLATE_WITH_REFUSAL",
     "OPEN_ENDED_PROMPT_TEMPLATE",
-    "AgentInput",
-    "EvalMode",
+    "AnswerMode",
+    "GradeAnswer",
     "LLMConfig",
+    "MCQGrader",
+    "OpenEndedGrader",
+    "Query",
     "ZeroshotBaseline",
     "compute_metrics",
-    "grade_mcq_answer",
-    "grade_open_ended_answer",
     "parse_response",
     "randomize_choices",
 ]
diff --git a/bixbench/generate_trajectories.py b/bixbench/generate_trajectories.py
@@ -240,9 +240,9 @@ def environment_factory(self, capsule: dict[str, Any]) -> DataAnalysisEnv:
             load_mcq(i, open_question=True, question_id=i["id"]) for i in raw_questions
         ]
         problem = self.config.base_prompt.format(
-            questions="\n-------\n".join([
-                i.question_prompt for i in processed_questions
-            ])
+            questions="\n-------\n".join(
+                [i.question_prompt for i in processed_questions]
+            )
         )
         answer = {i.question_id: i.ideal_answer for i in processed_questions}
         work_dir = (self.config.local_workspace_dir / capsule["uuid"]).absolute()
@@ -369,9 +369,12 @@ async def batch_rollout(
         agent = self.config.agent_config.construct_agent()
         rollout_manager = getattr(self, f"{self.config.rollout.rollout_type}_rollout")
 
-        return await asyncio.gather(*[
-            rollout_manager(agent, environment) for environment in list_of_environments
-        ])
+        return await asyncio.gather(
+            *[
+                rollout_manager(agent, environment)
+                for environment in list_of_environments
+            ]
+        )
 
     async def run(self) -> None:
         """Run the full trajectory generation pipeline."""
@@ -393,9 +396,11 @@ async def run(self) -> None:
 
                 # Update progress bar
                 pbar.update(len(batch))
-                pbar.set_postfix({
-                    "batch": f"{i // self.config.rollout.batch_size + 1}/{total_batches}"
-                })
+                pbar.set_postfix(
+                    {
+                        "batch": f"{i // self.config.rollout.batch_size + 1}/{total_batches}"
+                    }
+                )
 
 
 if __name__ == "__main__":