Future-House · alexandonian · Oct 6, 2025 · Oct 6, 2025 · Oct 6, 2025 · Copilot
diff --git a/bixbench-v1.5_results/majority_vote_accuracy_image_comparison.png b/bixbench-v1.5_results/majority_vote_accuracy_image_comparison.png
diff --git a/bixbench-v1.5_results/majority_vote_accuracy_refusal_option_comparison.png b/bixbench-v1.5_results/majority_vote_accuracy_refusal_option_comparison.png
diff --git a/bixbench/models.py b/bixbench/models.py
@@ -188,6 +188,7 @@ class MajorityVoteConfig(BaseModel):
     run: bool = False
     k_value: int = 10
     groups: dict[str, list[str]] = Field(default_factory=dict)
+    group_name_mappings: dict[str, dict[str, str]] = Field(default_factory=dict)
 
 
 class RunComparisonConfig(BaseModel):

diff --git a/bixbench/plotting_utils.py b/bixbench/plotting_utils.py
@@ -23,6 +23,7 @@ def majority_vote_accuracy_by_k(
     random_baselines: list[float] | None = None,
     random_baselines_labels: list[str] | None = None,
     results_dir: str = "bixbench_results",
+    legend_loc: str = "upper right",
 ) -> None:
     """
     Plot the accuracy of majority voting as a function of the number of votes (k).
@@ -33,6 +34,7 @@ def majority_vote_accuracy_by_k(
         random_baselines: List of accuracy values for random baseline models
         random_baselines_labels: Labels for the random baseline models
         results_dir: Directory to save results
+        legend_loc: Legend location, passed to ``plt.legend(loc=...)`` (e.g. "upper right")
 
     Returns:
         None: Saves the plot to disk and displays it
@@ -60,10 +62,10 @@ def majority_vote_accuracy_by_k(
     plt.xlabel("Number of Votes (k)", fontsize=18)
     plt.ylabel("Accuracy", fontsize=18)
     plt.xlim(1, max(k_values))
-    plt.ylim(0.1, 0.35)
+    plt.ylim(0.15, 0.325)
     plt.yticks(
-        np.arange(0.1, 0.36, 0.05),
-        [f"{x:.2f}" for x in np.arange(0.1, 0.36, 0.05)],
+        np.arange(0.15, 0.325, 0.05),
+        [f"{x:.2f}" for x in np.arange(0.15, 0.325, 0.05)],
         fontsize=18,
     )
     plt.title("Majority Voting Accuracy", fontsize=18)
@@ -79,7 +81,7 @@ def majority_vote_accuracy_by_k(
             linestyle=":",
             label=label,
         )
-    plt.legend(loc="upper left")
+    plt.legend(loc=legend_loc)
     plt.grid(alpha=0.3, visible=True)
     plt.savefig(f"{results_dir}/majority_vote_accuracy_{name}.png")
     plt.show()
@@ -199,7 +201,7 @@ def draw_baselines(
             color=random_color,
             linestyle="--",
             linewidth=line_width,
-            label="random" if not random_label_used else "",
+            label="" if random_label_used else "random",
         )
         random_label_used = True
 

diff --git a/bixbench/postprocessing.py b/bixbench/postprocessing.py
@@ -160,7 +160,7 @@ async def run_majority_vote(
         return {}
 
     # Get configuration values
+    # Use config.k_value + 1 so that the range used in majority voting includes config.k_value itself.
-    k_value = config.k_value
+    k_value = config.k_value + 1
     mv_groups = config.groups
 
     # Store results for all runs
@@ -181,8 +181,9 @@ async def run_majority_vote(
     # Plot results for each group if specified in config
     for group_name, group_runs in mv_groups.items():
         # Filter run_results to only include runs specified in the group
+        name_mappings = config.group_name_mappings.get(group_name, {})
         filtered_results = {
-            run_name: run_results[run_name]
+            name_mappings.get(run_name, run_name): run_results[run_name]
             for run_name in group_runs
             if run_name in run_results
         }

diff --git a/bixbench/run_configuration/v1.5_paper_results.yaml b/bixbench/run_configuration/v1.5_paper_results.yaml
@@ -20,12 +20,23 @@ majority_vote:
       - "4o_image_mcq_without_refusal"
       - "claude_image_mcq_with_refusal"
       - "4o_image_mcq_with_refusal"
+  group_name_mappings:
+    image_comparison:
+      "claude_image_mcq_with_refusal": "Claude 3.5 Sonnet w/ image"
+      "4o_image_mcq_with_refusal": "gpt-4o w/ image"
+      "claude_no_image_mcq_with_refusal": "Claude 3.5 Sonnet w/o image"
+      "4o_no_image_mcq_with_refusal": "gpt-4o w/o image"
+    refusal_option_comparison:
+      "claude_image_mcq_without_refusal": "Claude 3.5 Sonnet w/o refusal"
+      "4o_image_mcq_without_refusal": "gpt-4o w/o refusal"
+      "claude_image_mcq_with_refusal": "Claude 3.5 Sonnet refusal"
+      "4o_image_mcq_with_refusal": "gpt-4o refusal"
 
 run_comparison:
   run: true
-  # Adjust this based on actual number of questions in the new dataset
-  # Original was 2960 (296 questions x 10 iterations)
-  # You may need to update this based on your dataset size
+  # Adjust this based on the actual number of questions in the replicated dataset.
+  # The original was 2050 (205 questions x 10 iterations).
+  # You only need to set this if your replicated dataset has a different number of questions.
   total_questions_per_run: null
   run_name_groups:
     - ["4o_image_open", "claude_image_open"]