Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
Loading
Sorry, something went wrong. Reload?
Sorry, we cannot display this file.
Sorry, this file is invalid so it cannot be displayed.
Loading
Sorry, something went wrong. Reload?
Sorry, we cannot display this file.
Sorry, this file is invalid so it cannot be displayed.
1 change: 1 addition & 0 deletions bixbench/models.py
Original file line number Diff line number Diff line change
Expand Up @@ -188,6 +188,7 @@ class MajorityVoteConfig(BaseModel):
run: bool = False
k_value: int = 10
groups: dict[str, list[str]] = Field(default_factory=dict)
group_name_mappings: dict[str, dict[str, str]] = Field(default_factory=dict)


class RunComparisonConfig(BaseModel):
Expand Down
12 changes: 7 additions & 5 deletions bixbench/plotting_utils.py
Original file line number Diff line number Diff line change
Expand Up @@ -23,6 +23,7 @@ def majority_vote_accuracy_by_k(
random_baselines: list[float] | None = None,
random_baselines_labels: list[str] | None = None,
results_dir: str = "bixbench_results",
legend_loc: str = "upper right",
) -> None:
"""
Plot the accuracy of majority voting as a function of the number of votes (k).
Expand All @@ -33,6 +34,7 @@ def majority_vote_accuracy_by_k(
random_baselines: List of accuracy values for random baseline models
random_baselines_labels: Labels for the random baseline models
results_dir: Directory to save results
legend_loc: Location of the legend

Returns:
None: Saves the plot to disk and displays it
Expand Down Expand Up @@ -60,10 +62,10 @@ def majority_vote_accuracy_by_k(
plt.xlabel("Number of Votes (k)", fontsize=18)
plt.ylabel("Accuracy", fontsize=18)
plt.xlim(1, max(k_values))
plt.ylim(0.1, 0.35)
plt.ylim(0.15, 0.325)
plt.yticks(
np.arange(0.1, 0.36, 0.05),
[f"{x:.2f}" for x in np.arange(0.1, 0.36, 0.05)],
np.arange(0.15, 0.325, 0.05),
[f"{x:.2f}" for x in np.arange(0.15, 0.325, 0.05)],
fontsize=18,
)
plt.title("Majority Voting Accuracy", fontsize=18)
Expand All @@ -79,7 +81,7 @@ def majority_vote_accuracy_by_k(
linestyle=":",
label=label,
)
plt.legend(loc="upper left")
plt.legend(loc=legend_loc)
plt.grid(alpha=0.3, visible=True)
plt.savefig(f"{results_dir}/majority_vote_accuracy_{name}.png")
plt.show()
Expand Down Expand Up @@ -199,7 +201,7 @@ def draw_baselines(
color=random_color,
linestyle="--",
linewidth=line_width,
label="random" if not random_label_used else "",
label="" if random_label_used else "random",
Comment thread
alexandonian marked this conversation as resolved.
)
random_label_used = True

Expand Down
5 changes: 3 additions & 2 deletions bixbench/postprocessing.py
Original file line number Diff line number Diff line change
Expand Up @@ -160,7 +160,7 @@ async def run_majority_vote(
return {}

# Get configuration values
Copy link

Copilot AI Oct 6, 2025

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

The increment of k_value by 1 lacks explanation. This magic number operation should be documented with a comment explaining why the increment is necessary.

Suggested change
# Get configuration values
# Get configuration values
# Increment k_value by 1 to ensure the range in majority voting includes config.k_value itself.

Copilot uses AI. Check for mistakes.
k_value = config.k_value
k_value = config.k_value + 1
mv_groups = config.groups

# Store results for all runs
Expand All @@ -181,8 +181,9 @@ async def run_majority_vote(
# Plot results for each group if specified in config
for group_name, group_runs in mv_groups.items():
# Filter run_results to only include runs specified in the group
name_mappings = config.group_name_mappings.get(group_name, {})
filtered_results = {
run_name: run_results[run_name]
name_mappings.get(run_name, run_name): run_results[run_name]
for run_name in group_runs
if run_name in run_results
}
Expand Down
17 changes: 14 additions & 3 deletions bixbench/run_configuration/v1.5_paper_results.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -20,12 +20,23 @@ majority_vote:
- "4o_image_mcq_without_refusal"
- "claude_image_mcq_with_refusal"
- "4o_image_mcq_with_refusal"
group_name_mappings:
image_comparison:
"claude_image_mcq_with_refusal": "Claude 3.5 Sonnet w/ image"
"4o_image_mcq_with_refusal": "gpt-4o w/ image"
"claude_no_image_mcq_with_refusal": "Claude 3.5 Sonnet w/o image"
"4o_no_image_mcq_with_refusal": "gpt-4o w/o image"
refusal_option_comparison:
"claude_image_mcq_without_refusal": "Claude 3.5 Sonnet w/o refusal"
"4o_image_mcq_without_refusal": "gpt-4o w/o refusal"
"claude_image_mcq_with_refusal": "Claude 3.5 Sonnet refusal"
"4o_image_mcq_with_refusal": "gpt-4o refusal"

run_comparison:
run: true
# Adjust this based on actual number of questions in the new dataset
# Original was 2960 (296 questions x 10 iterations)
# You may need to update this based on your dataset size
# Adjust this based on the actual number of questions in the replicated dataset
# Original was 2050 (205 questions x 10 iterations)
# You only need to provide this if your replicated dataset has a different number of questions
total_questions_per_run: null
run_name_groups:
- ["4o_image_open", "claude_image_open"]
Expand Down