From 76369ac019bd9f8048b77b75455a2750b3d4455d Mon Sep 17 00:00:00 2001
From: Alex Andonian
Date: Thu, 25 Sep 2025 03:16:57 +0000
Subject: [PATCH 01/16] Enhance zero-shot evaluation and grading functionality
- Updated argument parsing to include `--dataset-split` and `--num-examples` options.
- Refactored evaluation logic to streamline processing of questions and capsules.
- Introduced new grading prompt for range evaluations.
- Improved handling of evaluation modes in processing functions.
- Added support for replica IDs in trajectory generation.
- Enhanced data loading and processing for better performance and clarity.
---
bixbench/generate_trajectories.py | 262 ++++++++++++++++++++----------
bixbench/graders.py | 92 +++++++----
bixbench/models.py | 81 +++++----
bixbench/plot_style.py | 13 +-
bixbench/plotting_utils.py | 19 +--
bixbench/postprocessing.py | 102 ++++++++----
bixbench/postprocessing_utils.py | 206 ++++++++++++-----------
bixbench/prompts.py | 9 +
bixbench/utils.py | 46 +++++-
bixbench/zero_shot.py | 10 +-
generate_zeroshot_evals.py | 50 +++---
11 files changed, 562 insertions(+), 328 deletions(-)
diff --git a/bixbench/generate_trajectories.py b/bixbench/generate_trajectories.py
index 7b64227..7ab8b4a 100644
--- a/bixbench/generate_trajectories.py
+++ b/bixbench/generate_trajectories.py
@@ -1,5 +1,4 @@
import argparse
-import ast
import asyncio
import json
import logging
@@ -8,10 +7,10 @@
from typing import Any
import datasets
-import litellm
import yaml
+from aviary.core import MultipleChoiceQuestion
from fhda.data_analysis_env import DataAnalysisEnv
-from fhda.utils import collect_notebook_stats, load_mcq
+from fhda.utils import collect_notebook_stats
from huggingface_hub import hf_hub_download
from ldp.agent import Agent
from ldp.alg.rollout import RolloutManager
@@ -19,8 +18,8 @@
from tqdm.auto import tqdm
from bixbench.models import BixbenchConfig
+from bixbench.utils import as_completed_with_concurrency
-litellm.verbose = False
logging.basicConfig(
level=logging.INFO,
format="%(asctime)s - %(name)s - %(levelname)s - %(message)s",
@@ -49,14 +48,18 @@ class TrajectoryGenerator:
trajectories.
"""
- def __init__(self, config_path=DEFAULT_CONFIG_PATH) -> None:
+ def __init__(
+ self, config_path=DEFAULT_CONFIG_PATH, replica_id: int | None = None
+ ) -> None:
"""
Initialize the TrajectoryGenerator with config and create necessary directories.
Args:
config_path: Path to the configuration file
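+ replica_id: Optional replica index; when set, it is used as a suffix on trajectory file names and per-question work directories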
"""
self.config = self.load_config(config_path)
+ self.replica_id = replica_id
# Create directories
self.config.local_workspace_dir.mkdir(parents=True, exist_ok=True)
self.config.local_trajectories_dir.mkdir(parents=True, exist_ok=True)
@@ -81,26 +84,22 @@ def load_config(self, config_path) -> BixbenchConfig:
# Create and validate the config using Pydantic
return BixbenchConfig(**config_dict)
- async def process_capsule(self, capsule: dict[str, Any]) -> None:
+ async def process_capsule_data(self, zip_filename: str) -> None:
"""
- Process a single benchmark capsule by downloading and extracting necessary files.
+ Process capsule data by downloading and extracting necessary files.
Args:
- capsule: Dictionary containing capsule information
+ zip_filename: Name of the zip file to process
"""
- zip_filename = capsule["data_folder"]
- extract_dir = self.config.local_data_folder / zip_filename.replace(".zip", "")
zip_path = self.config.local_data_folder / zip_filename
-
+ extract_dir = self.config.local_data_folder / zip_filename.replace(".zip", "")
# Check if capsule folder exists and is non-empty
if extract_dir.exists() and any(extract_dir.iterdir()):
logger.debug(
"Capsule folder %s already exists and is non-empty", extract_dir.name
)
- capsule["local_data_folder"] = extract_dir
return
- # Download and process if not already present
await asyncio.to_thread(
hf_hub_download,
repo_id=self.config.paths.hf_repo_id,
@@ -108,23 +107,47 @@ async def process_capsule(self, capsule: dict[str, Any]) -> None:
local_dir=self.config.local_data_folder,
repo_type="dataset",
)
-
await asyncio.to_thread(self._extract_and_process_files, zip_path, extract_dir)
- capsule["local_data_folder"] = extract_dir
+
+ async def process_question(self, question: dict[str, Any]) -> None:
+ """
+ Process a single benchmark question by downloading and extracting necessary files.
+
+ Args:
+ question: Dictionary containing question information
+ """
+ zip_filename: str = question["data_folder"]
+ extract_dir = self.config.local_data_folder / zip_filename.replace(".zip", "")
+
+ # Check if capsule folder exists and is non-empty
+ if extract_dir.exists() and any(extract_dir.iterdir()):
+ logger.debug(
+ "Question folder %s already exists and is non-empty", extract_dir.name
+ )
+ question["local_data_folder"] = extract_dir
+ return
+
+ # Download and process if not already present
+ await self.process_capsule_data(zip_filename)
+
+ question["local_data_folder"] = extract_dir
async def load_bixbench(self) -> list[dict[str, Any]]:
"""
- Load BixBench dataset and process all capsules.
+ Load BixBench dataset and process all questions.
Returns:
- List[Dict[str, Any]]: List of processed benchmark capsules
+ List[Dict[str, Any]]: List of processed benchmark questions
"""
- bixbench = datasets.load_dataset(
- self.config.paths.hf_repo_id, split="train"
- ).to_list()
+ bixbench = datasets.load_dataset(self.config.paths.hf_repo_id, split=self.config.dataset_split).to_list() # type: ignore[attr-defined]
+
+ # Process all capsule data concurrently
+ zip_filenames = {question["data_folder"] for question in bixbench}
+ tasks = [self.process_capsule_data(zip_fname) for zip_fname in zip_filenames]
+ await asyncio.gather(*tasks)
- # Process all capsules concurrently
- tasks = [self.process_capsule(capsule) for capsule in bixbench]
+ # Prepare all questions concurrently
+ tasks = [self.process_question(question) for question in bixbench]
await asyncio.gather(*tasks)
return bixbench
@@ -166,17 +189,21 @@ def _extract_and_process_files(self, zip_path: Path, extract_dir: Path) -> None:
),
None,
)
- shutil.rmtree(notebook_folder)
+ if notebook_folder is not None:
+ shutil.rmtree(notebook_folder)
except StopIteration:
# No Notebook folder found, that's okay
- pass
+ logger.debug("No Notebook folder found")
# Remove any .ipynb files in the extract directory
for ipynb_file in extract_dir.glob("*.ipynb"):
ipynb_file.unlink()
# Remove the zip file
- zip_path.unlink()
+ try:
+ zip_path.unlink()
+ except FileNotFoundError:
+ logger.debug("Zip file not found")
async def store_trajectory(
self, trajectory: Trajectory, env: DataAnalysisEnv
@@ -188,68 +215,95 @@ async def store_trajectory(
trajectory: The trajectory to store
env: The environment that generated the trajectory
"""
+ metadata = (
+ {k: v for k, v in env.metadata.items() if k != "local_data_folder"}
+ if env.metadata
+ else {}
+ )
+ mcqs = metadata.pop("mcqs", [])
+ mcq_options = mcqs[0].options if mcqs else []
+ mcq_question = mcqs[0].question if mcqs else ""
+ refusal_option = mcqs[0].unsure_answer_letter if mcqs else None
+
extract = {
"problem_id": env.problem_id,
"agent_answer": env.state.answer,
"ideal_answer": env.answer,
"problem": env.problem,
- "mcq_options": [q.options for q in env.mcqs] if env.mcqs else [],
- "mcq_question": [q.question for q in env.mcqs] if env.mcqs else [],
+ "mcq_options": mcq_options,
+ "mcq_question": mcq_question,
"notebook_stats": collect_notebook_stats(env.state.nb),
"num_actions": len(env.state.actions),
"question_format": self.config.capsule.mode,
"refusal_option": self.config.capsule.include_refusal_option,
"model": self.config.agent.agent_kwargs["llm_model"]["name"],
# Local data folder is not serializable
- "metadata": {
- k: v for k, v in env.metadata.items() if k != "local_data_folder"
- },
- "refusal_options": {
- q.question_id: q.unsure_answer_letter for q in (env.mcqs or [])
- },
+ "metadata": metadata,
+ "refusal_options": refusal_option,
"nb": env.state.nb,
"run_name": self.config.run_name,
}
- # Download run metadata
- with (self.config.local_trajectories_dir / f"{env.problem_id}.json").open(
- "w"
- ) as f:
- json.dump(
- extract,
- f,
- indent=4,
- )
- # Download run trajectory
+ # Store trajectory metadata
+ filename = self.get_trajectory_path(env.problem_id)
+ with filename.open("w") as f:
+ json.dump(extract, f, indent=4)
+ # Store trajectory
await trajectory.to_jsonl(
- self.config.local_trajectories_dir / f"{env.problem_id}.jsonl"
+ self.config.local_trajectories_dir
+ / str(filename).replace(".json", ".jsonl")
)
- def environment_factory(self, capsule: dict[str, Any]) -> DataAnalysisEnv:
+ def get_trajectory_path(self, problem_id: str) -> Path:
+ """Get the path to the trajectory for a given problem ID.
+
+ Args:
+ problem_id: The problem ID
+
+ Returns:
+ Path: The path to the trajectory
"""
- Create a DataAnalysisEnv instance from a capsule.
+ if self.replica_id is not None:
+ return (
+ self.config.local_trajectories_dir
+ / f"{problem_id}_replica_{self.replica_id}.json"
+ )
+ return self.config.local_trajectories_dir / f"{problem_id}.json"
+
+ def environment_factory(self, question: dict[str, Any]) -> DataAnalysisEnv:
+ """
+ Create a DataAnalysisEnv instance from a question.
Args:
- capsule: Dictionary containing capsule information
+ question: Dictionary containing question information
Returns:
DataAnalysisEnv: Initialized environment
"""
- raw_questions = ast.literal_eval(capsule["questions"])
- processed_questions = [
- load_mcq(i, open_question=True, question_id=i["id"]) for i in raw_questions
- ]
+ processed_question = load_mcq(
+ question, open_question=True, question_id=question["question_id"]
+ )
+ question["mcqs"] = [processed_question]
+
+ language = self.config.notebook.language
problem = self.config.base_prompt.format(
- questions="\n-------\n".join(
- [i.question_prompt for i in processed_questions]
- )
+ question=question["question"], language=language
)
- answer = {i.question_id: i.ideal_answer for i in processed_questions}
- work_dir = (self.config.local_workspace_dir / capsule["uuid"]).absolute()
+ answer = question["ideal"]
+ question_id = question["question_id"]
+ if self.replica_id is not None:
+ question_id = f"{question_id}_replica_{self.replica_id}"
+ work_dir = (
+ self.config.local_workspace_dir
+ / self.config.run_name
+ / question["capsule_uuid"]
+ / question_id
+ ).absolute()
work_dir.mkdir(parents=True, exist_ok=True)
- local_capsule_data_path = self.config.local_data_folder / capsule[
+ local_capsule_data_path = self.config.local_data_folder / question[
"data_folder"
].replace(".zip", "")
+
# Copy all files from data folder to work directory
for item in local_capsule_data_path.iterdir():
if item.is_file():
@@ -259,20 +313,19 @@ def environment_factory(self, capsule: dict[str, Any]) -> DataAnalysisEnv:
nb_path = work_dir / self.config.notebook.name
# Add some extra metadata from config
- capsule["avoid_images"] = self.config.capsule.avoid_images
- capsule["include_refusal_option"] = self.config.capsule.include_refusal_option
+ question["avoid_images"] = self.config.capsule.avoid_images
+ question["include_refusal_option"] = self.config.capsule.include_refusal_option
env_args = {
- "problem_id": capsule["short_id"],
+ "problem_id": question["question_id"],
"problem": problem,
"eval_mode": self.config.capsule.eval_mode,
"nb_path": nb_path,
"work_dir": work_dir,
"language": self.config.notebook.language,
"system_prompt": self.config.system_prompt,
- "metadata": capsule,
+ "metadata": question,
"answer": answer,
- "mcqs": processed_questions,
"use_tmp_work_dir": False,
}
@@ -280,7 +333,7 @@ def environment_factory(self, capsule: dict[str, Any]) -> DataAnalysisEnv:
async def custom_rollout(
self, agent: Agent, environment: DataAnalysisEnv
- ) -> Trajectory:
+ ) -> tuple[Trajectory, DataAnalysisEnv]:
"""
Custom implementation of rollout logic.
@@ -341,7 +394,7 @@ async def vanilla_rollout(
async def batch_rollout(
self, list_of_environments: list[DataAnalysisEnv]
- ) -> list[Trajectory | tuple[Trajectory, DataAnalysisEnv]]:
+ ) -> list[tuple[Trajectory, DataAnalysisEnv]]:
"""
Run rollouts for a batch of environments.
@@ -359,12 +412,7 @@ async def batch_rollout(
environments=list_of_environments,
max_steps=self.config.rollout.max_steps,
)
- return [
- (trajectory, environment)
- for trajectory, environment in zip(
- trajectories, list_of_environments, strict=True
- )
- ]
+ return list(zip(trajectories, list_of_environments, strict=True))
agent = self.config.agent_config.construct_agent()
rollout_manager = getattr(self, f"{self.config.rollout.rollout_type}_rollout")
@@ -376,31 +424,65 @@ async def batch_rollout(
]
)
+ def filter_completed(self, bixbench: list[dict[str, Any]]) -> list[dict[str, Any]]:
+ """Filter out completed trajectories."""
+ return [
+ question
+ for question in bixbench
+ if not self.get_trajectory_path(question["question_id"]).exists()
+ ]
+
+ async def rollout_with_saving(
+ self, environment: DataAnalysisEnv
+ ) -> list[tuple[Trajectory, DataAnalysisEnv]]:
+ trajectory, env = await self.vanilla_rollout(
+ self.config.agent_config.construct_agent(), environment
+ )
+ await self.store_trajectory(trajectory, env)
+ return trajectory, env
+
async def run(self) -> None:
"""Run the full trajectory generation pipeline."""
logger.info("Loading BixBench dataset...")
bixbench = await self.load_bixbench()
- # Process environments in batches with tqdm progress bar
- total_batches = (
- len(bixbench) + self.config.rollout.batch_size - 1
- ) // self.config.rollout.batch_size
+ if self.config.rollout.skip_existing_trajectories:
+ bixbench = self.filter_completed(bixbench)
+ # Process environments in batches with tqdm progress bar
with tqdm(total=len(bixbench), desc="Processing benchmark tasks") as pbar:
for i in range(0, len(bixbench), self.config.rollout.batch_size):
- batch = bixbench[i : i + self.config.rollout.batch_size]
- environments = [self.environment_factory(capsule) for capsule in batch]
- results = await self.batch_rollout(environments)
- for trajectory, env in results:
- await self.store_trajectory(trajectory, env)
-
- # Update progress bar
- pbar.update(len(batch))
- pbar.set_postfix(
- {
- "batch": f"{i // self.config.rollout.batch_size + 1}/{total_batches}"
- }
+ bsz = min(self.config.rollout.batch_size, len(bixbench) - i)
+ batch = bixbench[i : i + (4 * bsz)]
+ environments = (
+ self.environment_factory(question) for question in batch
)
+ rollouts = (self.rollout_with_saving(env) for env in environments)
+ try:
+ async for _ in as_completed_with_concurrency(
+ rollouts, bsz, timeout=3600
+ ):
+ pbar.update(1)
+ except TimeoutError:
+ logger.warning("Timeout occurred while rolling out environments")
+ continue
+ logger.info("Completed trajectory generation")
+
+
+def load_mcq(
+ mcq: dict, open_question: bool = False, question_id: str | None = None
+) -> MultipleChoiceQuestion:
+ return MultipleChoiceQuestion(
+ question=mcq["question"],
+ options=[
+ mcq["ideal"],
+ *mcq["distractors"],
+ ],
+ ideal_answer=mcq["ideal"],
+ shuffle_seed=MultipleChoiceQuestion.SEED_USING_QUESTION,
+ prompt_without_options=open_question,
+ question_id=question_id or "Q",
+ )
if __name__ == "__main__":
@@ -413,7 +495,13 @@ async def run(self) -> None:
default=str(DEFAULT_CONFIG_PATH),
help="Path to the configuration YAML file",
)
+ parser.add_argument(
+ "--replica_id",
+ type=int,
+ default=None,
+ help="Replica ID",
+ )
args = parser.parse_args()
- generator = TrajectoryGenerator(args.config_file)
+ generator = TrajectoryGenerator(args.config_file, args.replica_id)
asyncio.run(generator.run())
diff --git a/bixbench/graders.py b/bixbench/graders.py
index 43f2b63..f2e81b0 100644
--- a/bixbench/graders.py
+++ b/bixbench/graders.py
@@ -1,12 +1,13 @@
import ast
import re
from enum import StrEnum
-from typing import Any, Literal, Optional
+from typing import Any, Literal, Self
from aviary.core import Message
+from lmi import LiteLLMModel
from pydantic import BaseModel, Field, field_validator, model_validator
-from .prompts import OPEN_ENDED_GRADING_PROMPT
+from .prompts import OPEN_ENDED_GRADING_PROMPT, OPEN_ENDED_RANGE_GRADING_PROMPT
from .utils import AnswerMode
@@ -42,8 +43,8 @@ class GradeResult(BaseModel):
grade: int = Field(ge=0, le=1, description="Numeric grade (0 or 1)")
correct: bool = Field(description="Whether the answer is correct")
refusal: bool = Field(description="Whether the answer is a refusal")
- grade_type: Optional[GradeType] = Field(default=None, description="Type of grade")
- raw_response: Optional[str] = Field(
+ grade_type: GradeType | None = Field(default=None, description="Type of grade")
+ raw_response: str | None = Field(
default=None, description="Raw LLM response for open-ended"
)
@@ -57,11 +58,9 @@ class GradingFunction(BaseModel):
def _parse_grade_response(self, response: str) -> GradeType:
"""Parse the grade from LLM response."""
match = re.search(r"\s*(.*?)\s*", response, re.DOTALL)
- grade = match.group(1).strip().lower() if match else None
+ grade = match[1].strip().lower() if match else None
- if grade == "correct":
- return GradeType.CORRECT
- return GradeType.INCORRECT
+ return GradeType.CORRECT if grade == "correct" else GradeType.INCORRECT
async def _grade_str_verifier(
self,
@@ -110,17 +109,17 @@ async def _grade_str_verifier(
async def _grade_llm_verifier(
self,
- question,
- target,
- predicted,
- llm_client,
+ question: str,
+ target: str,
+ predicted: str,
+ llm_client: LiteLLMModel,
grading_prompt_template=OPEN_ENDED_GRADING_PROMPT,
) -> GradeResult:
grading_query = grading_prompt_template.format(
question=question, target=target, predicted=predicted
)
completion = await llm_client.call_single([Message(content=grading_query)])
- response = completion.model_dump()["text"]
+ response = completion.text or ""
grade_type = self._parse_grade_response(response)
return GradeResult(
@@ -131,6 +130,28 @@ async def _grade_llm_verifier(
raw_response=response,
)
+ async def _grade_range_llm_verifier(
+ self,
+ question,
+ target: str,
+ predicted: str,
+ llm_client: LiteLLMModel,
+ grading_prompt_template=OPEN_ENDED_RANGE_GRADING_PROMPT,
+ ) -> GradeResult:
+ grading_query = grading_prompt_template.format(
+ question=question, target=target, predicted=predicted
+ )
+ completion = await llm_client.call_single([Message(content=grading_query)])
+ response = completion.text or ""
+ grade_type = self._parse_grade_response(response)
+
+ return GradeResult(
+ grade=grade_type.numeric_grade,
+ correct=grade_type.is_correct,
+ refusal=grade_type.is_refused,
+ grade_type=grade_type,
+ )
+
def _grade_range_verifier(
self,
target: str,
@@ -163,10 +184,11 @@ async def grade(
self,
target: str,
predicted: str,
- unsure: Optional[str] = None,
- evaluation_mode: Literal["str_verifier", "range_verifier"] = "str_verifier",
+ unsure: str | None = None,
+ evaluation_mode: Literal[
+ "str_verifier", "range_verifier", "llm_verifier"
+ ] = "str_verifier",
) -> GradeResult:
-
grading_func = GradingFunction()
if evaluation_mode == "str_verifier":
return await grading_func._grade_str_verifier(
@@ -181,14 +203,14 @@ async def grade(
target=target, predicted=predicted
)
- raise ValueError(f"Unknown eval_mode: {self.eval_mode}")
+ raise ValueError(f"Unknown eval_mode: {evaluation_mode}")
class OpenEndedGrader(BaseModel):
"""Grader for open-ended questions."""
evaluation_mode: Literal["llm_verifier", "str_verifier", "range_verifier"] = Field(
- default="llm", description="Evaluation mode for open-ended answers"
+ default="llm_verifier", description="Evaluation mode for open-ended answers"
)
llm_client: Any = Field(description="LLM client for grading")
grading_prompt_template: str = Field(
@@ -208,7 +230,7 @@ def validate_eval_mode(cls, v: str) -> str:
return v
@model_validator(mode="after")
- def validate_llm_client(self) -> "OpenEndedGrader":
+ def validate_llm_client(self) -> Self:
"""Ensure llm_client is provided when using llm_verifier mode."""
if self.evaluation_mode == "llm_verifier" and not self.llm_client:
raise ValueError("llm_client is required when using llm_verifier mode")
@@ -219,8 +241,8 @@ async def grade(
question: str,
target: str,
predicted: str,
- partial_match: Optional[bool] = True,
- llm_match: Optional[bool] = True,
+ partial_match: bool = True,
+ llm_match: bool = True,
) -> GradeResult:
"""Grade an open-ended answer."""
grading_func = GradingFunction()
@@ -236,8 +258,12 @@ async def grade(
)
if self.evaluation_mode == "range_verifier":
- return grading_func._grade_range_verifier(
- target=target, predicted=predicted
+ return await grading_func._grade_range_llm_verifier(
+ question=question,
+ target=target,
+ predicted=predicted,
+ llm_client=self.llm_client,
+ grading_prompt_template=self.grading_prompt_template,
)
if self.evaluation_mode == "llm_verifier":
@@ -248,7 +274,7 @@ async def grade(
llm_client=self.llm_client,
grading_prompt_template=self.grading_prompt_template,
)
- raise ValueError(f"Unknown eval_mode: {self.eval_mode}")
+ raise ValueError(f"Unknown eval_mode: {self.evaluation_mode}")
class GradeAnswer(BaseModel):
@@ -264,15 +290,15 @@ async def grade(
self,
target: str,
predicted: str,
- question: Optional[str] = None,
- unsure: Optional[str] = None,
+ question: str | None = None,
+ unsure: str | None = None,
evaluation_mode: Literal[
"llm_verifier", "str_verifier", "range_verifier"
] = "str_verifier",
- partial_match: Optional[bool] = False,
- llm_match: Optional[bool] = False,
+ partial_match: bool = False,
+ llm_match: bool = False,
) -> tuple[int, bool, bool]:
-
+ print(f"Grading: {question}, {target}, {predicted}")
if self.answer_mode == AnswerMode.mcq:
mcq_grader = MCQGrader()
result = await mcq_grader.grade(
@@ -284,16 +310,20 @@ async def grade(
return result.grade, result.correct, result.refusal
if self.answer_mode == AnswerMode.openanswer:
+ assert question is not None
open_ended_grader = OpenEndedGrader(
evaluation_mode=evaluation_mode, llm_client=self.llm_client
)
result = await open_ended_grader.grade(
question=question,
- target=str(target),
- predicted=str(predicted),
+ target=target,
+ predicted=predicted,
partial_match=partial_match,
llm_match=llm_match,
)
+ print("===============")
+ print(f"Result: {result}")
+ print("===============")
return result.grade, result.correct, result.refusal
raise ValueError(f"Unknown answer mode: {self.answer_mode}")
diff --git a/bixbench/models.py b/bixbench/models.py
index 55455e7..e00ab56 100644
--- a/bixbench/models.py
+++ b/bixbench/models.py
@@ -1,11 +1,12 @@
from pathlib import Path
-from typing import Any, Literal, Optional
+from typing import Any, Literal
+import fhda.config as cfg
from aviary.utils import EvalAnswerMode
from fhda import prompts
from fhda.utils import NBLanguage
from ldp.agent import AgentConfig
-from pydantic import BaseModel, Field, field_validator, model_validator
+from pydantic import BaseModel, Field, computed_field, field_validator, model_validator
class AgentSettings(BaseModel):
@@ -23,6 +24,7 @@ class RolloutSettings(BaseModel):
max_steps: int
batch_size: int
rollout_type: str = "vanilla"
+ skip_existing_trajectories: bool = True
@classmethod
@field_validator("max_steps")
@@ -50,6 +52,7 @@ def validate_rollout_type(cls, v):
class NotebookSettings(BaseModel):
name: str
language: NBLanguage
+ use_docker: bool = True
@classmethod
@field_validator("language")
@@ -63,6 +66,12 @@ def validate_language(cls, v):
f"Invalid language: {v}. Must be convertible to NBLanguage enum."
) from err
+ @model_validator(mode="after")
+ def validate_use_docker(self):
+ if self.use_docker:
+ cfg.USE_DOCKER = True
+ return self
+
class PromptTemplates(BaseModel):
mcq: str
@@ -117,19 +126,45 @@ class BixbenchConfig(BaseModel):
notebook: NotebookSettings
capsule: CapsuleSettings
paths: PathSettings
- postprocessing: Optional[PostProcessingSettings] = None
+ postprocessing: PostProcessingSettings | None = None
# Computed fields that come from processing the raw config
- agent_config: Optional[AgentConfig] = None
- system_prompt: Optional[str] = None
- base_prompt: Optional[str] = None
- local_workspace_dir: Optional[Path] = None
- local_trajectories_dir: Optional[Path] = None
- local_data_folder: Optional[Path] = None
+ system_prompt: str | None = None
+ dataset_split: str = "train"
class Config:
arbitrary_types_allowed = True
+ @computed_field
+ @property
+ def local_workspace_dir(self) -> Path:
+ return Path(self.paths.workspace_dir).absolute()
+
+ @computed_field
+ @property
+ def local_trajectories_dir(self) -> Path:
+ return Path(self.paths.trajectories_dir).absolute() / self.run_name
+
+ @computed_field
+ @property
+ def local_data_folder(self) -> Path:
+ return Path(self.paths.data_folder).absolute()
+
+ @computed_field
+ @property
+ def base_prompt(self) -> str:
+ prompt = getattr(
+ prompts, self.capsule.prompt_templates.model_dump()[self.capsule.mode]
+ )
+ if self.capsule.avoid_images:
+ prompt += "\n" + prompts.AVOID_IMAGES
+ return prompt
+
+ @computed_field
+ @property
+ def agent_config(self) -> AgentConfig:
+ return self.agent.construct_agent_config()
+
@model_validator(mode="after")
def set_derived_fields(self):
# Ensure eval_mode is properly set to None if it's "None"
@@ -138,26 +173,8 @@ def set_derived_fields(self):
) and self.capsule.eval_mode.lower() in {"none", "null", ""}:
self.capsule.eval_mode = None
- # Create AgentConfig
- self.agent_config = self.agent.construct_agent_config()
-
# Get system prompt and base prompt
self.system_prompt = getattr(prompts, self.capsule.system_prompt)
- capsule_mode = self.capsule.mode
- self.base_prompt = getattr(
- prompts, self.capsule.prompt_templates.model_dump()[capsule_mode]
- )
- if self.capsule.avoid_images:
- self.base_prompt += "\n" + prompts.AVOID_IMAGES
-
- # Set absolute path values
- path_dict = self.paths.get_absolute_paths()
- self.local_workspace_dir = path_dict["local_workspace_dir"] / self.run_name
- self.local_trajectories_dir = (
- path_dict["local_trajectories_dir"] / self.run_name
- )
- # We can share the data folder across runs
- self.local_data_folder = path_dict["local_data_folder"]
return self
@@ -176,18 +193,19 @@ class MajorityVoteConfig(BaseModel):
class RunComparisonConfig(BaseModel):
run: bool = True
# This is used to account for environment failures that don't always show up in the data
- total_questions_per_run: int = 296
+ total_questions_per_run: int | None = None
run_name_groups: list[list[str]] = Field(default_factory=list)
group_titles: list[str] = Field(default_factory=list)
color_groups: list[str] = Field(default_factory=list)
use_zero_shot_baselines: bool = False
- random_baselines: list[Optional[float]] = Field(default_factory=list)
+ random_baselines: list[float | None] = Field(default_factory=list)
baseline_name_mappings: dict[str, str] = Field(default_factory=dict)
class PostprocessingConfig(BaseModel):
data_path: str = "data/trajectories/"
results_dir: str = "bixbench_results"
+ eval_df_filename: str = "eval_df.csv"
debug: bool = False
replicate_paper_results: PaperReplicationConfig = Field(
@@ -195,3 +213,8 @@ class PostprocessingConfig(BaseModel):
)
majority_vote: MajorityVoteConfig = Field(default_factory=MajorityVoteConfig)
run_comparison: RunComparisonConfig = Field(default_factory=RunComparisonConfig)
+
+ @computed_field
+ @property
+ def eval_df_path(self) -> Path:
+ return Path(self.results_dir) / self.eval_df_filename
diff --git a/bixbench/plot_style.py b/bixbench/plot_style.py
index 92aa9b5..3cce6b3 100644
--- a/bixbench/plot_style.py
+++ b/bixbench/plot_style.py
@@ -24,6 +24,7 @@ def set_fh_mpl_style(dark_mode: bool = False):
try:
import matplotlib as mpl
import matplotlib.pyplot as plt
+ from matplotlib import font_manager
except ImportError:
raise ImportError(
"Please `pip install matplotlib` to use set_fh_mpl_style."
@@ -31,8 +32,8 @@ def set_fh_mpl_style(dark_mode: bool = False):
_download_font()
- fe = mpl.font_manager.FontEntry(fname=str(FONT_PATH), name="sometype")
- mpl.font_manager.fontManager.ttflist.append(fe)
+ fe = font_manager.FontEntry(fname=str(FONT_PATH), name="sometype")
+ font_manager.fontManager.ttflist.append(fe)
if dark_mode:
mpl.rcParams.update(
{
@@ -41,7 +42,7 @@ def set_fh_mpl_style(dark_mode: bool = False):
"axes.edgecolor": "#FFFFFF", # White axes edges
"figure.facecolor": "#000000", # Black background for the figure
"axes.grid": False,
- "axes.prop_cycle": plt.cycler(color=COLOR_CYCLE),
+ "axes.prop_cycle": plt.cycler(color=COLOR_CYCLE), # type: ignore[attr-defined]
"font.family": fe.name,
"font.size": 14,
"figure.figsize": (
@@ -68,7 +69,7 @@ def set_fh_mpl_style(dark_mode: bool = False):
"axes.edgecolor": "#333333",
"figure.facecolor": "#FFFFFF",
"axes.grid": False,
- "axes.prop_cycle": plt.cycler(color=COLOR_CYCLE),
+ "axes.prop_cycle": plt.cycler(color=COLOR_CYCLE), # type: ignore[attr-defined]
"font.family": fe.name,
"font.size": 14,
"figure.figsize": (
@@ -90,8 +91,8 @@ def set_fh_plotly_style(dark_mode: bool = False):
NOTE: I haven't figured out how to set Courier Prime as the font.
"""
try:
- import plotly.graph_objects as go
- import plotly.io as pio
+ import plotly.graph_objects as go # type: ignore[import-not-found]
+ import plotly.io as pio # type: ignore[import-not-found]
except ImportError:
raise ImportError(
"Please `pip install plotly` to use set_fh_plotly_style."
diff --git a/bixbench/plotting_utils.py b/bixbench/plotting_utils.py
index 8c41954..864737a 100644
--- a/bixbench/plotting_utils.py
+++ b/bixbench/plotting_utils.py
@@ -1,7 +1,6 @@
# DISCLAIMER: This file is highly tailored to the BixBench paper requirements.
# It is not designed to be used as a general function for plotting model performance.
-from typing import Optional
import matplotlib.pyplot as plt
import numpy as np
@@ -21,8 +20,8 @@
def majority_vote_accuracy_by_k(
run_results: dict[str, tuple[list[int], list[float], list[float]]],
name: str = "",
- random_baselines: Optional[list[float]] = None,
- random_baselines_labels: Optional[list[str]] = None,
+ random_baselines: list[float] | None = None,
+ random_baselines_labels: list[str] | None = None,
results_dir: str = "bixbench_results",
) -> None:
"""
@@ -91,8 +90,8 @@ def plot_model_comparison(
baselines: dict[str, float],
run_groups: list[list[str]],
color_groups: list[str],
- group_titles: Optional[list[str]] = None,
- random_baselines: Optional[list[float]] = None,
+ group_titles: list[str] | None = None,
+ random_baselines: list[float] | None = None,
results_dir: str = "bixbench_results",
) -> None:
"""
@@ -148,7 +147,7 @@ def draw_baselines(
baselines: dict[str, float],
run_groups: list[list[str]],
bar_width: float,
- random_baselines: Optional[list[float]] = None,
+ random_baselines: list[float] | None = None,
) -> None:
"""
Draw baseline lines on the plot for performance comparison.
@@ -185,12 +184,12 @@ def draw_baselines(
color=baseline_color,
linestyle=baseline_bar,
linewidth=line_width,
- label="baseline" if c == 0 else None,
+ label="baseline" if c == 0 else "",
)
# Draw random guess baselines
random_label_used = False
- for c, baseline in enumerate(random_baselines):
+ for c, baseline in enumerate(random_baselines or []):
if baseline is None:
continue
plt.hlines(
@@ -200,7 +199,7 @@ def draw_baselines(
color=random_color,
linestyle="--",
linewidth=line_width,
- label="random" if not random_label_used else None,
+ label="random" if not random_label_used else "",
)
random_label_used = True
@@ -256,7 +255,7 @@ def draw_model_bars(
def plot_simplified_comparison(
results: dict[str, dict[str, float]],
run_groups: list[list[str]],
- group_titles: Optional[list[str]] = None,
+ group_titles: list[str] | None = None,
has_mcq: bool = False,
results_dir: str = "bixbench_results",
) -> None:
diff --git a/bixbench/postprocessing.py b/bixbench/postprocessing.py
index 5654355..acf7638 100644
--- a/bixbench/postprocessing.py
+++ b/bixbench/postprocessing.py
@@ -46,7 +46,7 @@ def load_raw_data(path: str) -> pd.DataFrame:
"agent_answer": utils.load_answer,
"ideal_answer": utils.load_answer,
"mcq_options": ast.literal_eval,
- "mcq_question": ast.literal_eval,
+ "mcq_question": str,
"nb": utils.load_notebook,
"avoid_images": bool,
"actions": int,
@@ -83,19 +83,47 @@ async def process_trajectories(df: pd.DataFrame) -> pd.DataFrame:
eval_df = utils.create_eval_df(df)
eval_df = await utils.run_eval_loop(eval_df)
- # Create correct column for open ended questions
- eval_df.loc[eval_df.question_format == "open", "correct"] = eval_df.loc[
- eval_df.question_format == "open", "llm_answer"
- ].apply(lambda x: x == "1")
- # Extract XML from LLM MCQ answers
- eval_df.loc[eval_df.question_format == "mcq", "llm_answer"] = eval_df.loc[
- eval_df.question_format == "mcq", "llm_answer"
- ].apply(utils.xml_extract)
- # Compare LLM answers to ideal answers
- eval_df.loc[eval_df.question_format == "mcq", "correct"] = (
- eval_df.loc[eval_df.question_format == "mcq", "llm_answer"]
- == eval_df.loc[eval_df.question_format == "mcq", "correct_letter"]
- )
+ # Handle different evaluation modes
+ if "eval_mode" in eval_df.columns:
+ # Open answer evaluation
+ open_mask = eval_df["eval_mode"] == "open"
+ eval_df.loc[open_mask, "correct"] = eval_df.loc[open_mask, "llm_answer"].apply(
+ lambda x: x == "1"
+ )
+
+ # MCQ evaluations (both with and without refusal)
+ mcq_mask = eval_df["eval_mode"].isin(
+ ["mcq", "mcq_with_refusal", "mcq_without_refusal"]
+ )
+ eval_df.loc[mcq_mask, "llm_answer"] = eval_df.loc[mcq_mask, "llm_answer"].apply(
+ utils.xml_extract
+ )
+
+ # Compare MCQ answers to ideal answers
+ if "correct_letter" in eval_df.columns and mcq_mask.any():
+ eval_df.loc[mcq_mask, "correct"] = (
+ eval_df.loc[mcq_mask, "llm_answer"]
+ == eval_df.loc[mcq_mask, "correct_letter"]
+ )
+ else:
+ # Fallback to original logic if eval_mode not present
+ # Create correct column for open ended questions
+ eval_df.loc[eval_df.question_format == "open", "correct"] = eval_df.loc[
+ eval_df.question_format == "open", "llm_answer"
+ ].apply(lambda x: x == "1")
+ # Extract XML from LLM MCQ answers
+ eval_df.loc[eval_df.question_format == "mcq", "llm_answer"] = eval_df.loc[
+ eval_df.question_format == "mcq", "llm_answer"
+ ].apply(utils.xml_extract)
+ # Compare LLM answers to ideal answers (only if MCQ questions exist)
+ if (
+ "correct_letter" in eval_df.columns
+ and (eval_df.question_format == "mcq").any()
+ ):
+ eval_df.loc[eval_df.question_format == "mcq", "correct"] = (
+ eval_df.loc[eval_df.question_format == "mcq", "llm_answer"]
+ == eval_df.loc[eval_df.question_format == "mcq", "correct_letter"]
+ )
return eval_df
@@ -120,7 +148,12 @@ async def run_majority_vote(
tuples of (k_values, mean accuracies, standard deviations)
"""
# Only run majority vote on mcq questions
- maj_vote_df = eval_df[eval_df.question_format == "mcq"].copy()
+ # Check for eval_mode column first, then fallback to question_format
+ if "eval_mode" in eval_df.columns:
+ mcq_modes = ["mcq", "mcq_with_refusal", "mcq_without_refusal"]
+ maj_vote_df = eval_df[eval_df["eval_mode"].isin(mcq_modes)].copy()
+ else:
+ maj_vote_df = eval_df[eval_df.question_format == "mcq"].copy()
if maj_vote_df.empty:
print("No MCQ questions found, skipping majority vote")
@@ -135,13 +168,13 @@ async def run_majority_vote(
for run_name in maj_vote_df.run_name.unique():
grouped_df = maj_vote_df[maj_vote_df.run_name == run_name].copy()
- grouped_df = grouped_df.groupby("uuid").agg(list)
+ grouped_df = grouped_df.groupby("problem_id").agg(list)
grouped_df["correct_letter"] = grouped_df["correct_letter"].apply(
operator.itemgetter(0)
)
grouped_df = grouped_df.dropna()
k_values, means, stds = utils.run_majority_voting(
- grouped_df, range(1, k_value), k_value
+ grouped_df, list(range(1, k_value)), k_value
)
run_results[run_name] = (k_values, means, stds)
@@ -250,7 +283,7 @@ async def compare_runs(
return results
-async def load_or_process_data(config) -> pd.DataFrame:
+async def load_or_process_data(config: PostprocessingConfig) -> pd.DataFrame:
"""
Load data from files or process trajectories based on configuration.
@@ -264,24 +297,21 @@ async def load_or_process_data(config) -> pd.DataFrame:
data_path = config.data_path
replicate_paper_results = config.replicate_paper_results
- # Case 1: Replicating paper results from trajectories
- if replicate_paper_results.run and replicate_paper_results.from_trajectories:
- trajectory_path = f"{results_dir}/raw_trajectory_data.csv"
- if not os.path.exists(trajectory_path):
- raise FileNotFoundError(
- f"raw_trajectory_data.csv not found in {results_dir}, "
- "please follow the readme to download the raw trajectory data"
- )
- data = load_raw_data(trajectory_path)
- return await process_trajectories(data)
-
- # Case 2: Replicating paper results from pre-computed eval_df
- if replicate_paper_results.run and (not replicate_paper_results.from_trajectories):
- eval_df_path = f"{results_dir}/eval_df.csv"
+ if replicate_paper_results.run:
+ if replicate_paper_results.from_trajectories:
+ trajectory_path = f"{results_dir}/raw_trajectory_data.csv"
+ if not os.path.exists(trajectory_path):
+ raise FileNotFoundError(
+ f"raw_trajectory_data.csv not found in {results_dir}, "
+ "please follow the readme to download the raw trajectory data"
+ )
+ data = load_raw_data(trajectory_path)
+ return await process_trajectories(data)
+
+ eval_df_path = config.eval_df_path
if not os.path.exists(eval_df_path):
raise FileNotFoundError(
- f"eval_df.csv not found in {results_dir}, "
- "please follow the readme to download the eval_df.csv"
+ f"eval_df.csv not found in {results_dir}, please follow the readme to download the eval_df.csv"
)
eval_df = pd.read_csv(eval_df_path)
eval_df["correct"] = eval_df["correct"].astype(bool)
@@ -315,7 +345,7 @@ async def main(config_path: str):
# Save intermediary processed data for debugging
if config.debug | (config.replicate_paper_results.from_trajectories):
- eval_df.to_csv(f"{results_dir}/eval_df_new.csv", index=False)
+ eval_df.to_csv(config.eval_df_path, index=False)
# Run majority vote if configured
if config.majority_vote.run:
@@ -335,7 +365,7 @@ async def main(config_path: str):
# Parse command line arguments
parser = argparse.ArgumentParser(description="Process BixBench evaluation data")
parser.add_argument(
- "config_file", type=str, help="Path to the YAML configuration file"
+ "--config_file", type=str, help="Path to the YAML configuration file"
)
args = parser.parse_args()
diff --git a/bixbench/postprocessing_utils.py b/bixbench/postprocessing_utils.py
index d197196..893b4f5 100644
--- a/bixbench/postprocessing_utils.py
+++ b/bixbench/postprocessing_utils.py
@@ -6,7 +6,7 @@
import re
from asyncio import Semaphore
from pathlib import Path
-from typing import Any, Optional
+from typing import Any
import litellm
import nbformat
@@ -16,16 +16,18 @@
from bixbench import prompts
-litellm.set_verbose = False
-
def load_dataframe_from_json_directory(path: str) -> pd.DataFrame:
"""Load a dataframe from a json directory."""
- data = []
+ all_data = []
for file in list(Path(path).glob("**/*.json")):
+ replica_match = re.search(r"replica_(\d+)", file.name)
+ replica = int(replica_match[1]) if replica_match else 0  # default when untagged
with open(file, encoding="utf-8") as f:
- data.append(json.load(f))
- return pd.DataFrame(data)
+ data = json.load(f)
+ data["replica"] = replica
+ all_data.append(data)
+ return pd.DataFrame(all_data)
def flatten_list(nested_list: list[list[Any]]) -> list[Any]:
@@ -83,7 +85,7 @@ async def process_model_batch(
async def run_eval_loop(
- eval_df: pd.DataFrame, max_concurrent: int = 100
+ eval_df: pd.DataFrame, max_concurrent: int = 10
) -> pd.DataFrame:
"""Process evaluation dataframe with multiple LLM models concurrently.
@@ -120,7 +122,7 @@ async def run_eval_loop(
return eval_df
-async def process_single(prompt: str, model: str, sem: Semaphore) -> Optional[str]:
+async def process_single(prompt: str, model: str, sem: Semaphore) -> str | None:
"""Process a single prompt with a language model with retry logic.
Makes up to 5 attempts to get a response from the model.
@@ -153,7 +155,7 @@ async def process_single(prompt: str, model: str, sem: Semaphore) -> Optional[st
async def process_with_progress(
prompt: str, model: str, sem: Semaphore, pbar: tqdm
-) -> Optional[str]:
+) -> str | None:
"""Process a single prompt and update progress bar.
Callback function that processes a prompt and ensures the progress bar
@@ -176,7 +178,7 @@ async def process_with_progress(
async def process_batch(
prompts: list[str], model: str, max_concurrent: int = 5
-) -> list[Optional[str]]:
+) -> list[str | None]:
"""Process a batch of prompts concurrently with rate limiting and progress tracking.
Args:
@@ -218,7 +220,7 @@ def encode_image_to_base64(image: str) -> str:
return base64.b64encode(decoded_image).decode("utf-8")
-def load_notebook(notebook: str | dict[str, Any]) -> dict[str, Any]:
+def load_notebook(notebook: str | dict[str, Any]):
"""Parse a notebook into nbformat.
Attempts to parse a notebook into a dictionary format using nbformat.
@@ -234,7 +236,7 @@ def load_notebook(notebook: str | dict[str, Any]) -> dict[str, Any]:
return nbformat.from_dict(notebook)
-def load_answer(answer: str | dict[str, Any]) -> dict[str, Any]:
+def load_answer(answer: str | dict[str, Any]) -> str | dict[str, Any]:
"""Parse an answer into a dictionary format.
Attempts multiple parsing methods: direct dict access, ast.literal_eval,
@@ -258,105 +260,123 @@ def load_answer(answer: str | dict[str, Any]) -> dict[str, Any]:
# Fallback to json loads
return json.loads(answer)
except (ValueError, TypeError, json.JSONDecodeError):
- # Return empty dict if parsing fails
- return {}
+ # Return answer if parsing fails (for when answer is a string)
+ return answer
+
+
+def expand_open_answer_to_mcq(row: pd.Series, include_refusal: bool) -> pd.Series:
+ """Expand an open answer row into an MCQ evaluation row.
+
+ Args:
+ row: DataFrame row containing open answer trajectory data
+ include_refusal: Whether to include refusal option in MCQ
+ Returns:
+ Series representing an MCQ evaluation row
+ """
+ # Create a copy of the row for MCQ evaluation
+ mcq_row = row.copy()
+
+ # Set question format to MCQ
+ mcq_row["question_format"] = "mcq"
+
+ formatted_q, correct_letter, refusal_letter = questions_to_mcq(
+ mcq_row.question,
+ options=[
+ mcq_row.ideal_answer,
+ *mcq_row.metadata["distractors"],
+ ],
+ refusal_option=include_refusal,
+ )
-def create_eval_df(data: list[dict[str, Any]]) -> pd.DataFrame:
+ mcq_row["formatted_question"] = formatted_q
+ mcq_row["correct_letter"] = correct_letter
+ mcq_row["refusal_letter"] = refusal_letter
+
+ # Set evaluation mode
+ mcq_row["eval_mode"] = (
+ "mcq_with_refusal" if include_refusal else "mcq_without_refusal"
+ )
+ mcq_row["run_name"] = mcq_row["run_name"] + "_" + mcq_row["eval_mode"]
+
+ return mcq_row
+
+
+def create_eval_df(data: list[dict[str, Any]], expand_mcq: bool = True) -> pd.DataFrame:
"""Creates a dataframe for evaluation with one row per question.
- Uses vectorized operations for better performance.
+ For open answer trajectories, expands them into additional MCQ evaluation rows.
Args:
data: List of dictionaries containing problem data
+ expand_mcq: Whether to also expand each open-answer row into MCQ rows (with and without refusal)
Returns:
- DataFrame with one row per question, including formatted questions and prompts
+ DataFrame with evaluation rows, including open answer and expanded MCQ rows
"""
- # First, apply load_answer to all relevant columns at once
- evaluation_data = data.copy()
-
- # Handle list type agent answers
- for col in ["agent_answer", "mcq_question", "mcq_options"]:
- mask = evaluation_data[col].apply(lambda x: isinstance(x, list))
- evaluation_data.loc[mask, col] = evaluation_data.loc[mask, col].apply(
- lambda x: {f"q{i + 1}": v for i, v in enumerate(x)}
- )
-
- # Filter out rows without agent answers
+ # Convert to DataFrame and filter out rows without agent answers
+ evaluation_data = pd.DataFrame(data)
evaluation_data = evaluation_data[evaluation_data["agent_answer"].apply(bool)]
- # Now prepare for explosion
- # Create a column with question numbers from ideal_answer keys
- evaluation_data["question_keys"] = evaluation_data["ideal_answer"].apply(
- lambda x: list(x.keys())
+ # Extract single question data from lists where needed
+ # mcq_question is stored as a list with single question, extract it
+ evaluation_data["question"] = evaluation_data["mcq_question"].apply(
+ lambda x: x[0] if isinstance(x, list) and len(x) > 0 else x
)
- # Explode the dataframe to create one row per question
- exploded = evaluation_data.explode("question_keys")
-
- # Now create the final dataframe in a vectorized way
- result = pd.DataFrame(
- {
- "uuid": exploded["problem_id"]
- + "_"
- + exploded["question_keys"].astype(str),
- "problem_id": exploded["problem_id"],
- "question": exploded.apply(
- lambda row: row["mcq_question"].get(row["question_keys"], None), axis=1
- ),
- "question_num": exploded["question_keys"],
- "agent_answer": exploded.apply(
- lambda row: row["agent_answer"].get(row["question_keys"], None), axis=1
- ),
- "ideal_answer": exploded.apply(
- lambda row: row["ideal_answer"].get(row["question_keys"], None), axis=1
- ),
- "run_name": exploded["run_name"],
- "md_notebook": exploded["md_notebook"],
- "md_images": exploded["md_images"],
- "mcq_options": exploded.apply(
- lambda row: row["mcq_options"].get(row["question_keys"], None), axis=1
- ),
- "refusal_option": exploded.get("refusal_option", None),
- "question_format": exploded.get("question_format", None),
- "model": exploded.get("model", None),
- }
+
+ # mcq_options is stored as list of lists, extract the single list
+ evaluation_data["mcq_options"] = evaluation_data["mcq_options"].apply(
+ lambda x: (
+ x[0] if isinstance(x, list) and len(x) > 0 and isinstance(x[0], list) else x
+ )
+ )
+
+ # agent_answer might be dict with single key, extract the value
+ evaluation_data["agent_answer"] = evaluation_data["agent_answer"].apply(
+ lambda x: next(iter(x.values())) if isinstance(x, dict) and len(x) == 1 else x
)
+ result = evaluation_data
+
# Drop rows with no question or no format
result = result.dropna(
subset=["question", "question_format"], how="any"
).reset_index(drop=True)
- # Drop MCQ questions with any NaN values
- mcq_mask = result["question_format"] == "mcq"
- result = result[~(mcq_mask & result.isna().any(axis=1))]
-
- # Apply MCQ formatting only to MCQ questions
- mcq_rows = result[result["question_format"] == "mcq"].index
- if len(mcq_rows) > 0:
- result.loc[
- mcq_rows, ["formatted_question", "correct_letter", "refusal_letter"]
- ] = result.loc[mcq_rows].apply(
- lambda row: pd.Series(
- questions_to_mcq(
- row["question"],
- row["mcq_options"],
- refusal_option=row["refusal_option"],
- ),
- index=["formatted_question", "correct_letter", "refusal_letter"],
- ),
- axis=1,
- )
- result["prompt"] = result.apply(create_prompt, axis=1)
- result["content"] = result.apply(create_llm_message_content, axis=1)
+ # Process open answer questions - expand to MCQ formats
+ open_rows = result[result["question_format"] == "open"]
+ expanded_rows = []
+
+ for _, row in open_rows.iterrows():
+ # Keep original open answer row
+ open_row = row.copy()
+ open_row["eval_mode"] = "open"
+ assert open_row["question_format"] == "open"
- return result
+ open_row["run_name"] = open_row["run_name"] + "_" + open_row["eval_mode"]
+
+ expanded_rows.append(open_row)
+
+ if expand_mcq:
+ # Create MCQ with refusal option
+ mcq_with_refusal = expand_open_answer_to_mcq(row, include_refusal=True)
+ expanded_rows.append(mcq_with_refusal)
+
+ # Create MCQ without refusal option
+ mcq_without_refusal = expand_open_answer_to_mcq(row, include_refusal=False)
+ expanded_rows.append(mcq_without_refusal)
+
+ # Combine all rows
+ eval_df = pd.DataFrame(expanded_rows) if expanded_rows else pd.DataFrame()
+
+ eval_df["prompt"] = eval_df.apply(create_prompt, axis=1)
+ eval_df["content"] = eval_df.apply(create_llm_message_content, axis=1)
+ return eval_df
def questions_to_mcq(
question: str, options: list[str | dict[str, Any]], refusal_option: bool = True
-) -> tuple[str, str, Optional[str]]:
+) -> tuple[str, str, str | None]:
"""Format a question and options into an MCQ format.
Creates a formatted multiple-choice question with lettered options,
@@ -412,7 +432,7 @@ def create_llm_message_content(row: pd.Series) -> list[dict[str, Any]]:
"""
content = [{"type": "text", "text": row.prompt}]
- if row.md_images:
+ if hasattr(row, "md_images") and row.md_images:
for img_data in row.md_images:
try:
content.append(
@@ -470,12 +490,10 @@ def xml_extract(text: str) -> str:
The extracted answer letter or 'Z' if no match is found
"""
match = re.search(r"([A-Z])", text)
- if match:
- return match.group(1)
- return "Z"
+ return match[1] if match else "Z"
-def majority_vote(row: pd.Series, k: int = 10) -> Optional[str]:
+def majority_vote(row: pd.Series, k: int = 10) -> str | None:
"""Apply majority voting to a series of predictions.
Randomly samples k predictions from the input and returns the most common value.
@@ -500,9 +518,7 @@ def majority_vote(row: pd.Series, k: int = 10) -> Optional[str]:
return None
unique_values, counts = np.unique(sampled_votes, return_counts=True)
- if unique_values.size == 0:
- return None
- return unique_values[np.argmax(counts)]
+ return None if unique_values.size == 0 else unique_values[np.argmax(counts)]
def run_majority_voting(
@@ -569,7 +585,7 @@ def wilson_ci(p: float, n: int, z: float = 1.96) -> tuple[float, float]:
def calculate_results(
- df: pd.DataFrame, total_questions_per_run: Optional[int] = None
+ df: pd.DataFrame, total_questions_per_run: int | None = None
) -> dict[str, dict[str, float]]:
"""
Calculate means and confidence intervals for each model and format.
diff --git a/bixbench/prompts.py b/bixbench/prompts.py
index 31f899e..f06df9f 100644
--- a/bixbench/prompts.py
+++ b/bixbench/prompts.py
@@ -31,6 +31,15 @@
Example Output: correct
""" # noqa: E501
+OPEN_ENDED_RANGE_GRADING_PROMPT = """You are given a question, target range using the format (lower,upper) and a predicted answer. Your task is to compare the target range with the predicted and assess if the predicted answer falls within the specified range. If it falls within the range, it is correct, otherwise it is incorrect. If the predicted answer cannot be compared to the target range, it is refused to answer.
+Question: {question}
+Target Range: {target}
+Predicted Answer: {predicted}
+
+Important: You must only output one from `correct`, `incorrect` or `refused` between tags.
+Example Output: correct
+""" # noqa: E501
+
MCQ_EVAL_PROMPT = """
First, carefully examine the following notebook:
diff --git a/bixbench/utils.py b/bixbench/utils.py
index 750da04..f71b4a2 100644
--- a/bixbench/utils.py
+++ b/bixbench/utils.py
@@ -1,8 +1,10 @@
+import asyncio
import random
import string
import uuid
+from collections.abc import AsyncIterator, Awaitable, Iterable
from enum import StrEnum, auto
-from typing import Optional
+from typing import TypeVar
from pydantic import BaseModel, ConfigDict
@@ -21,13 +23,13 @@ def model_dump(self, **kwargs) -> dict:
class Query(BaseModel):
model_config = ConfigDict(extra="ignore", arbitrary_types_allowed=True)
- id: uuid.UUID
+ id: uuid.UUID | str
question: str
target: str
choices: list[str] | None = None
predicted: str | None = None
unsure: str | None = None
- evaluation_mode: Optional[str] = None
+ evaluation_mode: str | None = None
class LLMConfig(BaseModel):
@@ -48,7 +50,7 @@ def parse_response(
def randomize_choices(
ideal: str, distractors: list[str], with_refusal: bool = True
-) -> tuple[list[str], str, str]:
+) -> tuple[list[str], str, str | None]:
REFUSE_CHOICE = "Insufficient information to answer the question"
ALPHABET = string.ascii_uppercase
choices = (
@@ -83,7 +85,7 @@ def compute_metrics(grades: list[bool], is_refused: list[bool]) -> dict:
n_total = len(grades)
n_correct = sum(grades)
- n_unsure = sum(1 for x in is_refused if x)
+ n_unsure = sum(bool(x) for x in is_refused)
n_sure = n_total - n_unsure
# Calculate metrics
accuracy = n_correct / n_total if n_total > 0 else 0
@@ -98,3 +100,37 @@ def compute_metrics(grades: list[bool], is_refused: list[bool]) -> dict:
"n_correct": n_correct,
"n_sure": n_sure,
}
+
+
+T = TypeVar("T")
+
+
+async def as_completed_with_concurrency(
+ coros: Iterable[Awaitable[T]],
+ max_concurrent: int | asyncio.Semaphore = 5,
+ timeout: float = 600.0,
+) -> AsyncIterator[T]:
+ """Run awaitables concurrently, yielding each result as soon as it is ready.
+
+ Args:
+ coros: Awaitables to run
+ max_concurrent: Concurrency limit, or an existing semaphore to share
+ timeout: Overall timeout in seconds, passed to asyncio.as_completed
+
+ Yields:
+ The result of each awaitable, in completion order
+ """
+ sem = (
+ asyncio.Semaphore(max_concurrent)
+ if isinstance(max_concurrent, int)
+ else max_concurrent
+ )
+
+ async def sem_coro(coro: Awaitable[T]) -> T:
+ async with sem:
+ return await coro
+
+ # Plain iteration + await works on 3.11/3.12; `async for` needs Python 3.13+.
+ tasks = [asyncio.ensure_future(sem_coro(coro)) for coro in coros]
+ for next_done in asyncio.as_completed(tasks, timeout=timeout):
+ yield await next_done
diff --git a/bixbench/zero_shot.py b/bixbench/zero_shot.py
index a570de3..6256c90 100644
--- a/bixbench/zero_shot.py
+++ b/bixbench/zero_shot.py
@@ -1,5 +1,5 @@
from functools import cached_property
-from typing import Any, Optional
+from typing import Any, Self
from aviary.core import Message
from lmi import LiteLLMModel
@@ -21,15 +21,15 @@ class ZeroshotBaseline(BaseModel):
temperature: float = Field(default=1.0, ge=0.0, le=2.0)
extra_kwargs: dict[str, Any] = Field(default_factory=dict)
- _llm_client: Optional[LiteLLMModel] = None
- _query: Optional[Query] = None
+ _llm_client: LiteLLMModel | None = None
+ _query: Query | None = None
class Config:
arbitrary_types_allowed = True
extra = "allow"
@model_validator(mode="after")
- def initialize_llm_client(self) -> "ZeroshotBaseline":
+ def initialize_llm_client(self) -> Self:
"""Initialize the LLM client after model creation."""
config = {
"name": self.model_name,
@@ -71,7 +71,7 @@ def prompt_template(self) -> str:
return OPEN_ENDED_PROMPT_TEMPLATE
raise ValueError(f"Unknown answer mode: {self.answer_mode}")
- def _prep_query(self) -> tuple[str, Any, Optional[Any]]:
+ def _prep_query(self) -> tuple[str, Any, Any | None]:
"""Generate query based on evaluation mode and parameters."""
template = self.prompt_template
diff --git a/generate_zeroshot_evals.py b/generate_zeroshot_evals.py
index 3a07fbb..76b4d20 100644
--- a/generate_zeroshot_evals.py
+++ b/generate_zeroshot_evals.py
@@ -1,9 +1,9 @@
import argparse
-import ast
import asyncio
import logging
import os
from pathlib import Path
+from typing import cast
import pandas as pd
from datasets import load_dataset
@@ -69,7 +69,7 @@ def parse_args():
# used for testing purposes
parser.add_argument(
- "--num_examples",
+ "--num-examples",
type=int,
default=-1,
help="Number of examples to evaluate. Default is -1 for all examples",
@@ -82,6 +82,11 @@ def parse_args():
parser.add_argument(
"--output-file", type=str, default=None, help="Output file name (optional)"
)
+ parser.add_argument(
+ "--dataset-split",
+ default="train",
+ help="Dataset split to evaluate. Default is 'train'",
+ )
return parser.parse_args()
@@ -91,27 +96,25 @@ async def evaluate(
output_dir: str = "results",
output_file: str | None = None,
):
-
results = []
for _, row in dataset.iterrows():
- for q_dict in row["questions"]:
- query = await zeroshot_agent.generate_zeroshot_answers(
- Query(
- id=row["uuid"],
- question=q_dict["question"],
- target=q_dict["ideal_answer"],
- choices=[q_dict[f"distractor_{j}"] for j in range(1, 4)],
- evaluation_mode=q_dict.get("eval_method", None),
- )
+ query = await zeroshot_agent.generate_zeroshot_answers(
+ Query(
+ id=row["question_id"],
+ question=row["question"],
+ target=row["ideal"],
+ choices=row["distractors"],
+ evaluation_mode=row.get("eval_mode", None),
)
+ )
- result_dict = {
- "uuid": query.id,
- "question": query.question,
- "predicted": query.predicted,
- "target": query.target,
- "unsure": query.unsure,
- }
+ result_dict = {
+ "uuid": query.id,
+ "question": query.question,
+ "predicted": query.predicted,
+ "target": query.target,
+ "unsure": query.unsure,
+ }
if query.evaluation_mode is not None:
result_dict["evaluation_mode"] = query.evaluation_mode
@@ -122,6 +125,7 @@ async def evaluate(
if not os.path.exists(output_dir):
os.makedirs(output_dir)
+ assert output_file is not None
output_path = os.path.join(output_dir, output_file)
pd.DataFrame(results).to_csv(output_path, index=False)
@@ -136,12 +140,10 @@ async def main():
)
if args.local_csv is None:
_hf_login()
- dataset = load_dataset(HF_URL)["train"].to_pandas()
- dataset["questions"] = dataset["questions"].apply(ast.literal_eval)
+ dataset = load_dataset(HF_URL)[args.dataset_split].to_pandas() # type: ignore[index]
else:
- dataset = pd.read_csv(
- args.local_csv, converters={"questions": ast.literal_eval}
- )
+ dataset = pd.read_csv(args.local_csv)
+ dataset = cast("pd.DataFrame", dataset)
if args.num_examples > 0:
dataset = dataset.head(args.num_examples)
From 6c0498af4ccf79c0794480367fbacf7ed8883e57 Mon Sep 17 00:00:00 2001
From: Alex Andonian
Date: Thu, 25 Sep 2025 03:44:34 +0000
Subject: [PATCH 02/16] Update dependencies in pyproject.toml for improved
functionality
- Consolidated the dependency list and sorted it alphabetically for clarity.
- Removed the duplicate `pandas` entry.
- No packages were added or dropped: `aiofiles`, `crow-client`, `datasets`, `huggingface-hub`, `ldp`, `matplotlib`, `numpy`, `scikit-learn`, `scipy`, `seaborn`, `statsmodels`, and `google-cloud-storage` only changed position in the list.
---
pyproject.toml | 36 ++++++++++++++++--------------------
1 file changed, 16 insertions(+), 20 deletions(-)
diff --git a/pyproject.toml b/pyproject.toml
index 5b7f5ed..211d0bf 100644
--- a/pyproject.toml
+++ b/pyproject.toml
@@ -1,9 +1,6 @@
[build-system]
build-backend = "setuptools.build_meta"
-requires = [
- "setuptools>=66",
- "wheel>=0.36"
-]
+requires = ["setuptools>=66", "wheel>=0.36"]
[project]
authors = [
@@ -11,31 +8,30 @@ authors = [
]
dependencies = [
"aiodocker",
+ "aiofiles",
+ "crow-client >= 0.3.4",
+ "datasets",
"fhaviary[server] >= 0.18.0",
"fhda @ git+https://github.com/Future-House/data-analysis-crow@v1.0.0",
- "ldp",
- "pandas",
- "numpy",
- "matplotlib",
- "scipy",
- "seaborn",
- "scikit-learn",
- "statsmodels",
- "aiofiles",
+ "fhlmi",
"google-auth",
- "google-cloud-storage",
"google-cloud-secret-manager",
- "crow-client >= 0.3.4",
+ "google-cloud-storage",
+ "huggingface-hub",
"jupyter",
+ "ldp",
+ "matplotlib",
"nbconvert",
- "notebook",
"nbformat",
- "fhlmi",
- "pydantic~=2.0",
- "datasets",
+ "notebook",
+ "numpy",
"pandas",
+ "pydantic~=2.0",
"python-dotenv",
- "huggingface-hub"
+ "scikit-learn",
+ "scipy",
+ "seaborn",
+ "statsmodels"
]
description = "BixBench"
name = "fhbixbench"
From e3af14c31a61c070f8be03f6eaa9eb949206963f Mon Sep 17 00:00:00 2001
From: Alex Andonian
Date: Thu, 25 Sep 2025 23:44:37 +0000
Subject: [PATCH 03/16] Add new configuration files for BixBench runs
- Introduced YAML configuration files for various agent setups: `4o_image`, `4o_no_image`, `claude_image`, and `claude_no_image`.
- Each configuration specifies agent parameters, rollout settings, notebook details, capsule prompt templates, and paths for data storage.
- Added a new configuration file `v2_paper_results.yaml` to facilitate comparison of results across different models and settings.
---
bixbench/run_configuration/4o_image.yaml | 34 ++++++++++++
bixbench/run_configuration/4o_no_image.yaml | 34 ++++++++++++
bixbench/run_configuration/claude_image.yaml | 34 ++++++++++++
.../run_configuration/claude_no_image.yaml | 34 ++++++++++++
.../run_configuration/v2_paper_results.yaml | 53 +++++++++++++++++++
5 files changed, 189 insertions(+)
create mode 100644 bixbench/run_configuration/4o_image.yaml
create mode 100644 bixbench/run_configuration/4o_no_image.yaml
create mode 100644 bixbench/run_configuration/claude_image.yaml
create mode 100644 bixbench/run_configuration/claude_no_image.yaml
create mode 100644 bixbench/run_configuration/v2_paper_results.yaml
diff --git a/bixbench/run_configuration/4o_image.yaml b/bixbench/run_configuration/4o_image.yaml
new file mode 100644
index 0000000..2f049d8
--- /dev/null
+++ b/bixbench/run_configuration/4o_image.yaml
@@ -0,0 +1,34 @@
+agent:
+ agent_type: SimpleAgent
+ agent_kwargs:
+ llm_model:
+ parallel_tool_calls: false
+ num_retries: 5
+ temperature: 1.0
+ name: gpt-4o
+ hide_old_env_states: true
+rollout:
+ max_steps: 20
+ batch_size: 24
+ rollout_type: aviary
+notebook:
+ name: notebook.ipynb
+ language: python
+ use_docker: true
+capsule:
+ prompt_templates:
+ mcq: MCQ_PROMPT_TEMPLATE
+ open: OPEN_PROMPT_TEMPLATE
+ hypothesis: HYPOTHESIS_PROMPT_TEMPLATE
+ eval_mode: null
+ mode: open
+ include_refusal_option: true
+ avoid_images: false
+ system_prompt: CAPSULE_SYSTEM_PROMPT_OPEN
+paths:
+ workspace_dir: data/workspace
+ trajectories_dir: bixbench-v2_results/trajectories
+ data_folder: data/capsules
+ hf_repo_id: futurehouse/BixBench
+dataset_split: train
+run_name: 4o_image
diff --git a/bixbench/run_configuration/4o_no_image.yaml b/bixbench/run_configuration/4o_no_image.yaml
new file mode 100644
index 0000000..71a8921
--- /dev/null
+++ b/bixbench/run_configuration/4o_no_image.yaml
@@ -0,0 +1,34 @@
+agent:
+ agent_type: SimpleAgent
+ agent_kwargs:
+ llm_model:
+ parallel_tool_calls: false
+ num_retries: 5
+ temperature: 1.0
+ name: gpt-4o
+ hide_old_env_states: true
+rollout:
+ max_steps: 20
+ batch_size: 24
+ rollout_type: aviary
+notebook:
+ name: notebook.ipynb
+ language: python
+ use_docker: true
+capsule:
+ prompt_templates:
+ mcq: MCQ_PROMPT_TEMPLATE
+ open: OPEN_PROMPT_TEMPLATE
+ hypothesis: HYPOTHESIS_PROMPT_TEMPLATE
+ eval_mode: null
+ mode: open
+ include_refusal_option: true
+ avoid_images: true
+ system_prompt: CAPSULE_SYSTEM_PROMPT_OPEN
+paths:
+ workspace_dir: data/workspace
+ trajectories_dir: bixbench-v2_results/trajectories
+ data_folder: data/capsules
+ hf_repo_id: futurehouse/BixBench
+dataset_split: train
+run_name: 4o_no_image
diff --git a/bixbench/run_configuration/claude_image.yaml b/bixbench/run_configuration/claude_image.yaml
new file mode 100644
index 0000000..8d4ec15
--- /dev/null
+++ b/bixbench/run_configuration/claude_image.yaml
@@ -0,0 +1,34 @@
+agent:
+ agent_type: SimpleAgent
+ agent_kwargs:
+ llm_model:
+ parallel_tool_calls: false
+ num_retries: 5
+ temperature: 1.0
+ name: anthropic/claude-3-5-sonnet-20241022
+ hide_old_env_states: true
+rollout:
+ max_steps: 20
+ batch_size: 12
+ rollout_type: aviary
+notebook:
+ name: notebook.ipynb
+ language: python
+ use_docker: true
+capsule:
+ prompt_templates:
+ mcq: MCQ_PROMPT_TEMPLATE
+ open: OPEN_PROMPT_TEMPLATE
+ hypothesis: HYPOTHESIS_PROMPT_TEMPLATE
+ eval_mode: null
+ mode: open
+ include_refusal_option: true
+ avoid_images: false
+ system_prompt: CAPSULE_SYSTEM_PROMPT_OPEN
+paths:
+ workspace_dir: data/workspace
+ trajectories_dir: bixbench-v2_results/trajectories
+ data_folder: data/capsules
+ hf_repo_id: futurehouse/BixBench
+dataset_split: train
+run_name: claude_image
diff --git a/bixbench/run_configuration/claude_no_image.yaml b/bixbench/run_configuration/claude_no_image.yaml
new file mode 100644
index 0000000..116d8a1
--- /dev/null
+++ b/bixbench/run_configuration/claude_no_image.yaml
@@ -0,0 +1,34 @@
+agent:
+ agent_type: SimpleAgent
+ agent_kwargs:
+ llm_model:
+ parallel_tool_calls: false
+ num_retries: 5
+ temperature: 1.0
+ name: anthropic/claude-3-5-sonnet-20241022
+ hide_old_env_states: true
+rollout:
+ max_steps: 20
+ batch_size: 24
+ rollout_type: aviary
+notebook:
+ name: notebook.ipynb
+ language: python
+ use_docker: true
+capsule:
+ prompt_templates:
+ mcq: MCQ_PROMPT_TEMPLATE
+ open: OPEN_PROMPT_TEMPLATE
+ hypothesis: HYPOTHESIS_PROMPT_TEMPLATE
+ eval_mode: null
+ mode: open
+ include_refusal_option: true
+ avoid_images: true
+ system_prompt: CAPSULE_SYSTEM_PROMPT_OPEN
+paths:
+ workspace_dir: data/workspace
+ trajectories_dir: bixbench-v2_results/trajectories
+ data_folder: data/capsules
+ hf_repo_id: futurehouse/BixBench
+dataset_split: train
+run_name: claude_no_image
diff --git a/bixbench/run_configuration/v2_paper_results.yaml b/bixbench/run_configuration/v2_paper_results.yaml
new file mode 100644
index 0000000..7a042e0
--- /dev/null
+++ b/bixbench/run_configuration/v2_paper_results.yaml
@@ -0,0 +1,53 @@
+data_path: "bixbench-v2_results/trajectories/"
+results_dir: "bixbench-v2_results"
+debug: true
+
+replicate_paper_results:
+ run: true
+ # from_trajectories: true # Process from trajectories, not pre-computed eval_df
+ from_trajectories: false # Process from trajectories, not pre-computed eval_df
+
+majority_vote:
+ run: true
+ k_value: 10
+ groups:
+ image_comparison:
+ - "claude_image_mcq_with_refusal"
+ - "4o_image_mcq_with_refusal"
+ - "claude_no_image_mcq_with_refusal"
+ - "4o_no_image_mcq_with_refusal"
+ refusal_option_comparison:
+ - "claude_image_mcq_without_refusal"
+ - "4o_image_mcq_without_refusal"
+ - "claude_image_mcq_with_refusal"
+ - "4o_image_mcq_with_refusal"
+
+run_comparison:
+ run: true
+ # Adjust this based on actual number of questions in the new dataset
+ # Original was 2960 (296 questions x 10 iterations)
+ # You may need to update this based on your dataset size
+ total_questions_per_run: null
+ run_name_groups:
+ - ["4o_image_open", "claude_image_open"]
+ - ["4o_image_mcq_with_refusal", "claude_image_mcq_with_refusal"]
+ - ["4o_image_mcq_without_refusal", "claude_image_mcq_without_refusal"]
+ group_titles:
+ - "Open-answer"
+ - "MCQ w/ refusal"
+ - "MCQ w/o refusal"
+ color_groups:
+ - "4o"
+ - "claude"
+ use_zero_shot_baselines: true
+ random_baselines:
+ - null # For open-ended (no random baseline)
+ - 0.2 # For MCQ with refusal (1/5 chance)
+ - 0.25 # For MCQ without refusal (1/4 chance)
+ baseline_name_mappings:
+ "gpt-4o-grader-openended": "4o_image_open"
+ "claude-3-5-sonnet-latest-grader-openended": "claude_image_open"
+ "gpt-4o-grader-mcq-refusal-True": "4o_image_mcq_with_refusal"
+ "claude-3-5-sonnet-latest-grader-mcq-refusal-True": "claude_image_mcq_with_refusal"
+ "gpt-4o-grader-mcq-refusal-False": "4o_image_mcq_without_refusal"
+ "claude-3-5-sonnet-latest-grader-mcq-refusal-False": "claude_image_mcq_without_refusal"
\ No newline at end of file
From 0fdd93af4f4ef5601a92394469c7487f7a67cd92 Mon Sep 17 00:00:00 2001
From: Alex Andonian
Date: Fri, 26 Sep 2025 05:13:34 +0000
Subject: [PATCH 04/16] Update trajectory directory paths in configuration
files and add new results configuration
- Changed the `trajectories_dir` path from `bixbench-v2_results/trajectories` to `bixbench-v1.5_results/trajectories` in the existing configuration files: `4o_image.yaml`, `4o_no_image.yaml`, `claude_image.yaml`, and `claude_no_image.yaml`.
- Introduced a new configuration file `v1.5_paper_results.yaml` to facilitate the comparison of results for the BixBench project.
---
bixbench/run_configuration/4o_image.yaml | 2 +-
bixbench/run_configuration/4o_no_image.yaml | 2 +-
bixbench/run_configuration/claude_image.yaml | 2 +-
bixbench/run_configuration/claude_no_image.yaml | 2 +-
.../{v2_paper_results.yaml => v1.5_paper_results.yaml} | 6 +++---
5 files changed, 7 insertions(+), 7 deletions(-)
rename bixbench/run_configuration/{v2_paper_results.yaml => v1.5_paper_results.yaml} (94%)
diff --git a/bixbench/run_configuration/4o_image.yaml b/bixbench/run_configuration/4o_image.yaml
index 2f049d8..a07fb6a 100644
--- a/bixbench/run_configuration/4o_image.yaml
+++ b/bixbench/run_configuration/4o_image.yaml
@@ -27,7 +27,7 @@ capsule:
system_prompt: CAPSULE_SYSTEM_PROMPT_OPEN
paths:
workspace_dir: data/workspace
- trajectories_dir: bixbench-v2_results/trajectories
+ trajectories_dir: bixbench-v1.5_results/trajectories
data_folder: data/capsules
hf_repo_id: futurehouse/BixBench
dataset_split: train
diff --git a/bixbench/run_configuration/4o_no_image.yaml b/bixbench/run_configuration/4o_no_image.yaml
index 71a8921..59fa595 100644
--- a/bixbench/run_configuration/4o_no_image.yaml
+++ b/bixbench/run_configuration/4o_no_image.yaml
@@ -27,7 +27,7 @@ capsule:
system_prompt: CAPSULE_SYSTEM_PROMPT_OPEN
paths:
workspace_dir: data/workspace
- trajectories_dir: bixbench-v2_results/trajectories
+ trajectories_dir: bixbench-v1.5_results/trajectories
data_folder: data/capsules
hf_repo_id: futurehouse/BixBench
dataset_split: train
diff --git a/bixbench/run_configuration/claude_image.yaml b/bixbench/run_configuration/claude_image.yaml
index 8d4ec15..6d4d0f1 100644
--- a/bixbench/run_configuration/claude_image.yaml
+++ b/bixbench/run_configuration/claude_image.yaml
@@ -27,7 +27,7 @@ capsule:
system_prompt: CAPSULE_SYSTEM_PROMPT_OPEN
paths:
workspace_dir: data/workspace
- trajectories_dir: bixbench-v2_results/trajectories
+ trajectories_dir: bixbench-v1.5_results/trajectories
data_folder: data/capsules
hf_repo_id: futurehouse/BixBench
dataset_split: train
diff --git a/bixbench/run_configuration/claude_no_image.yaml b/bixbench/run_configuration/claude_no_image.yaml
index 116d8a1..7aa6949 100644
--- a/bixbench/run_configuration/claude_no_image.yaml
+++ b/bixbench/run_configuration/claude_no_image.yaml
@@ -27,7 +27,7 @@ capsule:
system_prompt: CAPSULE_SYSTEM_PROMPT_OPEN
paths:
workspace_dir: data/workspace
- trajectories_dir: bixbench-v2_results/trajectories
+ trajectories_dir: bixbench-v1.5_results/trajectories
data_folder: data/capsules
hf_repo_id: futurehouse/BixBench
dataset_split: train
diff --git a/bixbench/run_configuration/v2_paper_results.yaml b/bixbench/run_configuration/v1.5_paper_results.yaml
similarity index 94%
rename from bixbench/run_configuration/v2_paper_results.yaml
rename to bixbench/run_configuration/v1.5_paper_results.yaml
index 7a042e0..3041121 100644
--- a/bixbench/run_configuration/v2_paper_results.yaml
+++ b/bixbench/run_configuration/v1.5_paper_results.yaml
@@ -1,9 +1,9 @@
-data_path: "bixbench-v2_results/trajectories/"
-results_dir: "bixbench-v2_results"
+data_path: "bixbench-v1.5_results/trajectories/"
+results_dir: "bixbench-v1.5_results"
debug: true
replicate_paper_results:
- run: true
+ run: false
# from_trajectories: true # Process from trajectories, not pre-computed eval_df
from_trajectories: false # Process from trajectories, not pre-computed eval_df
From fd2ccf8673535964029831aaa99d8bdcc17f4317 Mon Sep 17 00:00:00 2001
From: Alex Andonian
Date: Fri, 26 Sep 2025 05:47:48 +0000
Subject: [PATCH 05/16] Update plotting utility and configuration for majority
vote accuracy
- Adjusted figure size in `plotting_utils.py` for better visualization.
- Modified x-axis limits to dynamically reflect the maximum k value.
- Streamlined code for error bar calculations in bar plots.
- Updated `v1.5_paper_results.yaml` to change `k_value` from 10 to 5 and enabled the replication of paper results.
---
bixbench/plotting_utils.py | 24 +++++++------------
.../run_configuration/v1.5_paper_results.yaml | 5 ++--
2 files changed, 10 insertions(+), 19 deletions(-)
diff --git a/bixbench/plotting_utils.py b/bixbench/plotting_utils.py
index 864737a..d6c51a3 100644
--- a/bixbench/plotting_utils.py
+++ b/bixbench/plotting_utils.py
@@ -44,7 +44,7 @@ def majority_vote_accuracy_by_k(
]
if random_baselines is None:
random_baselines = [0.2, 0.25]
- plt.figure(figsize=(15, 6))
+ plt.figure(figsize=(12, 6))
for run_name, (k_values, means, stds) in run_results.items():
if k_values is None:
@@ -59,7 +59,7 @@ def majority_vote_accuracy_by_k(
plt.xlabel("Number of Votes (k)", fontsize=18)
plt.ylabel("Accuracy", fontsize=18)
- plt.xlim(1, 9)
+ plt.xlim(1, max(k_values))
plt.ylim(0.1, 0.35)
plt.yticks(
np.arange(0.1, 0.36, 0.05),
@@ -70,9 +70,7 @@ def majority_vote_accuracy_by_k(
plt.xticks(k_values, fontsize=18)
plt.gca().yaxis.set_major_formatter(ticker.FormatStrFormatter("%.2f"))
- for i, (baseline, label) in enumerate(
- zip(random_baselines, random_baselines_labels, strict=True)
- ):
+ for i, (baseline, label) in enumerate(zip(random_baselines, random_baselines_labels, strict=True)):
plt.axhline(
y=baseline,
color="red" if i == 0 else "green",
@@ -229,17 +227,11 @@ def draw_model_bars(
mean = results[run_name]["mean"]
ci_low = results[run_name]["ci_low"]
ci_high = results[run_name]["ci_high"]
- yerr = np.array(
- [
- [mean - ci_low],
- [ci_high - mean],
- ]
- )
- label, color = next(
- [group, color]
- for group, color in color_map.items()
- if group in run_name
- )
+ yerr = np.array([
+ [mean - ci_low],
+ [ci_high - mean],
+ ])
+ label, color = next([group, color] for group, color in color_map.items() if group in run_name)
xpos = x_axis[group_idx] + j * bar_width
plt.bar(
xpos,
diff --git a/bixbench/run_configuration/v1.5_paper_results.yaml b/bixbench/run_configuration/v1.5_paper_results.yaml
index 3041121..fe279ee 100644
--- a/bixbench/run_configuration/v1.5_paper_results.yaml
+++ b/bixbench/run_configuration/v1.5_paper_results.yaml
@@ -3,13 +3,12 @@ results_dir: "bixbench-v1.5_results"
debug: true
replicate_paper_results:
- run: false
- # from_trajectories: true # Process from trajectories, not pre-computed eval_df
+ run: true
from_trajectories: false # Process from trajectories, not pre-computed eval_df
majority_vote:
run: true
- k_value: 10
+ k_value: 5
groups:
image_comparison:
- "claude_image_mcq_with_refusal"
From 8e2201719a7b2108a8873e11505dd20a77598a96 Mon Sep 17 00:00:00 2001
From: Alex Andonian
Date: Fri, 26 Sep 2025 05:49:34 +0000
Subject: [PATCH 06/16] Add scripts for agentic and zero-shot evaluations
- Introduced `run_agentic.sh` to automate agentic evaluations, including configuration checks, running evaluations for multiple setups, and postprocessing results.
- Added `run_zeroshot.sh` for zero-shot evaluations, encompassing grading and result aggregation into a single JSON file.
- Both scripts ensure proper directory structure and provide user feedback during execution.
---
scripts/run_agentic.sh | 144 ++++++++++++++++++++++++++++++++++++++
scripts/run_zeroshot.sh | 148 ++++++++++++++++++++++++++++++++++++++++
2 files changed, 292 insertions(+)
create mode 100755 scripts/run_agentic.sh
create mode 100755 scripts/run_zeroshot.sh
diff --git a/scripts/run_agentic.sh b/scripts/run_agentic.sh
new file mode 100755
index 0000000..ecbf5e7
--- /dev/null
+++ b/scripts/run_agentic.sh
@@ -0,0 +1,144 @@
+#!/bin/bash
+
+# Script to reproduce all BixBench paper results
+# This script runs the main agentic evaluations
+
+set -e # Exit on error
+
+echo "=========================================="
+echo "BixBench Reproduction Script"
+echo "=========================================="
+echo ""
+
+# Configuration
+RESULTS_DIR="bixbench-v1.5_results"
+ZERO_SHOT_DIR="${RESULTS_DIR}/zero_shot_baselines"
+TRAJECTORIES_DIR="${RESULTS_DIR}/trajectories"
+CONFIG_DIR="bixbench/run_configuration"
+POSTPROCESS_CONFIG="bixbench/run_configuration/v1.5_paper_results.yaml"
+NUM_REPLICAS=5
+
+# Colors for output
+RED='\033[0;31m'
+GREEN='\033[0;32m'
+YELLOW='\033[1;33m'
+NC='\033[0m' # No Color
+
+# Function to print colored output
+print_status() {
+ echo -e "${GREEN}[$(date '+%Y-%m-%d %H:%M:%S')]${NC} $1"
+}
+
+print_warning() {
+ echo -e "${YELLOW}[WARNING]${NC} $1"
+}
+
+print_error() {
+ echo -e "${RED}[ERROR]${NC} $1"
+}
+
+# Check if required directories exist
+if [ ! -d "$RESULTS_DIR" ]; then
+ print_status "Creating results directory: $RESULTS_DIR"
+ mkdir -p "$RESULTS_DIR"/{zero_shot_baselines,trajectories}
+fi
+
+# ==========================================
+# AGENTIC EVALUATIONS
+# ==========================================
+
+print_status "Starting AGENTIC evaluations..."
+echo ""
+print_warning "This will take a long time (24-48 hours total for all runs)"
+echo ""
+
+# List of all configurations
+CONFIGS=(
+ "4o_image"
+ "4o_no_image"
+ "claude_image"
+ "claude_no_image"
+)
+
+# Run one agentic evaluation. Usage: run_agentic <config_name> <replica_id>
+run_agentic() {
+ local config_name=$1  # basename (without .yaml) of a config file in $CONFIG_DIR
+ local replica_id=$2  # forwarded to generate_trajectories.py as --replica_id
+ local config_file="${CONFIG_DIR}/${config_name}.yaml"
+
+ if [ ! -f "$config_file" ]; then
+ print_error "Configuration file not found: $config_file"
+ return 1  # non-zero status; the script runs under `set -e`
+ fi
+
+ print_status "Running agentic evaluation: $config_name"
+ print_status "Config file: $config_file"
+
+ # Create a subdirectory for this run's trajectories
+ mkdir -p "${TRAJECTORIES_DIR}/${config_name}"
+
+ # Run the evaluation
+ echo "Running replica $replica_id"
+ python bixbench/generate_trajectories.py --config_file "$config_file" --replica_id "$replica_id"
+
+ print_status "Completed: $config_name"
+ echo ""
+}
+
+# Ask user if they want to run all agentic evaluations
+echo "Do you want to run all agentic evaluations? (y/n)"
+echo "WARNING: This will take 24-48 hours and requires API credits for both OpenAI and Anthropic"
+read -r response
+
+if [[ "$response" =~ ^[Yy]$ ]]; then
+    # Run every config for replica IDs 0..NUM_REPLICAS-1 (exactly NUM_REPLICAS replicas)
+    for replica_id in $(seq 0 $((NUM_REPLICAS - 1))); do
+        for config in "${CONFIGS[@]}"; do
+            run_agentic "$config" "$replica_id"
+        done
+    done
+
+    print_status "All agentic evaluations complete!"
+else
+    print_warning "Skipping agentic evaluations. You can run them individually using:"
+    echo "python bixbench/generate_trajectories.py --config_file bixbench/run_configuration/<config_name>.yaml"
+fi
+
+echo ""
+
+# ==========================================
+# POSTPROCESSING
+# ==========================================
+
+print_status "Running postprocessing to generate figures..."
+echo ""
+
+# Check if postprocessing config exists
+if [ ! -f "$POSTPROCESS_CONFIG" ]; then
+    print_error "Postprocessing config not found: $POSTPROCESS_CONFIG"
+    exit 1
+fi
+
+# Run postprocessing
+python bixbench/postprocessing.py --config_file "$POSTPROCESS_CONFIG"
+
+print_status "Postprocessing complete! Figures saved to ${RESULTS_DIR}/figures"
+echo ""
+
+# ==========================================
+# SUMMARY
+# ==========================================
+
+echo "=========================================="
+echo "REPRODUCTION COMPLETE!"
+echo "=========================================="
+echo ""
+echo "Results saved to:"
+echo " - Zero-shot baselines: ${ZERO_SHOT_DIR}/"
+echo " - Trajectories: ${TRAJECTORIES_DIR}/"
+echo ""
+echo "To view the figures:"
+echo " - Performance comparison: ${RESULTS_DIR}/bixbench_results_comparison.png"
+echo " - Majority vote (refusal): ${RESULTS_DIR}/majority_vote_accuracy_refusal_option_comparison.png"
+echo " - Majority vote (images): ${RESULTS_DIR}/majority_vote_accuracy_image_comparison.png"
+echo ""
diff --git a/scripts/run_zeroshot.sh b/scripts/run_zeroshot.sh
new file mode 100755
index 0000000..b923933
--- /dev/null
+++ b/scripts/run_zeroshot.sh
@@ -0,0 +1,148 @@
+#!/bin/bash
+
+# Script to reproduce all BixBench paper results
+# This script runs the zero-shot evaluations
+
+set -e # Exit on error
+
+echo "=========================================="
+echo "BixBench Reproduction Script"
+echo "=========================================="
+echo ""
+
+# Configuration
+RESULTS_DIR="bixbench-v1.5_results"
+ZERO_SHOT_DIR="${RESULTS_DIR}/zero_shot_baselines"
+TRAJECTORIES_DIR="${RESULTS_DIR}/trajectories"
+CONFIG_DIR="bixbench/run_configuration"
+POSTPROCESS_CONFIG="bixbench/run_configuration/v1.5_paper_results.yaml"
+
+# Colors for output
+RED='\033[0;31m'
+GREEN='\033[0;32m'
+YELLOW='\033[1;33m'
+NC='\033[0m' # No Color
+
+# Function to print colored output
+print_status() {
+ echo -e "${GREEN}[$(date '+%Y-%m-%d %H:%M:%S')]${NC} $1"
+}
+
+print_warning() {
+ echo -e "${YELLOW}[WARNING]${NC} $1"
+}
+
+print_error() {
+ echo -e "${RED}[ERROR]${NC} $1"
+}
+
+# Check if required directories exist
+if [ ! -d "$RESULTS_DIR" ]; then
+ print_status "Creating results directory: $RESULTS_DIR"
+ mkdir -p "$RESULTS_DIR"/{zero_shot_baselines,trajectories}
+fi
+
+# ==========================================
+# PART 1: ZERO-SHOT EVALUATIONS
+# ==========================================
+
+print_status "Starting ZERO-SHOT evaluations..."
+echo ""
+
+# Generate zero-shot answers. Usage: run_zero_shot <answer_mode> <model> <refusal_flag> <output_file>
+run_zero_shot() {
+ local answer_mode=$1  # passed through as --answer-mode
+ local model=$2  # passed through as --model
+ local refusal_flag=$3  # "--with-refusal" enables the refusal option; any other value omits the flag
+ local output_file=$4  # passed as --output-file; the output directory is $ZERO_SHOT_DIR
+
+ print_status "Running zero-shot: $model - $answer_mode $refusal_flag"
+
+ if [ "$refusal_flag" = "--with-refusal" ]; then
+ python generate_zeroshot_evals.py \
+ --answer-mode "$answer_mode" \
+ --model "$model" \
+ --with-refusal \
+ --output-dir "$ZERO_SHOT_DIR" \
+ --output-file "$output_file" \
+ --dataset-split "train"
+ else
+ python generate_zeroshot_evals.py \
+ --answer-mode "$answer_mode" \
+ --model "$model" \
+ --output-dir "$ZERO_SHOT_DIR" \
+ --output-file "$output_file" \
+ --dataset-split "train"
+ fi
+}
+
+# Run all zero-shot evaluations
+
+
+if true; then
+ print_status "Running GPT-4o zero-shot evaluations..."
+ run_zero_shot "openanswer" "gpt-4o" "" "gpt-4o-grader-openended.csv"
+ run_zero_shot "mcq" "gpt-4o" "--with-refusal" "gpt-4o-grader-mcq-refusal-True.csv"
+ run_zero_shot "mcq" "gpt-4o" "" "gpt-4o-grader-mcq-refusal-False.csv"
+
+ print_status "Running Claude-3.5-Sonnet zero-shot evaluations..."
+ run_zero_shot "openanswer" "claude-3-5-sonnet-latest" "" "claude-3-5-sonnet-latest-grader-openended.csv"
+ run_zero_shot "mcq" "claude-3-5-sonnet-latest" "--with-refusal" "claude-3-5-sonnet-latest-grader-mcq-refusal-True.csv"
+ run_zero_shot "mcq" "claude-3-5-sonnet-latest" "" "claude-3-5-sonnet-latest-grader-mcq-refusal-False.csv"
+fi
+
+echo ""
+print_status "Zero-shot evaluations complete!"
+echo ""
+
+# ==========================================
+# PART 2: GRADE ZERO-SHOT RESULTS
+# ==========================================
+
+print_status "Grading zero-shot results..."
+echo ""
+
+# Grade one zero-shot output file. Usage: grade_zero_shot <input_file> <answer_mode>
+grade_zero_shot() {
+ local input_file=$1  # file name inside $ZERO_SHOT_DIR (not a full path)
+ local answer_mode=$2  # passed through as --answer-mode
+
+ print_status "Grading: $input_file"
+
+ python grade_outputs.py \
+ --input-file "${ZERO_SHOT_DIR}/${input_file}" \
+ --answer-mode "$answer_mode" \
+ --output-dir "$ZERO_SHOT_DIR"
+}
+
+# Grade all zero-shot results
+grade_zero_shot "gpt-4o-grader-openended.csv" "openanswer"
+grade_zero_shot "gpt-4o-grader-mcq-refusal-True.csv" "mcq"
+grade_zero_shot "gpt-4o-grader-mcq-refusal-False.csv" "mcq"
+grade_zero_shot "claude-3-5-sonnet-latest-grader-openended.csv" "openanswer"
+grade_zero_shot "claude-3-5-sonnet-latest-grader-mcq-refusal-True.csv" "mcq"
+grade_zero_shot "claude-3-5-sonnet-latest-grader-mcq-refusal-False.csv" "mcq"
+
+# Aggregate zero-shot results into a single JSON file
+print_status "Aggregating zero-shot results..."
+python -c "
+import json
+import glob
+import os
+
+results = {}
+for file in glob.glob('${ZERO_SHOT_DIR}/*.json'):
+ if 'zero_shot_baselines.json' not in file:
+ with open(file, 'r') as f:
+ data = json.load(f)
+ basename = os.path.basename(file).replace('.json', '')
+ results[basename] = data
+
+with open('${ZERO_SHOT_DIR}/zero_shot_baselines.json', 'w') as f:
+ json.dump(results, f, indent=4)
+print(f'Aggregated {len(results)} zero-shot results')
+"
+
+echo ""
+print_status "Zero-shot grading complete!"
+echo ""
From df6eacae853a10b26659e2f59ade29100e9e3d2a Mon Sep 17 00:00:00 2001
From: Alex Andonian
Date: Fri, 26 Sep 2025 06:32:37 +0000
Subject: [PATCH 07/16] Update README.md for clarity and new features
- Corrected the dataset question count from 296 to 205 and updated related links for consistency.
- Added a new "Quick Start" section with automated evaluation scripts for easier reproduction of results.
- Enhanced instructions for trajectory generation and evaluation, including details on API costs and execution time.
- Improved formatting and clarity throughout the document, including updated links and section titles.
---
README.md | 155 ++++++++++++++++++++++++++++++++++++++++++++----------
1 file changed, 128 insertions(+), 27 deletions(-)
diff --git a/README.md b/README.md
index d4b2531..2366d18 100644
--- a/README.md
+++ b/README.md
@@ -1,4 +1,4 @@
-
+
@@ -7,7 +7,7 @@
-
+
# BixBench: A Comprehensive Benchmark for LLM-based Agents in Computational Biology
@@ -19,9 +19,9 @@ This benchmark tests AI agents' ability to:
- Interpret nuanced results in the context of a research question
BixBench presents AI agents with open-ended or multiple-choice tasks, requiring them to navigate datasets, execute code (Python, R, Bash), generate scientific hypotheses, and validate them.
-The dataset contains 296 questions derived from 53 real-world, published Jupyter notebooks and related data (capsules).
+The dataset contains 205 questions derived from 60 real-world, published Jupyter notebooks and related data (capsules).
-You can find the BixBench dataset in [Hugging Face](https://huggingface.co/datasets/futurehouse/BixBench), read details in the paper [here](https://arxiv.org/abs/2503.00096), and read our the blog post announcement [here](https://www.futurehouse.org/research-announcements/bixbench).
+You can find the BixBench dataset on [Hugging Face](https://huggingface.co/datasets/futurehouse/BixBench), read details in the [paper](https://arxiv.org/abs/2503.00096), and read our [blog post announcement](https://www.futurehouse.org/research-announcements/bixbench).
This repository enables three separate functions:
@@ -32,9 +32,11 @@ This repository enables three separate functions:
## Links
- [Installation](#installation)
+- [Quick Start](#quick-start)
- [Agentic Evaluations](#agentic-evaluations)
- [Using Your Own Agent](#using-your-own-agent)
-- [Zero-shot Evaluations](#zero-shot-evaluations)
+- [Zero-shot Evaluations & Grading](#zero-shot-evaluations--grading)
+- [Automated Evaluation Scripts](#automated-evaluation-scripts)
- [Replicating the BixBench Paper Results](#replicating-the-bixbench-paper-results)
- [Acknowledgments](#acknowledgments)
@@ -58,7 +60,7 @@ Next, you will need to be able to access the BixBench dataset. To do this, you w
huggingface-cli login
```
-See [here](https://huggingface.co/docs/huggingface_hub/en/guides/cli) for how to get started with the Hugging Face CLI and [here](https://huggingface.co/docs/huggingface_hub/en/guides/security-tokens) for more information on how to create a token.
+See the [Hugging Face CLI guide](https://huggingface.co/docs/huggingface_hub/en/guides/cli) for how to get started with the CLI and the [security tokens guide](https://huggingface.co/docs/huggingface_hub/en/guides/security-tokens) for more information on how to create a token.
Finally, the agent executes its data analysis code in a containerized environment. So to run it, you will need to pull the docker image:
@@ -67,7 +69,33 @@ Finally, the agent executes its data analysis code in a containerized environmen
docker pull futurehouse/bixbench:aviary-notebook-env
```
-See [here](https://www.docker.com/get-started/) for instructions on how to set up Docker.
+See [Docker's Getting Started Guide](https://docs.docker.com/get-started/) for instructions on how to set up Docker.
+
+## Quick Start
+
+For quick reproduction of BixBench results, we provide automated scripts that handle the entire evaluation pipeline:
+
+### Option 1: Automated Evaluation (Recommended)
+
+```bash
+# Run zero-shot evaluations and grading (fastest)
+bash scripts/run_zeroshot.sh
+
+# Run agentic evaluations with multiple replicas (takes 24-48 hours)
+bash scripts/run_agentic.sh
+```
+
+### Option 2: Manual Configuration
+
+```bash
+# Generate trajectories with specific configuration
+python bixbench/generate_trajectories.py --config_file bixbench/run_configuration/4o_image.yaml
+
+# Run postprocessing to generate results
+python bixbench/postprocessing.py --config_file bixbench/run_configuration/v1.5_paper_results.yaml
+```
+
+⚠️ **Note**: The automated agentic evaluation script will run multiple models (GPT-4o, Claude) across 5 replicas each, which requires significant API credits and 24-48 hours to complete.
## Prerequisites
@@ -75,7 +103,7 @@ See [here](https://www.docker.com/get-started/) for instructions on how to set u
We support all LLMs that are supported by [litellm](https://github.com/BerriAI/litellm). Create a `.env` file with the API keys for the LLMs you want to evaluate. For example:
-```
+```env
OPENAI_API_KEY = "your-openai-api-key"
ANTHROPIC_API_KEY = "your-anthropic-api-key"
```
@@ -101,7 +129,7 @@ This will:
2. Preprocess each capsule in the dataset
3. Generate and store trajectories including the final agent answer and Jupyter notebook in the directory specified in the YAML file
-Trajectories are saved in the `bixbench_results/` directory as json files.
+Trajectories are saved as JSON files in the `trajectories_dir` specified in the YAML file (default is `data/trajectories/`).
### Customization
@@ -118,10 +146,6 @@ Edit or create a new YAML file to modify:
To use your own agent, use the `generate_trajectories.py` script by editing the [`custom_rollout`](https://github.com/Future-House/BixBench/blob/6c28217959d5d7dd6f48c59894534fced7c6c040/bixbench/generate_trajectories.py#L239) function to generate trajectories in the same format as the BixBench trajectories, then use the `postprocessing.py` script to evaluate your agent's performance.
-### Hosted trajectory generation
-
-Coming soon!
-
### Evaluate trajectories
Similarly, to evaluate the trajectories, we use the `postprocessing.py` script alongside a YAML configuration file:
@@ -151,48 +175,125 @@ You can run zero-shot evaluations using the `generate_zeroshot_evals.py` script
The scripts can be configured to run with open-ended questions, multiple-choice questions (with or without a refusal option), different models, and different temperatures. To explore the different options, run the scripts with the `--help` flag.
-**Example: Generate zero-shot answers in MCQ setting with the "refusal option" (in addition to the original distractors)**
+### Example: Generate zero-shot answers in MCQ setting with the "refusal option" (in addition to the original distractors)
```bash
python generate_zeroshot_evals.py \
- --answer-mode "mcq" \
- --model "gpt-4o" \
- --with-refusal
+ --answer-mode "mcq" \
+ --model "gpt-4o" \
+ --with-refusal
```
-**Example: Grade the zero-shot answers from the previous step**
+### Example: Grade the zero-shot answers from the previous step
```bash
python grade_outputs.py \
- --input-file path/to/zeroshot.csv \
- --answer-mode "mcq"
+ --input-file path/to/zeroshot.csv \
+ --answer-mode "mcq"
+```
+
+## Automated Evaluation Scripts
+
+For v1.5 of BixBench, we provide comprehensive automation scripts that handle the complete evaluation pipeline, including support for running multiple replicas to replicate the BixBench paper results.
+
+### Zero-shot Evaluation Script
+
+The `scripts/run_zeroshot.sh` script automates the entire zero-shot evaluation process:
+
+```bash
+bash scripts/run_zeroshot.sh
```
+This script will:
+1. **Generate zero-shot answers** for both GPT-4o and Claude-3.5-Sonnet across three answer modes:
+ - Open-ended questions
+ - Multiple choice questions (MCQ) with refusal option
+ - Multiple choice questions (MCQ) without refusal option
+2. **Grade all responses** using appropriate graders (LLM-based for open-ended, exact match for MCQs)
+3. **Aggregate results** into a unified JSON file for analysis
+
+### Agentic Evaluation Script
+
+The `scripts/run_agentic.sh` script handles complete agentic evaluations with replica support:
+
+```bash
+bash scripts/run_agentic.sh
+```
+
+**⚠️ Warning**: This script runs resource-intensive evaluations that will:
+- Execute 5 replicas for each configuration (GPT-4o and Claude, with/without image support)
+- Take approximately 24-48 hours to complete
+- Incur significant API costs for both OpenAI and Anthropic
+
+The script includes:
+- **Multiple model configurations**: 4o_image, 4o_no_image, claude_image, claude_no_image
+- **Replica management**: Automatically runs 5 replicas per configuration for statistical robustness and majority vote analysis
+- **Progress tracking**: Real-time status updates and error handling
+- **Directory management**: Automatic creation of `bixbench-v1.5_results/` structure
+- **Automatic postprocessing**: Generates all comparison plots and analysis files
+
## Replicating the BixBench Paper Results
-To replicate the BixBench paper results for agentic evaluations, you can download the raw data from 2,120 trajectories and its respective postprocessed evaluation dataframe:
+### v1.5 Results (Latest)
+
+For the latest BixBench v1.5 results using the enhanced 205-question dataset, use the automated scripts or the v1.5 configuration:
+
+#### Quick Reproduction
+```bash
+# Complete reproduction using automation scripts
+bash scripts/run_zeroshot.sh # Zero-shot baselines
+bash scripts/run_agentic.sh # Agentic evaluations (24-48 hours)
+```
+
+#### Manual v1.5 Reproduction
+```bash
+# Generate v1.5 results
+python bixbench/postprocessing.py --config_file bixbench/run_configuration/v1.5_paper_results.yaml
+```
+
+The v1.5 configuration includes:
+- **Enhanced majority vote analysis** with k=5 replicas
+- **Image comparison analysis** (with/without image support)
+- **Refusal option comparison** (with/without refusal options in MCQs)
+- **Zero-shot baseline integration** for comprehensive model comparison
+- **Updated result paths** using `bixbench-v1.5_results/` directory structure
+
+### Original Paper Results
+
+To replicate the original BixBench paper results, you can download the raw data from 2,120 trajectories and the corresponding postprocessed evaluation dataframe:
```bash
wget https://storage.googleapis.com/bixbench-results/raw_trajectory_data.csv -P bixbench_results/
wget https://storage.googleapis.com/bixbench-results/eval_df.csv -P bixbench_results/
```
-You can then run the postprocessing script to generate the evaluation dataframe and analysis plots using the `bixbench/run_configuration/bixbench_paper_results.yaml` configuration file:
+You can then run the postprocessing script to generate the evaluation dataframe and analysis plots using the original configuration file:
```bash
python bixbench/postprocessing.py --config_file bixbench/run_configuration/bixbench_paper_results.yaml
```
-You will see the following figures from the paper:
-
+### Generated Figures
+
+The evaluation process will generate the following comparative visualizations:
+
+**v1.5 Results:**
+- `bixbench-v1.5_results/bixbench_results_comparison.png` - Overall performance comparison
+- `bixbench-v1.5_results/majority_vote_accuracy_refusal_option_comparison.png` - MCQ with/without refusal analysis
+- `bixbench-v1.5_results/majority_vote_accuracy_image_comparison.png` - Image support comparison
-
+**Original Results:**
+- `bixbench_results/bixbench_results_comparison.png` - Original performance comparison
+- `bixbench_results/majority_vote_accuracy_refusal_option_comparison.png` - Original majority vote analysis
## Gotchas
- The BixBench dataset is large and may take several minutes to download.
-- When generating trajectories, the default batch size is set to 4 to optimize processing speed. You may need to adjust this value in the [configuration file](https://github.com/Future-House/BixBench/blob/8c57d3562044e4ce574a09438066033e21155f54/bixbench/run_configuration/generate_trajectories.yaml#L14) based on your API rate limits and available compute resources.
-- While the agent uses the local Jupyter kernel by default, we recommend using our custom Docker environment for improved performance. To enable this, pull the Docker image as described in the [Installation](#installation) section and set the environment variable `USE_DOCKER=true` when running the `generate_trajectories.py` script.
+- **API Costs**: The automated agentic evaluation script (`run_agentic.sh`) will incur significant API costs as it runs 5 replicas across multiple model configurations (GPT-4o, Claude-3.5-Sonnet). Estimate your costs before running.
+- **Execution Time**: Complete agentic evaluations take 24-48 hours. Use the zero-shot script (`run_zeroshot.sh`) for faster results if you only need baseline comparisons.
+- When generating trajectories manually, the default batch size is set to 4 to optimize processing speed. You may need to adjust this value in the [configuration file](https://github.com/Future-House/BixBench/blob/8c57d3562044e4ce574a09438066033e21155f54/bixbench/run_configuration/generate_trajectories.yaml#L14) based on your API rate limits and available compute resources.
+- While the agent can use a local Jupyter kernel, we recommend using our custom Docker environment for improved performance (default is `USE_DOCKER=true`). Be sure to pull the Docker image as described in the [Installation](#installation) section. If you would like to use a local Jupyter kernel instead, set `use_docker` to `false` in the notebook section of the configuration file.
+- **Directory Structure**: The v1.5 automation scripts create and use `bixbench-v1.5_results/` while manual configurations may use `bixbench_results/` or other directories as specified in the YAML files.
## Acknowledgments
From 28909d842bc492ecd99bab303279afb29e3cb353 Mon Sep 17 00:00:00 2001
From: Alex Andonian
Date: Fri, 26 Sep 2025 06:36:52 +0000
Subject: [PATCH 08/16] Add new results and comparison files for BixBench
evaluation
- Introduced new binary image files for results comparison: `bixbench_results_comparison.png`, `majority_vote_accuracy_image_comparison.png`, and `majority_vote_accuracy_refusal_option_comparison.png`.
- Added JSON file `zero_shot_baselines.json` containing baseline evaluation metrics for various models.
- Created CSV files for zero-shot grading results, including `claude-3-5-sonnet-latest-grader-mcq-refusal-False.csv`, `claude-3-5-sonnet-latest-grader-mcq-refusal-True.csv`, `claude-3-5-sonnet-latest-grader-openended.csv`, and `gpt-4o-grader-mcq-refusal-False.csv`, `gpt-4o-grader-mcq-refusal-True.csv`, `gpt-4o-grader-openended.csv`.
- These additions enhance the evaluation framework and facilitate better comparison of model performance across different scenarios.
---
.../bixbench_results_comparison.png | Bin 0 -> 85277 bytes
...ajority_vote_accuracy_image_comparison.png | Bin 0 -> 175610 bytes
...ote_accuracy_refusal_option_comparison.png | Bin 0 -> 198317 bytes
.../zero_shot_baselines.json | 50 +++
...sonnet-latest-grader-mcq-refusal-False.csv | 211 +++++++++++
...-sonnet-latest-grader-mcq-refusal-True.csv | 211 +++++++++++
...ude-3-5-sonnet-latest-grader-openended.csv | 347 ++++++++++++++++++
.../gpt-4o-grader-mcq-refusal-False.csv | 211 +++++++++++
.../gpt-4o-grader-mcq-refusal-True.csv | 211 +++++++++++
.../gpt-4o-grader-openended.csv | 238 ++++++++++++
10 files changed, 1479 insertions(+)
create mode 100644 bixbench-v1.5_results/bixbench_results_comparison.png
create mode 100644 bixbench-v1.5_results/majority_vote_accuracy_image_comparison.png
create mode 100644 bixbench-v1.5_results/majority_vote_accuracy_refusal_option_comparison.png
create mode 100644 bixbench-v1.5_results/zero_shot_baselines.json
create mode 100644 bixbench-v1.5_results/zero_shot_baselines/claude-3-5-sonnet-latest-grader-mcq-refusal-False.csv
create mode 100644 bixbench-v1.5_results/zero_shot_baselines/claude-3-5-sonnet-latest-grader-mcq-refusal-True.csv
create mode 100644 bixbench-v1.5_results/zero_shot_baselines/claude-3-5-sonnet-latest-grader-openended.csv
create mode 100644 bixbench-v1.5_results/zero_shot_baselines/gpt-4o-grader-mcq-refusal-False.csv
create mode 100644 bixbench-v1.5_results/zero_shot_baselines/gpt-4o-grader-mcq-refusal-True.csv
create mode 100644 bixbench-v1.5_results/zero_shot_baselines/gpt-4o-grader-openended.csv
diff --git a/bixbench-v1.5_results/bixbench_results_comparison.png b/bixbench-v1.5_results/bixbench_results_comparison.png
new file mode 100644
index 0000000000000000000000000000000000000000..816f854c4b0439d2fcd7678091866e09d43f5c1c
GIT binary patch
literal 85277
zcmeFZ^;?u{`!~ummXEsBXIZEy7$_no-Qcp2R_RtL5r*!x3j{4Isq?ybRzqpSx?Sto*w{8GpE-G+
zjcv^*Hnx?+|M?T2xov9s0)NT4p3-;KaV-U{Q1NFlbZtDA8c&O
zC;!p*jQ>ve@Z8$IRQt0jm2KaXRmV3}c$~ejZL{v_-luyHA5kg0e)w*MqtO0!5?j|_
zKRjA+_4?Iy`%nGzPqONrvnSSyx81zi#+bihHaNi@XCcYmxvD2;=)f2LHCe!^czK=t|
ztkT%FBU|U){cY`K9uC|1^*_<}?F%V(9SN&kUh+whaZ?$m73tMQ3RLXbj91Q$sor5;
zzGoF1+xsdR*Wt&COTSKpiCuWj=J)d*yJ_IS0*kS3tyH_3`Pl=WpCUg<0l)LRH_-$uowU^m0rq}*u3tCe?_?3DZzug0e%9J>=4hvfeT)*y`S;(Gd*r>fc{Ji*M!Jo6iA$bXb=5e}
zWk`cUD^&mc@4xTgyLYbn=_#FD3uW;uZD%&@5_`#9!Pf7g8p-d(`qh%6%Ivbu8PG3V
zyLRo=>}=Dkxrg$`6^dbDVfnr+=k_e4j`e?k_1@0KmGgi@@(4e_TAD_}`Qyj0tArnZ
zB4YV*I5p1x{N>Bfi%!%t1dWRyAvPGxd)9KdDo6l`pQ(u@?lop*>
z#pbz|6q@d{DJt9c$hf|b6*~9yrbdFy0JXUG=g*%<404m7K6^$95xuW6vwq#WmZ^a{
zQ31&-ZHFnHxd++zf83eKePrFXBgt4}8Rt_}u5{+6c+Y%KC$Dh9mDM}ankH`FEk!e5
zoMX0T{UvT^E&_Nka~-rnO_Q4ZJJT`
z$$Ohed#kEWhj3;c(RtBtdi9tf^V-n}J=2HW
z5duaM)qW4tCdwui8e+v&!^E8X&s5Cz_~_{*rA>wuQfQ^jvJ~^$V1o^7J6;AXv6Lpu
z8Nq`MacR-f(XMsvFLYF^?`-P$^5%xz;AVtLvN3_
z#>zkc`r$5R{~<(J)`jUm)t{sM=#bSIHLJ&emlrk8q0+{u;`RoP$PpQ3W#vQr_U&VT
zx-vADF~npvDbBh}b=<@PiFy2ZZ%C!l!oos*w6OVS_Wiv^mA)l)v%m1a+={FiQR~=T
z%Q~wu;gL5t)=->;%qr!KMwz!Asurm;#)78h(G<5{$LbBb67+sa<%j9mRJBcM
z+Ue&x<$h9jO7AVQ>nS%7FwFmSe;Z%<
z9+kxYh0?8jx@ry;Ox@J6U%!66ZwhHlVCxTfcKSg})e2#e{1u0N@jp$9%f5w0;<$R6O1K3if1Cd74KWn+tMj8aA
zr1Y6%9eVhxcAk~`w!>#%7cR7=YmIj14Y>u_e=F!P)he>5yZ-v|Rjls%jT<^Srt$^$
zbVKqp$Btd4u&@=~>I8}||NQnYrf>^8ds?O6{AjZhmyU%+ve7F6({lX>yTn`Y_y&6(
zIXO=cp9|@5>bSRsr|p^2CX-^veRPq!D4~vLAqu0qeq?<=(o{%KPUdD#_A1gZYO(DV
z`ohn}#%7T3`1QDYyi{|R^DzxC)baKr2eYWUAmRJ>@3&w}J#Tg!xJ%V+oPo5mc!YOp`
z)_I}Liymgs8?aoVktmlQyhnBv_0`$OZ!cev@MvpVh;EK4MSgH-DD7+Yov|l(I8|B~
z-CbStBL$37*KRpg!AyJ+-qmSj$h9yNR*{${LR5ZwfEA2-b;v}0?a%*d9UMPid8s=X<
zcRPo2MLOmfm!y4sV}dM?@{zNTubvLxP3rc@+gs~nB`C4XMAm3V4hrb|zsjzM;OU``
zW)v4oDeINCEPu~yYD-a#+8xNnmVIAuGGGnc-XzC~xAfZD+E@kG;j_2L#(OGgN5-)2
z#s)_XA99tQXd<=9pR0mNHL}m~n47j~EwI(2%?VZhy0r0htfI7s(ZXPygSYH%wiw5r
zwB`y5_13LhG1phGH_WlTwqkWXpujk_G(#^(;>nXIgQ$~@Uk_iiRn^%2$W+y&n8RER
z5GHGAYBFG1mhMbS^|+TJYR*XH;&@SSrZc`;ckOxJre3F`HPKB9^Yt^P&rw`D^KBj(
zjcLS)yxV$2`+cTC-r$Q%XDS>%Xwn^kAY{~wW~*;+aPiTtoU{{2Fr$Z&H
zMpr^g>hr>MtL8|gV!%>4S?ZKA?M(gVlc!FVEH0oOU(A}vHJ4PFT5aCESq&{es}#NZ
zicw;8^ku_BI~#$3@2_e_GAAqNC-UT*0BNE>fBtM2$j+9_VN$%3?aLFaBE!}gw`+st
z-CpvqIgTGcK3%;@kt}GcxtTvJh5f+CxcD-Gjlf%%xY}BXUe=9FPv=?bFB^s$6Iua*tQ*#>Su=(!
zZr>o;{e?P*_ubl4;gds4#N}@m!$UPHnnfGTO;w9|y8qPuVx8L0ugobPli0y@87t?=
z@Nf&+?F4A*X6irTl=n_H@<1gf<;xZwgleu7yxVC$`iO*tE>K{>AfNX0FI4Y|&y{{<
z=oW{SkC>F^|G9DZWupl!8Xt!xV80QnxZj*>L%eirtOGSO#(!>v2aWMC?c3_B-%k@p
zw8?FCJ4{lg}Pv4J8$&3YnBzxILJ(
z8)-=??4{!7YoT>?zgPxBId^B{o=j4SjXsP8NqO{6I7(?}8%q~;xhQ#h7K_tgUOYiN
ztH5ht{`0i$+@~ie1jjnj^06^(8agucG^>{8GHC2a*{&9a9$Ulq`h+Ci^$}JDw=xQ;
zo}FE0oin?j@h7xEZWGqIhY5-nS2LOck7)}_i}Mv**P=%Mnj6c>vN1C=tC;DsH4XZc
zZK*b4RjKf40Nz*fk$@|EWZjGTCwYlg>s)u^4
z7_TZbpF-0`jVT;7Iru4Hk*Q}Zvv=>Y5QVwtG^XtCjbxuwebv;0tQuk?M!Xx>PNSxl
zsX1FmxHQ?Ns6;p$U>%82UA48fV+W!HO-OmMO^^hLydTML@DzB)$|#;KRI4}z#D5d(
z*RSVBMfP*3NLx8LGJ5^z8{GB0?;li+JiKq;KViA!6O11}94TG-%6B$#>7oL!Sg}I8
z(9S^I?(^9V91`KxU*AW%fBW_=-+4e~kDTY(hYugFTet3fiSs~Pshe$tkePx}*BN>b
zFho9(dV;je=?@=15b#S7J?P7G&!68qdj^^HETwF2qbP?789>-2))3dNJ9qeGWDNiL
z=bv?J*Pgra=Z3cD=bqX52&G!j&y08FS_TGlDQ2T;ZsR?F65WV2_o;^YfnPuSx{95i
zZRC`D9>^~A;@GtnK)B(%#4bN3XrZ#QQo3si%cQ`4+;ESS<1uv0Bl<5NlZzoRXT(19
z*Dp^2qe3;nDQBtOSK6LGlyUzaJy`KFVA+2WJ5I)HDj6*`MbM=5^X+YpQNpL!Y~1|`
z{mb>+ziUm(J)6CSR;^i+VOrsR<&C-^f*Nn$ym@i<$@N%Chs43WcpV!u-zz!<*m?nPpySOql$&*!_1=@RAuR;YXSY$)zxGKWL?Ie_BE
zLF=$GR4#%zKoHGvecErXuhzk;pu4uJzc4I#QB-{+>foV6S5WG@DtwCeA2<*Z9v*Jq
zYu@>W72S__eE;s6Ehv$0Lx`^`8`hZ%qe
zuz+APSxQ`fZ38kO8!ta*8WlDAVndPz2>!9PCHAC;^g09ID3qlS)b)=0P^Sl@$ecudn{WSy1FR@6G@NvizJF5)y)C
z_<)6#owrbazAQGHv0|uWD66Pv%9AjG&e31Eo^HcQ&{Hc=t<-bRPVcN_KXLqcJz%O0
z_TYq*9g1Yps247gQimtqKycRa6DL|JW!QbG0n1B+-#hc{3m)zXNG!htt_(!(Yfk#~
z?u|;NZm3BuK|6%s?gmHh>-ZGCJf8P4QozMnLVq@d6<
zK674GMMWgfx=CWZt3Y&VX=$fIm5keHbcd-=!){r3e%0}Mw9|Zhx=i)#abS{$*Vk4y
zpgrU*#5>2jloQ+}ToaAuChGgkQLEhZr)!cCctO2&lAl%Bqet{BX#kzn(t(c=q7W!J
zYh&Z%?a*+n+R68ajJ>!2`t@sJhviiNhmw|wfF%Z3MuSotvF>Sg~@Y%d`)F8Kj!F?R{W9k*T;bE|~)rKC^|XpsYW|
zEc?EF3(CpK(Xm1)FC1VDHZVNPl6`0UZ^qm4h={bvG3rHA0l#AuYC}W+D+&!vG+=3g
z-*0)Rc|D<_O^NdAC^)T@v#cY(7ay|v&R;vj
zth|6nvIY#~n|}PeMN{7`oLB2m>(rJ*YA-*|4mTfArKDP;A~OAB6nug;-*i48mo3sVDvdEk2RT10$&e9Ef1
zt^!;2EW-jhnQK~4Pp%bp8B_~N>VpIzq9&HS#HW`XPijJLcu;Uih@__CV*Yr4glqr=
z#L^e*SDiPWTsz&iwjZ3qs%_n
zHqSB}Pb!;yg4Nji^wj#0(T)H4&(X<>*+iwBC&J4#Yw6t^U073==0U<}5n|XS9%bVO
zc~&V#0V*-zV{wV{K34jgsA3IZ=pyQ#nsE{W)k9*Slc-HG{FDO$0Rf^mE!;(>vO|w<
zY*P63SlWM~xMY5!O#Vs^stS=0CLCTa7XcEq0-A*+nSn`reZ9pl;@c~`e*Jn;0>G3g
zzgVmo6pPlrp>U4)zIZvW+?~RgZbChCnlFJ2@CJ`}U#q)SW4w)#5Fbx4z|NJ7>PLaH
z&i5JjG?lH;{7PD~UEcmesS>W)v4w-f7UIId%+EKJBKn}akD2e!TeZrqq)w{GNtEbR
z6>T2ooli&a_L}aVn;DO@4og08I_TxMQb4Y}tR5r#l6Nm=^O_X$1j-UO52IU)`OQuL
ze4dc$A2P6Y$owv3w`7B*7F8jGI^f02>P4ts>YYG-`gMMOeyzrbT`IUgMe!S4f{c(i
zt&pkz>Z#XIf=9PKQIZGWtl^q2PC!E}TU2mWsN@p03hFHL$QhSRk_{lJT3JQqs-J|GWqR<p%?CN7r4@%9cu=L7n^e
z&lgRKH*ei)fiC(e>(w8}>>sc9fK>fh|J9`{w8AV%b=e1$f-j>d5~Nh%i#x8Xr(4wP
z?-N;?XZrHOwYlk$XJ`*JP9dPI7uxAhp;aG7F9(>Uz6)f}8USNy_nW1s-B`0}^cK6R
zV@4KOL^{Z{P1g~_{O9CHP_2ntPVgthiDW*y=Il7FqffB^lDWAXZXcvkOPk1Bw{O2N
zE^(e>Fj}FYARBWjjWnmxTVWkj&IsYw&+y3Uz1cOheDW|wOHTT_FZb&lX7
zg{5ESAvE!t;r9;@VCzN^@dB6ep|Bh`_=Rp(>^gSAOthRUsCX(WD(6u)+-B(>)`rWd
zMwx_dw1d3(`}=pjGQZRAHwrmnE#nlh%SNeIc8OANqYCm$OeD
z)EKM#EWjC>dDZ|5R2z}3fFnxiR1^ePRFbZmU1}(Iry`E%s6Y;E!%f}F$!R|?SA%zt
z)kJ8WD8(ABv~+QXW>U#++xGkk*2XY!hK%?0^DNaG+;92$?|G!rC^hjFJ>Zn@noCsq
zd+XL@=!q{t8}0`MsWIrem)@e?nshByhvf=v+0O$-F0Q7QeD4yo=e
z_i_R7`}~aQa^1z_S(r!7N+>F!M1KOQzKpe7Xo-7CK(b0Ce>%9uIK8YxX2QwCt~tug
zzjL5?0$PU_{^)?J5h}B06KDR6Nm#yl?MA3y8gNHC1d6(yw>2FdvdB*&7#<&l7=X;s
zA_MT~;88Tg$*JgZbBdFoOn<$@rBa$3qN1UpA(<*{UNa1s@Q|Ahb$4W97N~&I*IH=P
zdMNH$qavs0muq74@8W-P)+ytcQPHlsOE}lar<9ok?-NlJot%o*NPzoJbGxc{@f|tf
zeCxIz4$XTr^v_sFj^6A9oiZ=0OyK?zz3j}_uaCW}sToD>k%5{Y+sg(ap&7c6nD_Ls
z8V$F%4LI=1*loE}*)wQ9V*SCit?Z&em^(&
zzcN;GE`vcc6Lecy&R3X5BHzSw9zvuFM9_3RmehtvG_bt!WcSKR(5)p!O(*1lGXbJh4~ZURr`|@YX~bM
z6q;y;5djb`4=NMZlBD3bb<37#ctcl~7FZ*dOKhQOUeeb#u^p=no!jVY9z<$yX055u
zxVRCEJYYi!j|pS*pr6AVr_m2mMy#??Rju%|Hra1A)zrkm#iQzELn=X*)sl=XEG%Nn
zWH`hwU)U~a96i{W>@w6C!{z_$&Gc|{^#I*(qWj)hch*yGU_B>M{{>H%{`U{XEQO4V
zFA#4G(!~av47&c*bEEdQ>>=4t&z?S2yR~jxCiBFJ6QcY;V%}T+``-(;462cXIH;^f
z-Ucj)23RT$q5DqQO>VkV{ng#$Rij79&D*zQu|qq%vh{KjS^bYR>(jN;S~APfTty)+
zW|OJV9gKfABhPieHw_WaTj-OS6T0gG+~S&8M)@Nx~dITcnu323YIiR%Gai8iIi
z**Y1KF$#XA(pXNhQ-BFp;GcsfLy38%CW}9+H;Gb)U_c$h`lMWfgODNx`i#XNSMR#J*X*RJi^b#FGZf@>|
zDO3roxy2ql_>AC;l9lq$Bz8&K+Z8mUrl^@ary1Eg_=XJRV`tH3*wa&d*{SJTwL8r1
zD*BQY3Zo&C7km$voj9Y_0Wu{@-sLQTUN7^zuxW8qEXs1)`axfJ447f`F}(KuUopKu>(IoK*&-8y#h1$6R3DDaXn
z5tQ>9@k<&A&*U@1CbE5=TmHIa%}c<@ri@qpMY5l?Hk}g%0u!@L^cvh5HOb+zWSpQ>
zJe25@Igoy*#aVB(FS4z=JDHkqqm}U;eLJ_!>NKIOx(?em$^E!mpyBMC>ouC5M#xBv
zq=RW;?JAg9t7O>N>>VSHZCbxRkJFZ3T9D=Y@m>eYNbK|Hhs}fL*G}KOc~i_>O%dJ6
z-mPN>DtK2-I3MU~YTwySdt}cF(gAQIyR{oo(i!07W$jczWy**2PoK^f_2`WxEF95(
z9vUO(<(%!B5l!bg0)FLglt2{eu@#}e#k&>FO-)VXPJQvS28{tAL3)ujshSNXjnk#3
zTw<3)byBg(sg<8Pv6b!IP=Ah8yF;fQqt3sNVMtvMtH=8k;ZK_Dzko7a5ArZc^-Mxl
zdqSa0_IMs>b;Cwz{VU`sNNG`-Zt#N#;#|Hz{)y#+Rq;wDln&1kMub0wL
z%Zz>w+-{q{|Ff^QtUX~7LKcN*)t-$?sU|PDq@DkDcdEk!9&wL{^q*Q_3Z<#8>84~i
zV4JQn?}xPRFY!m+mofm94PfgR<~u=jrg@xua{N}e%-19c<3z#@%m7H)$rRN{OU2}B}OCu}+U
z^OAi*oApXqQH
z4;Fqgt0t@W6CrgJI{XU3RyXHB#;@s4x_nXEl+gw
zM0nMof07CgtM?KT0~WXc+Ih*JHNakUXO(`5b4nK9Y+C^MimTUGWpDODwqySP5H~=w
zSmiN%Ac$RHt*>_M+-Vb3BI2*B(g_t*-;%EWczL#Jxgdf=&hw?xEB~nvJoZUjrO|N4
z*BCNK+vPbYU*ZATC;U=H6NUBnDq*(*zDgVax3-ox@t+LfBqN>`CMA)ZhoM(pF7>GS
zT^P|_B(%~r#G!(nak*m)1YIXvqUAd`l2$l{dzG!^)Cgy;uu}Gehu+$0=-RQUVcu;s
z?e816L?D8YmQ{)C^AvOc3WP)9aYuY&gzBBWdDy%r=)6#mtrs{<3iwH@Qz4se>muigqEx|1M?(UPQ58F{E;iwtZ
z+brvQC_92m-|E*54G}jwgqk4hkzLzc0uQ4|M*)^rRv>$j@bBH{ZLo)3l*bhg%YD9Q
zKmU19(h{ib&7+yH+|9Cgi#P^H#-Q+!3v}hn8Otb_Gnf5_0@rg}da_4IZW)fDCqx>K
zZdpj2b_~;s_lZ}YTqF=7#jB|tNLsK?Vfxbv-I&Zn*K#6Od20lO4E#A{{;{nuav27N
zO(7HDGpp9r^XqCO6Nvh+B2Cma_jY=fN5_p5%dJpGbj;*H25h6tW&cGiW@_is#2Eod
zn&C%uJ{nfRSJ-KKGGDtUx?OdlMqI{1mO@IXoCt@n-W
zt!G3an7q$#te@#bqsle83_zvs0=>8B8-X8LQQJI%O}_Q@QL46a5qvA3WI5fDth}X2
zkq?2M!FxQ3M-k;Y3I-tt2r7yz7KZ5JGIo6z+xw7m*-zWp*jU|rJi7Ode3S|d+%6Et
zjgZLE24_vF5SQJXH$R02=99k!ywq0d=VR*+fo3FlNZh({_q%`p)n`B~^1j_yReJvA
zg@1}ZF5(6i{*>(Xn>7ik$z{yWc91wn0~aDkFqz$a7H!)hBDcRXSBfIy%lQya`RU_>zXYzf5?XzYncr1ml5s<`;M$0^ZKCNU8a@a1-&>Zvt&gEa2e+=g(?46<@G&Sv~
z2y&s5W!N6Oj2@UP%N-)kCA5GZFIdNqj9g2Ed35+>NOfOiHz`>$dve?Sn1-V@+*s;kPkZs}_i
z<=uPIG?S>{@0l!on|J)D$Ycn%2CeX?=HM(CN6w=VWR8%eAM{Snq5;97juglp2&tsi
zy17nt8=%4#47SSJ+z%4BX*o%fheXT}aGB~?rgRrK#|8kE=7oQE)zDo4XS*K~GJAOu
z$b+VFeUJxPCF(8lmWcK1pxeW3{lJryp9(O}#M<`E_lKT7jYsF*JyDR1nn0cI%(HgM
z&jP|VYR%TTE)9_|Uc&D4g@$G5BOhB76h-gdyJuML`AU2ls2k<|IA=%Sf>#d$E0KbO
z?*oY$t!@Un1+cyDr7u<+MVWnQy=4o)oU3iAQ(sMCpiFrKOP-yBW2|5Y(o0IK*Q~KN
zN0pT08LQxTSJZ2So!h4{Y~WBWeL>f5IqI
zh>)>iB<%!7Dj`+d-D`*_|5X_^Rpa66A2G*N@%!$Bc$#e~o>nJy!e_D~jYxG24hg%n
zP)6KPibf#ZwnJ0N_xyRu51o&oY@PJ;2T2g9^lB^X(njT-oE)c4o-Fn)cOLk->*$56
zWaZ%j*+MKbgaQX!GxZLazdk&nS04e7&JigQOfqNCX60+~4^1=#oeVY5NXFxiK5IY1XoTM71}$z;M$Bb4*b1Gi-@y
zV!+O)n^v6)++O0G=3yTg986RYSNs-NpUyK*6W!9bU9Evqa@(L|s`GGjTj;w3$P_l6
zaZfCFDfE~$J$34o0Ry+^L(5PnOYrc>M}=a9g!DKPJEVIlye6#5kKf-Wpe~M~9ffjd
z85L>3Cekb&i$_Y*y~EhFbly0(_n{l^_g{5?u3DAs^Rw$fh}@+yo0=`?22;PHc|{x9
z(?|-2DjULMkN8`Lq7<#Zh4*jcEilraP>j}xUh#X8g@9}fi2z9{%y?E{W9$n@#x
zU8TrQ>J(hjp}kvYFi^g!5?7e3ZH=x)I%-3VD35wVr+`r9WC~wzGw=L6+pjCKyZg)4
zQ}}X{T!bVWYrk}Qt#QMC7fmH`C#)OkJ`h@kR4FBV12?0%!J1S6lFLVB5+?VSn36z-
zH}+;;h3U}vL1ie}jgj654jhnIc|2s@$@GfM=HBi)fZw`EPtHtsljOsM?Fb!NgcnBf>$
zlas+r)$(GFjt8JW6Mm+JR)1$TOE9^Oo{%~?HWp_;dGOk`Yn?l(IxanHI?}b2;Gn-h
zFP}b3M^`O7w0Rh6Nag-&ZXS%g;tg04n&!#dcg41bFB9>o=mKcZ^-P9vKb3#sNj*;HOQyrS_WF
zi+f$X>`fesJwjG9dp4Az@OKUh0iebS!5M2Wm_yqY$#TH%D~wwq(gHUL?d}}$octG=
z9cpzRP)14QG&Z|`<;s;PJSnanjGsT7A%Z>gnJ7_$XYM>NiehI#(vv`&FM}=F`SnBL
z%OSFrg{qT*Ur9Cp9dYlLxEby`P}DTr8<0b4?Ax1by;0@m>|yq8A=xs=h9VqrQNO%dUsi+o1GYHN>1x$
z$(fC97c$L+tvrlOQ**XS*_7XuaadO1B{Q>LF>_HXc7>)Gm!*xa6R-Blqu>=Be2ZW`
zO^nA%3z?@LEUk<=`=+&xJaQpiiism8!>n73d4;l>s`Rmjrt=
zl>Lkoyq|}Gz8bb7EtWU_jn5i_FBxN)DDK-B(i4EVG$HM}*~aPEA=>CJ+7Kc7fp5{;
z(!Cgi@32F8hDI&QzhhmHK&x9{FvzqU+!VIjAwFRt9Zh*rSO
zQyBHH#0jb$$}};PJ8Fj;lsDcvP&@Jb8zWwnWCCsHum$E2jEqaIhZ&y&xM;}m%9?lo
z(8IVUT;qP{P-tBIqn(i6i)wDDpE>gh-*j+9rGo*_g{X_WBMip2{>w1;h&e@C6uXeAHq<*roKjIQY`*<_uW(--6g)eQ(Fx2L
zFpA|hVP{px&g5IC0~TEY2!o$NQu>+eZ$R#TjDHyppe{dy%%P3eMTM&5Kof*&xz#*p
z<@?q%kIBA4sLFCo;(Y1U2_C+_%hJ6Ok(eBqpxaG2R^|y;l+oiSWgIIx(8A$c@un7)UQ`MW>3}tS_|DR}hGN~Z&cQ-3nhQi%Q$@Ggy{qec8^2%eF_R`$X`l{*5beSwv=M6Z`-zQ
zAh#!8=Fb@>Q(ZKhSZxjn%|f
zhuq#=A0w&*ey^;zd5o8N`{vEdxLruzg_gRGJSBAk=q?mt@?^ehkxb(f5ssw4*(`)M
z53{jZ+>o6pK2YuDfowfpVzN=+I2&PKUOBn$SdYrw&oM5NW*89qc;0Bjxw1tnqlu5l
zcs>2t(9Jfz??Wk+NHdF$A%n1S3j^<@AzK}i5{Jd!0P|fJeZPeiday7fo(ogY+LH3(
zWq3ahNt+Yv&O7v>+}Mg%mf$odv1l4AOhF=dY3Y89^GH%Xk01_(Uc3xZ#5Cy8ywv0k
z58P%m+>8*&NPx!#hGc2U*8t~JJ7z{Ln_AzP-N`Nf{XGM$kY^ntP#`u+yPgB
z2*t;boRbBeA~RS!sbBm0o{>d_2Qu3HOsTLodOxPVh~z3BfPQVDJQDv+F7sB|2?1mh
zj>8@baWfW&i-HZ`U^vmeda;VkXRr^qXBuchaDO$J6R}?M592tpbwfAA*UHW^nFWhi
zf1Ct}rO>Zq=0Qr8NkqItzPqhuV;!uYRm)4X(IpX{ZV-*aLKKrobBbjus3CQlDG{7l
z2GB+sn_+>e{J12>(gGx~^Z0JTWRzRhZ3ocrimghV_!gDj9rLB!VVAVCMWsfptS!bw
zC`nbWBQ3jahy-#3N{4vUP^TbstvimzXP_8eKeZ%B8BnoL_>l
zFnyleJ-g-~P37Po$s`C-fhR>xNL1Yb+xtKx|dQ6-W~$i&vl9%+w|N^E1X4_ydK0Ff)LXBX54rQNn`
z*9%Bnh@p`Z}b&A#VG|=|CX`Zdptcplk&EuXCCgyltx9Jb0g1oBpfmj
z4r}f!>>iNw*Ml+B1I)bJmQS>izK#n%@}##nI&+N6Q%SuX%o2^5voS~!wB@6i8%^G1_bHGsaLk{E(#JR72XBNNoSUQ
z#3YVG+3<6i+tjto;1zrz)4lX-6TVMOO#Pesw>8qUn~*3QmFITDs)^@XhGv&Jf|;m+
zxWJ!NEt8Q%1p+Up`kp?~BxQENAHSBLE={H}x&-$(d?*WGz|^^pL{*cTu6!
zkE*1&de5|^CHar$cH{f90ba+3o1fhe3~a?e3%nRsWJpd|b2~4m-}E``zK`y9e&Xnq
zWXg!P+ULaWzm!GQF1V|xskLpT0cx$^&i}q;t)Vr1**wHJ6|zu_Akd=VkqfNaLfBf
zBAnRT9fPP_op|VbLcm_jWRQ_UzEH(qUslq#H6<2zO{`Ctlp+~^7=~^-Op^?fic(in
zQnE$t+F&B(i}(1EUKV3Fp4`3@w>_r~7a-0QzM4U_!AbpKDE*=LLRV(IoQ6Er!F1|Z
zUx*R1p-DutbxGbczpH`JKwPMJ=BjJvR^5jO>q1uGHp|lBF
zLO_ue&*$7Dd>+@+j@AqVDo}k#&Bd(2W2yvLBjnMl
z`hh(`yWQyt@GQ^iKMuW<3>3trg?x2BPO7y0S2Cd7_448k_5-;zthiG5@fScTh9P7`
z06vXvs`xyS1vA1KDoAw@&liEKzeC6d*Fs9MDQ9`&{)Eff}7ab^?iXPZg+
z%44R-2}+k6B!fH$W5Lv3M45Vj$l@x4Fj7I0qY@DiAq34LU#LA49(CBR86@wR<#Zeo
zF_OUQtSSC@xBBZRk``znlZ1`3lWp+*yA91sTORb{n{yDNH_`E6H1b%(m!~+T@JLHb
z*Y%|h>Op$QrSE~feTj~0fH;h6SujyWFfK7-Iv$V(B%NRn|J8O0;gMz_v2WD*Su~ro
z2%cnaNkEi{q?}y%spT(_fhgR<2g+YbOIjwfFdUnP<{t5EDqh)I_Te(;y#GD0^JQ
z8Ocks86oXj-yXog*cFaGCp4zfu7ZrW)z$Q1CRQmK--VmeHzQL*0s8g^)uorlS>oGv
zNjW|ipLCQWmk@1Fxj!c7Am8k(sVxPI~JBEk=sni&GI@roYaT4?V
zaREts*|ZbWpD~Q8UF>-A_nwSZ4De4K+i5-xG&F{QNG_)>)T7iS(IL~DgH1_fT+XtK
z%EFi{a)Tp~$v@`Xd6QQrnJ~A%tHF|(-7ZZA49qSRnrH_LaJJJ_>6#eB|-N~p0
zxOJDHGpDS(T`Bu5v$6Z^LiQXS^W7ybrQ?OP?g{l_^h+(Ea&ZhER{&N^Z?j4q@U@GI
zkIy7y!+tx8Fcw3CG#K-^Hh#Pj!Yvu`D%ewI5rtqVnxOIvm|d}7<5M*fMn!fCnvBdX
zMH<<^^1g8x2n1L^YT^_fo1!Q+LFlhD50Un!g``wgbFocrUCG8KqEq5*dEmntG8)=+
zeh(=mI!ab%Zx9+2A<^ClhvuiZSXeb`7zE0)!-HgDEvsYY1)+OqG0-Bdcj;lc1Ci7q
zN(G8z44(2kXm6iC5}3Pyl1Gx$HJB?b4CY40`l=eu2fVN)vxB5mzz$*4-4YvsQCyD#
zUmqw`d6|gZS)!I7!_{ainy7oc@Hqr-1KD@SOF8XV?6#tZ}IzGaz_w5)Flts_9kO@40pS)ntQtT>X3uB;{E?*m2@j`UQN)h%kDeuXIlU}gG!
zUirqRGAIxBS$W9uZrQpud{BO>(>f^z^C+pjT#f2)S&&PJ;0{RUC%&0NEWO3{Q3p-RD
z`fskQTxdj-iY`gvY>%t{dem=z2A7qo%;+H}P{b$(RNA0nQxf1U#6q^P>Z0O!0867B
zRDLV878?a4XfswIL013X{o6)iexjT+eO8M%0Iw~A9#D$bYZU87WB`)A8FPuBlC)__
zt~dMFn3`jz7?otBH@^UFwWPblx^~DOnt-k!P^bi1&eKB7S1wjK{6HOX^zlV<$#L)B
zzZbKtuC8wA|L}+;*z0j!Em^G`z2+sBB!E}w2CgsaOGnZ;&ME;@rr%$fJ7rG-;nz)9
zd>wTe2__V-YaMdzTz=CJ5ppkxzyy&5=BXucqrxy!;pFvK0sHFU1n-Hv$ih7v_>u9ABo~rd0)KDY7GXP{B<1v$-yrv2LNk7A
zFpab`B4Bo;-fkz`+!Kh
zK5AlB7(!pUoEWg=Mf3TEansJJ_&@gY{GNW?P?B&=vL+f%LNt7k6xKs6c-t6(Hz6-f
z;`&;d`od8>=U^fXyW7G4aqA>>LLH8TIy7W;%tae74Zkp}s#M~s$FxH|CSgo!zW4Q+
zlX(^pT`TC?mY83oV3yE0GRvf_kk-@`(MCHku*L$u7qXWWH4w>GZu!a5vFwa1qGeLG
z;jL?t*d+0R*g*bx%|uY?*O(Hm=|)!e5T>9krIoF48)FLH1P*{RJ-c1Ugt)8@#Gr;1s*a0SN3AnX
z{PM^G0{EBav9L$+y_?9{6I*N{X{uqPtCMfULo_}_gXnn?&)ZeU>9??60eeDF==%E%
zGUNn&FytJ^E@0T!mZ8^_f2I9nTc&|%gYZAtJ8t|Bqc#uM>=3o8-x1Z6jz}~a&mt=X
zJx4}R(Fp3l6&kgD0Sc-$tyijQ4Q}lf;*U(cA{U$46qmNvRj<|>6NkTll)ODW9Koaj
zq#H!kRlKU9{)Zl-D)`MM8xiKKrsa=Nl-`~@F2@__RhisD5RSrAIT2U;y+AM%>(ci?
zGL^NF4BWJ4O&V%Ep6n{%Yp`|0r>%;EGyJ-sIJ&jJ@|^1#2Ed7=IWtZ9(GYFAi(k3@
z6+XA)8_=byRuiyN;>94+FlHDe_5mSI3?3aU4|vk9d^3RSahT|QW4Q;SmIO|jIEvJn
zB&rj}Um+t?)+Ycv%x%&g?OO+3XzNQFl~Vn5#&fV(CN?X&UagPh8_kM7-az}!WbY@xGe
zWg*WVOC|=StG+e<+|TB@<@eaomj{?1*30g3t5kiRP=zZOwWach^r77uq$*CtFQe(y
zlbQ_~FDixW7;cCpsu}awonn-G5^ubB_wJ8h(y22zQoy^-9Uk1hkdUfT_nb%0oZo-Yzs}a6(FC7F?|#_mpRPj
zgJeY#t0f43L_~kgAe)nj9d-RiRsmSQgXoju&CSjB1tjeg<>6V%(8FvK1A+I_awEFY
z`AY;nOCuSv69Qo!x0iPShE^OqIc}!)Ap0lg)(hRL^T%HadEb+KmUxHsa6ThQF_?N9$Ot}wIUwvRfZ>oXoRf0C)Xk)B
z?KHGMG5P7gc8MLXcIyH55X(u*kX?`;81d=5vUwP(ClLtk)6XlglZ!JU+bvBM3l3Ew
zV~-h2;#ZMuPY8|Zp)~d)lTEl|+1&>f5J+?ta>RQ;skK6n+e!5k_`PVM6KUivhgy)x
zf+%J-5XN4P2J(o7bL;{>l}Cm+p~^;MZ|bxEN8#6(8}*^
zUX0v)6H;`+Qoqj3_Tv+37`7^bX-QXDLdMm_>t`{;49cMB_`4PVE1Y8gLzmoPqMR!L
zE~z~_GEf&KN!&+iXo`{FL(8?gtS)PKV&wT+?m_L8Jjd
z`uk7%1zseS?K!>rAO2-lh6A9YA!0UC-E6=`=-%=zG)>Cqmv7{Lyk2Q@`w!)LZ!icaFLUbfg9rN|)!=kEMXG3&b
zO4B3+|MxE!K+-4!80^hGxKXp0L+{Zs$w3v5VQ9owsq;CY0I?`G;;n(J+$Vb|OTg37
zK({5|-|c6bL2la#bx_Bx2pJTWRtzltH5m#e^Kr>oc3C~R`uI6|hbo>ds&G9K8^12~
ze07W_3d
z=KkEU>p2dc!k`<&{jQg;v2i@878eEYyUQ$`-{3rUDmhhd8xC_C|8L$Y*dgRRA4G(P
z;U^&XrFU=hK{7*6X7!<%s}<&(2Q@>GCsRyhs0qU#ddOF6qg(?HsH3gon4@Eb`4Kd{
z)8Di9T?MMpV#2Dwk_?Gk^#;l3Wc;M3C&8m99c9=c?Dy=Ml;hssfPm!kO|#!S%_}5{
zL5#XSGwbvj0QfLJvw*5yaRLrGZLiBgD(6nQ4TNPwtPb>I_-}%|hYz1YAO_KcW}ONL
zN6HyEmDuq45I|3zK79n^Z+M0}pgova+M1PZGscYK~7Q;j6Jg|WEb2mcU}9owrNLuad3ueMZ`?YM!XCB_Qm+(yI6l0iE4
ze@>OdU98x1DbWQ-L*cblKmj6nY#st$jDW^>kPS0peG!A-P;0eN-N~KAJOGxH9`sVCCg~i2z$;ljS5CrBgkqbcDK^sCs&hjCVn#;iF`>kyWHf&Ian`Stf}
z<{c78#D^Z3wdxUY`roI<5bcVID2k8t5#vA+RTw>2gYbS-$xp-xMJ5-K?Q*#DC)Vl
zervF+W<1JF78!00Z2tY(s~NxZ>o}^AUxtXAFuH9ku~bRD?{u^k5=3KuhFI0_DCI1x_WOlH)pxA?eQ@YiSJWUA*aZm?DP?gc
zg&H}G1oRAML#MzkZ(}_rZfU@WrVMSX22ItN)C$1YOMr
zx#cf9{`>Z^N#7Lg{(T^+`MfLep2UjCi17{hAHVVM4`R7W^lchm-(-q@1cLyZAOB<1I+9yhcrwP%u-jQf``B{!E8e3Ix}hW?
zGVotNwwd_@OvB2hH~n?-Ni6)g*W#jG_^+M6{XhAe98wmmSl+JIy^R^^P3-J(&{je2
ziVfS)ba9!qFJyOx_4O)PcjWDYHXKVko1Jy(zVz<8w!UK-MLV$0FmCA>Jj1fHiDpHS
zLSS*Kj7UL_mliiaTl^JwIVkJ+_+RaZmH`$(M$C68xOtY9r9%gXKWQO{5%X*S^ucaz
zKL^JzSqm!`zwNO8OPfDDD5nB+Q5}Wf52s@IuW*q-C>zc%WZCHi(2{K9kOCEv%s39<
zEna;~{A$VHFLL3;a`q2iurUObRUW{HM-SHVUH-2uJEs}b(=q7MIMJ|RTqFh+w5%5b
zbQIKqj*Uwf+;Jc%_48~FPR`=K#^;Eg92LtnNAVI50S3urT77l4n7#M!pWQ^4N{x>o
z_#w;TmCIQ&_V2mID}zM-0d`@FU*TAbAG}bk%TS_3#qIZanS1kDKJv
zl&Ao8$>u=(^Xh;>*B)<{x9Z(jj>KA4C{6sl^lBaSrYzF&?;m})_)pTb*q6SI|Nk8%
z+uvW(U;oSGnB#x)n;!yT_fz=p$`jB)3rlqs7vKs8E2;Mit^f?)DAO*Tl^(Y0w>zw+D>
z`p)X+%ijmhfDNAja8Lzxdt5?sFQY0*2VkJgFQ6WxG9f)lRpaOv>8`U_2W5
z?!UvC-V1NuTH-gcZkx6@Vfqqx(&w9p%RcZ=^4<9-cCozO!n9jke6#o2eq-p
z#2a1XH_sk|r-(12TglCML%2Ez+e7?s@!wUK2`Lnr8gdoSNz>9;@*-(D|;Q8}z~65u&LkIR3B-WJ$nTGvtzTowp;MhJ^AoCMJ|?mPzXN;MM_1BkuL_6RY~=>L24!pDc%^O
z{?+T&olcL(#N+9N?2%iL4E9<%0IwxW2#lyx0q+RJX$;fQec+?KJQcd?l>sa(SBQ_^
zM&)14ErtPgyv}1Z?IcHrrX(B|3{lWFkkX+rWe9nT^bjcVO#}1~B)mjfHD1h>&;xv`
zdc_9udq%;GT9_tmM0CaE{^iCLqg!|GoL0gkk!Zx=0Tn(FM#{lX!7j#J+i>X-YwpI9
z(PsVXtA|)I8uotEh)*bxC|wcei?XU0-LqFbMKML8BE
zMM?1Uw(ztaY)C$VAd3)EZ14jCT%7s~D0kW0rv07d&51L^gan5*?lchzxX+=cVIlN&iwHa2H1W
z8%adjQ`+=hhZtsyM(n;Y;@G=a){A_wu?Wb@SP9G2jDd?gQDQ-Z(17Pn7h!nD3
z+O7eBYr%WKgJx7Kc9aI(AX2#nOC4&|?x!
zihClHSvBi){L8VKG0}zk4Q1~-j>>D!1Bi94>z6fzLH0ETII=n?rU9NJ^y0XfCm=y-
z#)u|8C{EBEhfy(zJuS@7LH1wttOczCG4$lBLGpj{I|ZXc>%C+It|)0kzvO}P9D`^!
zlIbg11GjRzjTmJjS$;K-69|txaOesMJ*0bsfC(GjIdCVn08;VKM_+wQO$+dLZ%kv|
zs1TOwYtBf{{q%kfy6ZnXM}Ya0R)K~{uv}rBD@tZA=*8rfeCyd3cZGYYzT_JomYpXM
zkT_rR{;6~rU0{X8WRPH}mHjg=|2-8-n0>{4s|m0d{M_jl>>^dG@wQemt}<+UEA|+n
zJA$0Ri^B-mZSP*&V2YQ-alfO?(UN!XRoT$AyHqfTc->-C60ln{yBHS-z!nN
z3l*TWB){pKgZ=B@qK`I|3uZF35c5~_V9*GHU*h)_6&2G`q_8|prpU6!DTgS~0Br!X
zX@7a?pf9TrkVJGT;mwQ5i*xGrj~;!YoT`C~
zbHN@WcoL*6JtlmRa`o-7TLHCA>YAtURwn;U+o9$LyKYBggoLz6>h1R!OZc?nvv`zh
z3;z@6ntxc%URZ#X(lPj-w=2u-x1Kn5EJAG#-K+_oQQg)0hF*Zd=dl_JTeGD#zF+K;dx^^_}O>kSE|sB)}d^Y3eB8S%&aYgD5JNz*6A
zhmSZ0di|j?iNK9CRMmkWDKLbAe#=!oW?g+N0yi#4p
zGy~ugd6;Z3xK{e{pZKtS2c^m@Z3sgtCiT3~-9E&%rt|0k?+~1YIKUu4yy8pPx1uE_
z_v2XVj2)XDvd1=JES&sS_sD=wcnxuc^`7IPr7&`!$
zedY!>!_@|?>~V172I}TW@|3NawugsL`y*nRIAWFi*ZIRd6P#bhrf$T!!_e-t={9rn
z$OBzks(usO;O3{;b}mHEC!y>W&i7X?ocQ0-me9G
z8=)c|Yrq6C0fm4HlIgj~Q*7wmi~$MKAZpwUMhCR&pv^jX+MoXz68Gv28&vQx&pkZ4
zvNm2z1B?$K?4Ry#6L|!Mi%Bp-)Fxg{RRDfaucCYELk{-gAS4Lm0>KS||70_jVetr=!5+eJ7
zk`CI<9-zdGGa16OoN4RSJ7CDj7M8cnubp`Wd}s@SImJlOF|wdXIfgm^Fq-w$Z&<7{
znWa}J>Q(hjo9Lm;e%`@1i9dMR0H}k#;o;M2;629&8kDHM1wrr_)E;NcwuC&B0DgSUNDv91zR$LO^Y2?
zi>23VshEz`PYIqDBVv@S?7S^qIJN|QwG2HQ?}Z~km9*Phshj8iopCyZ(J}r}f>84m
z&*L&4KV5?>=FsCbGZaxBWFo*3o{SF=jfg>}>oDZJF^)0mAAk~7q1C4n2#HAs9X-&T8pG&U1_X-Qg
zwt(IKq!)c;V6mFYF6e5MkvVWQ1C9bH8~EZYmXVbcZVOAMPyrxCu)(hdknRvGsD0~;
zLY*YB0^jQY|AZmjq5tumAc6L?DsCxc=-SpkLKeniHm
zcLsq(A7J_~r7H_FBiNBnhyH;RS?afZ-NX2iq~VZ{=fI%_0nIs}6D$ku9yBZ%7&S10
z9CC}8T`U+tOJEkw8;;O;bPr|`R0uta#wvD%D_qVtct~5i^iwxcwqw^
zU**ZgvI`3N!ze(V+xAHO^P<_m@ivb>L!{5aAYolLq_VMU$zw>HS*JqeR9>wj(>n5s
zZ)%5=2UV@shg%0G&@9l@eBBHrQ#Q!lj{ka0{9pGtkbcy(B@vNelp>2pKVpF@(os?u
z&X}P-TF>dWNGC3l#(4#GGHs0R?2u~p_-Y9UU}1RRKJp8S24ESe=J!@AEBEy{CE{L){)
z+GLs8>x1*9Fl6Q#Eaj$6+KjYst&36L6CR{Giu_#*8u)&sCKm0D_3TfPy}AspvW)z?
zfQod!kVTm?vaGD$-)tWCqYuHhBQGd~V+xD@7DeOw0Lc
z2Ezy}PTFT`_!rV((OeA#9Np{x%h6;FFEE?`M=FhDRug3f^hcwgqA|^AA%Swp&=et1
z9$VpQ?384wH>P7a;HVeV2z4{4t3tXmh7nguO=}J|W#P-+KKlp0+uEB3-zOf3hzQg4
z{482RKbw77{P)e*LRRG$6mW5Kmj>7iJLG~9BU?qP88|4h(zysEBk&=a$L*p(F>mmf
z%tZrBh)#NB_=1|wdBgRk@)ul*p~a==!PWBgNP$bI{LtO_M!1tD!ACa!m99k0e881E=}
zldLW&C8c7B0XDYv`IwR-=UNj<)b1~@g0SlBp3Hk02;s-tb`0#GzfIVbH9C2Z3wobt
zAtnZiKx62*m;jgqorJMm|0<_>FwDuHPH^@;CAg-#EpaM%{nY8TDhRyG@Dc{n+J2
zezA-nPkg^eWOcYkYXL`Ln`T#D5y*|7D=QrPUT3iag#22o2(=D^p<#hH;>3ploN6BmaY|Vy!
ze%rWov>Za8HwIWrUb|&2+I@bfi2B`mz9KuTyAn6kl@!08&avr;nAJX>prQTaa)}MH
zGCxHXayR6_V-WqI8dR^p+jOp&g>eX(aPZbscN`)YM!R$hv|TI%{3}%%8C!k;LMWg)
z1fCMn6sSkOD|5%f|MPcDd2{v_uUyk>8>kZ|y$r~GRKs`rDL;>aad`J)e)>J0c*&KU
zD)?U0dVV1xp_lC1=n`YE-adr7P%%?~4;9!`k06AoK@nWd(+{E+RGA4NEy2RFkwEfQu)Di@i~FC0YFyin
zQYB+tk_;G;qJqsEAWeD#mbq>QYGy#_i8duSadH@LP~8lO3CwT$m^0Arb3)Y^P{D0y
zpBJhK6ZE+1F0@LN-y=w`Q+E_>Ggxc&1Zy}k)sX0wj!&Tk%`
z@_H$7JLh#q!nwmAi>Emn6KZtC
zG%BDJ`xk#xubY*#Sg=@uD<8XqQJzYYUeQ|JGLxPc5*u#aP1_Lp)yDgCJGUTTQh$cR
z%S>Vd7L!KM-t@+YzP?$z6~_PYDZOmciU2^8-+v`z39AR$qB|#S2T-W0pbm^j<=<(i0u#OuhaVEpaW|0n@0l
z#D$0ae-JptnhvPbY@fuKDQposJ}iPeL1>Rb$VoX`*LN@UwNBza)U#TEZ%T?
zwB{`^K=leoDtSuAUayACa2=~6=B|k4EmIo6!481NQ}$EQ~lC}vty=mjL7sA
zz<(?QrM3}{pV9s;0)zw`ue~d~l6_|oq@bZTCEyps8yMm8>KRIT`@|0n8*z(hA<)*(
zY;$sTh>!0p;!AXT?5yn=j}(^n^IxUmbMhs(N0*r!U`|hCdSYY9tD)}h?h!Ewi4aK@
zg-aDZ(?y$>qGeTqo$QXLB0=VV_v(seyrTa;;9|P39wAm&EW&fJBcPy3=C=`BS#5(?
z6@N8&{qFx!I5B?JjQol3H(dI7*6RcU;7>eX2C=zWtMT`4@OThlysH89Y6VVTAkppMR
zl%l6r;90*98Aabd#@23%zv=)(DpTFBSJY>sz~XxMi7-81vZnyp9gAcd338?g>leaS
zI;jJi=A9@zcwT1G^b+)GD(Iq>@eLBWA<;+^Vno)=uwKde@L~CDwpbrP`$`Pe11o~_JGW-5IQZSEP`qAA!V(soL_!SAD(o>je
zt-k*oJb(~NQ<0tgeh@oJXRk%l%hA9`hB#_;vKK-;C&ZjatsG%l2tA}lhHccx{~o1N
zp3(|&M>Hrtb|^T%Ve({1zRW2s-PSyK&@IxTNAIQUY^BW}pe!`C4dg2E(z1KOLx9-vArJyR
zy{e}Qa1-zz_ZCjT!NiO*5F~Qt*O$P3M&Mvye-mje5@;UTjH6(K%k-}k0}YSC)3fOw
z#`yrq@FtLn`oCfWYz3%TwcS%r5!EEjCTr*_#KJy8l|Ti>G9Aa!c3AZ
zjBszs=%^nv_P3zIKMwqdq>KN|}zxybQw}xG&&Zu_IG`aZKk)An3J#I&Be0
z0X;#!Ziy+mT#bsq8<>vuXoJq6Z806iZ&xE#)Zor)SoK00T?^k^fjA3Lg>@-7LCGd|
zxcLLR8@0`!pHvBvgG4Up+p}}`tqQE{EW2|ofs~nA2n!&eWVA*Kmj((e5a$Ur`y47E
z2n0{jm^I8?jJVQKwF5`yYY_`4TUNr((Jj@1vW$Vy`xxQq#ncUB@7B1@PwU8l67xO`
zEKnJOz?Vlb07)7xdR&;OdW}v(8NoxtA3|4xUubXu_WOB;_oa8h>NC}i3-}#T$v3w<
z77CSPHuyFv!^k^=X1CVO-vjGE@y4s0^zf}$+wUCSd<(t~WF+$%BPXOw0S?`RqOk_h
zpFnILQpyy%VOtD#$$5dwCGlMUE^j)yi6-DMn!zn2ie49_zDdhR2n`m#Mp$2=*C2EW
zZMKOkl$=yFkW^ek2r_%x;pVlQp({9zHJjhB8+P;hbtCY#X$5$IhIdv3H|T%`Fze49
zDdyrWiBnD7wK;Utm2sk=0TRQLLJJNq^{{92+lwP?3AI2CQ5_eBsZO%wz|}p*g=V5*
zrUPP$`XY<8Dxdh4mYm(05VsMk2}7%3N~3f<65-9L0#ABjaYxZ4SgZ-E;`74Y=sas6
z8K6X<0plMc+ktZSVpY~tbpdzMwYrC25lNry)xAMt>>f2wemUG5@x&+x5lOtx`jA=U
zVL}zy(hi|>=fYWonZ$za
z)o=vqKQ93j#T1OEYbK;`1B8Jxxpn=DLT}vS$AAFoTp1N*&(UK?*es81!-=6!1oLnn
zQ+k~f$Tt4L#GMdHLX?x%m2TSbKB*bx?WNOFG&YHep~yj+rNH(&cS9d~X%%X2t9*Ft
zqF}dTls=3bow#n_NA$G%s;w1#W+!;dHy_YXY5vyxV$0H_-#JMj0T$-q$VW0xJbeBW
z@bXJ?RhX3Nd#bzQo&YH(sKkLwL`I`Il4)EohRsalvS@;S43l1(z6^lzd&trb2SZ*>
zOO+r527X?l=h0j=t0!Q>?!52>xR~=l{`g~B{*vU0Wkd+e?(q;d*I|1}$&td{0T_rt
z0n;>&vgoBy8wScve;YQg0jQ49Trn~Q3OMP`Y%7laq{@r8DWgt*^6CVhymibeyYxUd
zm^^7(RBy#HkELmU$E*l4qRG3BZ2l!G&^d_VN=}$zObG$D;kKPP-iqj+)$Qcek
zaP-xMhymMfl;n$eGQ?
z25rQtSU%kWp%i4ux)8+GMpo8H4CZp7sjhD?G!2C=!N_FFctRl_M?Rgb^7&YXU@`nU
zllF|#hNX}Ed;$;lC5^LbRM9}r6ly2D_=Da|+*Wz-D`W$lysZR`DV=zsu`3#P46>xx
zww6)C_d=cLXD4Y88j1>N*Z2`>%ub87#f7jdhPYwjNoc3Td%~#i@6}%~NstvXb}W2T
zPXhh+V%mSA0InfG2-hBw)r$2++_*+k7UZrtJH*2GJ;{e@RH2Kv6fv?7Mh$fcI}(`!
z+oj7Ja2gv9M51G>WmJY(aGXXij+qNJ_xuWnivWu^GQVMDDSu9EUv=^iD2~w2z_aj}
zXZ@X9w~}g?Gyp8|8<5=(f@i>co<_1#hTfaQ2+J5;@bBc+AOXK$iYaIupwtUT!0o|;
zs3mR+cU@zABgh-5yo;>1VQxLucF(XWpz_8{+ZY;j>zrDq7fK_$vLV7Ib*CuRr^ts8YLsaiW6F_H$00tDu{;*%L+eI7HRk9%O(o?%<&$ueaq6Y}k
z0Pt)=2k;XUcBtqDasC0y9yuuJG2a_;4x=`-h~Z$AzNcc%#{FTa&`5<}Y_;~)gBbPo
zJ11jx@>&Dvkd|Z@&5r}rVv|rGj&fGW+Wt5J8S3@AG3@pC6OKrX>PdRP3ATYhF!lIy
z>BDrG;nrQlcuWsU^i;1?*&ZBn(^^SpBLFse-e|+cP
zP;HqY!_$4nc-4c84bV;Xli498;xvj}Yc@SKMOr4(Tn~OkokWKejC_1&_F2a!AbxdY
zD7Xc9;BrnJXs6yuf1HXvr++#R`3wR{JRu;-hWA-5~)yb7G;%
z9EL8?)f4@S8Lg9`y)pNkLz{6B7C!iBHd{A@KgQ}JIWh;a`ZP-%0nR4!fIY-me&+)klJsIx95^9J1S7*wVQ_U=7S
z1+gvkDCB=ymy^W-?@r==nawz)12P1f*P`SZa7kLa-kl1&uZ1X(>nmAM)C&Fl(@$eC
zOYATXWDytmKS6_zs5c{=*V9eIAwyL4BLp%%Dp5^vLEeTArV3GD6R^uTCF~(M|8|;qrm}r$1+sKR%DA2Mp0i0XwX$j~*pj!KarH`I
zUr%7f>%kVj4SzV{<1QXP*x0Jl#H5I+t&Obfu`2df{KeuSld6)f@Y6^H|tls$R!
z9tUg_i)LAxT~2oP_a=#$pI7
zyX;i|dAD`UpUZ|n8mT@x2&5-?G?km2SRmg(eXH%+Cl2)>F^q_7F{q>o*Tmoeok5G@
zfI4PaK9<*YFGVbtdpZmUF$lpaL=s8l6;3$3x)LdnHKJN98xnX4d~2dOs1vt
z;DCr4v{ET(L0m9PM4D3cI%235_@+btv-F{*CCR;W#2rVNaN<*Xk*I_VXt_G;rcDt5
zNiwuR*xp8W7J>Fs)ss$e#k@8tY|&Gu!tt3homUD5Xko;E6!<3LE#;<;$CuuKg|Pu<
zuI_#SvEI(V$r?Qp)9>i^HbG!b)4`pqdEK*EBOm#j_}|FNP&D?iQ#xBMl#_DxaZGf>
z`_xj!ETp*ref@ym36vgw}L=LYWnqtic^{C>^<;?}OUFGm?y$9kq`
zLn)nEaf*EgrTEmY9iIznM2r)Xp
zR1eq*9X13IXPQH%ufN|Klo=i1uAS$#43d!9vp
z6Sm_1YX~17pbXq6vtDfAne$Bh4xNa(+{v6_pES3LlW3+~CE5+O<@{b;KLjjqalGv8QAJM}waC
z>5+^I7xRqrexO*nbK?nq6K<1PZE=c=(+?(3pIU8Zd6u?C!V!8ie{T-PE8*PV=^Qqn
zDUFRG`--lu+Ww%pxVTet61X+58I=$^5j67VKJ+fYB>sT?mw)WCK5pB0+;~$qlhN
zpY^clUjcI41rlD?%QXArD@I*}PPu*xK3x=zG-+*u*SNC#BTMOB`}#k-jI7)w5lPGP(`m+jtsZ
zy>C!$pMOq{Zu?|B{(iOPHtzBhpeT}4Bye_WmEur|F4Cl2>imdbUJ&!H$`bmKepC=`
zcJ#(Di_G3gE}MlyUfTXUe3rd-kjGU`gwBmc9TpUfNJ_HGC?_W*xu&eaG>x0}L@CEQ
zfEiaG`r7^2-r2b3O)Rm3xOu??XxYn9=w>w!>bQ*8n!>73#tUt-7+eXYV+MurOb3!m
zcpb1>NARPaP#aWWy)pDB+TOf*uw(vHLB#AHzh4RAtdoCy{0=8QAt}k|r8j6Pgt&m;
zs-uvjNF{f~#}t)NgHcp)qiI`nb5OOs7!O9}oxSuRv?^b=YFp)jREqh}rnGEcAIs=@
zV3y}USRPaf{)al^?n0I-}@fJaW
z7y|4_czk@E(IuScZ{arGgaQ4lb(mr=4Hgq7A0LnTD*)vc;UP`IQu8pk+y34|8G{CIi>$3A==ve2^r
z#oqJmxRZ68cd^#;;0wuRLUM_!n$gxcdZgO9Tdb|fK6f;Def()!uc{?Usf6!Ys@2R5!Q8PCIVg>$Bz5ssra>#gVu
zvMxVhFS77E`$pE*otgRf?*LxpMa-28gz58y6%&v0QJn1$uv7r^{B3(NoR6W4kL9LrCj{K)PdQP~BQ?B8ZxdME>stpIvhD?k4mH*Pe8suaONM5BqGF5D;tPMevTJ$?F=AMR#rw+eAn
zD6_^4d3a)?rT?tN!o9=oqXq68a|7ux)utQ-Ql4fvHq2I${@CP^hZ|h>!^=?{he!Ft
zNV21S$NW`ZK)lnM!IEO#1rztQehs-OpaGFWDPt>IDof0z^*(;0f!Vhc}rK0K+6I
znF}hyFza(ZSuF0UEjZJ_2ROJ0p!+w931v9{5V@or+T)g_!v3%{J#Tg}$nq5%8HINY1Y>;Xd({`7WbFy}^=FQ|eR{dWs@bwYA9EcMx?n`gpmg#BlqO1oPr@?-+(s(+xA@32tRLZ94nh
z)esI?bePMF_J)h&`xyZ3#Il;bpi4eN)GO75XY*@6O`DG=)Uf
zfmnXIa`Dkp)CR!L8AS?Rn9|bV6jxwv6`cEd$IMQIjS^r%xuvd1bmPv95*^;b`S+Xi
z3URbS2E8*VrXsibNw;p`Q>T-z3{@Oyzpbdn{Mgd>UDHWIk_;+I;rMn%*%ei&RkD3eq!+TVeN
zqG>P)4LTXJ`87-`xJ^}8DNgJQxq3jgb2ibN8Iu<{Ng@ja@
z!K?=n1ANo+KHjki$BEVYu?FJ}So3mCc$9-RiC|UcswTjg1Y#4Uznq#w8a~rFM^$5U
zw@W9N&H*#>Gh#o#0B=_wqL=&tu+J|IrT1I@^GS~5pzTt?JOs(JPh%;ZFml2~Dtp~9
z4tgE|lDb#z`PlS%4Kuu+>)7KRryrg<=~cCQfr^l>KdU^noTctK9dp5~=eL@rob``U
zJLHDCd^E6aW-pyqN={uc_U=Gt0wRTZFXpF+f9ck7XBH
zWYJ$)*u&NWck|((i0jX#=1L)oEOJpC`)1wmR8&z>Y1E#P|D+6|c=P&R!`A{e%p~R>
z{{!O0=d{OBnP{x8s{br1$V%4ZgENqPW_JU2S!no$>-*pKjll54=KOCwHFb5qX`44x
ztJI42$0{|pUVeNh32QU{oK&EBrqL?pg=@9hh?0lR^!d9R4)m~bN@zBS`v#c6m1rqu
z;h3`r6!T=fjy}8xvFKrnYfochnk0rn09Ww2F-khONA33e2`6EmF~ZqP9~&$S*u}!z
z^8K3I@EX=7Jn1$k^A?sML-{^M>enYEM}DuGz#(&qjRpNE9F3gxhh9N;ULefU
zxu5d+cm>3D5ctQ{BN`i%P=NZR(TZ5#Z;zv4N`#tBy*cQV(GjR-?Nr+qUfqEjv$t3R
zd}gp9yKZD^su|Z#3pxU4cfN}Iz<`~_0W-`k4iDc8U{%XbBMs@xsTR=pvQ2qYwF&(2
zVPD@rjWm7I>#(pe#i;>_!z+B!rbP*a_5O%-
zHTCt~=*&w%AM;=R@K&fv`0(Kl>|-s1JPL>W(Ad%$p8kh)kIRd6-lq%|{p9kreF4Ua
z15n^N_`9GF74_4!V)m`I$}0NkXUagj<(7_mb--I$rB2Jfh1D$*w@aFTtTm=Y>xFuP
z(W`-oyX)rN>Z7cJ8~M+@d>?!5i%N&)Ygla8kA8JLIixE@Q75)R%7BVu6P1!1XGW*5y%Bv%R>9VsfcYfICOyEL^y#FZO81
z5_W=F#XSz1ALK`0UJCdp8i;CWD2VU<)Fr0@M4nWXuB^ZeZKeEFCAf}S<&
zDyyh;BQKYMqz~fAK#`Y(%BUOh{Q>}FiYH`T>VL%Yl9wL%@&+_Ia@~_XuXm&Q(nfEV
zz#)T%KC}bvUI}DF$$x&nRS#&9zmSE(_tG$D>&9k|gI1ft-R#@9?j
zW+7J}%t7@63dT6_C3!8m2!?-A)L>Mg+1u!muh>Sfwo3BDH7Z8XQq#HMBe-aNoTm_B
z?S=^2u}bRGwXB5-JBli$H8tt{fwhS4EWPW5pdnvmHS
ze4lD6>|fVg`Ilh)Ti<0hfb9B0u^<-@(Gc_nRm;^iHTQAEO=`xE)8H^rLU-$hx=a*N
z%r9+(JSzmjkeA?->dMM{k&%%E1UVpns;6+Fg6+8XgB4b7cUIf1xhtjG7-IAq$AoKBH87ep?7~Nd^$0dDPx3GZ*qA*tHlg;E01iYvKZjJG
z-k5u4e+ST}Qn%T9w=Wm;En#cE#yx0|7aP0}NM0NtUjgTu)ro_Pq_qHR_YgSU8p4X5
z8gq&U%w`dhJ-JF3Hc<|2C@12dzrTeW;8ujk^Sg+37a>s^risT0ci4ugBl6aLg}TIi
zY;t+=*%RzOG4Yw4v>{Os`ILr8FqgoMt>zEFze*s}U%4wjQg}{9B?_G~fTUBpy6yZX
zVqFh(vGlS!X!8(LPv`m?UA|lhU(rDL0*GQTqL4}=Y7b9nMZ1`v9$FEfWqVnM>)|Dsn
zFCBt_hs#=!C0G}G_5iELQcGzgvD{|AH&Q?qFrDNQ)i0&H{jso(vfu%!AG=Uf`0kay
z$xkJYkrVL42r>R;nK73V-|Ui3h>PRV6=c5EGoVEGWstI}pabEM(bd(31?1`PL@UiU
zdE}EXFB14Eud%*bE;KXyt(6x(pBD>z@nu*zC{VdpQ(LQ#!6AQ$L<}FTXn6&Z1wS+o
zryd;q4p<+vWI-*KSL(ye;eS*;E!JX*Ei*PYW=32D=aY%5Sk=0y-Mru>7=C7dcv4bQ
z^Pc6gDF_D<=j--wT-|^O2|P$C@+tEJgqh*$laGBjB{rPxjsCdmBbpzqsQZch?soyI
zClCPiVu78=GM!$bI
z^G$3VmfQ3kMOyrwHVE~~8FI`^K=CAh3}EQOmwUYQ-w;pYFVkD1fGg)sw+ve$*mi-U
zD28bTg*J*Ci1$>OStChxdxy^G;U*Mo4Xws=MOZj7Qu)J%PUH#=s_~(c6$Tz>&dJG1
zD4l7_>*MsRqpv+(51(MSJqKb@WhiO!^OzrX&%JTk+`Jt6x1eFy$t=_-uM`vNYim1!
zXBSd;NadqQ`)vU$1mF%X9fs`c>qomcXD^4YwnfvrOppMFQM9+|4d&>69Y9iy!egY1
z5KlkkWasf1oiYggf+0Jwxs<C*OkUrq{2YiVh%7wrji9yHN05fry5L(6Lo6-zfR
zrwpX#d5ddY(Fk5t#SpyVjJ#tijGx0Or1hUGHxo8oggB5I@N{9_vedB-Ne{n=0e>h4
zl@2cyE7Xmn>k0uB@4|NE$_4U&MnS0$nK0ts{?x4YyJ7^Lq6}ikvm6*0X|e?*(y<0O
zr(!|O)2B(4aN)UGLa~*eD8Y%;g1bq>N@!AYj3~~iH!f{CR%AYIQ1Po3Y-p_899QCj0
zc(I1B6tbaUj)sdj*{vFlqB_wLn;MG|>>fnh2)JtYKv1%fu`*F#I+;j9Mio$9&=4
z!<>?SaoRkAeyQt()YMs{JY+86JjQ7P2h!2U7NG|43!UlMm(lk$Mt$4-pBr57ff-Jy
zb5uE}udmOGT7p`O3{3it&{M@tIhys(!E3C)o{u1i+FgkAs2k_Cf;$(QB$k#SmYg+W
z$ZP7{1#>&*Kkg7g;o{fd>@6>PfS~I^T3m?xgCDl2)Z+H}W2m{~dHY?Bch*2FsDemJ
zna6t>;PTkjwl7@AGuE%d?9XEJk;$p-G)b%owAs|x{^app<3u+=ECz^k?20-kFg{%`
z8?18n?C)5h>G_HAc776zFVgiNya=A82IPhsjRgERcO?zRF`zjP(Y-4YnV*Pz-+@+G
z>1OSlO|dYm;h8{Ds}m`^@B;3F!7Mo*n58uJm<83wJDCBnN|lx{LW5Iuy*YRQ<@h)*
zDwos;EU@$RmsX}~^_P`dPjK%1(-aB4!h3{0m)N$Fkv(NeKvJi?&YeFWi9_ll>O;|<
z+~rBRjpB$hbk!V{JT-s&ZI#Hmx2}Wpzo?cjqJtkeEbu&bpu#&
z5kla>Oz|=|>oj;(G%b&5gM6r(1E8mqwt!!;xC`niXRNtkM3_6Y6K5JxVz=d$T7y+`
z)29!KT0f8jkRTIm(DjrMLYJvChg%1Jt*?)IRq#7GGVlYkB!m~t#l1=69&lo^;!fhrjD46GODJu`p3WN$KzJBy#hB4p9)xFe?
zy1_LVLE1QZ_Lg@4`_z3#)~>9F6ygLFx{Qh%zZp$AGFo#eH83j8KdR1u-)M^7;BOA<
zK@NQn8q1WLN|pLcy&cq(CjJuVF<0)2K5;0Tm$lI}FYL^z_Dg(|#`2uicGC{mFLm)v
zSe>?RGV|6mfB4|oY0g_R!*<3pUQ@Rw=bE2-$avZ{3P(J+k#BC&uXgHbqmfkJnbPch
z?V)PT%iXfxH;x`UJztNPoZ@~wA#(i^t6((uUI`6Ga&NWfNoDKr@;R(ov(Ar>g_J*t
znlz3n{^?e$SsG8ch(tiY-1M5&64mE=UK>o`dOve+^H$mE(KR`ESuxS4oI>GApIV(%
zbdQ;3rK$*5$Q)zM9Ttk>ALW`#Y05Pmb@Gff<2+~h{X-S8RKAn8mP40PCuOZ4W|mF(
zi$BsGpTZWk}Am2}Z@2N8qVr(BQ>TuuhKZD#l8;?1f#v^g4|P0MW&zj`I@%_Wbj
zIUvgoLmNE7dg}vmkzvhkAVZwNx?I%4_gmdTM1}y_e^{1A&?{Ic^+rH+^#X865^BP+
zs|O+RG7Zf8=J|qJ^uE`(3$5moQh!vokABj2xMuS$hHcyA
z(Sr-C?pWkxGe6&$SQ2hXfPrDXy_2Sn@gjq
zhM1e8-2+Q4BRA*4YyOSas)2*6Djc{qYT8s9-K>Hu47K%}{v3Ixmacc9{YC9h-O6KH
zU8zI-gWVAq8pLx9Q!A>+dL>V!Jmhk@t6IJNPj2bqpAeAL2QVo#VIc^O2ru9V1;X`2
z?Hcs`I>Wu?jYnrD<`R9IkYl_ke%ntI;eh9U5wg_0EisE3tnTycm+T#O+6j*|wawua
z;+iPgX*8@Zcd=w58fn6ux_VfUcT^=3gTmu0q~Jw}b5yXMy3zbyv7ti3lDBZ`*m7+N
zn?$N4*aHmh2=o9JKCiSx_B3yvz4QXZ*+Ep`*4mkje7(i@lu4k2aj*lw!fy`79{4}G
zx{wD^`TkLNqg%vN%fwU5#r>o9m?)H1zb{oPs4)*|Gzd{ozZ`O-)oJ{NHP1Tjss_2|
zDyI`lxC&;RJcKl6CiX?EN7a2jq#qI`93sk?H%>nM$thLiMNU~`eWb%wpH1lQ#!4T_
z6Q=@&cp#3wx&Re?zSXqSe6X`?{TT1%BDZFb`IVXk0J9ys^rOdz859w(6_Nj0-QH>R
zyPc*p&SsDcN>PkIpmr31OS9~xAb2!89i5qqL}YZH)~k(*XU`^T4zi-WGW}?f!7W!l
zn8ZDIJdYb;JGD+*S-G~aZ7?miCpG@$U{|ym^N;O+^Dm>T*SxIhOg-46IXB_4D!Qaf
z^Q6&lqn(HHo*g`;rG4jmxaG>m3bkvE9$czUO5gd{%g$SK-ikKueC}Y})E5+0%zd)+
z(!0*2bDOqm{PKH=qE=AWuIidyC$)TU9gaIQUHpEA^}7#Ah_hb#N4xaZ#Q)ItJ$j4l
ze3pZ;(#?wMOHw+{cQ0#XNS%l+e4}*@g^#*b(KtHylsBOVFCIO9``h0!B=39I1xdA{
zK6fJZ{B%r$1*5B~)$2p&1d6SLDq6O+iUl6|m$q!#LI5VUn|$%jRZVqNis={SquktA1-1iwX)nm*RGF?ETdCR5xQ!(F++(E2LF+
zV^0{HXz)fPUS*-<9s!k35Vmku)DC)yE{(RS^MSKGY_voZe&!s!O1QE|cY1m^)&2lI
zLWo5#deONL#{mQrbRLuZBUM7~g2xkczbjKRj2&Z?4VP=a`^a+P^j^G~@7l^JG-G=2
z1Ts(H+KE3LYh%QfFh_*i$asNw*hLzJGyo3#D$Gf)
zGp^SIw4=X$E+2K5N$exb6zvU6dE*9hk>c|0Zx8K)s7$u^6AIcgDx*sVhSA1wn{8fw
zU0<@_=`_mlU_R$}YZxy!=bjOr=y~k9dI7NVs@ZPqzNdECg_Y-kX#1uC95p~zh?phl
zhN1?HayqLLiDm^P`KQ&mFgV4k34=hhx`@zgx%|K^>n_a$RO40JvrrlS5pc@A^_Uq7
zriel0w!~v!%n^Sg1s2FRCKMD`Rz7==$7GWh0dD!OiK^WH{!JI)b>`!r4nze(Ad#V$
zF`a>~ME$d3Cv+2>^>wu*C@@aU`_wYM@O~^*1Ri4t-z_U@N&2$_jGv!B~uDi7$p
z`T*vCU&m-_s8lsKXV(aVu)N!OVA1hF^|3MtS9M|MPag>5Ay%vR&2Rfco*#Y3K(!eN
z1oue_ISd#A``7FZK&?y)Wf4eqje*=J-`*y|4?VGV@0{oh^mQ$8`M4w0xPQmY+f^m1
zi{>29BEJr^zu5Nal`ub4`Bigx`3EQiLMgd-WkC9DWNGf@Jln6M)2j}*2gWuWKk5J=
zUbaAJu?%paGPG(oo`mzA1Z2i%_B~}Spbp{I>NQ-8cY#a3>dkd_s89%0lbxm_mi8Pv
zm~^LB2mpMc_-N;OG34pov71p7C7G&dOU%GGp`U*AnDf#D_`vvUpXBTJZ2d7!klR#{
zn1zUHztt5|U<0}#HXBY^s+5QD2(?t9uO^so0F?(5;Sfg|L7aj-L?bgzd9xaH1?QBN
zBch`Z5M@afj;3R81OfliU=gM|NZ2D={}Hd9t}5$lNeCGc62RBibG6JfyH{K^;Q0d%
zzz#-e--2tvCo>$uR^p+%H=a!pHi{5ya5-Y(i$akLgV^xz?J!rR2?U%mU?>#`XoEwB
z7zP}cAw_^ICl?pr9uA3I0LwFEQRo0h$CYvnBvK(TBt;E<_j|&kVYP
zV%XH=$hL3TAt_FrF65JWH74k7YGrSZf&(wnyqn8IF-Q`Rq8m+3IYf~vPZuX(-1BXl
zSX=;=w-}2IQ;|t`mxKqJwzbN{u{WA#p*ifjuOweYu?rBF^7bq=4|(WytqZ@qsfW?N
zh<4&wPYbu)$JB=M_Qw*bsj0NoLa~`8nUV@XCk6;lF}9}`*4Tx?YO@t%m~=6~NCb!*
zCBj5~vVzv1)zApUec&<-wmpwCA{&S<_CIW-h(KP2voKH{a^S#MU5aSycQ_Fff#Wpb
z_L1HJAy^5NbO9(q;uWQ-=RQc}03M|hdK+|-k-&X1nqL8Q*n6GOo1OT~GH?_Z`>@!D
zvZ&YNGXe4JA#H&}BBqn3dPCh;$)ngAgXm|>&3
zAiaR@>C$+7PRmL#)}2(f;QPKjX>WEB4-;6icJc9E7_7y+d4gpH+!QejzzlI+5E7Fj
zG=D*OXdefFoVOIJ9oMZRu%)H+WXpGrH%6Pqc^=bwq8|7FB!59ed
z25h9@;^HDwvWU2bm1df`hj8rp^v5cz-UvTBF@?dXa0|iHagENZ3dJUH?v>Li1|PuW
zr{P_Dij8YAu!U)6+PdeS#ImNnNRPt-dXtfc3RfW%^(RMKwR(xJg&H)3-R{kIJ7#-M
zFsWehR(ysHixI`qSJ6IVo`eEKYtw>2Nk>7eSoP52
zF9%%|9%Rvkrg3VF>z_dbhiORCg0w)sAYO4
z8kf3kY8QAr;>+mtGw4aX!B!i5zJ8mkV-EBq2rBl?mM2KiV49fE%1D@>O2CPBskfCE
zU!(GM-4u(9CG8?62ZS246uZ)I10*UktTzS#4C4>Qa?oNH`q(z*z`guX|2;4L`!&A$
ztS9aXdM~OmHOY5KF
zF~J@i1Nbu6;1YZrH+$rZV=$eBn@MmR>PC}zKxN5HJa+lAsd>7qR)}{7fIVr;jUZO~
z{WNmyAp_dld$gp8c(f`#R6tBXk&|5ZJh3J>CC46;{!2gm>%|9h{y4ym{pnC9@e#2~
zw-g|S;^o}^8vz!Gy#IB#74Swwu=l*d=@5$PW6XpEkydCJB0wa{@FVgO4@taIQtf#S
zpU!~AmsYhj>16N0w~J^&k=6^&8g9h-F!dis851N`grw2|sd*IZ>EW@fxrN8Kn%xJe
zdXN?_1!~2&+A1o37S8QkgB?#VzH)3pYTx=Xdi2B*f{~jiB9UiENP(5NmL$b|I*ei!
zEAZ`6k&y-&Q!SS~c&q^b7y^tcge1tDgVY)VK+-|8Tp8C~vK3}Gj(fghme4I)KBTIw
zAk{}{kVvvk`qiX^N5JCDS_8jjmVbV^V-cO}D>gF;GyJUIQ3gQunL!J;{tqS#=rPNM
z@t!pMFv9cPpvFWxa5Ndl+3P63V+JCo`A`FjWr4xGL$R1f`
zsG*@t@a(`^saPcO)n5ezGcrOnZjI4jx(%Y*>FX-JuiFO_Zd9<%`jouX?3i=U4?EQ3
z+HaGJA{yl1+vd0ORZZM9C
z6SZ~5ue;x+p6XQ4kwm2eT;zT~{F1#Lv_bSq3A(1Biv469bFo3Dxc%zeYp5r;ck)tXzu*&B=+#Gqkf<=)-oGLJ
z9a#0ayH)t6Xb_8w;HNhQlIMq(_LA!NC3&t(2O`+$E?+X{`jy2c<2Jf&N*Wm@_+9$C
zAlO&8Y>I;4?z=~JVT80tHW=O)10(}$0IdhD-h-K51-HJps~6W&4<|yuHa9DM=Qg{T
z5Gvmj#0Do^*@3pG2$QHm#QdppceMd;R{0@f;f+{qR^rj*feIc$#}ui`y;lZ=ciyI<
z$H@Ft@EEzFcw&r(&Np8aT(AhFxoWGL)2udzwv*jYY#17Jnr3E?y;psk?!g;joYpKIol0cX)pK1Vq=u
zXKLxL^y98fp+M6Zkirebhh|PE;MH!fWlNHsU3{#S<2EHME)ZLsCg!YY9ItEQGnGZE
zlBU|)+6F9<^U+dEZykcSCNTp^7yw)257pW}uB?uz66miRS6{~;!nR3sy-H&_}K7$G6!iD8L61#RMx8u{eXp4G-&+YX?rWdJYpMH-ZX6HFP@vfenH
z`~dVd5kioH6=U@TBQf#aWVch5oJjQaNw~1|hPCS7gz=Fm0!{s+6dgzlpl*qDo1a6&
zctBTw@yG60t8WhDi;j#uer2)a`J)kH&c*MlvMv){{Of;TCC?GfX}>I3AJ!LCQH+3_
z)5G7R6B3~3)O_a(sxv;LE1>qwcZ;vwSu?m|43G}hex#{Ff8G3@%Ob46!jc(k&K@Q;
zS}GudhsjR>VN2KbqYuZX-rI-qxjI|QwMMj_#}pXztIo7#f~^NziwH78G)Q^#L5rJJ
zZXN=bh{Ov0b02P61Bvd2+=zSl`r_(+rcMzI)pG@U3fW~>!9SBerh`~xWJ)aJ
z=J!J*n64EqD=XufvqyX3`_mZ0p(Fy41!=SRwU^vJV<<-?VBoAOxD(-|lqz>xPKcr|
zAW?i{JaM+e!O+sSyd6Z62w1j$bP}xaoR=JPaAcn%j>f6JJCH{cTsn5QIyl{UcpV{gJ~HC
z5$dgu(HXv=lSEQlzDBHHcjug!_JvM6f~z!{cfr=dn(Ip
z$gFu^f{)K>B*4zXpPT%&=&EU)z{S_P)iWbV3`
zL+|T%^`gQ#cn8_hDK4oY*}2#NF?;iy^FfcNdwk`d*VpGwYMccY5!2iTKRXblA&y>z
zI>t0JuLu{VC5P;~Ij=qY;lqnQ(}?S`Ad-6h`-TyUIVTtG6Qt{_cvg&|sP{?h@9Pr{
zugD)}>Z`^6YvyFE9mBE7&r))b0AL}v2Sl~{Q!RKNkbZ=tXofK;wMHMp8>Gn?AUDsL
z*php^p+ksT?k-B}r9#P0=Q{@*%V+0OqwnF4mRpIZI~ok%&8fT5r|&6}m-OgmLTcjE
zXQe{b(>4PSN~(2ql^?!LuyyTB8kPyFOFz-q%{T2ZR$^i>tY2`%Wlgd`%KJNXPdSqZ
zOnQ0_{V(jj2UJv9w=ImRZF2$@6zNt}B&jG-k`VzV=cphcS+Zob#Q-WwR6)UjNOooJiJWTnWceWC{-LmK_u)VD8)D=GR!u$ziS-LsH8X;DzZg7RW
z9*7T)>*FF{y~~4pi@ka>Am+&2krDCk;ewd%;jUqee%!H-h1*9fOD;@x83;Jn%R0w)
z4j;7ZA2@A`|ApOEm6z4_H*QFyIOA!*`1Sj?sjcg`sVfK;?CSCpF!kg}{Y0o|gDqt-
zHif(Ec~PYh+V$do0MgGYZ39gb#SN$Ki`BZkC#8YUi{aJ7>zuwVVN{$K6JY6_z|!%#
z0N|HxS)=FTAG@7@{K0f4@xazOq?|!S>k+|&9gv7sCC)p+B`FDP=$Mxag3tN(~gcqVF!RGxy_bwq}3G^Q$
zcPD;2Xd4FUJH!d2g%n
ziGSP>d94tLHoXQI2ZXlBg?7H2Og0OjbmOWLqJqWg5d`$7XrT2AP5dv^tVSy6YXqrU
zP}W#baUde#vqY5~ioC}(l?V_mfrz$$*d$#9nzcVcL`cd`XfIZvLB)PFTI9y932!+r
z0FG@&br3%iD3~wx%c*Ul&vYh&{hxOQrP{E1Vl$YZO|
zN=;3r3OTrx7BYZ0kDlRBM6qLtny}`fu(QL2$InCEk%4wY>BeK*+KAj)3_9=zwi$k=
z5+5*r&ntq>d^x$rruH(H;AfJxnhN5Pb2TIlA-o!-+5{;_deC6)L(Z|AEgbyVpbP4$
zAPd$lHWY6yPS6tyX)B<_APBt^IQoUCgSGn-@uZGtsHy{i0I)090WZR^!VnE(%w)|?Id)1A7(@y0&vye`k5=sW%M2@of}QdX(IqM>5dR=8TWktyY(|f
z{53lyGG`S!$w9jkw0kEp|AoaWx}_A~lC)4lOPwt6{CNPeOJbx>?PpyXJwd%;b9EBx
zch7$wn)vI$W+nj=wwYNh{K{V)sU{Op2+N=3xHk+h{uk42#pTFy3P8bWo-zWM-XJbD
zdts@ucp-EVZ6?BCRH!+%e27nd)$z9V*<6~FX<%T0aO!(69}hZ}B^JY4`5jpSIbf2B
zigf{V5(5^_F{-T0LPcTdij7Y1`K&Q9w6{cSfGo3uP*h)%*PGaMWzS_J;JXtGHXLgIDZ(^ChJHR)6=fdMk%NI>r@}b6;tRwW)0_9&9
z9*8=d=KPC;7?_?}-pp7OWIP)({a`}jD^gtahIQQHX+51xbwNUKT<~!3u!Kr|wMSbl
z_0=r9P!ZigBjzx4)%$2SUta?u-v#?s3lSpZ{fg3n*d-1AE#YeT*{T;XsIvJ(Zc9BdxPP{)CCx$UTDks#K?Dgf5{&l_IV-3?ZsP
zD|ArWZM6%Zu)~49xzQu{_z(^PcM&2FEGC^zE~SN_{-&$Kv|qFdOgtq($B=pv&M+1Z
zv6;Zyh}&*cA&W3mVidlB=DEc_LsTJcfON@)H#xUPXNBFue?#cD@L&HC#hrS4sT>5j
zGttoR5`ccjA)-$tx--zLhk@mwP#^{21Jsp#glS4HTc4$}3W(b~u(gtEg!E~+PynNo
z!3|L_$-apMhh$p7lr;wl3_^4U>5>awhKvNn9^f*}%@!tf+b
zpGaUJ+a`i(eK^EKrM*$KhA$Oq$rp{~%+2t;A4;VvMF=tkRfILSBH{x1iRQ@YW$eJ+
zNE&p6j|clIf>^6J-%EiE+kU+H20^Eo^K<5=awzCxt24Hx8Y0i+hg4+<=}#(OW14vr
zwdv6`)ffQfJ4uM>ak5GK1Q%5<6XpFbLTLio=RR^gsTgNfhHcwGM#yVL>aW?Ag61m0
z=xWe;q6(ad>JKdGA(W2o=o1%1%~p*}g?ZK7Z6b+!0qp@la@!*l3u(1mA6cG6#zi`-
zR7ek?aoRg)aAez&{Fabr^Mz))K0+!;!YFLJi!r_Y2oY-6D2ONgKwyT@smQD}2$BZ7
ziQvGbQ(Trn2rUGt#sVp)-@QMLG+SYPoSQ(D>>}ve4L&S!+CwP&ffOm|5Qs^P=Z~2v
z?HQxB(4|skC_izskg2Xz$yeI=l%c(7vk8h}5}?G$Atc_bcL=^3-aasoBCB(f+l**b
z{YF>0y!fj$NWaqaqYzP()l36&@4A7lIg7s`VGV0t!N86`&EcK|82Lf-2$k_Fkvt
zzvIt>AdDEm#9Esmc_Qs9CA<9ji!aJsi98HZ0v8?C0V;^raxl~+#uPz^CgfeL3;ZGK
zq~%Jr*(4U_TK0gWJtv#7%|(?g&D#vzk;&S)b+C5Q-WzKul1{w~5SRA3nVh)*EGwe-
zEp#?Ra4vBfrIt%~$zYGD=V#_W#5oR<3=*&jQ6bdeLO)JA;NhDac1hHM=*-TsCPx6!
zZyQ?Rti9_P4zo1Pj;1Mxdp*A^5Scqmm}E#KZB9Egc9K>;jCK;_D6#mc9R5&$Pf&=&
z^)BnYJe@|x6A306>}HmbLkcT7S7OED?dxyRf18GS^AU3X6d^3vX{2Ux6LvFMr^$Rg
zxId^JhURsL|gPx@KO%+r9Vfgy|%+$^*gtM3BVw|49h
z(fDt_^kz}0$RD_~5I-}AC%x?`cFRc=^PIsAm3GE&{yftdCsI>>s!n(wk^o>|wy}>b
z(RZ@o0XkohgDBT(H~yFgcvu9?R(s_*DZL7{XSqncqq4YFUrjrKj-*-msE?$DXt62Q
zGNFTwS+sH=Jy6qZ(}c)_v9<}3ED*}FtQnq7-HB}#{EPy9F-M}C`nt2)x!c?*M{G~V
z8Y~|x9zW__Ynn6^b?DG#ecc;(YGgDYW{SjXWt#p{QBoqN{iLzIJ9k;}LlGkdPuW%E`7jQO!W*!e3!vtJw97OTbl8c`{#}#+QAM|P6ufFEf@S`r<
zy6r#elZPKQRh-wA>T0rjXgbU7nkgvYnnth=;f8MBh=_0;h1=>>%G9JsZEDxUp2n8@
zwsq6ajb`??Z*B4M!-acusw;V`TY1+6_mv*Bi*aGDQx+*V3v}EzmLGLC(C+I4=ap82
z4;ypL(u(WOb{m+qoCi|qeY?X#J2m_f5J2^9YYTnbN1LMRDhf&>i6#~Mw)t2xF#g>z
z5uIDPj@L*bZ#HW=joW8}9_99n#sXW$*5|qVtFi+A4a%y-(;ScR1<~3tuL`c7_0*o;iREU3c2c&S8`GW*rrgAT%f6o~
zOr*=I$~BfQYoBJo^zx9}2{dhNQS
z4m>$yH}AhZX0w^LcI`%1k5zJ-PWx{@(C+YSKUSyxQF${ZG;+MLDaxLXdtYkOHz~!T
zdIkOFV4H*PMNu7fX1!p@%YAxhZ!V#@Sz*{L$@KT)&M(uA0i9IR{AKe-2DV2&$uoLW
z)5rP`Z>gOP*MCx*DlydR9QUFwt3RgQUaiU?pN`O1nN6#;$~8V^tT{W6Z+cE@)(0Is
zm^*hwLG{cePw4x+pabU38lyxIFu8v+mN!i|&PK|sJv25Y`+HVSQ)TXprmNo*^Puzg
zpRx+{PM)z#qFi;?9X8}lwg}CCGr5djrP`ueO_e?QRh5o6zo(|Lmrj{^qoqdo%1AS{
z@iE(c5lZfw$iQIXApuTN^B&pC@09b#BEk9>&NZ4ezDg-qR+DE9m>7z9o>$LezTYBd
zNM6%~rMG1N@aIhWisqI*y{>m3_+&E*w@-0p=cc5N^6N-g{IQlXEj9kf7Kz%a?Xpd;
zrEX|<_q@w6jK18Td810WKTES7(!k|$^)$WH(MJ6S1L!3vPf&tNHuMkJ$Jt1JmiN+;
z(=?ef_;mHy!8@K=^^KI9%ny}W^k;SUr;QEm6K~(_p<40c*@I!TF#ijkB^!il4V!#*
z7&cFp1V71r5&3v@XjN{JdQaP^uo$!W@uv@7&Kb+F>~Z#&o$?MIb4V%APAS>YZupcc
zdr(j5-{oIXOZl})j9uY#{~iAePih~Az1*|FT23CLRMDa9=lmt$tS5x7y9hi48#mGy
z6PK_vi|acQw8j0KCp%jYF$YTougm{Qu@(z0>DjddeR7JTARltRv$m92!H`}Fu_QZW
zT;*sd4%I9{2PP%PG6%f1T{xa*WC*}cP&)C_tr-%(D{aZ&05o~CPrJM8aRla|!17OR
z0`2Y0gVIs+K!EA`=yS33dNGM)pN!9Wf$p6F&_}dv
zK@tBf7;|zcxp7OKNsfJkGdmIISD6v4i!EZ&B|Zw9o|rCO>-!|t0d~o
zM`{mln(QmrudAK#Nqm31TVkrkcwJG&llQeIB9}kw);ycdhH#?WNqC60Zg$Aq=d@0Y_zGP9?K%4WNrg}ud{ZTIy`$35|B
z_2wyYjh&NysM#EMX|$3^`QDpUQt><;F{J`9|k
zn@TkES>{(`CpBe98|Sfv!9j24B~Lc3s!Yw~fL}$Xib|JSTy+3Z+JTtT0ea#N(H
zSW&0nqtt9>)4s2JV`8c`zI<2fUX?qY0mB+LrQUsLth>0{70AbWZyfVG^g3ws^4XPKlFoN8njBf`7?#MNHfq0r
zX9bOYR?H1Ibz02XYdjpgtzR=iM39Ud75V4%ytxnKL~Y1dV&>45@4&o&RAJ
z{(_UrDCy0vNrG)Nvqg)S1))DbRpF2`E_Ei4wY;B^&!ZT@E&*viUsOois@pb(=O*k97~U>uthIK$+IP3K
z=I*i8>b#qN88x#LFIn=$W$%)1fxgt{2xrIEzUKa!G)ZI0IJ1L#X2O%-Gh>3X250%4
z&NX&cCpCmBdpS8--oVG7bjTVSnxF)Xs!vS}qj5>WgkREu9PD@IlrYWhv2_)mxl~mx
zU|po?Ao#@DQAnlM+*G_ogtsp+uf4baX^Tr(KR*`cgUWhdk+`X2H}0lOmn=^-E^+n6
zA-50Q;J;x|e^pOp*37~OG0u}EhTF%X70=$
z?wHn%eKA`FtS^#&QeVk;Qv1Vyt#IYo3~UR-l6x2KwP*GWFbG&xl5+U+t=;u?sj0=~
z+oT^)yQZizOlQ?*ei_@)9&y2)(iv5bTkE*;Up=~DdM>A6b)YD!s
znP;33^C;L4{Un{V6BxPK1&pxdGhkWgzjI|FraNK*8NR#cEcVZCxpW`gJ_DE;al9KM
zBveUvpgz;?<}q`zgx&=3-3AU#(XXMa?oEwBBLY>%5Wp4EoGNDZ%4#i}lT`alTN7nr
zU&|qnxJUwPIHDrL@Zk;FH
zIJekPMF!oIz5R}+YAsD;N^c5^$`gepW5a<>?KR9Co%GOY>{Wc2W_-j}d?YtVK|@ad
z_lT%7H@x`6^4_Iuk1x~fZjtWReE5Ws&kSw%%xZz0dqt6~$&N9kGMM`w#c@|L$2^HP
zvUXM{EWxNb;ILIKy`#}t#YZdIek?gh|8gwCXiSYl=dCJKn^Wa?Fvqev__(94bf@}b
zt)U}k`IClOY~@J_y<^i|I>)Q>D^|x`e*XUB8SjF_977a&F|Ty9m&r%8Y%I*Gihb8I
z@fyxqod|AkN3Zqq8PhQ}$HUbIGR?ge`&%R%>Ar-;U%J(Vcwa$_&Sy&1<|gtCo|4bM
zxT8m;lCLdD|2j|Z25GRrm=dT>NDkL)x_)SRC{;#^Id~R{Z=Q8Yo{iXU$h^S
zkPO=IH8KBdcIG>5rP+9;S}YV)}~(uFC3J=^P|rh%(@GJCt>z~aWK
ziSJe?>I!D~bxwZlTe(17Y6FMtGb{ompY{-t6b@ythU@5i4M7A%UxaW*APB*NqLOF{
zdj^vT>h?fmGoQBaFfMofv(Pm*){JI0Xv)8Q$9NZuRW+=%UBtkI$oHF@XsA7*Qy$
zmQ!TZT3cPD!(9yOL@xE(JzK
zuWj17IqsT{PkG9vV`H~vF9eSwUV;*Ld%mNaJXcg&+1$4-`nth4p_*{;R!lw2h*Kr~EvvsQo@DA230JXQy0
z!WQ>BQQqbFv((hKEu5aqzY#_RdI-UA;d9JsbNW{*Dn8;tM=Y{l19_FqBE849d&k#O
zO?L_XiO}*WTNW^MYeyjY(xk=3LUaVc@L$WVp9Q9WD0uwCUZNu)7ndF~R??9ovJhZ#
zL?1>>49f&&JfHk=37?1T6Ft5e0YrYKI$oazZpIi2h7$u+Ti+T(WGASiFJ>>H2Uci}tnoPweMW$NXb9xM#iCEy+Bt8Z7^y3D}=1xUwJ7cD8Y
z#2Kg=N9=RZkV%j5$3g?O!@6O1?eIy0mJ^b5A0X_p^X{HBo@H4>3
z00=4AnWnzx6)5$CaT-i$Z{meVKKZmtYEQ}$izjwv5hQuGo