From 962b70f1abdb714cc826a27b5a8c13893bba91f0 Mon Sep 17 00:00:00 2001
From: Alex Andonian <alexandonian@gmail.com>
Date: Sun, 21 Sep 2025 00:14:25 -0700
Subject: [PATCH 1/4] Refine prompts in prompts.py for a single question per
 task: change references from "questions" to "question" and have submit_answer
 take a single string instead of a JSON dictionary keyed by question number.

---
 src/fhda/prompts.py | 45 ++++++++++++++++++---------------------------
 1 file changed, 18 insertions(+), 27 deletions(-)

diff --git a/src/fhda/prompts.py b/src/fhda/prompts.py
index 786ce04..4796b67 100644
--- a/src/fhda/prompts.py
+++ b/src/fhda/prompts.py
@@ -7,20 +7,20 @@
 
 CAPSULE_SYSTEM_PROMPT_MCQ = """
 You are an expert bioinformatician and seasoned biological data scientist.
-Your task is to create a comprehensive Jupyter notebook named 'notebook.ipynb' that analyzes data to answer a series of Multiple Choice Questions (MCQs).
-The notebook should contain all necessary artifacts (plots, tables, print outputs) to fully answer these questions, structured in a way that another model could use to derive the answers.
+Your task is to create a comprehensive Jupyter notebook named 'notebook.ipynb' that analyzes data to answer a Multiple Choice Question (MCQ).
+The notebook should contain all necessary artifacts (plots, tables, print outputs) to fully answer this question, structured in a way that another model could use to derive the answer.
 """
 
 CAPSULE_SYSTEM_PROMPT_OPEN = """
 You are an expert bioinformatician and seasoned biological data scientist.
-Your task is to create a comprehensive Jupyter notebook named 'notebook.ipynb' that analyzes data to answer a series of open-ended questions.
-The notebook should contain all necessary artifacts (plots, tables, print outputs) to fully answer these questions, structured in a way that another model could use to derive the answers.
+Your task is to create a comprehensive Jupyter notebook named 'notebook.ipynb' that analyzes data to answer an open-ended question.
+The notebook should contain all necessary artifacts (plots, tables, print outputs) to fully answer this question, structured in a way that another model could use to derive the answer.
 """
 
 CAPSULE_SYSTEM_PROMPT_QUERY = """
 You are an expert bioinformatician and seasoned biological data scientist.
 Your task is to create a comprehensive Jupyter notebook named 'notebook.ipynb' that analyzes data to answer a user query.
-The notebook should contain all necessary artifacts (plots, tables, print outputs) to fully answer these questions.
+The notebook should contain all necessary artifacts (plots, tables, print outputs) to fully answer this question.
 Take your time to think through the question and the data before writing any code, explore the data rigorously and defend your conclusions rigorously.
 """
 
@@ -175,28 +175,19 @@
 Remember, the final notebook should contain all necessary artifacts (plots, tables, print outputs) to solve the task provided.
 """
 SUBMIT_ANSWER_OPEN = """
-[Use the submit_answer tool to submit your final answer as a jsondictionary with keys as the question number and values as a short answer]
+[Use the submit_answer tool to submit your final answer as a single string with your short answer]
 Example output:
 ```
-submit_answer({{
-    "q1": "Short answer to question 1",
-    "q2": "Short answer to question 2",
-    "q3": "Short answer to question 3",
-    "q4": "Short answer to question 4"
-}})
+submit_answer("Your concise answer to the question")
 ```
 Remember, the final notebook should contain all necessary artifacts (plots, tables, print outputs) to solve the task provided.
 """
 SUBMIT_ANSWER_MCQ = """
-[Use the submit_answer tool to submit your final answer as a json dictionary with keys as the question number and values as the answer]
+[Use the submit_answer tool to submit your final answer as a single string with the letter choice]
 Example output:
 ```
-submit_answer({{
-    "q1": "A",
-    "q2": "B",
-    "q3": "C",
-    "q4": "D"
-}})
+submit_answer("A") or submit_answer("B") or submit_answer("C") or submit_answer("D")
+```
 Remember, the final notebook should contain all necessary artifacts (plots, tables, print outputs) to solve the task provided.
 """
 
@@ -215,10 +206,10 @@
 """
 # MCQ
 MCQ_PROMPT_TEMPLATE = f"""
-Here are the questions you need to address:
-<questions>
-{{questions}}
-</questions>
+Here is the question you need to address:
+<question>
+{{question}}
+</question>
 
 {CHAIN_OF_THOUGHT_AGNOSTIC}
 {SUBMIT_ANSWER_MCQ}
@@ -227,11 +218,11 @@
 """
 # Open answer
 OPEN_PROMPT_TEMPLATE = f"""
-Here are the questions you need to address:
+Here is the question you need to address:
 
-<questions>
-{{questions}}
-</questions>
+<question>
+{{question}}
+</question>
 
 {CHAIN_OF_THOUGHT_AGNOSTIC}
 {SUBMIT_ANSWER_OPEN}

From 78e11e281c772f8bc1db8eda1644d03b00107d22 Mon Sep 17 00:00:00 2001
From: Alex Andonian <alexandonian@gmail.com>
Date: Sun, 21 Sep 2025 00:17:42 -0700
Subject: [PATCH 2/4] Update submission instructions in prompts.py to require
 answers wrapped in XML tags for consistency across different answer types.

---
 src/fhda/prompts.py | 9 ++++++---
 1 file changed, 6 insertions(+), 3 deletions(-)

diff --git a/src/fhda/prompts.py b/src/fhda/prompts.py
index 4796b67..02f330d 100644
--- a/src/fhda/prompts.py
+++ b/src/fhda/prompts.py
@@ -168,25 +168,28 @@
 """
 SUBMIT_ANSWER_SINGLE = """
 [Use the submit_answer tool to submit your final answer as a single string]
+IMPORTANT: Wrap your answer in XML tags <answer> </answer>
 Example output:
 ```
-submit_answer("CD94") or submit_answer("-1.23")
+submit_answer("<answer>CD94</answer>") or submit_answer("<answer>-1.23</answer>")
 ```
 Remember, the final notebook should contain all necessary artifacts (plots, tables, print outputs) to solve the task provided.
 """
 SUBMIT_ANSWER_OPEN = """
 [Use the submit_answer tool to submit your final answer as a single string with your short answer]
+IMPORTANT: Wrap your answer in XML tags <answer> </answer>
 Example output:
 ```
-submit_answer("Your concise answer to the question")
+submit_answer("<answer>Your concise answer to the question</answer>")
 ```
 Remember, the final notebook should contain all necessary artifacts (plots, tables, print outputs) to solve the task provided.
 """
 SUBMIT_ANSWER_MCQ = """
 [Use the submit_answer tool to submit your final answer as a single string with the letter choice]
+IMPORTANT: Wrap your answer in XML tags <answer> </answer>
 Example output:
 ```
-submit_answer("A") or submit_answer("B") or submit_answer("C") or submit_answer("D")
+submit_answer("<answer>A</answer>") or submit_answer("<answer>B</answer>") or submit_answer("<answer>C</answer>") or submit_answer("<answer>D</answer>")
 ```
 Remember, the final notebook should contain all necessary artifacts (plots, tables, print outputs) to solve the task provided.
 """

From f1b045c9fb524cff52b169f18caf14fa52bd0f51 Mon Sep 17 00:00:00 2001
From: Alex Andonian <alexandonian@gmail.com>
Date: Sun, 21 Sep 2025 00:26:28 -0700
Subject: [PATCH 3/4] Remove in-environment answer grading from
 DataAnalysisEnv: drop the mcqs parameter and the eval/reward logic so that
 submit_answer only records the answer, closes the environment and echoes it.

---
 src/fhda/data_analysis_env.py | 75 +----------------------------------
 1 file changed, 2 insertions(+), 73 deletions(-)

diff --git a/src/fhda/data_analysis_env.py b/src/fhda/data_analysis_env.py
index 8e649d9..27daead 100644
--- a/src/fhda/data_analysis_env.py
+++ b/src/fhda/data_analysis_env.py
@@ -1,5 +1,4 @@
 import hashlib
-import json
 import logging
 import shutil
 from typing import Any, cast
@@ -10,11 +9,10 @@
     Message,
     Messages,
     Tool,
-    eval_answer,
 )
 
 from .notebook_env import NBEnvironment
-from .utils import NBLanguage, MultipleChoiceQuestion, nb_to_html
+from .utils import NBLanguage, nb_to_html
 from . import prompts
 from . import config as cfg
 
@@ -35,14 +33,12 @@ def __init__(
         correct_reward: float = 1.0,
         eval_mode: EvalAnswerMode,
         metadata: dict[str, Any] | None = None,  # used for NBEvalExpt
-        mcqs: list[MultipleChoiceQuestion] | None = None,
         **kwargs,
     ):
         super().__init__(**kwargs)
 
         self.problem_id = problem_id
         self.problem = problem
-        self.mcqs = mcqs
         self.answer = answer
         self.eval_mode = eval_mode
         self.correct_reward = correct_reward
@@ -74,80 +70,13 @@ async def submit_answer(self, answer: str | float | dict[str, Any] | None) -> st
         Args:
             answer: The answer to the problem
         """
-        # TODO: support various eval modes
         self.state.answer = answer
         self.state.done = True
         logger.info("Submitting answer and closing environment")
         await self.close()
-        correct = False
         logger.info("Answer: %s", answer)
 
-        if self.eval_mode is None:
-            return CORRECT_MSG
-
-        if isinstance(self.answer, int):
-            try:
-                answer = int(answer)  # type: ignore[arg-type]
-            except ValueError:
-                pass
-            else:
-                correct = answer == self.answer
-
-        elif isinstance(self.answer, float):
-            try:
-                answer = float(answer)  # type: ignore[arg-type]
-            except ValueError:
-                pass
-            else:
-                correct = abs(answer - self.answer) < 1e-4 * self.answer
-
-        elif isinstance(self.answer, str):
-            correct = bool(
-                await eval_answer(
-                    proposed=str(answer),
-                    correct=str(self.answer),
-                    question=self.problem,
-                    eval_mode=self.eval_mode,
-                )
-            )
-        elif isinstance(self.answer, dict):  # This is for mcqs and open questions
-            # Check if answer is a json string
-            if isinstance(answer, str):  # type: ignore[unreachable]
-                # Process json into dictionary
-                try:
-                    processed_answer = json.loads(answer)
-                except json.JSONDecodeError:
-                    return INCORRECT_MSG
-            else:
-                processed_answer = answer if isinstance(answer, dict) else {}
-
-            # Loop through each question and answer
-            for question_id, agent_answer in processed_answer.items():
-                try:
-                    ideal_answer = self.answer[question_id]
-                    question = next(
-                        q
-                        for q in self.mcqs
-                        if q.question_id.lower() == question_id.lower()
-                    )
-                    correct = bool(
-                        await eval_answer(
-                            proposed=str(agent_answer),
-                            correct=str(ideal_answer),
-                            question=question,
-                            eval_mode=self.eval_mode,
-                        )
-                    )
-                    self.question_rewards[question_id] = correct
-                except KeyError:
-                    self.question_rewards[question_id] = 0
-                average_reward = sum(self.question_rewards.values()) / len(self.mcqs)
-            correct = round(average_reward) == 1.0
-
-        if correct:
-            self.state.total_reward += self.correct_reward
-            return CORRECT_MSG
-        return INCORRECT_MSG
+        return f"Submitted answer: {answer}"
 
     @classmethod
     def from_task(

From b59fddaff44df9592142f864068185a171392dea Mon Sep 17 00:00:00 2001
From: Alex Andonian <alexandonian@gmail.com>
Date: Wed, 24 Sep 2025 20:25:41 -0700
Subject: [PATCH 4/4] Increase EXEC_TIMEOUT in NBEnvironment class from 300.0
 to 1200.0 to allow for longer execution periods.

---
 src/fhda/notebook_env.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/src/fhda/notebook_env.py b/src/fhda/notebook_env.py
index 6ec9b8f..bb4365c 100644
--- a/src/fhda/notebook_env.py
+++ b/src/fhda/notebook_env.py
@@ -113,7 +113,7 @@ async def close(self):
 
 class NBEnvironment(Environment[NBEnvironmentState]):
     NOTEBOOK_NAME: ClassVar[str] = "notebook.ipynb"
-    EXEC_TIMEOUT: ClassVar[float | None] = 300.0
+    EXEC_TIMEOUT: ClassVar[float | None] = 1200.0
 
     state: NBEnvironmentState