From f67b432d2fb319815a3aef34d89add357f91b3b2 Mon Sep 17 00:00:00 2001
From: James Braza <jamesbraza@gmail.com>
Date: Fri, 6 Jun 2025 14:37:34 -0700
Subject: [PATCH 1/3] Reusing extract_answer_loose in accuracy_reward

---
 src/ether0/rewards.py | 15 ++++++---------
 1 file changed, 6 insertions(+), 9 deletions(-)
diff --git a/src/ether0/rewards.py b/src/ether0/rewards.py
index a86bce7..911c94e 100644
--- a/src/ether0/rewards.py
+++ b/src/ether0/rewards.py
@@ -19,7 +19,7 @@
 
 from ether0.clients import fetch_forward_rxn, fetch_purchasable, fetch_solubility
 from ether0.data import is_reasonable_fp, is_reasonable_ring_system, mol_from_smiles
-from ether0.model_prompts import extract_thought_answer_strict
+from ether0.model_prompts import extract_answer_loose, extract_thought_answer_strict
 from ether0.models import RewardFunctionInfo, RewardReason
 
 block = BlockLogs()
@@ -702,14 +702,11 @@ def accuracy_reward(
         reward_info = RewardFunctionInfo.model_validate(info)
         fxn_name, answer_info, problem_type = tuple(reward_info.model_dump().values())
         try:
-            if test:
-                answer: str | None = (
-                    content.split("<answer>")[1].split("</answer>")[0]
-                    if "<answer>" in content
-                    else content
-                )
-            else:
-                answer = extract_thought_answer_strict(content, reasoning=reasoning)[1]
+            answer: str | None = (
+                extract_answer_loose(content)
+                if test
+                else extract_thought_answer_strict(content, reasoning=reasoning)[1]
+            )
             if answer is not None:
                 # During test time, see if full SMILES string was given as input
                 if problem_type == "valid_mol_eval" and test:

From e86f8f558f7365ad022044f71af31ec21002ddca Mon Sep 17 00:00:00 2001
From: James Braza <jamesbraza@gmail.com>
Date: Fri, 6 Jun 2025 14:39:20 -0700
Subject: [PATCH 2/3] Moved README baselines example to use accuracy_reward
 with test flag

---
 README.md | 21 ++++++++++++---------
 1 file changed, 12 insertions(+), 9 deletions(-)

diff --git a/README.md b/README.md
index 68ebfff..9368d4d 100644
--- a/README.md
+++ b/README.md
@@ -163,9 +163,8 @@ from lmi import LiteLLMModel
 from tqdm.asyncio import tqdm_asyncio as asyncio
 
 from ether0.data import get_problem_category
-from ether0.model_prompts import LOOSE_XML_ANSWER_USER_PROMPT, extract_answer_loose
-from ether0.models import RewardFunctionInfo
-from ether0.rewards import EVAL_FUNCTIONS
+from ether0.model_prompts import LOOSE_XML_ANSWER_USER_PROMPT
+from ether0.rewards import accuracy_reward
 
 # Add LLM prompt of your making to the dataset
 test_ds = load_dataset("futurehouse/ether0-benchmark", split="test").map(
@@ -180,13 +179,17 @@ results = await asyncio.gather(
 )
 
 # Compute rewards
-per_category_rewards = defaultdict(list)
-for row, result in zip(test_ds, results, strict=True):
-    reward_info = RewardFunctionInfo.model_validate(row["solution"])
-    yhat = extract_answer_loose(result[0].text)
-    reward = EVAL_FUNCTIONS[reward_info.fxn_name](yhat=yhat, y=reward_info.answer_info)
-    per_category_rewards[get_problem_category(reward_info.problem_type)].append(reward)
+rewards = accuracy_reward(
+    completions=[result[0].text for result in results],
+    solution=[row["solution"] for row in test_ds],
+    reasoning=False,
+    test=True,  # Use test flag for third party LLMs using loose prompt
+)
 
+# Aggregate rewards by problem category for understandability
+per_category_rewards = defaultdict(list)
+for prob_type, reward in zip(test_ds["problem_type"], rewards, strict=True):
+    per_category_rewards[get_problem_category(prob_type)].append(reward)
 for category, rewards in sorted(per_category_rewards.items()):
     print(
         f"In category {category!r} of {len(rewards)} questions,"

From a5e140de4e4c152e7cb6f8e64ab3f46791958fdd Mon Sep 17 00:00:00 2001
From: James Braza <jamesbraza@gmail.com>
Date: Fri, 6 Jun 2025 15:12:55 -0700
Subject: [PATCH 3/3] Reverted to original example, with comment
 mentioning the existence of accuracy_reward

---
 README.md | 25 +++++++++++++------------
 1 file changed, 13 insertions(+), 12 deletions(-)

diff --git a/README.md b/README.md
index 9368d4d..ef1b4aa 100644
--- a/README.md
+++ b/README.md
@@ -163,8 +163,9 @@ from lmi import LiteLLMModel
 from tqdm.asyncio import tqdm_asyncio as asyncio
 
 from ether0.data import get_problem_category
-from ether0.model_prompts import LOOSE_XML_ANSWER_USER_PROMPT
-from ether0.rewards import accuracy_reward
+from ether0.model_prompts import LOOSE_XML_ANSWER_USER_PROMPT, extract_answer_loose
+from ether0.models import RewardFunctionInfo
+from ether0.rewards import EVAL_FUNCTIONS
 
 # Add LLM prompt of your making to the dataset
 test_ds = load_dataset("futurehouse/ether0-benchmark", split="test").map(
@@ -179,17 +180,17 @@ results = await asyncio.gather(
 )
 
 # Compute rewards
-rewards = accuracy_reward(
-    completions=[result[0].text for result in results],
-    solution=[row["solution"] for row in test_ds],
-    reasoning=False,
-    test=True,  # Use test flag for third party LLMs using loose prompt
-)
-
-# Aggregate rewards by problem category for understandability
 per_category_rewards = defaultdict(list)
-for prob_type, reward in zip(test_ds["problem_type"], rewards, strict=True):
-    per_category_rewards[get_problem_category(prob_type)].append(reward)
+for row, result in zip(test_ds, results, strict=True):
+    # NOTE: you can also use `ether0.rewards.accuracy_reward`,
+    # but we decided to go a bit "lower level" for this demo
+    reward_info = RewardFunctionInfo.model_validate(row["solution"])
+    yhat = extract_answer_loose(result[0].text)
+    reward = EVAL_FUNCTIONS[reward_info.fxn_name](
+        yhat=yhat, y=reward_info.answer_info, test=True
+    )
+    per_category_rewards[get_problem_category(reward_info.problem_type)].append(reward)
+
 for category, rewards in sorted(per_category_rewards.items()):
     print(
         f"In category {category!r} of {len(rewards)} questions,"