From f67b432d2fb319815a3aef34d89add357f91b3b2 Mon Sep 17 00:00:00 2001 From: James Braza Date: Fri, 6 Jun 2025 14:37:34 -0700 Subject: [PATCH 1/3] Reusing extract_answer_loose in accuracy_reward --- src/ether0/rewards.py | 15 ++++++--------- 1 file changed, 6 insertions(+), 9 deletions(-) diff --git a/src/ether0/rewards.py b/src/ether0/rewards.py index a86bce7..911c94e 100644 --- a/src/ether0/rewards.py +++ b/src/ether0/rewards.py @@ -19,7 +19,7 @@ from ether0.clients import fetch_forward_rxn, fetch_purchasable, fetch_solubility from ether0.data import is_reasonable_fp, is_reasonable_ring_system, mol_from_smiles -from ether0.model_prompts import extract_thought_answer_strict +from ether0.model_prompts import extract_answer_loose, extract_thought_answer_strict from ether0.models import RewardFunctionInfo, RewardReason block = BlockLogs() @@ -702,14 +702,11 @@ def accuracy_reward( reward_info = RewardFunctionInfo.model_validate(info) fxn_name, answer_info, problem_type = tuple(reward_info.model_dump().values()) try: - if test: - answer: str | None = ( - content.split("<answer>")[1].split("</answer>")[0] - if "<answer>" in content - else content - ) - else: - answer = extract_thought_answer_strict(content, reasoning=reasoning)[1] + answer: str | None = ( + extract_answer_loose(content) + if test + else extract_thought_answer_strict(content, reasoning=reasoning)[1] + ) if answer is not None: # During test time, see if full SMILES string was given as input if problem_type == "valid_mol_eval" and test: From e86f8f558f7365ad022044f71af31ec21002ddca Mon Sep 17 00:00:00 2001 From: James Braza Date: Fri, 6 Jun 2025 14:39:20 -0700 Subject: [PATCH 2/3] Moved README baselines example to use accuracy_reward with test flag --- README.md | 21 ++++++++++++--------- 1 file changed, 12 insertions(+), 9 deletions(-) diff --git a/README.md b/README.md index 68ebfff..9368d4d 100644 --- a/README.md +++ b/README.md @@ -163,9 +163,8 @@ from lmi import LiteLLMModel from tqdm.asyncio import tqdm_asyncio as asyncio 
from ether0.data import get_problem_category -from ether0.model_prompts import LOOSE_XML_ANSWER_USER_PROMPT, extract_answer_loose -from ether0.models import RewardFunctionInfo -from ether0.rewards import EVAL_FUNCTIONS +from ether0.model_prompts import LOOSE_XML_ANSWER_USER_PROMPT +from ether0.rewards import accuracy_reward # Add LLM prompt of your making to the dataset test_ds = load_dataset("futurehouse/ether0-benchmark", split="test").map( @@ -180,13 +179,17 @@ results = await asyncio.gather( ) # Compute rewards -per_category_rewards = defaultdict(list) -for row, result in zip(test_ds, results, strict=True): - reward_info = RewardFunctionInfo.model_validate(row["solution"]) - yhat = extract_answer_loose(result[0].text) - reward = EVAL_FUNCTIONS[reward_info.fxn_name](yhat=yhat, y=reward_info.answer_info) - per_category_rewards[get_problem_category(reward_info.problem_type)].append(reward) +rewards = accuracy_reward( + completions=[result[0].text for result in results], + solution=[row["solution"] for row in test_ds], + reasoning=False, + test=True, # Use test flag for third party LLMs using loose prompt +) +# Aggregate rewards by problem category for understandability +per_category_rewards = defaultdict(list) +for prob_type, reward in zip(test_ds["problem_type"], rewards, strict=True): + per_category_rewards[get_problem_category(prob_type)].append(reward) for category, rewards in sorted(per_category_rewards.items()): print( f"In category {category!r} of {len(rewards)} questions," From a5e140de4e4c152e7cb6f8e64ab3f46791958fdd Mon Sep 17 00:00:00 2001 From: James Braza Date: Fri, 6 Jun 2025 15:12:55 -0700 Subject: [PATCH 3/3] Reverted back to original example, with comment mentioning the existence of accuracy_reward --- README.md | 25 +++++++++++++------------ 1 file changed, 13 insertions(+), 12 deletions(-) diff --git a/README.md b/README.md index 9368d4d..ef1b4aa 100644 --- a/README.md +++ b/README.md @@ -163,8 +163,9 @@ from lmi import LiteLLMModel from 
tqdm.asyncio import tqdm_asyncio as asyncio from ether0.data import get_problem_category -from ether0.model_prompts import LOOSE_XML_ANSWER_USER_PROMPT -from ether0.rewards import accuracy_reward +from ether0.model_prompts import LOOSE_XML_ANSWER_USER_PROMPT, extract_answer_loose +from ether0.models import RewardFunctionInfo +from ether0.rewards import EVAL_FUNCTIONS # Add LLM prompt of your making to the dataset test_ds = load_dataset("futurehouse/ether0-benchmark", split="test").map( @@ -179,17 +180,17 @@ results = await asyncio.gather( ) # Compute rewards -rewards = accuracy_reward( - completions=[result[0].text for result in results], - solution=[row["solution"] for row in test_ds], - reasoning=False, - test=True, # Use test flag for third party LLMs using loose prompt -) - -# Aggregate rewards by problem category for understandability per_category_rewards = defaultdict(list) -for prob_type, reward in zip(test_ds["problem_type"], rewards, strict=True): - per_category_rewards[get_problem_category(prob_type)].append(reward) +for row, result in zip(test_ds, results, strict=True): + # NOTE: you can also use `ether0.rewards.accuracy_reward`, + # but we decided to go a bit "lower level" for this demo + reward_info = RewardFunctionInfo.model_validate(row["solution"]) + yhat = extract_answer_loose(result[0].text) + reward = EVAL_FUNCTIONS[reward_info.fxn_name]( + yhat=yhat, y=reward_info.answer_info, test=True + ) + per_category_rewards[get_problem_category(reward_info.problem_type)].append(reward) + for category, rewards in sorted(per_category_rewards.items()): print( f"In category {category!r} of {len(rewards)} questions,"