From d95ed3b636c1e2cc088b697fe02ed8de5f623899 Mon Sep 17 00:00:00 2001 From: Ryan Hatter <25823361+RNHTTR@users.noreply.github.com> Date: Fri, 23 Jan 2026 21:41:40 -0500 Subject: [PATCH 1/2] Fix bug in preprocess_fn codepath When passing a preprocess_fn to the `@evaluation_test` decorator, it fails with the following error: ``` if preprocess_fn: > if not data_loaders: ^^^^^^^^^^^^ E UnboundLocalError: cannot access local variable 'data_loaders' where it is not associated with a value .venv/lib/python3.12/site-packages/eval_protocol/pytest/evaluation_test.py:350: UnboundLocalError ``` Python 3.12 eval-protocol==0.3.0 --- eval_protocol/pytest/evaluation_test.py | 7 +------ 1 file changed, 1 insertion(+), 6 deletions(-) diff --git a/eval_protocol/pytest/evaluation_test.py b/eval_protocol/pytest/evaluation_test.py index 2256b35f..57816bbd 100644 --- a/eval_protocol/pytest/evaluation_test.py +++ b/eval_protocol/pytest/evaluation_test.py @@ -351,12 +351,7 @@ def _log_eval_error(status: Status, rows: list[EvaluationRow] | None, passed: bo apply preprocess_fn. """ if preprocess_fn: - if not data_loaders: - data = preprocess_fn(data) - else: - raise ValueError( - "preprocess_fn should not be used with data_loaders. Pass preprocess_fn to data_loaders instead." - ) + data = preprocess_fn(data) for row in data: # generate a stable row_id for each row From 6bebd55ef8bb808572fa05ed729ec2ab87daf220 Mon Sep 17 00:00:00 2001 From: Benny Chen Date: Sun, 8 Feb 2026 21:29:48 -0800 Subject: [PATCH 2/2] Avoid double preprocess with data loaders and add tests --- eval_protocol/pytest/evaluation_test.py | 12 +-- .../pytest/test_preprocess_fn_data_loaders.py | 92 +++++++++++++++++++ 2 files changed, 98 insertions(+), 6 deletions(-) create mode 100644 tests/pytest/test_preprocess_fn_data_loaders.py diff --git a/eval_protocol/pytest/evaluation_test.py b/eval_protocol/pytest/evaluation_test.py index 57816bbd..e70f8794 100644 --- a/eval_protocol/pytest/evaluation_test.py +++ b/eval_protocol/pytest/evaluation_test.py @@ -307,6 +307,7 @@ def _log_eval_error(status: Status, rows: list[EvaluationRow] | None, passed: bo try: # Handle dataset loading data: list[EvaluationRow] = [] + data_loader_rows_preprocessed = False # Track all rows processed in the current run for error logging processed_rows_in_run: list[EvaluationRow] = [] if "data_loaders" in kwargs and kwargs["data_loaders"] is not None: @@ -318,6 +319,7 @@ def _log_eval_error(status: Status, rows: list[EvaluationRow] | None, passed: bo results = data_loader.load() for result in results: data.extend(result.rows) + data_loader_rows_preprocessed = data_loader_rows_preprocessed or result.preprocessed # Apply max_dataset_rows limit to data from data loaders if max_dataset_rows is not None: data = data[:max_dataset_rows] @@ -345,13 +347,11 @@ def _log_eval_error(status: Status, rows: list[EvaluationRow] | None, passed: bo if filtered_row_ids is not None: data = [row for row in data if row.input_metadata.row_id in filtered_row_ids] - """ - data_loaders handles preprocess_fn internally so we want - to specially handle data_loaders here so we don't double - apply preprocess_fn. - """ if preprocess_fn: - data = preprocess_fn(data) + # If data loaders already applied preprocessing, skip the decorator-level + # preprocess_fn to avoid running the same transform twice. + if not data_loader_rows_preprocessed: + data = preprocess_fn(data) for row in data: # generate a stable row_id for each row diff --git a/tests/pytest/test_preprocess_fn_data_loaders.py b/tests/pytest/test_preprocess_fn_data_loaders.py new file mode 100644 index 00000000..60b91ebc --- /dev/null +++ b/tests/pytest/test_preprocess_fn_data_loaders.py @@ -0,0 +1,92 @@ +from eval_protocol.data_loader import DynamicDataLoader +from eval_protocol.dataset_logger.dataset_logger import DatasetLogger +from eval_protocol.models import EvaluateResult, EvaluationRow, Message +from eval_protocol.pytest import evaluation_test +from eval_protocol.pytest.default_no_op_rollout_processor import NoOpRolloutProcessor + + +class InMemoryLogger(DatasetLogger): + def log(self, row: EvaluationRow) -> None: + return None + + def read(self) -> list[EvaluationRow]: + return [] + + +class StopAfterPreprocess(Exception): + pass + + +class StopAfterPreprocessRolloutProcessor(NoOpRolloutProcessor): + def setup(self) -> None: + raise StopAfterPreprocess("Stop after preprocessing for focused test assertions") + + +def _build_rows() -> list[EvaluationRow]: + return [ + EvaluationRow( + messages=[ + Message(role="user", content="question"), + Message(role="assistant", content="answer"), + ] + ) + ] + + +async def test_preprocess_fn_runs_with_data_loader_without_loader_preprocess(): + call_count = {"decorator_preprocess": 0} + + def decorator_preprocess(rows: list[EvaluationRow]) -> list[EvaluationRow]: + call_count["decorator_preprocess"] += 1 + return rows + + data_loader = DynamicDataLoader(generators=[_build_rows]) + + @evaluation_test( + data_loaders=data_loader, + preprocess_fn=decorator_preprocess, + rollout_processor=StopAfterPreprocessRolloutProcessor(), + logger=InMemoryLogger(), + ) + def eval_fn(row: EvaluationRow) -> EvaluationRow: + row.evaluation_result = EvaluateResult(score=1.0, reason="ok") + return row + + try: + await eval_fn(data_loaders=data_loader) + except StopAfterPreprocess: + pass + + assert call_count["decorator_preprocess"] == 1 + + +async def test_preprocess_fn_not_double_applied_when_data_loader_preprocess_exists(): + call_count = {"loader_preprocess": 0, "decorator_preprocess": 0} + + def loader_preprocess(rows: list[EvaluationRow]) -> list[EvaluationRow]: + call_count["loader_preprocess"] += 1 + return rows + + def decorator_preprocess(rows: list[EvaluationRow]) -> list[EvaluationRow]: + call_count["decorator_preprocess"] += 1 + return rows + + data_loader = DynamicDataLoader(generators=[_build_rows], preprocess_fn=loader_preprocess) + + @evaluation_test( + data_loaders=data_loader, + preprocess_fn=decorator_preprocess, + rollout_processor=StopAfterPreprocessRolloutProcessor(), + logger=InMemoryLogger(), + ) + def eval_fn(row: EvaluationRow) -> EvaluationRow: + row.evaluation_result = EvaluateResult(score=1.0, reason="ok") + return row + + try: + await eval_fn(data_loaders=data_loader) + except StopAfterPreprocess: + pass + + assert call_count["loader_preprocess"] == 1 + assert call_count["decorator_preprocess"] == 0