Skip to content
Merged
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
13 changes: 13 additions & 0 deletions eval_protocol/pytest/evaluation_test.py
Original file line number Diff line number Diff line change
Expand Up @@ -489,6 +489,7 @@ async def execute_run(run_idx: int, config: RolloutProcessorConfig):
async def _execute_pointwise_eval_with_semaphore(
row: EvaluationRow,
) -> EvaluationRow:
start_time = time.perf_counter()
Copy link

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Eval duration includes semaphore wait time, not just execution

The start_time is captured before async with semaphore:, which means eval_duration_seconds includes time spent waiting for the semaphore, not just the actual evaluation execution time. When concurrent evaluations compete for semaphore slots, reported durations will be artificially inflated. The start_time assignment needs to be moved inside the async with semaphore: block to measure only the actual evaluation time.

Additional Locations (1)

Fix in Cursor Fix in Web

async with semaphore:
evaluation_test_kwargs = kwargs.get("evaluation_test_kwargs") or {}
async with rollout_logging_context(
Expand All @@ -505,11 +506,15 @@ async def _execute_pointwise_eval_with_semaphore(
raise ValueError(
f"Test function {test_func.__name__} did not return an EvaluationRow instance. You must return an EvaluationRow instance from your test function decorated with @evaluation_test."
)
result.execution_metadata.eval_duration_seconds = (
time.perf_counter() - start_time
)
return result

async def _execute_groupwise_eval_with_semaphore(
rows: list[EvaluationRow],
) -> list[EvaluationRow]:
start_time = time.perf_counter()
async with semaphore:
evaluation_test_kwargs = kwargs.get("evaluation_test_kwargs") or {}
primary_rollout_id = rows[0].execution_metadata.rollout_id if rows else None
Expand All @@ -531,6 +536,9 @@ async def _execute_groupwise_eval_with_semaphore(
raise ValueError(
f"Test function {test_func.__name__} did not return a list of EvaluationRow instances. You must return a list of EvaluationRow instances from your test function decorated with @evaluation_test."
)
eval_duration = time.perf_counter() - start_time
for r in results:
r.execution_metadata.eval_duration_seconds = eval_duration
return results

if mode == "pointwise":
Expand Down Expand Up @@ -617,11 +625,16 @@ async def _collect_result(config, lst):
run_id=run_id,
rollout_ids=group_rollout_ids or None,
):
start_time = time.perf_counter()
results = await execute_pytest_with_exception_handling(
test_func=test_func,
evaluation_test_kwargs=kwargs.get("evaluation_test_kwargs") or {},
processed_dataset=input_dataset,
)
if isinstance(results, list):
eval_duration = time.perf_counter() - start_time
for r in results:
r.execution_metadata.eval_duration_seconds = eval_duration
Copy link

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Duration set before element type validation causes AttributeError

The new loop at lines 636-637 accesses r.execution_metadata.eval_duration_seconds before the validation at lines 638-642 confirms all elements are EvaluationRow instances. If the test function returns a list containing non-EvaluationRow objects, this will raise an AttributeError instead of the helpful ValueError with guidance. This is inconsistent with the pointwise case where validation occurs before setting the duration.

Fix in Cursor Fix in Web

if (
results is None
or not isinstance(results, list)
Expand Down
Loading