Skip to content

Commit 56fe56a

Browse files
committed
enforce single evaluator upload per command
- Reject comma-separated --entry values in upload command
- Enforce single test selection after discovery
- Fix non-interactive mode to auto-select first test only
- Simplify upload logic by removing loop (only 1 evaluator per run)

This prevents duplicate evaluator creation when the same test is discovered multiple times or when using ep create rft.
1 parent bd1be95 commit 56fe56a

File tree

2 files changed

+67
-56
lines changed

2 files changed

+67
-56
lines changed

eval_protocol/cli_commands/upload.py

Lines changed: 62 additions & 55 deletions
Original file line numberDiff line numberDiff line change
@@ -184,23 +184,32 @@ def upload_command(args: argparse.Namespace) -> int:
184184
entries_arg = getattr(args, "entry", None)
185185
non_interactive: bool = bool(getattr(args, "yes", False))
186186
if entries_arg:
187-
entries = [e.strip() for e in re.split(r"[,\s]+", entries_arg) if e.strip()]
188-
selected_specs: list[tuple[str, str]] = []
189-
for e in entries:
190-
qualname, resolved_path = _resolve_entry_to_qual_and_source(e, root)
191-
selected_specs.append((qualname, resolved_path))
187+
# Only support single entry, not comma-separated values
188+
entry = entries_arg.strip()
189+
if "," in entry:
190+
print("Error: --entry only supports uploading one evaluator at a time.")
191+
print("Please specify a single entry in the format: module::function or path::function")
192+
return 1
193+
qualname, resolved_path = _resolve_entry_to_qual_and_source(entry, root)
194+
selected_specs: list[tuple[str, str]] = [(qualname, resolved_path)]
192195
else:
193196
selected_tests = _discover_and_select_tests(root, non_interactive=non_interactive)
194197
if not selected_tests:
195198
return 1
199+
200+
# Enforce single selection
201+
if len(selected_tests) > 1:
202+
print(f"Error: Multiple tests selected ({len(selected_tests)}), but only one can be uploaded at a time.")
203+
print("Please select exactly one test to upload.")
204+
return 1
205+
196206
# Warn about parameterized tests
197-
parameterized_tests = [t for t in selected_tests if t.has_parametrize]
198-
if parameterized_tests:
199-
print("\nNote: Parameterized tests will be uploaded as a single evaluator that")
207+
if selected_tests[0].has_parametrize:
208+
print("\nNote: This parameterized test will be uploaded as a single evaluator that")
200209
print(" handles all parameter combinations. The evaluator will work with")
201210
print(" the same logic regardless of which model/parameters are used.")
202211

203-
selected_specs = [(t.qualname, t.file_path) for t in selected_tests]
212+
selected_specs = [(selected_tests[0].qualname, selected_tests[0].file_path)]
204213

205214
base_id = getattr(args, "id", None)
206215
display_name = getattr(args, "display_name", None)
@@ -256,53 +265,51 @@ def upload_command(args: argparse.Namespace) -> int:
256265
except Exception as e:
257266
print(f"Warning: Skipped Fireworks secret registration due to error: {e}")
258267

259-
exit_code = 0
260-
for i, (qualname, source_file_path) in enumerate(selected_specs):
261-
# Generate a short default ID from just the test function name
262-
if base_id:
263-
evaluator_id = base_id
264-
if len(selected_specs) > 1:
265-
evaluator_id = f"{base_id}-{i + 1}"
268+
# selected_specs is guaranteed to have exactly 1 item at this point
269+
qualname, source_file_path = selected_specs[0]
270+
271+
# Generate evaluator ID
272+
if base_id:
273+
evaluator_id = base_id
274+
else:
275+
# Extract just the test function name from qualname
276+
test_func_name = qualname.split(".")[-1]
277+
# Extract source file name (e.g., "test_gpqa.py" -> "test_gpqa")
278+
if source_file_path:
279+
source_file_name = Path(source_file_path).stem
266280
else:
267-
# Extract just the test function name from qualname
268-
test_func_name = qualname.split(".")[-1]
269-
# Extract source file name (e.g., "test_gpqa.py" -> "test_gpqa")
270-
if source_file_path:
271-
source_file_name = Path(source_file_path).stem
272-
else:
273-
source_file_name = "eval"
274-
# Create a shorter ID: filename-testname
275-
evaluator_id = f"{source_file_name}-{test_func_name}"
281+
source_file_name = "eval"
282+
# Create a shorter ID: filename-testname
283+
evaluator_id = f"{source_file_name}-{test_func_name}"
276284

277-
# Normalize the evaluator ID to meet Fireworks requirements
278-
evaluator_id = _normalize_evaluator_id(evaluator_id)
285+
# Normalize the evaluator ID to meet Fireworks requirements
286+
evaluator_id = _normalize_evaluator_id(evaluator_id)
279287

280-
# Compute entry point metadata for backend as a pytest nodeid usable with `pytest <entrypoint>`
281-
# Always prefer a path-based nodeid to work in plain pytest environments (server may not use --pyargs)
282-
func_name = qualname.split(".")[-1]
283-
entry_point = _build_entry_point(root, source_file_path, func_name)
288+
# Compute entry point metadata for backend as a pytest nodeid usable with `pytest <entrypoint>`
289+
# Always prefer a path-based nodeid to work in plain pytest environments (server may not use --pyargs)
290+
func_name = qualname.split(".")[-1]
291+
entry_point = _build_entry_point(root, source_file_path, func_name)
284292

285-
print(f"\nUploading evaluator '{evaluator_id}' for {qualname.split('.')[-1]}...")
286-
try:
287-
test_dir = root
288-
metric_name = os.path.basename(test_dir) or "metric"
289-
result = create_evaluation(
290-
evaluator_id=evaluator_id,
291-
metric_folders=[f"{metric_name}={test_dir}"],
292-
display_name=display_name or evaluator_id,
293-
description=description or f"Evaluator for {qualname}",
294-
force=force,
295-
entry_point=entry_point,
296-
)
297-
name = result.get("name", evaluator_id) if isinstance(result, dict) else evaluator_id
298-
299-
# Print success message with Fireworks dashboard link
300-
print(f"\n✅ Successfully uploaded evaluator: {evaluator_id}")
301-
print("📊 View in Fireworks Dashboard:")
302-
dashboard_url = _build_evaluator_dashboard_url(evaluator_id)
303-
print(f" {dashboard_url}\n")
304-
except Exception as e:
305-
print(f"Failed to upload {qualname}: {e}")
306-
exit_code = 2
307-
308-
return exit_code
293+
print(f"\nUploading evaluator '{evaluator_id}' for {qualname.split('.')[-1]}...")
294+
try:
295+
test_dir = root
296+
metric_name = os.path.basename(test_dir) or "metric"
297+
result = create_evaluation(
298+
evaluator_id=evaluator_id,
299+
metric_folders=[f"{metric_name}={test_dir}"],
300+
display_name=display_name or evaluator_id,
301+
description=description or f"Evaluator for {qualname}",
302+
force=force,
303+
entry_point=entry_point,
304+
)
305+
name = result.get("name", evaluator_id) if isinstance(result, dict) else evaluator_id
306+
307+
# Print success message with Fireworks dashboard link
308+
print(f"\n✅ Successfully uploaded evaluator: {evaluator_id}")
309+
print("📊 View in Fireworks Dashboard:")
310+
dashboard_url = _build_evaluator_dashboard_url(evaluator_id)
311+
print(f" {dashboard_url}\n")
312+
return 0
313+
except Exception as e:
314+
print(f"Failed to upload {qualname}: {e}")
315+
return 2

eval_protocol/cli_commands/utils.py

Lines changed: 5 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -334,7 +334,11 @@ def _prompt_select_fallback(tests: list[DiscoveredTest]) -> list[DiscoveredTest]
334334
def _prompt_select(tests: list[DiscoveredTest], non_interactive: bool) -> list[DiscoveredTest]:
335335
"""Prompt user to select tests to upload."""
336336
if non_interactive:
337-
return tests
337+
# In non-interactive mode, auto-select only the first test
338+
if len(tests) > 1:
339+
print(f"Note: {len(tests)} tests discovered. Auto-selecting first test in non-interactive mode:")
340+
print(f" {_format_test_choice(tests[0], 1)}")
341+
return [tests[0]]
338342

339343
return _prompt_select_interactive(tests)
340344

0 commit comments

Comments (0)