Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
4 changes: 3 additions & 1 deletion .gitignore
Original file line number Diff line number Diff line change
Expand Up @@ -134,4 +134,6 @@ dmypy.json


.copilotignore
tests/e2e/attacks/
tests/e2e/attacks/

.tmp
Binary file modified docs/static/videos/dashboard_demo.mov
Binary file not shown.
104 changes: 78 additions & 26 deletions hackagent/attacks/evaluator/evaluation_step.py
Original file line number Diff line number Diff line change
Expand Up @@ -1036,7 +1036,7 @@ def prepare_and_sync(self, evaluated_items: list, run_id: str):
- Add _run_id if missing
- Ensure result_id exists
- Build judge_keys
- Call _sync_to_server
- Call _sync_to_server (only if not already synced by the attack)
"""
if not self._should_sync_evaluation(evaluated_items):
self.logger.warning(
Expand Down Expand Up @@ -1095,6 +1095,14 @@ def prepare_and_sync(self, evaluated_items: list, run_id: str):
# fallback to goal_id or generated UUID
item["result_id"] = item.get("goal_id") or str(uuid4())

# Skip server sync when results were already evaluated and synced
# by the attack technique's own pipeline.
if self._already_evaluated(evaluated_items):
self.logger.info(
"Results already synced by attack pipeline — skipping prepare_and_sync"
)
return

# Build judge_keys automatically
judge_keys = self._build_judge_keys_from_data(evaluated_items)
self._sync_to_server(evaluated_items, judge_keys=judge_keys)
Expand Down Expand Up @@ -1233,44 +1241,88 @@ def _update_tracker(
evaluator_name=f"{evaluator_prefix}_{'_'.join(judges_used)}",
)

def run_full_evaluation(self, input_data: List[Dict[str, Any]]):
# 1. Resolve judges
judges_config = self._resolve_judges_from_config()
evaluator_base_config = self._build_base_eval_config()
# Keys whose presence signals that results were already evaluated
# by the attack technique's own pipeline (e.g. AdvPrefix aggregation).
_EVAL_SCORE_KEYS: frozenset = frozenset(
{
"best_score",
"success",
"eval_nj",
"eval_jb",
"eval_hb",
"eval_hbv",
"eval_nj_mean",
"eval_jb_mean",
"eval_hb_mean",
"eval_hbv_mean",
"pasr",
}
)

# 2. Run evaluation
evaluated_items = self._run_evaluation(
input_data, judges_config, evaluator_base_config
)
def _already_evaluated(self, data: List[Dict[str, Any]]) -> bool:
"""Return True when *data* already carries evaluation scores.

# 3. Enrich
error_indices = self._detect_error_indices(evaluated_items)
if error_indices:
self._mark_error_rows(evaluated_items, error_indices)
self._enrich_items_with_scores(evaluated_items, error_indices=error_indices)
Attack techniques (AdvPrefix, FlipAttack, etc.) run their own
judge evaluation inside ``run()``. Re-evaluating those results
here is at best redundant and at worst destructive — e.g.
AdvPrefix aggregated rows lack the ``completion`` field, so
judges would evaluate an empty string and overwrite correct
statuses with FAILED_JAILBREAK.
"""
if not data:
return False
sample = data[0]
if not isinstance(sample, dict):
return False
return bool(self._EVAL_SCORE_KEYS & sample.keys())

if not self._should_sync_evaluation(evaluated_items):
self.logger.warning(
"Skipping evaluation sync: no judge outputs or success signals were produced."
def run_full_evaluation(self, input_data: List[Dict[str, Any]]):
already_evaluated = self._already_evaluated(input_data)

if already_evaluated:
self.logger.info(
"Results already contain evaluation scores — skipping redundant re-evaluation, generating metrics only"
)
evaluated_items = input_data
else:
# 1. Resolve judges
judges_config = self._resolve_judges_from_config()
evaluator_base_config = self._build_base_eval_config()

# 2. Run evaluation
evaluated_items = self._run_evaluation(
input_data, judges_config, evaluator_base_config
)
return evaluated_items

# 4. Logging
self._log_evaluation_asr(evaluated_items)
# 3. Detect and mark error rows before enrichment
error_indices = self._detect_error_indices(evaluated_items)
if error_indices:
self._mark_error_rows(evaluated_items, error_indices)

# 4. Enrich with scores (pass error indices for deterministic fields)
self._enrich_items_with_scores(evaluated_items, error_indices=error_indices)

# 5. Log ASR
self._log_evaluation_asr(evaluated_items)

# 5. Tracker
self._update_tracker(evaluated_items)
# 6. Tracker
self._update_tracker(evaluated_items)

# 6. Sync row-level results
self._sync_to_server(evaluated_items)
# 7. Sync row-level results (only if there is something to sync)
if self._should_sync_evaluation(evaluated_items):
self._sync_to_server(evaluated_items)
else:
self.logger.warning(
"Skipping evaluation sync: no judge outputs or success signals were produced."
)

# 7. Generate metrics summary
# 8. Generate metrics summary
summary = generate_summary_report(evaluated_items)

# Attach summary to statistics for TUI access
self._statistics["metrics_summary"] = summary

# 8. Sync summary to backend
# 9. Sync summary to backend
self._sync_metrics_to_backend_structured(summary)

return evaluated_items
Expand Down
Loading
Loading