From c44c4cac39baa64ef86cb4c6ace92099a4cd79a4 Mon Sep 17 00:00:00 2001 From: Nicola Franco Date: Fri, 22 May 2026 20:01:59 +0200 Subject: [PATCH 1/6] fix: resolve top-level 'judge' dict before falling back to gpt-4-0613 default When the orchestrator runs the post-attack evaluation pipeline, it calls _resolve_judges_from_config() with no arguments on the attack_config. The method correctly checked for a 'judges' list but skipped the common 'judge' dict format used by Ollama/local examples, falling through to the hardcoded 'gpt-4-0613' default and crashing with a missing-credentials error for users without an OpenAI API key. Resolution order is now: 1. 'judges' list in raw config 2. 'judge' dict in raw config (wrapped in a list) 3. technique_params fallback 4. gpt-4-0613 / jailbreakbench hardcoded defaults Co-authored-by: Copilot <223556219+Copilot@users.noreply.github.com> --- hackagent/attacks/evaluator/evaluation_step.py | 12 ++++++++++-- 1 file changed, 10 insertions(+), 2 deletions(-) diff --git a/hackagent/attacks/evaluator/evaluation_step.py b/hackagent/attacks/evaluator/evaluation_step.py index ef68c080..a355f5b7 100644 --- a/hackagent/attacks/evaluator/evaluation_step.py +++ b/hackagent/attacks/evaluator/evaluation_step.py @@ -355,8 +355,11 @@ def _resolve_judges_from_config( """ Resolve the judges list from ``_raw_config``. - If no top-level ``judges`` key is present, builds a single-judge - fallback from *technique_params* for backward compatibility. + Resolution order: + 1. Top-level ``judges`` list in raw config. + 2. Top-level ``judge`` dict in raw config (wrapped in a list). + 3. ``technique_params["judge"]`` string (legacy fallback). + 4. ``default_judge`` / ``default_type`` hardcoded defaults. Args: technique_params: Technique-specific params dict with legacy @@ -371,6 +374,11 @@ def _resolve_judges_from_config( if isinstance(judges, list) and judges: return judges + # Use the top-level "judge" dict if present (e.g. from Ollama/local configs). + raw_judge = self._raw_config.get("judge") + if isinstance(raw_judge, dict) and raw_judge: + return [raw_judge] + tp = technique_params or {} judge_model = tp.get("judge", default_judge) judge_type = tp.get("judge_type") or self.infer_judge_type( From 35656c32986c32e84fd4f155d13842d17a1a8d90 Mon Sep 17 00:00:00 2001 From: Nicola Franco Date: Fri, 22 May 2026 20:23:27 +0200 Subject: [PATCH 2/6] fix: replace hardcoded OpenAI model defaults with local Ollama defaults Remove all gpt-4/gpt-4o-mini hardcoded defaults from attacks, techniques, and judge resolution so the tool works without any external API key. Changes: - evaluation_step._resolve_judges_from_config: default_judge now uses DEFAULT_JUDGE_IDENTIFIER (gemma3:4b via Ollama) with default_type 'harmbench' instead of 'gpt-4-0613'/'jailbreakbench'. Also injects the Ollama endpoint/agent_type when the built-in default is used. - flipattack/attack.py: goal metadata judge default changed from 'gpt-4-0613' to DEFAULT_JUDGE_IDENTIFIER. - cli/tui/attack_specs.py: PAIR attacker default changed from 'gpt-4' and PAP attacker default changed from 'gpt-4o-mini' to DEFAULT_ATTACKER_IDENTIFIER (gemma3:4b). --- hackagent/attacks/evaluator/evaluation_step.py | 14 ++++++++++++-- hackagent/attacks/techniques/flipattack/attack.py | 3 ++- hackagent/cli/tui/attack_specs.py | 9 +++++++-- 3 files changed, 21 insertions(+), 5 deletions(-) diff --git a/hackagent/attacks/evaluator/evaluation_step.py b/hackagent/attacks/evaluator/evaluation_step.py index a355f5b7..a05daada 100644 --- a/hackagent/attacks/evaluator/evaluation_step.py +++ b/hackagent/attacks/evaluator/evaluation_step.py @@ -51,6 +51,11 @@ def execute(self, input_data): from hackagent.attacks.shared.router_factory import extract_passthrough_request_config from hackagent.attacks.evaluator.sync import sync_evaluation_to_server from hackagent.attacks.techniques.advprefix.config import EvaluatorConfig +from hackagent.attacks.techniques.config import ( + DEFAULT_JUDGE_IDENTIFIER, + DEFAULT_LOCAL_AGENT_TYPE, + DEFAULT_LOCAL_MODEL_ENDPOINT, +) from hackagent.server.client import AuthenticatedClient from hackagent.router.types import AgentTypeEnum @@ -349,8 +354,8 @@ def _build_base_eval_config( def _resolve_judges_from_config( self, technique_params: Optional[Dict[str, Any]] = None, - default_judge: str = "gpt-4-0613", - default_type: str = "jailbreakbench", + default_judge: str = DEFAULT_JUDGE_IDENTIFIER, + default_type: str = "harmbench", ) -> List[Dict[str, Any]]: """ Resolve the judges list from ``_raw_config``. @@ -388,6 +393,11 @@ def _resolve_judges_from_config( "identifier": judge_model, "type": judge_type, } + # For the built-in local default, inject Ollama connectivity so it + # works out-of-the-box without any API key. + if judge_model == DEFAULT_JUDGE_IDENTIFIER: + fallback.setdefault("endpoint", DEFAULT_LOCAL_MODEL_ENDPOINT) + fallback.setdefault("agent_type", DEFAULT_LOCAL_AGENT_TYPE) for key in ( "endpoint", "agent_type", diff --git a/hackagent/attacks/techniques/flipattack/attack.py b/hackagent/attacks/techniques/flipattack/attack.py index 8b41db5a..1dcba302 100644 --- a/hackagent/attacks/techniques/flipattack/attack.py +++ b/hackagent/attacks/techniques/flipattack/attack.py @@ -41,6 +41,7 @@ from hackagent.router.router import AgentRouter from hackagent.attacks.techniques.base import BaseAttack from hackagent.attacks.shared.tui import with_tui_logging +from hackagent.attacks.techniques.config import DEFAULT_JUDGE_IDENTIFIER from . import generation, evaluation from .config import DEFAULT_FLIPATTACK_CONFIG @@ -461,7 +462,7 @@ def run(self, goals: List[str]) -> List[Dict]: "cot": flipattack_params.get("cot", False), "lang_gpt": flipattack_params.get("lang_gpt", False), "few_shot": flipattack_params.get("few_shot", False), - "judge": flipattack_params.get("judge", "gpt-4-0613"), + "judge": flipattack_params.get("judge", DEFAULT_JUDGE_IDENTIFIER), } # Initialize goal contexts upfront so goal elapsed_s covers the full diff --git a/hackagent/cli/tui/attack_specs.py b/hackagent/cli/tui/attack_specs.py index 772df0bc..80fadd26 100644 --- a/hackagent/cli/tui/attack_specs.py +++ b/hackagent/cli/tui/attack_specs.py @@ -26,6 +26,11 @@ from enum import Enum from typing import Any, Dict, List, Optional, Sequence, Tuple, Union +from hackagent.attacks.techniques.config import ( + DEFAULT_ATTACKER_IDENTIFIER, + DEFAULT_JUDGE_IDENTIFIER, +) + # ===================================================================== # Field / Spec primitives @@ -578,7 +583,7 @@ def get_all_attack_specs() -> Dict[str, AttackConfigSpec]: key="attacker.model", label="Attacker Model", field_type=FieldType.STRING, - default="gpt-4", + default=DEFAULT_ATTACKER_IDENTIFIER, description="Model ID for the attacker LLM that generates prompts.", section="Attacker LLM", ), @@ -1305,7 +1310,7 @@ def get_all_attack_specs() -> Dict[str, AttackConfigSpec]: key="attacker.identifier", label="Attacker Model", field_type=FieldType.STRING, - default="gpt-4o-mini", + default=DEFAULT_ATTACKER_IDENTIFIER, description="Model identifier for persuasive paraphrasing.", section="Attacker LLM", ), From 2da8f66e7d9416cfa8816e839eee066d454a166f Mon Sep 17 00:00:00 2001 From: Nicola Franco Date: Fri, 22 May 2026 20:29:04 +0200 Subject: [PATCH 3/6] =?UTF-8?q?bump:=20version=200.10.0=20=E2=86=92=200.10?= =?UTF-8?q?.1?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- CHANGELOG.md | 8 ++++++++ pyproject.toml | 2 +- uv.lock | 2 +- 3 files changed, 10 insertions(+), 2 deletions(-) diff --git a/CHANGELOG.md b/CHANGELOG.md index 6b7a995b..1420e940 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -1,3 +1,11 @@ +## v0.10.1 (2026-05-22) + +### fix + +- replace hardcoded OpenAI model defaults with local Ollama defaults +- resolve top-level 'judge' dict before falling back to gpt-4-0613 default +- move examples/ inside hackagent package for correct wheel packaging + ## v0.10.0 (2026-05-22) ### ✨ Features diff --git a/pyproject.toml b/pyproject.toml index 2fef50df..47a1d729 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -1,6 +1,6 @@ [project] name = "hackagent" -version = "0.10.0" +version = "0.10.1" description = "HackAgent is an open-source security toolkit to detect vulnerabilities of your AI Agents." authors = [ {name = "AI Security Lab", email = "ais@ai4i.it"} diff --git a/uv.lock b/uv.lock index b4c831c5..a6ce3655 100644 --- a/uv.lock +++ b/uv.lock @@ -2368,7 +2368,7 @@ wheels = [ [[package]] name = "hackagent" -version = "0.9.1" +version = "0.10.0" source = { editable = "." } dependencies = [ { name = "click" }, From bc911ea7334bd8076dcdee76e1915fd6c66ecef6 Mon Sep 17 00:00:00 2001 From: Nicola Franco Date: Fri, 22 May 2026 20:40:43 +0200 Subject: [PATCH 4/6] chore(ci): add dependabot auto-merge workflow Enables auto-merge (squash) for all dependabot PRs targeting main. GitHub will merge automatically once all required CI checks pass. --- .github/workflows/dependabot-automerge.yml | 21 +++++++++++++++++++++ 1 file changed, 21 insertions(+) create mode 100644 .github/workflows/dependabot-automerge.yml diff --git a/.github/workflows/dependabot-automerge.yml b/.github/workflows/dependabot-automerge.yml new file mode 100644 index 00000000..d7529fa3 --- /dev/null +++ b/.github/workflows/dependabot-automerge.yml @@ -0,0 +1,21 @@ +name: Dependabot Auto-merge + +on: + pull_request: + branches: ["main"] + +permissions: + contents: write + pull-requests: write + +jobs: + auto-merge: + name: Auto-merge Dependabot PR + runs-on: ubuntu-latest + if: github.actor == 'dependabot[bot]' + steps: + - name: Enable auto-merge + run: gh pr merge --auto --squash "$PR_URL" + env: + PR_URL: ${{ github.event.pull_request.html_url }} + GH_TOKEN: ${{ secrets.GITHUB_TOKEN }} From 033e4e09982c8785da8567f3dd6e0837d878d2ac Mon Sep 17 00:00:00 2001 From: Nicola Franco Date: Fri, 22 May 2026 20:44:23 +0200 Subject: [PATCH 5/6] fix(ci): remove unused DEFAULT_JUDGE_IDENTIFIER import from attack_specs --- hackagent/cli/tui/attack_specs.py | 5 +---- 1 file changed, 1 insertion(+), 4 deletions(-) diff --git a/hackagent/cli/tui/attack_specs.py b/hackagent/cli/tui/attack_specs.py index 80fadd26..9e07ea2e 100644 --- a/hackagent/cli/tui/attack_specs.py +++ b/hackagent/cli/tui/attack_specs.py @@ -26,10 +26,7 @@ from enum import Enum from typing import Any, Dict, List, Optional, Sequence, Tuple, Union -from hackagent.attacks.techniques.config import ( - DEFAULT_ATTACKER_IDENTIFIER, - DEFAULT_JUDGE_IDENTIFIER, -) +from hackagent.attacks.techniques.config import DEFAULT_ATTACKER_IDENTIFIER # ===================================================================== From be51dfb4d6a92a447345f4d6db1958984f17ae6b Mon Sep 17 00:00:00 2001 From: "copilot-swe-agent[bot]" <198982749+Copilot@users.noreply.github.com> Date: Fri, 22 May 2026 19:00:26 +0000 Subject: [PATCH 6/6] Fix stale default judge expectations in evaluation-step integration test --- tests/integration/attacks/test_evaluation_step.py | 7 ++++--- 1 file changed, 4 insertions(+), 3 deletions(-) diff --git a/tests/integration/attacks/test_evaluation_step.py b/tests/integration/attacks/test_evaluation_step.py index aec5cfd6..52d279f4 100644 --- a/tests/integration/attacks/test_evaluation_step.py +++ b/tests/integration/attacks/test_evaluation_step.py @@ -41,6 +41,7 @@ JUDGE_TYPE_LABELS, MERGE_KEYS, ) +from hackagent.attacks.techniques.config import DEFAULT_JUDGE_IDENTIFIER from hackagent.router.types import AgentTypeEnum logger = logging.getLogger(__name__) @@ -297,9 +298,9 @@ def test_fallback_to_defaults_with_no_params(self): judges = step._resolve_judges_from_config() assert len(judges) == 1 - assert judges[0]["identifier"] == "gpt-4-0613" - # default_type in _resolve_judges_from_config is "jailbreakbench" - assert judges[0]["type"] == "jailbreakbench" + assert judges[0]["identifier"] == DEFAULT_JUDGE_IDENTIFIER + # default_type in _resolve_judges_from_config is "harmbench" + assert judges[0]["type"] == "harmbench" def test_multiple_judges(self): """Test with multiple judges configured."""