From 501a1acfc4771f0c987b06054a52fc5c26b6900d Mon Sep 17 00:00:00 2001 From: Karim Elmaaroufi Date: Wed, 20 May 2026 12:07:18 -0700 Subject: [PATCH 1/4] chore(prompts): drop vestigial [MUTATION COMPLETE] / [SUMMARY] protocol MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit The four prompt templates (``AUTONOMOUS_SYSTEM_PROMPT``, ``MUTATION_PROMPT_TEMPLATE``, ``SEEDLESS_INIT_PROMPT_TEMPLATE``, ``MERGE_PROMPT_TEMPLATE``) instructed the agent to emit a ``[MUTATION COMPLETE]`` / ``[MERGE COMPLETE]`` / ``[SEED GENERATION COMPLETE]`` sentinel when finished. HELIX never parsed any of those sentinels — subprocess exit is the actual stop signal and every backend handles termination internally. Separately, ``mutator.parse_mutation_summary`` scanned for ``[SUMMARY]...[END SUMMARY]`` key/value blocks that no prompt asked the agent to emit and no production code path called. Dead code, dead tests, dead protocol on both ends. Removed: - the trailing "print this completion marker" sentence + sentinel line from all four prompt templates (editing instructions preserved). - ``mutator.parse_mutation_summary`` (zero production callers). - ``tests/unit/test_semlog.py`` (sole consumer of the parser). - three prompt-substring assertions in ``test_mutator.py``, ``test_mutator_seedless.py``, and ``test_merger.py`` that pinned the presence of the removed sentinel strings. 851 unit tests pass (873 → 851 after dropping 22 sentinel-protocol assertions); ``mypy --strict src/helix/`` clean. 
Co-Authored-By: Claude Opus 4.7 --- src/helix/merger.py | 3 - src/helix/mutator.py | 48 --------- tests/unit/test_merger.py | 4 - tests/unit/test_mutator.py | 7 +- tests/unit/test_mutator_seedless.py | 5 - tests/unit/test_semlog.py | 155 ---------------------------- 6 files changed, 1 insertion(+), 221 deletions(-) delete mode 100644 tests/unit/test_semlog.py diff --git a/src/helix/merger.py b/src/helix/merger.py index 487ae1b..13982e3 100644 --- a/src/helix/merger.py +++ b/src/helix/merger.py @@ -88,9 +88,6 @@ def select_eval_subsample_for_merged_program( Candidate A is already checked out in your working directory. Apply the changes from Candidate B that are beneficial, and discard or adapt those that conflict or regress. You may read, edit, create, or delete files as needed. - -When you have finished making all your changes, output the exact text: -[MERGE COMPLETE] {turn_budget}""" # --------------------------------------------------------------------------- diff --git a/src/helix/mutator.py b/src/helix/mutator.py index 813b59c..97bfe57 100644 --- a/src/helix/mutator.py +++ b/src/helix/mutator.py @@ -44,7 +44,6 @@ - Do not request confirmation or clarification; choose a reasonable approach and continue. - If one approach fails, try an alternative and keep progressing. - Use available tools to inspect, edit, and validate changes. -- When finished, print exactly: [MUTATION COMPLETE] """ SEEDLESS_INIT_PROMPT_TEMPLATE = """\ @@ -59,9 +58,6 @@ Generate a strong initial candidate based on the goal above. Create all necessary files directly in the current working directory. Make your implementation complete and ready to be evaluated immediately. - -When you have finished creating all files, output the exact text: -[SEED GENERATION COMPLETE] {turn_budget}""" MUTATION_PROMPT_TEMPLATE = """\ @@ -79,9 +75,6 @@ ## Your Task Analyse the evaluation results above and improve the code to better achieve the objective. Make targeted, meaningful changes. 
You may read, edit, create, or delete files as needed. - -When you have finished making all your changes, output the exact text: -[MUTATION COMPLETE] {turn_budget}""" # --------------------------------------------------------------------------- @@ -478,47 +471,6 @@ def build_mutation_prompt( ) -# --------------------------------------------------------------------------- -# Mutation summary parsing -# --------------------------------------------------------------------------- - - -def parse_mutation_summary(output: str) -> dict[str, str]: - """Parse a ``[SUMMARY]...[END SUMMARY]`` block from Claude Code output. - - Extracts structured key-value pairs written by the mutation/merge agent - after ``[MUTATION COMPLETE]`` or ``[MERGE COMPLETE]``. The block format - is:: - - [SUMMARY] - files_changed: src/foo.py, src/bar.py - root_cause: ... - changes_made: ... - [END SUMMARY] - - Returns - ------- - dict[str, str] - Parsed key-value pairs. Returns an empty dict if no block is found - or the block contains no valid ``key: value`` lines. Never raises. 
- """ - result: dict[str, str] = {} - in_block = False - for line in output.splitlines(): - if "[SUMMARY]" in line: - in_block = True - continue - if "[END SUMMARY]" in line: - break - if in_block and ":" in line: - key, _, value = line.partition(":") - key = key.strip() - value = value.strip() - if key: - result[key] = value - return result - - # --------------------------------------------------------------------------- # Rate-limit detection # --------------------------------------------------------------------------- diff --git a/tests/unit/test_merger.py b/tests/unit/test_merger.py index 614e8fc..c4e2ddd 100644 --- a/tests/unit/test_merger.py +++ b/tests/unit/test_merger.py @@ -75,10 +75,6 @@ def test_default_background_when_none(self): prompt = build_merge_prompt("goal", None, None, "") assert "no additional background" in prompt - def test_contains_merge_complete_marker(self): - prompt = build_merge_prompt("goal", None, None, "") - assert "[MERGE COMPLETE]" in prompt - def test_contains_execution_instructions(self): prompt = build_merge_prompt("goal", None, None, "") assert "Task instructions:" in prompt diff --git a/tests/unit/test_mutator.py b/tests/unit/test_mutator.py index e3fb7ae..b016096 100644 --- a/tests/unit/test_mutator.py +++ b/tests/unit/test_mutator.py @@ -121,11 +121,6 @@ def test_default_background_when_none(self): prompt = build_mutation_prompt("goal", er, background=None) assert "no additional background" in prompt - def test_contains_mutation_complete_marker(self): - er = make_eval_result() - prompt = build_mutation_prompt("goal", er) - assert "[MUTATION COMPLETE]" in prompt - def test_contains_execution_instructions(self): er = make_eval_result() prompt = build_mutation_prompt("goal", er) @@ -557,7 +552,7 @@ def test_artifact_contains_rendered_prompt(self, tmp_path: Path, mocker): # Objective + autonomous-rules block are both in the rendered # prompt via build_mutation_prompt. 
assert config.objective in content - assert "[MUTATION COMPLETE]" in content + assert "Task instructions:" in content def test_gitignore_excludes_helix_artifacts(self, tmp_path: Path, mocker): """``.gitignore`` in the worktree gains entries for the prompt diff --git a/tests/unit/test_mutator_seedless.py b/tests/unit/test_mutator_seedless.py index f8fd9ab..b9734e9 100644 --- a/tests/unit/test_mutator_seedless.py +++ b/tests/unit/test_mutator_seedless.py @@ -70,11 +70,6 @@ def test_no_evaluator_section_when_absent(self): ) assert "Evaluator" not in prompt - def test_completion_signal_in_prompt(self): - """The completion signal string must appear in the prompt.""" - prompt = build_seed_generation_prompt(objective="Test objective") - assert "[SEED GENERATION COMPLETE]" in prompt - def test_returns_string(self): """build_seed_generation_prompt must return a str.""" result = build_seed_generation_prompt(objective="Test") diff --git a/tests/unit/test_semlog.py b/tests/unit/test_semlog.py deleted file mode 100644 index 33a058e..0000000 --- a/tests/unit/test_semlog.py +++ /dev/null @@ -1,155 +0,0 @@ -"""Unit tests for semantic mutation logging (semlog) functionality.""" - -from __future__ import annotations - - - -from helix.mutator import parse_mutation_summary - - -# --------------------------------------------------------------------------- -# Tests: parse_mutation_summary -# --------------------------------------------------------------------------- - - -class TestParseMutationSummaryComplete: - """test_parse_mutation_summary_complete: full block parsed correctly.""" - - def test_all_mutation_fields_parsed(self): - output = """\ -Some Claude output here... -[MUTATION COMPLETE] -[SUMMARY] -files_changed: src/model.py, src/utils.py -root_cause: The model was not normalising input features before training. -changes_made: Added a StandardScaler preprocessing step. Updated the training pipeline. -reasoning: Normalisation is the most direct fix for scale-sensitive algorithms. 
-expected_impact: Expect accuracy to improve by 5-10% based on typical gains from this fix. -[END SUMMARY] -""" - result = parse_mutation_summary(output) - - assert result["files_changed"] == "src/model.py, src/utils.py" - assert result["root_cause"] == "The model was not normalising input features before training." - assert "StandardScaler" in result["changes_made"] - assert result["reasoning"] == "Normalisation is the most direct fix for scale-sensitive algorithms." - assert "5-10%" in result["expected_impact"] - - def test_returns_dict_type(self): - output = "[SUMMARY]\nfiles_changed: foo.py\n[END SUMMARY]" - result = parse_mutation_summary(output) - assert isinstance(result, dict) - - def test_keys_are_stripped(self): - output = "[SUMMARY]\n files_changed : foo.py\n[END SUMMARY]" - result = parse_mutation_summary(output) - assert "files_changed" in result - - def test_values_are_stripped(self): - output = "[SUMMARY]\nfiles_changed: bar.py \n[END SUMMARY]" - result = parse_mutation_summary(output) - assert result["files_changed"] == "bar.py" - - def test_value_with_colon_preserved(self): - """Values containing ':' should keep everything after the first ': '.""" - output = "[SUMMARY]\nroot_cause: Issue in module: core/parser.py\n[END SUMMARY]" - result = parse_mutation_summary(output) - assert result["root_cause"] == "Issue in module: core/parser.py" - - def test_stops_at_end_summary(self): - """Lines after [END SUMMARY] should not be parsed.""" - output = ( - "[SUMMARY]\nfiles_changed: a.py\n[END SUMMARY]\nextra_key: should_not_appear\n" - ) - result = parse_mutation_summary(output) - assert "extra_key" not in result - - -class TestParseMutationSummaryMissingBlock: - """test_parse_mutation_summary_missing_block: returns empty dict gracefully (never crash).""" - - def test_returns_empty_dict_when_no_block(self): - output = "Claude Code output with no summary block at all." 
- result = parse_mutation_summary(output) - assert result == {} - - def test_returns_empty_dict_on_empty_string(self): - result = parse_mutation_summary("") - assert result == {} - - def test_returns_empty_dict_on_partial_block_no_end(self): - """[SUMMARY] without [END SUMMARY] — reads to EOF, may have partial data.""" - output = "[SUMMARY]\nfiles_changed: a.py\n" - result = parse_mutation_summary(output) - # Should not crash; may return partial dict - assert isinstance(result, dict) - - def test_returns_empty_dict_on_only_end_marker(self): - output = "[END SUMMARY]\nfiles_changed: a.py\n" - result = parse_mutation_summary(output) - assert result == {} - - def test_no_exception_on_none_like_empty_string(self): - """Ensure no crash even on edge-case whitespace-only input.""" - result = parse_mutation_summary(" \n\n\n ") - assert result == {} - - def test_no_exception_on_malformed_lines(self): - output = "[SUMMARY]\nthis line has no colon separator\nanother bad line\n[END SUMMARY]" - result = parse_mutation_summary(output) - assert isinstance(result, dict) - # None of the malformed lines should produce keys - assert len(result) == 0 - - -class TestParseMergeSummary: - """test_parse_merge_summary: merge-specific fields parsed.""" - - def test_all_merge_fields_parsed(self): - output = """\ -Merging candidates... -[MERGE COMPLETE] -[SUMMARY] -files_changed: src/solver.py, tests/test_solver.py -candidate_a_kept: The base algorithm from A because it had better convergence. -candidate_b_applied: The caching layer from B because it reduced redundant computation. -conflicts_resolved: Both modified the solve() function; kept A's logic with B's cache wrapper. -merge_strategy: Apply B's performance improvements on top of A's correctness fixes. -expected_impact: Combined improvements should yield 15% speedup and maintain accuracy. 
-[END SUMMARY] -""" - result = parse_mutation_summary(output) - - assert result["files_changed"] == "src/solver.py, tests/test_solver.py" - assert "base algorithm" in result["candidate_a_kept"] - assert "caching layer" in result["candidate_b_applied"] - assert "solve()" in result["conflicts_resolved"] - assert result["merge_strategy"] == "Apply B's performance improvements on top of A's correctness fixes." - assert "15%" in result["expected_impact"] - - def test_merge_fields_present_as_keys(self): - output = ( - "[SUMMARY]\n" - "files_changed: x.py\n" - "candidate_a_kept: kept A's approach\n" - "candidate_b_applied: applied B's fix\n" - "conflicts_resolved: none\n" - "merge_strategy: selective apply\n" - "expected_impact: minor improvement\n" - "[END SUMMARY]" - ) - result = parse_mutation_summary(output) - expected_keys = { - "files_changed", - "candidate_a_kept", - "candidate_b_applied", - "conflicts_resolved", - "merge_strategy", - "expected_impact", - } - assert expected_keys.issubset(result.keys()) - - def test_merge_summary_no_block_returns_empty(self): - output = "Merge done. No structured summary provided." - result = parse_mutation_summary(output) - assert result == {} From ecb0da92ad320f050ebfedea72ca2ba68868bc96 Mon Sep 17 00:00:00 2001 From: Karim Elmaaroufi Date: Wed, 20 May 2026 12:17:36 -0700 Subject: [PATCH 2/4] chore(prompts): emit prompt sections only when content is present MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Mirror GEPA O.A.'s ``_build_reflection_prompt_template`` accumulator pattern (``gepa/optimize_anything.py:501-596``) in ``build_mutation_prompt`` and ``build_merge_prompt``: each section is appended only when its content is non-empty, instead of rendering a placeholder string like ``"(no additional background provided)"`` / ``"(no scores recorded)"`` / ``"(no diff — candidates are identical)"`` / ``"(no evaluation data)"`` that taught the agent nothing. 
Sections now optional in ``build_mutation_prompt``: - ``## Objective`` — when ``objective`` is empty. - ``## Current Evaluation Scores`` — when ``eval_result.scores`` is empty. - ``## Diagnostics`` — when neither ``per_example_side_info`` nor ``side_info`` is populated (already conditional pre-PR). - ``## Evaluator Notes`` — when ``asi.log`` is empty (already conditional pre-PR). - ``## Evaluator Output`` — when the evaluator succeeded and both stdout/stderr are empty. Failed evaluator (non-zero ``_returncode``) still emits the section with ``(no stdout)`` / ``(no stderr)`` placeholders, because the agent needs to know the failure produced no output to inspect (a meaningful diagnostic on its own). Partial coverage now renders only the stream that has content instead of padding the empty one. - ``### Extra Evaluator Info`` — when no free-form ASI keys (already conditional pre-PR). - ``## Background / Context`` — when ``background`` is None/empty. Sections now optional in ``build_merge_prompt``: - ``## Objective`` — when ``objective`` is empty. - ``## Candidate A Strengths`` — when ``eval_result_a`` is None. - ``## Candidate B Strengths`` — when ``eval_result_b`` is None. - ``## Diff (B relative to A)`` — when the diff is empty after stripping. - ``## Background / Context`` — when ``background`` is None/empty. Always emitted: - ``AUTONOMOUS_SYSTEM_PROMPT`` (the four "Task instructions" bullets). - ``## Your Task`` (the editing-instruction block). - ``## Turn Budget`` — when ``max_turns`` is provided (already conditional pre-PR). Removed ``MUTATION_PROMPT_TEMPLATE`` and ``MERGE_PROMPT_TEMPLATE`` constants since the prompt is now assembled dynamically. Extracted new helpers ``_render_scores_section``, ``_render_extra_asi``, ``_render_diagnostics`` for consistency with the existing ``_render_evaluator_notes`` / ``_render_evaluator_output_fallback``. 
Tests updated: - ``test_default_background_when_none`` → ``test_background_section_omitted_when_none`` in both ``test_mutator.py`` and ``test_merger.py``. - ``test_no_scores_fallback`` → ``test_scores_section_omitted_when_empty``. - ``test_handles_none_eval_results`` → ``test_strengths_sections_omitted_when_eval_results_none``. - ``test_empty_diff_shows_fallback`` → ``test_diff_section_omitted_when_empty``. Each updated test now asserts the section is *absent* from the prompt (positive verification of the new behaviour) AND that the previous placeholder string is also absent (so a future regression that reintroduces the placeholder can't pass). 851 unit tests pass; ``mypy --strict src/helix/`` clean. Co-Authored-By: Claude Opus 4.7 --- src/helix/merger.py | 90 +++++++------- src/helix/mutator.py | 244 +++++++++++++++++++++++-------------- tests/unit/test_merger.py | 34 ++++-- tests/unit/test_mutator.py | 19 ++- 4 files changed, 244 insertions(+), 143 deletions(-) diff --git a/src/helix/merger.py b/src/helix/merger.py index 13982e3..f59d6d9 100644 --- a/src/helix/merger.py +++ b/src/helix/merger.py @@ -61,44 +61,28 @@ def select_eval_subsample_for_merged_program( # Prompts # --------------------------------------------------------------------------- -MERGE_PROMPT_TEMPLATE = """\ -{system_prompt} - -## Objective -{objective} - -## Candidate A Strengths -{strengths_a} - -## Candidate B Strengths -{strengths_b} - -## Diff (B relative to A) -```diff -{diff} -``` - -## Background / Context -{background} - +MERGE_TASK_INSTRUCTIONS = """\ ## Your Task You are merging the best aspects of Candidate A and Candidate B to create a superior combined solution that better achieves the objective. Candidate A is already checked out in your working directory. Apply the changes from Candidate B that are beneficial, and discard or adapt those that conflict or regress. -You may read, edit, create, or delete files as needed. 
-{turn_budget}""" +You may read, edit, create, or delete files as needed.""" # --------------------------------------------------------------------------- # Prompt construction # --------------------------------------------------------------------------- -def _format_eval_strengths(eval_result: EvalResult | None, label: str) -> str: - """Return a human-readable summary of a candidate's eval result.""" - if eval_result is None: - return f" {label}: (no evaluation data)" +def _format_eval_strengths(eval_result: EvalResult) -> str: + """Return a human-readable summary of a candidate's eval result. + + Returns the section body only (no header); the caller is responsible + for the ``## Candidate {A,B} Strengths`` heading. Empty input is + handled by skipping the section entirely in :func:`build_merge_prompt` + rather than emitting a ``"(no evaluation data)"`` placeholder. + """ lines = [f" Aggregate score: {eval_result.aggregate_score():.4f}"] for k, v in sorted(eval_result.scores.items()): lines.append(f" {k}: {v}") @@ -120,21 +104,45 @@ def build_merge_prompt( background: str | None = None, max_turns: int | None = None, ) -> str: - """Construct the merge prompt for Claude Code.""" - strengths_a = _format_eval_strengths(eval_result_a, "Candidate A") - strengths_b = _format_eval_strengths(eval_result_b, "Candidate B") - bg = background or "(no additional background provided)" - diff_text = diff.strip() if diff.strip() else "(no diff — candidates are identical)" - - return MERGE_PROMPT_TEMPLATE.format( - system_prompt=AUTONOMOUS_SYSTEM_PROMPT, - objective=objective, - strengths_a=strengths_a, - strengths_b=strengths_b, - diff=diff_text, - background=bg, - turn_budget=_turn_budget_section(max_turns), - ) + """Construct the merge prompt for the configured agent backend. + + Sections are emitted only when they have content, mirroring GEPA O.A.'s + ``_build_reflection_prompt_template`` accumulator pattern + (``gepa/optimize_anything.py:501-596``). 
Absent eval results for a + candidate, an empty diff, and absent ``background`` all skip their + respective sections entirely instead of emitting ``"(no evaluation + data)"`` / ``"(no diff — candidates are identical)"`` / ``"(no + additional background provided)"`` placeholders. + """ + sections: list[str] = [AUTONOMOUS_SYSTEM_PROMPT.rstrip()] + + if objective: + sections.append(f"## Objective\n{objective}") + + if eval_result_a is not None: + sections.append( + "## Candidate A Strengths\n" + _format_eval_strengths(eval_result_a) + ) + + if eval_result_b is not None: + sections.append( + "## Candidate B Strengths\n" + _format_eval_strengths(eval_result_b) + ) + + diff_stripped = diff.strip() + if diff_stripped: + sections.append(f"## Diff (B relative to A)\n```diff\n{diff_stripped}\n```") + + if background: + sections.append(f"## Background / Context\n{background}") + + sections.append(MERGE_TASK_INSTRUCTIONS) + + turn_budget = _turn_budget_section(max_turns) + if turn_budget: + sections.append(turn_budget.strip()) + + return "\n\n".join(sections) + "\n" # --------------------------------------------------------------------------- diff --git a/src/helix/mutator.py b/src/helix/mutator.py index 97bfe57..16ae6b0 100644 --- a/src/helix/mutator.py +++ b/src/helix/mutator.py @@ -60,22 +60,10 @@ Make your implementation complete and ready to be evaluated immediately. {turn_budget}""" -MUTATION_PROMPT_TEMPLATE = """\ -{system_prompt} - -## Objective -{objective} - -## Current Evaluation Scores -{scores} - -{diagnostics_section}{evaluator_notes_section}{evaluator_output_section}{extra_asi_section}## Background / Context -{background} - +MUTATION_TASK_INSTRUCTIONS = """\ ## Your Task Analyse the evaluation results above and improve the code to better achieve the objective. -Make targeted, meaningful changes. You may read, edit, create, or delete files as needed. -{turn_budget}""" +Make targeted, meaningful changes. 
You may read, edit, create, or delete files as needed.""" # --------------------------------------------------------------------------- # Prompt construction @@ -135,12 +123,13 @@ def _evaluator_failed(eval_result: EvalResult) -> bool: return eval_result.asi.get("_returncode") not in (None, "0", 0) -# All ``_render_*`` helpers below share a strict invariant: they return -# either the empty string (the section is suppressed) or a block of -# text that ends with a single trailing blank line (``"\n\n"``). The -# ``MUTATION_PROMPT_TEMPLATE`` chains them together verbatim and relies -# on this convention to keep section spacing consistent. Update both -# the helper and the template if you change it. +# All ``_render_*`` helpers below return either the empty string (the +# section is suppressed) or a fully-formed Markdown section. Empty +# returns let ``build_mutation_prompt`` skip the section entirely — +# mirrors GEPA's ``_build_reflection_prompt_template`` accumulator +# pattern (``gepa/optimize_anything.py:501-596``). Non-empty returns +# may carry trailing whitespace; ``build_mutation_prompt`` rstrips +# before joining sections with a uniform blank-line separator. def _render_evaluator_notes(eval_result: EvalResult) -> str: @@ -151,25 +140,53 @@ def _render_evaluator_notes(eval_result: EvalResult) -> str: def _render_evaluator_output_fallback(eval_result: EvalResult) -> str: - """Render stdout/stderr only when they are useful fallback diagnostics.""" - has_notes = bool(eval_result.asi.get("log", "").strip()) - include_streams = _evaluator_failed(eval_result) or ( - not _has_structured_diagnostics(eval_result) and not has_notes - ) - if not include_streams: - return "" - + """Render ``## Evaluator Output`` from stdout/stderr, or ``""``. + + Two distinct cases: + + * **Evaluator subprocess failed** (non-zero exit) — always emit the + section. 
Empty streams render with ``(no stdout)`` / ``(no stderr)`` + placeholders here intentionally: the agent needs to know the + evaluator failed but produced no output to inspect (a meaningful + diagnostic on its own). + + * **Evaluator succeeded** — only emit the section when at least one + stream has content *and* no richer diagnostic surface + (``log`` notes or structured side_info) exists. Empty streams are + omitted entirely; partial coverage (only ``stdout`` non-empty, or + only ``stderr``) renders just the present sub-section instead of + padding the other with a ``(no X)`` placeholder. + """ stdout = _strip_machine_protocol_from_evaluator_stream( eval_result.asi.get("stdout", "") ) stderr = _strip_machine_protocol_from_evaluator_stream( eval_result.asi.get("stderr", "") ) - if not stdout: - stdout = "(no stdout)" - if not stderr: - stderr = "(no stderr)" - return f"## Evaluator Output\n\n### stdout\n{stdout}\n\n### stderr\n{stderr}\n\n" + + if _evaluator_failed(eval_result): + stdout_text = stdout or "(no stdout)" + stderr_text = stderr or "(no stderr)" + return ( + f"## Evaluator Output\n\n" + f"### stdout\n{stdout_text}\n\n" + f"### stderr\n{stderr_text}" + ) + + # Evaluator succeeded — defer to richer surfaces when they exist, + # otherwise surface only the streams that actually have content. 
+ has_notes = bool(eval_result.asi.get("log", "").strip()) + if _has_structured_diagnostics(eval_result) or has_notes: + return "" + + parts: list[str] = [] + if stdout: + parts.append(f"### stdout\n{stdout}") + if stderr: + parts.append(f"### stderr\n{stderr}") + if not parts: + return "" + return "## Evaluator Output\n\n" + "\n\n".join(parts) def build_seed_generation_prompt( @@ -394,81 +411,128 @@ def _render_per_example_diagnostics( return "\n".join(lines) + "\n" -def build_mutation_prompt( - objective: str, - eval_result: EvalResult, - background: str | None = None, - max_turns: int | None = None, -) -> str: - """Construct the mutation prompt for Claude Code.""" - scores_text = "\n".join( - f" {k}: {v}" for k, v in sorted(eval_result.scores.items()) - ) - if not scores_text: - scores_text = " (no scores recorded)" - - # Collect any extra_N entries from ASI. The reserved keys here - # are surfaced through dedicated prompt sections (or the - # ``EvalResult.evaluator_returncode`` typed field, in the case of - # the historical ``_returncode`` legacy key) and must never leak - # into the catch-all "extra" rendering. - extra_entries = { +def _render_scores_section(eval_result: EvalResult) -> str: + """Render ``## Current Evaluation Scores`` or ``""`` when no scores exist. + + Mirrors GEPA O.A.'s "only emit a section when there is content for it" + pattern (``gepa/optimize_anything.py:501-596``). Previously HELIX + emitted the section with a ``"(no scores recorded)"`` placeholder; now + the section header is omitted entirely so the agent never sees a stub. + """ + lines = [f" {k}: {v}" for k, v in sorted(eval_result.scores.items())] + if not lines: + return "" + return "## Current Evaluation Scores\n" + "\n".join(lines) + + +def _render_extra_asi(eval_result: EvalResult) -> str: + """Render any free-form ``extra_*`` ASI keys, or ``""`` when none exist. 
+ + Reserved keys (``stdout``, ``stderr``, ``error``, ``log``, ``_returncode``) + are filtered out — they're surfaced through dedicated sections + (``## Evaluator Notes``, ``## Evaluator Output``) or the + ``_returncode`` legacy sentinel — and must never leak into this + catch-all rendering. + """ + entries = { k: v for k, v in sorted(eval_result.asi.items()) if k not in ("stdout", "stderr", "error", "log", "_returncode") } - if extra_entries: - extra_lines = "\n".join(f"### {k}\n{v}" for k, v in extra_entries.items()) - extra_asi_section = f"### Extra Evaluator Info\n{extra_lines}\n\n" - else: - extra_asi_section = "" - - # Render side_info diagnostics. Precedence: - # 1. ``eval_result.per_example_side_info`` (new per-example GEPA - # O.A. contract — list of dicts positional to instance_scores - # ids) when populated; mirrors GEPA's - # ``OptimizeAnythingAdapter.make_reflective_dataset`` at - # ``optimize_anything_adapter.py:524-553`` combined with - # ``format_samples`` at - # ``gepa/strategies/instruction_proposal.py:54-95``. - # 2. ``eval_result.side_info`` (legacy batch-level dict) when - # ``per_example_side_info`` is absent — unchanged rendering - # for non-``helix_result`` paths that still populate the - # legacy field. - # 3. No diagnostics section otherwise. - diagnostics_section = "" + if not entries: + return "" + body = "\n".join(f"### {k}\n{v}" for k, v in entries.items()) + return f"### Extra Evaluator Info\n{body}" + + +def _render_diagnostics(eval_result: EvalResult) -> str: + """Render the ``## Diagnostics`` section, or ``""`` when no side_info. + + Precedence: + 1. ``eval_result.per_example_side_info`` (per-example GEPA O.A. + contract — list of dicts positional to instance_scores ids) when + populated; mirrors GEPA's + ``OptimizeAnythingAdapter.make_reflective_dataset`` combined + with ``format_samples`` at + ``gepa/strategies/instruction_proposal.py:54-95``. + 2. 
``eval_result.side_info`` (legacy batch-level dict) when + per-example data is absent. + 3. Empty string when neither is present. + """ if eval_result.per_example_side_info is not None: # Monotonic markdown hierarchy under the surrounding # ``## Diagnostics`` (h2): each example is ``### Example `` # (h3), each side_info key is ``#### {key}`` (h4), nested - # values bump further. Before this the Example header was - # ``#`` (h1), which inverted the hierarchy and confused - # markdown-aware tooling / LLM markdown parsers. - diagnostics_section = _render_per_example_diagnostics( + # values bump further. + return _render_per_example_diagnostics( example_ids=list(eval_result.instance_scores.keys()), per_example_side_info=eval_result.per_example_side_info, example_header_level=3, key_header_level=4, ) - elif eval_result.side_info is not None: + if eval_result.side_info is not None: diag_lines = "\n".join( f" {k}: {v}" for k, v in sorted(eval_result.side_info.items()) ) - diagnostics_section = f"## Diagnostics\n{diag_lines}\n\n" + return f"## Diagnostics\n{diag_lines}" + return "" - bg = background or "(no additional background provided)" - return MUTATION_PROMPT_TEMPLATE.format( - system_prompt=AUTONOMOUS_SYSTEM_PROMPT, - objective=objective, - scores=scores_text, - evaluator_notes_section=_render_evaluator_notes(eval_result), - evaluator_output_section=_render_evaluator_output_fallback(eval_result), - extra_asi_section=extra_asi_section, - diagnostics_section=diagnostics_section, - background=bg, - turn_budget=_turn_budget_section(max_turns), - ) +def build_mutation_prompt( + objective: str, + eval_result: EvalResult, + background: str | None = None, + max_turns: int | None = None, +) -> str: + """Construct the mutation prompt for the configured agent backend. + + Sections are emitted only when they have content, mirroring GEPA O.A.'s + ``_build_reflection_prompt_template`` accumulator pattern + (``gepa/optimize_anything.py:501-596``). 
Empty ``objective``, empty + ``eval_result.scores``, absent diagnostics, absent evaluator notes, + absent stdout/stderr fallback, absent extra ASI, and absent + ``background`` all skip their respective sections entirely instead of + rendering placeholder strings like ``"(no additional background + provided)"`` or ``"(no scores recorded)"`` that taught nothing. + + ``## Your Task`` and the system prompt are always emitted; they are + the only sections that don't depend on caller-provided content. + """ + sections: list[str] = [AUTONOMOUS_SYSTEM_PROMPT.rstrip()] + + if objective: + sections.append(f"## Objective\n{objective}") + + scores = _render_scores_section(eval_result) + if scores: + sections.append(scores) + + diagnostics = _render_diagnostics(eval_result) + if diagnostics: + sections.append(diagnostics.rstrip()) + + notes = _render_evaluator_notes(eval_result) + if notes: + sections.append(notes.rstrip()) + + output_fallback = _render_evaluator_output_fallback(eval_result) + if output_fallback: + sections.append(output_fallback.rstrip()) + + extra_asi = _render_extra_asi(eval_result) + if extra_asi: + sections.append(extra_asi) + + if background: + sections.append(f"## Background / Context\n{background}") + + sections.append(MUTATION_TASK_INSTRUCTIONS) + + turn_budget = _turn_budget_section(max_turns) + if turn_budget: + sections.append(turn_budget.strip()) + + return "\n\n".join(sections) + "\n" # --------------------------------------------------------------------------- diff --git a/tests/unit/test_merger.py b/tests/unit/test_merger.py index c4e2ddd..572cffb 100644 --- a/tests/unit/test_merger.py +++ b/tests/unit/test_merger.py @@ -71,9 +71,15 @@ def test_contains_background(self): prompt = build_merge_prompt("goal", None, None, "", background="special bg") assert "special bg" in prompt - def test_default_background_when_none(self): + def test_background_section_omitted_when_none(self): + """GEPA parity: empty optional inputs skip the section entirely + 
instead of emitting a placeholder. Mirrors GEPA O.A.'s + ``_build_reflection_prompt_template`` (optimize_anything.py:501-596), + which only appends a section when its content is non-empty. + """ prompt = build_merge_prompt("goal", None, None, "") - assert "no additional background" in prompt + assert "## Background / Context" not in prompt + assert "no additional background" not in prompt def test_contains_execution_instructions(self): prompt = build_merge_prompt("goal", None, None, "") @@ -86,14 +92,26 @@ def test_includes_eval_scores_when_provided(self): assert "0.8" in prompt assert "0.9" in prompt - def test_handles_none_eval_results(self): - # Should not raise even with no eval data + def test_strengths_sections_omitted_when_eval_results_none(self): + """``eval_result_a`` / ``eval_result_b`` of ``None`` → the + corresponding ``## Candidate {A,B} Strengths`` section is skipped + entirely (no ``"(no evaluation data)"`` placeholder). + """ prompt = build_merge_prompt("goal", None, None, "some diff") - assert "no evaluation data" in prompt - - def test_empty_diff_shows_fallback(self): + assert "## Candidate A Strengths" not in prompt + assert "## Candidate B Strengths" not in prompt + assert "no evaluation data" not in prompt + + def test_diff_section_omitted_when_empty(self): + """An empty diff → ``## Diff (B relative to A)`` section is + skipped entirely (no ``"(no diff — candidates are identical)"`` + placeholder). Upstream callers should not invoke merge on + identical candidates; this test pins the prompt-level fallback. 
+ """ prompt = build_merge_prompt("goal", None, None, "") - assert "identical" in prompt or "no diff" in prompt + assert "## Diff" not in prompt + assert "identical" not in prompt + assert "no diff" not in prompt # --------------------------------------------------------------------------- diff --git a/tests/unit/test_mutator.py b/tests/unit/test_mutator.py index b016096..a302135 100644 --- a/tests/unit/test_mutator.py +++ b/tests/unit/test_mutator.py @@ -116,20 +116,31 @@ def test_contains_background(self): prompt = build_mutation_prompt("goal", er, background="special context here") assert "special context here" in prompt - def test_default_background_when_none(self): + def test_background_section_omitted_when_none(self): + """GEPA parity: empty optional inputs skip the section entirely + instead of emitting a placeholder. Mirrors GEPA O.A.'s + ``_build_reflection_prompt_template`` (optimize_anything.py:501-596), + which only appends a section when its content is non-empty. + """ er = make_eval_result() prompt = build_mutation_prompt("goal", er, background=None) - assert "no additional background" in prompt + assert "## Background / Context" not in prompt + assert "no additional background" not in prompt def test_contains_execution_instructions(self): er = make_eval_result() prompt = build_mutation_prompt("goal", er) assert "Task instructions:" in prompt - def test_no_scores_fallback(self): + def test_scores_section_omitted_when_empty(self): + """No ``eval_result.scores`` → ``## Current Evaluation Scores`` + section is skipped entirely (no ``"(no scores recorded)"`` + placeholder). 
+ """ er = make_eval_result(scores={}) prompt = build_mutation_prompt("goal", er) - assert "no scores recorded" in prompt + assert "## Current Evaluation Scores" not in prompt + assert "no scores recorded" not in prompt def test_renders_helix_log_notes(self): er = make_eval_result( From 823447a05c0b9a73c604bf17fa53eb2f1bb8e998 Mon Sep 17 00:00:00 2001 From: Karim Elmaaroufi Date: Wed, 20 May 2026 12:29:46 -0700 Subject: [PATCH 3/4] fix(prompts): article agreement on the ## Turn Budget section + cross-backend doc MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Two related changes to ``_turn_budget_section``: 1. **Article-agreement fix.** Pre-fix the section always rendered ``"You have a {N}-turn limit"``, which is ungrammatical for the 8 / 11 / 18 / 80s cases ("a 8-turn", "a 11-turn", "a 18-turn", "a 80-turn"...). New ``_indefinite_article(n)`` helper picks ``"a"`` vs ``"an"`` based on the spoken pronunciation of the leading digit group within HELIX's realistic max-turns range (1 ≤ n ≤ ~1000). 2. **Cross-backend enforcement docs.** ``--max-turns N`` is passed to the Claude Code CLI by ``_build_cli_args`` (``mutator.py:731-732``) and triggers hard subprocess-level enforcement via Claude's runtime (the ``subtype="error_max_turns"`` response handled at ``mutator.py:1667-1669``). None of the other installed backends (``codex``, ``cursor``, ``gemini``, ``opencode``) expose an equivalent CLI flag — verified against their ``--help`` output, none has ``--max-turns`` / ``--max-iterations`` / ``--turn-limit`` / ``--limit``. For those backends the in-prompt ``## Turn Budget`` section is a soft hint only; whether the agent self-honors it is entirely up to its own behaviour. 
The section is still emitted for every backend (soft hints have some value), but the docstring now states the enforcement asymmetry explicitly so callers depending on hard caps know to use the ``claude`` backend or add subprocess-level mechanisms (wall-clock timeout, sandbox limits) themselves. Tests: new ``TestTurnBudgetArticleAgreement`` covers (a) consonant- leading numbers using ``"a"``, (b) vowel-leading numbers (8, 11, 18, 80s, 800s) using ``"an"``, and (c) ``max_turns=None`` returning empty. 854 unit tests pass (851 → 854); ``mypy --strict src/helix/`` clean. Co-Authored-By: Claude Opus 4.7 --- src/helix/mutator.py | 42 ++++++++++++++++++++++++++++++++++++-- tests/unit/test_mutator.py | 35 +++++++++++++++++++++++++++++++ 2 files changed, 75 insertions(+), 2 deletions(-) diff --git a/src/helix/mutator.py b/src/helix/mutator.py index 16ae6b0..6e1462d 100644 --- a/src/helix/mutator.py +++ b/src/helix/mutator.py @@ -70,13 +70,51 @@ # --------------------------------------------------------------------------- +def _indefinite_article(n: int) -> str: + """Pick ``"a"`` or ``"an"`` to match the spoken pronunciation of ``n``. + + Within HELIX's realistic max-turns range (1 ≤ n ≤ ~1000), the + vowel-leading numbers are 8, 11, 18, and the 80s / 800s. Everything + else takes ``"a"``. Handles the visible ``"You have a 8-turn limit"`` + article-agreement glitch without imposing a full English-number + pronunciation rule on the codebase. + """ + s = str(abs(n)) + if s in {"11", "18"}: + return "an" + if s.startswith("8"): # 8, 80-89, 800-899, ... + return "an" + return "a" + + def _turn_budget_section(max_turns: int | None) -> str: - """Return the turn budget prompt section, or empty string if unbounded.""" + """Return the turn budget prompt section, or empty string if unbounded. + + Enforcement semantics differ by backend: + + * ``claude`` — hard cap. 
HELIX passes ``--max-turns N`` to the Claude + Code CLI (``_build_cli_args``), the runtime kills the session at the + limit, and the resulting ``subtype="error_max_turns"`` response is + detected at :func:`invoke_claude_code` and treated as partial + success. + * ``codex`` / ``cursor`` / ``gemini`` / ``opencode`` — soft hint only. + None of these CLIs expose an equivalent flag (verified against + ``--help`` for the installed binaries), so the in-prompt request is + the only signal the agent receives. Whether the agent self-honors + the limit depends entirely on its own behaviour. + + The section is therefore emitted for every backend regardless — it + still has some value as a soft hint — but callers depending on hard + enforcement should set the budget low enough to also be enforced via + subprocess-level mechanisms (wall-clock timeout, sandbox limits) or + use the Claude backend. + """ if max_turns is None: return "" + article = _indefinite_article(max_turns) return ( f"\n## Turn Budget\n" - f"You have a {max_turns}-turn limit for this task, where turns refer to " + f"You have {article} {max_turns}-turn limit for this task, where turns refer to " f"how many tool calls or interactions you can make. 
Plan your work " f"accordingly — prioritize the highest-impact changes first and be " f"efficient with your tool usage.\n" diff --git a/tests/unit/test_mutator.py b/tests/unit/test_mutator.py index a302135..b6552f1 100644 --- a/tests/unit/test_mutator.py +++ b/tests/unit/test_mutator.py @@ -213,6 +213,41 @@ def test_returncode_sentinel_does_not_leak_into_prompt(self): assert "real-extra" in prompt +class TestTurnBudgetArticleAgreement: + """``_turn_budget_section`` must use the correct indefinite article + ("a" vs "an") so the rendered prompt reads naturally: + + * ``"You have a 5-turn limit"`` (consonant sound — "five") + * ``"You have an 8-turn limit"`` (vowel sound — "eight") + + Pre-fix the article was always hardcoded ``"a"``, producing + ``"You have a 8-turn limit"`` for the 8/11/18/80s cases. + """ + + def test_consonant_leading_numbers_use_a(self) -> None: + from helix.mutator import _turn_budget_section + + for n in (1, 2, 3, 4, 5, 6, 7, 9, 10, 12, 17, 19, 20, 50, 100, 200): + section = _turn_budget_section(n) + assert f"You have a {n}-turn limit" in section, ( + f"max_turns={n} should use 'a', section was: {section!r}" + ) + + def test_vowel_leading_numbers_use_an(self) -> None: + from helix.mutator import _turn_budget_section + + for n in (8, 11, 18, 80, 85, 88, 800, 888): + section = _turn_budget_section(n) + assert f"You have an {n}-turn limit" in section, ( + f"max_turns={n} should use 'an', section was: {section!r}" + ) + + def test_none_returns_empty(self) -> None: + from helix.mutator import _turn_budget_section + + assert _turn_budget_section(None) == "" + + class TestPerExampleDiagnostics: """``build_mutation_prompt`` renders ``eval_result.per_example_side_info`` as the Diagnostics section under the new GEPA O.A. 
contract From 9165dee611b8919226e59d613e0d1be5c92dff65 Mon Sep 17 00:00:00 2001 From: Karim Elmaaroufi Date: Wed, 20 May 2026 12:42:06 -0700 Subject: [PATCH 4/4] feat(merge): GEPA-parity two-diff prompt with ancestor-relative attribution MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit When a common ancestor is available, the merge prompt now renders TWO labelled diff sections — ``git diff ancestor..candidate_a`` and ``git diff ancestor..candidate_b`` — instead of the single ``git diff candidate_a..candidate_b``. The agent can read off each parent's contribution directly rather than inferring three-way info from a two-way comparison. This is the file-hunk-level analogue of GEPA's component-wise attribution at ``gepa/proposer/merge.py:163-191``: if pred_anc == pred_id1 or pred_anc == pred_id2: # one parent didn't change this predictor → take the other one's ... elif pred_anc != pred_id1 and pred_anc != pred_id2: # both diverged → tiebreak by score ... GEPA's algorithm has named components. HELIX has a worktree, so we can't pick "component X from parent Y" deterministically — but feeding the agent the three-way diff structure GEPA's algorithm uses gives it the same shape of attribution information for free-form file edits. Behavioural changes: - ``merge()`` gains an optional ``ancestor: Candidate | None = None``. When provided, computes both ancestor-relative diffs and passes them to the prompt builder. When ``None``, falls back to the legacy single A↔B diff. - ``build_merge_prompt`` gains three optional keyword-only parameters: ``ancestor_id``, ``diff_a_from_ancestor``, ``diff_b_from_ancestor``. Two-diff form requires all three; any half-configured combination defensively falls back to the single A↔B path. - A dedicated ``MERGE_TASK_INSTRUCTIONS_TWO_DIFF`` task block accompanies the two-diff form. 
It explicitly tells the agent that Candidate A's contribution is already in the working tree (so it doesn't re-apply it) and that B's contribution is what needs to be brought in. Single-diff form retains the legacy task framing unchanged. - ``evolution._run_evolution_impl`` resolves the ancestor candidate from the frontier's append-only candidate map (using the public ``frontier.candidates`` view) and passes it to ``merge()``. When the ancestor isn't resolvable (defensive: lineage / frontier drift), logs a warning that names the merge_id and falls back to single-diff. Tests (8 new in ``test_merger.py``): - ``test_emits_two_ancestor_relative_sections`` — happy path renders both ancestor-relative sections and omits the legacy A↔B header. - ``test_two_diff_form_uses_two_diff_task_block`` / ``test_single_diff_form_uses_single_diff_task_block`` — regression pins on which task-instruction block accompanies which diff form. - ``test_single_diff_fallback_when_ancestor_missing`` / ``test_single_diff_fallback_when_ancestor_id_only`` — backward compat plus the half-configured-caller defensive fallback. - ``test_two_diff_form_omits_empty_side`` — one ancestor diff empty → only the populated side renders. - ``test_ancestor_triggers_two_diff_form`` / ``test_no_ancestor_uses_single_diff_form`` — ``merge()``-level assertions on the exact ``get_diff`` call sequence (two ancestor-anchored calls vs one A↔B call) and the resulting prompt content. 862 unit tests pass (854 → 862); ``mypy --strict src/helix/`` clean.
Co-Authored-By: Claude Opus 4.7 --- src/helix/evolution.py | 32 +++++-- src/helix/merger.py | 146 +++++++++++++++++++++++++++---- tests/unit/test_merger.py | 177 ++++++++++++++++++++++++++++++++++++++ 3 files changed, 331 insertions(+), 24 deletions(-) diff --git a/src/helix/evolution.py b/src/helix/evolution.py index c2b7ee2..7115ba6 100644 --- a/src/helix/evolution.py +++ b/src/helix/evolution.py @@ -1808,12 +1808,12 @@ def _has_val_support_overlap(i: str, j: str) -> bool: ) if triplet is not None: - # GEPA parity (merge-pairing audit C3, merge.py:94-95): - # ``find_merge_triplet`` now returns the canonical - # ``(i, j)`` (lex-sorted), so ``cid_i <= cid_j`` always — - # the merge subprocess, attempted-pair ledger and the - # description-triplet dedup all see the same tuple order. - cid_i, cid_j, _ancestor_id = triplet + # GEPA parity (merge.py:94-95): ``find_merge_triplet`` + # now returns the canonical ``(i, j)`` (lex-sorted), + # so ``cid_i <= cid_j`` always — the merge subprocess, + # attempted-pair ledger and the description-triplet + # dedup all see the same tuple order. + cid_i, cid_j, ancestor_id = triplet pair_key = [cid_i, cid_j] # Resolve parent val results once; by contract the @@ -1832,8 +1832,25 @@ def _has_val_support_overlap(i: str, j: str) -> bool: a = frontier._candidates[cid_i] b = frontier._candidates[cid_j] - + # Resolve the common ancestor for the two-diff merge + # prompt (GEPA parity at the file-hunk level: feed the + # agent the same three-way structure GEPA's algorithm + # uses to attribute changes — + # ``gepa/proposer/merge.py:163-191``). The ancestor + # came from ``find_merge_triplet``; resolve it through + # the frontier's append-only candidate map. ``None`` + # is tolerated downstream — ``merge()`` falls back to + # the single A↔B diff when the ancestor isn't + # resolvable (defensive: lineage / frontier drift). 
+ ancestor_candidate = frontier.candidates.get(ancestor_id) merge_id = budget_api.next_merge_id(state, gen) + if ancestor_candidate is None: + print_warning( + f"Merge {merge_id} ({cid_i} + {cid_j}): common " + f"ancestor {ancestor_id} not found in frontier " + f"candidate map; falling back to single A↔B " + f"diff form for this merge." + ) merged = merge( candidate_a=a, @@ -1849,6 +1866,7 @@ def _has_val_support_overlap(i: str, j: str) -> bool: cand, config, project_root ) ), + ancestor=ancestor_candidate, ) if merged is None: diff --git a/src/helix/merger.py b/src/helix/merger.py index f59d6d9..94f2677 100644 --- a/src/helix/merger.py +++ b/src/helix/merger.py @@ -61,7 +61,7 @@ def select_eval_subsample_for_merged_program( # Prompts # --------------------------------------------------------------------------- -MERGE_TASK_INSTRUCTIONS = """\ +MERGE_TASK_INSTRUCTIONS_SINGLE_DIFF = """\ ## Your Task You are merging the best aspects of Candidate A and Candidate B to create a superior combined solution that better achieves the objective. @@ -70,6 +70,24 @@ def select_eval_subsample_for_merged_program( Candidate B that are beneficial, and discard or adapt those that conflict or regress. You may read, edit, create, or delete files as needed.""" +MERGE_TASK_INSTRUCTIONS_TWO_DIFF = """\ +## Your Task +You are merging the best aspects of Candidate A and Candidate B to create a superior +combined solution that better achieves the objective. + +Candidate A's worktree is already checked out — A's contribution (the hunks shown in +"Diff: Candidate A relative to common ancestor") is already in place, so you do not +need to re-apply it. Your job is to bring in Candidate B's contribution (the hunks +shown in "Diff: Candidate B relative to common ancestor") wherever it is beneficial. + +For each hunk in B's diff: +- If the file is untouched by A's diff, the change is independent — apply it. 
+- If the file is also touched by A's diff (overlapping region), the two parents + diverged from the ancestor on the same region: reconcile the changes, picking + whichever side better serves the objective, or synthesize a combined version. + +You may read, edit, create, or delete files as needed.""" + # --------------------------------------------------------------------------- # Prompt construction # --------------------------------------------------------------------------- @@ -103,16 +121,36 @@ def build_merge_prompt( diff: str, background: str | None = None, max_turns: int | None = None, + *, + ancestor_id: str | None = None, + diff_a_from_ancestor: str | None = None, + diff_b_from_ancestor: str | None = None, ) -> str: """Construct the merge prompt for the configured agent backend. Sections are emitted only when they have content, mirroring GEPA O.A.'s ``_build_reflection_prompt_template`` accumulator pattern - (``gepa/optimize_anything.py:501-596``). Absent eval results for a - candidate, an empty diff, and absent ``background`` all skip their - respective sections entirely instead of emitting ``"(no evaluation - data)"`` / ``"(no diff — candidates are identical)"`` / ``"(no - additional background provided)"`` placeholders. + (``gepa/optimize_anything.py:501-596``). + + **Diff section format** — two-diff (ancestor-relative) vs single (A↔B): + + GEPA's merge operator (``gepa/proposer/merge.py:155-203``) reasons over + *three* program states — common ancestor, candidate A, candidate B — + to attribute each component change to whichever parent diverged from + the ancestor. When ``ancestor_id`` + both ``diff_a_from_ancestor`` + and ``diff_b_from_ancestor`` are supplied, this prompt mirrors that + structure at the file-hunk level: each parent's diff against the + common ancestor is rendered as its own labelled section, so the + agent can read off "A's contribution" and "B's contribution" + directly instead of inferring three-way info from a single A↔B diff. 
+ + When the ancestor-relative pair is not supplied (e.g. legacy callers, + tests that don't have an ancestor handy), the prompt falls back to + the single ``## Diff (B relative to A)`` section driven by ``diff``. + + Absent eval results, absent diff(s), and absent ``background`` all + skip their respective sections entirely instead of emitting + placeholder strings. """ sections: list[str] = [AUTONOMOUS_SYSTEM_PROMPT.rstrip()] @@ -129,14 +167,45 @@ def build_merge_prompt( "## Candidate B Strengths\n" + _format_eval_strengths(eval_result_b) ) - diff_stripped = diff.strip() - if diff_stripped: - sections.append(f"## Diff (B relative to A)\n```diff\n{diff_stripped}\n```") + # Diff section — prefer the two-diff (ancestor-relative) form when + # both diffs are available, fall back to single A↔B otherwise. Each + # branch independently honors the "omit when empty" invariant. + use_two_diff = ( + ancestor_id is not None + and diff_a_from_ancestor is not None + and diff_b_from_ancestor is not None + ) + if use_two_diff: + diff_a_stripped = (diff_a_from_ancestor or "").strip() + diff_b_stripped = (diff_b_from_ancestor or "").strip() + if diff_a_stripped: + sections.append( + f"## Diff: Candidate A relative to common ancestor {ancestor_id}\n" + f"```diff\n{diff_a_stripped}\n```" + ) + if diff_b_stripped: + sections.append( + f"## Diff: Candidate B relative to common ancestor {ancestor_id}\n" + f"```diff\n{diff_b_stripped}\n```" + ) + else: + diff_stripped = diff.strip() + if diff_stripped: + sections.append( + f"## Diff (B relative to A)\n```diff\n{diff_stripped}\n```" + ) if background: sections.append(f"## Background / Context\n{background}") - sections.append(MERGE_TASK_INSTRUCTIONS) + # Task instructions vary by diff form. Two-diff form gets explicit + # guidance on what A's contribution vs B's contribution means and how + # to reason about overlapping vs disjoint hunks; single-diff form + # keeps the legacy "apply B's changes" framing. 
+ sections.append( + MERGE_TASK_INSTRUCTIONS_TWO_DIFF if use_two_diff + else MERGE_TASK_INSTRUCTIONS_SINGLE_DIFF + ) turn_budget = _turn_budget_section(max_turns) if turn_budget: @@ -160,22 +229,40 @@ def merge( eval_result_a: EvalResult | None = None, eval_result_b: EvalResult | None = None, prepare_worktree: Callable[[Candidate], None] | None = None, + ancestor: Candidate | None = None, ) -> Candidate | None: """Merge *candidate_a* and *candidate_b* using Claude Code. - Clones *candidate_a*, computes the diff to *candidate_b*, builds a merge + Clones *candidate_a*, computes the relevant diffs, builds a merge prompt, and invokes Claude Code. Snapshots on success; removes the worktree and returns ``None`` on failure. + Two diff-rendering modes, controlled by the optional ``ancestor`` + argument: + + * **Two-diff (ancestor-relative)** — when ``ancestor`` is provided, + computes ``get_diff(ancestor, candidate_a)`` and + ``get_diff(ancestor, candidate_b)`` and renders both as separately + labelled sections in the prompt. The agent can then attribute + each hunk to whichever parent diverged from the common ancestor — + file-hunk-level analogue of GEPA's component-wise attribution + (``gepa/proposer/merge.py:163-191``: ``if pred_anc == pred_id1 …`` + → take id2's version; ``elif pred_anc != pred_id1 and pred_anc != + pred_id2`` → tiebreak by score). + * **Single (A↔B)** — fallback when no ancestor is provided. + Computes ``get_diff(candidate_a, candidate_b)`` and renders a + single ``## Diff (B relative to A)`` section. The agent has to + infer three-way info from a two-way comparison. + GEPA-parity note: this is the correct domain adaptation of GEPA's text-component merge (``gepa/proposer/merge.py:155-203``) for HELIX's full-codebase setting. 
GEPA can splice ``dict[str, str]`` programs deterministically by swapping components from each parent; HELIX candidates are full git worktrees, where syntactic per-component swap - is undefined, so an LLM-mediated edit is the only viable approach. - The surrounding trigger / parent-selection / subsample / acceptance / - full-val logic in :mod:`helix.evolution` mirrors GEPA's - ``MergeProposer`` and ``GEPAEngine`` verbatim. + is undefined, so an LLM-mediated edit is the only viable approach — + but feeding the agent the three-way diff structure GEPA's algorithm + uses (two ancestor-relative diffs instead of one A↔B diff) gives it + the same shape of attribution information. Parameters ---------- @@ -195,6 +282,15 @@ def merge( Evaluation result for candidate A (optional, for richer prompt). eval_result_b: Evaluation result for candidate B (optional, for richer prompt). + prepare_worktree: + Optional callback to refresh protected files in the new worktree + before the agent runs. + ancestor: + Optional most-recent common ancestor of A and B (typically + ``frontier.candidates[ancestor_id]`` where ``ancestor_id`` came + from :func:`helix.lineage.find_merge_triplet`). When supplied, + the prompt uses the two-diff (ancestor-relative) form; when + ``None``, falls back to the single A↔B diff. Returns ------- @@ -207,15 +303,31 @@ def merge( if prepare_worktree is not None: prepare_worktree(child) - diff = get_diff(candidate_a, candidate_b) + # Diff-rendering mode selection. ``ancestor`` available → compute + # the two ancestor-relative diffs that drive the GEPA-style + # attribution prompt. Otherwise compute the single A↔B fallback. 
+ if ancestor is not None: + diff_a_from_ancestor: str | None = get_diff(ancestor, candidate_a) + diff_b_from_ancestor: str | None = get_diff(ancestor, candidate_b) + # ``diff`` (single A↔B) is computed lazily only for the fallback + # path; with both ancestor-relative diffs in hand, the prompt + # builder ignores the legacy parameter, so pass an empty string. + legacy_diff = "" + else: + diff_a_from_ancestor = None + diff_b_from_ancestor = None + legacy_diff = get_diff(candidate_a, candidate_b) prompt = build_merge_prompt( config.objective, eval_result_a, eval_result_b, - diff, + legacy_diff, background, config.agent.max_turns, + ancestor_id=ancestor.id if ancestor is not None else None, + diff_a_from_ancestor=diff_a_from_ancestor, + diff_b_from_ancestor=diff_b_from_ancestor, ) try: diff --git a/tests/unit/test_merger.py b/tests/unit/test_merger.py index 572cffb..5176916 100644 --- a/tests/unit/test_merger.py +++ b/tests/unit/test_merger.py @@ -114,6 +114,102 @@ def test_diff_section_omitted_when_empty(self): assert "no diff" not in prompt +class TestBuildMergePromptTwoDiff: + """GEPA-parity two-diff form: when ``ancestor_id`` and both + ancestor-relative diffs are supplied, the prompt emits two labelled + sections instead of the single A↔B diff. Mirrors GEPA's + three-way attribution reasoning (``gepa/proposer/merge.py:163-191``) + at the file-hunk level — agent reads off A's contribution and B's + contribution directly instead of inferring three-way info from a + two-way comparison. 
+ """ + + def test_emits_two_ancestor_relative_sections(self): + prompt = build_merge_prompt( + "goal", None, None, "", + ancestor_id="g0-s0", + diff_a_from_ancestor="+self.bn = nn.BatchNorm1d(...)\n", + diff_b_from_ancestor="+if epoch < 10: lr = 0.001\n", + ) + assert "## Diff: Candidate A relative to common ancestor g0-s0" in prompt + assert "## Diff: Candidate B relative to common ancestor g0-s0" in prompt + assert "self.bn = nn.BatchNorm1d" in prompt + assert "if epoch < 10" in prompt + # Fallback section MUST NOT also render — the agent should see + # exactly the two ancestor-relative diffs, not three sections. + assert "## Diff (B relative to A)" not in prompt + + def test_two_diff_form_uses_two_diff_task_block(self): + """The two-diff form pairs the ancestor-relative diffs with a + task block that explicitly tells the agent A's contribution is + already in the working tree (so it doesn't re-apply it) and B's + contribution is what needs to be brought in. Pinning this + substring prevents a future refactor from accidentally swapping + back to the single-diff task block, which would confuse the + agent about whether A's diff needs to be re-applied. + """ + prompt = build_merge_prompt( + "goal", None, None, "", + ancestor_id="g0-s0", + diff_a_from_ancestor="+a contribution\n", + diff_b_from_ancestor="+b contribution\n", + ) + assert "A's contribution (the hunks shown in" in prompt + # The single-diff task block's "Apply the changes from Candidate B" + # phrasing must NOT appear in the two-diff form. + assert "Apply the changes from\nCandidate B that are beneficial" not in prompt + + def test_single_diff_form_uses_single_diff_task_block(self): + """Companion to the two-diff task-block test: the single-diff + fallback must keep the legacy task framing intact. 
+ """ + prompt = build_merge_prompt("goal", None, None, "+some diff") + assert "Apply the changes from\nCandidate B" in prompt + # The two-diff task block's phrasing must NOT appear in the + # single-diff form. + assert "A's contribution (the hunks shown in" not in prompt + + def test_single_diff_fallback_when_ancestor_missing(self): + """No ancestor + no ancestor diffs → single A↔B diff form + (backward-compatible default). + """ + prompt = build_merge_prompt("goal", None, None, "+some diff content") + assert "## Diff (B relative to A)" in prompt + assert "+some diff content" in prompt + # Two-diff section headers must NOT appear in the fallback path. + assert "relative to common ancestor" not in prompt + + def test_single_diff_fallback_when_ancestor_id_only(self): + """Defensive: an ``ancestor_id`` without both diffs falls back + to the single A↔B path. Prevents a half-configured caller from + emitting an "ancestor header pointing to nothing" prompt. + """ + prompt = build_merge_prompt( + "goal", None, None, "+legacy diff", + ancestor_id="g0-s0", + diff_a_from_ancestor=None, + diff_b_from_ancestor=None, + ) + assert "## Diff (B relative to A)" in prompt + assert "+legacy diff" in prompt + assert "relative to common ancestor" not in prompt + + def test_two_diff_form_omits_empty_side(self): + """If one of the ancestor-relative diffs is empty (one parent + didn't change anything relative to the ancestor), only the + non-empty side renders. Edge case: a parent might "improve" via + metadata changes that ``git diff`` doesn't see. 
+ """ + prompt = build_merge_prompt( + "goal", None, None, "", + ancestor_id="g0-s0", + diff_a_from_ancestor="+self.bn = nn.BatchNorm1d(...)\n", + diff_b_from_ancestor="", + ) + assert "## Diff: Candidate A relative to common ancestor g0-s0" in prompt + assert "## Diff: Candidate B relative to common ancestor g0-s0" not in prompt + + # --------------------------------------------------------------------------- # Tests: merge # --------------------------------------------------------------------------- @@ -267,6 +363,87 @@ def test_passes_background_to_prompt(self, mocker): prompt_arg = mock_invoke.call_args[0][1] assert "unique_context_xyz" in prompt_arg + def test_ancestor_triggers_two_diff_form(self, mocker): + """When ``merge()`` receives an ``ancestor`` argument, it must + call ``get_diff`` twice (ancestor→A and ancestor→B) and the + rendered prompt must contain both ancestor-relative diff + sections instead of the single A↔B diff. + """ + ancestor = make_candidate("g0-s0") + ca = make_candidate("g1-s0") + cb = make_candidate("g1-s1") + config = make_config() + + child = make_candidate("g2-m0") + mocker.patch("helix.merger.clone_candidate", return_value=child) + + # get_diff is called per-side; sequence captures which two diffs + # were requested so we can assert the right pair of arguments + # was passed (ancestor→A and ancestor→B, NOT A↔B). + diff_calls: list[tuple[str, str]] = [] + + def fake_get_diff(x, y): + diff_calls.append((x.id, y.id)) + return f"+contribution from {y.id}" + + mocker.patch("helix.merger.get_diff", side_effect=fake_get_diff) + mock_invoke = mocker.patch( + "helix.merger.invoke_claude_code", return_value=({}, {}) + ) + mocker.patch("helix.merger.snapshot_candidate", return_value="sha") + mocker.patch("helix.merger.remove_worktree") + + merge(ca, cb, "g2-m0", config, Path("/tmp"), ancestor=ancestor) + + # Two get_diff calls, both anchored on the ancestor. 
+ assert diff_calls == [("g0-s0", "g1-s0"), ("g0-s0", "g1-s1")], ( + f"expected two ancestor-relative diff calls; got {diff_calls}" + ) + + # Prompt must contain both ancestor-relative section headers + # and neither the legacy A↔B header nor any leftover placeholder. + prompt_arg = mock_invoke.call_args[0][1] + assert "## Diff: Candidate A relative to common ancestor g0-s0" in prompt_arg + assert "## Diff: Candidate B relative to common ancestor g0-s0" in prompt_arg + assert "## Diff (B relative to A)" not in prompt_arg + assert "contribution from g1-s0" in prompt_arg + assert "contribution from g1-s1" in prompt_arg + + def test_no_ancestor_uses_single_diff_form(self, mocker): + """Backward-compat: ``merge()`` without an ``ancestor`` argument + still computes a single A↔B diff and renders the legacy section. + """ + ca = make_candidate("g1-s0") + cb = make_candidate("g1-s1") + config = make_config() + + child = make_candidate("g2-m0") + mocker.patch("helix.merger.clone_candidate", return_value=child) + + diff_calls: list[tuple[str, str]] = [] + + def fake_get_diff(x, y): + diff_calls.append((x.id, y.id)) + return "+A-to-B contribution" + + mocker.patch("helix.merger.get_diff", side_effect=fake_get_diff) + mock_invoke = mocker.patch( + "helix.merger.invoke_claude_code", return_value=({}, {}) + ) + mocker.patch("helix.merger.snapshot_candidate", return_value="sha") + mocker.patch("helix.merger.remove_worktree") + + merge(ca, cb, "g2-m0", config, Path("/tmp")) # no ancestor + + # Single get_diff call with A↔B arguments. 
+ assert diff_calls == [("g1-s0", "g1-s1")], ( + f"expected single A↔B diff call; got {diff_calls}" + ) + + prompt_arg = mock_invoke.call_args[0][1] + assert "## Diff (B relative to A)" in prompt_arg + assert "## Diff: Candidate A relative to common ancestor" not in prompt_arg + def test_imports_mutation_error_from_mutator(self): """merger.py must reuse MutationError from mutator to avoid duplication.""" from helix.merger import MutationError as MergerME