diff --git a/src/helix/evolution.py b/src/helix/evolution.py index c2b7ee2..7115ba6 100644 --- a/src/helix/evolution.py +++ b/src/helix/evolution.py @@ -1808,12 +1808,12 @@ def _has_val_support_overlap(i: str, j: str) -> bool: ) if triplet is not None: - # GEPA parity (merge-pairing audit C3, merge.py:94-95): - # ``find_merge_triplet`` now returns the canonical - # ``(i, j)`` (lex-sorted), so ``cid_i <= cid_j`` always — - # the merge subprocess, attempted-pair ledger and the - # description-triplet dedup all see the same tuple order. - cid_i, cid_j, _ancestor_id = triplet + # GEPA parity (merge.py:94-95): ``find_merge_triplet`` + # now returns the canonical ``(i, j)`` (lex-sorted), + # so ``cid_i <= cid_j`` always — the merge subprocess, + # attempted-pair ledger and the description-triplet + # dedup all see the same tuple order. + cid_i, cid_j, ancestor_id = triplet pair_key = [cid_i, cid_j] # Resolve parent val results once; by contract the @@ -1832,8 +1832,25 @@ def _has_val_support_overlap(i: str, j: str) -> bool: a = frontier._candidates[cid_i] b = frontier._candidates[cid_j] - + # Resolve the common ancestor for the two-diff merge + # prompt (GEPA parity at the file-hunk level: feed the + # agent the same three-way structure GEPA's algorithm + # uses to attribute changes — + # ``gepa/proposer/merge.py:163-191``). The ancestor + # came from ``find_merge_triplet``; resolve it through + # the frontier's append-only candidate map. ``None`` + # is tolerated downstream — ``merge()`` falls back to + # the single A↔B diff when the ancestor isn't + # resolvable (defensive: lineage / frontier drift). + ancestor_candidate = frontier.candidates.get(ancestor_id) merge_id = budget_api.next_merge_id(state, gen) + if ancestor_candidate is None: + print_warning( + f"Merge {merge_id} ({cid_i} + {cid_j}): common " + f"ancestor {ancestor_id} not found in frontier " + f"candidate map; falling back to single A↔B " + f"diff form for this merge." + ) merged = merge( candidate_a=a, @@ -1849,6 +1866,7 @@ def _has_val_support_overlap(i: str, j: str) -> bool: cand, config, project_root ) ), + ancestor=ancestor_candidate, ) if merged is None: diff --git a/src/helix/merger.py b/src/helix/merger.py index 487ae1b..94f2677 100644 --- a/src/helix/merger.py +++ b/src/helix/merger.py @@ -61,47 +61,46 @@ def select_eval_subsample_for_merged_program( # Prompts # --------------------------------------------------------------------------- -MERGE_PROMPT_TEMPLATE = """\ -{system_prompt} - -## Objective -{objective} - -## Candidate A Strengths -{strengths_a} - -## Candidate B Strengths -{strengths_b} - -## Diff (B relative to A) -```diff -{diff} -``` - -## Background / Context -{background} - +MERGE_TASK_INSTRUCTIONS_SINGLE_DIFF = """\ ## Your Task You are merging the best aspects of Candidate A and Candidate B to create a superior combined solution that better achieves the objective. Candidate A is already checked out in your working directory. Apply the changes from Candidate B that are beneficial, and discard or adapt those that conflict or regress. -You may read, edit, create, or delete files as needed. +You may read, edit, create, or delete files as needed.""" -When you have finished making all your changes, output the exact text: -[MERGE COMPLETE] -{turn_budget}""" +MERGE_TASK_INSTRUCTIONS_TWO_DIFF = """\ +## Your Task +You are merging the best aspects of Candidate A and Candidate B to create a superior +combined solution that better achieves the objective. + +Candidate A's worktree is already checked out — A's contribution (the hunks shown in +"Diff: Candidate A relative to common ancestor") is already in place, so you do not +need to re-apply it. Your job is to bring in Candidate B's contribution (the hunks +shown in "Diff: Candidate B relative to common ancestor") wherever it is beneficial. + +For each hunk in B's diff: +- If the file is untouched by A's diff, the change is independent — apply it. +- If the file is also touched by A's diff (overlapping region), the two parents + diverged from the ancestor on the same region: reconcile the changes, picking + whichever side better serves the objective, or synthesize a combined version. + +You may read, edit, create, or delete files as needed.""" # --------------------------------------------------------------------------- # Prompt construction # --------------------------------------------------------------------------- -def _format_eval_strengths(eval_result: EvalResult | None, label: str) -> str: - """Return a human-readable summary of a candidate's eval result.""" - if eval_result is None: - return f" {label}: (no evaluation data)" +def _format_eval_strengths(eval_result: EvalResult) -> str: + """Return a human-readable summary of a candidate's eval result. + + Returns the section body only (no header); the caller is responsible + for the ``## Candidate {A,B} Strengths`` heading. Empty input is + handled by skipping the section entirely in :func:`build_merge_prompt` + rather than emitting a ``"(no evaluation data)"`` placeholder. + """ lines = [f" Aggregate score: {eval_result.aggregate_score():.4f}"] for k, v in sorted(eval_result.scores.items()): lines.append(f" {k}: {v}") @@ -122,23 +121,98 @@ def build_merge_prompt( diff: str, background: str | None = None, max_turns: int | None = None, + *, + ancestor_id: str | None = None, + diff_a_from_ancestor: str | None = None, + diff_b_from_ancestor: str | None = None, ) -> str: - """Construct the merge prompt for Claude Code.""" - strengths_a = _format_eval_strengths(eval_result_a, "Candidate A") - strengths_b = _format_eval_strengths(eval_result_b, "Candidate B") - bg = background or "(no additional background provided)" - diff_text = diff.strip() if diff.strip() else "(no diff — candidates are identical)" - - return MERGE_PROMPT_TEMPLATE.format( - system_prompt=AUTONOMOUS_SYSTEM_PROMPT, - objective=objective, - strengths_a=strengths_a, - strengths_b=strengths_b, - diff=diff_text, - background=bg, - turn_budget=_turn_budget_section(max_turns), + """Construct the merge prompt for the configured agent backend. + + Sections are emitted only when they have content, mirroring GEPA O.A.'s + ``_build_reflection_prompt_template`` accumulator pattern + (``gepa/optimize_anything.py:501-596``). + + **Diff section format** — two-diff (ancestor-relative) vs single (A↔B): + + GEPA's merge operator (``gepa/proposer/merge.py:155-203``) reasons over + *three* program states — common ancestor, candidate A, candidate B — + to attribute each component change to whichever parent diverged from + the ancestor. When ``ancestor_id`` + both ``diff_a_from_ancestor`` + and ``diff_b_from_ancestor`` are supplied, this prompt mirrors that + structure at the file-hunk level: each parent's diff against the + common ancestor is rendered as its own labelled section, so the + agent can read off "A's contribution" and "B's contribution" + directly instead of inferring three-way info from a single A↔B diff. + + When the ancestor-relative pair is not supplied (e.g. legacy callers, + tests that don't have an ancestor handy), the prompt falls back to + the single ``## Diff (B relative to A)`` section driven by ``diff``. + + Absent eval results, absent diff(s), and absent ``background`` all + skip their respective sections entirely instead of emitting + placeholder strings. + """ + sections: list[str] = [AUTONOMOUS_SYSTEM_PROMPT.rstrip()] + + if objective: + sections.append(f"## Objective\n{objective}") + + if eval_result_a is not None: + sections.append( + "## Candidate A Strengths\n" + _format_eval_strengths(eval_result_a) + ) + + if eval_result_b is not None: + sections.append( + "## Candidate B Strengths\n" + _format_eval_strengths(eval_result_b) + ) + + # Diff section — prefer the two-diff (ancestor-relative) form when + # both diffs are available, fall back to single A↔B otherwise. Each + # branch independently honors the "omit when empty" invariant. + use_two_diff = ( + ancestor_id is not None + and diff_a_from_ancestor is not None + and diff_b_from_ancestor is not None + ) + if use_two_diff: + diff_a_stripped = (diff_a_from_ancestor or "").strip() + diff_b_stripped = (diff_b_from_ancestor or "").strip() + if diff_a_stripped: + sections.append( + f"## Diff: Candidate A relative to common ancestor {ancestor_id}\n" + f"```diff\n{diff_a_stripped}\n```" + ) + if diff_b_stripped: + sections.append( + f"## Diff: Candidate B relative to common ancestor {ancestor_id}\n" + f"```diff\n{diff_b_stripped}\n```" + ) + else: + diff_stripped = diff.strip() + if diff_stripped: + sections.append( + f"## Diff (B relative to A)\n```diff\n{diff_stripped}\n```" + ) + + if background: + sections.append(f"## Background / Context\n{background}") + + # Task instructions vary by diff form. Two-diff form gets explicit + # guidance on what A's contribution vs B's contribution means and how + # to reason about overlapping vs disjoint hunks; single-diff form + # keeps the legacy "apply B's changes" framing. + sections.append( + MERGE_TASK_INSTRUCTIONS_TWO_DIFF if use_two_diff + else MERGE_TASK_INSTRUCTIONS_SINGLE_DIFF ) + turn_budget = _turn_budget_section(max_turns) + if turn_budget: + sections.append(turn_budget.strip()) + + return "\n\n".join(sections) + "\n" + # --------------------------------------------------------------------------- # High-level merge entry point @@ -155,22 +229,40 @@ def merge( eval_result_a: EvalResult | None = None, eval_result_b: EvalResult | None = None, prepare_worktree: Callable[[Candidate], None] | None = None, + ancestor: Candidate | None = None, ) -> Candidate | None: """Merge *candidate_a* and *candidate_b* using Claude Code. - Clones *candidate_a*, computes the diff to *candidate_b*, builds a merge + Clones *candidate_a*, computes the relevant diffs, builds a merge prompt, and invokes Claude Code. Snapshots on success; removes the worktree and returns ``None`` on failure. + Two diff-rendering modes, controlled by the optional ``ancestor`` + argument: + + * **Two-diff (ancestor-relative)** — when ``ancestor`` is provided, + computes ``get_diff(ancestor, candidate_a)`` and + ``get_diff(ancestor, candidate_b)`` and renders both as separately + labelled sections in the prompt. The agent can then attribute + each hunk to whichever parent diverged from the common ancestor — + file-hunk-level analogue of GEPA's component-wise attribution + (``gepa/proposer/merge.py:163-191``: ``if pred_anc == pred_id1 …`` + → take id2's version; ``elif pred_anc != pred_id1 and pred_anc != + pred_id2`` → tiebreak by score). + * **Single (A↔B)** — fallback when no ancestor is provided. + Computes ``get_diff(candidate_a, candidate_b)`` and renders a + single ``## Diff (B relative to A)`` section. The agent has to + infer three-way info from a two-way comparison. + GEPA-parity note: this is the correct domain adaptation of GEPA's text-component merge (``gepa/proposer/merge.py:155-203``) for HELIX's full-codebase setting. GEPA can splice ``dict[str, str]`` programs deterministically by swapping components from each parent; HELIX candidates are full git worktrees, where syntactic per-component swap - is undefined, so an LLM-mediated edit is the only viable approach. - The surrounding trigger / parent-selection / subsample / acceptance / - full-val logic in :mod:`helix.evolution` mirrors GEPA's - ``MergeProposer`` and ``GEPAEngine`` verbatim. + is undefined, so an LLM-mediated edit is the only viable approach — + but feeding the agent the three-way diff structure GEPA's algorithm + uses (two ancestor-relative diffs instead of one A↔B diff) gives it + the same shape of attribution information. Parameters ---------- @@ -190,6 +282,15 @@ def merge( Evaluation result for candidate A (optional, for richer prompt). eval_result_b: Evaluation result for candidate B (optional, for richer prompt). + prepare_worktree: + Optional callback to refresh protected files in the new worktree + before the agent runs. + ancestor: + Optional most-recent common ancestor of A and B (typically + ``frontier.candidates[ancestor_id]`` where ``ancestor_id`` came + from :func:`helix.lineage.find_merge_triplet`). When supplied, + the prompt uses the two-diff (ancestor-relative) form; when + ``None``, falls back to the single A↔B diff. Returns ------- @@ -202,15 +303,31 @@ def merge( if prepare_worktree is not None: prepare_worktree(child) - diff = get_diff(candidate_a, candidate_b) + # Diff-rendering mode selection. ``ancestor`` available → compute + # the two ancestor-relative diffs that drive the GEPA-style + # attribution prompt. Otherwise compute the single A↔B fallback. + if ancestor is not None: + diff_a_from_ancestor: str | None = get_diff(ancestor, candidate_a) + diff_b_from_ancestor: str | None = get_diff(ancestor, candidate_b) + # ``diff`` (single A↔B) is computed lazily only for the fallback + # path; with both ancestor-relative diffs in hand, the prompt + # builder ignores the legacy parameter, so pass an empty string. + legacy_diff = "" + else: + diff_a_from_ancestor = None + diff_b_from_ancestor = None + legacy_diff = get_diff(candidate_a, candidate_b) prompt = build_merge_prompt( config.objective, eval_result_a, eval_result_b, - diff, + legacy_diff, background, config.agent.max_turns, + ancestor_id=ancestor.id if ancestor is not None else None, + diff_a_from_ancestor=diff_a_from_ancestor, + diff_b_from_ancestor=diff_b_from_ancestor, ) try: diff --git a/src/helix/mutator.py b/src/helix/mutator.py index 813b59c..6e1462d 100644 --- a/src/helix/mutator.py +++ b/src/helix/mutator.py @@ -44,7 +44,6 @@ - Do not request confirmation or clarification; choose a reasonable approach and continue. - If one approach fails, try an alternative and keep progressing. - Use available tools to inspect, edit, and validate changes. -- When finished, print exactly: [MUTATION COMPLETE] """ SEEDLESS_INIT_PROMPT_TEMPLATE = """\ @@ -59,43 +58,63 @@ Generate a strong initial candidate based on the goal above. Create all necessary files directly in the current working directory. Make your implementation complete and ready to be evaluated immediately. - -When you have finished creating all files, output the exact text: -[SEED GENERATION COMPLETE] {turn_budget}""" -MUTATION_PROMPT_TEMPLATE = """\ -{system_prompt} - -## Objective -{objective} - -## Current Evaluation Scores -{scores} - -{diagnostics_section}{evaluator_notes_section}{evaluator_output_section}{extra_asi_section}## Background / Context -{background} - +MUTATION_TASK_INSTRUCTIONS = """\ ## Your Task Analyse the evaluation results above and improve the code to better achieve the objective. -Make targeted, meaningful changes. You may read, edit, create, or delete files as needed. - -When you have finished making all your changes, output the exact text: -[MUTATION COMPLETE] -{turn_budget}""" +Make targeted, meaningful changes. You may read, edit, create, or delete files as needed.""" # --------------------------------------------------------------------------- # Prompt construction # --------------------------------------------------------------------------- +def _indefinite_article(n: int) -> str: + """Pick ``"a"`` or ``"an"`` to match the spoken pronunciation of ``n``. + + Within HELIX's realistic max-turns range (1 ≤ n ≤ ~1000), the + vowel-leading numbers are 8, 11, 18, and the 80s / 800s. Everything + else takes ``"a"``. Handles the visible ``"You have a 8-turn limit"`` + article-agreement glitch without imposing a full English-number + pronunciation rule on the codebase. + """ + s = str(abs(n)) + if s in {"11", "18"}: + return "an" + if s.startswith("8"): # 8, 80-89, 800-899, ... + return "an" + return "a" + + def _turn_budget_section(max_turns: int | None) -> str: - """Return the turn budget prompt section, or empty string if unbounded.""" + """Return the turn budget prompt section, or empty string if unbounded. + + Enforcement semantics differ by backend: + + * ``claude`` — hard cap. HELIX passes ``--max-turns N`` to the Claude + Code CLI (``_build_cli_args``), the runtime kills the session at the + limit, and the resulting ``subtype="error_max_turns"`` response is + detected at :func:`invoke_claude_code` and treated as partial + success. + * ``codex`` / ``cursor`` / ``gemini`` / ``opencode`` — soft hint only. + None of these CLIs expose an equivalent flag (verified against + ``--help`` for the installed binaries), so the in-prompt request is + the only signal the agent receives. Whether the agent self-honors + the limit depends entirely on its own behaviour. + + The section is therefore emitted for every backend regardless — it + still has some value as a soft hint — but callers depending on hard + enforcement should set the budget low enough to also be enforced via + subprocess-level mechanisms (wall-clock timeout, sandbox limits) or + use the Claude backend. + """ if max_turns is None: return "" + article = _indefinite_article(max_turns) return ( f"\n## Turn Budget\n" - f"You have a {max_turns}-turn limit for this task, where turns refer to " + f"You have {article} {max_turns}-turn limit for this task, where turns refer to " f"how many tool calls or interactions you can make. Plan your work " f"accordingly — prioritize the highest-impact changes first and be " f"efficient with your tool usage.\n" @@ -142,12 +161,13 @@ def _evaluator_failed(eval_result: EvalResult) -> bool: return eval_result.asi.get("_returncode") not in (None, "0", 0) -# All ``_render_*`` helpers below share a strict invariant: they return -# either the empty string (the section is suppressed) or a block of -# text that ends with a single trailing blank line (``"\n\n"``). The -# ``MUTATION_PROMPT_TEMPLATE`` chains them together verbatim and relies -# on this convention to keep section spacing consistent. Update both -# the helper and the template if you change it. +# All ``_render_*`` helpers below return either the empty string (the +# section is suppressed) or a fully-formed Markdown section. Empty +# returns let ``build_mutation_prompt`` skip the section entirely — +# mirrors GEPA's ``_build_reflection_prompt_template`` accumulator +# pattern (``gepa/optimize_anything.py:501-596``). Non-empty returns +# may carry trailing whitespace; ``build_mutation_prompt`` rstrips +# before joining sections with a uniform blank-line separator. def _render_evaluator_notes(eval_result: EvalResult) -> str: @@ -158,25 +178,53 @@ def _render_evaluator_notes(eval_result: EvalResult) -> str: def _render_evaluator_output_fallback(eval_result: EvalResult) -> str: - """Render stdout/stderr only when they are useful fallback diagnostics.""" - has_notes = bool(eval_result.asi.get("log", "").strip()) - include_streams = _evaluator_failed(eval_result) or ( - not _has_structured_diagnostics(eval_result) and not has_notes - ) - if not include_streams: - return "" - + """Render ``## Evaluator Output`` from stdout/stderr, or ``""``. + + Two distinct cases: + + * **Evaluator subprocess failed** (non-zero exit) — always emit the + section. Empty streams render with ``(no stdout)`` / ``(no stderr)`` + placeholders here intentionally: the agent needs to know the + evaluator failed but produced no output to inspect (a meaningful + diagnostic on its own). + + * **Evaluator succeeded** — only emit the section when at least one + stream has content *and* no richer diagnostic surface + (``log`` notes or structured side_info) exists. Empty streams are + omitted entirely; partial coverage (only ``stdout`` non-empty, or + only ``stderr``) renders just the present sub-section instead of + padding the other with a ``(no X)`` placeholder. + """ stdout = _strip_machine_protocol_from_evaluator_stream( eval_result.asi.get("stdout", "") ) stderr = _strip_machine_protocol_from_evaluator_stream( eval_result.asi.get("stderr", "") ) - if not stdout: - stdout = "(no stdout)" - if not stderr: - stderr = "(no stderr)" - return f"## Evaluator Output\n\n### stdout\n{stdout}\n\n### stderr\n{stderr}\n\n" + + if _evaluator_failed(eval_result): + stdout_text = stdout or "(no stdout)" + stderr_text = stderr or "(no stderr)" + return ( + f"## Evaluator Output\n\n" + f"### stdout\n{stdout_text}\n\n" + f"### stderr\n{stderr_text}" + ) + + # Evaluator succeeded — defer to richer surfaces when they exist, + # otherwise surface only the streams that actually have content. + has_notes = bool(eval_result.asi.get("log", "").strip()) + if _has_structured_diagnostics(eval_result) or has_notes: + return "" + + parts: list[str] = [] + if stdout: + parts.append(f"### stdout\n{stdout}") + if stderr: + parts.append(f"### stderr\n{stderr}") + if not parts: + return "" + return "## Evaluator Output\n\n" + "\n\n".join(parts) def build_seed_generation_prompt( @@ -401,122 +449,128 @@ def _render_per_example_diagnostics( return "\n".join(lines) + "\n" -def build_mutation_prompt( - objective: str, - eval_result: EvalResult, - background: str | None = None, - max_turns: int | None = None, -) -> str: - """Construct the mutation prompt for Claude Code.""" - scores_text = "\n".join( - f" {k}: {v}" for k, v in sorted(eval_result.scores.items()) - ) - if not scores_text: - scores_text = " (no scores recorded)" - - # Collect any extra_N entries from ASI. The reserved keys here - # are surfaced through dedicated prompt sections (or the - # ``EvalResult.evaluator_returncode`` typed field, in the case of - # the historical ``_returncode`` legacy key) and must never leak - # into the catch-all "extra" rendering. - extra_entries = { +def _render_scores_section(eval_result: EvalResult) -> str: + """Render ``## Current Evaluation Scores`` or ``""`` when no scores exist. + + Mirrors GEPA O.A.'s "only emit a section when there is content for it" + pattern (``gepa/optimize_anything.py:501-596``). Previously HELIX + emitted the section with a ``"(no scores recorded)"`` placeholder; now + the section header is omitted entirely so the agent never sees a stub. + """ + lines = [f" {k}: {v}" for k, v in sorted(eval_result.scores.items())] + if not lines: + return "" + return "## Current Evaluation Scores\n" + "\n".join(lines) + + +def _render_extra_asi(eval_result: EvalResult) -> str: + """Render any free-form ``extra_*`` ASI keys, or ``""`` when none exist. + + Reserved keys (``stdout``, ``stderr``, ``error``, ``log``, ``_returncode``) + are filtered out — they're surfaced through dedicated sections + (``## Evaluator Notes``, ``## Evaluator Output``) or the + ``_returncode`` legacy sentinel — and must never leak into this + catch-all rendering. + """ + entries = { k: v for k, v in sorted(eval_result.asi.items()) if k not in ("stdout", "stderr", "error", "log", "_returncode") } - if extra_entries: - extra_lines = "\n".join(f"### {k}\n{v}" for k, v in extra_entries.items()) - extra_asi_section = f"### Extra Evaluator Info\n{extra_lines}\n\n" - else: - extra_asi_section = "" - - # Render side_info diagnostics. Precedence: - # 1. ``eval_result.per_example_side_info`` (new per-example GEPA - # O.A. contract — list of dicts positional to instance_scores - # ids) when populated; mirrors GEPA's - # ``OptimizeAnythingAdapter.make_reflective_dataset`` at - # ``optimize_anything_adapter.py:524-553`` combined with - # ``format_samples`` at - # ``gepa/strategies/instruction_proposal.py:54-95``. - # 2. ``eval_result.side_info`` (legacy batch-level dict) when - # ``per_example_side_info`` is absent — unchanged rendering - # for non-``helix_result`` paths that still populate the - # legacy field. - # 3. No diagnostics section otherwise. - diagnostics_section = "" + if not entries: + return "" + body = "\n".join(f"### {k}\n{v}" for k, v in entries.items()) + return f"### Extra Evaluator Info\n{body}" + + +def _render_diagnostics(eval_result: EvalResult) -> str: + """Render the ``## Diagnostics`` section, or ``""`` when no side_info. + + Precedence: + 1. ``eval_result.per_example_side_info`` (per-example GEPA O.A. + contract — list of dicts positional to instance_scores ids) when + populated; mirrors GEPA's + ``OptimizeAnythingAdapter.make_reflective_dataset`` combined + with ``format_samples`` at + ``gepa/strategies/instruction_proposal.py:54-95``. + 2. ``eval_result.side_info`` (legacy batch-level dict) when + per-example data is absent. + 3. Empty string when neither is present. + """ if eval_result.per_example_side_info is not None: # Monotonic markdown hierarchy under the surrounding # ``## Diagnostics`` (h2): each example is ``### Example `` # (h3), each side_info key is ``#### {key}`` (h4), nested - # values bump further. Before this the Example header was - # ``#`` (h1), which inverted the hierarchy and confused - # markdown-aware tooling / LLM markdown parsers. - diagnostics_section = _render_per_example_diagnostics( + # values bump further. + return _render_per_example_diagnostics( example_ids=list(eval_result.instance_scores.keys()), per_example_side_info=eval_result.per_example_side_info, example_header_level=3, key_header_level=4, ) - elif eval_result.side_info is not None: + if eval_result.side_info is not None: diag_lines = "\n".join( f" {k}: {v}" for k, v in sorted(eval_result.side_info.items()) ) - diagnostics_section = f"## Diagnostics\n{diag_lines}\n\n" + return f"## Diagnostics\n{diag_lines}" + return "" - bg = background or "(no additional background provided)" - return MUTATION_PROMPT_TEMPLATE.format( - system_prompt=AUTONOMOUS_SYSTEM_PROMPT, - objective=objective, - scores=scores_text, - evaluator_notes_section=_render_evaluator_notes(eval_result), - evaluator_output_section=_render_evaluator_output_fallback(eval_result), - extra_asi_section=extra_asi_section, - diagnostics_section=diagnostics_section, - background=bg, - turn_budget=_turn_budget_section(max_turns), - ) +def build_mutation_prompt( + objective: str, + eval_result: EvalResult, + background: str | None = None, + max_turns: int | None = None, +) -> str: + """Construct the mutation prompt for the configured agent backend. + + Sections are emitted only when they have content, mirroring GEPA O.A.'s + ``_build_reflection_prompt_template`` accumulator pattern + (``gepa/optimize_anything.py:501-596``). Empty ``objective``, empty + ``eval_result.scores``, absent diagnostics, absent evaluator notes, + absent stdout/stderr fallback, absent extra ASI, and absent + ``background`` all skip their respective sections entirely instead of + rendering placeholder strings like ``"(no additional background + provided)"`` or ``"(no scores recorded)"`` that taught nothing. + + ``## Your Task`` and the system prompt are always emitted; they are + the only sections that don't depend on caller-provided content. + """ + sections: list[str] = [AUTONOMOUS_SYSTEM_PROMPT.rstrip()] + if objective: + sections.append(f"## Objective\n{objective}") -# --------------------------------------------------------------------------- -# Mutation summary parsing -# --------------------------------------------------------------------------- + scores = _render_scores_section(eval_result) + if scores: + sections.append(scores) + diagnostics = _render_diagnostics(eval_result) + if diagnostics: + sections.append(diagnostics.rstrip()) -def parse_mutation_summary(output: str) -> dict[str, str]: - """Parse a ``[SUMMARY]...[END SUMMARY]`` block from Claude Code output. + notes = _render_evaluator_notes(eval_result) + if notes: + sections.append(notes.rstrip()) - Extracts structured key-value pairs written by the mutation/merge agent - after ``[MUTATION COMPLETE]`` or ``[MERGE COMPLETE]``. The block format - is:: + output_fallback = _render_evaluator_output_fallback(eval_result) + if output_fallback: + sections.append(output_fallback.rstrip()) - [SUMMARY] - files_changed: src/foo.py, src/bar.py - root_cause: ... - changes_made: ... - [END SUMMARY] + extra_asi = _render_extra_asi(eval_result) + if extra_asi: + sections.append(extra_asi) - Returns - ------- - dict[str, str] - Parsed key-value pairs. Returns an empty dict if no block is found - or the block contains no valid ``key: value`` lines. Never raises. - """ - result: dict[str, str] = {} - in_block = False - for line in output.splitlines(): - if "[SUMMARY]" in line: - in_block = True - continue - if "[END SUMMARY]" in line: - break - if in_block and ":" in line: - key, _, value = line.partition(":") - key = key.strip() - value = value.strip() - if key: - result[key] = value - return result + if background: + sections.append(f"## Background / Context\n{background}") + + sections.append(MUTATION_TASK_INSTRUCTIONS) + + turn_budget = _turn_budget_section(max_turns) + if turn_budget: + sections.append(turn_budget.strip()) + + return "\n\n".join(sections) + "\n" # --------------------------------------------------------------------------- diff --git a/tests/unit/test_merger.py b/tests/unit/test_merger.py index 614e8fc..5176916 100644 --- a/tests/unit/test_merger.py +++ b/tests/unit/test_merger.py @@ -71,13 +71,15 @@ def test_contains_background(self): prompt = build_merge_prompt("goal", None, None, "", background="special bg") assert "special bg" in prompt - def test_default_background_when_none(self): - prompt = build_merge_prompt("goal", None, None, "") - assert "no additional background" in prompt - - def test_contains_merge_complete_marker(self): + def test_background_section_omitted_when_none(self): + """GEPA parity: empty optional inputs skip the section entirely + instead of emitting a placeholder. Mirrors GEPA O.A.'s + ``_build_reflection_prompt_template`` (optimize_anything.py:501-596), + which only appends a section when its content is non-empty. + """ prompt = build_merge_prompt("goal", None, None, "") - assert "[MERGE COMPLETE]" in prompt + assert "## Background / Context" not in prompt + assert "no additional background" not in prompt def test_contains_execution_instructions(self): prompt = build_merge_prompt("goal", None, None, "") @@ -90,14 +92,122 @@ def test_includes_eval_scores_when_provided(self): assert "0.8" in prompt assert "0.9" in prompt - def test_handles_none_eval_results(self): - # Should not raise even with no eval data + def test_strengths_sections_omitted_when_eval_results_none(self): + """``eval_result_a`` / ``eval_result_b`` of ``None`` → the + corresponding ``## Candidate {A,B} Strengths`` section is skipped + entirely (no ``"(no evaluation data)"`` placeholder). + """ prompt = build_merge_prompt("goal", None, None, "some diff") - assert "no evaluation data" in prompt - - def test_empty_diff_shows_fallback(self): + assert "## Candidate A Strengths" not in prompt + assert "## Candidate B Strengths" not in prompt + assert "no evaluation data" not in prompt + + def test_diff_section_omitted_when_empty(self): + """An empty diff → ``## Diff (B relative to A)`` section is + skipped entirely (no ``"(no diff — candidates are identical)"`` + placeholder). Upstream callers should not invoke merge on + identical candidates; this test pins the prompt-level fallback. + """ prompt = build_merge_prompt("goal", None, None, "") - assert "identical" in prompt or "no diff" in prompt + assert "## Diff" not in prompt + assert "identical" not in prompt + assert "no diff" not in prompt + + +class TestBuildMergePromptTwoDiff: + """GEPA-parity two-diff form: when ``ancestor_id`` and both + ancestor-relative diffs are supplied, the prompt emits two labelled + sections instead of the single A↔B diff. Mirrors GEPA's + three-way attribution reasoning (``gepa/proposer/merge.py:163-191``) + at the file-hunk level — agent reads off A's contribution and B's + contribution directly instead of inferring three-way info from a + two-way comparison. + """ + + def test_emits_two_ancestor_relative_sections(self): + prompt = build_merge_prompt( + "goal", None, None, "", + ancestor_id="g0-s0", + diff_a_from_ancestor="+self.bn = nn.BatchNorm1d(...)\n", + diff_b_from_ancestor="+if epoch < 10: lr = 0.001\n", + ) + assert "## Diff: Candidate A relative to common ancestor g0-s0" in prompt + assert "## Diff: Candidate B relative to common ancestor g0-s0" in prompt + assert "self.bn = nn.BatchNorm1d" in prompt + assert "if epoch < 10" in prompt + # Fallback section MUST NOT also render — the agent should see + # exactly the two ancestor-relative diffs, not three sections. + assert "## Diff (B relative to A)" not in prompt + + def test_two_diff_form_uses_two_diff_task_block(self): + """The two-diff form pairs the ancestor-relative diffs with a + task block that explicitly tells the agent A's contribution is + already in the working tree (so it doesn't re-apply it) and B's + contribution is what needs to be brought in. Pinning this + substring prevents a future refactor from accidentally swapping + back to the single-diff task block, which would confuse the + agent about whether A's diff needs to be re-applied. + """ + prompt = build_merge_prompt( + "goal", None, None, "", + ancestor_id="g0-s0", + diff_a_from_ancestor="+a contribution\n", + diff_b_from_ancestor="+b contribution\n", + ) + assert "A's contribution (the hunks shown in" in prompt + # The single-diff task block's "Apply the changes from Candidate B" + # phrasing must NOT appear in the two-diff form. + assert "Apply the changes from\nCandidate B that are beneficial" not in prompt + + def test_single_diff_form_uses_single_diff_task_block(self): + """Companion to the two-diff task-block test: the single-diff + fallback must keep the legacy task framing intact. + """ + prompt = build_merge_prompt("goal", None, None, "+some diff") + assert "Apply the changes from\nCandidate B" in prompt + # The two-diff task block's phrasing must NOT appear in the + # single-diff form. + assert "A's contribution (the hunks shown in" not in prompt + + def test_single_diff_fallback_when_ancestor_missing(self): + """No ancestor + no ancestor diffs → single A↔B diff form + (backward-compatible default). + """ + prompt = build_merge_prompt("goal", None, None, "+some diff content") + assert "## Diff (B relative to A)" in prompt + assert "+some diff content" in prompt + # Two-diff section headers must NOT appear in the fallback path. + assert "relative to common ancestor" not in prompt + + def test_single_diff_fallback_when_ancestor_id_only(self): + """Defensive: an ``ancestor_id`` without both diffs falls back + to the single A↔B path. Prevents a half-configured caller from + emitting an "ancestor header pointing to nothing" prompt. + """ + prompt = build_merge_prompt( + "goal", None, None, "+legacy diff", + ancestor_id="g0-s0", + diff_a_from_ancestor=None, + diff_b_from_ancestor=None, + ) + assert "## Diff (B relative to A)" in prompt + assert "+legacy diff" in prompt + assert "relative to common ancestor" not in prompt + + def test_two_diff_form_omits_empty_side(self): + """If one of the ancestor-relative diffs is empty (one parent + didn't change anything relative to the ancestor), only the + non-empty side renders. Edge case: a parent might "improve" via + metadata changes that ``git diff`` doesn't see. + """ + prompt = build_merge_prompt( + "goal", None, None, "", + ancestor_id="g0-s0", + diff_a_from_ancestor="+self.bn = nn.BatchNorm1d(...)\n", + diff_b_from_ancestor="", + ) + assert "## Diff: Candidate A relative to common ancestor g0-s0" in prompt + assert "## Diff: Candidate B relative to common ancestor g0-s0" not in prompt # --------------------------------------------------------------------------- @@ -253,6 +363,87 @@ def test_passes_background_to_prompt(self, mocker): prompt_arg = mock_invoke.call_args[0][1] assert "unique_context_xyz" in prompt_arg + def test_ancestor_triggers_two_diff_form(self, mocker): + """When ``merge()`` receives an ``ancestor`` argument, it must + call ``get_diff`` twice (ancestor→A and ancestor→B) and the + rendered prompt must contain both ancestor-relative diff + sections instead of the single A↔B diff. + """ + ancestor = make_candidate("g0-s0") + ca = make_candidate("g1-s0") + cb = make_candidate("g1-s1") + config = make_config() + + child = make_candidate("g2-m0") + mocker.patch("helix.merger.clone_candidate", return_value=child) + + # get_diff is called per-side; sequence captures which two diffs + # were requested so we can assert the right pair of arguments + # was passed (ancestor→A and ancestor→B, NOT A↔B). + diff_calls: list[tuple[str, str]] = [] + + def fake_get_diff(x, y): + diff_calls.append((x.id, y.id)) + return f"+contribution from {y.id}" + + mocker.patch("helix.merger.get_diff", side_effect=fake_get_diff) + mock_invoke = mocker.patch( + "helix.merger.invoke_claude_code", return_value=({}, {}) + ) + mocker.patch("helix.merger.snapshot_candidate", return_value="sha") + mocker.patch("helix.merger.remove_worktree") + + merge(ca, cb, "g2-m0", config, Path("/tmp"), ancestor=ancestor) + + # Two get_diff calls, both anchored on the ancestor. + assert diff_calls == [("g0-s0", "g1-s0"), ("g0-s0", "g1-s1")], ( + f"expected two ancestor-relative diff calls; got {diff_calls}" + ) + + # Prompt must contain both ancestor-relative section headers + # and neither the legacy A↔B header nor any leftover placeholder. + prompt_arg = mock_invoke.call_args[0][1] + assert "## Diff: Candidate A relative to common ancestor g0-s0" in prompt_arg + assert "## Diff: Candidate B relative to common ancestor g0-s0" in prompt_arg + assert "## Diff (B relative to A)" not in prompt_arg + assert "contribution from g1-s0" in prompt_arg + assert "contribution from g1-s1" in prompt_arg + + def test_no_ancestor_uses_single_diff_form(self, mocker): + """Backward-compat: ``merge()`` without an ``ancestor`` argument + still computes a single A↔B diff and renders the legacy section. + """ + ca = make_candidate("g1-s0") + cb = make_candidate("g1-s1") + config = make_config() + + child = make_candidate("g2-m0") + mocker.patch("helix.merger.clone_candidate", return_value=child) + + diff_calls: list[tuple[str, str]] = [] + + def fake_get_diff(x, y): + diff_calls.append((x.id, y.id)) + return "+A-to-B contribution" + + mocker.patch("helix.merger.get_diff", side_effect=fake_get_diff) + mock_invoke = mocker.patch( + "helix.merger.invoke_claude_code", return_value=({}, {}) + ) + mocker.patch("helix.merger.snapshot_candidate", return_value="sha") + mocker.patch("helix.merger.remove_worktree") + + merge(ca, cb, "g2-m0", config, Path("/tmp")) # no ancestor + + # Single get_diff call with A↔B arguments. + assert diff_calls == [("g1-s0", "g1-s1")], ( + f"expected single A↔B diff call; got {diff_calls}" + ) + + prompt_arg = mock_invoke.call_args[0][1] + assert "## Diff (B relative to A)" in prompt_arg + assert "## Diff: Candidate A relative to common ancestor" not in prompt_arg + def test_imports_mutation_error_from_mutator(self): """merger.py must reuse MutationError from mutator to avoid duplication.""" from helix.merger import MutationError as MergerME diff --git a/tests/unit/test_mutator.py b/tests/unit/test_mutator.py index e3fb7ae..b6552f1 100644 --- a/tests/unit/test_mutator.py +++ b/tests/unit/test_mutator.py @@ -116,25 +116,31 @@ def test_contains_background(self): prompt = build_mutation_prompt("goal", er, background="special context here") assert "special context here" in prompt - def test_default_background_when_none(self): + def test_background_section_omitted_when_none(self): + """GEPA parity: empty optional inputs skip the section entirely + instead of emitting a placeholder. Mirrors GEPA O.A.'s + ``_build_reflection_prompt_template`` (optimize_anything.py:501-596), + which only appends a section when its content is non-empty. + """ er = make_eval_result() prompt = build_mutation_prompt("goal", er, background=None) - assert "no additional background" in prompt - - def test_contains_mutation_complete_marker(self): - er = make_eval_result() - prompt = build_mutation_prompt("goal", er) - assert "[MUTATION COMPLETE]" in prompt + assert "## Background / Context" not in prompt + assert "no additional background" not in prompt def test_contains_execution_instructions(self): er = make_eval_result() prompt = build_mutation_prompt("goal", er) assert "Task instructions:" in prompt - def test_no_scores_fallback(self): + def test_scores_section_omitted_when_empty(self): + """No ``eval_result.scores`` → ``## Current Evaluation Scores`` + section is skipped entirely (no ``"(no scores recorded)"`` + placeholder). + """ er = make_eval_result(scores={}) prompt = build_mutation_prompt("goal", er) - assert "no scores recorded" in prompt + assert "## Current Evaluation Scores" not in prompt + assert "no scores recorded" not in prompt def test_renders_helix_log_notes(self): er = make_eval_result( @@ -207,6 +213,41 @@ def test_returncode_sentinel_does_not_leak_into_prompt(self): assert "real-extra" in prompt +class TestTurnBudgetArticleAgreement: + """``_turn_budget_section`` must use the correct indefinite article + ("a" vs "an") so the rendered prompt reads naturally: + + * ``"You have a 5-turn limit"`` (consonant sound — "five") + * ``"You have an 8-turn limit"`` (vowel sound — "eight") + + Pre-fix the article was always hardcoded ``"a"``, producing + ``"You have a 8-turn limit"`` for the 8/11/18/80s cases. + """ + + def test_consonant_leading_numbers_use_a(self) -> None: + from helix.mutator import _turn_budget_section + + for n in (1, 2, 3, 4, 5, 6, 7, 9, 10, 12, 17, 19, 20, 50, 100, 200): + section = _turn_budget_section(n) + assert f"You have a {n}-turn limit" in section, ( + f"max_turns={n} should use 'a', section was: {section!r}" + ) + + def test_vowel_leading_numbers_use_an(self) -> None: + from helix.mutator import _turn_budget_section + + for n in (8, 11, 18, 80, 85, 88, 800, 888): + section = _turn_budget_section(n) + assert f"You have an {n}-turn limit" in section, ( + f"max_turns={n} should use 'an', section was: {section!r}" + ) + + def test_none_returns_empty(self) -> None: + from helix.mutator import _turn_budget_section + + assert _turn_budget_section(None) == "" + + class TestPerExampleDiagnostics: """``build_mutation_prompt`` renders ``eval_result.per_example_side_info`` as the Diagnostics section under the new GEPA O.A. contract @@ -557,7 +598,7 @@ def test_artifact_contains_rendered_prompt(self, tmp_path: Path, mocker): # Objective + autonomous-rules block are both in the rendered # prompt via build_mutation_prompt. assert config.objective in content - assert "[MUTATION COMPLETE]" in content + assert "Task instructions:" in content def test_gitignore_excludes_helix_artifacts(self, tmp_path: Path, mocker): """``.gitignore`` in the worktree gains entries for the prompt diff --git a/tests/unit/test_mutator_seedless.py b/tests/unit/test_mutator_seedless.py index f8fd9ab..b9734e9 100644 --- a/tests/unit/test_mutator_seedless.py +++ b/tests/unit/test_mutator_seedless.py @@ -70,11 +70,6 @@ def test_no_evaluator_section_when_absent(self): ) assert "Evaluator" not in prompt - def test_completion_signal_in_prompt(self): - """The completion signal string must appear in the prompt.""" - prompt = build_seed_generation_prompt(objective="Test objective") - assert "[SEED GENERATION COMPLETE]" in prompt - def test_returns_string(self): """build_seed_generation_prompt must return a str.""" result = build_seed_generation_prompt(objective="Test") diff --git a/tests/unit/test_semlog.py b/tests/unit/test_semlog.py deleted file mode 100644 index 33a058e..0000000 --- a/tests/unit/test_semlog.py +++ /dev/null @@ -1,155 +0,0 @@ -"""Unit tests for semantic mutation logging (semlog) functionality.""" - -from __future__ import annotations - - - -from helix.mutator import parse_mutation_summary - - -# --------------------------------------------------------------------------- -# Tests: parse_mutation_summary -# --------------------------------------------------------------------------- - - -class TestParseMutationSummaryComplete: - """test_parse_mutation_summary_complete: full block parsed correctly.""" - - def test_all_mutation_fields_parsed(self): - output = """\ -Some Claude output here... -[MUTATION COMPLETE] -[SUMMARY] -files_changed: src/model.py, src/utils.py -root_cause: The model was not normalising input features before training. -changes_made: Added a StandardScaler preprocessing step. Updated the training pipeline. -reasoning: Normalisation is the most direct fix for scale-sensitive algorithms. -expected_impact: Expect accuracy to improve by 5-10% based on typical gains from this fix. -[END SUMMARY] -""" - result = parse_mutation_summary(output) - - assert result["files_changed"] == "src/model.py, src/utils.py" - assert result["root_cause"] == "The model was not normalising input features before training." - assert "StandardScaler" in result["changes_made"] - assert result["reasoning"] == "Normalisation is the most direct fix for scale-sensitive algorithms." - assert "5-10%" in result["expected_impact"] - - def test_returns_dict_type(self): - output = "[SUMMARY]\nfiles_changed: foo.py\n[END SUMMARY]" - result = parse_mutation_summary(output) - assert isinstance(result, dict) - - def test_keys_are_stripped(self): - output = "[SUMMARY]\n files_changed : foo.py\n[END SUMMARY]" - result = parse_mutation_summary(output) - assert "files_changed" in result - - def test_values_are_stripped(self): - output = "[SUMMARY]\nfiles_changed: bar.py \n[END SUMMARY]" - result = parse_mutation_summary(output) - assert result["files_changed"] == "bar.py" - - def test_value_with_colon_preserved(self): - """Values containing ':' should keep everything after the first ': '.""" - output = "[SUMMARY]\nroot_cause: Issue in module: core/parser.py\n[END SUMMARY]" - result = parse_mutation_summary(output) - assert result["root_cause"] == "Issue in module: core/parser.py" - - def test_stops_at_end_summary(self): - """Lines after [END SUMMARY] should not be parsed.""" - output = ( - "[SUMMARY]\nfiles_changed: a.py\n[END SUMMARY]\nextra_key: should_not_appear\n" - ) - result = parse_mutation_summary(output) - assert "extra_key" not in result - - -class TestParseMutationSummaryMissingBlock: - """test_parse_mutation_summary_missing_block: returns empty dict gracefully (never crash).""" - - def test_returns_empty_dict_when_no_block(self): - output = "Claude Code output with no summary block at all." - result = parse_mutation_summary(output) - assert result == {} - - def test_returns_empty_dict_on_empty_string(self): - result = parse_mutation_summary("") - assert result == {} - - def test_returns_empty_dict_on_partial_block_no_end(self): - """[SUMMARY] without [END SUMMARY] — reads to EOF, may have partial data.""" - output = "[SUMMARY]\nfiles_changed: a.py\n" - result = parse_mutation_summary(output) - # Should not crash; may return partial dict - assert isinstance(result, dict) - - def test_returns_empty_dict_on_only_end_marker(self): - output = "[END SUMMARY]\nfiles_changed: a.py\n" - result = parse_mutation_summary(output) - assert result == {} - - def test_no_exception_on_none_like_empty_string(self): - """Ensure no crash even on edge-case whitespace-only input.""" - result = parse_mutation_summary(" \n\n\n ") - assert result == {} - - def test_no_exception_on_malformed_lines(self): - output = "[SUMMARY]\nthis line has no colon separator\nanother bad line\n[END SUMMARY]" - result = parse_mutation_summary(output) - assert isinstance(result, dict) - # None of the malformed lines should produce keys - assert len(result) == 0 - - -class TestParseMergeSummary: - """test_parse_merge_summary: merge-specific fields parsed.""" - - def test_all_merge_fields_parsed(self): - output = """\ -Merging candidates... -[MERGE COMPLETE] -[SUMMARY] -files_changed: src/solver.py, tests/test_solver.py -candidate_a_kept: The base algorithm from A because it had better convergence. -candidate_b_applied: The caching layer from B because it reduced redundant computation. -conflicts_resolved: Both modified the solve() function; kept A's logic with B's cache wrapper. -merge_strategy: Apply B's performance improvements on top of A's correctness fixes. -expected_impact: Combined improvements should yield 15% speedup and maintain accuracy. -[END SUMMARY] -""" - result = parse_mutation_summary(output) - - assert result["files_changed"] == "src/solver.py, tests/test_solver.py" - assert "base algorithm" in result["candidate_a_kept"] - assert "caching layer" in result["candidate_b_applied"] - assert "solve()" in result["conflicts_resolved"] - assert result["merge_strategy"] == "Apply B's performance improvements on top of A's correctness fixes." - assert "15%" in result["expected_impact"] - - def test_merge_fields_present_as_keys(self): - output = ( - "[SUMMARY]\n" - "files_changed: x.py\n" - "candidate_a_kept: kept A's approach\n" - "candidate_b_applied: applied B's fix\n" - "conflicts_resolved: none\n" - "merge_strategy: selective apply\n" - "expected_impact: minor improvement\n" - "[END SUMMARY]" - ) - result = parse_mutation_summary(output) - expected_keys = { - "files_changed", - "candidate_a_kept", - "candidate_b_applied", - "conflicts_resolved", - "merge_strategy", - "expected_impact", - } - assert expected_keys.issubset(result.keys()) - - def test_merge_summary_no_block_returns_empty(self): - output = "Merge done. No structured summary provided." - result = parse_mutation_summary(output) - assert result == {}