Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
32 changes: 25 additions & 7 deletions src/helix/evolution.py
Original file line number Diff line number Diff line change
Expand Up @@ -1808,12 +1808,12 @@ def _has_val_support_overlap(i: str, j: str) -> bool:
)

if triplet is not None:
# GEPA parity (merge-pairing audit C3, merge.py:94-95):
# ``find_merge_triplet`` now returns the canonical
# ``(i, j)`` (lex-sorted), so ``cid_i <= cid_j`` always —
# the merge subprocess, attempted-pair ledger and the
# description-triplet dedup all see the same tuple order.
cid_i, cid_j, _ancestor_id = triplet
# GEPA parity (merge.py:94-95): ``find_merge_triplet``
# now returns the canonical ``(i, j)`` (lex-sorted),
# so ``cid_i <= cid_j`` always — the merge subprocess,
# attempted-pair ledger and the description-triplet
# dedup all see the same tuple order.
cid_i, cid_j, ancestor_id = triplet
pair_key = [cid_i, cid_j]

# Resolve parent val results once; by contract the
Expand All @@ -1832,8 +1832,25 @@ def _has_val_support_overlap(i: str, j: str) -> bool:

a = frontier._candidates[cid_i]
b = frontier._candidates[cid_j]

# Resolve the common ancestor for the two-diff merge
# prompt (GEPA parity at the file-hunk level: feed the
# agent the same three-way structure GEPA's algorithm
# uses to attribute changes —
# ``gepa/proposer/merge.py:163-191``). The ancestor
# came from ``find_merge_triplet``; resolve it through
# the frontier's append-only candidate map. ``None``
# is tolerated downstream — ``merge()`` falls back to
# the single A↔B diff when the ancestor isn't
# resolvable (defensive: lineage / frontier drift).
ancestor_candidate = frontier.candidates.get(ancestor_id)
merge_id = budget_api.next_merge_id(state, gen)
if ancestor_candidate is None:
print_warning(
f"Merge {merge_id} ({cid_i} + {cid_j}): common "
f"ancestor {ancestor_id} not found in frontier "
f"candidate map; falling back to single A↔B "
f"diff form for this merge."
)

merged = merge(
candidate_a=a,
Expand All @@ -1849,6 +1866,7 @@ def _has_val_support_overlap(i: str, j: str) -> bool:
cand, config, project_root
)
),
ancestor=ancestor_candidate,
)

if merged is None:
Expand Down
215 changes: 166 additions & 49 deletions src/helix/merger.py
Original file line number Diff line number Diff line change
Expand Up @@ -61,47 +61,46 @@ def select_eval_subsample_for_merged_program(
# Prompts
# ---------------------------------------------------------------------------

MERGE_PROMPT_TEMPLATE = """\
{system_prompt}

## Objective
{objective}

## Candidate A Strengths
{strengths_a}

## Candidate B Strengths
{strengths_b}

## Diff (B relative to A)
```diff
{diff}
```

## Background / Context
{background}

MERGE_TASK_INSTRUCTIONS_SINGLE_DIFF = """\
## Your Task
You are merging the best aspects of Candidate A and Candidate B to create a superior
combined solution that better achieves the objective.

Candidate A is already checked out in your working directory. Apply the changes from
Candidate B that are beneficial, and discard or adapt those that conflict or regress.
You may read, edit, create, or delete files as needed.
You may read, edit, create, or delete files as needed."""

When you have finished making all your changes, output the exact text:
[MERGE COMPLETE]
{turn_budget}"""
# Task-instruction section for the two-diff (ancestor-relative) merge prompt.
# ``build_merge_prompt`` appends this text instead of the single-diff variant
# when an ancestor id and both ancestor-relative diffs are supplied. The
# quoted section names below refer to the "## Diff: Candidate A/B relative to
# common ancestor" headings that the prompt builder emits.
MERGE_TASK_INSTRUCTIONS_TWO_DIFF = """\
## Your Task
You are merging the best aspects of Candidate A and Candidate B to create a superior
combined solution that better achieves the objective.

Candidate A's worktree is already checked out — A's contribution (the hunks shown in
"Diff: Candidate A relative to common ancestor") is already in place, so you do not
need to re-apply it. Your job is to bring in Candidate B's contribution (the hunks
shown in "Diff: Candidate B relative to common ancestor") wherever it is beneficial.

For each hunk in B's diff:
- If the file is untouched by A's diff, the change is independent — apply it.
- If the file is also touched by A's diff (overlapping region), the two parents
diverged from the ancestor on the same region: reconcile the changes, picking
whichever side better serves the objective, or synthesize a combined version.

You may read, edit, create, or delete files as needed."""

# ---------------------------------------------------------------------------
# Prompt construction
# ---------------------------------------------------------------------------


def _format_eval_strengths(eval_result: EvalResult | None, label: str) -> str:
"""Return a human-readable summary of a candidate's eval result."""
if eval_result is None:
return f" {label}: (no evaluation data)"
def _format_eval_strengths(eval_result: EvalResult) -> str:
"""Return a human-readable summary of a candidate's eval result.

Returns the section body only (no header); the caller is responsible
for the ``## Candidate {A,B} Strengths`` heading. Empty input is
handled by skipping the section entirely in :func:`build_merge_prompt`
rather than emitting a ``"(no evaluation data)"`` placeholder.
"""
lines = [f" Aggregate score: {eval_result.aggregate_score():.4f}"]
for k, v in sorted(eval_result.scores.items()):
lines.append(f" {k}: {v}")
Expand All @@ -122,23 +121,98 @@ def build_merge_prompt(
diff: str,
background: str | None = None,
max_turns: int | None = None,
*,
ancestor_id: str | None = None,
diff_a_from_ancestor: str | None = None,
diff_b_from_ancestor: str | None = None,
) -> str:
"""Construct the merge prompt for Claude Code."""
strengths_a = _format_eval_strengths(eval_result_a, "Candidate A")
strengths_b = _format_eval_strengths(eval_result_b, "Candidate B")
bg = background or "(no additional background provided)"
diff_text = diff.strip() if diff.strip() else "(no diff — candidates are identical)"

return MERGE_PROMPT_TEMPLATE.format(
system_prompt=AUTONOMOUS_SYSTEM_PROMPT,
objective=objective,
strengths_a=strengths_a,
strengths_b=strengths_b,
diff=diff_text,
background=bg,
turn_budget=_turn_budget_section(max_turns),
"""Construct the merge prompt for the configured agent backend.

Sections are emitted only when they have content, mirroring GEPA optimize_anything's
``_build_reflection_prompt_template`` accumulator pattern
(``gepa/optimize_anything.py:501-596``).

**Diff section format** — two-diff (ancestor-relative) vs single (A↔B):

GEPA's merge operator (``gepa/proposer/merge.py:155-203``) reasons over
*three* program states — common ancestor, candidate A, candidate B —
to attribute each component change to whichever parent diverged from
the ancestor. When ``ancestor_id`` + both ``diff_a_from_ancestor``
and ``diff_b_from_ancestor`` are supplied, this prompt mirrors that
structure at the file-hunk level: each parent's diff against the
common ancestor is rendered as its own labelled section, so the
agent can read off "A's contribution" and "B's contribution"
directly instead of inferring three-way info from a single A↔B diff.

When the ancestor-relative pair is not supplied (e.g. legacy callers,
tests that don't have an ancestor handy), the prompt falls back to
the single ``## Diff (B relative to A)`` section driven by ``diff``.

Absent eval results, absent diff(s), and absent ``background`` all
skip their respective sections entirely instead of emitting
placeholder strings.
"""
sections: list[str] = [AUTONOMOUS_SYSTEM_PROMPT.rstrip()]

if objective:
sections.append(f"## Objective\n{objective}")

if eval_result_a is not None:
sections.append(
"## Candidate A Strengths\n" + _format_eval_strengths(eval_result_a)
)

if eval_result_b is not None:
sections.append(
"## Candidate B Strengths\n" + _format_eval_strengths(eval_result_b)
)

# Diff section — prefer the two-diff (ancestor-relative) form when
# both diffs are available, fall back to single A↔B otherwise. Each
# branch independently honors the "omit when empty" invariant.
use_two_diff = (
ancestor_id is not None
and diff_a_from_ancestor is not None
and diff_b_from_ancestor is not None
)
if use_two_diff:
diff_a_stripped = (diff_a_from_ancestor or "").strip()
diff_b_stripped = (diff_b_from_ancestor or "").strip()
if diff_a_stripped:
sections.append(
f"## Diff: Candidate A relative to common ancestor {ancestor_id}\n"
f"```diff\n{diff_a_stripped}\n```"
)
if diff_b_stripped:
sections.append(
f"## Diff: Candidate B relative to common ancestor {ancestor_id}\n"
f"```diff\n{diff_b_stripped}\n```"
)
else:
diff_stripped = diff.strip()
if diff_stripped:
sections.append(
f"## Diff (B relative to A)\n```diff\n{diff_stripped}\n```"
)

if background:
sections.append(f"## Background / Context\n{background}")

# Task instructions vary by diff form. Two-diff form gets explicit
# guidance on what A's contribution vs B's contribution means and how
# to reason about overlapping vs disjoint hunks; single-diff form
# keeps the legacy "apply B's changes" framing.
sections.append(
MERGE_TASK_INSTRUCTIONS_TWO_DIFF if use_two_diff
else MERGE_TASK_INSTRUCTIONS_SINGLE_DIFF
)

turn_budget = _turn_budget_section(max_turns)
if turn_budget:
sections.append(turn_budget.strip())

return "\n\n".join(sections) + "\n"


# ---------------------------------------------------------------------------
# High-level merge entry point
Expand All @@ -155,22 +229,40 @@ def merge(
eval_result_a: EvalResult | None = None,
eval_result_b: EvalResult | None = None,
prepare_worktree: Callable[[Candidate], None] | None = None,
ancestor: Candidate | None = None,
) -> Candidate | None:
"""Merge *candidate_a* and *candidate_b* using Claude Code.

Clones *candidate_a*, computes the diff to *candidate_b*, builds a merge
Clones *candidate_a*, computes the relevant diffs, builds a merge
prompt, and invokes Claude Code. Snapshots on success; removes the
worktree and returns ``None`` on failure.

Two diff-rendering modes, controlled by the optional ``ancestor``
argument:

* **Two-diff (ancestor-relative)** — when ``ancestor`` is provided,
computes ``get_diff(ancestor, candidate_a)`` and
``get_diff(ancestor, candidate_b)`` and renders both as separately
labelled sections in the prompt. The agent can then attribute
each hunk to whichever parent diverged from the common ancestor —
file-hunk-level analogue of GEPA's component-wise attribution
(``gepa/proposer/merge.py:163-191``: ``if pred_anc == pred_id1 …``
→ take id2's version; ``elif pred_anc != pred_id1 and pred_anc !=
pred_id2`` → tiebreak by score).
* **Single (A↔B)** — fallback when no ancestor is provided.
Computes ``get_diff(candidate_a, candidate_b)`` and renders a
single ``## Diff (B relative to A)`` section. The agent has to
infer three-way info from a two-way comparison.

GEPA-parity note: this is the correct domain adaptation of GEPA's
text-component merge (``gepa/proposer/merge.py:155-203``) for HELIX's
full-codebase setting. GEPA can splice ``dict[str, str]`` programs
deterministically by swapping components from each parent; HELIX
candidates are full git worktrees, where syntactic per-component swap
is undefined, so an LLM-mediated edit is the only viable approach.
The surrounding trigger / parent-selection / subsample / acceptance /
full-val logic in :mod:`helix.evolution` mirrors GEPA's
``MergeProposer`` and ``GEPAEngine`` verbatim.
is undefined, so an LLM-mediated edit is the only viable approach —
but feeding the agent the three-way diff structure GEPA's algorithm
uses (two ancestor-relative diffs instead of one A↔B diff) gives it
the same shape of attribution information.

Parameters
----------
Expand All @@ -190,6 +282,15 @@ def merge(
Evaluation result for candidate A (optional, for richer prompt).
eval_result_b:
Evaluation result for candidate B (optional, for richer prompt).
prepare_worktree:
Optional callback to refresh protected files in the new worktree
before the agent runs.
ancestor:
Optional most-recent common ancestor of A and B (typically
``frontier.candidates[ancestor_id]`` where ``ancestor_id`` came
from :func:`helix.lineage.find_merge_triplet`). When supplied,
the prompt uses the two-diff (ancestor-relative) form; when
``None``, falls back to the single A↔B diff.

Returns
-------
Expand All @@ -202,15 +303,31 @@ def merge(
if prepare_worktree is not None:
prepare_worktree(child)

diff = get_diff(candidate_a, candidate_b)
# Diff-rendering mode selection. ``ancestor`` available → compute
# the two ancestor-relative diffs that drive the GEPA-style
# attribution prompt. Otherwise compute the single A↔B fallback.
if ancestor is not None:
diff_a_from_ancestor: str | None = get_diff(ancestor, candidate_a)
diff_b_from_ancestor: str | None = get_diff(ancestor, candidate_b)
# ``diff`` (single A↔B) is computed lazily only for the fallback
# path; with both ancestor-relative diffs in hand, the prompt
# builder ignores the legacy parameter, so pass an empty string.
legacy_diff = ""
else:
diff_a_from_ancestor = None
diff_b_from_ancestor = None
legacy_diff = get_diff(candidate_a, candidate_b)

prompt = build_merge_prompt(
config.objective,
eval_result_a,
eval_result_b,
diff,
legacy_diff,
background,
config.agent.max_turns,
ancestor_id=ancestor.id if ancestor is not None else None,
diff_a_from_ancestor=diff_a_from_ancestor,
diff_b_from_ancestor=diff_b_from_ancestor,
)

try:
Expand Down
Loading
Loading