diff --git a/src/helix/evolution.py b/src/helix/evolution.py
index c2b7ee2..7115ba6 100644
--- a/src/helix/evolution.py
+++ b/src/helix/evolution.py
@@ -1808,12 +1808,12 @@ def _has_val_support_overlap(i: str, j: str) -> bool:
                     )
 
                 if triplet is not None:
-                    # GEPA parity (merge-pairing audit C3, merge.py:94-95):
-                    # ``find_merge_triplet`` now returns the canonical
-                    # ``(i, j)`` (lex-sorted), so ``cid_i <= cid_j`` always —
-                    # the merge subprocess, attempted-pair ledger and the
-                    # description-triplet dedup all see the same tuple order.
-                    cid_i, cid_j, _ancestor_id = triplet
+                    # GEPA parity (merge.py:94-95): ``find_merge_triplet``
+                    # now returns the canonical ``(i, j)`` (lex-sorted),
+                    # so ``cid_i <= cid_j`` always — the merge subprocess,
+                    # attempted-pair ledger and the description-triplet
+                    # dedup all see the same tuple order.
+                    cid_i, cid_j, ancestor_id = triplet
                     pair_key = [cid_i, cid_j]
 
                     # Resolve parent val results once; by contract the
@@ -1832,8 +1832,25 @@ def _has_val_support_overlap(i: str, j: str) -> bool:
 
                     a = frontier._candidates[cid_i]
                     b = frontier._candidates[cid_j]
-
+                    # Resolve the common ancestor for the two-diff merge
+                    # prompt (GEPA parity at the file-hunk level: feed the
+                    # agent the same three-way structure GEPA's algorithm
+                    # uses to attribute changes —
+                    # ``gepa/proposer/merge.py:163-191``).  The ancestor
+                    # came from ``find_merge_triplet``; resolve it through
+                    # the frontier's append-only candidate map.  ``None``
+                    # is tolerated downstream — ``merge()`` falls back to
+                    # the single A↔B diff when the ancestor isn't
+                    # resolvable (defensive: lineage / frontier drift).
+                    ancestor_candidate = frontier.candidates.get(ancestor_id)
                     merge_id = budget_api.next_merge_id(state, gen)
+                    if ancestor_candidate is None:
+                        print_warning(
+                            f"Merge {merge_id} ({cid_i} + {cid_j}): common "
+                            f"ancestor {ancestor_id} not found in frontier "
+                            f"candidate map; falling back to single A↔B "
+                            f"diff form for this merge."
+                        )
 
                     merged = merge(
                         candidate_a=a,
@@ -1849,6 +1866,7 @@ def _has_val_support_overlap(i: str, j: str) -> bool:
                                 cand, config, project_root
                             )
                         ),
+                        ancestor=ancestor_candidate,
                     )
 
                     if merged is None:
diff --git a/src/helix/merger.py b/src/helix/merger.py
index 487ae1b..94f2677 100644
--- a/src/helix/merger.py
+++ b/src/helix/merger.py
@@ -61,47 +61,46 @@ def select_eval_subsample_for_merged_program(
 # Prompts
 # ---------------------------------------------------------------------------
 
-MERGE_PROMPT_TEMPLATE = """\
-{system_prompt}
-
-## Objective
-{objective}
-
-## Candidate A Strengths
-{strengths_a}
-
-## Candidate B Strengths
-{strengths_b}
-
-## Diff (B relative to A)
-```diff
-{diff}
-```
-
-## Background / Context
-{background}
-
+MERGE_TASK_INSTRUCTIONS_SINGLE_DIFF = """\
 ## Your Task
 You are merging the best aspects of Candidate A and Candidate B to create a superior
 combined solution that better achieves the objective.
 
 Candidate A is already checked out in your working directory.  Apply the changes from
 Candidate B that are beneficial, and discard or adapt those that conflict or regress.
-You may read, edit, create, or delete files as needed.
+You may read, edit, create, or delete files as needed."""
 
-When you have finished making all your changes, output the exact text:
-[MERGE COMPLETE]
-{turn_budget}"""
+MERGE_TASK_INSTRUCTIONS_TWO_DIFF = """\
+## Your Task
+You are merging the best aspects of Candidate A and Candidate B to create a superior
+combined solution that better achieves the objective.
+
+Candidate A's worktree is already checked out — A's contribution (the hunks shown in
+"Diff: Candidate A relative to common ancestor") is already in place, so you do not
+need to re-apply it.  Your job is to bring in Candidate B's contribution (the hunks
+shown in "Diff: Candidate B relative to common ancestor") wherever it is beneficial.
+
+For each hunk in B's diff:
+- If the file is untouched by A's diff, the change is independent — apply it.
+- If the file is also touched by A's diff (overlapping region), the two parents
+  diverged from the ancestor on the same region: reconcile the changes, picking
+  whichever side better serves the objective, or synthesize a combined version.
+
+You may read, edit, create, or delete files as needed."""
 
 # ---------------------------------------------------------------------------
 # Prompt construction
 # ---------------------------------------------------------------------------
 
 
-def _format_eval_strengths(eval_result: EvalResult | None, label: str) -> str:
-    """Return a human-readable summary of a candidate's eval result."""
-    if eval_result is None:
-        return f"  {label}: (no evaluation data)"
+def _format_eval_strengths(eval_result: EvalResult) -> str:
+    """Return a human-readable summary of a candidate's eval result.
+
+    Returns the section body only (no header); the caller is responsible
+    for the ``## Candidate {A,B} Strengths`` heading.  Empty input is
+    handled by skipping the section entirely in :func:`build_merge_prompt`
+    rather than emitting a ``"(no evaluation data)"`` placeholder.
+    """
     lines = [f"  Aggregate score: {eval_result.aggregate_score():.4f}"]
     for k, v in sorted(eval_result.scores.items()):
         lines.append(f"  {k}: {v}")
@@ -122,23 +121,98 @@ def build_merge_prompt(
     diff: str,
     background: str | None = None,
     max_turns: int | None = None,
+    *,
+    ancestor_id: str | None = None,
+    diff_a_from_ancestor: str | None = None,
+    diff_b_from_ancestor: str | None = None,
 ) -> str:
-    """Construct the merge prompt for Claude Code."""
-    strengths_a = _format_eval_strengths(eval_result_a, "Candidate A")
-    strengths_b = _format_eval_strengths(eval_result_b, "Candidate B")
-    bg = background or "(no additional background provided)"
-    diff_text = diff.strip() if diff.strip() else "(no diff — candidates are identical)"
-
-    return MERGE_PROMPT_TEMPLATE.format(
-        system_prompt=AUTONOMOUS_SYSTEM_PROMPT,
-        objective=objective,
-        strengths_a=strengths_a,
-        strengths_b=strengths_b,
-        diff=diff_text,
-        background=bg,
-        turn_budget=_turn_budget_section(max_turns),
+    """Construct the merge prompt for the configured agent backend.
+
+    Sections are emitted only when they have content, mirroring GEPA O.A.'s
+    ``_build_reflection_prompt_template`` accumulator pattern
+    (``gepa/optimize_anything.py:501-596``).
+
+    **Diff section format** — two-diff (ancestor-relative) vs single (A↔B):
+
+    GEPA's merge operator (``gepa/proposer/merge.py:155-203``) reasons over
+    *three* program states — common ancestor, candidate A, candidate B —
+    to attribute each component change to whichever parent diverged from
+    the ancestor.  When ``ancestor_id`` + both ``diff_a_from_ancestor``
+    and ``diff_b_from_ancestor`` are supplied, this prompt mirrors that
+    structure at the file-hunk level: each parent's diff against the
+    common ancestor is rendered as its own labelled section, so the
+    agent can read off "A's contribution" and "B's contribution"
+    directly instead of inferring three-way info from a single A↔B diff.
+
+    When the ancestor-relative pair is not supplied (e.g. legacy callers,
+    tests that don't have an ancestor handy), the prompt falls back to
+    the single ``## Diff (B relative to A)`` section driven by ``diff``.
+
+    Absent eval results, absent diff(s), and absent ``background`` all
+    skip their respective sections entirely instead of emitting
+    placeholder strings.
+    """
+    sections: list[str] = [AUTONOMOUS_SYSTEM_PROMPT.rstrip()]
+
+    if objective:
+        sections.append(f"## Objective\n{objective}")
+
+    if eval_result_a is not None:
+        sections.append(
+            "## Candidate A Strengths\n" + _format_eval_strengths(eval_result_a)
+        )
+
+    if eval_result_b is not None:
+        sections.append(
+            "## Candidate B Strengths\n" + _format_eval_strengths(eval_result_b)
+        )
+
+    # Diff section — prefer the two-diff (ancestor-relative) form when
+    # both diffs are available, fall back to single A↔B otherwise.  Each
+    # branch independently honors the "omit when empty" invariant.
+    use_two_diff = (
+        ancestor_id is not None
+        and diff_a_from_ancestor is not None
+        and diff_b_from_ancestor is not None
+    )
+    if use_two_diff:
+        diff_a_stripped = (diff_a_from_ancestor or "").strip()
+        diff_b_stripped = (diff_b_from_ancestor or "").strip()
+        if diff_a_stripped:
+            sections.append(
+                f"## Diff: Candidate A relative to common ancestor {ancestor_id}\n"
+                f"```diff\n{diff_a_stripped}\n```"
+            )
+        if diff_b_stripped:
+            sections.append(
+                f"## Diff: Candidate B relative to common ancestor {ancestor_id}\n"
+                f"```diff\n{diff_b_stripped}\n```"
+            )
+    else:
+        diff_stripped = diff.strip()
+        if diff_stripped:
+            sections.append(
+                f"## Diff (B relative to A)\n```diff\n{diff_stripped}\n```"
+            )
+
+    if background:
+        sections.append(f"## Background / Context\n{background}")
+
+    # Task instructions vary by diff form.  Two-diff form gets explicit
+    # guidance on what A's contribution vs B's contribution means and how
+    # to reason about overlapping vs disjoint hunks; single-diff form
+    # keeps the legacy "apply B's changes" framing.
+    sections.append(
+        MERGE_TASK_INSTRUCTIONS_TWO_DIFF if use_two_diff
+        else MERGE_TASK_INSTRUCTIONS_SINGLE_DIFF
     )
 
+    turn_budget = _turn_budget_section(max_turns)
+    if turn_budget:
+        sections.append(turn_budget.strip())
+
+    return "\n\n".join(sections) + "\n"
+
 
 # ---------------------------------------------------------------------------
 # High-level merge entry point
@@ -155,22 +229,40 @@ def merge(
     eval_result_a: EvalResult | None = None,
     eval_result_b: EvalResult | None = None,
     prepare_worktree: Callable[[Candidate], None] | None = None,
+    ancestor: Candidate | None = None,
 ) -> Candidate | None:
     """Merge *candidate_a* and *candidate_b* using Claude Code.
 
-    Clones *candidate_a*, computes the diff to *candidate_b*, builds a merge
+    Clones *candidate_a*, computes the relevant diffs, builds a merge
     prompt, and invokes Claude Code.  Snapshots on success; removes the
     worktree and returns ``None`` on failure.
 
+    Two diff-rendering modes, controlled by the optional ``ancestor``
+    argument:
+
+    * **Two-diff (ancestor-relative)** — when ``ancestor`` is provided,
+      computes ``get_diff(ancestor, candidate_a)`` and
+      ``get_diff(ancestor, candidate_b)`` and renders both as separately
+      labelled sections in the prompt.  The agent can then attribute
+      each hunk to whichever parent diverged from the common ancestor —
+      file-hunk-level analogue of GEPA's component-wise attribution
+      (``gepa/proposer/merge.py:163-191``: ``if pred_anc == pred_id1 …``
+      → take id2's version; ``elif pred_anc != pred_id1 and pred_anc !=
+      pred_id2`` → tiebreak by score).
+    * **Single (A↔B)** — fallback when no ancestor is provided.
+      Computes ``get_diff(candidate_a, candidate_b)`` and renders a
+      single ``## Diff (B relative to A)`` section.  The agent has to
+      infer three-way info from a two-way comparison.
+
     GEPA-parity note: this is the correct domain adaptation of GEPA's
     text-component merge (``gepa/proposer/merge.py:155-203``) for HELIX's
     full-codebase setting.  GEPA can splice ``dict[str, str]`` programs
     deterministically by swapping components from each parent; HELIX
     candidates are full git worktrees, where syntactic per-component swap
-    is undefined, so an LLM-mediated edit is the only viable approach.
-    The surrounding trigger / parent-selection / subsample / acceptance /
-    full-val logic in :mod:`helix.evolution` mirrors GEPA's
-    ``MergeProposer`` and ``GEPAEngine`` verbatim.
+    is undefined, so an LLM-mediated edit is the only viable approach —
+    but feeding the agent the three-way diff structure GEPA's algorithm
+    uses (two ancestor-relative diffs instead of one A↔B diff) gives it
+    the same shape of attribution information.
 
     Parameters
     ----------
@@ -190,6 +282,15 @@ def merge(
         Evaluation result for candidate A (optional, for richer prompt).
     eval_result_b:
         Evaluation result for candidate B (optional, for richer prompt).
+    prepare_worktree:
+        Optional callback to refresh protected files in the new worktree
+        before the agent runs.
+    ancestor:
+        Optional most-recent common ancestor of A and B (typically
+        ``frontier.candidates[ancestor_id]`` where ``ancestor_id`` came
+        from :func:`helix.lineage.find_merge_triplet`).  When supplied,
+        the prompt uses the two-diff (ancestor-relative) form; when
+        ``None``, falls back to the single A↔B diff.
 
     Returns
     -------
@@ -202,15 +303,31 @@ def merge(
     if prepare_worktree is not None:
         prepare_worktree(child)
 
-    diff = get_diff(candidate_a, candidate_b)
+    # Diff-rendering mode selection.  ``ancestor`` available → compute
+    # the two ancestor-relative diffs that drive the GEPA-style
+    # attribution prompt.  Otherwise compute the single A↔B fallback.
+    if ancestor is not None:
+        diff_a_from_ancestor: str | None = get_diff(ancestor, candidate_a)
+        diff_b_from_ancestor: str | None = get_diff(ancestor, candidate_b)
+        # ``diff`` (single A↔B) is computed lazily only for the fallback
+        # path; with both ancestor-relative diffs in hand, the prompt
+        # builder ignores the legacy parameter, so pass an empty string.
+        legacy_diff = ""
+    else:
+        diff_a_from_ancestor = None
+        diff_b_from_ancestor = None
+        legacy_diff = get_diff(candidate_a, candidate_b)
 
     prompt = build_merge_prompt(
         config.objective,
         eval_result_a,
         eval_result_b,
-        diff,
+        legacy_diff,
         background,
         config.agent.max_turns,
+        ancestor_id=ancestor.id if ancestor is not None else None,
+        diff_a_from_ancestor=diff_a_from_ancestor,
+        diff_b_from_ancestor=diff_b_from_ancestor,
     )
 
     try:
diff --git a/src/helix/mutator.py b/src/helix/mutator.py
index 813b59c..6e1462d 100644
--- a/src/helix/mutator.py
+++ b/src/helix/mutator.py
@@ -44,7 +44,6 @@
 - Do not request confirmation or clarification; choose a reasonable approach and continue.
 - If one approach fails, try an alternative and keep progressing.
 - Use available tools to inspect, edit, and validate changes.
-- When finished, print exactly: [MUTATION COMPLETE]
 """
 
 SEEDLESS_INIT_PROMPT_TEMPLATE = """\
@@ -59,43 +58,63 @@
 Generate a strong initial candidate based on the goal above.
 Create all necessary files directly in the current working directory.
 Make your implementation complete and ready to be evaluated immediately.
-
-When you have finished creating all files, output the exact text:
-[SEED GENERATION COMPLETE]
 {turn_budget}"""
 
-MUTATION_PROMPT_TEMPLATE = """\
-{system_prompt}
-
-## Objective
-{objective}
-
-## Current Evaluation Scores
-{scores}
-
-{diagnostics_section}{evaluator_notes_section}{evaluator_output_section}{extra_asi_section}## Background / Context
-{background}
-
+MUTATION_TASK_INSTRUCTIONS = """\
 ## Your Task
 Analyse the evaluation results above and improve the code to better achieve the objective.
-Make targeted, meaningful changes. You may read, edit, create, or delete files as needed.
-
-When you have finished making all your changes, output the exact text:
-[MUTATION COMPLETE]
-{turn_budget}"""
+Make targeted, meaningful changes. You may read, edit, create, or delete files as needed."""
 
 # ---------------------------------------------------------------------------
 # Prompt construction
 # ---------------------------------------------------------------------------
 
 
+def _indefinite_article(n: int) -> str:
+    """Pick ``"a"`` or ``"an"`` to match the spoken pronunciation of ``n``.
+
+    Within HELIX's realistic max-turns range (1 ≤ n ≤ ~1000), the
+    vowel-leading numbers are 8, 11, 18, and the 80s / 800s.  Everything
+    else takes ``"a"``.  Handles the visible ``"You have a 8-turn limit"``
+    article-agreement glitch without imposing a full English-number
+    pronunciation rule on the codebase.
+    """
+    s = str(abs(n))
+    if s in {"11", "18"}:
+        return "an"
+    if s.startswith("8"):  # 8, 80-89, 800-899, ...
+        return "an"
+    return "a"
+
+
 def _turn_budget_section(max_turns: int | None) -> str:
-    """Return the turn budget prompt section, or empty string if unbounded."""
+    """Return the turn budget prompt section, or empty string if unbounded.
+
+    Enforcement semantics differ by backend:
+
+    * ``claude`` — hard cap.  HELIX passes ``--max-turns N`` to the Claude
+      Code CLI (``_build_cli_args``), the runtime kills the session at the
+      limit, and the resulting ``subtype="error_max_turns"`` response is
+      detected at :func:`invoke_claude_code` and treated as partial
+      success.
+    * ``codex`` / ``cursor`` / ``gemini`` / ``opencode`` — soft hint only.
+      None of these CLIs expose an equivalent flag (verified against
+      ``--help`` for the installed binaries), so the in-prompt request is
+      the only signal the agent receives.  Whether the agent self-honors
+      the limit depends entirely on its own behaviour.
+
+    The section is therefore emitted for every backend regardless — it
+    still has some value as a soft hint — but callers depending on hard
+    enforcement should set the budget low enough to also be enforced via
+    subprocess-level mechanisms (wall-clock timeout, sandbox limits) or
+    use the Claude backend.
+    """
     if max_turns is None:
         return ""
+    article = _indefinite_article(max_turns)
     return (
         f"\n## Turn Budget\n"
-        f"You have a {max_turns}-turn limit for this task, where turns refer to "
+        f"You have {article} {max_turns}-turn limit for this task, where turns refer to "
         f"how many tool calls or interactions you can make. Plan your work "
         f"accordingly — prioritize the highest-impact changes first and be "
         f"efficient with your tool usage.\n"
@@ -142,12 +161,13 @@ def _evaluator_failed(eval_result: EvalResult) -> bool:
     return eval_result.asi.get("_returncode") not in (None, "0", 0)
 
 
-# All ``_render_*`` helpers below share a strict invariant: they return
-# either the empty string (the section is suppressed) or a block of
-# text that ends with a single trailing blank line (``"\n\n"``).  The
-# ``MUTATION_PROMPT_TEMPLATE`` chains them together verbatim and relies
-# on this convention to keep section spacing consistent.  Update both
-# the helper and the template if you change it.
+# All ``_render_*`` helpers below return either the empty string (the
+# section is suppressed) or a fully-formed Markdown section.  Empty
+# returns let ``build_mutation_prompt`` skip the section entirely —
+# mirrors GEPA's ``_build_reflection_prompt_template`` accumulator
+# pattern (``gepa/optimize_anything.py:501-596``).  Non-empty returns
+# may carry trailing whitespace; ``build_mutation_prompt`` rstrips
+# before joining sections with a uniform blank-line separator.
 
 
 def _render_evaluator_notes(eval_result: EvalResult) -> str:
@@ -158,25 +178,53 @@ def _render_evaluator_notes(eval_result: EvalResult) -> str:
 
 
 def _render_evaluator_output_fallback(eval_result: EvalResult) -> str:
-    """Render stdout/stderr only when they are useful fallback diagnostics."""
-    has_notes = bool(eval_result.asi.get("log", "").strip())
-    include_streams = _evaluator_failed(eval_result) or (
-        not _has_structured_diagnostics(eval_result) and not has_notes
-    )
-    if not include_streams:
-        return ""
-
+    """Render ``## Evaluator Output`` from stdout/stderr, or ``""``.
+
+    Two distinct cases:
+
+    * **Evaluator subprocess failed** (non-zero exit) — always emit the
+      section.  Empty streams render with ``(no stdout)`` / ``(no stderr)``
+      placeholders here intentionally: the agent needs to know the
+      evaluator failed but produced no output to inspect (a meaningful
+      diagnostic on its own).
+
+    * **Evaluator succeeded** — only emit the section when at least one
+      stream has content *and* no richer diagnostic surface
+      (``log`` notes or structured side_info) exists.  Empty streams are
+      omitted entirely; partial coverage (only ``stdout`` non-empty, or
+      only ``stderr``) renders just the present sub-section instead of
+      padding the other with a ``(no X)`` placeholder.
+    """
     stdout = _strip_machine_protocol_from_evaluator_stream(
         eval_result.asi.get("stdout", "")
     )
     stderr = _strip_machine_protocol_from_evaluator_stream(
         eval_result.asi.get("stderr", "")
     )
-    if not stdout:
-        stdout = "(no stdout)"
-    if not stderr:
-        stderr = "(no stderr)"
-    return f"## Evaluator Output\n\n### stdout\n{stdout}\n\n### stderr\n{stderr}\n\n"
+
+    if _evaluator_failed(eval_result):
+        stdout_text = stdout or "(no stdout)"
+        stderr_text = stderr or "(no stderr)"
+        return (
+            f"## Evaluator Output\n\n"
+            f"### stdout\n{stdout_text}\n\n"
+            f"### stderr\n{stderr_text}"
+        )
+
+    # Evaluator succeeded — defer to richer surfaces when they exist,
+    # otherwise surface only the streams that actually have content.
+    has_notes = bool(eval_result.asi.get("log", "").strip())
+    if _has_structured_diagnostics(eval_result) or has_notes:
+        return ""
+
+    parts: list[str] = []
+    if stdout:
+        parts.append(f"### stdout\n{stdout}")
+    if stderr:
+        parts.append(f"### stderr\n{stderr}")
+    if not parts:
+        return ""
+    return "## Evaluator Output\n\n" + "\n\n".join(parts)
 
 
 def build_seed_generation_prompt(
@@ -401,122 +449,128 @@ def _render_per_example_diagnostics(
     return "\n".join(lines) + "\n"
 
 
-def build_mutation_prompt(
-    objective: str,
-    eval_result: EvalResult,
-    background: str | None = None,
-    max_turns: int | None = None,
-) -> str:
-    """Construct the mutation prompt for Claude Code."""
-    scores_text = "\n".join(
-        f"  {k}: {v}" for k, v in sorted(eval_result.scores.items())
-    )
-    if not scores_text:
-        scores_text = "  (no scores recorded)"
-
-    # Collect any extra_N entries from ASI.  The reserved keys here
-    # are surfaced through dedicated prompt sections (or the
-    # ``EvalResult.evaluator_returncode`` typed field, in the case of
-    # the historical ``_returncode`` legacy key) and must never leak
-    # into the catch-all "extra" rendering.
-    extra_entries = {
+def _render_scores_section(eval_result: EvalResult) -> str:
+    """Render ``## Current Evaluation Scores`` or ``""`` when no scores exist.
+
+    Mirrors GEPA O.A.'s "only emit a section when there is content for it"
+    pattern (``gepa/optimize_anything.py:501-596``).  Previously HELIX
+    emitted the section with a ``"(no scores recorded)"`` placeholder; now
+    the section header is omitted entirely so the agent never sees a stub.
+    """
+    lines = [f"  {k}: {v}" for k, v in sorted(eval_result.scores.items())]
+    if not lines:
+        return ""
+    return "## Current Evaluation Scores\n" + "\n".join(lines)
+
+
+def _render_extra_asi(eval_result: EvalResult) -> str:
+    """Render any free-form ``extra_*`` ASI keys, or ``""`` when none exist.
+
+    Reserved keys (``stdout``, ``stderr``, ``error``, ``log``, ``_returncode``)
+    are filtered out — they're surfaced through dedicated sections
+    (``## Evaluator Notes``, ``## Evaluator Output``) or the
+    ``_returncode`` legacy sentinel — and must never leak into this
+    catch-all rendering.
+    """
+    entries = {
         k: v
         for k, v in sorted(eval_result.asi.items())
         if k not in ("stdout", "stderr", "error", "log", "_returncode")
     }
-    if extra_entries:
-        extra_lines = "\n".join(f"### {k}\n{v}" for k, v in extra_entries.items())
-        extra_asi_section = f"### Extra Evaluator Info\n{extra_lines}\n\n"
-    else:
-        extra_asi_section = ""
-
-    # Render side_info diagnostics.  Precedence:
-    #   1. ``eval_result.per_example_side_info`` (new per-example GEPA
-    #      O.A. contract — list of dicts positional to instance_scores
-    #      ids) when populated; mirrors GEPA's
-    #      ``OptimizeAnythingAdapter.make_reflective_dataset`` at
-    #      ``optimize_anything_adapter.py:524-553`` combined with
-    #      ``format_samples`` at
-    #      ``gepa/strategies/instruction_proposal.py:54-95``.
-    #   2. ``eval_result.side_info`` (legacy batch-level dict) when
-    #      ``per_example_side_info`` is absent — unchanged rendering
-    #      for non-``helix_result`` paths that still populate the
-    #      legacy field.
-    #   3. No diagnostics section otherwise.
-    diagnostics_section = ""
+    if not entries:
+        return ""
+    body = "\n".join(f"### {k}\n{v}" for k, v in entries.items())
+    return f"### Extra Evaluator Info\n{body}"
+
+
+def _render_diagnostics(eval_result: EvalResult) -> str:
+    """Render the ``## Diagnostics`` section, or ``""`` when no side_info.
+
+    Precedence:
+      1. ``eval_result.per_example_side_info`` (per-example GEPA O.A.
+         contract — list of dicts positional to instance_scores ids) when
+         populated; mirrors GEPA's
+         ``OptimizeAnythingAdapter.make_reflective_dataset`` combined
+         with ``format_samples`` at
+         ``gepa/strategies/instruction_proposal.py:54-95``.
+      2. ``eval_result.side_info`` (legacy batch-level dict) when
+         per-example data is absent.
+      3. Empty string when neither is present.
+    """
     if eval_result.per_example_side_info is not None:
         # Monotonic markdown hierarchy under the surrounding
         # ``## Diagnostics`` (h2): each example is ``### Example <id>``
         # (h3), each side_info key is ``#### {key}`` (h4), nested
-        # values bump further.  Before this the Example header was
-        # ``#`` (h1), which inverted the hierarchy and confused
-        # markdown-aware tooling / LLM markdown parsers.
-        diagnostics_section = _render_per_example_diagnostics(
+        # values bump further.
+        return _render_per_example_diagnostics(
             example_ids=list(eval_result.instance_scores.keys()),
             per_example_side_info=eval_result.per_example_side_info,
             example_header_level=3,
             key_header_level=4,
         )
-    elif eval_result.side_info is not None:
+    if eval_result.side_info is not None:
         diag_lines = "\n".join(
             f"  {k}: {v}" for k, v in sorted(eval_result.side_info.items())
         )
-        diagnostics_section = f"## Diagnostics\n{diag_lines}\n\n"
+        return f"## Diagnostics\n{diag_lines}"
+    return ""
 
-    bg = background or "(no additional background provided)"
 
-    return MUTATION_PROMPT_TEMPLATE.format(
-        system_prompt=AUTONOMOUS_SYSTEM_PROMPT,
-        objective=objective,
-        scores=scores_text,
-        evaluator_notes_section=_render_evaluator_notes(eval_result),
-        evaluator_output_section=_render_evaluator_output_fallback(eval_result),
-        extra_asi_section=extra_asi_section,
-        diagnostics_section=diagnostics_section,
-        background=bg,
-        turn_budget=_turn_budget_section(max_turns),
-    )
+def build_mutation_prompt(
+    objective: str,
+    eval_result: EvalResult,
+    background: str | None = None,
+    max_turns: int | None = None,
+) -> str:
+    """Construct the mutation prompt for the configured agent backend.
+
+    Sections are emitted only when they have content, mirroring GEPA O.A.'s
+    ``_build_reflection_prompt_template`` accumulator pattern
+    (``gepa/optimize_anything.py:501-596``).  Empty ``objective``, empty
+    ``eval_result.scores``, absent diagnostics, absent evaluator notes,
+    absent stdout/stderr fallback, absent extra ASI, and absent
+    ``background`` all skip their respective sections entirely instead of
+    rendering placeholder strings like ``"(no additional background
+    provided)"`` or ``"(no scores recorded)"`` that taught nothing.
+
+    ``## Your Task`` and the system prompt are always emitted; they are
+    the only sections that don't depend on caller-provided content.
+    """
+    sections: list[str] = [AUTONOMOUS_SYSTEM_PROMPT.rstrip()]
 
+    if objective:
+        sections.append(f"## Objective\n{objective}")
 
-# ---------------------------------------------------------------------------
-# Mutation summary parsing
-# ---------------------------------------------------------------------------
+    scores = _render_scores_section(eval_result)
+    if scores:
+        sections.append(scores)
 
+    diagnostics = _render_diagnostics(eval_result)
+    if diagnostics:
+        sections.append(diagnostics.rstrip())
 
-def parse_mutation_summary(output: str) -> dict[str, str]:
-    """Parse a ``[SUMMARY]...[END SUMMARY]`` block from Claude Code output.
+    notes = _render_evaluator_notes(eval_result)
+    if notes:
+        sections.append(notes.rstrip())
 
-    Extracts structured key-value pairs written by the mutation/merge agent
-    after ``[MUTATION COMPLETE]`` or ``[MERGE COMPLETE]``.  The block format
-    is::
+    output_fallback = _render_evaluator_output_fallback(eval_result)
+    if output_fallback:
+        sections.append(output_fallback.rstrip())
 
-        [SUMMARY]
-        files_changed: src/foo.py, src/bar.py
-        root_cause: ...
-        changes_made: ...
-        [END SUMMARY]
+    extra_asi = _render_extra_asi(eval_result)
+    if extra_asi:
+        sections.append(extra_asi)
 
-    Returns
-    -------
-    dict[str, str]
-        Parsed key-value pairs.  Returns an empty dict if no block is found
-        or the block contains no valid ``key: value`` lines.  Never raises.
-    """
-    result: dict[str, str] = {}
-    in_block = False
-    for line in output.splitlines():
-        if "[SUMMARY]" in line:
-            in_block = True
-            continue
-        if "[END SUMMARY]" in line:
-            break
-        if in_block and ":" in line:
-            key, _, value = line.partition(":")
-            key = key.strip()
-            value = value.strip()
-            if key:
-                result[key] = value
-    return result
+    if background:
+        sections.append(f"## Background / Context\n{background}")
+
+    sections.append(MUTATION_TASK_INSTRUCTIONS)
+
+    turn_budget = _turn_budget_section(max_turns)
+    if turn_budget:
+        sections.append(turn_budget.strip())
+
+    return "\n\n".join(sections) + "\n"
 
 
 # ---------------------------------------------------------------------------
diff --git a/tests/unit/test_merger.py b/tests/unit/test_merger.py
index 614e8fc..5176916 100644
--- a/tests/unit/test_merger.py
+++ b/tests/unit/test_merger.py
@@ -71,13 +71,15 @@ def test_contains_background(self):
         prompt = build_merge_prompt("goal", None, None, "", background="special bg")
         assert "special bg" in prompt
 
-    def test_default_background_when_none(self):
-        prompt = build_merge_prompt("goal", None, None, "")
-        assert "no additional background" in prompt
-
-    def test_contains_merge_complete_marker(self):
+    def test_background_section_omitted_when_none(self):
+        """GEPA parity: empty optional inputs skip the section entirely
+        instead of emitting a placeholder.  Mirrors GEPA O.A.'s
+        ``_build_reflection_prompt_template`` (optimize_anything.py:501-596),
+        which only appends a section when its content is non-empty.
+        """
         prompt = build_merge_prompt("goal", None, None, "")
-        assert "[MERGE COMPLETE]" in prompt
+        assert "## Background / Context" not in prompt
+        assert "no additional background" not in prompt
 
     def test_contains_execution_instructions(self):
         prompt = build_merge_prompt("goal", None, None, "")
@@ -90,14 +92,122 @@ def test_includes_eval_scores_when_provided(self):
         assert "0.8" in prompt
         assert "0.9" in prompt
 
-    def test_handles_none_eval_results(self):
-        # Should not raise even with no eval data
+    def test_strengths_sections_omitted_when_eval_results_none(self):
+        """``eval_result_a`` / ``eval_result_b`` of ``None`` → the
+        corresponding ``## Candidate {A,B} Strengths`` section is skipped
+        entirely (no ``"(no evaluation data)"`` placeholder).
+        """
         prompt = build_merge_prompt("goal", None, None, "some diff")
-        assert "no evaluation data" in prompt
-
-    def test_empty_diff_shows_fallback(self):
+        assert "## Candidate A Strengths" not in prompt
+        assert "## Candidate B Strengths" not in prompt
+        assert "no evaluation data" not in prompt
+
+    def test_diff_section_omitted_when_empty(self):
+        """An empty diff → ``## Diff (B relative to A)`` section is
+        skipped entirely (no ``"(no diff — candidates are identical)"``
+        placeholder).  Upstream callers should not invoke merge on
+        identical candidates; this test pins the prompt-level fallback.
+        """
         prompt = build_merge_prompt("goal", None, None, "")
-        assert "identical" in prompt or "no diff" in prompt
+        assert "## Diff" not in prompt
+        assert "identical" not in prompt
+        assert "no diff" not in prompt
+
+
+class TestBuildMergePromptTwoDiff:
+    """GEPA-parity two-diff form: when ``ancestor_id`` and both
+    ancestor-relative diffs are supplied, the prompt emits two labelled
+    sections instead of the single A↔B diff.  Mirrors GEPA's
+    three-way attribution reasoning (``gepa/proposer/merge.py:163-191``)
+    at the file-hunk level — agent reads off A's contribution and B's
+    contribution directly instead of inferring three-way info from a
+    two-way comparison.
+    """
+
+    def test_emits_two_ancestor_relative_sections(self):
+        prompt = build_merge_prompt(
+            "goal", None, None, "",
+            ancestor_id="g0-s0",
+            diff_a_from_ancestor="+self.bn = nn.BatchNorm1d(...)\n",
+            diff_b_from_ancestor="+if epoch < 10: lr = 0.001\n",
+        )
+        assert "## Diff: Candidate A relative to common ancestor g0-s0" in prompt
+        assert "## Diff: Candidate B relative to common ancestor g0-s0" in prompt
+        assert "self.bn = nn.BatchNorm1d" in prompt
+        assert "if epoch < 10" in prompt
+        # Fallback section MUST NOT also render — the agent should see
+        # exactly the two ancestor-relative diffs, not three sections.
+        assert "## Diff (B relative to A)" not in prompt
+
+    def test_two_diff_form_uses_two_diff_task_block(self):
+        """The two-diff form pairs the ancestor-relative diffs with a
+        task block that explicitly tells the agent A's contribution is
+        already in the working tree (so it doesn't re-apply it) and B's
+        contribution is what needs to be brought in.  Pinning this
+        substring prevents a future refactor from accidentally swapping
+        back to the single-diff task block, which would confuse the
+        agent about whether A's diff needs to be re-applied.
+        """
+        prompt = build_merge_prompt(
+            "goal", None, None, "",
+            ancestor_id="g0-s0",
+            diff_a_from_ancestor="+a contribution\n",
+            diff_b_from_ancestor="+b contribution\n",
+        )
+        assert "A's contribution (the hunks shown in" in prompt
+        # The single-diff task block's "Apply the changes from Candidate B"
+        # phrasing must NOT appear in the two-diff form.
+        assert "Apply the changes from\nCandidate B that are beneficial" not in prompt
+
+    def test_single_diff_form_uses_single_diff_task_block(self):
+        """Companion to the two-diff task-block test: the single-diff
+        fallback must keep the legacy task framing intact.
+        """
+        prompt = build_merge_prompt("goal", None, None, "+some diff")
+        assert "Apply the changes from\nCandidate B" in prompt
+        # The two-diff task block's phrasing must NOT appear in the
+        # single-diff form.
+        assert "A's contribution (the hunks shown in" not in prompt
+
+    def test_single_diff_fallback_when_ancestor_missing(self):
+        """No ancestor + no ancestor diffs → single A↔B diff form
+        (backward-compatible default).
+        """
+        prompt = build_merge_prompt("goal", None, None, "+some diff content")
+        assert "## Diff (B relative to A)" in prompt
+        assert "+some diff content" in prompt
+        # Two-diff section headers must NOT appear in the fallback path.
+        assert "relative to common ancestor" not in prompt
+
+    def test_single_diff_fallback_when_ancestor_id_only(self):
+        """Defensive: an ``ancestor_id`` without both diffs falls back
+        to the single A↔B path.  Prevents a half-configured caller from
+        emitting an "ancestor header pointing to nothing" prompt.
+        """
+        prompt = build_merge_prompt(
+            "goal", None, None, "+legacy diff",
+            ancestor_id="g0-s0",
+            diff_a_from_ancestor=None,
+            diff_b_from_ancestor=None,
+        )
+        assert "## Diff (B relative to A)" in prompt
+        assert "+legacy diff" in prompt
+        assert "relative to common ancestor" not in prompt
+
+    def test_two_diff_form_omits_empty_side(self):
+        """If one of the ancestor-relative diffs is empty (one parent
+        didn't change anything relative to the ancestor), only the
+        non-empty side renders.  Edge case: a parent might "improve" via
+        metadata changes that ``git diff`` doesn't see.
+        """
+        prompt = build_merge_prompt(
+            "goal", None, None, "",
+            ancestor_id="g0-s0",
+            diff_a_from_ancestor="+self.bn = nn.BatchNorm1d(...)\n",
+            diff_b_from_ancestor="",
+        )
+        assert "## Diff: Candidate A relative to common ancestor g0-s0" in prompt
+        assert "## Diff: Candidate B relative to common ancestor g0-s0" not in prompt
 
 
 # ---------------------------------------------------------------------------
@@ -253,6 +363,87 @@ def test_passes_background_to_prompt(self, mocker):
         prompt_arg = mock_invoke.call_args[0][1]
         assert "unique_context_xyz" in prompt_arg
 
+    def test_ancestor_triggers_two_diff_form(self, mocker):
+        """When ``merge()`` receives an ``ancestor`` argument, it must
+        call ``get_diff`` twice (ancestor→A and ancestor→B) and the
+        rendered prompt must contain both ancestor-relative diff
+        sections instead of the single A↔B diff.
+        """
+        ancestor = make_candidate("g0-s0")
+        ca = make_candidate("g1-s0")
+        cb = make_candidate("g1-s1")
+        config = make_config()
+
+        child = make_candidate("g2-m0")
+        mocker.patch("helix.merger.clone_candidate", return_value=child)
+
+        # get_diff is called per-side; sequence captures which two diffs
+        # were requested so we can assert the right pair of arguments
+        # was passed (ancestor→A and ancestor→B, NOT A↔B).
+        diff_calls: list[tuple[str, str]] = []
+
+        def fake_get_diff(x, y):
+            diff_calls.append((x.id, y.id))
+            return f"+contribution from {y.id}"
+
+        mocker.patch("helix.merger.get_diff", side_effect=fake_get_diff)
+        mock_invoke = mocker.patch(
+            "helix.merger.invoke_claude_code", return_value=({}, {})
+        )
+        mocker.patch("helix.merger.snapshot_candidate", return_value="sha")
+        mocker.patch("helix.merger.remove_worktree")
+
+        merge(ca, cb, "g2-m0", config, Path("/tmp"), ancestor=ancestor)
+
+        # Two get_diff calls, both anchored on the ancestor.
+        assert diff_calls == [("g0-s0", "g1-s0"), ("g0-s0", "g1-s1")], (
+            f"expected two ancestor-relative diff calls; got {diff_calls}"
+        )
+
+        # Prompt must contain both ancestor-relative section headers
+        # and neither the legacy A↔B header nor any leftover placeholder.
+        prompt_arg = mock_invoke.call_args[0][1]
+        assert "## Diff: Candidate A relative to common ancestor g0-s0" in prompt_arg
+        assert "## Diff: Candidate B relative to common ancestor g0-s0" in prompt_arg
+        assert "## Diff (B relative to A)" not in prompt_arg
+        assert "contribution from g1-s0" in prompt_arg
+        assert "contribution from g1-s1" in prompt_arg
+
+    def test_no_ancestor_uses_single_diff_form(self, mocker):
+        """Backward-compat: ``merge()`` without an ``ancestor`` argument
+        still computes a single A↔B diff and renders the legacy section.
+        """
+        ca = make_candidate("g1-s0")
+        cb = make_candidate("g1-s1")
+        config = make_config()
+
+        child = make_candidate("g2-m0")
+        mocker.patch("helix.merger.clone_candidate", return_value=child)
+
+        diff_calls: list[tuple[str, str]] = []
+
+        def fake_get_diff(x, y):
+            diff_calls.append((x.id, y.id))
+            return "+A-to-B contribution"
+
+        mocker.patch("helix.merger.get_diff", side_effect=fake_get_diff)
+        mock_invoke = mocker.patch(
+            "helix.merger.invoke_claude_code", return_value=({}, {})
+        )
+        mocker.patch("helix.merger.snapshot_candidate", return_value="sha")
+        mocker.patch("helix.merger.remove_worktree")
+
+        merge(ca, cb, "g2-m0", config, Path("/tmp"))  # no ancestor
+
+        # Single get_diff call with A↔B arguments.
+        assert diff_calls == [("g1-s0", "g1-s1")], (
+            f"expected single A↔B diff call; got {diff_calls}"
+        )
+
+        prompt_arg = mock_invoke.call_args[0][1]
+        assert "## Diff (B relative to A)" in prompt_arg
+        assert "## Diff: Candidate A relative to common ancestor" not in prompt_arg
+
     def test_imports_mutation_error_from_mutator(self):
         """merger.py must reuse MutationError from mutator to avoid duplication."""
         from helix.merger import MutationError as MergerME
diff --git a/tests/unit/test_mutator.py b/tests/unit/test_mutator.py
index e3fb7ae..b6552f1 100644
--- a/tests/unit/test_mutator.py
+++ b/tests/unit/test_mutator.py
@@ -116,25 +116,31 @@ def test_contains_background(self):
         prompt = build_mutation_prompt("goal", er, background="special context here")
         assert "special context here" in prompt
 
-    def test_default_background_when_none(self):
+    def test_background_section_omitted_when_none(self):
+        """GEPA parity: empty optional inputs skip the section entirely
+        instead of emitting a placeholder.  Mirrors GEPA O.A.'s
+        ``_build_reflection_prompt_template`` (optimize_anything.py:501-596),
+        which only appends a section when its content is non-empty.
+        """
         er = make_eval_result()
         prompt = build_mutation_prompt("goal", er, background=None)
-        assert "no additional background" in prompt
-
-    def test_contains_mutation_complete_marker(self):
-        er = make_eval_result()
-        prompt = build_mutation_prompt("goal", er)
-        assert "[MUTATION COMPLETE]" in prompt
+        assert "## Background / Context" not in prompt
+        assert "no additional background" not in prompt
 
     def test_contains_execution_instructions(self):
         er = make_eval_result()
         prompt = build_mutation_prompt("goal", er)
         assert "Task instructions:" in prompt
 
-    def test_no_scores_fallback(self):
+    def test_scores_section_omitted_when_empty(self):
+        """No ``eval_result.scores`` → ``## Current Evaluation Scores``
+        section is skipped entirely (no ``"(no scores recorded)"``
+        placeholder).
+        """
         er = make_eval_result(scores={})
         prompt = build_mutation_prompt("goal", er)
-        assert "no scores recorded" in prompt
+        assert "## Current Evaluation Scores" not in prompt
+        assert "no scores recorded" not in prompt
 
     def test_renders_helix_log_notes(self):
         er = make_eval_result(
@@ -207,6 +213,41 @@ def test_returncode_sentinel_does_not_leak_into_prompt(self):
         assert "real-extra" in prompt
 
 
+class TestTurnBudgetArticleAgreement:
+    """``_turn_budget_section`` must use the correct indefinite article
+    ("a" vs "an") so the rendered prompt reads naturally:
+
+      * ``"You have a 5-turn limit"`` (consonant sound — "five")
+      * ``"You have an 8-turn limit"`` (vowel sound — "eight")
+
+    Pre-fix the article was always hardcoded ``"a"``, producing
+    ``"You have a 8-turn limit"`` for the 8/11/18/80s cases.
+    """
+
+    def test_consonant_leading_numbers_use_a(self) -> None:
+        from helix.mutator import _turn_budget_section
+
+        for n in (1, 2, 3, 4, 5, 6, 7, 9, 10, 12, 17, 19, 20, 50, 100, 200):
+            section = _turn_budget_section(n)
+            assert f"You have a {n}-turn limit" in section, (
+                f"max_turns={n} should use 'a', section was: {section!r}"
+            )
+
+    def test_vowel_leading_numbers_use_an(self) -> None:
+        from helix.mutator import _turn_budget_section
+
+        for n in (8, 11, 18, 80, 85, 88, 800, 888):
+            section = _turn_budget_section(n)
+            assert f"You have an {n}-turn limit" in section, (
+                f"max_turns={n} should use 'an', section was: {section!r}"
+            )
+
+    def test_none_returns_empty(self) -> None:
+        from helix.mutator import _turn_budget_section
+
+        assert _turn_budget_section(None) == ""
+
+
 class TestPerExampleDiagnostics:
     """``build_mutation_prompt`` renders ``eval_result.per_example_side_info``
     as the Diagnostics section under the new GEPA O.A. contract
@@ -557,7 +598,7 @@ def test_artifact_contains_rendered_prompt(self, tmp_path: Path, mocker):
         # Objective + autonomous-rules block are both in the rendered
         # prompt via build_mutation_prompt.
         assert config.objective in content
-        assert "[MUTATION COMPLETE]" in content
+        assert "Task instructions:" in content
 
     def test_gitignore_excludes_helix_artifacts(self, tmp_path: Path, mocker):
         """``.gitignore`` in the worktree gains entries for the prompt
diff --git a/tests/unit/test_mutator_seedless.py b/tests/unit/test_mutator_seedless.py
index f8fd9ab..b9734e9 100644
--- a/tests/unit/test_mutator_seedless.py
+++ b/tests/unit/test_mutator_seedless.py
@@ -70,11 +70,6 @@ def test_no_evaluator_section_when_absent(self):
         )
         assert "Evaluator" not in prompt
 
-    def test_completion_signal_in_prompt(self):
-        """The completion signal string must appear in the prompt."""
-        prompt = build_seed_generation_prompt(objective="Test objective")
-        assert "[SEED GENERATION COMPLETE]" in prompt
-
     def test_returns_string(self):
         """build_seed_generation_prompt must return a str."""
         result = build_seed_generation_prompt(objective="Test")
diff --git a/tests/unit/test_semlog.py b/tests/unit/test_semlog.py
deleted file mode 100644
index 33a058e..0000000
--- a/tests/unit/test_semlog.py
+++ /dev/null
@@ -1,155 +0,0 @@
-"""Unit tests for semantic mutation logging (semlog) functionality."""
-
-from __future__ import annotations
-
-
-
-from helix.mutator import parse_mutation_summary
-
-
-# ---------------------------------------------------------------------------
-# Tests: parse_mutation_summary
-# ---------------------------------------------------------------------------
-
-
-class TestParseMutationSummaryComplete:
-    """test_parse_mutation_summary_complete: full block parsed correctly."""
-
-    def test_all_mutation_fields_parsed(self):
-        output = """\
-Some Claude output here...
-[MUTATION COMPLETE]
-[SUMMARY]
-files_changed: src/model.py, src/utils.py
-root_cause: The model was not normalising input features before training.
-changes_made: Added a StandardScaler preprocessing step. Updated the training pipeline.
-reasoning: Normalisation is the most direct fix for scale-sensitive algorithms.
-expected_impact: Expect accuracy to improve by 5-10% based on typical gains from this fix.
-[END SUMMARY]
-"""
-        result = parse_mutation_summary(output)
-
-        assert result["files_changed"] == "src/model.py, src/utils.py"
-        assert result["root_cause"] == "The model was not normalising input features before training."
-        assert "StandardScaler" in result["changes_made"]
-        assert result["reasoning"] == "Normalisation is the most direct fix for scale-sensitive algorithms."
-        assert "5-10%" in result["expected_impact"]
-
-    def test_returns_dict_type(self):
-        output = "[SUMMARY]\nfiles_changed: foo.py\n[END SUMMARY]"
-        result = parse_mutation_summary(output)
-        assert isinstance(result, dict)
-
-    def test_keys_are_stripped(self):
-        output = "[SUMMARY]\n  files_changed : foo.py\n[END SUMMARY]"
-        result = parse_mutation_summary(output)
-        assert "files_changed" in result
-
-    def test_values_are_stripped(self):
-        output = "[SUMMARY]\nfiles_changed:   bar.py  \n[END SUMMARY]"
-        result = parse_mutation_summary(output)
-        assert result["files_changed"] == "bar.py"
-
-    def test_value_with_colon_preserved(self):
-        """Values containing ':' should keep everything after the first ': '."""
-        output = "[SUMMARY]\nroot_cause: Issue in module: core/parser.py\n[END SUMMARY]"
-        result = parse_mutation_summary(output)
-        assert result["root_cause"] == "Issue in module: core/parser.py"
-
-    def test_stops_at_end_summary(self):
-        """Lines after [END SUMMARY] should not be parsed."""
-        output = (
-            "[SUMMARY]\nfiles_changed: a.py\n[END SUMMARY]\nextra_key: should_not_appear\n"
-        )
-        result = parse_mutation_summary(output)
-        assert "extra_key" not in result
-
-
-class TestParseMutationSummaryMissingBlock:
-    """test_parse_mutation_summary_missing_block: returns empty dict gracefully (never crash)."""
-
-    def test_returns_empty_dict_when_no_block(self):
-        output = "Claude Code output with no summary block at all."
-        result = parse_mutation_summary(output)
-        assert result == {}
-
-    def test_returns_empty_dict_on_empty_string(self):
-        result = parse_mutation_summary("")
-        assert result == {}
-
-    def test_returns_empty_dict_on_partial_block_no_end(self):
-        """[SUMMARY] without [END SUMMARY] — reads to EOF, may have partial data."""
-        output = "[SUMMARY]\nfiles_changed: a.py\n"
-        result = parse_mutation_summary(output)
-        # Should not crash; may return partial dict
-        assert isinstance(result, dict)
-
-    def test_returns_empty_dict_on_only_end_marker(self):
-        output = "[END SUMMARY]\nfiles_changed: a.py\n"
-        result = parse_mutation_summary(output)
-        assert result == {}
-
-    def test_no_exception_on_none_like_empty_string(self):
-        """Ensure no crash even on edge-case whitespace-only input."""
-        result = parse_mutation_summary("   \n\n\n   ")
-        assert result == {}
-
-    def test_no_exception_on_malformed_lines(self):
-        output = "[SUMMARY]\nthis line has no colon separator\nanother bad line\n[END SUMMARY]"
-        result = parse_mutation_summary(output)
-        assert isinstance(result, dict)
-        # None of the malformed lines should produce keys
-        assert len(result) == 0
-
-
-class TestParseMergeSummary:
-    """test_parse_merge_summary: merge-specific fields parsed."""
-
-    def test_all_merge_fields_parsed(self):
-        output = """\
-Merging candidates...
-[MERGE COMPLETE]
-[SUMMARY]
-files_changed: src/solver.py, tests/test_solver.py
-candidate_a_kept: The base algorithm from A because it had better convergence.
-candidate_b_applied: The caching layer from B because it reduced redundant computation.
-conflicts_resolved: Both modified the solve() function; kept A's logic with B's cache wrapper.
-merge_strategy: Apply B's performance improvements on top of A's correctness fixes.
-expected_impact: Combined improvements should yield 15% speedup and maintain accuracy.
-[END SUMMARY]
-"""
-        result = parse_mutation_summary(output)
-
-        assert result["files_changed"] == "src/solver.py, tests/test_solver.py"
-        assert "base algorithm" in result["candidate_a_kept"]
-        assert "caching layer" in result["candidate_b_applied"]
-        assert "solve()" in result["conflicts_resolved"]
-        assert result["merge_strategy"] == "Apply B's performance improvements on top of A's correctness fixes."
-        assert "15%" in result["expected_impact"]
-
-    def test_merge_fields_present_as_keys(self):
-        output = (
-            "[SUMMARY]\n"
-            "files_changed: x.py\n"
-            "candidate_a_kept: kept A's approach\n"
-            "candidate_b_applied: applied B's fix\n"
-            "conflicts_resolved: none\n"
-            "merge_strategy: selective apply\n"
-            "expected_impact: minor improvement\n"
-            "[END SUMMARY]"
-        )
-        result = parse_mutation_summary(output)
-        expected_keys = {
-            "files_changed",
-            "candidate_a_kept",
-            "candidate_b_applied",
-            "conflicts_resolved",
-            "merge_strategy",
-            "expected_impact",
-        }
-        assert expected_keys.issubset(result.keys())
-
-    def test_merge_summary_no_block_returns_empty(self):
-        output = "Merge done. No structured summary provided."
-        result = parse_mutation_summary(output)
-        assert result == {}