From 043c7d81ba41c956ad37cd5a483613a53ea5dcee Mon Sep 17 00:00:00 2001
From: Clayton Thompson <claytonthompson@MacBook-Pro.local>
Date: Sat, 21 Mar 2026 21:41:45 +0300
Subject: [PATCH] feat: add test-both override for empirical deadlock
 resolution
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

When the critique loop stagnates (ESCALATE), the only options today are
add-note, force-proceed, or abort — all of which punt the decision to the
human without evidence. This adds a test-both override that invokes a judge
agent to evaluate the current plan against an alternative approach that it
proposes, then renders a verdict (approach_a, approach_b, or synthesis) based
on empirical assessment.

Changes:
- New test-both.json schema for structured judge output
- Judge prompt in prompts.py that evaluates both approaches against
  unresolved flags
- _override_test_both handler in cli.py with full state machine integration
- Mock worker for test-both in workers.py
- Default agent routing (claude) in _core.py
- Updated infer_next_steps to surface test-both for ESCALATE/ABORT
- Documentation in instructions.md
- 15 new tests covering all verdict paths, state transitions, and schema

Co-Authored-By: Claude Opus 4.6 (1M context) <noreply@anthropic.com>
---
 megaplan/_core.py             |   1 +
 megaplan/cli.py               |  85 ++++++++-
 megaplan/data/instructions.md |  17 ++
 megaplan/prompts.py           |  58 ++++++
 megaplan/schemas.py           |  34 ++++
 megaplan/workers.py           |  27 +++
 tests/test_megaplan.py        |   4 +-
 tests/test_schemas.py         |   2 +-
 tests/test_test_both.py       | 326 ++++++++++++++++++++++++++++++++++
 9 files changed, 549 insertions(+), 5 deletions(-)
 create mode 100644 tests/test_test_both.py

diff --git a/megaplan/_core.py b/megaplan/_core.py
index 553f079..9e140ec 100644
--- a/megaplan/_core.py
+++ b/megaplan/_core.py
@@ -210,6 +210,7 @@ class StepResponse(TypedDict, total=False):
     "integrate": "claude",
     "execute": "codex",
     "review": "codex",
+    "test-both": "claude",
 }
 KNOWN_AGENTS = ["claude", "codex"]
 ROBUSTNESS_LEVELS = ("light", "standard", "thorough")
diff --git a/megaplan/cli.py b/megaplan/cli.py
index 4e8de1f..f2736b4 100755
--- a/megaplan/cli.py
+++ b/megaplan/cli.py
@@ -226,7 +226,7 @@ def infer_next_steps(state: PlanState) -> list[str]:
         if recommendation in {"SKIP", "CONTINUE"}:
             valid.append("gate")
         if recommendation in {"ESCALATE", "ABORT"}:
-            valid.extend(["override add-note", "override force-proceed", "override abort"])
+            valid.extend(["override test-both", "override add-note", "override force-proceed", "override abort"])
         return valid or ["override add-note", "override abort"]
     if current == STATE_GATED:
         return ["execute"]
@@ -1008,11 +1008,92 @@ def _override_skip(plan_dir: Path, state: PlanState, args: argparse.Namespace) -
     }
 
 
+def _override_test_both(plan_dir: Path, state: PlanState, args: argparse.Namespace) -> StepResponse:
+    if state["current_state"] != STATE_EVALUATED:
+        raise CliError(
+            "invalid_transition",
+            "test-both is only supported from evaluated state",
+            valid_next=infer_next_steps(state),
+        )
+    recommendation = state["last_evaluation"].get("recommendation")
+    if recommendation not in {"ESCALATE", "ABORT"}:
+        raise CliError(
+            "invalid_transition",
+            f"test-both requires an ESCALATE or ABORT evaluation, got {recommendation!r}",
+            valid_next=infer_next_steps(state),
+        )
+    root = args._test_both_root if hasattr(args, "_test_both_root") else Path.cwd()
+    try:
+        worker, agent, mode, refreshed = run_step_with_worker("test-both", state, plan_dir, args, root=root)
+    except CliError as error:
+        record_step_failure(plan_dir, state, step="test-both", iteration=state["iteration"], error=error)
+        raise
+    test_both_filename = "test-both.json"
+    atomic_write_json(plan_dir / test_both_filename, worker.payload)
+    verdict = worker.payload["verdict"]
+    rationale = worker.payload["verdict_rationale"]
+    apply_session_update(state, "test-both", agent, worker.session_id, mode=mode, refreshed=refreshed)
+    append_history(
+        state,
+        make_history_entry(
+            "test-both",
+            duration_ms=worker.duration_ms,
+            cost_usd=worker.cost_usd,
+            result="success",
+            worker=worker,
+            agent=agent,
+            mode=mode,
+            output_file=test_both_filename,
+            artifact_hash=sha256_file(plan_dir / test_both_filename),
+            recommendation=verdict,
+        ),
+    )
+    if verdict == "approach_a":
+        # Current plan wins — proceed to gate
+        gate = run_gate_checks(plan_dir, state)
+        atomic_write_json(plan_dir / "gate.json", gate)
+        if gate["passed"]:
+            final_plan = latest_plan_path(plan_dir, state).read_text(encoding="utf-8")
+            atomic_write_text(plan_dir / "final.md", final_plan)
+            state["current_state"] = STATE_GATED
+        next_step = "execute" if gate["passed"] else "integrate"
+    elif verdict == "approach_b":
+        # Alternative wins — need to integrate the alternative into the plan
+        next_step = "integrate"
+    else:
+        # Synthesis — need to integrate the synthesis
+        next_step = "integrate"
+    evaluation = copy.deepcopy(state["last_evaluation"])
+    evaluation["recommendation"] = "SKIP" if verdict == "approach_a" else "CONTINUE"
+    state["last_evaluation"] = evaluation
+    _append_to_meta(state, "overrides", {
+        "action": "test-both",
+        "timestamp": now_utc(),
+        "verdict": verdict,
+        "rationale": rationale,
+        "reason": args.reason,
+    })
+    save_state(plan_dir, state)
+    response: StepResponse = {
+        "success": True,
+        "step": "override",
+        "summary": f"Test-both complete. Verdict: {verdict}. {rationale}",
+        "artifacts": [test_both_filename],
+        "next_step": next_step,
+        "state": state["current_state"],
+    }
+    if verdict == "synthesis" and worker.payload.get("synthesis_description"):
+        response["message"] = worker.payload["synthesis_description"]
+    attach_agent_fallback(response, args)
+    return response
+
+
 _OVERRIDE_ACTIONS: dict[str, Callable[[Path, PlanState, argparse.Namespace], StepResponse]] = {
     "add-note": _override_add_note,
     "abort": _override_abort,
     "force-proceed": _override_force_proceed,
     "skip": _override_skip,
+    "test-both": _override_test_both,
 }
 
 
@@ -1292,7 +1373,7 @@ def build_parser() -> argparse.ArgumentParser:
     config_sub.add_parser("reset")
 
     override_parser = subparsers.add_parser("override")
-    override_parser.add_argument("override_action", choices=["skip", "abort", "force-proceed", "add-note"])
+    override_parser.add_argument("override_action", choices=["skip", "abort", "force-proceed", "add-note", "test-both"])
     override_parser.add_argument("--plan")
     override_parser.add_argument("--reason", default="")
     override_parser.add_argument("--note")
diff --git a/megaplan/data/instructions.md b/megaplan/data/instructions.md
index cd1d329..631f0f7 100644
--- a/megaplan/data/instructions.md
+++ b/megaplan/data/instructions.md
@@ -83,6 +83,22 @@ Auto-force-proceed (and tell the user why) when:
 - `suggested_override` is `"force-proceed"`, OR
 - Robustness is `light` and `weighted_score` < 4.0
 
+When the critique loop has stagnated (recurring critiques or a score that is not improving)
+and the evaluation recommends ESCALATE or ABORT, consider using `test-both` to break the deadlock empirically:
+
+```bash
+megaplan override test-both --plan <name> --reason "critique loop stagnated"
+```
+
+This invokes a judge agent that evaluates both the current plan and an alternative
+approach against the unresolved flags, then renders a verdict (approach_a, approach_b,
+or synthesis). The verdict determines the next step:
+- `approach_a` (current plan wins) → runs the gate checks immediately; the next step is execute if they pass, otherwise integrate
+- `approach_b` or `synthesis` → proceeds to integrate with the judge's recommendations
+
+Use `test-both` when the same concerns keep recurring across iterations and neither
+`force-proceed` nor `add-note` would resolve the impasse.
+
 Otherwise, present the evaluation details and ask the user what to do.
 
 ## Minor Megaplan Edits
@@ -114,5 +130,6 @@ megaplan status --plan <name>
 megaplan audit --plan <name>
 megaplan list
 megaplan override add-note --plan <name> --note "user context"
+megaplan override test-both --plan <name> --reason "critique loop stagnated"
 megaplan override abort --plan <name>
 ```
diff --git a/megaplan/prompts.py b/megaplan/prompts.py
index 9cc9968..4251b80 100644
--- a/megaplan/prompts.py
+++ b/megaplan/prompts.py
@@ -249,6 +249,62 @@ def _execute_prompt(state: PlanState, plan_dir: Path) -> str:
     ).strip()
 
 
+def _test_both_prompt(state: PlanState, plan_dir: Path) -> str:
+    project_dir = Path(state["config"]["project_dir"])
+    latest_plan = latest_plan_path(plan_dir, state).read_text(encoding="utf-8")
+    latest_meta = read_json(latest_plan_meta_path(plan_dir, state))
+    flag_registry = load_flag_registry(plan_dir)
+    unresolved = unresolved_significant_flags(flag_registry)
+    open_flags = [
+        {
+            "id": flag["id"],
+            "severity": flag.get("severity"),
+            "concern": flag.get("concern"),
+            "evidence": flag.get("evidence"),
+        }
+        for flag in unresolved
+    ]
+    return textwrap.dedent(
+        f"""
+        You are a neutral judge resolving a deadlock between a planner and a critic.
+        The critique loop has stagnated — the same concerns keep recurring despite
+        revisions. Your job is to test both the current plan AND an alternative
+        approach, then rule based on evidence.
+
+        Project directory:
+        {project_dir}
+
+        {intent_and_notes_block(state)}
+
+        Current plan (Approach A):
+        {latest_plan}
+
+        Plan metadata:
+        {json_dump(latest_meta).strip()}
+
+        Unresolved flags from the critic (the concerns driving the deadlock):
+        {json_dump(open_flags).strip()}
+
+        Requirements:
+        - Inspect the actual repository before judging.
+        - Evaluate Approach A (the current plan) against the unresolved flags.
+          For each flag, determine: does the plan actually have this problem,
+          or is the critic being overly cautious?
+        - Propose Approach B: an alternative that addresses the unresolved flags
+          differently. This could be a modified version of the plan, a simpler
+          approach, or a fundamentally different strategy.
+        - For BOTH approaches, assess:
+          1. Would it build and pass existing tests? (build_pass, test_pass)
+          2. What concrete issues would it cause? (issues)
+          3. What evidence supports your assessment? (evidence)
+        - Render a verdict: approach_a, approach_b, or synthesis.
+        - If synthesis, describe what to take from each approach.
+        - Judge based on correctness and practicality, not elegance.
+        - An approach that would fail to build loses automatically.
+        """
+    ).strip()
+
+
 def _review_claude_prompt(state: PlanState, plan_dir: Path) -> str:
     project_dir = Path(state["config"]["project_dir"])
     latest_plan = latest_plan_path(plan_dir, state).read_text(encoding="utf-8")
@@ -332,6 +388,7 @@ def _review_codex_prompt(state: PlanState, plan_dir: Path) -> str:
     "critique": _critique_prompt,
     "execute": _execute_prompt,
     "review": _review_claude_prompt,
+    "test-both": _test_both_prompt,
 }
 
 _CODEX_PROMPT_BUILDERS: dict[str, Any] = {
@@ -341,6 +398,7 @@ def _review_codex_prompt(state: PlanState, plan_dir: Path) -> str:
     "critique": _critique_prompt,
     "execute": _execute_prompt,
     "review": _review_codex_prompt,
+    "test-both": _test_both_prompt,
 }
 
 
diff --git a/megaplan/schemas.py b/megaplan/schemas.py
index 1bd23b1..b636a34 100644
--- a/megaplan/schemas.py
+++ b/megaplan/schemas.py
@@ -112,6 +112,40 @@
         },
         "required": ["criteria", "issues"],
     },
+    "test-both.json": {
+        "type": "object",
+        "properties": {
+            "approach_a": {
+                "type": "object",
+                "properties": {
+                    "label": {"type": "string"},
+                    "build_pass": {"type": "boolean"},
+                    "test_pass": {"type": "boolean"},
+                    "issues": {"type": "array", "items": {"type": "string"}},
+                    "evidence": {"type": "string"},
+                },
+                "required": ["label", "build_pass", "test_pass", "issues", "evidence"],
+            },
+            "approach_b": {
+                "type": "object",
+                "properties": {
+                    "label": {"type": "string"},
+                    "build_pass": {"type": "boolean"},
+                    "test_pass": {"type": "boolean"},
+                    "issues": {"type": "array", "items": {"type": "string"}},
+                    "evidence": {"type": "string"},
+                },
+                "required": ["label", "build_pass", "test_pass", "issues", "evidence"],
+            },
+            "verdict": {
+                "type": "string",
+                "enum": ["approach_a", "approach_b", "synthesis"],
+            },
+            "verdict_rationale": {"type": "string"},
+            "synthesis_description": {"type": "string"},
+        },
+        "required": ["approach_a", "approach_b", "verdict", "verdict_rationale"],
+    },
 }
 
 
diff --git a/megaplan/workers.py b/megaplan/workers.py
index 9990f0d..f065f1a 100644
--- a/megaplan/workers.py
+++ b/megaplan/workers.py
@@ -45,6 +45,7 @@
     "critique": "critique.json",
     "execute": "execution.json",
     "review": "review.json",
+    "test-both": "test-both.json",
 }
 
 # Derive required keys per step from SCHEMAS so they aren't duplicated.
@@ -304,6 +305,29 @@ def _mock_review(state: PlanState, plan_dir: Path) -> WorkerResult:
     return WorkerResult(payload=payload, raw_output=json_dump(payload), duration_ms=10, cost_usd=0.0, session_id=str(uuid.uuid4()))
 
 
+def _mock_test_both(state: PlanState, plan_dir: Path) -> WorkerResult:
+    payload = {
+        "approach_a": {
+            "label": "Current plan",
+            "build_pass": True,
+            "test_pass": True,
+            "issues": [],
+            "evidence": "The current plan builds and passes existing tests.",
+        },
+        "approach_b": {
+            "label": "Simplified alternative addressing unresolved flags",
+            "build_pass": True,
+            "test_pass": True,
+            "issues": ["Requires minor refactor of existing module structure."],
+            "evidence": "Alternative approach resolves the flagged concerns with a simpler design.",
+        },
+        "verdict": "synthesis",
+        "verdict_rationale": "Both approaches build and pass tests. Approach A is more complete but carries the flagged risks. Approach B addresses the flags but introduces a minor refactor. A synthesis takes the core structure from A with the risk mitigations from B.",
+        "synthesis_description": "Keep the current plan structure but incorporate the critic's suggested safeguards for the flagged concerns.",
+    }
+    return WorkerResult(payload=payload, raw_output=json_dump(payload), duration_ms=10, cost_usd=0.0, session_id=str(uuid.uuid4()))
+
+
 _MOCK_DISPATCH: dict[str, Any] = {
     "clarify": _mock_clarify,
     "plan": _mock_plan,
@@ -311,6 +335,7 @@ def _mock_review(state: PlanState, plan_dir: Path) -> WorkerResult:
     "integrate": _mock_integrate,
     "execute": _mock_execute,
     "review": _mock_review,
+    "test-both": _mock_test_both,
 }
 
 
@@ -330,6 +355,8 @@ def session_key_for(step: str, agent: str) -> str:
         return f"{agent}_executor"
     if step == "review":
         return f"{agent}_reviewer"
+    if step == "test-both":
+        return f"{agent}_judge"
     return f"{agent}_{step}"
 
 
diff --git a/tests/test_megaplan.py b/tests/test_megaplan.py
index 3db39df..1cea455 100644
--- a/tests/test_megaplan.py
+++ b/tests/test_megaplan.py
@@ -320,8 +320,8 @@ def test_infer_next_steps_non_evaluated_states(current_state: str, last_evaluati
     [
         ("CONTINUE", ["integrate", "gate"]),
         ("SKIP", ["gate"]),
-        ("ESCALATE", ["override add-note", "override force-proceed", "override abort"]),
-        ("ABORT", ["override add-note", "override force-proceed", "override abort"]),
+        ("ESCALATE", ["override test-both", "override add-note", "override force-proceed", "override abort"]),
+        ("ABORT", ["override test-both", "override add-note", "override force-proceed", "override abort"]),
         (None, ["override add-note", "override abort"]),
     ],
 )
diff --git a/tests/test_schemas.py b/tests/test_schemas.py
index 4f7fcca..3703e62 100644
--- a/tests/test_schemas.py
+++ b/tests/test_schemas.py
@@ -94,7 +94,7 @@ def test_list_passthrough(self) -> None:
 
 class TestSCHEMAS:
     def test_schemas_contains_expected_keys(self) -> None:
-        expected = {"clarify.json", "plan.json", "integrate.json", "critique.json", "execution.json", "review.json"}
+        expected = {"clarify.json", "plan.json", "integrate.json", "critique.json", "execution.json", "review.json", "test-both.json"}
         assert expected == set(SCHEMAS.keys())
 
     def test_all_schemas_are_objects(self) -> None:
diff --git a/tests/test_test_both.py b/tests/test_test_both.py
new file mode 100644
index 0000000..c52e078
--- /dev/null
+++ b/tests/test_test_both.py
@@ -0,0 +1,326 @@
+"""Tests for the test-both override action."""
+from __future__ import annotations
+
+import json
+from argparse import Namespace
+from pathlib import Path
+from typing import Any
+
+import pytest
+
+import megaplan.cli as megaplan
+import megaplan.cli
+import megaplan.workers
+from megaplan._core import (
+    MOCK_ENV_VAR,
+    read_json,
+    atomic_write_json,
+)
+
+
+# ---------------------------------------------------------------------------
+# Fixtures (matching test_megaplan.py patterns)
+# ---------------------------------------------------------------------------
+
+def make_args_factory(project_dir: Path):
+    def make_args(**overrides) -> Namespace:
+        data = {
+            "plan": None, "idea": "test idea", "name": "test-plan",
+            "project_dir": str(project_dir), "max_iterations": 3,
+            "budget_usd": 25.0, "auto_approve": False, "robustness": "standard",
+            "agent": None, "ephemeral": False, "fresh": False, "persist": False,
+            "confirm_destructive": True, "user_approved": False,
+            "confirm_self_review": False,
+            "override_action": None, "note": None, "reason": "",
+        }
+        data.update(overrides)
+        return Namespace(**data)
+    return make_args
+
+
+def load_state(plan_dir: Path) -> dict:
+    return read_json(plan_dir / "state.json")
+
+
+@pytest.fixture()
+def plan_fixture(tmp_path: Path, monkeypatch: pytest.MonkeyPatch):
+    root = tmp_path / "root"
+    project_dir = tmp_path / "project"
+    project_dir.mkdir()
+    (project_dir / ".git").mkdir()
+    monkeypatch.setenv(MOCK_ENV_VAR, "1")
+    monkeypatch.setattr(
+        megaplan.cli.shutil, "which",
+        lambda name: "/usr/bin/mock" if name in {"claude", "codex"} else None,
+    )
+
+    make_args = make_args_factory(project_dir)
+    init_args = make_args(idea="test idea", name="test-plan")
+    megaplan.handle_init(root, init_args)
+
+    plan_dir = root / ".megaplan" / "plans" / "test-plan"
+    return {
+        "root": root,
+        "project_dir": project_dir,
+        "plan_dir": plan_dir,
+        "make_args": make_args,
+    }
+
+
+def advance_to_evaluated(fx: dict) -> None:
+    """Advance plan through plan → critique → evaluate."""
+    args = fx["make_args"](plan="test-plan")
+    megaplan.handle_plan(fx["root"], args)
+    megaplan.handle_critique(fx["root"], args)
+    megaplan.handle_evaluate(fx["root"], args)
+
+
+def force_escalate(fx: dict) -> None:
+    """Advance to evaluated, then mutate evaluation to ESCALATE."""
+    advance_to_evaluated(fx)
+    state = load_state(fx["plan_dir"])
+    state["last_evaluation"]["recommendation"] = "ESCALATE"
+    state["last_evaluation"]["valid_next_steps"] = [
+        "override test-both", "override add-note",
+        "override force-proceed", "override abort",
+    ]
+    atomic_write_json(fx["plan_dir"] / "state.json", state)
+
+
+# ---------------------------------------------------------------------------
+# Tests
+# ---------------------------------------------------------------------------
+
+class TestTestBothOverride:
+    def test_test_both_requires_evaluated_state(self, plan_fixture: dict) -> None:
+        """test-both should fail if not in evaluated state."""
+        args = plan_fixture["make_args"](
+            plan="test-plan", override_action="test-both", reason="test",
+        )
+        with pytest.raises(megaplan.CliError) as exc_info:
+            megaplan.handle_override(plan_fixture["root"], args)
+        assert exc_info.value.code == "invalid_transition"
+
+    def test_test_both_requires_escalate_recommendation(self, plan_fixture: dict) -> None:
+        """test-both should fail if evaluation is SKIP or CONTINUE."""
+        advance_to_evaluated(plan_fixture)
+        args = plan_fixture["make_args"](
+            plan="test-plan", override_action="test-both", reason="test",
+        )
+        args._test_both_root = plan_fixture["root"]
+        with pytest.raises(megaplan.CliError) as exc_info:
+            megaplan.handle_override(plan_fixture["root"], args)
+        assert exc_info.value.code == "invalid_transition"
+
+    def test_test_both_success(self, plan_fixture: dict) -> None:
+        """test-both should succeed from ESCALATE state and write artifacts."""
+        force_escalate(plan_fixture)
+        args = plan_fixture["make_args"](
+            plan="test-plan", override_action="test-both", reason="critique stagnated",
+        )
+        args._test_both_root = plan_fixture["root"]
+        result = megaplan.handle_override(plan_fixture["root"], args)
+
+        assert result["success"] is True
+        assert result["step"] == "override"
+        assert "test-both.json" in result["artifacts"]
+        assert (plan_fixture["plan_dir"] / "test-both.json").exists()
+
+    def test_test_both_writes_verdict_to_state(self, plan_fixture: dict) -> None:
+        """test-both should record the verdict in meta.overrides."""
+        force_escalate(plan_fixture)
+        args = plan_fixture["make_args"](
+            plan="test-plan", override_action="test-both", reason="deadlock",
+        )
+        args._test_both_root = plan_fixture["root"]
+        megaplan.handle_override(plan_fixture["root"], args)
+
+        state = load_state(plan_fixture["plan_dir"])
+        last_override = state["meta"]["overrides"][-1]
+        assert last_override["action"] == "test-both"
+        assert last_override["verdict"] in {"approach_a", "approach_b", "synthesis"}
+        assert last_override["reason"] == "deadlock"
+
+    def test_test_both_records_history(self, plan_fixture: dict) -> None:
+        """test-both should append a history entry."""
+        force_escalate(plan_fixture)
+        args = plan_fixture["make_args"](
+            plan="test-plan", override_action="test-both", reason="test",
+        )
+        args._test_both_root = plan_fixture["root"]
+        megaplan.handle_override(plan_fixture["root"], args)
+
+        state = load_state(plan_fixture["plan_dir"])
+        test_both_entries = [h for h in state["history"] if h["step"] == "test-both"]
+        assert len(test_both_entries) == 1
+        assert test_both_entries[0]["result"] == "success"
+        assert test_both_entries[0]["output_file"] == "test-both.json"
+
+    def test_test_both_synthesis_sets_continue(self, plan_fixture: dict) -> None:
+        """Mock returns synthesis verdict — evaluation should be set to CONTINUE."""
+        force_escalate(plan_fixture)
+        args = plan_fixture["make_args"](
+            plan="test-plan", override_action="test-both", reason="test",
+        )
+        args._test_both_root = plan_fixture["root"]
+        result = megaplan.handle_override(plan_fixture["root"], args)
+
+        # Default mock returns "synthesis" verdict
+        state = load_state(plan_fixture["plan_dir"])
+        assert state["last_evaluation"]["recommendation"] == "CONTINUE"
+        assert result["next_step"] == "integrate"
+
+    def test_test_both_approach_a_wins(
+        self, plan_fixture: dict, monkeypatch: pytest.MonkeyPatch,
+    ) -> None:
+        """When approach_a wins, should proceed toward gate."""
+        force_escalate(plan_fixture)
+        original_mock = megaplan.workers.mock_worker_output
+
+        def mock_approach_a_wins(step: str, state: dict, plan_dir: Path):
+            if step == "test-both":
+                payload = {
+                    "approach_a": {
+                        "label": "Current plan",
+                        "build_pass": True, "test_pass": True,
+                        "issues": [], "evidence": "Plan is solid.",
+                    },
+                    "approach_b": {
+                        "label": "Alternative",
+                        "build_pass": False, "test_pass": False,
+                        "issues": ["Fails to build."],
+                        "evidence": "Alternative has compilation errors.",
+                    },
+                    "verdict": "approach_a",
+                    "verdict_rationale": "Current plan builds; alternative does not.",
+                }
+                return megaplan.workers.WorkerResult(
+                    payload=payload, raw_output=json.dumps(payload),
+                    duration_ms=10, cost_usd=0.0, session_id="test",
+                )
+            return original_mock(step, state, plan_dir)
+
+        monkeypatch.setattr(megaplan.workers, "mock_worker_output", mock_approach_a_wins)
+
+        args = plan_fixture["make_args"](
+            plan="test-plan", override_action="test-both", reason="test",
+        )
+        args._test_both_root = plan_fixture["root"]
+        result = megaplan.handle_override(plan_fixture["root"], args)
+
+        state = load_state(plan_fixture["plan_dir"])
+        assert state["last_evaluation"]["recommendation"] == "SKIP"
+        # Gate may not pass if unresolved flags exist from the escalation setup.
+        # The important thing is the verdict was recorded and evaluation set to SKIP.
+        assert (plan_fixture["plan_dir"] / "gate.json").exists()
+        assert result["next_step"] in {"execute", "integrate"}
+
+    def test_test_both_approach_b_wins(
+        self, plan_fixture: dict, monkeypatch: pytest.MonkeyPatch,
+    ) -> None:
+        """When approach_b wins, should proceed to integrate."""
+        force_escalate(plan_fixture)
+        original_mock = megaplan.workers.mock_worker_output
+
+        def mock_approach_b_wins(step: str, state: dict, plan_dir: Path):
+            if step == "test-both":
+                payload = {
+                    "approach_a": {
+                        "label": "Current plan",
+                        "build_pass": True, "test_pass": False,
+                        "issues": ["Fails existing tests."],
+                        "evidence": "Test suite regression.",
+                    },
+                    "approach_b": {
+                        "label": "Alternative",
+                        "build_pass": True, "test_pass": True,
+                        "issues": [], "evidence": "Clean build and tests.",
+                    },
+                    "verdict": "approach_b",
+                    "verdict_rationale": "Alternative passes all tests; current plan regresses.",
+                }
+                return megaplan.workers.WorkerResult(
+                    payload=payload, raw_output=json.dumps(payload),
+                    duration_ms=10, cost_usd=0.0, session_id="test",
+                )
+            return original_mock(step, state, plan_dir)
+
+        monkeypatch.setattr(megaplan.workers, "mock_worker_output", mock_approach_b_wins)
+
+        args = plan_fixture["make_args"](
+            plan="test-plan", override_action="test-both", reason="test",
+        )
+        args._test_both_root = plan_fixture["root"]
+        result = megaplan.handle_override(plan_fixture["root"], args)
+
+        state = load_state(plan_fixture["plan_dir"])
+        assert state["last_evaluation"]["recommendation"] == "CONTINUE"
+        assert result["next_step"] == "integrate"
+
+
+class TestTestBothSchema:
+    def test_schema_exists(self) -> None:
+        """test-both.json schema should be registered."""
+        from megaplan.schemas import SCHEMAS
+        assert "test-both.json" in SCHEMAS
+
+    def test_schema_required_fields(self) -> None:
+        from megaplan.schemas import SCHEMAS
+        schema = SCHEMAS["test-both.json"]
+        assert "approach_a" in schema["properties"]
+        assert "approach_b" in schema["properties"]
+        assert "verdict" in schema["properties"]
+        assert "verdict_rationale" in schema["properties"]
+        assert "verdict" in schema["required"]
+
+    def test_verdict_enum(self) -> None:
+        from megaplan.schemas import SCHEMAS
+        schema = SCHEMAS["test-both.json"]
+        assert schema["properties"]["verdict"]["enum"] == [
+            "approach_a", "approach_b", "synthesis",
+        ]
+
+
+class TestTestBothInferNextSteps:
+    def test_escalate_includes_test_both(self, plan_fixture: dict) -> None:
+        """infer_next_steps should include test-both for ESCALATE."""
+        state = load_state(plan_fixture["plan_dir"])
+        state["current_state"] = megaplan.STATE_EVALUATED
+        state["last_evaluation"] = {"recommendation": "ESCALATE"}
+        next_steps = megaplan.infer_next_steps(state)
+        assert "override test-both" in next_steps
+
+    def test_abort_includes_test_both(self, plan_fixture: dict) -> None:
+        """infer_next_steps should include test-both for ABORT."""
+        state = load_state(plan_fixture["plan_dir"])
+        state["current_state"] = megaplan.STATE_EVALUATED
+        state["last_evaluation"] = {"recommendation": "ABORT"}
+        next_steps = megaplan.infer_next_steps(state)
+        assert "override test-both" in next_steps
+
+    def test_skip_does_not_include_test_both(self, plan_fixture: dict) -> None:
+        """infer_next_steps should NOT include test-both for SKIP."""
+        state = load_state(plan_fixture["plan_dir"])
+        state["current_state"] = megaplan.STATE_EVALUATED
+        state["last_evaluation"] = {"recommendation": "SKIP"}
+        next_steps = megaplan.infer_next_steps(state)
+        assert "override test-both" not in next_steps
+
+
+class TestTestBothMock:
+    def test_mock_returns_valid_payload(self) -> None:
+        """Mock worker should return a valid test-both payload."""
+        from megaplan.workers import mock_worker_output, WorkerResult
+        state = {
+            "idea": "test idea",
+            "config": {"project_dir": "/tmp/test"},
+            "iteration": 1,
+            "meta": {},
+            "plan_versions": [{"version": 1, "file": "plan_v1.md"}],
+        }
+        result = mock_worker_output("test-both", state, Path("/tmp"))
+        assert isinstance(result, WorkerResult)
+        assert "approach_a" in result.payload
+        assert "approach_b" in result.payload
+        assert result.payload["verdict"] in {"approach_a", "approach_b", "synthesis"}
+        assert "verdict_rationale" in result.payload