From 043c7d81ba41c956ad37cd5a483613a53ea5dcee Mon Sep 17 00:00:00 2001 From: Clayton Thompson Date: Sat, 21 Mar 2026 21:41:45 +0300 Subject: [PATCH] feat: add test-both override for empirical deadlock resolution MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit When the critique loop stagnates (ESCALATE), the only options today are add-note, force-proceed, or abort — all of which punt the decision to the human without evidence. This adds a test-both override that invokes a judge agent to evaluate the current plan against an alternative approach, then renders a verdict (approach_a, approach_b, or synthesis) based on empirical assessment. Changes: - New test-both.json schema for structured judge output - Judge prompt in prompts.py that evaluates both approaches against unresolved flags - _override_test_both handler in cli.py with full state machine integration - Mock worker for test-both in workers.py - Default agent routing (claude) in _core.py - Updated infer_next_steps to surface test-both for ESCALATE/ABORT - Documentation in instructions.md - 15 new tests covering all verdict paths, state transitions, and schema Co-Authored-By: Claude Opus 4.6 (1M context) --- megaplan/_core.py | 1 + megaplan/cli.py | 85 ++++++++- megaplan/data/instructions.md | 17 ++ megaplan/prompts.py | 58 ++++++ megaplan/schemas.py | 34 ++++ megaplan/workers.py | 27 +++ tests/test_megaplan.py | 4 +- tests/test_schemas.py | 2 +- tests/test_test_both.py | 326 ++++++++++++++++++++++++++++++++++ 9 files changed, 549 insertions(+), 5 deletions(-) create mode 100644 tests/test_test_both.py diff --git a/megaplan/_core.py b/megaplan/_core.py index 553f079..9e140ec 100644 --- a/megaplan/_core.py +++ b/megaplan/_core.py @@ -210,6 +210,7 @@ class StepResponse(TypedDict, total=False): "integrate": "claude", "execute": "codex", "review": "codex", + "test-both": "claude", } KNOWN_AGENTS = ["claude", "codex"] ROBUSTNESS_LEVELS = ("light", "standard", "thorough") diff 
--git a/megaplan/cli.py b/megaplan/cli.py index 4e8de1f..f2736b4 100755 --- a/megaplan/cli.py +++ b/megaplan/cli.py @@ -226,7 +226,7 @@ def infer_next_steps(state: PlanState) -> list[str]: if recommendation in {"SKIP", "CONTINUE"}: valid.append("gate") if recommendation in {"ESCALATE", "ABORT"}: - valid.extend(["override add-note", "override force-proceed", "override abort"]) + valid.extend(["override test-both", "override add-note", "override force-proceed", "override abort"]) return valid or ["override add-note", "override abort"] if current == STATE_GATED: return ["execute"] @@ -1008,11 +1008,92 @@ def _override_skip(plan_dir: Path, state: PlanState, args: argparse.Namespace) - } +def _override_test_both(plan_dir: Path, state: PlanState, args: argparse.Namespace) -> StepResponse: + if state["current_state"] != STATE_EVALUATED: + raise CliError( + "invalid_transition", + "test-both is only supported from evaluated state", + valid_next=infer_next_steps(state), + ) + recommendation = state["last_evaluation"].get("recommendation") + if recommendation not in {"ESCALATE", "ABORT"}: + raise CliError( + "invalid_transition", + f"test-both requires an ESCALATE or ABORT evaluation, got {recommendation!r}", + valid_next=infer_next_steps(state), + ) + root = args._test_both_root if hasattr(args, "_test_both_root") else Path.cwd() + try: + worker, agent, mode, refreshed = run_step_with_worker("test-both", state, plan_dir, args, root=root) + except CliError as error: + record_step_failure(plan_dir, state, step="test-both", iteration=state["iteration"], error=error) + raise + test_both_filename = "test-both.json" + atomic_write_json(plan_dir / test_both_filename, worker.payload) + verdict = worker.payload["verdict"] + rationale = worker.payload["verdict_rationale"] + apply_session_update(state, "test-both", agent, worker.session_id, mode=mode, refreshed=refreshed) + append_history( + state, + make_history_entry( + "test-both", + duration_ms=worker.duration_ms, + 
cost_usd=worker.cost_usd, + result="success", + worker=worker, + agent=agent, + mode=mode, + output_file=test_both_filename, + artifact_hash=sha256_file(plan_dir / test_both_filename), + recommendation=verdict, + ), + ) + if verdict == "approach_a": + # Current plan wins — proceed to gate + gate = run_gate_checks(plan_dir, state) + atomic_write_json(plan_dir / "gate.json", gate) + if gate["passed"]: + final_plan = latest_plan_path(plan_dir, state).read_text(encoding="utf-8") + atomic_write_text(plan_dir / "final.md", final_plan) + state["current_state"] = STATE_GATED + next_step = "execute" if gate["passed"] else "integrate" + elif verdict == "approach_b": + # Alternative wins — need to integrate the alternative into the plan + next_step = "integrate" + else: + # Synthesis — need to integrate the synthesis + next_step = "integrate" + evaluation = copy.deepcopy(state["last_evaluation"]) + evaluation["recommendation"] = "SKIP" if verdict == "approach_a" else "CONTINUE" + state["last_evaluation"] = evaluation + _append_to_meta(state, "overrides", { + "action": "test-both", + "timestamp": now_utc(), + "verdict": verdict, + "rationale": rationale, + "reason": args.reason, + }) + save_state(plan_dir, state) + response: StepResponse = { + "success": True, + "step": "override", + "summary": f"Test-both complete. Verdict: {verdict}. 
{rationale}", + "artifacts": [test_both_filename], + "next_step": next_step, + "state": state["current_state"], + } + if verdict == "synthesis" and worker.payload.get("synthesis_description"): + response["message"] = worker.payload["synthesis_description"] + attach_agent_fallback(response, args) + return response + + _OVERRIDE_ACTIONS: dict[str, Callable[[Path, PlanState, argparse.Namespace], StepResponse]] = { "add-note": _override_add_note, "abort": _override_abort, "force-proceed": _override_force_proceed, "skip": _override_skip, + "test-both": _override_test_both, } @@ -1292,7 +1373,7 @@ def build_parser() -> argparse.ArgumentParser: config_sub.add_parser("reset") override_parser = subparsers.add_parser("override") - override_parser.add_argument("override_action", choices=["skip", "abort", "force-proceed", "add-note"]) + override_parser.add_argument("override_action", choices=["skip", "abort", "force-proceed", "add-note", "test-both"]) override_parser.add_argument("--plan") override_parser.add_argument("--reason", default="") override_parser.add_argument("--note") diff --git a/megaplan/data/instructions.md b/megaplan/data/instructions.md index cd1d329..631f0f7 100644 --- a/megaplan/data/instructions.md +++ b/megaplan/data/instructions.md @@ -83,6 +83,22 @@ Auto-force-proceed (and tell the user why) when: - `suggested_override` is `"force-proceed"`, OR - Robustness is `light` and `weighted_score` < 4.0 +When the critique loop has stagnated (recurring critiques or score not improving), +consider using `test-both` to break the deadlock empirically: + +```bash +megaplan override test-both --plan --reason "critique loop stagnated" +``` + +This invokes a judge agent that evaluates both the current plan and an alternative +approach against the unresolved flags, then renders a verdict (approach_a, approach_b, +or synthesis). 
The verdict determines the next step: +- `approach_a` (current plan wins) → runs the gate checks immediately; the next step is `execute` if they pass, otherwise `integrate` +- `approach_b` or `synthesis` → proceeds to integrate with the judge's recommendations + +Use `test-both` when the same concerns keep recurring across iterations and neither +force-proceed nor add-note is resolving the impasse. It is only accepted from the evaluated state when the recommendation is ESCALATE or ABORT. + Otherwise, present the evaluation details and ask the user what to do. ## Minor Megaplan Edits @@ -114,5 +130,6 @@ megaplan status --plan megaplan audit --plan megaplan list megaplan override add-note --plan --note "user context" +megaplan override test-both --plan --reason "critique loop stagnated" megaplan override abort --plan ``` diff --git a/megaplan/prompts.py b/megaplan/prompts.py index 9cc9968..4251b80 100644 --- a/megaplan/prompts.py +++ b/megaplan/prompts.py @@ -249,6 +249,62 @@ def _execute_prompt(state: PlanState, plan_dir: Path) -> str: ).strip() +def _test_both_prompt(state: PlanState, plan_dir: Path) -> str: + project_dir = Path(state["config"]["project_dir"]) + latest_plan = latest_plan_path(plan_dir, state).read_text(encoding="utf-8") + latest_meta = read_json(latest_plan_meta_path(plan_dir, state)) + flag_registry = load_flag_registry(plan_dir) + unresolved = unresolved_significant_flags(flag_registry) + open_flags = [ + { + "id": flag["id"], + "severity": flag.get("severity"), + "concern": flag.get("concern"), + "evidence": flag.get("evidence"), + } + for flag in unresolved + ] + return textwrap.dedent( + f""" + You are a neutral judge resolving a deadlock between a planner and a critic. + The critique loop has stagnated — the same concerns keep recurring despite + revisions. Your job is to test both the current plan AND an alternative + approach, then rule based on evidence. 
+ + Project directory: + {project_dir} + + {intent_and_notes_block(state)} + + Current plan (Approach A): + {latest_plan} + + Plan metadata: + {json_dump(latest_meta).strip()} + + Unresolved flags from the critic (the concerns driving the deadlock): + {json_dump(open_flags).strip()} + + Requirements: + - Inspect the actual repository before judging. + - Evaluate Approach A (the current plan) against the unresolved flags. + For each flag, determine: does the plan actually have this problem, + or is the critic being overly cautious? + - Propose Approach B: an alternative that addresses the unresolved flags + differently. This could be a modified version of the plan, a simpler + approach, or a fundamentally different strategy. + - For BOTH approaches, assess: + 1. Would it build and pass existing tests? (build_pass, test_pass) + 2. What concrete issues would it cause? (issues) + 3. What evidence supports your assessment? (evidence) + - Render a verdict: approach_a, approach_b, or synthesis. + - If synthesis, describe what to take from each approach. + - Judge based on correctness and practicality, not elegance. + - An approach that would fail to build loses automatically. 
+ """ + ).strip() + + def _review_claude_prompt(state: PlanState, plan_dir: Path) -> str: project_dir = Path(state["config"]["project_dir"]) latest_plan = latest_plan_path(plan_dir, state).read_text(encoding="utf-8") @@ -332,6 +388,7 @@ def _review_codex_prompt(state: PlanState, plan_dir: Path) -> str: "critique": _critique_prompt, "execute": _execute_prompt, "review": _review_claude_prompt, + "test-both": _test_both_prompt, } _CODEX_PROMPT_BUILDERS: dict[str, Any] = { @@ -341,6 +398,7 @@ def _review_codex_prompt(state: PlanState, plan_dir: Path) -> str: "critique": _critique_prompt, "execute": _execute_prompt, "review": _review_codex_prompt, + "test-both": _test_both_prompt, } diff --git a/megaplan/schemas.py b/megaplan/schemas.py index 1bd23b1..b636a34 100644 --- a/megaplan/schemas.py +++ b/megaplan/schemas.py @@ -112,6 +112,40 @@ }, "required": ["criteria", "issues"], }, + "test-both.json": { + "type": "object", + "properties": { + "approach_a": { + "type": "object", + "properties": { + "label": {"type": "string"}, + "build_pass": {"type": "boolean"}, + "test_pass": {"type": "boolean"}, + "issues": {"type": "array", "items": {"type": "string"}}, + "evidence": {"type": "string"}, + }, + "required": ["label", "build_pass", "test_pass", "issues", "evidence"], + }, + "approach_b": { + "type": "object", + "properties": { + "label": {"type": "string"}, + "build_pass": {"type": "boolean"}, + "test_pass": {"type": "boolean"}, + "issues": {"type": "array", "items": {"type": "string"}}, + "evidence": {"type": "string"}, + }, + "required": ["label", "build_pass", "test_pass", "issues", "evidence"], + }, + "verdict": { + "type": "string", + "enum": ["approach_a", "approach_b", "synthesis"], + }, + "verdict_rationale": {"type": "string"}, + "synthesis_description": {"type": "string"}, + }, + "required": ["approach_a", "approach_b", "verdict", "verdict_rationale"], + }, } diff --git a/megaplan/workers.py b/megaplan/workers.py index 9990f0d..f065f1a 100644 --- 
a/megaplan/workers.py +++ b/megaplan/workers.py @@ -45,6 +45,7 @@ "critique": "critique.json", "execute": "execution.json", "review": "review.json", + "test-both": "test-both.json", } # Derive required keys per step from SCHEMAS so they aren't duplicated. @@ -304,6 +305,29 @@ def _mock_review(state: PlanState, plan_dir: Path) -> WorkerResult: return WorkerResult(payload=payload, raw_output=json_dump(payload), duration_ms=10, cost_usd=0.0, session_id=str(uuid.uuid4())) +def _mock_test_both(state: PlanState, plan_dir: Path) -> WorkerResult: + payload = { + "approach_a": { + "label": "Current plan", + "build_pass": True, + "test_pass": True, + "issues": [], + "evidence": "The current plan builds and passes existing tests.", + }, + "approach_b": { + "label": "Simplified alternative addressing unresolved flags", + "build_pass": True, + "test_pass": True, + "issues": ["Requires minor refactor of existing module structure."], + "evidence": "Alternative approach resolves the flagged concerns with a simpler design.", + }, + "verdict": "synthesis", + "verdict_rationale": "Both approaches build and pass tests. Approach A is more complete but carries the flagged risks. Approach B addresses the flags but introduces a minor refactor. 
A synthesis takes the core structure from A with the risk mitigations from B.", + "synthesis_description": "Keep the current plan structure but incorporate the critic's suggested safeguards for the flagged concerns.", + } + return WorkerResult(payload=payload, raw_output=json_dump(payload), duration_ms=10, cost_usd=0.0, session_id=str(uuid.uuid4())) + + _MOCK_DISPATCH: dict[str, Any] = { "clarify": _mock_clarify, "plan": _mock_plan, @@ -311,6 +335,7 @@ def _mock_review(state: PlanState, plan_dir: Path) -> WorkerResult: "integrate": _mock_integrate, "execute": _mock_execute, "review": _mock_review, + "test-both": _mock_test_both, } @@ -330,6 +355,8 @@ def session_key_for(step: str, agent: str) -> str: return f"{agent}_executor" if step == "review": return f"{agent}_reviewer" + if step == "test-both": + return f"{agent}_judge" return f"{agent}_{step}" diff --git a/tests/test_megaplan.py b/tests/test_megaplan.py index 3db39df..1cea455 100644 --- a/tests/test_megaplan.py +++ b/tests/test_megaplan.py @@ -320,8 +320,8 @@ def test_infer_next_steps_non_evaluated_states(current_state: str, last_evaluati [ ("CONTINUE", ["integrate", "gate"]), ("SKIP", ["gate"]), - ("ESCALATE", ["override add-note", "override force-proceed", "override abort"]), - ("ABORT", ["override add-note", "override force-proceed", "override abort"]), + ("ESCALATE", ["override test-both", "override add-note", "override force-proceed", "override abort"]), + ("ABORT", ["override test-both", "override add-note", "override force-proceed", "override abort"]), (None, ["override add-note", "override abort"]), ], ) diff --git a/tests/test_schemas.py b/tests/test_schemas.py index 4f7fcca..3703e62 100644 --- a/tests/test_schemas.py +++ b/tests/test_schemas.py @@ -94,7 +94,7 @@ def test_list_passthrough(self) -> None: class TestSCHEMAS: def test_schemas_contains_expected_keys(self) -> None: - expected = {"clarify.json", "plan.json", "integrate.json", "critique.json", "execution.json", "review.json"} + expected = 
{"clarify.json", "plan.json", "integrate.json", "critique.json", "execution.json", "review.json", "test-both.json"} assert expected == set(SCHEMAS.keys()) def test_all_schemas_are_objects(self) -> None: diff --git a/tests/test_test_both.py b/tests/test_test_both.py new file mode 100644 index 0000000..c52e078 --- /dev/null +++ b/tests/test_test_both.py @@ -0,0 +1,326 @@ +"""Tests for the test-both override action.""" +from __future__ import annotations + +import json +from argparse import Namespace +from pathlib import Path +from typing import Any + +import pytest + +import megaplan.cli as megaplan +import megaplan.cli +import megaplan.workers +from megaplan._core import ( + MOCK_ENV_VAR, + read_json, + atomic_write_json, +) + + +# --------------------------------------------------------------------------- +# Fixtures (matching test_megaplan.py patterns) +# --------------------------------------------------------------------------- + +def make_args_factory(project_dir: Path): + def make_args(**overrides) -> Namespace: + data = { + "plan": None, "idea": "test idea", "name": "test-plan", + "project_dir": str(project_dir), "max_iterations": 3, + "budget_usd": 25.0, "auto_approve": False, "robustness": "standard", + "agent": None, "ephemeral": False, "fresh": False, "persist": False, + "confirm_destructive": True, "user_approved": False, + "confirm_self_review": False, + "override_action": None, "note": None, "reason": "", + } + data.update(overrides) + return Namespace(**data) + return make_args + + +def load_state(plan_dir: Path) -> dict: + return read_json(plan_dir / "state.json") + + +@pytest.fixture() +def plan_fixture(tmp_path: Path, monkeypatch: pytest.MonkeyPatch): + root = tmp_path / "root" + project_dir = tmp_path / "project" + project_dir.mkdir() + (project_dir / ".git").mkdir() + monkeypatch.setenv(MOCK_ENV_VAR, "1") + monkeypatch.setattr( + megaplan.cli.shutil, "which", + lambda name: "/usr/bin/mock" if name in {"claude", "codex"} else None, + ) + + 
make_args = make_args_factory(project_dir) + init_args = make_args(idea="test idea", name="test-plan") + megaplan.handle_init(root, init_args) + + plan_dir = root / ".megaplan" / "plans" / "test-plan" + return { + "root": root, + "project_dir": project_dir, + "plan_dir": plan_dir, + "make_args": make_args, + } + + +def advance_to_evaluated(fx: dict) -> None: + """Advance plan through plan → critique → evaluate.""" + args = fx["make_args"](plan="test-plan") + megaplan.handle_plan(fx["root"], args) + megaplan.handle_critique(fx["root"], args) + megaplan.handle_evaluate(fx["root"], args) + + +def force_escalate(fx: dict) -> None: + """Advance to evaluated, then mutate evaluation to ESCALATE.""" + advance_to_evaluated(fx) + state = load_state(fx["plan_dir"]) + state["last_evaluation"]["recommendation"] = "ESCALATE" + state["last_evaluation"]["valid_next_steps"] = [ + "override test-both", "override add-note", + "override force-proceed", "override abort", + ] + atomic_write_json(fx["plan_dir"] / "state.json", state) + + +# --------------------------------------------------------------------------- +# Tests +# --------------------------------------------------------------------------- + +class TestTestBothOverride: + def test_test_both_requires_evaluated_state(self, plan_fixture: dict) -> None: + """test-both should fail if not in evaluated state.""" + args = plan_fixture["make_args"]( + plan="test-plan", override_action="test-both", reason="test", + ) + with pytest.raises(megaplan.CliError) as exc_info: + megaplan.handle_override(plan_fixture["root"], args) + assert exc_info.value.code == "invalid_transition" + + def test_test_both_requires_escalate_recommendation(self, plan_fixture: dict) -> None: + """test-both should fail if evaluation is SKIP or CONTINUE.""" + advance_to_evaluated(plan_fixture) + args = plan_fixture["make_args"]( + plan="test-plan", override_action="test-both", reason="test", + ) + args._test_both_root = plan_fixture["root"] + with 
pytest.raises(megaplan.CliError) as exc_info: + megaplan.handle_override(plan_fixture["root"], args) + assert exc_info.value.code == "invalid_transition" + + def test_test_both_success(self, plan_fixture: dict) -> None: + """test-both should succeed from ESCALATE state and write artifacts.""" + force_escalate(plan_fixture) + args = plan_fixture["make_args"]( + plan="test-plan", override_action="test-both", reason="critique stagnated", + ) + args._test_both_root = plan_fixture["root"] + result = megaplan.handle_override(plan_fixture["root"], args) + + assert result["success"] is True + assert result["step"] == "override" + assert "test-both.json" in result["artifacts"] + assert (plan_fixture["plan_dir"] / "test-both.json").exists() + + def test_test_both_writes_verdict_to_state(self, plan_fixture: dict) -> None: + """test-both should record the verdict in meta.overrides.""" + force_escalate(plan_fixture) + args = plan_fixture["make_args"]( + plan="test-plan", override_action="test-both", reason="deadlock", + ) + args._test_both_root = plan_fixture["root"] + megaplan.handle_override(plan_fixture["root"], args) + + state = load_state(plan_fixture["plan_dir"]) + last_override = state["meta"]["overrides"][-1] + assert last_override["action"] == "test-both" + assert last_override["verdict"] in {"approach_a", "approach_b", "synthesis"} + assert last_override["reason"] == "deadlock" + + def test_test_both_records_history(self, plan_fixture: dict) -> None: + """test-both should append a history entry.""" + force_escalate(plan_fixture) + args = plan_fixture["make_args"]( + plan="test-plan", override_action="test-both", reason="test", + ) + args._test_both_root = plan_fixture["root"] + megaplan.handle_override(plan_fixture["root"], args) + + state = load_state(plan_fixture["plan_dir"]) + test_both_entries = [h for h in state["history"] if h["step"] == "test-both"] + assert len(test_both_entries) == 1 + assert test_both_entries[0]["result"] == "success" + assert 
test_both_entries[0]["output_file"] == "test-both.json" + + def test_test_both_synthesis_sets_continue(self, plan_fixture: dict) -> None: + """Mock returns synthesis verdict — evaluation should be set to CONTINUE.""" + force_escalate(plan_fixture) + args = plan_fixture["make_args"]( + plan="test-plan", override_action="test-both", reason="test", + ) + args._test_both_root = plan_fixture["root"] + result = megaplan.handle_override(plan_fixture["root"], args) + + # Default mock returns "synthesis" verdict + state = load_state(plan_fixture["plan_dir"]) + assert state["last_evaluation"]["recommendation"] == "CONTINUE" + assert result["next_step"] == "integrate" + + def test_test_both_approach_a_wins( + self, plan_fixture: dict, monkeypatch: pytest.MonkeyPatch, + ) -> None: + """When approach_a wins, should proceed toward gate.""" + force_escalate(plan_fixture) + original_mock = megaplan.workers.mock_worker_output + + def mock_approach_a_wins(step: str, state: dict, plan_dir: Path): + if step == "test-both": + payload = { + "approach_a": { + "label": "Current plan", + "build_pass": True, "test_pass": True, + "issues": [], "evidence": "Plan is solid.", + }, + "approach_b": { + "label": "Alternative", + "build_pass": False, "test_pass": False, + "issues": ["Fails to build."], + "evidence": "Alternative has compilation errors.", + }, + "verdict": "approach_a", + "verdict_rationale": "Current plan builds; alternative does not.", + } + return megaplan.workers.WorkerResult( + payload=payload, raw_output=json.dumps(payload), + duration_ms=10, cost_usd=0.0, session_id="test", + ) + return original_mock(step, state, plan_dir) + + monkeypatch.setattr(megaplan.workers, "mock_worker_output", mock_approach_a_wins) + + args = plan_fixture["make_args"]( + plan="test-plan", override_action="test-both", reason="test", + ) + args._test_both_root = plan_fixture["root"] + result = megaplan.handle_override(plan_fixture["root"], args) + + state = load_state(plan_fixture["plan_dir"]) + assert 
state["last_evaluation"]["recommendation"] == "SKIP" + # Gate may not pass if unresolved flags exist from the escalation setup. + # The important thing is the verdict was recorded and evaluation set to SKIP. + assert (plan_fixture["plan_dir"] / "gate.json").exists() + assert result["next_step"] in {"execute", "integrate"} + + def test_test_both_approach_b_wins( + self, plan_fixture: dict, monkeypatch: pytest.MonkeyPatch, + ) -> None: + """When approach_b wins, should proceed to integrate.""" + force_escalate(plan_fixture) + original_mock = megaplan.workers.mock_worker_output + + def mock_approach_b_wins(step: str, state: dict, plan_dir: Path): + if step == "test-both": + payload = { + "approach_a": { + "label": "Current plan", + "build_pass": True, "test_pass": False, + "issues": ["Fails existing tests."], + "evidence": "Test suite regression.", + }, + "approach_b": { + "label": "Alternative", + "build_pass": True, "test_pass": True, + "issues": [], "evidence": "Clean build and tests.", + }, + "verdict": "approach_b", + "verdict_rationale": "Alternative passes all tests; current plan regresses.", + } + return megaplan.workers.WorkerResult( + payload=payload, raw_output=json.dumps(payload), + duration_ms=10, cost_usd=0.0, session_id="test", + ) + return original_mock(step, state, plan_dir) + + monkeypatch.setattr(megaplan.workers, "mock_worker_output", mock_approach_b_wins) + + args = plan_fixture["make_args"]( + plan="test-plan", override_action="test-both", reason="test", + ) + args._test_both_root = plan_fixture["root"] + result = megaplan.handle_override(plan_fixture["root"], args) + + state = load_state(plan_fixture["plan_dir"]) + assert state["last_evaluation"]["recommendation"] == "CONTINUE" + assert result["next_step"] == "integrate" + + +class TestTestBothSchema: + def test_schema_exists(self) -> None: + """test-both.json schema should be registered.""" + from megaplan.schemas import SCHEMAS + assert "test-both.json" in SCHEMAS + + def 
test_schema_required_fields(self) -> None: + from megaplan.schemas import SCHEMAS + schema = SCHEMAS["test-both.json"] + assert "approach_a" in schema["properties"] + assert "approach_b" in schema["properties"] + assert "verdict" in schema["properties"] + assert "verdict_rationale" in schema["properties"] + assert "verdict" in schema["required"] + + def test_verdict_enum(self) -> None: + from megaplan.schemas import SCHEMAS + schema = SCHEMAS["test-both.json"] + assert schema["properties"]["verdict"]["enum"] == [ + "approach_a", "approach_b", "synthesis", + ] + + +class TestTestBothInferNextSteps: + def test_escalate_includes_test_both(self, plan_fixture: dict) -> None: + """infer_next_steps should include test-both for ESCALATE.""" + state = load_state(plan_fixture["plan_dir"]) + state["current_state"] = megaplan.STATE_EVALUATED + state["last_evaluation"] = {"recommendation": "ESCALATE"} + next_steps = megaplan.infer_next_steps(state) + assert "override test-both" in next_steps + + def test_abort_includes_test_both(self, plan_fixture: dict) -> None: + """infer_next_steps should include test-both for ABORT.""" + state = load_state(plan_fixture["plan_dir"]) + state["current_state"] = megaplan.STATE_EVALUATED + state["last_evaluation"] = {"recommendation": "ABORT"} + next_steps = megaplan.infer_next_steps(state) + assert "override test-both" in next_steps + + def test_skip_does_not_include_test_both(self, plan_fixture: dict) -> None: + """infer_next_steps should NOT include test-both for SKIP.""" + state = load_state(plan_fixture["plan_dir"]) + state["current_state"] = megaplan.STATE_EVALUATED + state["last_evaluation"] = {"recommendation": "SKIP"} + next_steps = megaplan.infer_next_steps(state) + assert "override test-both" not in next_steps + + +class TestTestBothMock: + def test_mock_returns_valid_payload(self) -> None: + """Mock worker should return a valid test-both payload.""" + from megaplan.workers import mock_worker_output, WorkerResult + state = { + "idea": 
"test idea", + "config": {"project_dir": "/tmp/test"}, + "iteration": 1, + "meta": {}, + "plan_versions": [{"version": 1, "file": "plan_v1.md"}], + } + result = mock_worker_output("test-both", state, Path("/tmp")) + assert isinstance(result, WorkerResult) + assert "approach_a" in result.payload + assert "approach_b" in result.payload + assert result.payload["verdict"] in {"approach_a", "approach_b", "synthesis"} + assert "verdict_rationale" in result.payload