Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
1 change: 1 addition & 0 deletions megaplan/_core.py
Original file line number Diff line number Diff line change
Expand Up @@ -210,6 +210,7 @@ class StepResponse(TypedDict, total=False):
"integrate": "claude",
"execute": "codex",
"review": "codex",
"test-both": "claude",
}
KNOWN_AGENTS = ["claude", "codex"]
ROBUSTNESS_LEVELS = ("light", "standard", "thorough")
Expand Down
85 changes: 83 additions & 2 deletions megaplan/cli.py
Original file line number Diff line number Diff line change
Expand Up @@ -226,7 +226,7 @@ def infer_next_steps(state: PlanState) -> list[str]:
if recommendation in {"SKIP", "CONTINUE"}:
valid.append("gate")
if recommendation in {"ESCALATE", "ABORT"}:
valid.extend(["override add-note", "override force-proceed", "override abort"])
valid.extend(["override test-both", "override add-note", "override force-proceed", "override abort"])
return valid or ["override add-note", "override abort"]
if current == STATE_GATED:
return ["execute"]
Expand Down Expand Up @@ -1008,11 +1008,92 @@ def _override_skip(plan_dir: Path, state: PlanState, args: argparse.Namespace) -
}


def _override_test_both(plan_dir: Path, state: PlanState, args: argparse.Namespace) -> StepResponse:
    """Break a critique deadlock by having a judge agent test both approaches.

    Only valid from the evaluated state when the last evaluation recommended
    ESCALATE or ABORT.  Runs the ``test-both`` worker, persists the judge's
    payload to ``test-both.json``, records history and the override in meta,
    then downgrades the stored evaluation recommendation so the normal state
    machine can continue:

    * ``approach_a`` — current plan wins; gate checks run immediately and the
      evaluation is downgraded to SKIP.
    * ``approach_b`` / ``synthesis`` — the judge's alternative must be folded
      into the plan, so the evaluation is downgraded to CONTINUE and the next
      step is ``integrate``.

    Raises:
        CliError: when invoked from the wrong state or with the wrong
            evaluation recommendation, or when the worker fails (the failure
            is recorded via record_step_failure before re-raising).
    """
    if state["current_state"] != STATE_EVALUATED:
        raise CliError(
            "invalid_transition",
            "test-both is only supported from evaluated state",
            valid_next=infer_next_steps(state),
        )
    recommendation = state["last_evaluation"].get("recommendation")
    if recommendation not in {"ESCALATE", "ABORT"}:
        raise CliError(
            "invalid_transition",
            f"test-both requires an ESCALATE or ABORT evaluation, got {recommendation!r}",
            valid_next=infer_next_steps(state),
        )
    # Tests may inject a fake repository root via the private attribute;
    # getattr with a default replaces the hasattr/ternary dance.
    root = getattr(args, "_test_both_root", Path.cwd())
    try:
        worker, agent, mode, refreshed = run_step_with_worker("test-both", state, plan_dir, args, root=root)
    except CliError as error:
        record_step_failure(plan_dir, state, step="test-both", iteration=state["iteration"], error=error)
        raise
    test_both_filename = "test-both.json"
    atomic_write_json(plan_dir / test_both_filename, worker.payload)
    verdict = worker.payload["verdict"]
    rationale = worker.payload["verdict_rationale"]
    apply_session_update(state, "test-both", agent, worker.session_id, mode=mode, refreshed=refreshed)
    append_history(
        state,
        make_history_entry(
            "test-both",
            duration_ms=worker.duration_ms,
            cost_usd=worker.cost_usd,
            result="success",
            worker=worker,
            agent=agent,
            mode=mode,
            output_file=test_both_filename,
            artifact_hash=sha256_file(plan_dir / test_both_filename),
            recommendation=verdict,
        ),
    )
    if verdict == "approach_a":
        # Current plan wins — run gate checks right away rather than
        # requiring a separate `gate` invocation.
        gate = run_gate_checks(plan_dir, state)
        atomic_write_json(plan_dir / "gate.json", gate)
        if gate["passed"]:
            final_plan = latest_plan_path(plan_dir, state).read_text(encoding="utf-8")
            atomic_write_text(plan_dir / "final.md", final_plan)
            state["current_state"] = STATE_GATED
            next_step = "execute"
        else:
            # NOTE(review): on gate failure the evaluation below is still
            # downgraded to SKIP while next_step is "integrate" — confirm
            # infer_next_steps permits integrate after a SKIP recommendation.
            next_step = "integrate"
    else:
        # approach_b or synthesis — the judge's output must be integrated
        # into the plan before proceeding.
        next_step = "integrate"
    evaluation = copy.deepcopy(state["last_evaluation"])
    evaluation["recommendation"] = "SKIP" if verdict == "approach_a" else "CONTINUE"
    state["last_evaluation"] = evaluation
    _append_to_meta(state, "overrides", {
        "action": "test-both",
        "timestamp": now_utc(),
        "verdict": verdict,
        "rationale": rationale,
        "reason": args.reason,
    })
    save_state(plan_dir, state)
    response: StepResponse = {
        "success": True,
        "step": "override",
        "summary": f"Test-both complete. Verdict: {verdict}. {rationale}",
        "artifacts": [test_both_filename],
        "next_step": next_step,
        "state": state["current_state"],
    }
    if verdict == "synthesis" and worker.payload.get("synthesis_description"):
        response["message"] = worker.payload["synthesis_description"]
    attach_agent_fallback(response, args)
    return response


# Dispatch table mapping each `megaplan override <action>` CLI choice to its
# handler.  Keys must stay in sync with the `override_action` choices declared
# in build_parser(); each handler takes (plan_dir, state, args) and returns a
# StepResponse.
_OVERRIDE_ACTIONS: dict[str, Callable[[Path, PlanState, argparse.Namespace], StepResponse]] = {
    "add-note": _override_add_note,
    "abort": _override_abort,
    "force-proceed": _override_force_proceed,
    "skip": _override_skip,
    "test-both": _override_test_both,
}


Expand Down Expand Up @@ -1292,7 +1373,7 @@ def build_parser() -> argparse.ArgumentParser:
config_sub.add_parser("reset")

override_parser = subparsers.add_parser("override")
override_parser.add_argument("override_action", choices=["skip", "abort", "force-proceed", "add-note"])
override_parser.add_argument("override_action", choices=["skip", "abort", "force-proceed", "add-note", "test-both"])
override_parser.add_argument("--plan")
override_parser.add_argument("--reason", default="")
override_parser.add_argument("--note")
Expand Down
17 changes: 17 additions & 0 deletions megaplan/data/instructions.md
Original file line number Diff line number Diff line change
Expand Up @@ -83,6 +83,22 @@ Auto-force-proceed (and tell the user why) when:
- `suggested_override` is `"force-proceed"`, OR
- Robustness is `light` and `weighted_score` < 4.0

When the critique loop has stagnated (recurring critiques or score not improving),
consider using `test-both` to break the deadlock empirically:

```bash
megaplan override test-both --plan <name> --reason "critique loop stagnated"
```

This invokes a judge agent that evaluates the current plan (Approach A) and also
proposes and evaluates an alternative of its own (Approach B) against the unresolved
flags, then renders a verdict (`approach_a`, `approach_b`, or `synthesis`). The
verdict determines the next step:
- `approach_a` (current plan wins) → proceeds directly to gate checks
- `approach_b` or `synthesis` → proceeds to integrate, carrying the judge's recommendations

Use `test-both` when the same concerns keep recurring across iterations and neither
force-proceed nor add-note is resolving the impasse.

Otherwise, present the evaluation details and ask the user what to do.

## Minor Megaplan Edits
Expand Down Expand Up @@ -114,5 +130,6 @@ megaplan status --plan <name>
megaplan audit --plan <name>
megaplan list
megaplan override add-note --plan <name> --note "user context"
megaplan override test-both --plan <name> --reason "critique loop stagnated"
megaplan override abort --plan <name>
```
58 changes: 58 additions & 0 deletions megaplan/prompts.py
Original file line number Diff line number Diff line change
Expand Up @@ -249,6 +249,62 @@ def _execute_prompt(state: PlanState, plan_dir: Path) -> str:
).strip()


def _test_both_prompt(state: PlanState, plan_dir: Path) -> str:
    """Build the neutral-judge prompt for the test-both deadlock-breaking step."""
    project_dir = Path(state["config"]["project_dir"])
    latest_plan = latest_plan_path(plan_dir, state).read_text(encoding="utf-8")
    latest_meta = read_json(latest_plan_meta_path(plan_dir, state))
    # Pull only the fields the judge needs from each unresolved flag.
    open_flags = []
    for flag in unresolved_significant_flags(load_flag_registry(plan_dir)):
        open_flags.append(
            {
                "id": flag["id"],
                "severity": flag.get("severity"),
                "concern": flag.get("concern"),
                "evidence": flag.get("evidence"),
            }
        )
    prompt = textwrap.dedent(
        f"""
        You are a neutral judge resolving a deadlock between a planner and a critic.
        The critique loop has stagnated — the same concerns keep recurring despite
        revisions. Your job is to test both the current plan AND an alternative
        approach, then rule based on evidence.

        Project directory:
        {project_dir}

        {intent_and_notes_block(state)}

        Current plan (Approach A):
        {latest_plan}

        Plan metadata:
        {json_dump(latest_meta).strip()}

        Unresolved flags from the critic (the concerns driving the deadlock):
        {json_dump(open_flags).strip()}

        Requirements:
        - Inspect the actual repository before judging.
        - Evaluate Approach A (the current plan) against the unresolved flags.
          For each flag, determine: does the plan actually have this problem,
          or is the critic being overly cautious?
        - Propose Approach B: an alternative that addresses the unresolved flags
          differently. This could be a modified version of the plan, a simpler
          approach, or a fundamentally different strategy.
        - For BOTH approaches, assess:
          1. Would it build and pass existing tests? (build_pass, test_pass)
          2. What concrete issues would it cause? (issues)
          3. What evidence supports your assessment? (evidence)
        - Render a verdict: approach_a, approach_b, or synthesis.
        - If synthesis, describe what to take from each approach.
        - Judge based on correctness and practicality, not elegance.
        - An approach that would fail to build loses automatically.
        """
    )
    return prompt.strip()


def _review_claude_prompt(state: PlanState, plan_dir: Path) -> str:
project_dir = Path(state["config"]["project_dir"])
latest_plan = latest_plan_path(plan_dir, state).read_text(encoding="utf-8")
Expand Down Expand Up @@ -332,6 +388,7 @@ def _review_codex_prompt(state: PlanState, plan_dir: Path) -> str:
"critique": _critique_prompt,
"execute": _execute_prompt,
"review": _review_claude_prompt,
"test-both": _test_both_prompt,
}

_CODEX_PROMPT_BUILDERS: dict[str, Any] = {
Expand All @@ -341,6 +398,7 @@ def _review_codex_prompt(state: PlanState, plan_dir: Path) -> str:
"critique": _critique_prompt,
"execute": _execute_prompt,
"review": _review_codex_prompt,
"test-both": _test_both_prompt,
}


Expand Down
34 changes: 34 additions & 0 deletions megaplan/schemas.py
Original file line number Diff line number Diff line change
Expand Up @@ -112,6 +112,40 @@
},
"required": ["criteria", "issues"],
},
"test-both.json": {
"type": "object",
"properties": {
"approach_a": {
"type": "object",
"properties": {
"label": {"type": "string"},
"build_pass": {"type": "boolean"},
"test_pass": {"type": "boolean"},
"issues": {"type": "array", "items": {"type": "string"}},
"evidence": {"type": "string"},
},
"required": ["label", "build_pass", "test_pass", "issues", "evidence"],
},
"approach_b": {
"type": "object",
"properties": {
"label": {"type": "string"},
"build_pass": {"type": "boolean"},
"test_pass": {"type": "boolean"},
"issues": {"type": "array", "items": {"type": "string"}},
"evidence": {"type": "string"},
},
"required": ["label", "build_pass", "test_pass", "issues", "evidence"],
},
"verdict": {
"type": "string",
"enum": ["approach_a", "approach_b", "synthesis"],
},
"verdict_rationale": {"type": "string"},
"synthesis_description": {"type": "string"},
},
"required": ["approach_a", "approach_b", "verdict", "verdict_rationale"],
},
}


Expand Down
27 changes: 27 additions & 0 deletions megaplan/workers.py
Original file line number Diff line number Diff line change
Expand Up @@ -45,6 +45,7 @@
"critique": "critique.json",
"execute": "execution.json",
"review": "review.json",
"test-both": "test-both.json",
}

# Derive required keys per step from SCHEMAS so they aren't duplicated.
Expand Down Expand Up @@ -304,13 +305,37 @@ def _mock_review(state: PlanState, plan_dir: Path) -> WorkerResult:
return WorkerResult(payload=payload, raw_output=json_dump(payload), duration_ms=10, cost_usd=0.0, session_id=str(uuid.uuid4()))


def _mock_test_both(state: PlanState, plan_dir: Path) -> WorkerResult:
    """Deterministic canned judge output for the test-both step in mock mode."""
    approach_a = {
        "label": "Current plan",
        "build_pass": True,
        "test_pass": True,
        "issues": [],
        "evidence": "The current plan builds and passes existing tests.",
    }
    approach_b = {
        "label": "Simplified alternative addressing unresolved flags",
        "build_pass": True,
        "test_pass": True,
        "issues": ["Requires minor refactor of existing module structure."],
        "evidence": "Alternative approach resolves the flagged concerns with a simpler design.",
    }
    payload = {
        "approach_a": approach_a,
        "approach_b": approach_b,
        "verdict": "synthesis",
        "verdict_rationale": "Both approaches build and pass tests. Approach A is more complete but carries the flagged risks. Approach B addresses the flags but introduces a minor refactor. A synthesis takes the core structure from A with the risk mitigations from B.",
        "synthesis_description": "Keep the current plan structure but incorporate the critic's suggested safeguards for the flagged concerns.",
    }
    serialized = json_dump(payload)
    return WorkerResult(
        payload=payload,
        raw_output=serialized,
        duration_ms=10,
        cost_usd=0.0,
        session_id=str(uuid.uuid4()),
    )


_MOCK_DISPATCH: dict[str, Any] = {
"clarify": _mock_clarify,
"plan": _mock_plan,
"critique": _mock_critique,
"integrate": _mock_integrate,
"execute": _mock_execute,
"review": _mock_review,
"test-both": _mock_test_both,
}


Expand All @@ -330,6 +355,8 @@ def session_key_for(step: str, agent: str) -> str:
return f"{agent}_executor"
if step == "review":
return f"{agent}_reviewer"
if step == "test-both":
return f"{agent}_judge"
return f"{agent}_{step}"


Expand Down
4 changes: 2 additions & 2 deletions tests/test_megaplan.py
Original file line number Diff line number Diff line change
Expand Up @@ -320,8 +320,8 @@ def test_infer_next_steps_non_evaluated_states(current_state: str, last_evaluati
[
("CONTINUE", ["integrate", "gate"]),
("SKIP", ["gate"]),
("ESCALATE", ["override add-note", "override force-proceed", "override abort"]),
("ABORT", ["override add-note", "override force-proceed", "override abort"]),
("ESCALATE", ["override test-both", "override add-note", "override force-proceed", "override abort"]),
("ABORT", ["override test-both", "override add-note", "override force-proceed", "override abort"]),
(None, ["override add-note", "override abort"]),
],
)
Expand Down
2 changes: 1 addition & 1 deletion tests/test_schemas.py
Original file line number Diff line number Diff line change
Expand Up @@ -94,7 +94,7 @@ def test_list_passthrough(self) -> None:

class TestSCHEMAS:
def test_schemas_contains_expected_keys(self) -> None:
expected = {"clarify.json", "plan.json", "integrate.json", "critique.json", "execution.json", "review.json"}
expected = {"clarify.json", "plan.json", "integrate.json", "critique.json", "execution.json", "review.json", "test-both.json"}
assert expected == set(SCHEMAS.keys())

def test_all_schemas_are_objects(self) -> None:
Expand Down
Loading