Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
1 change: 1 addition & 0 deletions megaplan/_core.py
Original file line number Diff line number Diff line change
Expand Up @@ -210,6 +210,7 @@ class StepResponse(TypedDict, total=False):
"integrate": "claude",
"execute": "codex",
"review": "codex",
"test-both": "claude",
}
KNOWN_AGENTS = ["claude", "codex"]
ROBUSTNESS_LEVELS = ("light", "standard", "thorough")
Expand Down
85 changes: 83 additions & 2 deletions megaplan/cli.py
Original file line number Diff line number Diff line change
Expand Up @@ -226,7 +226,7 @@ def infer_next_steps(state: PlanState) -> list[str]:
if recommendation in {"SKIP", "CONTINUE"}:
valid.append("gate")
if recommendation in {"ESCALATE", "ABORT"}:
valid.extend(["override add-note", "override force-proceed", "override abort"])
valid.extend(["override test-both", "override add-note", "override force-proceed", "override abort"])
return valid or ["override add-note", "override abort"]
if current == STATE_GATED:
return ["execute"]
Expand Down Expand Up @@ -1008,11 +1008,92 @@ def _override_skip(plan_dir: Path, state: PlanState, args: argparse.Namespace) -
}


def _override_test_both(plan_dir: Path, state: PlanState, args: argparse.Namespace) -> StepResponse:
    """Break a critique deadlock by having a judge agent test both approaches.

    Only valid from the evaluated state when the last evaluation recommended
    ESCALATE or ABORT.  Runs the ``test-both`` worker, persists the judge's
    payload to ``test-both.json``, records history and the override in meta,
    then downgrades the stored evaluation recommendation so the normal state
    machine can continue:

    * ``approach_a`` — current plan wins; gate checks run immediately and the
      evaluation is downgraded to SKIP.
    * ``approach_b`` / ``synthesis`` — the judge's alternative must be folded
      into the plan, so the evaluation is downgraded to CONTINUE and the next
      step is ``integrate``.

    Raises:
        CliError: when invoked from the wrong state or with the wrong
            evaluation recommendation, or when the worker fails (the failure
            is recorded via record_step_failure before re-raising).
    """
    if state["current_state"] != STATE_EVALUATED:
        raise CliError(
            "invalid_transition",
            "test-both is only supported from evaluated state",
            valid_next=infer_next_steps(state),
        )
    recommendation = state["last_evaluation"].get("recommendation")
    if recommendation not in {"ESCALATE", "ABORT"}:
        raise CliError(
            "invalid_transition",
            f"test-both requires an ESCALATE or ABORT evaluation, got {recommendation!r}",
            valid_next=infer_next_steps(state),
        )
    # Tests may inject a fake repository root via the private attribute;
    # getattr with a default replaces the hasattr/ternary dance.
    root = getattr(args, "_test_both_root", Path.cwd())
    try:
        worker, agent, mode, refreshed = run_step_with_worker("test-both", state, plan_dir, args, root=root)
    except CliError as error:
        record_step_failure(plan_dir, state, step="test-both", iteration=state["iteration"], error=error)
        raise
    test_both_filename = "test-both.json"
    atomic_write_json(plan_dir / test_both_filename, worker.payload)
    verdict = worker.payload["verdict"]
    rationale = worker.payload["verdict_rationale"]
    apply_session_update(state, "test-both", agent, worker.session_id, mode=mode, refreshed=refreshed)
    append_history(
        state,
        make_history_entry(
            "test-both",
            duration_ms=worker.duration_ms,
            cost_usd=worker.cost_usd,
            result="success",
            worker=worker,
            agent=agent,
            mode=mode,
            output_file=test_both_filename,
            artifact_hash=sha256_file(plan_dir / test_both_filename),
            recommendation=verdict,
        ),
    )
    if verdict == "approach_a":
        # Current plan wins — run gate checks right away rather than
        # requiring a separate `gate` invocation.
        gate = run_gate_checks(plan_dir, state)
        atomic_write_json(plan_dir / "gate.json", gate)
        if gate["passed"]:
            final_plan = latest_plan_path(plan_dir, state).read_text(encoding="utf-8")
            atomic_write_text(plan_dir / "final.md", final_plan)
            state["current_state"] = STATE_GATED
            next_step = "execute"
        else:
            # NOTE(review): on gate failure the evaluation below is still
            # downgraded to SKIP while next_step is "integrate" — confirm
            # infer_next_steps permits integrate after a SKIP recommendation.
            next_step = "integrate"
    else:
        # approach_b or synthesis — the judge's output must be integrated
        # into the plan before proceeding.
        next_step = "integrate"
    evaluation = copy.deepcopy(state["last_evaluation"])
    evaluation["recommendation"] = "SKIP" if verdict == "approach_a" else "CONTINUE"
    state["last_evaluation"] = evaluation
    _append_to_meta(state, "overrides", {
        "action": "test-both",
        "timestamp": now_utc(),
        "verdict": verdict,
        "rationale": rationale,
        "reason": args.reason,
    })
    save_state(plan_dir, state)
    response: StepResponse = {
        "success": True,
        "step": "override",
        "summary": f"Test-both complete. Verdict: {verdict}. {rationale}",
        "artifacts": [test_both_filename],
        "next_step": next_step,
        "state": state["current_state"],
    }
    if verdict == "synthesis" and worker.payload.get("synthesis_description"):
        response["message"] = worker.payload["synthesis_description"]
    attach_agent_fallback(response, args)
    return response


# Dispatch table mapping each `megaplan override <action>` CLI choice to its
# handler.  Keys must stay in sync with the `override_action` choices declared
# in build_parser(); each handler takes (plan_dir, state, args) and returns a
# StepResponse.
_OVERRIDE_ACTIONS: dict[str, Callable[[Path, PlanState, argparse.Namespace], StepResponse]] = {
    "add-note": _override_add_note,
    "abort": _override_abort,
    "force-proceed": _override_force_proceed,
    "skip": _override_skip,
    "test-both": _override_test_both,
}


Expand Down Expand Up @@ -1292,7 +1373,7 @@ def build_parser() -> argparse.ArgumentParser:
config_sub.add_parser("reset")

override_parser = subparsers.add_parser("override")
override_parser.add_argument("override_action", choices=["skip", "abort", "force-proceed", "add-note"])
override_parser.add_argument("override_action", choices=["skip", "abort", "force-proceed", "add-note", "test-both"])
override_parser.add_argument("--plan")
override_parser.add_argument("--reason", default="")
override_parser.add_argument("--note")
Expand Down
17 changes: 17 additions & 0 deletions megaplan/data/instructions.md
Original file line number Diff line number Diff line change
Expand Up @@ -83,6 +83,22 @@ Auto-force-proceed (and tell the user why) when:
- `suggested_override` is `"force-proceed"`, OR
- Robustness is `light` and `weighted_score` < 4.0

When the critique loop has stagnated (recurring critiques or score not improving),
consider using `test-both` to break the deadlock empirically:

```bash
megaplan override test-both --plan <name> --reason "critique loop stagnated"
```

This invokes a judge agent that evaluates the current plan (Approach A) and also
proposes and evaluates an alternative of its own (Approach B) against the unresolved
flags, then renders a verdict (`approach_a`, `approach_b`, or `synthesis`). The
verdict determines the next step:
- `approach_a` (current plan wins) → proceeds directly to gate checks
- `approach_b` or `synthesis` → proceeds to integrate, carrying the judge's recommendations

Use `test-both` when the same concerns keep recurring across iterations and neither
force-proceed nor add-note is resolving the impasse.

Otherwise, present the evaluation details and ask the user what to do.

## Minor Megaplan Edits
Expand Down Expand Up @@ -114,5 +130,6 @@ megaplan status --plan <name>
megaplan audit --plan <name>
megaplan list
megaplan override add-note --plan <name> --note "user context"
megaplan override test-both --plan <name> --reason "critique loop stagnated"
megaplan override abort --plan <name>
```
58 changes: 58 additions & 0 deletions megaplan/prompts.py
Original file line number Diff line number Diff line change
Expand Up @@ -249,6 +249,62 @@ def _execute_prompt(state: PlanState, plan_dir: Path) -> str:
).strip()


def _test_both_prompt(state: PlanState, plan_dir: Path) -> str:
    """Build the neutral-judge prompt for the test-both deadlock-breaking step."""
    project_dir = Path(state["config"]["project_dir"])
    latest_plan = latest_plan_path(plan_dir, state).read_text(encoding="utf-8")
    latest_meta = read_json(latest_plan_meta_path(plan_dir, state))
    # Pull only the fields the judge needs from each unresolved flag.
    open_flags = []
    for flag in unresolved_significant_flags(load_flag_registry(plan_dir)):
        open_flags.append(
            {
                "id": flag["id"],
                "severity": flag.get("severity"),
                "concern": flag.get("concern"),
                "evidence": flag.get("evidence"),
            }
        )
    prompt = textwrap.dedent(
        f"""
        You are a neutral judge resolving a deadlock between a planner and a critic.
        The critique loop has stagnated — the same concerns keep recurring despite
        revisions. Your job is to test both the current plan AND an alternative
        approach, then rule based on evidence.

        Project directory:
        {project_dir}

        {intent_and_notes_block(state)}

        Current plan (Approach A):
        {latest_plan}

        Plan metadata:
        {json_dump(latest_meta).strip()}

        Unresolved flags from the critic (the concerns driving the deadlock):
        {json_dump(open_flags).strip()}

        Requirements:
        - Inspect the actual repository before judging.
        - Evaluate Approach A (the current plan) against the unresolved flags.
          For each flag, determine: does the plan actually have this problem,
          or is the critic being overly cautious?
        - Propose Approach B: an alternative that addresses the unresolved flags
          differently. This could be a modified version of the plan, a simpler
          approach, or a fundamentally different strategy.
        - For BOTH approaches, assess:
          1. Would it build and pass existing tests? (build_pass, test_pass)
          2. What concrete issues would it cause? (issues)
          3. What evidence supports your assessment? (evidence)
        - Render a verdict: approach_a, approach_b, or synthesis.
        - If synthesis, describe what to take from each approach.
        - Judge based on correctness and practicality, not elegance.
        - An approach that would fail to build loses automatically.
        """
    )
    return prompt.strip()


def _review_claude_prompt(state: PlanState, plan_dir: Path) -> str:
project_dir = Path(state["config"]["project_dir"])
latest_plan = latest_plan_path(plan_dir, state).read_text(encoding="utf-8")
Expand Down Expand Up @@ -332,6 +388,7 @@ def _review_codex_prompt(state: PlanState, plan_dir: Path) -> str:
"critique": _critique_prompt,
"execute": _execute_prompt,
"review": _review_claude_prompt,
"test-both": _test_both_prompt,
}

_CODEX_PROMPT_BUILDERS: dict[str, Any] = {
Expand All @@ -341,6 +398,7 @@ def _review_codex_prompt(state: PlanState, plan_dir: Path) -> str:
"critique": _critique_prompt,
"execute": _execute_prompt,
"review": _review_codex_prompt,
"test-both": _test_both_prompt,
}


Expand Down
34 changes: 34 additions & 0 deletions megaplan/schemas.py
Original file line number Diff line number Diff line change
Expand Up @@ -112,6 +112,40 @@
},
"required": ["criteria", "issues"],
},
"test-both.json": {
"type": "object",
"properties": {
"approach_a": {
"type": "object",
"properties": {
"label": {"type": "string"},
"build_pass": {"type": "boolean"},
"test_pass": {"type": "boolean"},
"issues": {"type": "array", "items": {"type": "string"}},
"evidence": {"type": "string"},
},
"required": ["label", "build_pass", "test_pass", "issues", "evidence"],
},
"approach_b": {
"type": "object",
"properties": {
"label": {"type": "string"},
"build_pass": {"type": "boolean"},
"test_pass": {"type": "boolean"},
"issues": {"type": "array", "items": {"type": "string"}},
"evidence": {"type": "string"},
},
"required": ["label", "build_pass", "test_pass", "issues", "evidence"],
},
"verdict": {
"type": "string",
"enum": ["approach_a", "approach_b", "synthesis"],
},
"verdict_rationale": {"type": "string"},
"synthesis_description": {"type": "string"},
},
"required": ["approach_a", "approach_b", "verdict", "verdict_rationale"],
},
}


Expand Down
27 changes: 27 additions & 0 deletions megaplan/workers.py
Original file line number Diff line number Diff line change
Expand Up @@ -45,6 +45,7 @@
"critique": "critique.json",
"execute": "execution.json",
"review": "review.json",
"test-both": "test-both.json",
}

# Derive required keys per step from SCHEMAS so they aren't duplicated.
Expand Down Expand Up @@ -304,13 +305,37 @@ def _mock_review(state: PlanState, plan_dir: Path) -> WorkerResult:
return WorkerResult(payload=payload, raw_output=json_dump(payload), duration_ms=10, cost_usd=0.0, session_id=str(uuid.uuid4()))


def _mock_test_both(state: PlanState, plan_dir: Path) -> WorkerResult:
    """Deterministic canned judge output for the test-both step in mock mode."""
    approach_a = {
        "label": "Current plan",
        "build_pass": True,
        "test_pass": True,
        "issues": [],
        "evidence": "The current plan builds and passes existing tests.",
    }
    approach_b = {
        "label": "Simplified alternative addressing unresolved flags",
        "build_pass": True,
        "test_pass": True,
        "issues": ["Requires minor refactor of existing module structure."],
        "evidence": "Alternative approach resolves the flagged concerns with a simpler design.",
    }
    payload = {
        "approach_a": approach_a,
        "approach_b": approach_b,
        "verdict": "synthesis",
        "verdict_rationale": "Both approaches build and pass tests. Approach A is more complete but carries the flagged risks. Approach B addresses the flags but introduces a minor refactor. A synthesis takes the core structure from A with the risk mitigations from B.",
        "synthesis_description": "Keep the current plan structure but incorporate the critic's suggested safeguards for the flagged concerns.",
    }
    serialized = json_dump(payload)
    return WorkerResult(
        payload=payload,
        raw_output=serialized,
        duration_ms=10,
        cost_usd=0.0,
        session_id=str(uuid.uuid4()),
    )


_MOCK_DISPATCH: dict[str, Any] = {
"clarify": _mock_clarify,
"plan": _mock_plan,
"critique": _mock_critique,
"integrate": _mock_integrate,
"execute": _mock_execute,
"review": _mock_review,
"test-both": _mock_test_both,
}


Expand All @@ -330,6 +355,8 @@ def session_key_for(step: str, agent: str) -> str:
return f"{agent}_executor"
if step == "review":
return f"{agent}_reviewer"
if step == "test-both":
return f"{agent}_judge"
return f"{agent}_{step}"


Expand Down
4 changes: 2 additions & 2 deletions tests/test_megaplan.py
Original file line number Diff line number Diff line change
Expand Up @@ -320,8 +320,8 @@ def test_infer_next_steps_non_evaluated_states(current_state: str, last_evaluati
[
("CONTINUE", ["integrate", "gate"]),
("SKIP", ["gate"]),
("ESCALATE", ["override add-note", "override force-proceed", "override abort"]),
("ABORT", ["override add-note", "override force-proceed", "override abort"]),
("ESCALATE", ["override test-both", "override add-note", "override force-proceed", "override abort"]),
("ABORT", ["override test-both", "override add-note", "override force-proceed", "override abort"]),
(None, ["override add-note", "override abort"]),
],
)
Expand Down
2 changes: 1 addition & 1 deletion tests/test_schemas.py
Original file line number Diff line number Diff line change
Expand Up @@ -94,7 +94,7 @@ def test_list_passthrough(self) -> None:

class TestSCHEMAS:
def test_schemas_contains_expected_keys(self) -> None:
expected = {"clarify.json", "plan.json", "integrate.json", "critique.json", "execution.json", "review.json"}
expected = {"clarify.json", "plan.json", "integrate.json", "critique.json", "execution.json", "review.json", "test-both.json"}
assert expected == set(SCHEMAS.keys())

def test_all_schemas_are_objects(self) -> None:
Expand Down
Loading