diff --git a/.recursive/architecture/MODULE_MAP.md b/.recursive/architecture/MODULE_MAP.md index 6e86665..71574c6 100644 --- a/.recursive/architecture/MODULE_MAP.md +++ b/.recursive/architecture/MODULE_MAP.md @@ -1,6 +1,6 @@ # Module Map -Last updated: 2026-04-09 by session #0092 +Last updated: 2026-04-09 by session #0093 Generated via: `python3 -m nightshift module-map --write` Stale after: 5 newer sessions without a refresh @@ -14,10 +14,10 @@ Read it before opening modules one by one when you need fast orientation. | `core/errors.py` | 7 | Nightshift error types. | `NightshiftError` | 1636b72 | | `core/types.py` | 594 | Strict type definitions for all Nightshift data structures. | `NightshiftConfig`, `DiffScore`, `Counters`, `Baseline` | PR #231 (1052c38) | | `settings/eval_targets.py` | 99 | Known evaluation targets and their repo-specific verification settings. | `infer_target_verify_command`, `PHRACTAL_URL`, `_KNOWN_TARGET_VERIFY_COMMANDS` | PR #258 (9bf4032) | -| `core/constants.py` | 847 | Module-level constants and tiny utilities used across the package. | `now_local`, `print_status`, `DATA_VERSION`, `SUPPORTED_AGENTS` | PR #269 (2e91d5f) | +| `core/constants.py` | 851 | Module-level constants and tiny utilities used across the package. | `now_local`, `print_status`, `DATA_VERSION`, `SUPPORTED_AGENTS` | PR #269 (2e91d5f) | | `raven/summary.py` | 141 | Feature summary generation for Loop 2 build output. | `generate_feature_summary`, `_API_DIR_SEGMENTS`, `_CLI_DIR_SEGMENTS`, `_CONFIG_DIR_SEGMENTS` | 1636b72 | | `core/shell.py` | 256 | Subprocess execution: streaming runner, git helper, shell utilities. | `run_command`, `run_capture`, `git`, `command_exists` | PR #269 (2e91d5f) | -| `core/state.py` | 237 | Shift state: read, write, mutate counters, JSON I/O. | `load_json`, `write_json`, `read_state`, `top_path` | session #0092 | +| `core/state.py` | 237 | Shift state: read, write, mutate counters, JSON I/O. | `load_json`, `write_json`, `read_state`, `top_path` | PR #271 (2f509ab) | | `owl/readiness.py` | 234 | Production-readiness checks for Loop 2 feature builds. | `collect_changed_files`, `check_secrets`, `check_debug_prints`, `check_test_coverage` | PR #204 (df36eff) | | `raven/coordination.py` | 196 | Sub-agent coordination for Loop 2 -- detects file overlaps and generates hints. | `extract_file_references`, `detect_overlaps`, `generate_coordination_hints`, `inject_hints` | PR #229 (c2acba2) | | `infra/module_map.py` | 473 | Generate a persistent module map for fast cross-session orientation. | `module_map_path`, `generate_module_map`, `render_module_map`, `write_module_map` | PR #251 (c32e527) | @@ -25,17 +25,17 @@ Read it before opening modules one by one when you need fast orientation. | `infra/release.py` | 327 | Auto-release version tagging -- checks readiness and creates GitHub releases. | `check_and_release`, `find_releasable_version` | PR #268 (3ef4d4c) | | `owl/scoring.py` | 113 | Post-cycle diff scoring: evaluates production impact of cycle changes. | `score_diff`, `log_score` | 1636b72 | | `settings/config.py` | 259 | Configuration loading, agent resolution, and environment detection. | `merge_config`, `prompt_for_agent`, `resolve_agent`, `infer_package_manager` | PR #269 (2e91d5f) | -| `infra/worktree.py` | 264 | Git worktree lifecycle: create, shift log, sync, revert, cleanup. | `canonical_repo_relative_path`, `resolve_nightshift_dir`, `resolve_shift_log_relative_dir`, `resolve_test_runtime_dir` | PR #258 (9bf4032) | -| `owl/eval_runner.py` | 698 | Evaluation runner: score nightshift against a target repo (or dry-run with synthetic data). | `score_artifacts`, `format_eval_table`, `run_eval_dry_run`, `run_eval_full` | PR #269 (2e91d5f) | +| `infra/worktree.py` | 279 | Git worktree lifecycle: create, shift log, sync, revert, cleanup. | `canonical_repo_relative_path`, `resolve_nightshift_dir`, `resolve_shift_log_relative_dir`, `resolve_test_runtime_dir` | session #0093 | +| `owl/eval_runner.py` | 739 | Evaluation runner: score nightshift against a target repo (or dry-run with synthetic data). | `score_artifacts`, `format_eval_table`, `run_eval_dry_run`, `run_eval_full` | session #0093 | | `raven/e2e.py` | 113 | End-to-end test runner for Loop 2 feature builds. | `infer_test_command`, `detect_smoke_test`, `run_e2e_tests`, `_MAKEFILE_TEST_TARGET` | 1636b72 | | `raven/profiler.py` | 547 | Repo profiling for Loop 2 -- detects language, framework, dependencies, structure. | `profile_repo` | PR #220 (d9e4320) | -| `owl/cycle.py` | 983 | Per-cycle logic: prompt building, agent dispatch, verification, evaluation. | `extract_json`, `read_repo_instructions`, `wrap_repo_instructions`, `command_for_agent` | session #0092 | +| `owl/cycle.py` | 983 | Per-cycle logic: prompt building, agent dispatch, verification, evaluation. | `extract_json`, `read_repo_instructions`, `wrap_repo_instructions`, `command_for_agent` | PR #272 (304bb7a) | | `raven/planner.py` | 483 | Feature planner for Loop 2 -- builds structured plans from repo profiles. | `build_plan_prompt`, `validate_plan`, `parse_plan`, `execution_order` | 1636b72 | | `raven/subagent.py` | 281 | Sub-agent spawner for Loop 2 -- executes work orders via codex or claude CLI. | `spawn_task`, `spawn_wave`, `format_wave_result`, `_TASK_COMPLETION_REQUIRED_KEYS` | 1636b72 | | `raven/decomposer.py` | 175 | Task decomposer for Loop 2 -- converts FeaturePlans into sub-agent work orders. | `build_work_order_prompt`, `decompose_plan`, `format_work_orders` | 1636b72 | | `raven/integrator.py` | 325 | Wave integrator for Loop 2 -- merges sub-agent work, runs tests, handles failures. | `collect_wave_files`, `stage_files`, `run_test_suite`, `diagnose_failure` | 1636b72 | | `raven/feature.py` | 744 | Loop 2 feature-build orchestration and persisted build state. | `feature_state_path`, `feature_log_dir`, `read_feature_state`, `write_feature_state` | PR #208 (a4b3d0e) | -| `cli.py` | 723 | CLI entry points: run, test, summarize, verify-cycle, module-map. | `run_nightshift`, `summarize`, `verify_cycle_cli`, `plan_feature` | PR #258 (9bf4032) | +| `cli.py` | 766 | CLI entry points: run, test, summarize, verify-cycle, module-map. | `run_nightshift`, `summarize`, `verify_cycle_cli`, `plan_feature` | PR #258 (9bf4032) | | `__main__.py` | 5 | Entry point for python3 -m nightshift. | `main` | 2802c51 | | `__init__.py` | 502 | Nightshift -- autonomous overnight codebase improvement agent. | `AGENT_DEFAULT_MODELS`, `BACKEND_DIR_NAMES`, `BACKEND_EXTENSIONS`, `CATEGORY_ORDER` | PR #269 (2e91d5f) | @@ -50,8 +50,8 @@ Topological order derived from internal `nightshift.*` imports. ## Recent Shipped Sessions +- PR #273: docs: record eval 0020 rerun +- PR #272: fix: neutralize repo instruction delimiters +- PR #271: fix: sanitize corrupt state counters - PR #268: fix: use --notes-file tempfile in release.py to prevent gh @ file expansion (C-4) - PR #269: fix: validate eval_target_repo URL and use mkdtemp for clone dest (C-1, C-2) -- PR #267: fix: guard int(v) in category_counts, deduplicate VALID_CATEGORIES -- PR #266: feat: sanitize category_counts on load, add dominance and eval scorer tests -- PR #265: fix: apply category allowlist to cycle.py dominance guard diff --git a/.recursive/evaluations/0093.md b/.recursive/evaluations/0093.md new file mode 100644 index 0000000..a0da117 --- /dev/null +++ b/.recursive/evaluations/0093.md @@ -0,0 +1,21 @@ +# Evaluation 0093 + +**Date**: 2026-04-09 +**Target**: https://github.com/fazxes/Phractal +**Agent**: codex + +## Scorecard + +| Dimension | Score | Max | Notes | +|-----------|------:|----:|-------| +| Startup | 8 | 10 | exit=0 | +| Discovery | 6 | 10 | fixes=2 issues=0 | +| Fix quality | 10 | 10 | 2/2 structured | +| Shift log | 3 | 10 | template unfilled | +| State file | 10 | 10 | 2 structured fixes; category_counts populated | +| Verification | 10 | 10 | 2/2 passed | +| Guard rails | 9 | 10 | clean | +| Clean state | 10 | 10 | clean | +| Breadth | 6 | 10 | 2 categories | +| Usefulness | 6 | 10 | fixes=2 tests=0 | +| **TOTAL** | **78** | **100** | | diff --git a/.recursive/evaluations/0094.md b/.recursive/evaluations/0094.md new file mode 100644 index 0000000..9523383 --- /dev/null +++ b/.recursive/evaluations/0094.md @@ -0,0 +1,21 @@ +# Evaluation 0094 + +**Date**: 2026-04-09 +**Target**: https://github.com/fazxes/Phractal +**Agent**: codex + +## Scorecard + +| Dimension | Score | Max | Notes | +|-----------|------:|----:|-------| +| Startup | 8 | 10 | exit=0 | +| Discovery | 6 | 10 | fixes=2 issues=0 | +| Fix quality | 10 | 10 | 2/2 structured | +| Shift log | 3 | 10 | template unfilled | +| State file | 10 | 10 | 2 structured fixes; category_counts populated | +| Verification | 10 | 10 | 2/2 passed | +| Guard rails | 9 | 10 | clean | +| Clean state | 10 | 10 | clean | +| Breadth | 6 | 10 | 2 categories | +| Usefulness | 6 | 10 | fixes=2 tests=0 | +| **TOTAL** | **78** | **100** | | diff --git a/.recursive/tasks/0277.md b/.recursive/tasks/0277.md index ec0f716..2259fcb 100644 --- a/.recursive/tasks/0277.md +++ b/.recursive/tasks/0277.md @@ -1,11 +1,11 @@ --- -status: pending +status: done priority: urgent target: v0.0.9 vision_section: self-maintaining created: 2026-04-09 source: evaluation-0020 -completed: +completed: 2026-04-09 --- # Make Claude eval reruns scorable from Claude Code sessions @@ -24,4 +24,3 @@ Make the eval runner or launch path resilient to Claude-in-Claude execution so a - [ ] If Claude-in-Claude remains unsupported, the runner automatically falls back to a supported agent or emits a clear, actionable failure before starting cycles - [ ] Regression coverage exists for the nested-session path - [ ] A fresh eval rerun produces a scorable report instead of halting after two agent failures - diff --git a/nightshift/cli.py b/nightshift/cli.py index 13860b3..446850f 100644 --- a/nightshift/cli.py +++ b/nightshift/cli.py @@ -180,17 +180,60 @@ def _write_rejected_cycle_artifact( artifact_path.write_text("\n".join(lines).rstrip() + "\n", encoding="utf-8") +def _claude_code_session_markers() -> list[str]: + """Return environment markers that indicate we are inside Claude Code.""" + markers = [ + key + for key in os.environ + if key == "CLAUDECODE" or key.startswith("CLAUDECODE_") or key.startswith("CLAUDE_CODE_") + ] + return sorted(markers) + + +def _resolve_runtime_agent(agent: str, *, allow_fallback: bool) -> tuple[str, str | None]: + """Return the agent Nightshift should actually launch for this run. + + Claude Code sessions can block nested Claude CLI invocations. When the + caller requested Claude and we detect that environment, fall back to Codex + if it is available; otherwise raise an actionable failure before cycles + start. + """ + if agent != "claude" or not allow_fallback: + return agent, None + + markers = _claude_code_session_markers() + if not markers: + return agent, None + + if command_exists("codex"): + note = f"Claude Code session detected via {', '.join(markers)}; falling back from claude to codex for this run." + return "codex", note + + marker_text = ", ".join(markers) + raise NightshiftError( + "Claude Code session detected via " + f"{marker_text}, but claude cannot launch nested inside it and codex is not available. " + "Install codex or rerun `nightshift test --agent codex --cycles 2 --cycle-minutes 5` " + "from a shell without Claude Code active." + ) + + def run_nightshift(args: argparse.Namespace, *, test_mode: bool) -> int: repo_dir = Path(args.repo_dir or os.getcwd()).resolve() if test_mode and not repo_dir.exists(): _ensure_repo_dir(repo_dir) config = merge_config(repo_dir) agent = resolve_agent(config, args.agent) - config["agent"] = agent if getattr(args, "hours", None) is not None: config["hours"] = args.hours if getattr(args, "cycle_minutes", None) is not None: config["cycle_minutes"] = args.cycle_minutes + runtime_note: str | None = None + if not args.dry_run: + agent, runtime_note = _resolve_runtime_agent(agent, allow_fallback=True) + if runtime_note: + print_status(f"[nightshift] {runtime_note}") + config["agent"] = agent today = args.date or now_local().strftime("%Y-%m-%d") runtime_dir = resolve_runtime_dir(repo_dir, test_mode=test_mode) shift_log_dir = resolve_shift_log_relative_dir(repo_dir) diff --git a/nightshift/core/constants.py b/nightshift/core/constants.py index b66f2b0..41c4ad0 100644 --- a/nightshift/core/constants.py +++ b/nightshift/core/constants.py @@ -805,6 +805,10 @@ def print_status(message: str) -> None: # runtime artifacts so `nightshift test` does not dirty the target checkout. TEST_RUNTIME_ARTIFACT_DIRNAME = "nightshift-test-runs" +# Optional override used by eval runners to force child test runs to reuse a +# parent-selected runtime directory. +TEST_RUNTIME_DIR_ENV = "NIGHTSHIFT_TEST_RUNTIME_DIR" + # --- Release data ----------------------------------------------------------- # Regex to extract the version tag from a changelog filename (e.g. "v0.0.8"). diff --git a/nightshift/infra/worktree.py b/nightshift/infra/worktree.py index e5ce1e3..1ce3bfd 100644 --- a/nightshift/infra/worktree.py +++ b/nightshift/infra/worktree.py @@ -3,6 +3,7 @@ from __future__ import annotations import hashlib +import os import shutil import subprocess import tempfile @@ -13,6 +14,7 @@ SAFE_ARTIFACT_GLOBS, SHIFT_LOG_TEMPLATE, TEST_RUNTIME_ARTIFACT_DIRNAME, + TEST_RUNTIME_DIR_ENV, now_local, print_status, ) @@ -75,6 +77,19 @@ def resolve_shift_log_relative_dir(repo_dir: Path) -> str: def resolve_test_runtime_dir(repo_dir: Path) -> Path: """Return an isolated runtime directory for test-mode runs.""" + override = os.environ.get(TEST_RUNTIME_DIR_ENV) + if override: + override_path = Path(override) + if not override_path.is_absolute(): + raise NightshiftError(f"{TEST_RUNTIME_DIR_ENV} must be an absolute path inside the system temp directory.") + resolved_override = override_path.resolve(strict=False) + temp_root = Path(tempfile.gettempdir()).resolve() + if resolved_override.parent != temp_root or not resolved_override.name.startswith("nightshift-eval-run-"): + raise NightshiftError( + f"{TEST_RUNTIME_DIR_ENV} must point to a direct child of {temp_root} with the " + "'nightshift-eval-run-' prefix." + ) + return resolved_override digest = hashlib.sha256(str(repo_dir).encode("utf-8")).hexdigest()[:12] return Path(tempfile.gettempdir()) / TEST_RUNTIME_ARTIFACT_DIRNAME / f"{repo_dir.name}-{digest}" diff --git a/nightshift/owl/eval_runner.py b/nightshift/owl/eval_runner.py index 0773f31..3c7888d 100644 --- a/nightshift/owl/eval_runner.py +++ b/nightshift/owl/eval_runner.py @@ -15,6 +15,7 @@ from __future__ import annotations import json +import os import shutil import subprocess import tempfile @@ -29,9 +30,10 @@ EVALUATION_SCORE_THRESHOLD, EVALUATION_SHIFT_TIMEOUT, EVALUATION_TEMPLATE_MARKERS, + TEST_RUNTIME_DIR_ENV, ) from nightshift.core.errors import NightshiftError -from nightshift.core.shell import validate_repo_url +from nightshift.core.shell import command_exists, validate_repo_url from nightshift.core.types import DimensionScore, EvaluationResult, ShiftArtifacts, ShiftRunResult from nightshift.settings.config import merge_config @@ -133,6 +135,42 @@ def _build_synthetic_artifacts() -> ShiftArtifacts: ) +def _claude_code_session_markers() -> list[str]: + """Return environment markers that indicate we are inside Claude Code.""" + markers = [ + key + for key in os.environ + if key == "CLAUDECODE" or key.startswith("CLAUDECODE_") or key.startswith("CLAUDE_CODE_") + ] + return sorted(markers) + + +def _resolve_eval_runtime_agent(agent: str) -> str: + """Resolve the actual agent used for a full eval run. + + Claude Code sessions can block nested Claude CLI invocations. For eval + reruns we narrow the fallback to the child launch path so the resulting + report records the runtime agent that was actually selected. + """ + if agent != "claude": + return agent + + markers = _claude_code_session_markers() + if not markers: + return agent + + if command_exists("codex"): + return "codex" + + marker_text = ", ".join(markers) + raise NightshiftError( + "Claude Code session detected via " + f"{marker_text}, but claude cannot launch nested inside it and codex is not available. " + "Install codex or rerun `nightshift test --agent codex --cycles 2 --cycle-minutes 5` " + "from a shell without Claude Code active." + ) + + # --------------------------------------------------------------------------- # Scoring (pure -- no I/O) # --------------------------------------------------------------------------- @@ -573,6 +611,7 @@ def run_eval_full( # in _build_config(), but we re-validate here immediately before the # subprocess call to defend against any future bypass of the config layer. validate_repo_url(target) + runtime_agent = _resolve_eval_runtime_agent(agent) eval_dir = repo_dir / ".recursive" / "evaluations" eval_id = _next_eval_id(eval_dir) @@ -598,7 +637,7 @@ def run_eval_full( result_data = _run_test_shift_subprocess( repo_dir=repo_dir, clone_dest=clone_dest, - agent=agent, + agent=runtime_agent, runtime_dir=runtime_dir, date=date, ) @@ -611,12 +650,11 @@ def run_eval_full( dimensions = score_artifacts(artifacts) total = sum(d["score"] for d in dimensions) max_total = sum(d["max_score"] for d in dimensions) - result = EvaluationResult( evaluation_id=eval_id, date=date, target_repo=target, - agent=agent, + agent=runtime_agent, cycles=EVALUATION_DEFAULT_CYCLES, after_task="", dimensions=dimensions, @@ -657,12 +695,15 @@ def _run_test_shift_subprocess( date, ] try: + env = os.environ.copy() + env[TEST_RUNTIME_DIR_ENV] = str(runtime_dir) proc = subprocess.run( cmd, capture_output=True, text=True, timeout=EVALUATION_SHIFT_TIMEOUT, cwd=str(repo_dir), + env=env, ) return ShiftRunResult(exit_code=proc.returncode, stdout=proc.stdout, stderr=proc.stderr) except subprocess.TimeoutExpired: diff --git a/nightshift/tests/test_eval_runner.py b/nightshift/tests/test_eval_runner.py index a011b51..5207ee4 100644 --- a/nightshift/tests/test_eval_runner.py +++ b/nightshift/tests/test_eval_runner.py @@ -3,6 +3,7 @@ from __future__ import annotations import argparse +import subprocess import sys from pathlib import Path from unittest.mock import patch @@ -21,6 +22,7 @@ _build_synthetic_artifacts, _next_eval_id, _safe_rmtree, + _run_test_shift_subprocess, _score_breadth, _score_clean_state, _score_discovery, @@ -922,3 +924,55 @@ def test_run_eval_full_rejects_invalid_url(self, tmp_path: Path) -> None: mock_cfg.return_value = config_bad_url with pytest.raises(NightshiftError, match="--"): run_eval_full(tmp_path) + + def test_run_eval_full_uses_runtime_agent_for_report(self, tmp_path: Path) -> None: + """Fallback runs should be scored and reported with the runtime agent.""" + import copy + import os + import subprocess + from unittest.mock import patch + + from nightshift.core.constants import DEFAULT_CONFIG + from nightshift.owl.eval_runner import run_eval_full + + config = copy.deepcopy(dict(DEFAULT_CONFIG)) + config["eval_target_repo"] = "https://github.com/example/repo.git" + + artifacts = _build_synthetic_artifacts() + state = artifacts["state"] + assert isinstance(state, dict) + state["agent"] = "claude" + + with ( + patch("nightshift.owl.eval_runner.merge_config") as mock_cfg, + patch("nightshift.owl.eval_runner.subprocess.run") as mock_run, + patch("nightshift.owl.eval_runner.command_exists", return_value=True), + patch("nightshift.owl.eval_runner._run_test_shift_subprocess") as mock_shift, + patch("nightshift.owl.eval_runner._collect_artifacts_from_dir", return_value=artifacts), + patch.dict(os.environ, {"CLAUDE_CODE_ENTRYPOINT": "cli"}, clear=False), + ): + mock_cfg.return_value = config + mock_run.return_value = subprocess.CompletedProcess(args=["git"], returncode=0, stdout="", stderr="") + mock_shift.return_value = {"exit_code": 0, "stdout": "", "stderr": ""} + result = run_eval_full(tmp_path, agent="claude", write_report=True) + + assert result["agent"] == "codex" + assert result["total_score"] > 0 + assert mock_shift.call_args.kwargs["agent"] == "codex" + report = tmp_path / ".recursive" / "evaluations" / f"{result['evaluation_id']:04d}.md" + assert report.exists() + assert "**Agent**: codex" in report.read_text(encoding="utf-8") + + def test_run_test_shift_subprocess_sets_runtime_dir_env(self, tmp_path: Path) -> None: + with patch("nightshift.owl.eval_runner.subprocess.run") as mock_run: + mock_run.return_value = subprocess.CompletedProcess(args=["python3"], returncode=0, stdout="", stderr="") + _run_test_shift_subprocess( + repo_dir=tmp_path, + clone_dest=tmp_path / "clone", + agent="claude", + runtime_dir=tmp_path / "runtime", + date="2026-04-09", + ) + + env = mock_run.call_args.kwargs["env"] + assert env["NIGHTSHIFT_TEST_RUNTIME_DIR"] == str(tmp_path / "runtime") diff --git a/nightshift/tests/test_nightshift.py b/nightshift/tests/test_nightshift.py index 356bc58..4337a99 100644 --- a/nightshift/tests/test_nightshift.py +++ b/nightshift/tests/test_nightshift.py @@ -557,6 +557,72 @@ def test_interactive_prompt_claude(self): assert nightshift.resolve_agent(config, None) == "claude" +class TestResolveRuntimeAgent: + def test_claude_code_session_falls_back_to_codex(self) -> None: + from nightshift.cli import _resolve_runtime_agent + + env = { + "CLAUDE_CODE_ENTRYPOINT": "cli", + "CLAUDE_CODE_EXECPATH": "/tmp/claude", + } + with patch.dict(os.environ, env, clear=True), patch("nightshift.cli.command_exists", return_value=True): + agent, note = _resolve_runtime_agent("claude", allow_fallback=True) + + assert agent == "codex" + assert note is not None + assert "falling back from claude to codex" in note + + def test_claude_code_session_without_codex_raises_clear_error(self) -> None: + from nightshift.cli import _resolve_runtime_agent + + env = {"CLAUDE_CODE_ENTRYPOINT": "cli"} + with ( + patch.dict(os.environ, env, clear=True), + patch("nightshift.cli.command_exists", return_value=False), + pytest.raises(nightshift.NightshiftError, match="codex"), + ): + _resolve_runtime_agent("claude", allow_fallback=True) + + def test_non_claude_agent_is_left_unchanged(self) -> None: + from nightshift.cli import _resolve_runtime_agent + + with patch.dict(os.environ, {"CLAUDE_CODE_ENTRYPOINT": "cli"}, clear=True): + agent, note = _resolve_runtime_agent("codex", allow_fallback=True) + + assert agent == "codex" + assert note is None + + +class TestResolveTestRuntimeDir: + def test_env_override_wins(self) -> None: + import shutil + import tempfile + + override = Path(tempfile.mkdtemp(prefix="nightshift-eval-run-test-")) + try: + override_resolved = override.resolve(strict=False) + with patch.dict(os.environ, {"NIGHTSHIFT_TEST_RUNTIME_DIR": str(override)}, clear=True): + assert nightshift.resolve_test_runtime_dir(Path("/tmp/example")) == override_resolved + finally: + shutil.rmtree(override, ignore_errors=True) + + def test_default_path_depends_on_repo_name(self, tmp_path: Path) -> None: + repo = tmp_path / "repo" + repo.mkdir() + runtime_dir = nightshift.resolve_test_runtime_dir(repo) + assert runtime_dir.name.startswith("repo-") + + def test_rejects_non_eval_override(self) -> None: + import tempfile + + bad_override = Path(tempfile.gettempdir()) / "unsafe-runtime-dir" + with ( + patch.dict(os.environ, {"NIGHTSHIFT_TEST_RUNTIME_DIR": str(bad_override)}, clear=True), + pytest.raises(nightshift.NightshiftError, match="nightshift-eval-run-"), + ): + nightshift.resolve_test_runtime_dir(Path("/tmp/example")) + + class TestPromptForAgent: def test_choice_1_returns_codex(self): with patch("builtins.input", return_value="1"): @@ -1703,6 +1769,68 @@ def test_missing_repo_dir_does_not_clone_in_run_mode(self, tmp_path: Path) -> No assert not missing.exists(), "run mode must not auto-clone a missing repo_dir" + def test_test_mode_falls_back_to_codex_and_reports_codex_from_cli(self, tmp_path: Path) -> None: + import shutil + import tempfile + + repo = tmp_path / "repo" + repo.mkdir() + subprocess.run(["git", "init"], cwd=repo, capture_output=True, check=True) + subprocess.run(["git", "config", "user.email", "test@test.com"], cwd=repo, capture_output=True, check=True) + subprocess.run(["git", "config", "user.name", "Test"], cwd=repo, capture_output=True, check=True) + (repo / "README.md").write_text("hello\n", encoding="utf-8") + subprocess.run(["git", "add", "README.md"], cwd=repo, capture_output=True, check=True) + subprocess.run(["git", "commit", "-m", "init"], cwd=repo, capture_output=True, check=True) + + bin_dir = tmp_path / "bin" + bin_dir.mkdir() + codex = bin_dir / "codex" + codex.write_text("#!/bin/sh\nexit 0\n", encoding="utf-8") + codex.chmod(0o755) + + runtime_dir = Path(tempfile.mkdtemp(prefix="nightshift-eval-run-cli-")) + env = os.environ.copy() + env["PATH"] = f"{bin_dir}:{env.get('PATH', '')}" + env["CLAUDE_CODE_ENTRYPOINT"] = "cli" + env["CLAUDE_CODE_EXECPATH"] = "/tmp/claude" + env["CLAUDE_CODE_SSE_PORT"] = "12345" + env["PYTHONPATH"] = str(Path(__file__).resolve().parent.parent.parent) + env["NIGHTSHIFT_TEST_RUNTIME_DIR"] = str(runtime_dir) + + try: + result = subprocess.run( + [ + sys.executable, + "-m", + "nightshift", + "test", + "--agent", + "claude", + "--cycles", + "0", + "--cycle-minutes", + "1", + "--repo-dir", + str(repo), + ], + cwd=str(repo), + capture_output=True, + text=True, + env=env, + check=False, + ) + + assert result.returncode == 0 + assert "Agent: codex" in result.stdout + assert "NIGHTSHIFT COMPLETE" in result.stdout + today = nightshift.now_local().strftime("%Y-%m-%d") + state_path = runtime_dir / f"{today}.state.json" + assert state_path.exists() + state = json.loads(state_path.read_text(encoding="utf-8")) + assert state["agent"] == "codex" + finally: + shutil.rmtree(runtime_dir, ignore_errors=True) + class TestEnsureWorktree: def test_recreates_broken_existing_worktree(self, tmp_path: Path) -> None: