Recursive · fazxes · Apr 9, 2026 · Apr 9, 2026
diff --git a/.recursive/architecture/MODULE_MAP.md b/.recursive/architecture/MODULE_MAP.md
@@ -1,6 +1,6 @@
 # Module Map
 
-Last updated: 2026-04-09 by session #0092
+Last updated: 2026-04-09 by session #0093
 Generated via: `python3 -m nightshift module-map --write`
 Stale after: 5 newer sessions without a refresh
 
@@ -14,28 +14,28 @@ Read it before opening modules one by one when you need fast orientation.
 | `core/errors.py` | 7 | Nightshift error types. | `NightshiftError` | 1636b72 |
 | `core/types.py` | 594 | Strict type definitions for all Nightshift data structures. | `NightshiftConfig`, `DiffScore`, `Counters`, `Baseline` | PR #231 (1052c38) |
 | `settings/eval_targets.py` | 99 | Known evaluation targets and their repo-specific verification settings. | `infer_target_verify_command`, `PHRACTAL_URL`, `_KNOWN_TARGET_VERIFY_COMMANDS` | PR #258 (9bf4032) |
-| `core/constants.py` | 847 | Module-level constants and tiny utilities used across the package. | `now_local`, `print_status`, `DATA_VERSION`, `SUPPORTED_AGENTS` | PR #269 (2e91d5f) |
+| `core/constants.py` | 851 | Module-level constants and tiny utilities used across the package. | `now_local`, `print_status`, `DATA_VERSION`, `SUPPORTED_AGENTS` | PR #269 (2e91d5f) |
 | `raven/summary.py` | 141 | Feature summary generation for Loop 2 build output. | `generate_feature_summary`, `_API_DIR_SEGMENTS`, `_CLI_DIR_SEGMENTS`, `_CONFIG_DIR_SEGMENTS` | 1636b72 |
 | `core/shell.py` | 256 | Subprocess execution: streaming runner, git helper, shell utilities. | `run_command`, `run_capture`, `git`, `command_exists` | PR #269 (2e91d5f) |
-| `core/state.py` | 237 | Shift state: read, write, mutate counters, JSON I/O. | `load_json`, `write_json`, `read_state`, `top_path` | session #0092 |
+| `core/state.py` | 237 | Shift state: read, write, mutate counters, JSON I/O. | `load_json`, `write_json`, `read_state`, `top_path` | PR #271 (2f509ab) |
 | `owl/readiness.py` | 234 | Production-readiness checks for Loop 2 feature builds. | `collect_changed_files`, `check_secrets`, `check_debug_prints`, `check_test_coverage` | PR #204 (df36eff) |
 | `raven/coordination.py` | 196 | Sub-agent coordination for Loop 2 -- detects file overlaps and generates hints. | `extract_file_references`, `detect_overlaps`, `generate_coordination_hints`, `inject_hints` | PR #229 (c2acba2) |
 | `infra/module_map.py` | 473 | Generate a persistent module map for fast cross-session orientation. | `module_map_path`, `generate_module_map`, `render_module_map`, `write_module_map` | PR #251 (c32e527) |
 | `infra/multi.py` | 117 | Multi-repo shift orchestration: run hardening loops across multiple repos. | `validate_repos`, `format_multi_summary`, `run_multi_shift` | 1636b72 |
 | `infra/release.py` | 327 | Auto-release version tagging -- checks readiness and creates GitHub releases. | `check_and_release`, `find_releasable_version` | PR #268 (3ef4d4c) |
 | `owl/scoring.py` | 113 | Post-cycle diff scoring: evaluates production impact of cycle changes. | `score_diff`, `log_score` | 1636b72 |
 | `settings/config.py` | 259 | Configuration loading, agent resolution, and environment detection. | `merge_config`, `prompt_for_agent`, `resolve_agent`, `infer_package_manager` | PR #269 (2e91d5f) |
-| `infra/worktree.py` | 264 | Git worktree lifecycle: create, shift log, sync, revert, cleanup. | `canonical_repo_relative_path`, `resolve_nightshift_dir`, `resolve_shift_log_relative_dir`, `resolve_test_runtime_dir` | PR #258 (9bf4032) |
-| `owl/eval_runner.py` | 698 | Evaluation runner: score nightshift against a target repo (or dry-run with synthetic data). | `score_artifacts`, `format_eval_table`, `run_eval_dry_run`, `run_eval_full` | PR #269 (2e91d5f) |
+| `infra/worktree.py` | 279 | Git worktree lifecycle: create, shift log, sync, revert, cleanup. | `canonical_repo_relative_path`, `resolve_nightshift_dir`, `resolve_shift_log_relative_dir`, `resolve_test_runtime_dir` | session #0093 |
+| `owl/eval_runner.py` | 739 | Evaluation runner: score nightshift against a target repo (or dry-run with synthetic data). | `score_artifacts`, `format_eval_table`, `run_eval_dry_run`, `run_eval_full` | session #0093 |
 | `raven/e2e.py` | 113 | End-to-end test runner for Loop 2 feature builds. | `infer_test_command`, `detect_smoke_test`, `run_e2e_tests`, `_MAKEFILE_TEST_TARGET` | 1636b72 |
 | `raven/profiler.py` | 547 | Repo profiling for Loop 2 -- detects language, framework, dependencies, structure. | `profile_repo` | PR #220 (d9e4320) |
-| `owl/cycle.py` | 983 | Per-cycle logic: prompt building, agent dispatch, verification, evaluation. | `extract_json`, `read_repo_instructions`, `wrap_repo_instructions`, `command_for_agent` | session #0092 |
+| `owl/cycle.py` | 983 | Per-cycle logic: prompt building, agent dispatch, verification, evaluation. | `extract_json`, `read_repo_instructions`, `wrap_repo_instructions`, `command_for_agent` | PR #272 (304bb7a) |
 | `raven/planner.py` | 483 | Feature planner for Loop 2 -- builds structured plans from repo profiles. | `build_plan_prompt`, `validate_plan`, `parse_plan`, `execution_order` | 1636b72 |
 | `raven/subagent.py` | 281 | Sub-agent spawner for Loop 2 -- executes work orders via codex or claude CLI. | `spawn_task`, `spawn_wave`, `format_wave_result`, `_TASK_COMPLETION_REQUIRED_KEYS` | 1636b72 |
 | `raven/decomposer.py` | 175 | Task decomposer for Loop 2 -- converts FeaturePlans into sub-agent work orders. | `build_work_order_prompt`, `decompose_plan`, `format_work_orders` | 1636b72 |
 | `raven/integrator.py` | 325 | Wave integrator for Loop 2 -- merges sub-agent work, runs tests, handles failures. | `collect_wave_files`, `stage_files`, `run_test_suite`, `diagnose_failure` | 1636b72 |
 | `raven/feature.py` | 744 | Loop 2 feature-build orchestration and persisted build state. | `feature_state_path`, `feature_log_dir`, `read_feature_state`, `write_feature_state` | PR #208 (a4b3d0e) |
-| `cli.py` | 723 | CLI entry points: run, test, summarize, verify-cycle, module-map. | `run_nightshift`, `summarize`, `verify_cycle_cli`, `plan_feature` | PR #258 (9bf4032) |
+| `cli.py` | 766 | CLI entry points: run, test, summarize, verify-cycle, module-map. | `run_nightshift`, `summarize`, `verify_cycle_cli`, `plan_feature` | PR #258 (9bf4032) |
 | `__main__.py` | 5 | Entry point for python3 -m nightshift. | `main` | 2802c51 |
 | `__init__.py` | 502 | Nightshift -- autonomous overnight codebase improvement agent. | `AGENT_DEFAULT_MODELS`, `BACKEND_DIR_NAMES`, `BACKEND_EXTENSIONS`, `CATEGORY_ORDER` | PR #269 (2e91d5f) |
 
@@ -50,8 +50,8 @@ Topological order derived from internal `nightshift.*` imports.
 
 ## Recent Shipped Sessions
 
+- PR #273: docs: record eval 0020 rerun
+- PR #272: fix: neutralize repo instruction delimiters
+- PR #271: fix: sanitize corrupt state counters
 - PR #268: fix: use --notes-file tempfile in release.py to prevent gh @ file expansion (C-4)
 - PR #269: fix: validate eval_target_repo URL and use mkdtemp for clone dest (C-1, C-2)
-- PR #267: fix: guard int(v) in category_counts, deduplicate VALID_CATEGORIES
-- PR #266: feat: sanitize category_counts on load, add dominance and eval scorer tests
-- PR #265: fix: apply category allowlist to cycle.py dominance guard
diff --git a/.recursive/evaluations/0093.md b/.recursive/evaluations/0093.md
@@ -0,0 +1,21 @@
+# Evaluation 0093
+
+**Date**: 2026-04-09
+**Target**: https://github.com/fazxes/Phractal
+**Agent**: codex
+
+## Scorecard
+
+| Dimension | Score | Max | Notes |
+|-----------|------:|----:|-------|
+| Startup | 8 | 10 | exit=0 |
+| Discovery | 6 | 10 | fixes=2 issues=0 |
+| Fix quality | 10 | 10 | 2/2 structured |
+| Shift log | 3 | 10 | template unfilled |
+| State file | 10 | 10 | 2 structured fixes; category_counts populated |
+| Verification | 10 | 10 | 2/2 passed |
+| Guard rails | 9 | 10 | clean |
+| Clean state | 10 | 10 | clean |
+| Breadth | 6 | 10 | 2 categories |
+| Usefulness | 6 | 10 | fixes=2 tests=0 |
+| **TOTAL** | **78** | **100** | |
diff --git a/.recursive/evaluations/0094.md b/.recursive/evaluations/0094.md
@@ -0,0 +1,21 @@
+# Evaluation 0094
+
+**Date**: 2026-04-09
+**Target**: https://github.com/fazxes/Phractal
+**Agent**: codex
+
+## Scorecard
+
+| Dimension | Score | Max | Notes |
+|-----------|------:|----:|-------|
+| Startup | 8 | 10 | exit=0 |
+| Discovery | 6 | 10 | fixes=2 issues=0 |
+| Fix quality | 10 | 10 | 2/2 structured |
+| Shift log | 3 | 10 | template unfilled |
+| State file | 10 | 10 | 2 structured fixes; category_counts populated |
+| Verification | 10 | 10 | 2/2 passed |
+| Guard rails | 9 | 10 | clean |
+| Clean state | 10 | 10 | clean |
+| Breadth | 6 | 10 | 2 categories |
+| Usefulness | 6 | 10 | fixes=2 tests=0 |
+| **TOTAL** | **78** | **100** | |
diff --git a/.recursive/tasks/0277.md b/.recursive/tasks/0277.md
@@ -1,11 +1,11 @@
 ---
-status: pending
+status: done
 priority: urgent
 target: v0.0.9
 vision_section: self-maintaining
 created: 2026-04-09
 source: evaluation-0020
-completed:
+completed: 2026-04-09
 ---
 
 # Make Claude eval reruns scorable from Claude Code sessions
@@ -24,4 +24,3 @@ Make the eval runner or launch path resilient to Claude-in-Claude execution so a
 - [ ] If Claude-in-Claude remains unsupported, the runner automatically falls back to a supported agent or emits a clear, actionable failure before starting cycles
 - [ ] Regression coverage exists for the nested-session path
 - [ ] A fresh eval rerun produces a scorable report instead of halting after two agent failures
-
diff --git a/nightshift/cli.py b/nightshift/cli.py
@@ -180,17 +180,60 @@ def _write_rejected_cycle_artifact(
     artifact_path.write_text("\n".join(lines).rstrip() + "\n", encoding="utf-8")
 
 
+def _claude_code_session_markers() -> list[str]:
+    """Return environment markers that indicate we are inside Claude Code."""
+    markers = [
+        key
+        for key in os.environ
+        if key == "CLAUDECODE" or key.startswith("CLAUDECODE_") or key.startswith("CLAUDE_CODE_")
+    ]
+    return sorted(markers)
+
+
+def _resolve_runtime_agent(agent: str, *, allow_fallback: bool) -> tuple[str, str | None]:
+    """Return the agent Nightshift should actually launch for this run.
+
+    Claude Code sessions can block nested Claude CLI invocations. When the
+    caller requested Claude and we detect that environment, fall back to Codex
+    if it is available; otherwise raise an actionable failure before cycles
+    start.
+    """
+    if agent != "claude" or not allow_fallback:
+        return agent, None
+
+    markers = _claude_code_session_markers()
+    if not markers:
+        return agent, None
+
+    if command_exists("codex"):
+        note = f"Claude Code session detected via {', '.join(markers)}; falling back from claude to codex for this run."
+        return "codex", note
+
+    marker_text = ", ".join(markers)
+    raise NightshiftError(
+        "Claude Code session detected via "
+        f"{marker_text}, but claude cannot launch nested inside it and codex is not available. "
+        "Install codex or rerun `nightshift test --agent codex --cycles 2 --cycle-minutes 5` "
+        "from a shell without Claude Code active."
+    )
+
+
 def run_nightshift(args: argparse.Namespace, *, test_mode: bool) -> int:
     repo_dir = Path(args.repo_dir or os.getcwd()).resolve()
     if test_mode and not repo_dir.exists():
         _ensure_repo_dir(repo_dir)
     config = merge_config(repo_dir)
     agent = resolve_agent(config, args.agent)
-    config["agent"] = agent
     if getattr(args, "hours", None) is not None:
         config["hours"] = args.hours
     if getattr(args, "cycle_minutes", None) is not None:
         config["cycle_minutes"] = args.cycle_minutes
+    runtime_note: str | None = None
+    if not args.dry_run:
+        agent, runtime_note = _resolve_runtime_agent(agent, allow_fallback=True)
+        if runtime_note:
+            print_status(f"[nightshift] {runtime_note}")
+    config["agent"] = agent
     today = args.date or now_local().strftime("%Y-%m-%d")
     runtime_dir = resolve_runtime_dir(repo_dir, test_mode=test_mode)
     shift_log_dir = resolve_shift_log_relative_dir(repo_dir)

diff --git a/nightshift/core/constants.py b/nightshift/core/constants.py
@@ -805,6 +805,10 @@ def print_status(message: str) -> None:
 # runtime artifacts so `nightshift test` does not dirty the target checkout.
 TEST_RUNTIME_ARTIFACT_DIRNAME = "nightshift-test-runs"
 
+# Optional override used by eval runners to force child test runs to reuse a
+# parent-selected runtime directory.
+TEST_RUNTIME_DIR_ENV = "NIGHTSHIFT_TEST_RUNTIME_DIR"
+
 # --- Release data -----------------------------------------------------------
 
 # Regex to extract the version tag from a changelog filename (e.g. "v0.0.8").

diff --git a/nightshift/infra/worktree.py b/nightshift/infra/worktree.py
@@ -3,6 +3,7 @@
 from __future__ import annotations
 
 import hashlib
+import os
 import shutil
 import subprocess
 import tempfile
@@ -13,6 +14,7 @@
     SAFE_ARTIFACT_GLOBS,
     SHIFT_LOG_TEMPLATE,
     TEST_RUNTIME_ARTIFACT_DIRNAME,
+    TEST_RUNTIME_DIR_ENV,
     now_local,
     print_status,
 )
@@ -75,6 +77,19 @@ def resolve_shift_log_relative_dir(repo_dir: Path) -> str:
 
 def resolve_test_runtime_dir(repo_dir: Path) -> Path:
     """Return an isolated runtime directory for test-mode runs."""
+    override = os.environ.get(TEST_RUNTIME_DIR_ENV)
+    if override:
+        override_path = Path(override)
+        if not override_path.is_absolute():
+            raise NightshiftError(f"{TEST_RUNTIME_DIR_ENV} must be an absolute path inside the system temp directory.")
+        resolved_override = override_path.resolve(strict=False)
+        temp_root = Path(tempfile.gettempdir()).resolve()
+        if resolved_override.parent != temp_root or not resolved_override.name.startswith("nightshift-eval-run-"):
+            raise NightshiftError(
+                f"{TEST_RUNTIME_DIR_ENV} must point to a direct child of {temp_root} with the "
+                "'nightshift-eval-run-' prefix."
+            )
+        return resolved_override
     digest = hashlib.sha256(str(repo_dir).encode("utf-8")).hexdigest()[:12]
     return Path(tempfile.gettempdir()) / TEST_RUNTIME_ARTIFACT_DIRNAME / f"{repo_dir.name}-{digest}"
 

diff --git a/nightshift/owl/eval_runner.py b/nightshift/owl/eval_runner.py
@@ -15,6 +15,7 @@
 from __future__ import annotations
 
 import json
+import os
 import shutil
 import subprocess
 import tempfile
@@ -29,9 +30,10 @@
     EVALUATION_SCORE_THRESHOLD,
     EVALUATION_SHIFT_TIMEOUT,
     EVALUATION_TEMPLATE_MARKERS,
+    TEST_RUNTIME_DIR_ENV,
 )
 from nightshift.core.errors import NightshiftError
-from nightshift.core.shell import validate_repo_url
+from nightshift.core.shell import command_exists, validate_repo_url
 from nightshift.core.types import DimensionScore, EvaluationResult, ShiftArtifacts, ShiftRunResult
 from nightshift.settings.config import merge_config
 
@@ -133,6 +135,42 @@ def _build_synthetic_artifacts() -> ShiftArtifacts:
     )
 
 
+def _claude_code_session_markers() -> list[str]:
+    """Return environment markers that indicate we are inside Claude Code."""
+    markers = [
+        key
+        for key in os.environ
+        if key == "CLAUDECODE" or key.startswith("CLAUDECODE_") or key.startswith("CLAUDE_CODE_")
+    ]
+    return sorted(markers)
+
+
+def _resolve_eval_runtime_agent(agent: str) -> str:
+    """Resolve the actual agent used for a full eval run.
+
+    Claude Code sessions can block nested Claude CLI invocations. For eval
+    reruns we narrow the fallback to the child launch path so the resulting
+    report records the runtime agent that was actually selected.
+    """
+    if agent != "claude":
+        return agent
+
+    markers = _claude_code_session_markers()
+    if not markers:
+        return agent
+
+    if command_exists("codex"):
+        return "codex"
+
+    marker_text = ", ".join(markers)
+    raise NightshiftError(
+        "Claude Code session detected via "
+        f"{marker_text}, but claude cannot launch nested inside it and codex is not available. "
+        "Install codex or rerun `nightshift test --agent codex --cycles 2 --cycle-minutes 5` "
+        "from a shell without Claude Code active."
+    )
+
+
 # ---------------------------------------------------------------------------
 # Scoring (pure -- no I/O)
 # ---------------------------------------------------------------------------
@@ -573,6 +611,7 @@ def run_eval_full(
     # in _build_config(), but we re-validate here immediately before the
     # subprocess call to defend against any future bypass of the config layer.
     validate_repo_url(target)
+    runtime_agent = _resolve_eval_runtime_agent(agent)
 
     eval_dir = repo_dir / ".recursive" / "evaluations"
     eval_id = _next_eval_id(eval_dir)
@@ -598,7 +637,7 @@ def run_eval_full(
         result_data = _run_test_shift_subprocess(
             repo_dir=repo_dir,
             clone_dest=clone_dest,
-            agent=agent,
+            agent=runtime_agent,
             runtime_dir=runtime_dir,
             date=date,
         )
@@ -611,12 +650,11 @@ def run_eval_full(
     dimensions = score_artifacts(artifacts)
     total = sum(d["score"] for d in dimensions)
     max_total = sum(d["max_score"] for d in dimensions)
-
     result = EvaluationResult(
         evaluation_id=eval_id,
         date=date,
         target_repo=target,
-        agent=agent,
+        agent=runtime_agent,
         cycles=EVALUATION_DEFAULT_CYCLES,
         after_task="",
         dimensions=dimensions,
@@ -657,12 +695,15 @@ def _run_test_shift_subprocess(
         date,
     ]
     try:
+        env = os.environ.copy()
+        env[TEST_RUNTIME_DIR_ENV] = str(runtime_dir)
         proc = subprocess.run(
             cmd,
             capture_output=True,
             text=True,
             timeout=EVALUATION_SHIFT_TIMEOUT,
             cwd=str(repo_dir),
+            env=env,
         )
         return ShiftRunResult(exit_code=proc.returncode, stdout=proc.stdout, stderr=proc.stderr)
     except subprocess.TimeoutExpired: