Skip to content
Closed
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
20 changes: 10 additions & 10 deletions .recursive/architecture/MODULE_MAP.md
Original file line number Diff line number Diff line change
@@ -1,6 +1,6 @@
# Module Map

Last updated: 2026-04-09 by session #0092
Last updated: 2026-04-09 by session #0093
Generated via: `python3 -m nightshift module-map --write`
Stale after: 5 newer sessions without a refresh

Expand All @@ -14,28 +14,28 @@ Read it before opening modules one by one when you need fast orientation.
| `core/errors.py` | 7 | Nightshift error types. | `NightshiftError` | 1636b72 |
| `core/types.py` | 594 | Strict type definitions for all Nightshift data structures. | `NightshiftConfig`, `DiffScore`, `Counters`, `Baseline` | PR #231 (1052c38) |
| `settings/eval_targets.py` | 99 | Known evaluation targets and their repo-specific verification settings. | `infer_target_verify_command`, `PHRACTAL_URL`, `_KNOWN_TARGET_VERIFY_COMMANDS` | PR #258 (9bf4032) |
| `core/constants.py` | 847 | Module-level constants and tiny utilities used across the package. | `now_local`, `print_status`, `DATA_VERSION`, `SUPPORTED_AGENTS` | PR #269 (2e91d5f) |
| `core/constants.py` | 851 | Module-level constants and tiny utilities used across the package. | `now_local`, `print_status`, `DATA_VERSION`, `SUPPORTED_AGENTS` | PR #269 (2e91d5f) |
| `raven/summary.py` | 141 | Feature summary generation for Loop 2 build output. | `generate_feature_summary`, `_API_DIR_SEGMENTS`, `_CLI_DIR_SEGMENTS`, `_CONFIG_DIR_SEGMENTS` | 1636b72 |
| `core/shell.py` | 256 | Subprocess execution: streaming runner, git helper, shell utilities. | `run_command`, `run_capture`, `git`, `command_exists` | PR #269 (2e91d5f) |
| `core/state.py` | 237 | Shift state: read, write, mutate counters, JSON I/O. | `load_json`, `write_json`, `read_state`, `top_path` | session #0092 |
| `core/state.py` | 237 | Shift state: read, write, mutate counters, JSON I/O. | `load_json`, `write_json`, `read_state`, `top_path` | PR #271 (2f509ab) |
| `owl/readiness.py` | 234 | Production-readiness checks for Loop 2 feature builds. | `collect_changed_files`, `check_secrets`, `check_debug_prints`, `check_test_coverage` | PR #204 (df36eff) |
| `raven/coordination.py` | 196 | Sub-agent coordination for Loop 2 -- detects file overlaps and generates hints. | `extract_file_references`, `detect_overlaps`, `generate_coordination_hints`, `inject_hints` | PR #229 (c2acba2) |
| `infra/module_map.py` | 473 | Generate a persistent module map for fast cross-session orientation. | `module_map_path`, `generate_module_map`, `render_module_map`, `write_module_map` | PR #251 (c32e527) |
| `infra/multi.py` | 117 | Multi-repo shift orchestration: run hardening loops across multiple repos. | `validate_repos`, `format_multi_summary`, `run_multi_shift` | 1636b72 |
| `infra/release.py` | 327 | Auto-release version tagging -- checks readiness and creates GitHub releases. | `check_and_release`, `find_releasable_version` | PR #268 (3ef4d4c) |
| `owl/scoring.py` | 113 | Post-cycle diff scoring: evaluates production impact of cycle changes. | `score_diff`, `log_score` | 1636b72 |
| `settings/config.py` | 259 | Configuration loading, agent resolution, and environment detection. | `merge_config`, `prompt_for_agent`, `resolve_agent`, `infer_package_manager` | PR #269 (2e91d5f) |
| `infra/worktree.py` | 264 | Git worktree lifecycle: create, shift log, sync, revert, cleanup. | `canonical_repo_relative_path`, `resolve_nightshift_dir`, `resolve_shift_log_relative_dir`, `resolve_test_runtime_dir` | PR #258 (9bf4032) |
| `owl/eval_runner.py` | 698 | Evaluation runner: score nightshift against a target repo (or dry-run with synthetic data). | `score_artifacts`, `format_eval_table`, `run_eval_dry_run`, `run_eval_full` | PR #269 (2e91d5f) |
| `infra/worktree.py` | 279 | Git worktree lifecycle: create, shift log, sync, revert, cleanup. | `canonical_repo_relative_path`, `resolve_nightshift_dir`, `resolve_shift_log_relative_dir`, `resolve_test_runtime_dir` | session #0093 |
| `owl/eval_runner.py` | 739 | Evaluation runner: score nightshift against a target repo (or dry-run with synthetic data). | `score_artifacts`, `format_eval_table`, `run_eval_dry_run`, `run_eval_full` | session #0093 |
| `raven/e2e.py` | 113 | End-to-end test runner for Loop 2 feature builds. | `infer_test_command`, `detect_smoke_test`, `run_e2e_tests`, `_MAKEFILE_TEST_TARGET` | 1636b72 |
| `raven/profiler.py` | 547 | Repo profiling for Loop 2 -- detects language, framework, dependencies, structure. | `profile_repo` | PR #220 (d9e4320) |
| `owl/cycle.py` | 983 | Per-cycle logic: prompt building, agent dispatch, verification, evaluation. | `extract_json`, `read_repo_instructions`, `wrap_repo_instructions`, `command_for_agent` | session #0092 |
| `owl/cycle.py` | 983 | Per-cycle logic: prompt building, agent dispatch, verification, evaluation. | `extract_json`, `read_repo_instructions`, `wrap_repo_instructions`, `command_for_agent` | PR #272 (304bb7a) |
| `raven/planner.py` | 483 | Feature planner for Loop 2 -- builds structured plans from repo profiles. | `build_plan_prompt`, `validate_plan`, `parse_plan`, `execution_order` | 1636b72 |
| `raven/subagent.py` | 281 | Sub-agent spawner for Loop 2 -- executes work orders via codex or claude CLI. | `spawn_task`, `spawn_wave`, `format_wave_result`, `_TASK_COMPLETION_REQUIRED_KEYS` | 1636b72 |
| `raven/decomposer.py` | 175 | Task decomposer for Loop 2 -- converts FeaturePlans into sub-agent work orders. | `build_work_order_prompt`, `decompose_plan`, `format_work_orders` | 1636b72 |
| `raven/integrator.py` | 325 | Wave integrator for Loop 2 -- merges sub-agent work, runs tests, handles failures. | `collect_wave_files`, `stage_files`, `run_test_suite`, `diagnose_failure` | 1636b72 |
| `raven/feature.py` | 744 | Loop 2 feature-build orchestration and persisted build state. | `feature_state_path`, `feature_log_dir`, `read_feature_state`, `write_feature_state` | PR #208 (a4b3d0e) |
| `cli.py` | 723 | CLI entry points: run, test, summarize, verify-cycle, module-map. | `run_nightshift`, `summarize`, `verify_cycle_cli`, `plan_feature` | PR #258 (9bf4032) |
| `cli.py` | 766 | CLI entry points: run, test, summarize, verify-cycle, module-map. | `run_nightshift`, `summarize`, `verify_cycle_cli`, `plan_feature` | PR #258 (9bf4032) |
| `__main__.py` | 5 | Entry point for python3 -m nightshift. | `main` | 2802c51 |
| `__init__.py` | 502 | Nightshift -- autonomous overnight codebase improvement agent. | `AGENT_DEFAULT_MODELS`, `BACKEND_DIR_NAMES`, `BACKEND_EXTENSIONS`, `CATEGORY_ORDER` | PR #269 (2e91d5f) |

Expand All @@ -50,8 +50,8 @@ Topological order derived from internal `nightshift.*` imports.

## Recent Shipped Sessions

- PR #273: docs: record eval 0020 rerun
- PR #272: fix: neutralize repo instruction delimiters
- PR #271: fix: sanitize corrupt state counters
- PR #268: fix: use --notes-file tempfile in release.py to prevent gh @ file expansion (C-4)
- PR #269: fix: validate eval_target_repo URL and use mkdtemp for clone dest (C-1, C-2)
- PR #267: fix: guard int(v) in category_counts, deduplicate VALID_CATEGORIES
- PR #266: feat: sanitize category_counts on load, add dominance and eval scorer tests
- PR #265: fix: apply category allowlist to cycle.py dominance guard
21 changes: 21 additions & 0 deletions .recursive/evaluations/0093.md
Original file line number Diff line number Diff line change
@@ -0,0 +1,21 @@
# Evaluation 0093

**Date**: 2026-04-09
**Target**: https://github.com/fazxes/Phractal
**Agent**: codex

## Scorecard

| Dimension | Score | Max | Notes |
|-----------|------:|----:|-------|
| Startup | 8 | 10 | exit=0 |
| Discovery | 6 | 10 | fixes=2 issues=0 |
| Fix quality | 10 | 10 | 2/2 structured |
| Shift log | 3 | 10 | template unfilled |
| State file | 10 | 10 | 2 structured fixes; category_counts populated |
| Verification | 10 | 10 | 2/2 passed |
| Guard rails | 9 | 10 | clean |
| Clean state | 10 | 10 | clean |
| Breadth | 6 | 10 | 2 categories |
| Usefulness | 6 | 10 | fixes=2 tests=0 |
| **TOTAL** | **78** | **100** | |
21 changes: 21 additions & 0 deletions .recursive/evaluations/0094.md
Original file line number Diff line number Diff line change
@@ -0,0 +1,21 @@
# Evaluation 0094

**Date**: 2026-04-09
**Target**: https://github.com/fazxes/Phractal
**Agent**: codex

## Scorecard

| Dimension | Score | Max | Notes |
|-----------|------:|----:|-------|
| Startup | 8 | 10 | exit=0 |
| Discovery | 6 | 10 | fixes=2 issues=0 |
| Fix quality | 10 | 10 | 2/2 structured |
| Shift log | 3 | 10 | template unfilled |
| State file | 10 | 10 | 2 structured fixes; category_counts populated |
| Verification | 10 | 10 | 2/2 passed |
| Guard rails | 9 | 10 | clean |
| Clean state | 10 | 10 | clean |
| Breadth | 6 | 10 | 2 categories |
| Usefulness | 6 | 10 | fixes=2 tests=0 |
| **TOTAL** | **78** | **100** | |
5 changes: 2 additions & 3 deletions .recursive/tasks/0277.md
Original file line number Diff line number Diff line change
@@ -1,11 +1,11 @@
---
status: pending
status: done
priority: urgent
target: v0.0.9
vision_section: self-maintaining
created: 2026-04-09
source: evaluation-0020
completed:
completed: 2026-04-09
---

# Make Claude eval reruns scorable from Claude Code sessions
Expand All @@ -24,4 +24,3 @@ Make the eval runner or launch path resilient to Claude-in-Claude execution so a
- [ ] If Claude-in-Claude remains unsupported, the runner automatically falls back to a supported agent or emits a clear, actionable failure before starting cycles
- [ ] Regression coverage exists for the nested-session path
- [ ] A fresh eval rerun produces a scorable report instead of halting after two agent failures

45 changes: 44 additions & 1 deletion nightshift/cli.py
Original file line number Diff line number Diff line change
Expand Up @@ -180,17 +180,60 @@ def _write_rejected_cycle_artifact(
artifact_path.write_text("\n".join(lines).rstrip() + "\n", encoding="utf-8")


def _claude_code_session_markers() -> list[str]:
    """Return the environment variable names that mark a Claude Code session.

    A variable counts as a marker when its name is exactly ``CLAUDECODE`` or
    starts with ``CLAUDECODE_`` or ``CLAUDE_CODE_``. Only the names are
    inspected; the values are ignored.

    Returns:
        The matching variable names in sorted order, or an empty list when
        none are set.
    """
    # str.startswith accepts a tuple, so both prefixes are tested in one call.
    marker_prefixes = ("CLAUDECODE_", "CLAUDE_CODE_")
    return sorted(
        key
        for key in os.environ
        if key == "CLAUDECODE" or key.startswith(marker_prefixes)
    )


def _resolve_runtime_agent(agent: str, *, allow_fallback: bool) -> tuple[str, str | None]:
    """Pick the agent that Nightshift should actually launch for this run.

    A Claude CLI started from inside a Claude Code session can be blocked, so
    a request for ``claude`` made from such a session is redirected to Codex
    when the ``codex`` command is available.

    Args:
        agent: The agent the caller asked for.
        allow_fallback: When false, the requested agent is returned untouched
            and the environment is not inspected.

    Returns:
        ``(agent, note)``. ``note`` is ``None`` when the requested agent is
        kept, and a human-readable explanation when the run falls back to
        Codex.

    Raises:
        NightshiftError: Claude was requested inside a Claude Code session and
            ``codex`` is not available to fall back to.
    """
    # Only a fallback-enabled request for claude needs the environment probe.
    fallback_candidate = allow_fallback and agent == "claude"
    session_markers = _claude_code_session_markers() if fallback_candidate else []
    if not session_markers:
        return agent, None

    if not command_exists("codex"):
        detected = ", ".join(session_markers)
        # Fail before any cycle starts, with instructions the user can act on.
        raise NightshiftError(
            "Claude Code session detected via "
            f"{detected}, but claude cannot launch nested inside it and codex is not available. "
            "Install codex or rerun `nightshift test --agent codex --cycles 2 --cycle-minutes 5` "
            "from a shell without Claude Code active."
        )

    fallback_note = (
        f"Claude Code session detected via {', '.join(session_markers)}; "
        "falling back from claude to codex for this run."
    )
    return "codex", fallback_note


def run_nightshift(args: argparse.Namespace, *, test_mode: bool) -> int:
repo_dir = Path(args.repo_dir or os.getcwd()).resolve()
if test_mode and not repo_dir.exists():
_ensure_repo_dir(repo_dir)
config = merge_config(repo_dir)
agent = resolve_agent(config, args.agent)
config["agent"] = agent
if getattr(args, "hours", None) is not None:
config["hours"] = args.hours
if getattr(args, "cycle_minutes", None) is not None:
config["cycle_minutes"] = args.cycle_minutes
runtime_note: str | None = None
if not args.dry_run:
agent, runtime_note = _resolve_runtime_agent(agent, allow_fallback=True)
if runtime_note:
print_status(f"[nightshift] {runtime_note}")
config["agent"] = agent
today = args.date or now_local().strftime("%Y-%m-%d")
runtime_dir = resolve_runtime_dir(repo_dir, test_mode=test_mode)
shift_log_dir = resolve_shift_log_relative_dir(repo_dir)
Expand Down
4 changes: 4 additions & 0 deletions nightshift/core/constants.py
Original file line number Diff line number Diff line change
Expand Up @@ -805,6 +805,10 @@ def print_status(message: str) -> None:
# runtime artifacts so `nightshift test` does not dirty the target checkout.
TEST_RUNTIME_ARTIFACT_DIRNAME = "nightshift-test-runs"

# Optional override used by eval runners to force child test runs to reuse a
# parent-selected runtime directory.
TEST_RUNTIME_DIR_ENV = "NIGHTSHIFT_TEST_RUNTIME_DIR"

# --- Release data -----------------------------------------------------------

# Regex to extract the version tag from a changelog filename (e.g. "v0.0.8").
Expand Down
15 changes: 15 additions & 0 deletions nightshift/infra/worktree.py
Original file line number Diff line number Diff line change
Expand Up @@ -3,6 +3,7 @@
from __future__ import annotations

import hashlib
import os
import shutil
import subprocess
import tempfile
Expand All @@ -13,6 +14,7 @@
SAFE_ARTIFACT_GLOBS,
SHIFT_LOG_TEMPLATE,
TEST_RUNTIME_ARTIFACT_DIRNAME,
TEST_RUNTIME_DIR_ENV,
now_local,
print_status,
)
Expand Down Expand Up @@ -75,6 +77,19 @@ def resolve_shift_log_relative_dir(repo_dir: Path) -> str:

def resolve_test_runtime_dir(repo_dir: Path) -> Path:
    """Return an isolated runtime directory for test-mode runs.

    Without an override, the directory lives under the system temp directory
    in ``TEST_RUNTIME_ARTIFACT_DIRNAME`` and is named after the repo directory
    plus a 12-character SHA-256 digest of its path.

    When the ``TEST_RUNTIME_DIR_ENV`` environment variable is set to a
    non-empty value, that path is used instead. It must be absolute and, once
    resolved, be a direct child of the resolved system temp directory whose
    name starts with ``nightshift-eval-run-``.

    Raises:
        NightshiftError: The override is relative, is not a direct child of
            the temp directory, or lacks the required name prefix.
    """
    requested = os.environ.get(TEST_RUNTIME_DIR_ENV)
    if not requested:
        # Default location: derived from the repo path, no override in effect.
        repo_digest = hashlib.sha256(str(repo_dir).encode("utf-8")).hexdigest()[:12]
        default_root = Path(tempfile.gettempdir()) / TEST_RUNTIME_ARTIFACT_DIRNAME
        return default_root / f"{repo_dir.name}-{repo_digest}"

    requested_path = Path(requested)
    if not requested_path.is_absolute():
        raise NightshiftError(f"{TEST_RUNTIME_DIR_ENV} must be an absolute path inside the system temp directory.")

    # Compare resolved paths on both sides so the parent check is like-for-like.
    candidate = requested_path.resolve(strict=False)
    temp_root = Path(tempfile.gettempdir()).resolve()
    is_direct_child = candidate.parent == temp_root
    has_eval_prefix = candidate.name.startswith("nightshift-eval-run-")
    if not (is_direct_child and has_eval_prefix):
        raise NightshiftError(
            f"{TEST_RUNTIME_DIR_ENV} must point to a direct child of {temp_root} with the "
            "'nightshift-eval-run-' prefix."
        )
    return candidate

Expand Down
49 changes: 45 additions & 4 deletions nightshift/owl/eval_runner.py
Original file line number Diff line number Diff line change
Expand Up @@ -15,6 +15,7 @@
from __future__ import annotations

import json
import os
import shutil
import subprocess
import tempfile
Expand All @@ -29,9 +30,10 @@
EVALUATION_SCORE_THRESHOLD,
EVALUATION_SHIFT_TIMEOUT,
EVALUATION_TEMPLATE_MARKERS,
TEST_RUNTIME_DIR_ENV,
)
from nightshift.core.errors import NightshiftError
from nightshift.core.shell import validate_repo_url
from nightshift.core.shell import command_exists, validate_repo_url
from nightshift.core.types import DimensionScore, EvaluationResult, ShiftArtifacts, ShiftRunResult
from nightshift.settings.config import merge_config

Expand Down Expand Up @@ -133,6 +135,42 @@ def _build_synthetic_artifacts() -> ShiftArtifacts:
)


def _claude_code_session_markers() -> list[str]:
    """Return the environment variable names that mark a Claude Code session.

    A name qualifies when it equals ``CLAUDECODE`` or begins with either
    ``CLAUDECODE_`` or ``CLAUDE_CODE_``. Variable values are not examined.

    Returns:
        Sorted list of the qualifying names; empty when there are none.
    """
    # A tuple argument lets one startswith call cover every accepted prefix.
    prefixes = ("CLAUDECODE_", "CLAUDE_CODE_")
    return sorted(
        name
        for name in os.environ
        if name == "CLAUDECODE" or name.startswith(prefixes)
    )


def _resolve_eval_runtime_agent(agent: str) -> str:
    """Resolve the agent that a full eval run will actually launch.

    A Claude CLI started from inside a Claude Code session can be blocked. When
    ``claude`` is requested from such a session, Codex is selected instead if
    the ``codex`` command is available, so the eval report can record the
    agent that really ran.

    Args:
        agent: The agent requested for the eval.

    Returns:
        ``agent`` unchanged, or ``"codex"`` when falling back from Claude.

    Raises:
        NightshiftError: Claude was requested inside a Claude Code session and
            ``codex`` is not available.
    """
    # Any agent other than claude is launched as requested, without probing.
    session_markers = _claude_code_session_markers() if agent == "claude" else []
    if not session_markers:
        return agent

    if not command_exists("codex"):
        detected = ", ".join(session_markers)
        raise NightshiftError(
            "Claude Code session detected via "
            f"{detected}, but claude cannot launch nested inside it and codex is not available. "
            "Install codex or rerun `nightshift test --agent codex --cycles 2 --cycle-minutes 5` "
            "from a shell without Claude Code active."
        )
    return "codex"


# ---------------------------------------------------------------------------
# Scoring (pure -- no I/O)
# ---------------------------------------------------------------------------
Expand Down Expand Up @@ -573,6 +611,7 @@ def run_eval_full(
# in _build_config(), but we re-validate here immediately before the
# subprocess call to defend against any future bypass of the config layer.
validate_repo_url(target)
runtime_agent = _resolve_eval_runtime_agent(agent)

eval_dir = repo_dir / ".recursive" / "evaluations"
eval_id = _next_eval_id(eval_dir)
Expand All @@ -598,7 +637,7 @@ def run_eval_full(
result_data = _run_test_shift_subprocess(
repo_dir=repo_dir,
clone_dest=clone_dest,
agent=agent,
agent=runtime_agent,
runtime_dir=runtime_dir,
date=date,
)
Expand All @@ -611,12 +650,11 @@ def run_eval_full(
dimensions = score_artifacts(artifacts)
total = sum(d["score"] for d in dimensions)
max_total = sum(d["max_score"] for d in dimensions)

result = EvaluationResult(
evaluation_id=eval_id,
date=date,
target_repo=target,
agent=agent,
agent=runtime_agent,
cycles=EVALUATION_DEFAULT_CYCLES,
after_task="",
dimensions=dimensions,
Expand Down Expand Up @@ -657,12 +695,15 @@ def _run_test_shift_subprocess(
date,
]
try:
env = os.environ.copy()
env[TEST_RUNTIME_DIR_ENV] = str(runtime_dir)
proc = subprocess.run(
cmd,
capture_output=True,
text=True,
timeout=EVALUATION_SHIFT_TIMEOUT,
cwd=str(repo_dir),
env=env,
)
return ShiftRunResult(exit_code=proc.returncode, stdout=proc.stdout, stderr=proc.stderr)
except subprocess.TimeoutExpired:
Expand Down
Loading
Loading