From c31221bec6d8914aaba1b4000a51731b4eeb8f2b Mon Sep 17 00:00:00 2001 From: ndjama Date: Sat, 6 Jun 2026 16:26:33 +0200 Subject: [PATCH 01/19] fix: cgh status display, DuckDB default + empty Endpoints Two cosmetic bugs in cgh status: - A fresh repo with no DB on disk said '(would create graph.db)', but DuckDB has been the default backend since v0.4, so a first index creates graph.duckdb. Corrected the text. - The Endpoints row rendered as a bare ', ' when counts came from the FTS-only or unknown fallback (no graph read). Now shows 'unknown (graph locked)' or 'unknown' to match the Files cell. --- codegraph/cli/commands_monitor.py | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/codegraph/cli/commands_monitor.py b/codegraph/cli/commands_monitor.py index c476d45..4f06c14 100644 --- a/codegraph/cli/commands_monitor.py +++ b/codegraph/cli/commands_monitor.py @@ -544,10 +544,10 @@ def cmd_status(args) -> None: endpoints_cell = f"{endpoint_count:,}" elif counts_source == "fts_only": files_cell = f"[dim]graph locked[/dim]{fts_suffix}" - endpoints_cell = "[dim], [/dim]" + endpoints_cell = "[dim]unknown (graph locked)[/dim]" else: files_cell = "[dim]unknown[/dim]" - endpoints_cell = "[dim], [/dim]" + endpoints_cell = "[dim]unknown[/dim]" table.add_row("Files", files_cell) table.add_row("Endpoints", endpoints_cell) table.add_row("Extra dirs", ", ".join(extra_dirs) if extra_dirs else "[dim]none[/dim]") @@ -624,7 +624,7 @@ def _backend_status_line(root: str) -> str: f"[dim]none on disk[/dim] " f"[dim](CGH_DB={env_value!r}, next `cgh index` writes a {env_backend} DB)[/dim]" ) - return "[dim]none on disk[/dim] [dim](would create graph.db)[/dim]" + return "[dim]none on disk[/dim] [dim](would create graph.duckdb)[/dim]" def _size(p: Path) -> str: size = p.stat().st_size From 2645d1624b85e067b391e13f1fa487e863c033bf Mon Sep 17 00:00:00 2001 From: ndjama Date: Sat, 6 Jun 2026 16:39:32 +0200 Subject: [PATCH 02/19] feat: resolve the codegraph root from any subdirectory, like git cgh treated the working directory literally and only looked for .codegraph/ right there, so running cgh status / index / serve from a subdir of an initialized repo reported no index even though the root one directory up had it. find_codegraph_root walks up to the nearest ancestor with a .codegraph/, the way git finds its repo root via .git. main() resolves --root through it for every command except init/setup (which create in the literal directory) and the internal _serve_owner / _reindex_hook (which get an explicit root). The 'Using codegraph root: ...' hint prints to stderr so stdout and --json output stay clean. Cross-platform: uses pathlib resolve()/.parents, which stop at the drive root on Windows and the filesystem root on POSIX. Nearest .codegraph/ wins, so a federated child's subdir resolves to the child, not the parent. Tests cover current-dir, ancestor, absent, and nearest-wins. --- codegraph/__main__.py | 20 +++++++++++++++ codegraph/core/config.py | 15 +++++++++++ tests/test_core/test_find_root.py | 42 +++++++++++++++++++++++++++++++ 3 files changed, 77 insertions(+) create mode 100644 tests/test_core/test_find_root.py diff --git a/codegraph/__main__.py b/codegraph/__main__.py index 17afb29..93436c6 100644 --- a/codegraph/__main__.py +++ b/codegraph/__main__.py @@ -13,6 +13,7 @@ import argparse import os import sys +from pathlib import Path from rich.panel import Panel from rich.table import Table @@ -445,6 +446,25 @@ def error(self, message: str) -> None: # type: ignore[override] _print_help() return + # Resolve the codegraph root by walking up to the nearest .codegraph/, the + # way git finds its repo root via .git. This lets every command work from + # a subdirectory of an initialized repo. init/setup create in the literal + # directory, and _serve_owner / _reindex_hook get an explicit root from + # their spawner, so those opt out. The hint goes to stderr to keep stdout + # clean for --json output and piping. + _NO_ROOT_WALK = {"init", "setup", "_serve_owner", "_reindex_hook"} + if args.cmd not in _NO_ROOT_WALK and getattr(args, "root", None): + from codegraph.core.config import find_codegraph_root + + discovered = find_codegraph_root(args.root) + if discovered is not None and discovered != Path(args.root).resolve(): + from rich.console import Console as _Console + + _Console(stderr=True).print( + f"[dim]Using codegraph root: {discovered}[/dim]" + ) + args.root = str(discovered) + dispatch = { "init": cmd_init, "setup": cmd_setup, diff --git a/codegraph/core/config.py b/codegraph/core/config.py index 887bf9e..3edf295 100644 --- a/codegraph/core/config.py +++ b/codegraph/core/config.py @@ -47,6 +47,21 @@ CLAUDE_HOME = Path.home() / ".claude" +def find_codegraph_root(start: "str | Path") -> "Path | None": + """Walk up from ``start`` to the nearest ancestor that has a .codegraph/ + directory, the way git finds its repo root via .git. Returns that + directory, or None if none is found up to the filesystem root. + + This lets every read command work from a subdirectory of an initialized + repo: a file deep in the tree still knows it belongs to the cgh root. + """ + p = Path(start).resolve() + for d in [p, *p.parents]: + if (d / CODEGRAPH_DIR).is_dir(): + return d + return None + + def _claude_project_slug_from_abs(abs_path: str) -> str: """Slug Claude Code uses for ~/.claude/projects//. diff --git a/tests/test_core/test_find_root.py b/tests/test_core/test_find_root.py new file mode 100644 index 0000000..d56e089 --- /dev/null +++ b/tests/test_core/test_find_root.py @@ -0,0 +1,42 @@ +# -#-#-#-#-#-#-#-#-#-#-#-#-#-#-#-#-#-#-#-#-#-#-#-#-#-#-#-#-#-#-#-#-#-#-#-#-#-#-#-#-# +# __creation__ = 2026-06-06 +# __author__ = "jndjama (Joy Ndjama)" +# __copyright__ = "Copyright 2026 ALTIKVA." +# __licence__ = "MIT & CC BY-NC-SA (http://www.altikva.com/licenses/LICENSE-1.0)" +# -#-#-#-#-#-#-#-#-#-#-#-#-#-#-#-#-#-#-#-#-#-#-#-#-#-#-#-#-#-#-#-#-#-#-#-#-#-#-#-#-# +# Description: find_codegraph_root walks up to the nearest .codegraph/, the way +# git finds its repo root, so cgh works from any subdirectory. + +from __future__ import annotations + +from codegraph.core.config import find_codegraph_root + + +def test_finds_in_current_dir(tmp_path): + (tmp_path / ".codegraph").mkdir() + assert find_codegraph_root(tmp_path) == tmp_path.resolve() + + +def test_finds_in_ancestor(tmp_path): + (tmp_path / ".codegraph").mkdir() + deep = tmp_path / "a" / "b" / "c" + deep.mkdir(parents=True) + assert find_codegraph_root(deep) == tmp_path.resolve() + + +def test_returns_none_when_absent(tmp_path): + deep = tmp_path / "x" / "y" + deep.mkdir(parents=True) + assert find_codegraph_root(deep) is None + + +def test_nearest_root_wins(tmp_path): + # A federated child has its own .codegraph/ inside the parent's. From the + # child's subdir, the nearest (the child) must win, not the parent. + (tmp_path / ".codegraph").mkdir() + child = tmp_path / "child" + child.mkdir() + (child / ".codegraph").mkdir() + sub = child / "sub" + sub.mkdir() + assert find_codegraph_root(sub) == child.resolve() From e44740199218519b03255c4da4cf550f11285efa Mon Sep 17 00:00:00 2001 From: ndjama Date: Sun, 7 Jun 2026 22:21:27 +0200 Subject: [PATCH 03/19] fix(security): close audit findings in the MCP owner and tools - Auth token compared with hmac.compare_digest instead of != (timing-safe); it is the loopback bridge's only auth check. - Removed the dead auth env-injection path (inject_auth_key_into_mcp_json, validate_server_auth_key were never called) and corrected the auth.py lifecycle docstring: the 0600 file contents are the shared secret, there is no env hand-off. The .codegraph/ dir is now chmod 0700 at creation so auth.key's parent is owner-only too. - index_changed_files rejects a 'since' ref starting with '-' and appends '--', so a value like '--output=PATH' can't be parsed as a git flag (argument injection via the MCP arg). - pattern_search passes the user pattern after '--' (ripgrep) and via '-e' (git-grep), so a pattern like '--pre=sh' can't reach ripgrep's preprocessor (code exec). - force_index refuses absolute paths resolving outside the repo root (new _within_repo guard) and surfaces them as refused_outside_repo, instead of indexing arbitrary files the project never declared. - Pinned the mermaid CDN script to 11.4.1 with an SRI integrity hash and crossorigin, so a compromised CDN can't run JS in the generated report. Tests: _within_repo containment cases; full suite green (391). --- codegraph/analysis/pattern.py | 8 ++-- codegraph/core/config.py | 7 +++ codegraph/server/__init__.py | 6 ++- codegraph/server/tools_index.py | 27 ++++++++++- codegraph/state/auth.py | 48 +++---------------- codegraph/viz/html.py | 4 +- .../test_force_index_containment.py | 29 +++++++++++ 7 files changed, 82 insertions(+), 47 deletions(-) create mode 100644 tests/test_server/test_force_index_containment.py diff --git a/codegraph/analysis/pattern.py b/codegraph/analysis/pattern.py index 9cf2c9b..0b66de2 100644 --- a/codegraph/analysis/pattern.py +++ b/codegraph/analysis/pattern.py @@ -116,8 +116,9 @@ def _run_rg( args.append("--fixed-strings") if glob: args.extend(["--glob", glob]) - args.append(pattern) - args.append(str(root)) + # "--" stops the pattern from being parsed as a flag: without it a + # pattern like "--pre=sh" reaches ripgrep's preprocessor (code exec). + args.extend(["--", pattern, str(root)]) try: r = subprocess.run(args, capture_output=True, text=True, encoding="utf-8", errors="replace", timeout=30) except (subprocess.TimeoutExpired, OSError): @@ -169,7 +170,8 @@ def _run_git_grep( args.append("-F") else: args.append("-E") - args.append(pattern) + # "-e " so a pattern starting with "-" is never read as a flag. + args.extend(["-e", pattern]) if glob: args.extend(["--", glob]) try: diff --git a/codegraph/core/config.py b/codegraph/core/config.py index 3edf295..ea76539 100644 --- a/codegraph/core/config.py +++ b/codegraph/core/config.py @@ -375,6 +375,13 @@ def init_project(root: Path) -> dict: cg_dir.mkdir(parents=True) created.append(str(cg_dir)) + # Restrict the index dir to the owner: auth.key lives here and is the + # whole loopback-auth boundary. No-op on filesystems without POSIX modes. + try: + cg_dir.chmod(0o700) + except OSError: + pass + config_path = cg_dir / CONFIG_FILE if not config_path.exists(): config_path.write_text(generate_default_config(), encoding="utf-8") diff --git a/codegraph/server/__init__.py b/codegraph/server/__init__.py index 140b94d..0cf4464 100644 --- a/codegraph/server/__init__.py +++ b/codegraph/server/__init__.py @@ -361,6 +361,8 @@ def _cleanup(): _atexit.register(_cleanup) # Build auth middleware, rejects any request without the bearer token + import hmac + from starlette.middleware.base import BaseHTTPMiddleware from starlette.responses import JSONResponse from starlette.types import ASGIApp @@ -374,7 +376,9 @@ async def dispatch(self, request, call_next): # Accept any path on 127.0.0.1 with correct bearer header = request.headers.get("authorization", "") expected = f"Bearer {self._token}" - if header != expected: + # Constant-time compare so the loopback port gives no timing oracle + # on the token (this is the system's only auth check). + if not hmac.compare_digest(header, expected): return JSONResponse( {"error": "unauthorized"}, status_code=401, diff --git a/codegraph/server/tools_index.py b/codegraph/server/tools_index.py index 0e270a0..5fc102f 100644 --- a/codegraph/server/tools_index.py +++ b/codegraph/server/tools_index.py @@ -13,6 +13,16 @@ from pathlib import Path +def _within_repo(target: Path, root: Path) -> bool: + """True if ``target`` resolves inside ``root``. Used to keep force_index + from reading arbitrary absolute paths the repo never declared.""" + try: + target.resolve().relative_to(root.resolve()) + return True + except ValueError: + return False + + def _load_config_toml(root: Path) -> tuple[Path, dict]: """Load .codegraph/config.toml. Returns (config_path, data).""" import tomllib @@ -76,8 +86,12 @@ def force_index(paths: list[str], confirmed: bool = False) -> str: # Step 1: Preview, collect files that would be indexed preview_files = [] + refused: list[str] = [] for p in paths: target = Path(p) if os.path.isabs(p) else (root / p) if root else Path(p) + if root is not None and not _within_repo(target, root): + refused.append(str(target)) + continue if target.is_file(): if is_supported(target): preview_files.append(str(target.relative_to(root) if root else target)) @@ -100,6 +114,7 @@ def force_index(paths: list[str], confirmed: bool = False) -> str: ), "files_to_index": preview_files, "file_count": len(preview_files), + "refused_outside_repo": refused, }, indent=2, ) @@ -112,6 +127,10 @@ def force_index(paths: list[str], confirmed: bool = False) -> str: for p in paths: target = Path(p) if os.path.isabs(p) else (root / p) if root else Path(p) + if root is not None and not _within_repo(target, root): + refused.append(str(target)) + continue + if target.is_file(): try: ok = index_file(target, root, force=True) @@ -146,6 +165,7 @@ def force_index(paths: list[str], confirmed: bool = False) -> str: "indexed": indexed, "skipped": skipped, "errors": errors, + "refused_outside_repo": refused, "indexed_count": len(indexed), }, indent=2, @@ -342,7 +362,12 @@ def index_changed_files(since: str = "HEAD~1") -> str: if since == "staged": cmd = ["git", "diff", "--cached", "--name-only", "--diff-filter=ACMR"] else: - cmd = ["git", "diff", "--name-only", "--diff-filter=ACMR", since] + # Reject a leading dash so a value like "--output=/path" can't be + # parsed as a git flag (argument injection via the MCP arg). The + # trailing "--" keeps the ref from being read as a pathspec. + if since.startswith("-"): + return json.dumps({"error": f"invalid git ref: {since!r}"}) + cmd = ["git", "diff", "--name-only", "--diff-filter=ACMR", since, "--"] try: result = subprocess.run( diff --git a/codegraph/state/auth.py b/codegraph/state/auth.py index 8b39348..97e6ab1 100644 --- a/codegraph/state/auth.py +++ b/codegraph/state/auth.py @@ -6,22 +6,22 @@ # -#-#-#-#-#-#-#-#-#-#-#-#-#-#-#-#-#-#-#-#-#-#-#-#-#-#-#-#-#-#-#-#-#-#-#-#-#-#-#-#-# # Description: MCP auth key management: generation, storage, validation. # -# The auth key protects the MCP server from unauthorized access. -# Defense-in-depth for when codegraph moves to HTTP transport. +# The auth key protects the owner's loopback HTTP bridge from other local +# processes. It is the shared secret behind the Bearer-token check. # # Key lifecycle: -# 1. `cgh init` generates the key → .codegraph/auth.key -# 2. `cgh setup` injects it into .mcp.json as CODEGRAPH_AUTH_KEY env var -# 3. Server reads CODEGRAPH_AUTH_KEY on startup and validates requests +# 1. `cgh init` (or the first owner) generates the key -> .codegraph/auth.key, +# mode 0600, gitignored. +# 2. Both the owner and every worker/CLI caller read that file via +# ensure_auth_key() and send `Authorization: Bearer `. +# The file contents are the secret; there is no env-var hand-off. from __future__ import annotations -import os import secrets from pathlib import Path AUTH_KEY_FILE = "auth.key" -AUTH_KEY_ENV = "CODEGRAPH_AUTH_KEY" _CODEGRAPH_DIR = ".codegraph" @@ -84,37 +84,3 @@ def ensure_gitignore_has_auth_key(repo_root: str | Path) -> bool: f.write(f"\n# codegraph auth key (never commit)\n{pattern}\n") return True return False - - -def inject_auth_key_into_mcp_json(repo_root: str | Path, key: str) -> bool: - """ - Add CODEGRAPH_AUTH_KEY to the codegraph server env in .mcp.json. - Returns True if the file was modified. - """ - import json - - mcp_path = Path(repo_root) / ".mcp.json" - if not mcp_path.exists(): - return False - - data = json.loads(mcp_path.read_text(encoding="utf-8")) - servers = data.get("mcpServers", {}) - cg_server = servers.get("codegraph") - if cg_server is None: - return False - - env = cg_server.setdefault("env", {}) - if env.get(AUTH_KEY_ENV) == key: - return False # already set - - env[AUTH_KEY_ENV] = key - mcp_path.write_text(json.dumps(data, indent=2) + "\n", encoding="utf-8") - return True - - -def validate_server_auth_key() -> str | None: - """ - Read the auth key from environment on server startup. - Returns the key if set, None if auth is disabled (no key configured). - """ - return os.environ.get(AUTH_KEY_ENV) diff --git a/codegraph/viz/html.py b/codegraph/viz/html.py index 64c6d0b..0d02d03 100644 --- a/codegraph/viz/html.py +++ b/codegraph/viz/html.py @@ -116,7 +116,9 @@ - +