From 57bb3aa2daa0efd9878029482d180ae92a5723d8 Mon Sep 17 00:00:00 2001 From: Claude Date: Mon, 18 May 2026 06:15:33 +0000 Subject: [PATCH 01/13] feat: bash output cache, diff-aware re-read, TOML/YAML/JSON sections MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Adds three large optimisation surfaces that previously left tokens on the table: 1. Bash output interception. A new PostToolUse(Bash) hook persists large stdout/stderr to disk under `data_dir() / bash_outputs` and records the command in the session cache. On a repeat invocation the pre-Bash hint suggests `token-goat bash-output ` (with `--head`, `--tail`, and `--grep` slicers) instead of re-running. Disk store is 16 MB-capped with oldest-first eviction; per-file outputs above 2 MB are tail-preserved with a truncation marker. New CLI: `bash-output`, `bash-history`. 2. Diff-aware re-read. `post_read` writes a per-session content snapshot (256 KB / 150 entries per session) so a re-read after a Write/Edit produces a unified diff hint instead of either a stale "already read" suggestion (suppressed by the existing edited-after-read guard) or a full file re-Read. The diff is bounded to 4 KB and only fires when the realised saving clears ~250 tokens; below that the existing hint path runs. Stats record both realised savings (`diff_hint`) and the hint's injection cost (`diff_hint_overhead`) for honest accounting. 3. TOML/YAML/JSON section extraction. `.toml`, `.yaml`, `.yml` are now indexed (no new third-party dependency — line scanners plus the stdlib tomllib are sufficient). Pretty-printed JSON gains depth-1 section detection so the existing `token-goat section` flow works on `pyproject.toml::tool.ruff`, `deploy.yaml::spec`, and `package.json::scripts` without falling back to a full Read. Wires the new hook into both Claude Code's settings.json PostToolUse and the Codex config.toml hooks block. Background worker now sweeps stale snapshot directories (24h) and re-evicts the bash-output store on startup. `reset_session` removes per-session snapshots. Tests: six new test modules cover the storage layer, hint builders, hook integration, end-to-end pre/post sequencing, CLI surface, and the new language extractors. --- CHANGELOG.md | 12 + README.md | 5 + src/token_goat/bash_cache.py | 423 +++++++++++++++++++++++++++ src/token_goat/cli.py | 114 ++++++++ src/token_goat/hints.py | 234 ++++++++++++++- src/token_goat/hooks_cli.py | 3 + src/token_goat/hooks_read.py | 295 ++++++++++++++++++- src/token_goat/install.py | 26 +- src/token_goat/languages/json_idx.py | 141 ++++++++- src/token_goat/languages/toml_idx.py | 131 +++++++++ src/token_goat/languages/yaml_idx.py | 194 ++++++++++++ src/token_goat/parser.py | 5 + src/token_goat/session.py | 254 +++++++++++++++- src/token_goat/snapshots.py | 303 +++++++++++++++++++ src/token_goat/worker.py | 35 ++- tests/test_bash_cache.py | 144 +++++++++ tests/test_bash_cli.py | 84 ++++++ tests/test_bash_dedup_hint.py | 86 ++++++ tests/test_diff_hint_integration.py | 74 +++++ tests/test_languages_config.py | 118 ++++++++ tests/test_snapshots.py | 119 ++++++++ 21 files changed, 2772 insertions(+), 28 deletions(-) create mode 100644 src/token_goat/bash_cache.py create mode 100644 src/token_goat/languages/toml_idx.py create mode 100644 src/token_goat/languages/yaml_idx.py create mode 100644 src/token_goat/snapshots.py create mode 100644 tests/test_bash_cache.py create mode 100644 tests/test_bash_cli.py create mode 100644 tests/test_bash_dedup_hint.py create mode 100644 tests/test_diff_hint_integration.py create mode 100644 tests/test_languages_config.py create mode 100644 tests/test_snapshots.py diff --git a/CHANGELOG.md b/CHANGELOG.md index 3952041..805d557 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -4,6 +4,18 @@ All notable changes to Token-Goat are documented in this file. Format follows Ke ## [Unreleased] +### Added + +- **Bash output interception.** A new `PostToolUse(Bash)` hook persists large stdout/stderr to disk under `data_dir() / "bash_outputs"` and records the command in the session cache. When the same command is about to run again in the same session, the pre-Bash hint suggests `token-goat bash-output ` (optionally with `--head N`, `--tail N`, or `--grep PATTERN`) instead of re-executing — avoiding both runtime cost and duplicated tokens. The store is byte-capped (16 MB default) with oldest-first eviction; outputs above 2 MB are tail-preserved with a truncation marker. Two new CLI commands surface the cache: `token-goat bash-output` retrieves a sliced view, `token-goat bash-history` lists cached entries newest-first. +- **Diff-aware re-read.** `post_read` now writes a per-session content snapshot (under `data_dir() / "session_snapshots"`, capped at 256 KB per file and 150 snapshots per session) so a follow-up `Read` after a `Write`/`Edit`/`MultiEdit` can be answered with a unified diff hint instead of a `pre_read` blocking message that silently allowed the full re-read. The diff is bounded to 4 KB and only fires when the realised saving exceeds ~250 tokens; below that the existing session-cache hint path runs unchanged. Stats record both the realised saving (`diff_hint`) and the hint's injection cost (`diff_hint_overhead`) for honest accounting. +- **TOML, YAML, and JSON section extraction.** `token-goat section pyproject.toml::tool.ruff` (and the equivalents for `.yaml`, `.yml`, and pretty-printed `.json`) now extract a single table/key block instead of forcing a full-file read. The TOML scanner emits one `Section` per `[table]` and `[[array]]` header; the YAML scanner emits top-level keys plus one nested layer (`spec.replicas`-style) computed from the file's detected indent; JSON gains depth-1 section detection on pretty-printed files. None of the three pulls in an extra dependency — all use line-scanners and the existing stdlib parsers. +- **Stale-data sweeps in the background worker.** `cleanup_on_startup` now also drops snapshot directories older than 24 hours and enforces the bash-output byte cap, so a long-lived install does not accumulate per-session debris. + +### Changed + +- **`reset_session`** now also removes per-session content snapshots, matching the existing JSON-cache reset semantics. +- **Codex Bash matcher in `~/.codex/config.toml`** now points at the new `post-bash` hook instead of `post-read`; under Codex, `post-read` previously did nothing for `Bash` calls (no branch in the handler), so this is a strict gain. + ## [0.5.2] - 2026-05-17 ### Fixed diff --git a/README.md b/README.md index e5449c8..83875e5 100644 --- a/README.md +++ b/README.md @@ -46,8 +46,11 @@ Each one is preventable. Token-Goat intercepts all three, automatically. |--------------------|-----------------| | 3.3 MB screenshot lands in model context | 84 KB compressed copy — 97.4% smaller | | Agent re-reads files from earlier in the session | "Already read this" reminder with narrow slice suggestion | +| Agent re-reads a file edited mid-session | Unified diff injected as a hint — full Read avoided when the diff covers the change | | Compaction forgets which files were edited | Structured session manifest injected before compact | | Full file read for one function or section | `token-goat read file::symbol` — about 85% smaller | +| Same `pytest` / `cargo` / `git log` re-run mid-session | Pre-Bash dedup hint points at the cached output (`token-goat bash-output `) | +| `token-goat section pyproject.toml::tool.ruff` | One TOML table extracted instead of the whole config; same for `.yaml`, `.yml`, `.json` | > Four hours of use on the author's machine: **59.7 MB** of data that never hit the model, with an estimated **11.5 million tokens** avoided. @@ -190,6 +193,8 @@ The `--openclaw` flag patches Claude Code and drops a TypeScript bridge plugin i | `token-goat map` | Get a compact orientation of the repo. Add `--compact` to fit a 300-token budget. | | `token-goat gdrive-sections ` | List the heading outline of a Google Doc without fetching the body. | | `token-goat stats` | See how many tokens you have saved. Shows a per-source breakdown (image / hint / read / compact). | +| `token-goat bash-output ` | Retrieve a cached Bash output by ID. Filter with `--head N`, `--tail N`, or `--grep PATTERN` to avoid re-running the command. | +| `token-goat bash-history` | List cached Bash outputs (newest first) with their IDs, byte sizes, and exit codes. | | `token-goat compact-hint --session-id ` | Inspect the compaction manifest for a session | | `token-goat install` | Wire up hooks and autostart. `--dry-run` previews the changes, `--verify` audits an existing install. | | `token-goat doctor` | Confirm everything is wired correctly | diff --git a/src/token_goat/bash_cache.py b/src/token_goat/bash_cache.py new file mode 100644 index 0000000..8434c07 --- /dev/null +++ b/src/token_goat/bash_cache.py @@ -0,0 +1,423 @@ +"""Persistent store for cached Bash tool output. + +Every PostToolUse(Bash) hook invocation records the command's stdout/stderr to a +short text file under ``data_dir() / "bash_outputs"`` keyed by a content-derived +ID. Subsequent invocations of the same command in the same session can detect +the duplicate via :func:`session.lookup_bash_entry`, and agents can retrieve +sliced views of any cached output via the ``token-goat bash-output`` CLI. + +Why a separate disk store (vs. session JSON): + +* Bash output can be megabytes (build logs, test runs). Inlining that into the + session JSON would bloat every subsequent load/save round trip on the hot + pre-read path. Storing the bytes once on disk and only a short ID in the + session keeps the session JSON cheap. + +* The CLI retrieval path (``token-goat bash-output``) can stream the file + directly without re-parsing JSON. + +* Retention is simple to bound by total bytes: scan the directory, evict the + oldest files until the cap is met. No cross-session coordination is needed. + +The store is intentionally fail-soft: any I/O error on write is logged and +swallowed so a hook never aborts because the cache is full or read-only. +""" +from __future__ import annotations + +__all__ = [ + "DEFAULT_MAX_TOTAL_BYTES", + "OUTPUT_FILENAME_RE", + "BashOutputMeta", + "command_hash", + "evict_old_entries", + "load_output", + "load_output_meta", + "output_id_for", + "store_output", +] + +import hashlib +import json +import logging +import os +import re +import stat as _stat_module +import time +from dataclasses import asdict, dataclass +from pathlib import Path +from typing import cast + +from . import paths +from .hooks_common import sanitize_log_str + +_LOG = logging.getLogger("token_goat.bash_cache") + +# Total byte budget for the on-disk bash output store. When exceeded, the +# oldest entries (by mtime) are evicted until the cap is met. 16 MB is small +# enough to be invisible on any modern disk while big enough to hold several +# full build/test logs (~1-3 MB each is typical). +DEFAULT_MAX_TOTAL_BYTES: int = 16 * 1024 * 1024 + +# Filename pattern: --.txt +# The components are intentionally kept short so the path stays well within +# any platform's PATH_MAX even when the data dir already lives several levels +# deep (e.g. roaming AppData on Windows). +OUTPUT_FILENAME_RE = re.compile(r"^[a-zA-Z0-9_\-]{1,80}\.txt$") + +# Sentinel placed at the head of every output file marking the truncation +# boundary, so a reader can immediately see when the stored bytes are partial. +_TRUNC_MARKER = "[token-goat: bash output truncated to {n} bytes; full size was {total} bytes]\n" + +# Maximum bytes stored per output file. Larger captures are truncated head-only +# (tail is preserved because the failing portion of a test log is usually at the +# end). 2 MB matches read_replacement._MAX_READ_BYTES so the surgical retrieval +# commands can return the entire stored file when asked. +_MAX_STORED_BYTES: int = 2 * 1024 * 1024 + + +@dataclass +class BashOutputMeta: + """Metadata associated with a cached Bash output entry. + + Persisted in the session cache (small) alongside an ID that points at the + on-disk file (potentially large). Carries everything a future pre-bash + dedup check needs without re-reading the body from disk. + """ + + output_id: str + cmd_sha: str + cmd_preview: str + stdout_bytes: int + stderr_bytes: int + exit_code: int | None + ts: float + truncated: bool + + +def _bash_outputs_dir() -> Path: + """Return ``data_dir() / "bash_outputs"`` and create it on first use.""" + d = paths.data_dir() / "bash_outputs" + d.mkdir(parents=True, exist_ok=True) + return d + + +def command_hash(command: str) -> str: + """Return a short content hash for *command* (first 16 hex chars of SHA-256). + + Commands are compared for dedup purposes only — not authenticated — so a + cryptographic hash is overkill, but SHA-256 is stdlib, fast, and gives a + very low collision rate for the small number of commands ever stored per + session (a few hundred at most). Truncated to 16 chars to keep filenames + short while leaving ~64 bits of collision resistance. + """ + return hashlib.sha256(command.encode("utf-8", errors="replace")).hexdigest()[:16] + + +def output_id_for(session_id: str, command: str, ts: float | None = None) -> str: + """Build a filesystem-safe ID for the (session, command, time) tuple. + + The ID embeds a short session prefix and a millisecond timestamp so two + invocations of the same command in the same session do not collide; both + are kept and the latest wins on dedup lookups, but each cached output + remains addressable for forensic retrieval. + + Session ID is short-prefixed (16 chars) because :func:`session.validate_session_id` + already caps it at 128 chars and stripping to 16 keeps total filename length + under 50 chars. Non-alphanumeric characters are replaced with ``_``. + """ + safe_session = re.sub(r"[^a-zA-Z0-9_\-]", "_", session_id)[:16] or "anon" + ms = int((ts if ts is not None else time.time()) * 1000) + return f"{safe_session}-{ms:013d}-{command_hash(command)}" + + +def _safe_join(output_id: str) -> Path | None: + """Validate *output_id* and return the corresponding cache file path. + + Returns ``None`` (with a warning log) when the ID is malformed — for example + a traversal attempt like ``../etc/passwd`` or an embedded null byte. The + on-disk store is a sibling of other token-goat data; an attacker-influenced + ID must not be able to walk out of it. + """ + if not output_id: + return None + name = f"{output_id}.txt" + if not OUTPUT_FILENAME_RE.match(name): + _LOG.warning("bash_cache: rejected output_id with invalid chars: %r", sanitize_log_str(output_id)) + return None + base = _bash_outputs_dir().resolve() + candidate = (base / name).resolve() + try: + candidate.relative_to(base) + except ValueError: + _LOG.warning("bash_cache: rejected output_id escaping base dir: %r", sanitize_log_str(output_id)) + return None + return candidate + + +def store_output( + session_id: str, + command: str, + stdout: str, + stderr: str, + exit_code: int | None, + *, + max_total_bytes: int = DEFAULT_MAX_TOTAL_BYTES, +) -> BashOutputMeta | None: + """Write *stdout* + *stderr* to the cache and return descriptive metadata. + + Returns ``None`` on any I/O error so the calling hook can degrade silently. + Output larger than ``_MAX_STORED_BYTES`` is tail-preserved (head truncated) + because failing test output is typically at the bottom. After the write the + function opportunistically evicts the oldest files until the total store size + is back under ``max_total_bytes``; the eviction is best-effort and a failed + pass simply leaves the directory slightly over budget — the next call will + try again. + """ + try: + out_id = output_id_for(session_id, command) + path = _safe_join(out_id) + if path is None: + return None + + stdout_bytes = len(stdout.encode("utf-8", errors="replace")) + stderr_bytes = len(stderr.encode("utf-8", errors="replace")) + total = stdout_bytes + stderr_bytes + truncated = False + body_parts: list[str] = [] + + if total > _MAX_STORED_BYTES: + # Preserve the tail: take the last _MAX_STORED_BYTES of the + # combined stream, prefixing a truncation marker so any consumer + # immediately knows what they are looking at. We compose the + # combined stream as stdout then a blank line then stderr; this + # matches what the agent would have seen had it copied the tool + # result directly. + combined = stdout + if stderr: + combined = f"{stdout}\n--- stderr ---\n{stderr}" if stdout else stderr + keep = combined[-_MAX_STORED_BYTES:] + body_parts.append(_TRUNC_MARKER.format(n=_MAX_STORED_BYTES, total=total)) + body_parts.append(keep) + truncated = True + else: + if stdout: + body_parts.append(stdout) + if stderr: + if stdout: + body_parts.append("\n--- stderr ---\n") + body_parts.append(stderr) + + body = "".join(body_parts) + paths.atomic_write_text(path, body) + + meta = BashOutputMeta( + output_id=out_id, + cmd_sha=command_hash(command), + cmd_preview=sanitize_log_str(command, max_len=120), + stdout_bytes=stdout_bytes, + stderr_bytes=stderr_bytes, + exit_code=exit_code, + ts=time.time(), + truncated=truncated, + ) + + # Best-effort eviction. We do not wait or retry: if the directory + # walk fails (e.g. concurrent worker activity, antivirus lock) the + # cap is enforced on the next call. + evict_old_entries(max_total_bytes=max_total_bytes) + + _LOG.debug( + "bash_cache: stored id=%s bytes=%d truncated=%s", + out_id, total, truncated, + ) + return meta + except OSError as exc: + _LOG.warning("bash_cache: store failed: %s", exc) + return None + + +def load_output(output_id: str) -> str | None: + """Return the cached output body for *output_id*, or ``None`` if absent.""" + path = _safe_join(output_id) + if path is None or not path.exists(): + return None + try: + return path.read_text(encoding="utf-8", errors="replace") + except OSError as exc: + _LOG.warning("bash_cache: load failed for %s: %s", sanitize_log_str(output_id), exc) + return None + + +def load_output_meta(output_id: str) -> dict[str, object] | None: + """Return stat-derived metadata for an output file (size, mtime), or None. + + Used by ``token-goat bash-history`` to render a listing without reading + every body. + """ + path = _safe_join(output_id) + if path is None or not path.exists(): + return None + try: + st = path.stat() + except OSError: + return None + return { + "output_id": output_id, + "size_bytes": int(st.st_size), + "mtime": float(st.st_mtime), + } + + +def evict_old_entries(*, max_total_bytes: int = DEFAULT_MAX_TOTAL_BYTES) -> int: + """Evict the oldest files until total size is at or under *max_total_bytes*. + + Returns the number of files removed. Skips symlinks (defensive: an + attacker who can plant a symlink into the cache directory should not be + able to direct deletes elsewhere by name). All errors are swallowed — + eviction is opportunistic, not authoritative. + """ + try: + d = _bash_outputs_dir() + except OSError: + return 0 + + entries: list[tuple[Path, float, int]] = [] + total = 0 + try: + for fp in d.iterdir(): + if not fp.name.endswith(".txt"): + continue + if not OUTPUT_FILENAME_RE.match(fp.name): + continue + try: + st = os.lstat(fp) + except OSError: + continue + if _stat_module.S_ISLNK(st.st_mode): + _LOG.warning("bash_cache: skipping symlink in cache dir: %s", fp.name) + continue + entries.append((fp, float(st.st_mtime), int(st.st_size))) + total += int(st.st_size) + except OSError: + return 0 + + if total <= max_total_bytes: + return 0 + + entries.sort(key=lambda t: t[1]) # oldest first + removed = 0 + for fp, _mtime, size in entries: + if total <= max_total_bytes: + break + try: + fp.unlink() + total -= size + removed += 1 + except OSError: + continue + if removed: + _LOG.info( + "bash_cache: evicted %d entries to fit cap=%d bytes", + removed, max_total_bytes, + ) + return removed + + +def list_outputs() -> list[dict[str, object]]: + """Return metadata for every cached output, newest first. + + Used by ``token-goat bash-history`` for human inspection. Returns an + empty list when the directory is missing or unreadable; never raises. + """ + try: + d = _bash_outputs_dir() + except OSError: + return [] + + results: list[dict[str, object]] = [] + try: + for fp in d.iterdir(): + if not fp.name.endswith(".txt"): + continue + if not OUTPUT_FILENAME_RE.match(fp.name): + continue + try: + st = fp.stat() + except OSError: + continue + results.append({ + "output_id": fp.stem, + "size_bytes": int(st.st_size), + "mtime": float(st.st_mtime), + }) + except OSError: + return results + + # ``mtime`` values are stored as native floats above; the lambda's annotated + # ``object`` return type is broadened by the surrounding ``dict[str, object]``, + # so a cast keeps the sort comparator concrete for mypy without changing + # runtime behaviour. + def _mtime_key(r: dict[str, object]) -> float: + return float(cast(float, r["mtime"])) + + results.sort(key=_mtime_key, reverse=True) + return results + + +def sidecar_meta_path(output_id: str) -> Path | None: + """Return the sidecar JSON metadata path for *output_id*, or None on invalid ID. + + The sidecar stores the structured :class:`BashOutputMeta` so that callers + (CLI, hints) can answer questions like "what was the exit code?" without + re-parsing the body. Sidecar absence is non-fatal: the cache body is + always the source of truth for output text. + """ + base = _safe_join(output_id) + if base is None: + return None + return base.with_suffix(".json") + + +def write_sidecar(meta: BashOutputMeta) -> None: + """Persist *meta* as a JSON sidecar next to its output file (best-effort).""" + p = sidecar_meta_path(meta.output_id) + if p is None: + return + try: + paths.atomic_write_text(p, json.dumps(asdict(meta), ensure_ascii=False)) + except OSError as exc: + _LOG.debug("bash_cache: sidecar write failed for %s: %s", meta.output_id, exc) + + +def read_sidecar(output_id: str) -> BashOutputMeta | None: + """Return parsed :class:`BashOutputMeta` from the sidecar JSON, or None. + + Tolerant of older sidecars that lack fields added later — missing fields + fall back to safe defaults so an old cache survives a token-goat upgrade. + """ + p = sidecar_meta_path(output_id) + if p is None or not p.exists(): + return None + try: + data = json.loads(p.read_text(encoding="utf-8")) + except (OSError, json.JSONDecodeError): + return None + if not isinstance(data, dict): + return None + try: + return BashOutputMeta( + output_id=str(data.get("output_id", output_id)), + cmd_sha=str(data.get("cmd_sha", "")), + cmd_preview=str(data.get("cmd_preview", "")), + stdout_bytes=int(data.get("stdout_bytes", 0)), + stderr_bytes=int(data.get("stderr_bytes", 0)), + exit_code=( + int(data["exit_code"]) + if isinstance(data.get("exit_code"), (int, float)) + else None + ), + ts=float(data.get("ts", 0.0)), + truncated=bool(data.get("truncated", False)), + ) + except (TypeError, ValueError): + return None diff --git a/src/token_goat/cli.py b/src/token_goat/cli.py index 77a6966..3cb2146 100644 --- a/src/token_goat/cli.py +++ b/src/token_goat/cli.py @@ -951,6 +951,111 @@ def stats( cli_stats.stats(window=window, json_output=json_output) +@app.command("bash-output", rich_help_panel="Core") +def cmd_bash_output( + output_id: str = typer.Argument(..., help="ID returned by the post-bash hook or `bash-history`."), + head: int = typer.Option(0, "--head", help="Show first N lines (0 = no head limit)"), + tail: int = typer.Option(0, "--tail", help="Show last N lines (0 = no tail limit)"), + grep: str | None = typer.Option(None, "--grep", "-g", help="Show only lines matching the (case-sensitive) substring"), + json_output: bool = typer.Option(False, "--json"), +) -> None: + """Retrieve a sliced view of a cached Bash output. + + The post-Bash hook stores each non-trivial command output to disk under + ``data_dir() / "bash_outputs"``. Use this command to retrieve specific + parts of that output without forcing the agent to re-run the command — + typically much cheaper in tokens. + + Combine ``--head``, ``--tail``, and ``--grep`` to narrow further; without + any filter the whole cached body is returned. JSON mode includes the + full path and stored byte size so a caller can decide whether to slice + again. + """ + from . import bash_cache # noqa: PLC0415 + + body = bash_cache.load_output(output_id) + if body is None: + _error(f"no cached output for id: {output_id}") + raise typer.Exit(1) + + lines = body.splitlines() + if grep: + lines = [ln for ln in lines if grep in ln] + if head > 0: + lines = lines[: head] + if tail > 0: + lines = lines[-tail :] + sliced = "\n".join(lines) + + if json_output: + meta = bash_cache.load_output_meta(output_id) or {} + sidecar = bash_cache.read_sidecar(output_id) + payload: dict[str, object] = { + "output_id": output_id, + "text": sliced, + "lines": len(lines), + } + payload.update(meta) + if sidecar is not None: + payload["cmd_preview"] = sidecar.cmd_preview + payload["exit_code"] = sidecar.exit_code + payload["truncated"] = sidecar.truncated + typer.echo(json.dumps(payload, ensure_ascii=False, indent=2)) + return + + typer.echo(sliced) + + +@app.command("bash-history", rich_help_panel="Core") +def cmd_bash_history( + json_output: bool = typer.Option(False, "--json"), + limit: int = typer.Option(20, "--limit", "-n", help="Maximum entries to show (newest first)"), +) -> None: + """List cached Bash outputs, newest first. + + Helpful when you want to find an earlier command's output without + re-running it. Each row shows the cache ID, byte size, age, and (if a + sidecar file is present) the command preview and exit code. Use the ID + with ``token-goat bash-output `` to retrieve the body. + """ + from . import bash_cache # noqa: PLC0415 + + entries = bash_cache.list_outputs() + if limit > 0: + entries = entries[:limit] + + if json_output: + out: list[dict[str, object]] = [] + for e in entries: + sidecar = bash_cache.read_sidecar(str(e["output_id"])) + row = dict(e) + if sidecar is not None: + row["cmd_preview"] = sidecar.cmd_preview + row["exit_code"] = sidecar.exit_code + row["truncated"] = sidecar.truncated + out.append(row) + typer.echo(json.dumps(out, ensure_ascii=False, indent=2)) + return + + if not entries: + typer.echo("(no cached Bash outputs)") + return + + now = time.time() + for e in entries: + oid = str(e["output_id"]) + size = int(cast(int, e["size_bytes"])) + age = int(now - float(cast(float, e["mtime"]))) + sidecar = bash_cache.read_sidecar(oid) + cmd_str = sidecar.cmd_preview if sidecar is not None else "(no sidecar)" + exit_str = ( + f" exit={sidecar.exit_code}" + if sidecar is not None and sidecar.exit_code is not None + else "" + ) + typer.echo(f"{oid} {size:>10,}B {age:>6}s ago{exit_str} {cmd_str}") + + @app.command(rich_help_panel="Install") def doctor( # noqa: C901 fix: bool = typer.Option( # noqa: B008 @@ -1176,6 +1281,15 @@ def post_read( hooks_cli.safe_run("post-read", input_file, _parse_harness(harness)) +@hook_app.command(context_settings=_HOOK_CTX) +def post_bash( + input_file: Path | None = _INPUT_OPT, + harness: str = _HARNESS_OPT, +) -> None: + """Hook: post-bash event (caches Bash output for dedup + retrieval).""" + hooks_cli.safe_run("post-bash", input_file, _parse_harness(harness)) + + @hook_app.command(context_settings=_HOOK_CTX) def pre_compact( input_file: Path | None = _INPUT_OPT, diff --git a/src/token_goat/hints.py b/src/token_goat/hints.py index 32f3e27..82e2bc5 100644 --- a/src/token_goat/hints.py +++ b/src/token_goat/hints.py @@ -1,17 +1,24 @@ """Builds informational hints for PreToolUse on Read.""" from __future__ import annotations +import difflib import logging import sqlite3 import time from pathlib import Path from typing import TypedDict -from . import db, session +from . import db, session, snapshots from .hooks_common import sanitize_log_str, validate_cwd from .project import find_project -__all__ = ["ReadHint", "build_read_hint"] +__all__ = [ + "DIFF_HINT_MAX_BYTES", + "ReadHint", + "build_bash_dedup_hint", + "build_diff_hint", + "build_read_hint", +] _LOG = logging.getLogger("token_goat.hints") @@ -554,3 +561,226 @@ def _hint_from_index( f"Use a full Read if you need the surrounding context.", 0, ) + + +# --------------------------------------------------------------------------- +# Diff-aware re-read hint +# --------------------------------------------------------------------------- + +# Largest diff (in bytes of unified-diff output) eligible for inclusion in the +# hint. Beyond this the diff itself stops being a saving — it would push more +# tokens into context than the original Read. 4 KB ≈ 1100 tokens, comfortably +# smaller than even a small full-file Read and still big enough to express +# meaningful refactoring changes (typically tens of changed lines). +DIFF_HINT_MAX_BYTES: int = 4096 + +# Minimum *raw* tokens saved (full-file tokens - diff tokens) before the diff +# hint is emitted. Below this the hint text and diff itself approach the +# saving they advertise, so the nudge is suppressed entirely. ~250 tokens +# represents roughly 15 lines of code — the rough breakeven point with the +# ~80-token hint preamble. +_DIFF_HINT_MIN_TOKENS_SAVED: int = 250 + +# Number of context lines kept around each changed hunk in the unified diff. +# Two lines on each side is the same default git uses for code review — wide +# enough to anchor a hunk visually but narrow enough to keep diff bytes low. +_DIFF_CONTEXT_LINES: int = 2 + + +def build_diff_hint( + *, + session_id: str, + file_path: str, + current_text: str, +) -> ReadHint | None: + """Return a diff-based hint when a snapshot is available and the diff fits. + + Computes a unified diff between the prior session snapshot of *file_path* + and *current_text* (the file's contents the agent is about to re-read). + When the diff is small enough to inject as ``additionalContext`` and + represents a meaningful saving over re-reading the whole file, returns a + :class:`ReadHint` carrying the diff in a fenced code block. + + Returns ``None`` (no hint) when: + + * no snapshot exists for this (session, file_path) + * the snapshot is identical to current contents (no diff to show) + * the file is the same length but no meaningful change is detected + * the diff would exceed :data:`DIFF_HINT_MAX_BYTES` + * the realized saving falls below :data:`_DIFF_HINT_MIN_TOKENS_SAVED` + + Never raises; any unexpected exception is caught at module boundary and + the hint is suppressed (an error in hint generation must not break the + pre-read hook's fail-soft contract). + """ + try: + return _build_diff_hint_inner( + session_id=session_id, file_path=file_path, current_text=current_text, + ) + except Exception as exc: # noqa: BLE001 — fail-soft for the hot pre-read path + _LOG.warning( + "build_diff_hint: unexpected error for %r (session=%s): %s", + file_path, (session_id or "")[:16], exc, exc_info=True, + ) + return None + + +def _build_diff_hint_inner( + *, + session_id: str, + file_path: str, + current_text: str, +) -> ReadHint | None: + """Inner implementation of :func:`build_diff_hint`; may raise.""" + snapshot_bytes = snapshots.load(session_id, file_path) + if snapshot_bytes is None: + return None + + # Decode defensively: snapshots are stored as raw bytes so an arbitrary + # binary file (or one with mixed encodings) does not crash the diff. + snapshot_text = snapshot_bytes.decode("utf-8", errors="replace") + if snapshot_text == current_text: + return None + + fname = _sanitize_hint_path(Path(file_path).name) + + snapshot_lines = snapshot_text.splitlines(keepends=True) + current_lines = current_text.splitlines(keepends=True) + diff_iter = difflib.unified_diff( + snapshot_lines, + current_lines, + fromfile=f"{fname} (previously read)", + tofile=f"{fname} (current)", + n=_DIFF_CONTEXT_LINES, + lineterm="", + ) + diff_text = "".join(diff_iter) + if not diff_text: + # difflib returns nothing when the sequences are identical at the line + # level (e.g. only trailing-newline differences). Treat that as "no + # change worth reporting" — re-read is the safe path. + return None + + diff_bytes = len(diff_text.encode("utf-8")) + if diff_bytes > DIFF_HINT_MAX_BYTES: + _LOG.debug( + "build_diff_hint: diff too large (%d bytes > %d cap) for %s — suppressing", + diff_bytes, DIFF_HINT_MAX_BYTES, fname, + ) + return None + + # Compute the saving: full-file re-read tokens minus diff tokens. Both + # the hint preamble and the fenced diff text cost tokens, so the saving + # we record is the net — what the agent actually avoids in conversation. + full_tokens = _est_tokens_from_chars(len(current_text)) + diff_tokens = _est_tokens_from_chars(diff_bytes) + tokens_saved = max(0, full_tokens - diff_tokens) + if tokens_saved < _DIFF_HINT_MIN_TOKENS_SAVED: + _LOG.debug( + "build_diff_hint: saving too small (%d < %d) for %s — suppressing", + tokens_saved, _DIFF_HINT_MIN_TOKENS_SAVED, fname, + ) + return None + + return ReadHint( + f"Note: `{fname}` was edited in this session since you last read it. " + f"Unified diff against the prior read (saves ~{tokens_saved} tokens vs. a full Read):\n" + f"```diff\n{diff_text}\n```\n" + f"If the diff covers what you need, skip the full Read.", + tokens_saved, + ) + + +# --------------------------------------------------------------------------- +# Bash dedup hint +# --------------------------------------------------------------------------- + + +def build_bash_dedup_hint( + *, + session_id: str, + command: str, + cache: session.SessionCache | None = None, +) -> ReadHint | None: + """Return a hint when *command* was run earlier in this session. + + The pre-Bash hook calls this before executing a Bash command. When the + same command has been run before and its output cached on disk, we suggest + the agent retrieve the cached output via ``token-goat bash-output`` + instead of re-running — avoiding both the runtime cost and the duplicated + output bytes in the conversation. + + Returns ``None`` (no hint) when: + + * no session_id is provided + * the command has never been recorded + * the previous output was too small to be worth deduplicating + * the previous output is older than :data:`STALE_READ_AGE_SECONDS` + (same staleness boundary used by the read-dedup path: above that + window the model's context has likely scrolled past the old result) + """ + try: + return _build_bash_dedup_hint_inner( + session_id=session_id, command=command, cache=cache, + ) + except Exception as exc: # noqa: BLE001 — fail-soft for the hot pre-bash path + _LOG.warning( + "build_bash_dedup_hint: unexpected error (session=%s): %s", + (session_id or "")[:16], exc, exc_info=True, + ) + return None + + +# Minimum output size before the dedup hint fires. Re-running `ls` is cheap; +# re-running `pytest -v` is not. Below ~400 bytes (~100 tokens) the hint +# preamble approaches the saving it advertises. +_BASH_DEDUP_MIN_BYTES: int = 400 + + +def _build_bash_dedup_hint_inner( + *, + session_id: str, + command: str, + cache: session.SessionCache | None, +) -> ReadHint | None: + """Inner implementation; may raise. + + Imported lazily so the hot pre-read path does not pay the bash_cache + import cost on every Read invocation — bash_cache is only needed when + we are actually about to dispatch a Bash dedup. + """ + if not session_id or not command: + return None + + from . import bash_cache # noqa: PLC0415 + + cmd_sha = bash_cache.command_hash(command) + entry = session.lookup_bash_entry(session_id, cmd_sha, cache=cache) + if entry is None: + return None + + age = time.time() - entry.ts + if age > STALE_READ_AGE_SECONDS: + _LOG.debug( + "build_bash_dedup_hint: prior run stale (age=%.0fs > %ds); suppressing", + age, STALE_READ_AGE_SECONDS, + ) + return None + + total_bytes = entry.stdout_bytes + entry.stderr_bytes + if total_bytes < _BASH_DEDUP_MIN_BYTES: + return None + + tokens_avoided = _est_tokens_from_chars(total_bytes) + cmd_short = _sanitize_hint_path(command) + exit_str = "" if entry.exit_code is None else f", exit={entry.exit_code}" + return ReadHint( + f"Note: this Bash command ran ~{int(age)}s ago in this session " + f"({total_bytes:,} bytes of output{exit_str}). " + f"Re-running adds ~{tokens_avoided} tokens. " + f"`token-goat bash-output {entry.output_id}` returns the cached result — " + f"add `--tail 50` or `--grep PATTERN` to slice it. " + f"Command: `{cmd_short}`.", + tokens_avoided, + ) + diff --git a/src/token_goat/hooks_cli.py b/src/token_goat/hooks_cli.py index 4e8be26..96987d4 100644 --- a/src/token_goat/hooks_cli.py +++ b/src/token_goat/hooks_cli.py @@ -322,6 +322,7 @@ def wrapper(payload: HookPayload) -> HookResponse: "pre-fetch": ("hooks_fetch", "pre_fetch"), "post-edit": ("hooks_edit", "post_edit"), "post-read": ("hooks_read", "post_read"), + "post-bash": ("hooks_read", "post_bash"), } _HANDLER_CACHE: dict[str, Callable[[HookPayload], HookResponse]] = {} @@ -359,6 +360,7 @@ def __getattr__(name: str) -> object: "pre_fetch": "pre-fetch", "post_edit": "post-edit", "post_read": "post-read", + "post_bash": "post-bash", } if name in event_map: handler = _resolve_handler(event_map[name]) @@ -446,6 +448,7 @@ def _proxy(payload: HookPayload) -> HookResponse: "pre-fetch": _make_lazy_proxy("pre-fetch"), "post-edit": _make_lazy_proxy("post-edit"), "post-read": _make_lazy_proxy("post-read"), + "post-bash": _make_lazy_proxy("post-bash"), "pre-compact": pre_compact, } diff --git a/src/token_goat/hooks_read.py b/src/token_goat/hooks_read.py index 9c19a58..42e6a0d 100644 --- a/src/token_goat/hooks_read.py +++ b/src/token_goat/hooks_read.py @@ -31,7 +31,7 @@ """ from __future__ import annotations -__all__ = ["post_read", "pre_read"] +__all__ = ["post_bash", "post_read", "pre_read"] from pathlib import Path @@ -146,6 +146,49 @@ def _try_shrink_image( return None +def _try_snapshot( + session_id: str, + file_path: str, + *, + cache: object | None = None, +) -> None: + """Persist a content snapshot for *file_path* so future diff hints can fire. + + Skips files that cannot be read (transient I/O race, permission denied) or + that exceed :data:`snapshots.MAX_SNAPSHOT_BYTES` (the diff would not fit + in a hint anyway). Records the resulting SHA in the session so the + pre-read hook can skip the disk roundtrip when no change has occurred. + """ + from . import session, snapshots # noqa: PLC0415 + + try: + with Path(file_path).open("rb") as fh: + data = fh.read(snapshots.MAX_SNAPSHOT_BYTES + 1) + except OSError as exc: + _LOG.debug( + "post-read snapshot: cannot read %s: %s", + sanitize_log_str(file_path), exc, + ) + return + if len(data) > snapshots.MAX_SNAPSHOT_BYTES: + _LOG.debug( + "post-read snapshot: skipping oversized file %s (%d bytes)", + sanitize_log_str(file_path), len(data), + ) + return + + result = snapshots.store(session_id, file_path, data) + if result is None: + return + try: + session.set_snapshot_sha(session_id, file_path, result.content_sha, cache=cache) + except (ValueError, OSError) as exc: + _LOG.debug( + "post-read snapshot: failed to persist SHA for %s: %s", + sanitize_log_str(file_path), exc, + ) + + def _record_session_hint_impact(file_path: str, hint: str) -> None: """Record net impact of session hints: avoided re-reads minus injection overhead. @@ -183,12 +226,120 @@ def _record_session_hint_impact(file_path: str, hint: str) -> None: ) +def _try_diff_hint( + session_id: str, file_path: str +) -> HookResponse | None: + """Return a diff-hint hook response when one applies, otherwise ``None``. + + Loads *file_path* from disk so the diff builder can compare against the + stored session snapshot. Skips files that cannot be read or that exceed + the snapshot size cap (the snapshot would be missing in that case anyway). + + Records the realized saving as a ``diff_hint`` stat row plus a + ``diff_hint_overhead`` row covering the hint's own injection cost — same + honest-accounting pattern used by the session_hint path. + """ + from . import db, snapshots # noqa: PLC0415 + from .hints import build_diff_hint # noqa: PLC0415 + + try: + with Path(file_path).open("rb") as fh: + current_bytes = fh.read(snapshots.MAX_SNAPSHOT_BYTES + 1) + except OSError as exc: + _LOG.debug("diff-hint: cannot read %s: %s", sanitize_log_str(file_path), exc) + return None + if len(current_bytes) > snapshots.MAX_SNAPSHOT_BYTES: + # Beyond the snapshot cap there is nothing on disk to diff against; + # fall back to the standard hint path. + return None + + current_text = current_bytes.decode("utf-8", errors="replace") + hint = build_diff_hint( + session_id=session_id, file_path=file_path, current_text=current_text, + ) + if hint is None: + return None + + safe_path = sanitize_log_str(file_path, max_len=512) + realized_tokens = hint.tokens_saved + realized_bytes = realized_tokens * 4 + injection_bytes = len(hint) + from .hints import CHARS_PER_TOKEN # noqa: PLC0415 + injection_cost_tokens = max(1, int(injection_bytes / CHARS_PER_TOKEN)) + db.record_stat( + None, "diff_hint", + bytes_saved=realized_bytes, tokens_saved=realized_tokens, detail=safe_path, + ) + db.record_stat( + None, "diff_hint_overhead", + bytes_saved=-injection_bytes, tokens_saved=-injection_cost_tokens, detail=safe_path, + ) + _LOG.info( + "pre-read: diff-hint injected for %s (tokens_saved=%d)", + sanitize_log_str(file_path), realized_tokens, + ) + return pre_tool_use_with_context(str(hint)) + + +def _handle_bash_dedup(payload: HookPayload) -> HookResponse | None: + """Return a dedup hint when this exact Bash command ran earlier in the session. + + Looks up the command's content hash in :attr:`session.SessionCache.bash_history`; + on a hit, suggests retrieving the cached output via ``token-goat bash-output`` + rather than re-running. Returns ``None`` to let the hook fall through to + the normal bash-as-read handling when no dedup hit is available. + """ + from . import db, session # noqa: PLC0415 + from .hints import CHARS_PER_TOKEN, build_bash_dedup_hint # noqa: PLC0415 + + session_id, _cwd = get_session_context(payload) + if not session_id: + return None + + tool_input = get_tool_input(payload) + command = tool_input.get("command") + if not isinstance(command, str) or not command: + return None + + try: + cache = session.load(session_id) + except (OSError, ValueError): + return None + + hint = build_bash_dedup_hint( + session_id=session_id, command=command, cache=cache, + ) + if hint is None: + return None + + realized_tokens = hint.tokens_saved + injection_bytes = len(hint) + injection_cost_tokens = max(1, int(injection_bytes / CHARS_PER_TOKEN)) + db.record_stat( + None, "bash_dedup_hint", + bytes_saved=realized_tokens * 4, tokens_saved=realized_tokens, + detail=sanitize_log_str(command, max_len=200), + ) + db.record_stat( + None, "bash_dedup_hint_overhead", + bytes_saved=-injection_bytes, tokens_saved=-injection_cost_tokens, + detail=sanitize_log_str(command, max_len=200), + ) + _LOG.info( + "pre-read: bash-dedup hint injected (tokens_saved=%d)", realized_tokens, + ) + return pre_tool_use_with_context(str(hint)) + + def pre_read(payload: HookPayload) -> HookResponse: - """Pre-read hook: image shrinking and session-cache hints. + """Pre-read hook: image shrinking, dedup hints, and diff-aware re-read hints. Dispatches based on tool_name: - - Bash: Convert read-equivalent commands (cat, head, etc.) to Read, then recurse. - - Read: Attempt image shrinking, then emit session hints (if cached or large-file candidate). + - Bash: first try dedup against prior bash output; then fall through to + convert read-equivalent commands (cat, head, etc.) to Read and recurse. + - Read: Attempt image shrinking, then emit diff hint (if file was edited + since last read and a snapshot exists) or fall back to session hints + (cached re-read or large-file surgical-read suggestion). - Other: Pass through unchanged (CONTINUE). Returns hook response dict with optional updatedInput (image shrinking) or @@ -199,6 +350,14 @@ def pre_read(payload: HookPayload) -> HookResponse: tool_name = payload.get("tool_name") if tool_name == "Bash": + # Step 1: detect duplicate Bash command from this session. This must + # happen *before* the read-equivalent dispatch because re-running + # `cat file.py` after editing should pull the cached output rather + # than re-dispatching through the Read pipeline. + dedup = _handle_bash_dedup(payload) + if dedup is not None: + return dedup + read_payload = _handle_bash_read_equivalent(payload) if read_payload: # Recurse once with a synthesized Read payload so image-shrink and @@ -233,6 +392,16 @@ def pre_read(payload: HookPayload) -> HookResponse: cache = session.load(session_id) + # Diff-aware path: file was read AND edited in this session AND we have + # a snapshot to compare against. When applicable, the diff hint replaces + # the standard cache hint — both communicate the same idea (you've seen + # this file before) but the diff carries the actually-changed bytes. + entry = cache.files.get(session._normalize_path(file_path)) # type: ignore[attr-defined] + if entry is not None and entry.last_edit_ts > entry.last_read_ts: + diff_response = _try_diff_hint(session_id, file_path) + if diff_response is not None: + return diff_response + hint = build_read_hint( session_id=session_id, file_path=file_path, @@ -287,6 +456,10 @@ def post_read(payload: HookPayload) -> HookResponse: "post-read: recorded Read file=%s offset=%s limit=%s", sanitize_log_str(file_path), offset, limit, ) + # Capture a content snapshot so a future re-read after an edit can + # be served as a small unified diff instead of a full-file Read. + # Best-effort — snapshot failures never block the hook. + _try_snapshot(session_id, file_path, cache=cache) elif tool_name == "Grep": pattern = tool_input.get("pattern") path = tool_input.get("path") @@ -311,3 +484,117 @@ def post_read(payload: HookPayload) -> HookResponse: _LOG.debug("post-read: Glob pattern=%s path=%s", sanitize_opt(pattern), sanitize_opt(path)) return CONTINUE() + + +# --------------------------------------------------------------------------- +# post_bash — record Bash output to the on-disk cache + session history +# --------------------------------------------------------------------------- + + +# Bash outputs smaller than this are not worth caching to disk: the dedup hint +# would suppress on size anyway, and the disk + JSON churn outweighs the +# savings. Aligned with the dedup minimum so we never cache something we +# would later refuse to surface. +_BASH_CACHE_MIN_BYTES: int = 400 + + +def _extract_bash_response(payload: HookPayload) -> tuple[str, str, int | None]: + """Pull (stdout, stderr, exit_code) from a PostToolUse Bash payload. + + Defensive against payload shape drift between harness versions: each field + is read at multiple plausible keys and falls back to empty/None when absent. + Non-string stdout/stderr is coerced via :func:`str` so a future change to + structured output (e.g. JSON tool result) does not crash the hook. + """ + raw_resp = payload.get("tool_response") or payload.get("tool_result") or {} + if not isinstance(raw_resp, dict): + return "", "", None + stdout_val = raw_resp.get("stdout") or raw_resp.get("output") or "" + stderr_val = raw_resp.get("stderr") or "" + exit_val = raw_resp.get("exit_code") + if exit_val is None: + exit_val = raw_resp.get("returncode") + stdout = stdout_val if isinstance(stdout_val, str) else str(stdout_val) + stderr = stderr_val if isinstance(stderr_val, str) else str(stderr_val) + exit_code: int | None = None + if isinstance(exit_val, int) and not isinstance(exit_val, bool): + exit_code = exit_val + return stdout, stderr, exit_code + + +def post_bash(payload: HookPayload) -> HookResponse: + """Post-Bash hook: persist large outputs to disk and record in session history. + + For every PostToolUse(Bash) invocation we: + + 1. Extract stdout/stderr/exit_code from ``tool_response``. + 2. If the combined output is large enough to be worth caching + (``_BASH_CACHE_MIN_BYTES``), write it to the on-disk bash cache and + record a :class:`BashEntry` in the session so a future ``pre_read`` can + dedupe a repeat invocation. + 3. Always return CONTINUE — this hook never blocks, never modifies output. + + Failures at any step are logged at debug and the hook still returns + CONTINUE so a transient I/O issue cannot interrupt the agent. + """ + session_id, _cwd = get_session_context(payload) + tool_input = get_tool_input(payload) + command = tool_input.get("command") + if not isinstance(command, str) or not command: + return CONTINUE() + + stdout, stderr, exit_code = _extract_bash_response(payload) + total_bytes = len(stdout.encode("utf-8", errors="replace")) + len( + stderr.encode("utf-8", errors="replace") + ) + if total_bytes < _BASH_CACHE_MIN_BYTES: + _LOG.debug( + "post-bash: output too small to cache (%d bytes < %d threshold)", + total_bytes, _BASH_CACHE_MIN_BYTES, + ) + return CONTINUE() + if not session_id: + _LOG.debug("post-bash: no session_id; output not cached") + return CONTINUE() + + from . import bash_cache, db, session # noqa: PLC0415 + + meta = bash_cache.store_output( + session_id, command, stdout, stderr, exit_code, + ) + if meta is None: + return CONTINUE() + bash_cache.write_sidecar(meta) + + try: + session.mark_bash_run( + session_id=session_id, + cmd_sha=meta.cmd_sha, + cmd_preview=command, + output_id=meta.output_id, + stdout_bytes=meta.stdout_bytes, + stderr_bytes=meta.stderr_bytes, + exit_code=meta.exit_code, + truncated=meta.truncated, + ) + except (ValueError, OSError) as exc: + _LOG.debug("post-bash: session record failed: %s", exc) + + # Record a stat row for observability. We do NOT claim a saving here: + # the saving is realized when (and if) the agent later avoids a re-run. + # The "bash_output_cached" kind is informational only — stats.py groups + # it under a non-saving bucket so it never inflates the headline number. + try: + db.record_stat( + None, "bash_output_cached", + bytes_saved=0, tokens_saved=0, + detail=sanitize_log_str(command, max_len=200), + ) + except Exception: # noqa: BLE001 — stat logging is best-effort + _LOG.debug("post-bash: stat record failed", exc_info=True) + + _LOG.info( + "post-bash: cached output id=%s bytes=%d exit=%s truncated=%s", + meta.output_id, total_bytes, exit_code, meta.truncated, + ) + return CONTINUE() diff --git a/src/token_goat/install.py b/src/token_goat/install.py index b78f4a0..ae8d7f4 100644 --- a/src/token_goat/install.py +++ b/src/token_goat/install.py @@ -821,6 +821,16 @@ def _hooks_block(binary: str | None = None) -> dict[str, list[_HookMatcherEntry] } ], }, + { + "matcher": "Bash", + "hooks": [ + { + "type": "command", + "command": runner("hook", "post-bash"), + "timeout": 3000, + } + ], + }, ], "PreCompact": [ { @@ -1066,8 +1076,10 @@ def _unpatch_md_block(md_path: Path, begin_marker: str, end_marker: str, not_fou | Find code by meaning, not name | `token-goat semantic "rate limit retry"` | Several rounds of `Grep` | | Get oriented in an unfamiliar repo | `token-goat map --compact` | Recursive `ls` plus multiple `Read` calls | | Outline a long Google Doc | `token-goat gdrive-sections ` | Fetching the whole doc | +| Read one TOML/YAML/JSON config block | `token-goat section "pyproject.toml::tool.ruff"` | `Read pyproject.toml` | +| Re-inspect a recent Bash output | `token-goat bash-output --tail 50` | Re-running the same `pytest`/`cargo`/`git log` | -Modifiers worth knowing: `symbol --all-projects` (cross-repo); `map --compact` (300-token budget); `semantic --max-distance 1.0` or `--no-rerank` to widen / tighten results. A miss prints "Did you mean…?" suggestions — try one before falling back to `Read`. +Modifiers worth knowing: `symbol --all-projects` (cross-repo); `map --compact` (300-token budget); `semantic --max-distance 1.0` or `--no-rerank` to widen / tighten results; `bash-output --grep PATTERN` to filter cached output. A miss prints "Did you mean…?" suggestions — try one before falling back to `Read`. The pre-Bash hook will hint when a command is about to repeat in the same session. Read is the right call when: - The file is under about 200 lines and you need the whole thing. @@ -1130,9 +1142,11 @@ def unpatch_claude_md() -> str: | Find code by meaning, not name | `token-goat semantic "rate limit retry"` | Several rounds of `Grep` | | Get oriented in an unfamiliar repo | `token-goat map --compact` | Recursive `ls` plus multiple `Read` calls | | Outline a long Google Doc | `token-goat gdrive-sections ` | Fetching the whole doc | +| Read one TOML/YAML/JSON config block | `token-goat section "pyproject.toml::tool.ruff"` | `Read pyproject.toml` | +| Re-inspect a recent Bash output | `token-goat bash-output --tail 50` | Re-running `pytest`/`cargo`/`git log` | | See what you have already touched | `token-goat session-touched` | Re-reading and hoping you remember | -Modifiers worth knowing: `symbol --all-projects` searches every indexed repo at once; `map --compact` fits a 300-token budget; `semantic --max-distance 1.0` widens or `--no-rerank` tightens semantic results. A miss prints "Did you mean…?" suggestions — try one of those before falling back to `Read`. +Modifiers worth knowing: `symbol --all-projects` searches every indexed repo at once; `map --compact` fits a 300-token budget; `semantic --max-distance 1.0` widens or `--no-rerank` tightens semantic results; `bash-output --grep PATTERN` filters cached output. A miss prints "Did you mean…?" suggestions — try one of those before falling back to `Read`. ## When Read is the right call @@ -1242,8 +1256,8 @@ def _codex_hooks_block(binary: str | None = None) -> dict[str, list[_HookMatcher "hooks": [ { "type": "command", - "command": runner("hook", "post-read", "--harness", "codex"), - "timeout": 2000, + "command": runner("hook", "post-bash", "--harness", "codex"), + "timeout": 3000, } ], }, @@ -1332,8 +1346,10 @@ def unpatch_codex_config() -> str: | Find code by meaning, not name | `token-goat semantic "rate limit retry"` | Several rounds of `rg` | | Get oriented in an unfamiliar repo | `token-goat map --compact` | `ls -R` plus multiple `cat` calls | | Outline a long Google Doc | `token-goat gdrive-sections ` | Fetching the whole doc | +| Read one TOML/YAML/JSON config block | `token-goat section "pyproject.toml::tool.ruff"` | `cat pyproject.toml` | +| Re-inspect a recent Bash output | `token-goat bash-output --tail 50` | Re-running `pytest`/`cargo`/`git log` | -Modifiers worth knowing: `symbol --all-projects` (cross-repo); `map --compact` (300-token budget); `semantic --max-distance 1.0` or `--no-rerank` to widen / tighten results. A miss prints "Did you mean…?" suggestions — try one before falling back to a Bash read. +Modifiers worth knowing: `symbol --all-projects` (cross-repo); `map --compact` (300-token budget); `semantic --max-distance 1.0` or `--no-rerank` to widen / tighten results; `bash-output --grep PATTERN` to filter cached output. A miss prints "Did you mean…?" suggestions — try one before falling back to a Bash read. The pre-Bash hook will hint when a command is about to repeat in the same session. Plain Bash reads are the right call when: - The file is under about 200 lines and you need the whole thing. diff --git a/src/token_goat/languages/json_idx.py b/src/token_goat/languages/json_idx.py index c89d1cb..9ee26f2 100644 --- a/src/token_goat/languages/json_idx.py +++ b/src/token_goat/languages/json_idx.py @@ -22,6 +22,19 @@ # pretty-printed JSON (nested keys are indented, so they don't match). _TOP_LEVEL_KEY_RE = re.compile(r'^\s*"([^"]+)"\s*:', re.MULTILINE) +# Section-emission pattern: a pretty-printed JSON top-level key. Anchored +# with MULTILINE so we can compute line numbers via positional offsets. +# Captures the column-2 indented form too — common for two-space pretty +# printers — by tolerating any leading whitespace that does not include a +# newline. Section line tracking uses the regex's start offset rather than +# the captured group to keep newline arithmetic accurate. +_SECTION_KEY_RE = re.compile(r'^[ \t]*"([^"]+)"\s*:', re.MULTILINE) +# Maximum number of top-level keys promoted to Section entries per file. +# Mirrors the symbol cap so a giant config file does not flood the section +# table. 100 covers any realistic config (typical .json config files have +# <30 top-level keys). +_MAX_SECTIONS_PER_FILE: int = 100 + # Fallback regex for *minified* JSON, where everything is on a single line so the # MULTILINE anchor in ``_TOP_LEVEL_KEY_RE`` never fires. This pattern is more # permissive and will match nested keys as well, so it's only used when the @@ -40,13 +53,13 @@ def extract(source: bytes, rel_path: str) -> tuple[list[Symbol], list[Ref], list[ImpExp], list[Section]]: - """Extract top-level keys from a JSON file as indexed symbols. + """Extract top-level keys from a JSON file as indexed symbols and Sections. - Only files at or above ``_MIN_JSON_SIZE`` (50 KB) are indexed. Small JSON - files — package.json, tsconfig.json, simple config blobs — are intentionally - skipped because their keys are already known from the filename and indexing - them would inflate the symbol table with dozens of near-identical entries - across every project (``"name"``, ``"version"``, ``"scripts"`` …). + Only files at or above ``_MIN_JSON_SIZE`` (50 KB) are indexed for symbols. + Small JSON files — package.json, tsconfig.json, simple config blobs — are + intentionally skipped because their keys are already known from the filename + and indexing them would inflate the symbol table with dozens of near-identical + entries across every project (``"name"``, ``"version"``, ``"scripts"`` …). For files that meet the size threshold, extraction proceeds in two passes: @@ -62,12 +75,24 @@ def extract(source: bytes, rel_path: str) -> tuple[list[Symbol], list[Ref], list which has no newlines), the permissive ``_ANY_KEY_RE`` is used as a last-resort fallback with key de-duplication. - Symbols are capped at ``_MAX_SYMBOLS`` (200) per file. Refs, imports, and - sections are always empty for JSON files. + Sections (NEW): pretty-printed JSON files additionally get one + :class:`Section` per top-level key, with ``line`` and ``end_line`` covering + the key's value span. This lets ``token-goat section foo.json::scripts`` + pull just that block without touching the whole file. Minified JSON + (all on one line) yields no Sections — there is nothing to slice. """ if len(source) < _MIN_JSON_SIZE: - # File too small; skip indexing - return [], [], [], [] + # File too small for symbol indexing; we still extract Sections for + # pretty-printed files so ``token-goat section`` works on configs like + # ``package.json``. This is the most-requested use case for the + # JSON section path: navigate to one well-known key without a full read. + try: + text_for_sections = source.decode("utf-8", errors="replace") + sections = _extract_sections(text_for_sections) + except (UnicodeDecodeError, AttributeError) as exc: + _LOG.debug("json_idx: section decode failed for %s: %s", rel_path, exc) + sections = [] + return [], [], [], sections text = source.decode("utf-8", errors="replace") symbols: list[Symbol] = [] @@ -79,7 +104,8 @@ def extract(source: bytes, rel_path: str) -> tuple[list[Symbol], list[Ref], list _emit_dict_symbols(symbols, data) elif isinstance(data, list): _emit_array_symbols(symbols, data) - return symbols, [], [], [] + sections = _extract_sections(text) + return symbols, [], [], sections except (json.JSONDecodeError, ValueError) as exc: _LOG.debug("json_idx: full parse failed for %s, falling back to regex: %s", rel_path, exc) @@ -109,7 +135,98 @@ def extract(source: bytes, rel_path: str) -> tuple[list[Symbol], list[Ref], list seen.add(key) symbols.append(Symbol(name=key, kind="json_key", line=1)) - return symbols, [], [], [] + sections = _extract_sections(text) + return symbols, [], [], sections + + +def _extract_sections(text: str) -> list[Section]: + """Return one :class:`Section` per top-level key in pretty-printed JSON. + + Uses a column-anchored regex to find candidate keys at the file's + outermost indent. We then validate each match is *actually* at depth 1 + (immediately inside the root object) by counting opening/closing braces + and brackets in the preceding text — this rejects keys at depth ≥ 2 that + happen to share the file's two-space indent style (rare but possible in + densely nested configs). + + Each Section's ``end_line`` is the line immediately before the next + top-level Section, or the file's last line for the trailing entry. + A minified file (one long line) yields no Sections because no key + matches the column-anchored pattern. + """ + if not text: + return [] + + matches: list[tuple[int, str]] = [] + seen_at_line: set[int] = set() + for m in _SECTION_KEY_RE.finditer(text): + key = m.group(1) + if not key: + continue + depth = _depth_before(text, m.start()) + # depth==1 means we are directly inside the root ``{`` — the only + # depth at which we want to emit a Section. Reject deeper matches. + if depth != 1: + continue + # Line is computed from byte offset to avoid surprises with mixed + # line-endings; ``count("\n")`` works because the regex captures + # column-0 matches in the normalized form. + line = text[: m.start()].count("\n") + 1 + if line in seen_at_line: + # Duplicate at same line — keep only the first match for stable output. + continue + seen_at_line.add(line) + matches.append((line, key)) + if len(matches) >= _MAX_SECTIONS_PER_FILE: + break + + if not matches: + return [] + + total_lines = text.count("\n") + 1 + sections: list[Section] = [] + for i, (line, key) in enumerate(matches): + end_line = matches[i + 1][0] - 1 if i + 1 < len(matches) else total_lines + end_line = max(line, end_line) + sections.append(Section(heading=key, level=1, line=line, end_line=end_line)) + return sections + + +def _depth_before(text: str, offset: int) -> int: + """Compute the brace/bracket depth at *offset* into *text*. + + Walks the text up to ``offset`` and tracks ``{``/``}`` and ``[``/``]`` + nesting while skipping over string literals (so a ``{`` inside a JSON + string value does not falsely increment the depth). Returns the + integer depth — 0 outside the root, 1 inside the root object/array, + 2 inside a one-level-nested object, and so on. + + This is intentionally a manual scanner rather than ``json.loads`` + because the latter would require parsing the full file just to learn + the depth at one offset. The scanner is O(offset); for our use case + (one pass over the file, computing depth at every regex hit) the total + work amortises to O(N). + """ + depth = 0 + in_string = False + escape = False + for i in range(offset): + ch = text[i] + if in_string: + if escape: + escape = False + elif ch == "\\": + escape = True + elif ch == '"': + in_string = False + continue + if ch == '"': + in_string = True + elif ch == "{" or ch == "[": + depth += 1 + elif ch == "}" or ch == "]": + depth -= 1 + return depth def _emit_dict_symbols(symbols: list[Symbol], data: dict) -> None: diff --git a/src/token_goat/languages/toml_idx.py b/src/token_goat/languages/toml_idx.py new file mode 100644 index 0000000..d958f57 --- /dev/null +++ b/src/token_goat/languages/toml_idx.py @@ -0,0 +1,131 @@ +"""TOML extractor — emits one Section per ``[table]`` / ``[[array]]`` header. + +Why a custom scanner rather than ``tomllib``: + +* ``tomllib.loads`` parses TOML into a plain Python dict and discards source + positions. We need start/end line numbers so ``token-goat section`` can + slice the source file back out. + +* The TOML grammar for table headers is unambiguous and easy to recognise + line-by-line: ``[name]`` or ``[[name]]`` at column 0, with the table + spanning every line until the next header (or EOF). A regex scan over the + lines gives correct results without depending on a third-party tree-sitter + grammar. + +Section model +------------- +* ``heading``: the dotted key inside the brackets, e.g. ``tool.ruff``. +* ``level``: 1 for ``[name]`` tables, 2 for ``[[array]]`` array-of-tables + entries. This is purely a convenience for downstream sorting; both flavours + are addressable via the same ``token-goat section file.toml::name`` lookup. +* ``line``: 1-based line of the header. +* ``end_line``: 1-based last line of the section's content (header inclusive), + which is the line immediately before the next header or the file's last + line for the final section. + +Symbols +------- +We also emit one ``toml_key`` symbol per table header so ``token-goat symbol +ruff`` can locate the relevant table in any indexed config file across the +repo. Within-table keys (e.g. ``line-length = 100``) are not indexed +individually — the section payload from a small surgical read already exposes +them, and indexing every leaf would bloat the symbol table for what is +typically a small file. +""" +from __future__ import annotations + +__all__ = ["extract"] + +import logging +import re + +from ..parser import ImpExp, Ref, Section, Symbol + +_LOG = logging.getLogger("token_goat.languages.toml_idx") + +# Maximum table-header line value persisted as ``end_line`` for the last +# section in a file. Pegged at the actual EOF line — TOML files do not have +# nested headers, so the last header runs to the bottom. +_MAX_HEADING_LEN: int = 200 +_MAX_SYMBOLS_PER_FILE: int = 500 + +# Strict TOML table-header regex: +# * Column-0 anchored — no leading whitespace (per the TOML spec). +# * Table name allows the standard bare-key character class plus dots; we +# intentionally accept hyphens and underscores because both are common +# and explicitly allowed by TOML. +# * Trailing comment after the closing bracket is tolerated. +# * Quoted keys (``["tool.ruff"]``) are matched separately because their +# bracket content can contain dots that are *not* path separators. +_BARE_TABLE_RE = re.compile( + r"^(\[\[?)\s*([A-Za-z0-9_\-][A-Za-z0-9_\-.]*)\s*(\]\]?)\s*(?:#.*)?$" +) +_QUOTED_TABLE_RE = re.compile( + r"^(\[\[?)\s*\"([^\"\n]+)\"\s*(\]\]?)\s*(?:#.*)?$" +) + + +def extract( + source: bytes, rel_path: str +) -> tuple[list[Symbol], list[Ref], list[ImpExp], list[Section]]: + """Extract table headers from a TOML file as :class:`Section` entries. + + Always returns four lists (symbols, refs, imports, sections); refs and + imports are empty for TOML — there is no cross-file reference model. + + Tolerant of malformed input: lines that do not match a header pattern + are simply not emitted. A file with no table headers at all produces an + empty result, which is the correct behaviour — there is nothing to index. + """ + try: + text = source.decode("utf-8", errors="replace").replace("\r\n", "\n").replace("\r", "\n") + except (UnicodeDecodeError, AttributeError) as exc: + _LOG.debug("toml_idx: decode failed for %s: %s", rel_path, exc) + return [], [], [], [] + + lines = text.split("\n") + sections: list[Section] = [] + symbols: list[Symbol] = [] + + for idx, line in enumerate(lines, start=1): + # Strip a UTF-8 BOM if present at file start. The regex anchors at + # column 0 and would otherwise miss a header on line 1 of a BOM file. + candidate = line.lstrip("") if idx == 1 else line + # Headers must start at column 0 — leading whitespace makes the line + # either invalid TOML or a key inside an inline table. + if not candidate.startswith("["): + continue + m = _BARE_TABLE_RE.match(candidate) + if m is None: + m = _QUOTED_TABLE_RE.match(candidate) + if m is None: + continue + open_bracket, name, close_bracket = m.group(1), m.group(2).strip(), m.group(3) + # ``[[...]]`` requires matching ``]]``; reject mismatched bracket + # pairs (``[[name]`` or ``[name]]``) as malformed and skip them. + if len(open_bracket) != len(close_bracket): + continue + if not name or len(name) > _MAX_HEADING_LEN: + continue + level = 2 if open_bracket == "[[" else 1 + sections.append( + Section(heading=name, level=level, line=idx) + ) + symbols.append( + Symbol(name=name, kind="toml_key", line=idx) + ) + if len(symbols) >= _MAX_SYMBOLS_PER_FILE: + break + + # Compute end_line for each section. TOML has no nested table structure + # at the source level — every header is a top-level marker — so the end + # of section N is simply the line before section N+1, or the last line of + # the file for the final section. + total = len(lines) + for i, sec in enumerate(sections): + if i + 1 < len(sections): + sec.end_line = max(sec.line, sections[i + 1].line - 1) + else: + sec.end_line = max(sec.line, total) + + return symbols, [], [], sections diff --git a/src/token_goat/languages/yaml_idx.py b/src/token_goat/languages/yaml_idx.py new file mode 100644 index 0000000..69e174b --- /dev/null +++ b/src/token_goat/languages/yaml_idx.py @@ -0,0 +1,194 @@ +"""YAML extractor — emits Sections for top-level keys and (optionally) nested ones. + +Why a line-scanner rather than PyYAML: + +* PyYAML is not a token-goat dependency. Pulling it in just for source-line + positions is disproportionate: the indexer already takes ~1 s on a fresh + install and a YAML parse for every file would add measurable overhead. + +* The structure we need is shallow: top-level keys and (optionally) the keys + one level below. Both can be detected by a line-by-line scan that tracks + column-0 keys (top level) and configurable-indent keys (one level deep). + +What counts as a "section" +-------------------------- +* A line of the form ``^([A-Za-z_][\\w-]*):`` at column 0 starts a top-level + section. Its content runs from that line through the line before the next + column-0 key (or EOF for the last one). + +* Inside each section, lines indented with exactly the file's detected + indent (almost always 2 spaces) of the form ``([A-Za-z_][\\w-]*):`` + are emitted as nested ``parent.child`` sections. This lets callers do + ``token-goat section deployment.yaml::spec.replicas`` instead of pulling + the whole spec block. + +What is intentionally skipped +----------------------------- +* List items (``- foo:``) — these are sequence entries, not keys, and would + bloat the section table with positional noise. +* Multi-document YAML (``---``-separated streams) — we treat the file as a + single logical document. In practice ``---`` is rarely used for source- + code-adjacent YAML (CI configs, ansible playbooks, k8s manifests) where + this hint matters; the rare multi-doc file simply gets its first document + indexed and the rest fall through. +* Lines inside flow-style mappings (``{ … }``) — the line scanner cannot + reliably track flow scope without a full parse, so any line that starts + inside a brace block is left to the read path to handle. +* Comments and blank lines. + +Safety +------ +A pathologically structured file (mixed indents, tabs, alternating styles) +may produce inaccurate end_line values for nested sections. This degrades +gracefully: the worst outcome is that ``token-goat section`` returns a +slightly larger or smaller slice than the user expected, never a crash. +""" +from __future__ import annotations + +__all__ = ["extract"] + +import logging +import re +from collections.abc import Iterable + +from ..parser import ImpExp, Ref, Section, Symbol + +_LOG = logging.getLogger("token_goat.languages.yaml_idx") + +# Largest indent width (in spaces) we treat as a single nesting level. Above +# this the file is assumed to use an unusual style and we suppress nested +# section emission rather than guess wrong. +_MAX_DETECTED_INDENT: int = 8 +# Maximum number of top-level + nested sections combined per file. A +# misbehaving generated YAML (thousands of leaf keys at column 0) could +# otherwise inflate the index without bound. +_MAX_SECTIONS_PER_FILE: int = 400 +# Maximum length of a heading we accept. Real YAML keys are short +# (tens of characters); a giant captured "key" is almost certainly a +# pathological line and we drop it rather than store it. +_MAX_HEADING_LEN: int = 200 + +# Match a top-level key: column-0 anchor, ASCII identifier-ish characters, +# trailing colon. We allow hyphens and dots because those are common in +# real-world YAML (e.g. Kubernetes labels), but stop before ``:`` so the +# captured name does not include the value or inline annotation. +_TOP_KEY_RE = re.compile(r"^([A-Za-z_][A-Za-z0-9_\-.]*)\s*:(?:\s|$)") +# A generic indented key — same body, but with leading spaces. The caller +# decides whether the indent matches a nesting level we are willing to emit. +_INDENTED_KEY_RE = re.compile(r"^( +)([A-Za-z_][A-Za-z0-9_\-.]*)\s*:(?:\s|$)") + + +def _detect_indent(lines: Iterable[str]) -> int: + """Heuristically detect the file's per-level indent width (in spaces). + + Returns the smallest non-zero indent observed on a key-shaped line, capped + at :data:`_MAX_DETECTED_INDENT`. Falls back to ``2`` when no indented + key is found — that is the default for nearly every modern YAML style guide. + Tabs are not supported as indent leaders (rare in modern YAML; the spec + technically forbids them for indentation though some parsers accept them). + """ + smallest = 0 + for line in lines: + if not line or line[0] != " ": + continue + # Skip pure comment/empty lines. + stripped = line.lstrip(" ") + if not stripped or stripped.startswith("#"): + continue + m = _INDENTED_KEY_RE.match(line) + if m is None: + continue + width = len(m.group(1)) + if 0 < width <= _MAX_DETECTED_INDENT and (smallest == 0 or width < smallest): + smallest = width + if smallest == 1: + break + return smallest or 2 + + +def extract( + source: bytes, rel_path: str +) -> tuple[list[Symbol], list[Ref], list[ImpExp], list[Section]]: + """Extract top-level (and one-level-nested) YAML keys as :class:`Section` entries. + + Symbols mirror the section headings as ``yaml_key`` (top level) and + ``yaml_nested_key`` (one level deep). Refs and imports are always empty. + """ + try: + text = source.decode("utf-8", errors="replace").replace("\r\n", "\n").replace("\r", "\n") + except (UnicodeDecodeError, AttributeError) as exc: + _LOG.debug("yaml_idx: decode failed for %s: %s", rel_path, exc) + return [], [], [], [] + + lines = text.split("\n") + if not lines: + return [], [], [], [] + + indent_unit = _detect_indent(lines) + + sections: list[Section] = [] + symbols: list[Symbol] = [] + # Tracks the most recent top-level section so we can prefix nested keys + # with their parent name (``spec.replicas`` rather than just ``replicas``). + current_top: Section | None = None + + for idx, line in enumerate(lines, start=1): + # Strip a UTF-8 BOM if present on line 1; otherwise the column-0 + # regex anchor would miss the first key. + candidate = line.lstrip("") if idx == 1 else line + if not candidate or candidate.startswith("#"): + continue + # Multi-document marker resets the parser state for the next doc. + if candidate.startswith("---") or candidate.startswith("..."): + current_top = None + continue + + # Top-level key (column 0) + m = _TOP_KEY_RE.match(candidate) + if m is not None: + name = m.group(1) + if not name or len(name) > _MAX_HEADING_LEN: + continue + sec = Section(heading=name, level=1, line=idx) + sections.append(sec) + symbols.append(Symbol(name=name, kind="yaml_key", line=idx)) + current_top = sec + if len(sections) >= _MAX_SECTIONS_PER_FILE: + break + continue + + # Nested key at exactly one indent level deep. + m = _INDENTED_KEY_RE.match(candidate) + if m is None or current_top is None: + continue + leading = m.group(1) + if len(leading) != indent_unit: + continue + child_name = m.group(2) + if not child_name or len(child_name) > _MAX_HEADING_LEN: + continue + full_name = f"{current_top.heading}.{child_name}" + if len(full_name) > _MAX_HEADING_LEN: + continue + sections.append( + Section(heading=full_name, level=2, line=idx) + ) + symbols.append( + Symbol(name=full_name, kind="yaml_nested_key", line=idx) + ) + if len(sections) >= _MAX_SECTIONS_PER_FILE: + break + + # End-line computation. Each section runs until the line before the next + # section at the *same or shallower* level — same logic as Markdown's + # heading nesting. The last section runs to EOF. + total = len(lines) + for i, sec in enumerate(sections): + end_line = total + for j in range(i + 1, len(sections)): + if sections[j].level <= sec.level: + end_line = max(sec.line, sections[j].line - 1) + break + sec.end_line = end_line + + return symbols, [], [], sections diff --git a/src/token_goat/parser.py b/src/token_goat/parser.py index 6964741..16d1115 100644 --- a/src/token_goat/parser.py +++ b/src/token_goat/parser.py @@ -60,6 +60,9 @@ ".html": "html", ".htm": "html", ".json": "json", + ".toml": "toml", + ".yaml": "yaml", + ".yml": "yaml", } # Frozenset of all known extensions (already lowercase). Used by iter_source_files @@ -283,6 +286,8 @@ def _factory() -> Extractor: "markdown": _language_importer("markdown"), "html": _language_importer("html"), "json": _language_importer("json_idx"), + "toml": _language_importer("toml_idx"), + "yaml": _language_importer("yaml_idx"), } # Cache resolved extractors so each language module is imported at most once. diff --git a/src/token_goat/session.py b/src/token_goat/session.py index 4532338..4ea6618 100644 --- a/src/token_goat/session.py +++ b/src/token_goat/session.py @@ -23,6 +23,8 @@ from __future__ import annotations __all__ = [ + "BASH_HISTORY_MAX", + "BashEntry", "FileEntry", "GrepEntry", "RESULT_CACHE_MAX", @@ -34,6 +36,8 @@ "list_edited", "list_touched", "load", + "lookup_bash_entry", + "mark_bash_run", "mark_file_edited", "mark_file_read", "mark_grep", @@ -110,6 +114,33 @@ class GrepEntry: result_count: int | None = None # if known +@dataclass +class BashEntry: + """Tracks one execution of a Bash command within a session. + + Stored in :attr:`SessionCache.bash_history` keyed by the SHA prefix of the + command string so a future ``pre_read`` for the same command can quickly + look up its prior output. The body itself lives on disk under the + bash-cache directory and is referenced here only by ``output_id``. + + ``stdout_bytes`` / ``stderr_bytes`` are the *original* sizes (before any + truncation applied by the cache) so dedup hints can quote the real cost of + re-running. ``cmd_preview`` stores up to 120 chars of the command for + human-readable display in ``token-goat bash-history``; the full command is + not persisted because it is recoverable from agent context if needed and + storing arbitrary user input in session JSON is a privacy concern. + """ + + cmd_sha: str + cmd_preview: str + output_id: str + ts: float + stdout_bytes: int + stderr_bytes: int + exit_code: int | None = None + truncated: bool = False + + @dataclass class ResultCacheEntry: """A cached read_symbol/read_section result, keyed elsewhere by (rel_path, item). @@ -144,6 +175,17 @@ class ResultCacheEntry: # rather than reshuffling on every single insertion above the cap. _RESULT_CACHE_EVICT = 25 +# Maximum number of bash-history entries retained per session. Each entry is +# tiny (well under 200 bytes), so 200 is comfortable; the cap exists to keep +# the session JSON size predictable in pathological loops (e.g. a watch-mode +# rerunning every few seconds). FIFO eviction discards the oldest first. +BASH_HISTORY_MAX = 200 +_BASH_HISTORY_EVICT = 50 +# Length of the bash command preview persisted in session JSON. Long enough +# to identify a command across re-runs ("pytest tests/test_x.py -k foo") but +# short enough to keep the manifest output bounded. +_MAX_BASH_PREVIEW = 120 + @dataclass class SessionCache: @@ -167,6 +209,16 @@ class SessionCache: # can hit the cache too — without persistence the cache is useless across the # one-hook-per-tool-call process model that Claude Code uses on Windows. result_cache: dict[str, ResultCacheEntry] = field(default_factory=dict) + # Per-session bash command history keyed by short SHA of the command. Used + # by the pre-Bash dedup hint and by ``token-goat bash-history`` for listing. + # Insertion-ordered dict; FIFO eviction at BASH_HISTORY_MAX prevents growth + # in tight retry loops. + bash_history: dict[str, BashEntry] = field(default_factory=dict) + # Per-session content snapshots used by the diff-aware re-read hint. Maps + # normalized file path → SHA of the snapshot bytes stored on disk under + # ``data_dir() / "session_snapshots" / / .bin``. + # Storing only the SHA here (not the bytes) keeps the session JSON small. + snapshot_shas: dict[str, str] = field(default_factory=dict) unavailable: bool = field(default=False, repr=False, compare=False) # Internal: cached JSON string from last serialization — invalidated by any mutation. # Avoids O(N) re-serialization of files/greps dicts on every hook invocation when @@ -188,6 +240,11 @@ def to_dict(self) -> _SessionDict: k: cast("_ResultCacheEntryDict", asdict(v)) for k, v in self.result_cache.items() }, + bash_history={ + k: cast("_BashEntryDict", asdict(v)) + for k, v in self.bash_history.items() + }, + snapshot_shas=dict(self.snapshot_shas), ) def to_json(self) -> str: @@ -270,6 +327,24 @@ def from_dict(cls, d: dict[str, Any]) -> SessionCache: if rc_entry is not None: result_cache[k] = rc_entry + bash_history: dict[str, BashEntry] = {} + for k, v in d.get("bash_history", {}).items(): + if not isinstance(v, dict) or not isinstance(k, str): + continue + be_entry = _parse_bash_entry(v) + if be_entry is not None: + bash_history[k] = be_entry + + # snapshot_shas: dict[str, str] — coerce values defensively so a + # malformed entry written by a future version (e.g. structured object) + # is dropped silently rather than poisoning the lookup path. + snapshot_shas: dict[str, str] = {} + raw_snaps = d.get("snapshot_shas", {}) + if isinstance(raw_snaps, dict): + for k, v in raw_snaps.items(): + if isinstance(k, str) and isinstance(v, str): + snapshot_shas[k] = v + return cls( session_id=session_id, started_ts=float(d.get("started_ts", now)), @@ -278,6 +353,8 @@ def from_dict(cls, d: dict[str, Any]) -> SessionCache: greps=greps, edited_files=edited_files, result_cache=result_cache, + bash_history=bash_history, + snapshot_shas=snapshot_shas, ) @@ -401,6 +478,46 @@ class _ResultCacheEntryDict(TypedDict, total=False): ts: float +class _BashEntryDict(TypedDict, total=False): + """Wire format of a single BashEntry as it appears in the session JSON.""" + + cmd_sha: str + cmd_preview: str + output_id: str + ts: float + stdout_bytes: int + stderr_bytes: int + exit_code: int | None + truncated: bool + + +def _parse_bash_entry(v: dict[str, Any]) -> BashEntry | None: + """Deserialize one bash-history dict from JSON, returning None on parse error. + + Coerces every field defensively: the session JSON is user-readable on + disk and could be corrupted, partially upgraded, or hand-edited. A bad + entry is dropped (logged at debug) rather than crashing the load path. + """ + try: + raw_exit = v.get("exit_code") + exit_code: int | None = None + if isinstance(raw_exit, int) and not isinstance(raw_exit, bool): + exit_code = raw_exit + return BashEntry( + cmd_sha=str(v.get("cmd_sha", "")), + cmd_preview=str(v.get("cmd_preview", "")), + output_id=str(v.get("output_id", "")), + ts=float(v.get("ts", 0.0)) if isinstance(v.get("ts", 0.0), (int, float)) else 0.0, + stdout_bytes=max(0, int(v.get("stdout_bytes", 0))), + stderr_bytes=max(0, int(v.get("stderr_bytes", 0))), + exit_code=exit_code, + truncated=bool(v.get("truncated", False)), + ) + except (TypeError, ValueError, KeyError) as exc: + _LOG.debug("session: skipping corrupted bash entry: %s", exc) + return None + + class _FileEntryDict(TypedDict, total=False): """Wire format of a single FileEntry as it appears in the session JSON. @@ -428,10 +545,11 @@ class _GrepEntryDict(TypedDict, total=False): class _SessionDict(TypedDict, total=False): """Wire format of a serialized SessionCache (written to / read from JSON on disk). - ``result_cache`` is optional (``total=False``) for backwards compat with - session caches written by token-goat versions that predate the field. All - other fields are still effectively required because :meth:`SessionCache.from_dict` - supplies a default for each one. + ``result_cache``, ``bash_history``, and ``snapshot_shas`` are optional + (``total=False``) for backwards compatibility with session caches written + by token-goat versions that predate these fields. All other fields are + still effectively required because :meth:`SessionCache.from_dict` supplies + a default for each one. """ schema_version: int @@ -443,6 +561,8 @@ class _SessionDict(TypedDict, total=False): greps: list[_GrepEntryDict] edited_files: dict[str, int] result_cache: dict[str, _ResultCacheEntryDict] + bash_history: dict[str, _BashEntryDict] + snapshot_shas: dict[str, str] def _fresh_cache(session_id: str, *, unavailable: bool = False) -> SessionCache: @@ -962,6 +1082,10 @@ def reset_session(session_id: str) -> None: Validates session_id before use (defense-in-depth: paths.session_cache_path also validates, but an explicit guard here makes the invariant obvious at the call site and prevents future callers from bypassing path-level checks). + + Also clears any per-session content snapshots written by the post-read + hook so the diff-aware re-read hint engine cannot serve stale diffs that + pre-date the reset. """ validate_session_id(session_id) p = paths.session_cache_path(session_id) @@ -970,6 +1094,14 @@ def reset_session(session_id: str) -> None: p.unlink() except OSError as e: _LOG.warning("failed to delete session cache %s: %s", p, e) + # Snapshot directory cleanup is best-effort and isolated; failures must + # not propagate up because they are inconsequential to session correctness. + try: + from . import snapshots # noqa: PLC0415 + + snapshots.cleanup_session(session_id) + except Exception: # noqa: BLE001 + _LOG.debug("reset_session: snapshot cleanup failed", exc_info=True) def mark_file_edited( @@ -1130,6 +1262,120 @@ def put_result_cache( ) +def mark_bash_run( + session_id: str, + cmd_sha: str, + cmd_preview: str, + output_id: str, + stdout_bytes: int, + stderr_bytes: int, + exit_code: int | None, + truncated: bool, + *, + cache: SessionCache | None = None, +) -> SessionCache: + """Record a Bash invocation in the per-session history. + + *cmd_sha* is a short content-derived identifier (see :func:`bash_cache.command_hash`). + Storing only the SHA — not the full command — keeps the session JSON small + and avoids persisting potentially sensitive command arguments + (credentials, file paths) longer than necessary. ``cmd_preview`` is the + first 120 characters of the command, which is enough to identify a re-run + while remaining bounded. + + FIFO eviction batches removals at ``_BASH_HISTORY_EVICT`` so a hot retry + loop does not rewrite the dict on every single insert. + """ + try: + cache = _resolve_cache(session_id, cache) + except ValueError as exc: + _LOG.warning("mark_bash_run: invalid session_id (%s); skipping", exc) + return cache or _fresh_cache(session_id) + if cache.unavailable: + return cache + + # Sanitize the preview before storage: command strings can contain newlines + # (here-docs) and bidi controls that would corrupt the manifest output. + safe_preview = sanitize_log_str(cmd_preview, max_len=_MAX_BASH_PREVIEW) + + now = time.time() + # Evict oldest entries when at capacity — but only when adding a new key. + # Updates to an existing cmd_sha keep their original insertion slot so the + # eviction order reflects "first seen, first evicted". + if cmd_sha not in cache.bash_history and len(cache.bash_history) >= BASH_HISTORY_MAX: + evict_keys = list(islice(cache.bash_history.keys(), _BASH_HISTORY_EVICT)) + for k in evict_keys: + del cache.bash_history[k] + _LOG.debug( + "bash_history: evicted %d entries (cap=%d) for session=%s", + _BASH_HISTORY_EVICT, BASH_HISTORY_MAX, session_id[:16], + ) + + cache.bash_history[cmd_sha] = BashEntry( + cmd_sha=cmd_sha, + cmd_preview=safe_preview, + output_id=output_id, + ts=now, + stdout_bytes=max(0, int(stdout_bytes)), + stderr_bytes=max(0, int(stderr_bytes)), + exit_code=exit_code if isinstance(exit_code, int) and not isinstance(exit_code, bool) else None, + truncated=bool(truncated), + ) + cache.last_activity_ts = now + cache._invalidate_json_cache() + save(cache) + return cache + + +def lookup_bash_entry( + session_id: str, cmd_sha: str, *, cache: SessionCache | None = None +) -> BashEntry | None: + """Return the :class:`BashEntry` for *cmd_sha* in *session_id*, or None.""" + try: + cache = _resolve_cache(session_id, cache) + except ValueError: + return None + if cache.unavailable: + return None + return cache.bash_history.get(cmd_sha) + + +def set_snapshot_sha( + session_id: str, + file_path: str, + content_sha: str, + *, + cache: SessionCache | None = None, +) -> SessionCache: + """Record that a snapshot for *file_path* with hash *content_sha* exists on disk. + + Stored separately from :attr:`SessionCache.files` so the snapshot index can + be queried without loading file entries, and so a missing/empty snapshot + does not invalidate the read-tracking state. + """ + prep = _prepare_path_mutation(session_id, file_path, cache) + if prep is None: + return cache or _fresh_cache(session_id) + cache, key = prep + cache.snapshot_shas[key] = content_sha + cache._invalidate_json_cache() + save(cache) + return cache + + +def get_snapshot_sha( + session_id: str, file_path: str, *, cache: SessionCache | None = None +) -> str | None: + """Return the stored snapshot SHA for *file_path*, or None when absent.""" + try: + cache = _resolve_cache(session_id, cache) + except ValueError: + return None + if cache.unavailable: + return None + return cache.snapshot_shas.get(_normalize_path(file_path)) + + def cleanup_stale(max_age_hours: float = 24.0) -> int: """Delete session cache files older than max_age_hours. Returns count removed.""" removed = 0 diff --git a/src/token_goat/snapshots.py b/src/token_goat/snapshots.py new file mode 100644 index 0000000..84d3074 --- /dev/null +++ b/src/token_goat/snapshots.py @@ -0,0 +1,303 @@ +"""Per-session content snapshots used for diff-aware re-read hints. + +When a file is read inside a Claude session, ``post_read`` captures a copy of +its contents under ``data_dir() / "session_snapshots" / ""``. +If the agent later edits the file and tries to re-read it, the pre-read hook +computes a unified diff against the stored snapshot and offers the agent the +diff as ``additionalContext`` so it can decide whether the full re-read is +still warranted. + +Design notes +------------ +* Snapshots are scoped to a single session. We do not share snapshots across + sessions because each session has its own context window and a "you already + read this" claim only makes sense within the same conversation. + +* Snapshots live on disk (not in the session JSON) so a single 200 KB file + does not push the session cache to half a megabyte on every read. + +* Snapshot filenames are derived from the SHA of the file path so the on-disk + layout is flat and a long file path never blows out PATH_MAX. + +* Files larger than :data:`MAX_SNAPSHOT_BYTES` are not snapshotted. The diff + would be too large to inject as a hint anyway, and the snapshot store is + bounded by per-session total size + per-file size caps. + +* Snapshots are best-effort: any I/O error is logged and swallowed. A missing + snapshot simply means the pre-read hook falls back to its existing behaviour + (suppress the hint when the file has been edited since last read). + +Concurrency +----------- +Snapshots are written via :func:`paths.atomic_write_bytes` so a concurrent +reader sees either the old complete file or the new complete file — never a +partial write. We rely on the same write-and-rename invariant the session +cache uses; no additional locking is needed because the unique-per-(session, +path) filenames mean two hooks cannot legitimately race on the same key. +""" +from __future__ import annotations + +__all__ = [ + "MAX_SNAPSHOT_BYTES", + "MAX_SNAPSHOTS_PER_SESSION", + "SnapshotResult", + "cleanup_session", + "load", + "snapshot_path", + "store", +] + +import contextlib +import hashlib +import logging +import os +import re +import stat as _stat_module +import time +from dataclasses import dataclass +from pathlib import Path + +from . import paths +from .hooks_common import sanitize_log_str + +_LOG = logging.getLogger("token_goat.snapshots") + +# Largest file size eligible for snapshotting. Beyond this the diff itself +# would not fit comfortably in a hint, so we save nothing rather than store +# bytes we will never use. 256 KB covers nearly every source file (a 10K LoC +# file averages ~300 KB at 30 chars/line). +MAX_SNAPSHOT_BYTES: int = 256 * 1024 + +# Per-session ceiling on snapshot count. Above this the oldest snapshot is +# evicted when a new one is taken. 150 covers any realistic session — even +# a long refactor rarely reads 150 distinct files. +MAX_SNAPSHOTS_PER_SESSION: int = 150 + +# Used to scrub session_id before embedding it in a directory name. The +# session module already validates session_id against a stricter regex, but +# we apply a second pass here so this module is safe to call even when a +# caller bypassed validation. +_SESSION_DIR_RE = re.compile(r"[^a-zA-Z0-9_\-]") + + +@dataclass +class SnapshotResult: + """Outcome of :func:`store` — what was written and where. + + A non-None ``path`` indicates the snapshot exists on disk and can be + loaded later via :func:`load`. ``content_sha`` is the SHA-256 hex digest + of the stored bytes, used by the pre-read hint logic to short-circuit when + the on-disk file's SHA hasn't changed since the snapshot. + """ + + path: Path + content_sha: str + size_bytes: int + + +def _session_dir(session_id: str) -> Path | None: + """Resolve the snapshots directory for *session_id*, or None on invalid input.""" + if not session_id: + return None + safe = _SESSION_DIR_RE.sub("_", session_id)[:64] or "anon" + base = (paths.data_dir() / "session_snapshots").resolve() + candidate = (base / safe).resolve() + try: + candidate.relative_to(base) + except ValueError: + _LOG.warning("snapshots: session_dir escaped base for %r", sanitize_log_str(session_id)) + return None + return candidate + + +def _path_key(file_path: str) -> str: + """Return the on-disk filename component for *file_path*. + + Hashes the absolute or relative path so a long real path becomes a short + stable filename. Truncated to 32 hex chars — ~128 bits of collision + resistance, more than enough for a per-session set of at most ~150 entries. + """ + return hashlib.sha256(file_path.encode("utf-8", errors="replace")).hexdigest()[:32] + + +def snapshot_path(session_id: str, file_path: str) -> Path | None: + """Return the snapshot file path for ``(session_id, file_path)``, or None. + + Always returns a path even when the snapshot does not yet exist. Callers + can use :meth:`Path.exists` to distinguish. + """ + d = _session_dir(session_id) + if d is None: + return None + return d / f"{_path_key(file_path)}.bin" + + +def _evict_oldest(d: Path, max_count: int) -> int: + """Drop the oldest snapshots in *d* until at most *max_count* remain. + + Returns the number of files removed. Silently ignores I/O errors so a + transient permission glitch does not abort the snapshot write the caller + is about to attempt. + """ + try: + entries = [ + (p, p.stat().st_mtime) + for p in d.iterdir() + if p.is_file() and not p.is_symlink() + ] + except OSError: + return 0 + if len(entries) <= max_count: + return 0 + entries.sort(key=lambda t: t[1]) + removed = 0 + over = len(entries) - max_count + for p, _mtime in entries[:over]: + try: + p.unlink() + removed += 1 + except OSError: + continue + if removed: + _LOG.debug("snapshots: evicted %d entries from %s (cap=%d)", removed, d.name, max_count) + return removed + + +def store(session_id: str, file_path: str, content: bytes) -> SnapshotResult | None: + """Persist *content* as the current snapshot for ``(session_id, file_path)``. + + Returns ``None`` (and logs at debug) when the file is too large, the + session dir cannot be created, or any I/O error occurs. Otherwise returns + a :class:`SnapshotResult` describing the stored snapshot. + + Snapshots are stored verbatim — no compression — because they are short + (≤256 KB) and we read them back exactly once per re-read attempt. The + write is atomic via rename-over so a concurrent reader never observes a + partial file. + """ + if len(content) > MAX_SNAPSHOT_BYTES: + _LOG.debug( + "snapshots: skipping oversized file (%d bytes > %d cap): %s", + len(content), MAX_SNAPSHOT_BYTES, sanitize_log_str(file_path), + ) + return None + p = snapshot_path(session_id, file_path) + if p is None: + return None + try: + p.parent.mkdir(parents=True, exist_ok=True) + _evict_oldest(p.parent, MAX_SNAPSHOTS_PER_SESSION - 1) + paths.atomic_write_bytes(p, content) + except OSError as exc: + _LOG.warning( + "snapshots: store failed for %s: %s", + sanitize_log_str(file_path), exc, + ) + return None + sha = hashlib.sha256(content).hexdigest() + return SnapshotResult(path=p, content_sha=sha, size_bytes=len(content)) + + +def load(session_id: str, file_path: str) -> bytes | None: + """Return the snapshot bytes for ``(session_id, file_path)``, or ``None``. + + Returns ``None`` when the snapshot is absent, unreadable, or too large to + safely return (defensive: a snapshot that has somehow grown past + :data:`MAX_SNAPSHOT_BYTES` between write and load is treated as missing). + """ + p = snapshot_path(session_id, file_path) + if p is None or not p.exists(): + return None + try: + size = p.stat().st_size + except OSError: + return None + if size > MAX_SNAPSHOT_BYTES: + _LOG.warning( + "snapshots: refusing to load oversized snapshot (%d bytes): %s", + size, sanitize_log_str(file_path), + ) + return None + try: + return p.read_bytes() + except OSError as exc: + _LOG.warning( + "snapshots: load failed for %s: %s", + sanitize_log_str(file_path), exc, + ) + return None + + +def cleanup_session(session_id: str) -> int: + """Remove every snapshot for *session_id*. Returns the count removed. + + Called when a session is reset (``/clear`` or compact). Silently ignores + missing directories. Refuses to follow symlinks: a planted symlink in the + snapshot directory must not be able to redirect unlink calls. + """ + d = _session_dir(session_id) + if d is None or not d.exists(): + return 0 + removed = 0 + try: + for fp in d.iterdir(): + try: + st = os.lstat(fp) + except OSError: + continue + if _stat_module.S_ISLNK(st.st_mode): + _LOG.warning("snapshots: skipping symlink in cleanup: %s", fp.name) + continue + try: + fp.unlink() + removed += 1 + except OSError: + continue + except OSError: + return removed + with contextlib.suppress(OSError): + d.rmdir() # only succeeds when empty; ignore otherwise + _LOG.debug("snapshots: cleanup_session %s removed=%d", sanitize_log_str(session_id), removed) + return removed + + +def cleanup_stale(max_age_hours: float = 24.0) -> int: + """Drop snapshots whose mtime is older than *max_age_hours*. + + Run periodically by the background worker. Stale snapshots are noise + after their session ends; without this sweep the snapshot store would + grow without bound across long-lived installations. + """ + base = paths.data_dir() / "session_snapshots" + if not base.exists(): + return 0 + cutoff = time.time() - max_age_hours * 3600 + removed = 0 + try: + for session_dir in base.iterdir(): + if not session_dir.is_dir() or session_dir.is_symlink(): + continue + try: + for fp in session_dir.iterdir(): + try: + st = os.lstat(fp) + except OSError: + continue + if _stat_module.S_ISLNK(st.st_mode): + continue + if st.st_mtime < cutoff: + try: + fp.unlink() + removed += 1 + except OSError: + continue + except OSError: + continue + # Clean up empty session dirs as we go. + with contextlib.suppress(OSError): + session_dir.rmdir() + except OSError: + return removed + if removed: + _LOG.info("snapshots: cleanup_stale removed=%d (max_age_hours=%.1f)", removed, max_age_hours) + return removed diff --git a/src/token_goat/worker.py b/src/token_goat/worker.py index 895db88..15e9ef0 100644 --- a/src/token_goat/worker.py +++ b/src/token_goat/worker.py @@ -65,6 +65,8 @@ class CleanupStats(TypedDict, total=False): image_bytes_evicted: int image_files_evicted: int stats_rows_pruned: int + snapshots_cleared: int + bash_outputs_evicted: int failures: list[str] # task names that raised during cleanup @@ -623,6 +625,32 @@ def _prune_stats_table() -> int: raise +def _cleanup_stale_snapshots() -> int: + """Drop per-session content snapshots older than 24 hours. + + Run from :func:`cleanup_on_startup` because the diff-aware re-read store + accumulates one directory per session. Without periodic eviction these + pile up across long-lived installations even though most are tied to + sessions that ended hours ago. + """ + from . import snapshots # noqa: PLC0415 + + return snapshots.cleanup_stale(max_age_hours=24.0) + + +def _evict_bash_outputs() -> int: + """Enforce the on-disk bash-output store byte cap. + + The post-bash hook also calls this opportunistically after every write, + but the startup pass picks up the slack when many small writes leave the + directory slightly over budget at shutdown time. Returns the number of + cache files removed. + """ + from . import bash_cache # noqa: PLC0415 + + return bash_cache.evict_old_entries() + + def cleanup_on_startup() -> CleanupStats: """Run all self-healing tasks on daemon startup. Returns a summary with counts and failures. @@ -653,6 +681,8 @@ def cleanup_on_startup() -> CleanupStats: ("stale_locks", _cleanup_stale_locks, "stale_locks_cleared"), ("old_logs", _cleanup_old_logs, "logs_deleted"), ("stats_prune", _prune_stats_table, "stats_rows_pruned"), + ("snapshots", _cleanup_stale_snapshots, "snapshots_cleared"), + ("bash_outputs", _evict_bash_outputs, "bash_outputs_evicted"), ] for task_name, task_fn, stat_key in _int_tasks: try: @@ -682,13 +712,16 @@ def cleanup_on_startup() -> CleanupStats: stats["failures"] = failures _LOG.info( "startup cleanup complete: locks_cleared=%d index_markers_cleared=%d logs_deleted=%d " - "stats_rows_pruned=%d image_bytes_evicted=%d image_files_evicted=%d%s", + "stats_rows_pruned=%d image_bytes_evicted=%d image_files_evicted=%d " + "snapshots_cleared=%d bash_outputs_evicted=%d%s", stats.get("stale_locks_cleared", 0), stats.get("stale_index_markers_cleared", 0), stats.get("logs_deleted", 0), stats.get("stats_rows_pruned", 0), stats.get("image_bytes_evicted", 0), stats.get("image_files_evicted", 0), + stats.get("snapshots_cleared", 0), + stats.get("bash_outputs_evicted", 0), f" failures={failures}" if failures else "", ) return stats diff --git a/tests/test_bash_cache.py b/tests/test_bash_cache.py new file mode 100644 index 0000000..0fac15b --- /dev/null +++ b/tests/test_bash_cache.py @@ -0,0 +1,144 @@ +"""Tests for the bash_cache on-disk store + post_bash hook integration.""" +from __future__ import annotations + +from hook_helpers import assert_continue as _assert_continue + +from token_goat import bash_cache, hooks_read, session + + +class TestStoreAndLoad: + def test_small_output_round_trip(self, tmp_data_dir): + """A small output is written verbatim and read back identical.""" + meta = bash_cache.store_output( + "sess1", "ls -lh", "total 16\n-rw-r--r-- 1 user user x" * 10, + "", 0, + ) + assert meta is not None + body = bash_cache.load_output(meta.output_id) + assert body is not None and "total 16" in body + assert meta.stdout_bytes > 0 + assert meta.exit_code == 0 + assert meta.truncated is False + + def test_large_output_is_tail_preserved(self, tmp_data_dir): + """An output above the 2 MB cap is truncated head-only with a marker.""" + big = "A" * (3 * 1024 * 1024) + meta = bash_cache.store_output("sess2", "yes A", big, "", 0) + assert meta is not None + assert meta.truncated is True + body = bash_cache.load_output(meta.output_id) + assert body is not None + # Marker is in the head; the trailing portion of the original output + # (every byte the tail check needs) is preserved. + assert "token-goat: bash output truncated" in body + # The very last characters of `big` are preserved at the tail. + assert body.endswith("A") + + def test_id_format_rejects_traversal(self, tmp_data_dir): + """A crafted output_id with traversal characters returns no path.""" + assert bash_cache.load_output("../../etc/passwd") is None + assert bash_cache.load_output("sess/with/slash") is None + + def test_load_missing_returns_none(self, tmp_data_dir): + assert bash_cache.load_output("nonexistent-id") is None + + def test_sidecar_round_trip(self, tmp_data_dir): + """write_sidecar / read_sidecar preserves all metadata fields.""" + meta = bash_cache.store_output( + "sess3", "pytest -v", "PASS x" * 200, "warn\n", 0, + ) + assert meta is not None + bash_cache.write_sidecar(meta) + loaded = bash_cache.read_sidecar(meta.output_id) + assert loaded is not None + assert loaded.cmd_sha == meta.cmd_sha + assert loaded.exit_code == 0 + + def test_evict_old_entries_respects_cap(self, tmp_data_dir): + """When total cache size exceeds the cap, the oldest entries go first.""" + for i in range(5): + bash_cache.store_output( + f"sess{i}", f"echo {i}", "X" * 200_000, "", 0, + ) + evicted = bash_cache.evict_old_entries(max_total_bytes=300_000) + assert evicted >= 1 + + +class TestPostBashHook: + def test_small_output_skipped(self, tmp_data_dir): + """Output below the cache threshold is not stored.""" + payload = { + "session_id": "post-bash-1", + "tool_name": "Bash", + "tool_input": {"command": "true"}, + "tool_response": {"stdout": "ok\n", "stderr": "", "exit_code": 0}, + } + result = hooks_read.post_bash(payload) + _assert_continue(result) + # No bash history entry was recorded because output was below threshold. + cache = session.load("post-bash-1") + assert not cache.bash_history + + def test_large_output_recorded_in_session(self, tmp_data_dir): + """An output past the threshold lands on disk and in session history.""" + big = "X" * 5000 + payload = { + "session_id": "post-bash-2", + "tool_name": "Bash", + "tool_input": {"command": "pytest -v"}, + "tool_response": {"stdout": big, "stderr": "", "exit_code": 1}, + } + result = hooks_read.post_bash(payload) + _assert_continue(result) + + cache = session.load("post-bash-2") + assert len(cache.bash_history) == 1 + entry = next(iter(cache.bash_history.values())) + assert entry.stdout_bytes == 5000 + assert entry.exit_code == 1 + assert "pytest" in entry.cmd_preview + body = bash_cache.load_output(entry.output_id) + assert body is not None and body.startswith("X") + + def test_missing_session_id_skipped(self, tmp_data_dir): + """No session_id → no record, but hook still returns CONTINUE.""" + payload = { + "tool_name": "Bash", + "tool_input": {"command": "echo " + "X" * 5000}, + "tool_response": {"stdout": "X" * 5000, "stderr": "", "exit_code": 0}, + } + result = hooks_read.post_bash(payload) + _assert_continue(result) + + def test_missing_tool_response_no_crash(self, tmp_data_dir): + """A payload with no tool_response is silently a no-op.""" + payload = { + "session_id": "post-bash-3", + "tool_name": "Bash", + "tool_input": {"command": "echo hi"}, + } + result = hooks_read.post_bash(payload) + _assert_continue(result) + + +class TestSessionLookup: + def test_mark_and_lookup(self, tmp_data_dir): + """mark_bash_run stores an entry that lookup_bash_entry can retrieve.""" + sha = bash_cache.command_hash("git log -20") + session.mark_bash_run( + session_id="lookup-1", + cmd_sha=sha, + cmd_preview="git log -20", + output_id="out-1", + stdout_bytes=12345, + stderr_bytes=0, + exit_code=0, + truncated=False, + ) + entry = session.lookup_bash_entry("lookup-1", sha) + assert entry is not None + assert entry.output_id == "out-1" + assert entry.stdout_bytes == 12345 + + def test_lookup_missing_returns_none(self, tmp_data_dir): + assert session.lookup_bash_entry("lookup-2", "deadbeef") is None diff --git a/tests/test_bash_cli.py b/tests/test_bash_cli.py new file mode 100644 index 0000000..b486ab0 --- /dev/null +++ b/tests/test_bash_cli.py @@ -0,0 +1,84 @@ +"""Smoke tests for the bash-output and bash-history CLI commands.""" +from __future__ import annotations + +import json + +from typer.testing import CliRunner + +from token_goat import bash_cache +from token_goat.cli import app + + +def _seed(session_id: str = "cli-1", command: str = "pytest -v") -> str: + """Store a cached output and return its ID.""" + meta = bash_cache.store_output( + session_id, command, + "line 1\nline 2\nfailing test\nline 4\n", "", 1, + ) + assert meta is not None + bash_cache.write_sidecar(meta) + return meta.output_id + + +class TestBashOutputCli: + def test_retrieves_cached_body(self, tmp_data_dir): + oid = _seed() + runner = CliRunner() + result = runner.invoke(app, ["bash-output", oid]) + assert result.exit_code == 0 + assert "failing test" in result.stdout + assert "line 1" in result.stdout + + def test_grep_filter(self, tmp_data_dir): + oid = _seed() + runner = CliRunner() + result = runner.invoke(app, ["bash-output", oid, "--grep", "failing"]) + assert result.exit_code == 0 + assert "failing test" in result.stdout + assert "line 1" not in result.stdout + + def test_head_limits_output(self, tmp_data_dir): + oid = _seed() + runner = CliRunner() + result = runner.invoke(app, ["bash-output", oid, "--head", "2"]) + assert result.exit_code == 0 + assert "line 1" in result.stdout + assert "line 4" not in result.stdout + + def test_missing_id_returns_error(self, tmp_data_dir): + runner = CliRunner() + result = runner.invoke(app, ["bash-output", "nonexistent-id"]) + assert result.exit_code != 0 + + def test_json_includes_metadata(self, tmp_data_dir): + oid = _seed() + runner = CliRunner() + result = runner.invoke(app, ["bash-output", oid, "--json"]) + assert result.exit_code == 0 + payload = json.loads(result.stdout) + assert payload["output_id"] == oid + assert "failing test" in payload["text"] + assert "exit_code" in payload + + +class TestBashHistoryCli: + def test_empty_history(self, tmp_data_dir): + runner = CliRunner() + result = runner.invoke(app, ["bash-history"]) + assert result.exit_code == 0 + assert "no cached" in result.stdout.lower() + + def test_lists_entries(self, tmp_data_dir): + oid = _seed() + runner = CliRunner() + result = runner.invoke(app, ["bash-history"]) + assert result.exit_code == 0 + assert oid in result.stdout + + def test_json_listing(self, tmp_data_dir): + oid = _seed() + runner = CliRunner() + result = runner.invoke(app, ["bash-history", "--json"]) + assert result.exit_code == 0 + payload = json.loads(result.stdout) + assert any(row["output_id"] == oid for row in payload) diff --git a/tests/test_bash_dedup_hint.py b/tests/test_bash_dedup_hint.py new file mode 100644 index 0000000..b5de90f --- /dev/null +++ b/tests/test_bash_dedup_hint.py @@ -0,0 +1,86 @@ +"""Integration tests: pre-Bash dedup hint via the pre_read hook.""" +from __future__ import annotations + +from hook_helpers import assert_continue as _assert_continue + +from token_goat import bash_cache, hooks_read, session + + +def _seed_history(session_id: str, command: str, *, output_bytes: int = 8000) -> None: + """Helper: emulate a prior post_bash invocation to populate history.""" + big_out = "X" * output_bytes + payload = { + "session_id": session_id, + "tool_name": "Bash", + "tool_input": {"command": command}, + "tool_response": {"stdout": big_out, "stderr": "", "exit_code": 0}, + } + hooks_read.post_bash(payload) + + +class TestBashDedupHintFiresOnRepeat: + def test_repeat_command_triggers_hint(self, tmp_data_dir): + _seed_history("dedup-1", "pytest -v tests/") + # Pre-read fires for the same command in the same session. + payload = { + "session_id": "dedup-1", + "tool_name": "Bash", + "tool_input": {"command": "pytest -v tests/"}, + } + result = hooks_read.pre_read(payload) + _assert_continue(result) + hso = result.get("hookSpecificOutput") + assert hso is not None + ctx = hso.get("additionalContext", "") + assert "token-goat bash-output" in ctx + assert "pytest -v tests/" in ctx + + def test_distinct_command_no_hint(self, tmp_data_dir): + _seed_history("dedup-2", "pytest -v tests/") + payload = { + "session_id": "dedup-2", + "tool_name": "Bash", + "tool_input": {"command": "pytest -v src/"}, # different command + } + result = hooks_read.pre_read(payload) + _assert_continue(result) + assert "hookSpecificOutput" not in result + + def test_tiny_prior_output_no_hint(self, tmp_data_dir): + """A small previous output is not worth deduplicating.""" + _seed_history("dedup-3", "ls", output_bytes=20) + payload = { + "session_id": "dedup-3", + "tool_name": "Bash", + "tool_input": {"command": "ls"}, + } + result = hooks_read.pre_read(payload) + _assert_continue(result) + # No history entry was even recorded (output below cache threshold), + # so no hint can fire. + assert "hookSpecificOutput" not in result + + def test_old_history_entry_suppressed(self, tmp_data_dir, monkeypatch): + """A prior run older than the stale-age threshold is suppressed.""" + from token_goat import hints + + # First simulate a normal recording. + _seed_history("dedup-4", "make build") + sha = bash_cache.command_hash("make build") + entry = session.lookup_bash_entry("dedup-4", sha) + assert entry is not None + + # Push the timestamp far into the past so the staleness check fires. + cache = session.load("dedup-4") + cache.bash_history[sha].ts -= hints.STALE_READ_AGE_SECONDS + 100 + session.save(cache) + + payload = { + "session_id": "dedup-4", + "tool_name": "Bash", + "tool_input": {"command": "make build"}, + } + result = hooks_read.pre_read(payload) + _assert_continue(result) + # Stale entry → no dedup hint, even though command matches. + assert "hookSpecificOutput" not in result diff --git a/tests/test_diff_hint_integration.py b/tests/test_diff_hint_integration.py new file mode 100644 index 0000000..694fc25 --- /dev/null +++ b/tests/test_diff_hint_integration.py @@ -0,0 +1,74 @@ +"""End-to-end: post_read snapshots, post_edit invalidates, pre_read emits diff.""" +from __future__ import annotations + +from hook_helpers import assert_continue as _assert_continue + +from token_goat import hooks_edit, hooks_read, session + + +class TestDiffHintEndToEnd: + def test_read_then_edit_then_reread_emits_diff(self, tmp_data_dir, tmp_path): + """A read followed by an edit and a re-read should yield a diff hint.""" + (tmp_path / ".git").mkdir() + src = tmp_path / "module.py" + # Generously large file so the saving easily clears the min threshold. + body = "".join(f"def fn_{i}():\n return {i}\n" for i in range(200)) + original = "VERSION = 1\n" + body + src.write_text(original, encoding="utf-8") + + sid = "diff-e2e-1" + + # 1. Read — populates snapshot. + _assert_continue(hooks_read.post_read({ + "session_id": sid, + "tool_name": "Read", + "tool_input": {"file_path": str(src)}, + })) + + # 2. Edit — bumps last_edit_ts so the pre_read invalidates the dedup hint. + src.write_text("VERSION = 2\n" + body, encoding="utf-8") + _assert_continue(hooks_edit.post_edit({ + "session_id": sid, + "tool_input": {"file_path": str(src)}, + "cwd": str(tmp_path), + })) + + # 3. Re-read — should produce a diff-based hint. + result = hooks_read.pre_read({ + "session_id": sid, + "tool_name": "Read", + "tool_input": {"file_path": str(src)}, + "cwd": str(tmp_path), + }) + _assert_continue(result) + hso = result.get("hookSpecificOutput") + assert hso is not None + ctx = hso.get("additionalContext", "") + assert "```diff" in ctx + assert "VERSION = 1" in ctx + assert "VERSION = 2" in ctx + + def test_no_snapshot_falls_back_to_session_hint(self, tmp_data_dir, tmp_path): + """When no snapshot exists, pre_read uses the regular cache hint path.""" + (tmp_path / ".git").mkdir() + src = tmp_path / "module.py" + src.write_text("x = 1\n", encoding="utf-8") + + sid = "diff-e2e-2" + # Mark file as already read (line range) but skip the snapshot step. + session.mark_file_read(sid, str(src), offset=0, limit=200) + + result = hooks_read.pre_read({ + "session_id": sid, + "tool_name": "Read", + "tool_input": {"file_path": str(src), "offset": 0, "limit": 200}, + "cwd": str(tmp_path), + }) + _assert_continue(result) + hso = result.get("hookSpecificOutput") + assert hso is not None + ctx = hso.get("additionalContext", "") + # The standard cache hint mentions "already read" — distinct from + # the diff hint's "edited in this session" wording. + assert "already read" in ctx or "previously read" in ctx + assert "```diff" not in ctx diff --git a/tests/test_languages_config.py b/tests/test_languages_config.py new file mode 100644 index 0000000..9708061 --- /dev/null +++ b/tests/test_languages_config.py @@ -0,0 +1,118 @@ +"""Tests for the TOML / YAML / JSON section extractors.""" +from __future__ import annotations + +from token_goat.languages import json_idx, toml_idx, yaml_idx + + +class TestTomlExtractor: + def test_simple_tables(self): + src = b""" +[tool.ruff] +line-length = 100 + +[tool.ruff.format] +quote-style = "double" + +[[some.array]] +key = 1 +""" + symbols, refs, imps, sections = toml_idx.extract(src, "pyproject.toml") + assert refs == [] and imps == [] + headings = [s.heading for s in sections] + assert "tool.ruff" in headings + assert "tool.ruff.format" in headings + assert "some.array" in headings + # Sections have ascending start lines and non-overlapping end lines. + for a, b in zip(sections, sections[1:], strict=False): + assert a.line <= b.line + assert a.end_line is not None and a.end_line < b.line or a.end_line == b.line - 1 + + def test_no_headers_yields_empty(self): + src = b'name = "thing"\nversion = "0.1"\n' + _, _, _, sections = toml_idx.extract(src, "Cargo.toml") + assert sections == [] + + def test_malformed_brackets_ignored(self): + src = b"[a]\nok = 1\n[bad\nnot = 'a section'\n" + _, _, _, sections = toml_idx.extract(src, "x.toml") + headings = [s.heading for s in sections] + assert headings == ["a"] + + def test_quoted_table_name(self): + src = b'["tool.ruff"]\nkey = "x"\n' + _, _, _, sections = toml_idx.extract(src, "x.toml") + assert any(s.heading == "tool.ruff" for s in sections) + + +class TestYamlExtractor: + def test_top_level_keys_emitted(self): + src = b"name: my-action\nruns:\n using: composite\n steps:\n - run: echo\n" + _, _, _, sections = yaml_idx.extract(src, "action.yml") + headings = [s.heading for s in sections] + assert "name" in headings + assert "runs" in headings + + def test_nested_keys_emitted(self): + src = b"spec:\n replicas: 3\n selector: foo\n template:\n metadata: x\n" + _, _, _, sections = yaml_idx.extract(src, "deploy.yaml") + headings = [s.heading for s in sections] + assert "spec" in headings + # Nested keys are emitted with parent.child dotted form. + assert "spec.replicas" in headings + assert "spec.selector" in headings + + def test_list_items_not_emitted_as_keys(self): + src = b"items:\n - one\n - two\n - three\n" + _, _, _, sections = yaml_idx.extract(src, "list.yml") + headings = [s.heading for s in sections] + # Only "items" is a real key; the list dashes should not become keys. + assert headings == ["items"] + + def test_multi_document_resets_state(self): + src = b"a: 1\n---\nb: 2\n" + _, _, _, sections = yaml_idx.extract(src, "multi.yml") + headings = [s.heading for s in sections] + assert "a" in headings + assert "b" in headings + + +class TestJsonSections: + def test_pretty_printed_json_emits_sections(self): + src = b"""{ + "name": "my-pkg", + "version": "1.0.0", + "scripts": { + "test": "vitest", + "build": "vite build" + }, + "dependencies": { + "react": "^18" + } +} +""" + _, _, _, sections = json_idx.extract(src, "package.json") + headings = [s.heading for s in sections] + assert headings == ["name", "version", "scripts", "dependencies"] + # End lines bound each section to the line before the next heading. + scripts_sec = next(s for s in sections if s.heading == "scripts") + deps_sec = next(s for s in sections if s.heading == "dependencies") + assert scripts_sec.end_line is not None + assert scripts_sec.end_line < deps_sec.line + + def test_minified_json_no_sections(self): + src = b'{"name":"x","version":"1.0.0","deps":{"a":"b"}}' + _, _, _, sections = json_idx.extract(src, "min.json") + assert sections == [] + + def test_nested_keys_not_in_sections(self): + """The 'test' key inside 'scripts' must not become a top-level section.""" + src = b"""{ + "scripts": { + "test": "vitest" + } +} +""" + _, _, _, sections = json_idx.extract(src, "p.json") + headings = [s.heading for s in sections] + # Only the top-level "scripts" key is a section; "test" (depth 2) is not. + assert headings == ["scripts"] diff --git a/tests/test_snapshots.py b/tests/test_snapshots.py new file mode 100644 index 0000000..e396d52 --- /dev/null +++ b/tests/test_snapshots.py @@ -0,0 +1,119 @@ +"""Tests for the per-session file-content snapshot store + diff-aware re-read.""" +from __future__ import annotations + +from hook_helpers import assert_continue as _assert_continue + +from token_goat import hints, hooks_read, session, snapshots + + +class TestSnapshotStore: + def test_store_and_load_round_trip(self, tmp_data_dir): + result = snapshots.store("sess1", "/tmp/foo.py", b"hello\nworld\n") + assert result is not None + loaded = snapshots.load("sess1", "/tmp/foo.py") + assert loaded == b"hello\nworld\n" + + def test_oversized_file_not_stored(self, tmp_data_dir): + big = b"X" * (snapshots.MAX_SNAPSHOT_BYTES + 1) + result = snapshots.store("sess2", "/tmp/big.py", big) + assert result is None + assert snapshots.load("sess2", "/tmp/big.py") is None + + def test_path_with_traversal_chars_normalised(self, tmp_data_dir): + """Snapshot store accepts any path string but the on-disk name is a hash.""" + result = snapshots.store("sess3", "../../etc/passwd", b"x") + assert result is not None + assert result.path.parent.name.startswith("sess3") + + def test_cleanup_session_removes_files(self, tmp_data_dir): + snapshots.store("sess4", "/tmp/a.py", b"a") + snapshots.store("sess4", "/tmp/b.py", b"b") + removed = snapshots.cleanup_session("sess4") + assert removed == 2 + assert snapshots.load("sess4", "/tmp/a.py") is None + + def test_eviction_keeps_per_session_under_cap(self, tmp_data_dir, monkeypatch): + """When more than MAX_SNAPSHOTS_PER_SESSION are stored, oldest go first.""" + monkeypatch.setattr(snapshots, "MAX_SNAPSHOTS_PER_SESSION", 3) + for i in range(5): + snapshots.store("sess5", f"/tmp/f{i}.py", f"v{i}".encode()) + # The first two snapshots should have been evicted by the time we've + # stored five with a cap of three. + assert snapshots.load("sess5", "/tmp/f0.py") is None + assert snapshots.load("sess5", "/tmp/f4.py") == b"v4" + + +class TestDiffHint: + def test_no_snapshot_means_no_hint(self, tmp_data_dir): + hint = hints.build_diff_hint( + session_id="diff1", + file_path="/tmp/missing.py", + current_text="def foo():\n pass\n", + ) + assert hint is None + + def test_identical_snapshot_means_no_hint(self, tmp_data_dir): + content = "def foo():\n return 1\n" * 20 + snapshots.store("diff2", "/tmp/same.py", content.encode()) + hint = hints.build_diff_hint( + session_id="diff2", file_path="/tmp/same.py", current_text=content, + ) + assert hint is None + + def test_meaningful_diff_emits_hint(self, tmp_data_dir): + """A small diff against a large file produces a positive-saving hint. + + The file is ~6 KB so a re-read costs ~1500 tokens; a one-line change + produces a tiny diff so the saving easily clears the minimum threshold. + Unique per-line content keeps difflib's autojunk heuristic from + treating the surrounding context as noise. + """ + body = "".join(f"# filler line {i}\n" for i in range(500)) + old = "x = 1\n" + body + new = "x = 2\n" + body + snapshots.store("diff3", "/tmp/changed.py", old.encode()) + hint = hints.build_diff_hint( + session_id="diff3", file_path="/tmp/changed.py", current_text=new, + ) + assert hint is not None + assert hint.tokens_saved > 0 + assert "```diff" in str(hint) + + def test_huge_diff_suppressed(self, tmp_data_dir): + """When the diff would exceed the size cap, no hint is emitted.""" + old = "old\n" * 5000 + new = "new\n" * 5000 + snapshots.store("diff4", "/tmp/huge.py", old.encode()) + hint = hints.build_diff_hint( + session_id="diff4", file_path="/tmp/huge.py", current_text=new, + ) + assert hint is None + + +class TestPostReadSnapshots: + def test_post_read_captures_snapshot(self, tmp_data_dir, tmp_path): + """post_read writes a snapshot of the read file's bytes.""" + src = tmp_path / "small.py" + src.write_text("def x(): pass\n", encoding="utf-8") + payload = { + "session_id": "post-read-snap-1", + "tool_name": "Read", + "tool_input": {"file_path": str(src)}, + } + _assert_continue(hooks_read.post_read(payload)) + assert snapshots.load("post-read-snap-1", str(src)) == b"def x(): pass\n" + # Session also records the snapshot SHA so a future hook can short-circuit. + sha = session.get_snapshot_sha("post-read-snap-1", str(src)) + assert sha and len(sha) == 64 + + def test_post_read_oversized_skips_snapshot(self, tmp_data_dir, tmp_path): + """A file larger than the snapshot cap is not snapshotted.""" + src = tmp_path / "big.py" + src.write_bytes(b"X" * (snapshots.MAX_SNAPSHOT_BYTES + 1)) + payload = { + "session_id": "post-read-snap-2", + "tool_name": "Read", + "tool_input": {"file_path": str(src)}, + } + _assert_continue(hooks_read.post_read(payload)) + assert snapshots.load("post-read-snap-2", str(src)) is None From a6d048bd4be01177283ed6c54b18f58e55dcc849 Mon Sep 17 00:00:00 2001 From: Claude Date: Mon, 18 May 2026 06:36:27 +0000 Subject: [PATCH 02/13] feat: compact manifest bash section, INI/CFG/.env indexer, payload hardening MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Six follow-up surfaces on the bash-cache / diff-hint / config-section work landed in the previous commit: 1. Compaction manifest: a new "Commands Run" section surfaces the most recent meaningful Bash invocations with cmd preview, exit code, byte size, and the cache ID (`token-goat bash-output `) so the test/build context that drives the next turn survives compaction. `event_count` now includes `bash_history` so a session whose only activity is a cached test run still clears `min_events`. 2. `bash-output --json` now emits `numbered_lines` (`[{lineno, text}]` anchored to the original body) plus `total_lines`, mirroring the surgical-read response shape so agents can follow up with positional slicers that map back to the on-disk file. 3. Stats source buckets: `diff_hint` / `diff_hint_overhead` now land in the existing `hint` bucket, and a new `bash` bucket (orange in the fancy renderer) catches `bash_dedup_hint*` and `bash_output_cached`, so the new mechanisms get a first-class line in `token-goat stats` instead of falling into the `other` catch-all. 4. INI / CFG / .env indexer: `[name]` headers in `.ini`/`.cfg` files become Sections (so `token-goat section setup.cfg::metadata` works); `.env` and `.envrc` index each `KEY=value` assignment as a symbol. Parser gains a basename-keyed dispatch table alongside the existing suffix table — `.env` has no Path.suffix and would otherwise be silently skipped. 5. PostToolUse Bash payload hardening: `_extract_bash_response` now tolerates every documented shape — dict-with-named-fields (Claude Code), MCP CallToolResult content arrays, bare-string blobs, top-level flattening, `tool_result`/`response` aliases, `returncode` and string-typed `exit_code` variants. Each is covered by a dedicated test. 6. `bash_cache.evict_old_entries` removes body + sidecar pairs together and runs an orphan-sidecar sweep at the end, so a manual `rm` of a body or a write race can no longer leave .json metadata files accumulating forever. Tests: four new test modules (post-bash payload variants, INI/.env extractor, compaction bash section, stats bucket mapping) plus extensions to test_bash_cache.py and test_bash_cli.py. 342 targeted tests pass; lint clean; mypy adds zero new errors over baseline. --- CHANGELOG.md | 7 +- src/token_goat/bash_cache.py | 51 ++++++- src/token_goat/cli.py | 19 +++ src/token_goat/compact.py | 114 +++++++++++++++- src/token_goat/hooks_read.py | 142 +++++++++++++++++-- src/token_goat/languages/ini_idx.py | 160 ++++++++++++++++++++++ src/token_goat/parser.py | 38 +++++- src/token_goat/render/ansi.py | 1 + src/token_goat/render/stats_renderer.py | 1 + src/token_goat/stats.py | 14 +- tests/test_bash_cache.py | 49 +++++++ tests/test_bash_cli.py | 19 +++ tests/test_compact_bash.py | 79 +++++++++++ tests/test_ini_extractor.py | 103 ++++++++++++++ tests/test_post_bash_payloads.py | 172 ++++++++++++++++++++++++ tests/test_stats_buckets.py | 27 ++++ 16 files changed, 966 insertions(+), 30 deletions(-) create mode 100644 src/token_goat/languages/ini_idx.py create mode 100644 tests/test_compact_bash.py create mode 100644 tests/test_ini_extractor.py create mode 100644 tests/test_post_bash_payloads.py create mode 100644 tests/test_stats_buckets.py diff --git a/CHANGELOG.md b/CHANGELOG.md index 805d557..137aa03 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -8,13 +8,18 @@ All notable changes to Token-Goat are documented in this file. Format follows Ke - **Bash output interception.** A new `PostToolUse(Bash)` hook persists large stdout/stderr to disk under `data_dir() / "bash_outputs"` and records the command in the session cache. When the same command is about to run again in the same session, the pre-Bash hint suggests `token-goat bash-output ` (optionally with `--head N`, `--tail N`, or `--grep PATTERN`) instead of re-executing — avoiding both runtime cost and duplicated tokens. The store is byte-capped (16 MB default) with oldest-first eviction; outputs above 2 MB are tail-preserved with a truncation marker. Two new CLI commands surface the cache: `token-goat bash-output` retrieves a sliced view, `token-goat bash-history` lists cached entries newest-first. - **Diff-aware re-read.** `post_read` now writes a per-session content snapshot (under `data_dir() / "session_snapshots"`, capped at 256 KB per file and 150 snapshots per session) so a follow-up `Read` after a `Write`/`Edit`/`MultiEdit` can be answered with a unified diff hint instead of a `pre_read` blocking message that silently allowed the full re-read. The diff is bounded to 4 KB and only fires when the realised saving exceeds ~250 tokens; below that the existing session-cache hint path runs unchanged. Stats record both the realised saving (`diff_hint`) and the hint's injection cost (`diff_hint_overhead`) for honest accounting. -- **TOML, YAML, and JSON section extraction.** `token-goat section pyproject.toml::tool.ruff` (and the equivalents for `.yaml`, `.yml`, and pretty-printed `.json`) now extract a single table/key block instead of forcing a full-file read. The TOML scanner emits one `Section` per `[table]` and `[[array]]` header; the YAML scanner emits top-level keys plus one nested layer (`spec.replicas`-style) computed from the file's detected indent; JSON gains depth-1 section detection on pretty-printed files. None of the three pulls in an extra dependency — all use line-scanners and the existing stdlib parsers. +- **TOML, YAML, JSON, INI, CFG, and dotenv section extraction.** `token-goat section pyproject.toml::tool.ruff` (and equivalents for `.yaml`, `.yml`, `.json`, `.ini`, `.cfg`, `.env`, and `.envrc`) now extract a single table/key block instead of forcing a full-file read. The TOML scanner emits one `Section` per `[table]` and `[[array]]` header; the YAML scanner emits top-level keys plus one nested layer (`spec.replicas`-style) computed from the file's detected indent; JSON gains depth-1 section detection on pretty-printed files; INI/CFG indexes one section per `[name]` header; `.env`/`.envrc` index each `KEY=value` assignment as a symbol. None of the six pulls in an extra dependency — all use line-scanners and the existing stdlib parsers. The parser dispatcher gained a basename-keyed table (alongside the existing suffix table) so dotfiles with empty extensions (`.env`, `.envrc`) resolve correctly. - **Stale-data sweeps in the background worker.** `cleanup_on_startup` now also drops snapshot directories older than 24 hours and enforces the bash-output byte cap, so a long-lived install does not accumulate per-session debris. +- **Compaction manifest gained a "Commands Run" section.** The PreCompact manifest now surfaces the most recent meaningful Bash invocations (cmd preview, exit code, byte size, cache ID) so the test/build context that drives the next agent turn survives compaction. Each entry includes the `token-goat bash-output ` cache key for surgical recall. `event_count` includes `bash_history` so a session whose only activity is a cached test run still clears the `min_events` threshold. +- **`token-goat bash-output --json` now surfaces line numbers.** The JSON shape adds `numbered_lines` (a 1-based, original-body-anchored `[{lineno, text}]` list) and `total_lines`, mirroring the surgical-read response shape elsewhere in the codebase. Agents can now `--head` / `--tail` / `--grep` filter and still map back to positions in the original output. +- **Hardened PostToolUse Bash payload extraction.** `_extract_bash_response` now tolerates every documented Bash result shape: dict-with-named-fields (Claude Code), MCP `CallToolResult` content arrays, bare-string blobs, top-level flattening (no `tool_response` wrapper), `tool_result`/`response` aliases, `returncode` and string-typed `exit_code` variants. Each shape is covered by a dedicated regression test in `test_post_bash_payloads.py`. +- **New stats bucket `bash` (orange) and diff-hint accounting.** `token-goat stats` now attributes `diff_hint` / `diff_hint_overhead` to the `hint` bucket and `bash_dedup_hint*` / `bash_output_cached` to a new visible `bash` bucket, so the new mechanisms appear in the by-source panel instead of falling into the `other` catch-all. ### Changed - **`reset_session`** now also removes per-session content snapshots, matching the existing JSON-cache reset semantics. - **Codex Bash matcher in `~/.codex/config.toml`** now points at the new `post-bash` hook instead of `post-read`; under Codex, `post-read` previously did nothing for `Bash` calls (no branch in the handler), so this is a strict gain. +- **`bash_cache.evict_old_entries`** removes body + sidecar pairs together, and runs a second pass to sweep any orphan sidecars left over from out-of-band deletion. Previously, manual `rm` of a body file or a write race could leave a `.json` sidecar with no matching body that lived forever. ## [0.5.2] - 2026-05-17 diff --git a/src/token_goat/bash_cache.py b/src/token_goat/bash_cache.py index 8434c07..08231f0 100644 --- a/src/token_goat/bash_cache.py +++ b/src/token_goat/bash_cache.py @@ -269,12 +269,19 @@ def load_output_meta(output_id: str) -> dict[str, object] | None: def evict_old_entries(*, max_total_bytes: int = DEFAULT_MAX_TOTAL_BYTES) -> int: - """Evict the oldest files until total size is at or under *max_total_bytes*. - - Returns the number of files removed. Skips symlinks (defensive: an - attacker who can plant a symlink into the cache directory should not be - able to direct deletes elsewhere by name). All errors are swallowed — - eviction is opportunistic, not authoritative. + """Evict the oldest entries until total size is at or under *max_total_bytes*. + + Each cached output is a pair of files: the body (``.txt``) and the + JSON sidecar (``.json``). Eviction removes both atomically — leaving + an orphan sidecar after deleting its body would let stale metadata + accumulate over time and would also confuse ``token-goat bash-history`` + on subsequent calls. + + Returns the number of body files removed; orphan sidecar pairs count as + one removal each, matching the per-entry abstraction callers expect. + Skips symlinks (defensive: an attacker who can plant a symlink into the + cache directory should not be able to direct deletes elsewhere by name). + All errors are swallowed — eviction is opportunistic, not authoritative. """ try: d = _bash_outputs_dir() @@ -315,11 +322,43 @@ def evict_old_entries(*, max_total_bytes: int = DEFAULT_MAX_TOTAL_BYTES) -> int: removed += 1 except OSError: continue + # Best-effort sidecar removal — if the body deletion succeeded the + # sidecar should follow. A failure here is logged at debug only: + # leaving a single sidecar around is harmless (read_sidecar tolerates + # missing bodies), and the next eviction pass will retry. + sidecar = fp.with_suffix(".json") + try: + sidecar.unlink() + except FileNotFoundError: + pass + except OSError as exc: + _LOG.debug("bash_cache: sidecar cleanup failed for %s: %s", sidecar.name, exc) if removed: _LOG.info( "bash_cache: evicted %d entries to fit cap=%d bytes", removed, max_total_bytes, ) + + # Orphan-sidecar sweep. A sidecar whose body was deleted out-of-band + # (e.g. a previous eviction whose body unlink succeeded before the + # sidecar unlink could run, or a manual ``rm cache/*.txt``) would + # otherwise live forever. We list ``.json`` files in the cache dir and + # drop any without a matching ``.txt``. Cheap because the directory + # typically has only a handful of entries at any time. + try: + for sp in d.iterdir(): + if not sp.name.endswith(".json"): + continue + body = sp.with_suffix(".txt") + if body.exists(): + continue + try: + sp.unlink() + except OSError as exc: + _LOG.debug("bash_cache: orphan sidecar removal failed: %s: %s", sp.name, exc) + except OSError: + pass + return removed diff --git a/src/token_goat/cli.py b/src/token_goat/cli.py index 3cb2146..f3ffcfc 100644 --- a/src/token_goat/cli.py +++ b/src/token_goat/cli.py @@ -990,10 +990,29 @@ def cmd_bash_output( if json_output: meta = bash_cache.load_output_meta(output_id) or {} sidecar = bash_cache.read_sidecar(output_id) + # Match the surgical-read shape exposed elsewhere: alongside the joined + # text blob, surface a ``{lineno, text}`` list anchored to the *original* + # body line numbers (not the filtered slice positions) so an agent can + # follow up with `--head ` / `--tail ` slicers that map + # back to the on-disk file. + original_lines = body.splitlines() + # Build a 1-based index for the original body so the lookup below stays + # O(unique sliced lines) rather than O(N*M). Duplicate lines map to + # their *first* occurrence — same convention as Read tool line numbers. + original_index: dict[str, int] = {} + for i, ln in enumerate(original_lines, start=1): + if ln not in original_index: + original_index[ln] = i + numbered: list[dict[str, object]] = [ + {"lineno": original_index.get(ln, 0), "text": ln} + for ln in lines + ] payload: dict[str, object] = { "output_id": output_id, "text": sliced, "lines": len(lines), + "numbered_lines": numbered, + "total_lines": len(original_lines), } payload.update(meta) if sidecar is not None: diff --git a/src/token_goat/compact.py b/src/token_goat/compact.py index 06f2f64..4f02145 100644 --- a/src/token_goat/compact.py +++ b/src/token_goat/compact.py @@ -44,6 +44,17 @@ # Max symbols listed per file entry in the manifest (separate from _MAX_SYMBOLS_FILES, # which caps the number of *files* that show any symbols at all). _MAX_SYMBOLS_PER_FILE_ENTRY: Final[int] = 6 +# Maximum number of cached Bash commands listed in the manifest. Bash entries +# preserve the test/build context most likely to drive the next agent turn +# (a green pytest, a failing build, the most recent git log), but listing every +# command across a long session would crowd out higher-priority sections. Six +# covers the typical iterate-test-fix-test-commit cycle without dominating the +# budget — most sessions accumulate fewer than that. +_MAX_BASH_ENTRIES: Final[int] = 6 +# Smallest cached Bash output worth surfacing in the manifest. Below ~400 bytes +# the dedup hint suppresses on size anyway, and the manifest line itself costs +# tokens that would not be paid back even if the agent acted on the hint. +_MIN_BASH_BYTES_FOR_MANIFEST: Final[int] = 400 # Hard ceiling on the max_tokens parameter accepted by build_manifest. # The config layer sets a sensible default (400) but build_manifest is also part of @@ -67,6 +78,10 @@ # compaction than ones touched at the start of a long session). _BY_LAST_READ_TS = attrgetter("last_read_ts") +# Same idea, applied to BashEntry — most-recently-run commands are the ones +# whose output the compaction LLM most needs to preserve as context. +_BY_BASH_TS = attrgetter("ts") + # Noise file extensions and basenames that should never enter the manifest. # These files are build artifacts, OS metadata, or auto-generated lockfiles that # the compaction LLM does not need to "preserve" — listing them wastes budget on @@ -201,6 +216,76 @@ def _format_ranges(ranges: list[tuple[int, int]]) -> str: return f" lines {parts}{overflow_suffix}" +def _select_top_bash_entries(bash_history: object) -> list[object]: + """Pick up to :data:`_MAX_BASH_ENTRIES` cached Bash runs worth surfacing. + + Filters out entries below :data:`_MIN_BASH_BYTES_FOR_MANIFEST` (the dedup + hint would suppress them anyway) and ranks by recency — the most recent + runs are the ones whose output drives the next agent turn. Accepts the + ``bash_history`` attribute typed as ``object`` so the helper is safe to + call on legacy SessionCache instances written by token-goat versions that + predate the field (``None`` / missing → empty list). + + Returns an iterable suitable for unpacking; entries are + :class:`session.BashEntry` instances but the helper does not import that + type to keep this module light at hook-cold-start time. + """ + if not isinstance(bash_history, dict) or not bash_history: + return [] + candidates = [ + e for e in bash_history.values() + if (getattr(e, "stdout_bytes", 0) + getattr(e, "stderr_bytes", 0)) + >= _MIN_BASH_BYTES_FOR_MANIFEST + ] + if not candidates: + return [] + return heapq.nlargest(_MAX_BASH_ENTRIES, candidates, key=_BY_BASH_TS) + + +def _format_bash_entry(entry: object) -> str: + """Render one :class:`session.BashEntry` as a single manifest line. + + Format:: + + - $ pytest -v tests/ (exit 1, 12.3KB, id=abc123def...) + + The cache ID is included so the compaction LLM hands the agent something + actionable — the agent can call ``token-goat bash-output `` to recover + the full body instead of re-running. Byte counts use a compact human + suffix (KB/MB) because the raw integer (``12345``) is harder to scan in a + glance-level summary. + """ + cmd_preview = sanitize_log_str(getattr(entry, "cmd_preview", ""), max_len=80) + total = int(getattr(entry, "stdout_bytes", 0)) + int(getattr(entry, "stderr_bytes", 0)) + exit_code = getattr(entry, "exit_code", None) + output_id = getattr(entry, "output_id", "") + truncated_marker = " (truncated)" if getattr(entry, "truncated", False) else "" + exit_str = "exit ?" if exit_code is None else f"exit {exit_code}" + return ( + f"- $ {cmd_preview} " + f"({exit_str}, {_humanize_bytes(total)}{truncated_marker}, id={output_id})" + ) + + +def _humanize_bytes(n: int) -> str: + """Return a short human-readable byte count: ``1.2KB``, ``3.4MB``, ``120B``. + + Compact (no spaces, two significant digits) so it fits inside a manifest + line without competing with the command preview for visual space. Sizes + below 1024 use plain bytes; above that we step through KB/MB at 1024-byte + boundaries. GB is not represented because the on-disk store caps each + entry at 2 MB before any truncation marker is applied — values higher than + a few MB indicate the *original* output size, not the stored bytes, but + even then GB-scale captures are not realistic for a Bash command surfaced + in the manifest. + """ + if n < 1024: + return f"{n}B" + if n < 1024 * 1024: + return f"{n / 1024:.1f}KB" + return f"{n / (1024 * 1024):.1f}MB" + + def _load_session_cache(session_id: str, caller: str) -> SessionCache | None: """Validate *session_id* and load the session cache, returning ``None`` on any failure. @@ -234,11 +319,22 @@ def _load_session_cache(session_id: str, caller: str) -> SessionCache | None: def event_count(session_id: str) -> int: - """Count tracked events (reads + greps + edits) for a session.""" + """Count tracked events (reads + greps + edits + bash runs) for a session. + + Bash invocations are counted alongside reads/greps/edits so a session + whose only activity is a cached test run still clears the + ``min_events`` threshold for compaction-manifest emission — that command's + output is exactly what the manifest is meant to preserve. + """ cache = _load_session_cache(session_id, "event_count") if cache is None: return 0 - return len(cache.files) + len(cache.greps) + len(cache.edited_files) + return ( + len(cache.files) + + len(cache.greps) + + len(cache.edited_files) + + len(getattr(cache, "bash_history", {}) or {}) + ) def _build_manifest_from_cache( @@ -464,7 +560,19 @@ def _render(cache: SessionCache, session_id: str, max_tokens: int) -> tuple[str, sections.append(f"- {_short_path(entry.rel_or_abs)} → {sym_str}") sections.append("") - # ── 3. Key files read (top N by read_count) ─────────────────────────────── + # ── 3. Commands run (cached Bash output worth recalling) ────────────────── + # Surfacing the most recent meaningful Bash invocations preserves the + # test/build context that drives the next agent turn. Each entry quotes + # the cache ID so the agent can retrieve the full body via + # `token-goat bash-output ` instead of re-running the command. + bash_entries = _select_top_bash_entries(getattr(cache, "bash_history", None)) + if bash_entries: + sections.append("### Commands Run (cached output)") + for be in bash_entries: + sections.append(_format_bash_entry(be)) + sections.append("") + + # ── 4. Key files read (top N by read_count) ─────────────────────────────── if top_files: sections.append("### Key Files Read") for entry in top_files: diff --git a/src/token_goat/hooks_read.py b/src/token_goat/hooks_read.py index 42e6a0d..84b7024 100644 --- a/src/token_goat/hooks_read.py +++ b/src/token_goat/hooks_read.py @@ -498,27 +498,141 @@ def post_read(payload: HookPayload) -> HookResponse: _BASH_CACHE_MIN_BYTES: int = 400 +def _coerce_text(value: object) -> str: + """Best-effort string coercion for a payload field of unknown shape. + + Handles the three shapes a Bash PostToolUse payload can legitimately carry + for an output field: + + * **str** — already textual; returned as-is. + * **list** — an MCP-style ``content`` array of ``{"type": "text", + "text": "..."}`` items. We concatenate the ``text`` of every text-typed + item; non-text items are skipped (binary results would need different + handling and have no place in a stdout-replacement cache). + * **anything else** — coerced via ``str()``. This catches int/float exit + lines from a misshapen harness ("0\\n" sent as the int 0) and lets the + cache still record an approximate body rather than dropping the event. + + Returns ``""`` for ``None`` and empty containers so the calling threshold + check is a single numeric comparison. + """ + if value is None: + return "" + if isinstance(value, str): + return value + if isinstance(value, list): + parts: list[str] = [] + for item in value: + if isinstance(item, dict): + # MCP CallToolResult shape: {"type": "text", "text": "..."} + # Older harnesses use "text" as the only key. + txt = item.get("text") if item.get("type") == "text" else None + if txt is None: + txt = item.get("text") + if isinstance(txt, str): + parts.append(txt) + elif isinstance(item, str): + parts.append(item) + return "".join(parts) + return str(value) + + def _extract_bash_response(payload: HookPayload) -> tuple[str, str, int | None]: """Pull (stdout, stderr, exit_code) from a PostToolUse Bash payload. - Defensive against payload shape drift between harness versions: each field - is read at multiple plausible keys and falls back to empty/None when absent. - Non-string stdout/stderr is coerced via :func:`str` so a future change to - structured output (e.g. JSON tool result) does not crash the hook. + Defensive against payload shape drift across harness versions and tool + flavours. Three concrete shapes are accepted at the top level: + + 1. ``payload["tool_response"]`` is a **dict** with named subfields + (``stdout`` / ``stderr`` / ``exit_code`` and their snake_case + alt + spellings). This is the documented Claude Code shape. + 2. ``payload["tool_response"]`` is a **str** carrying the raw output as + one blob — used by older harness builds and some MCP relays. + 3. ``payload["tool_response"]`` is an **MCP CallToolResult dict** with a + ``content`` array of ``{"type": "text", "text": "..."}`` items — + common when Bash is exposed through an MCP server adapter. + + The function also probes ``tool_result``, ``response``, ``output``, and + the top-level payload itself for stdout (in that order) so a harness + version that promotes the result to the top-level still works. Every + coercion routes through :func:`_coerce_text` so an unexpected shape can + never raise — the hook stays fail-soft for any input. """ - raw_resp = payload.get("tool_response") or payload.get("tool_result") or {} - if not isinstance(raw_resp, dict): - return "", "", None - stdout_val = raw_resp.get("stdout") or raw_resp.get("output") or "" - stderr_val = raw_resp.get("stderr") or "" - exit_val = raw_resp.get("exit_code") - if exit_val is None: - exit_val = raw_resp.get("returncode") - stdout = stdout_val if isinstance(stdout_val, str) else str(stdout_val) - stderr = stderr_val if isinstance(stderr_val, str) else str(stderr_val) + # Step 1: locate the response container. Newer payloads nest it under + # ``tool_response``; older ones use ``tool_result`` or ``response``; some + # MCP relays put it at the top level under ``output``. + raw_resp: object = ( + payload.get("tool_response") + if isinstance(payload, dict) else None + ) + if raw_resp is None and isinstance(payload, dict): + raw_resp = payload.get("tool_result") or payload.get("response") + + stdout = "" + stderr = "" + exit_val: object = None + + if isinstance(raw_resp, str): + # Whole response was a bare string — treat as combined stdout. + stdout = raw_resp + elif isinstance(raw_resp, list): + # MCP content-array style at the top level (no surrounding dict). + stdout = _coerce_text(raw_resp) + elif isinstance(raw_resp, dict): + # Named-field style. Probe in priority order so the most-specific + # field wins when a shape carries multiple at once. + stdout_raw = ( + raw_resp.get("stdout") + or raw_resp.get("output") + or raw_resp.get("text") + or raw_resp.get("content") + ) + stdout = _coerce_text(stdout_raw) + stderr_raw = raw_resp.get("stderr") or raw_resp.get("err") + stderr = _coerce_text(stderr_raw) + exit_val = ( + raw_resp.get("exit_code") + if "exit_code" in raw_resp + else raw_resp.get("returncode") + if "returncode" in raw_resp + else raw_resp.get("exit") + ) + + # When nothing came back from the structured shapes, fall back to a + # top-level ``output``/``stdout`` field. This covers the rare harness + # where the result is flattened onto the payload itself rather than + # nested under ``tool_response``. Note ``exit_code`` uses explicit- + # membership checks rather than ``a or b`` because ``0`` is a perfectly + # valid (and common) exit code that would otherwise be filtered out. + if not stdout and isinstance(payload, dict): + stdout = _coerce_text(payload.get("stdout") or payload.get("output")) + if not stderr and isinstance(payload, dict): + stderr = _coerce_text(payload.get("stderr")) + if exit_val is None and isinstance(payload, dict): + # HookPayload is a TypedDict that does not declare these keys (they + # are harness-version-specific extras), but the runtime payload may + # carry them; ``dict.get`` on a TypedDict instance is type-erased so + # we route through a ``cast`` to keep mypy strict elsewhere. + from typing import cast as _cast # noqa: PLC0415 + + plain: dict[str, object] = _cast("dict[str, object]", payload) + if "exit_code" in plain: + exit_val = plain["exit_code"] + elif "returncode" in plain: + exit_val = plain["returncode"] + exit_code: int | None = None if isinstance(exit_val, int) and not isinstance(exit_val, bool): exit_code = exit_val + elif isinstance(exit_val, str): + # Some harnesses send the exit code as a string ("0", "1"). Accept + # numerics within int range; reject anything else silently rather + # than crash on int("oops"). + try: + exit_code = int(exit_val) + except (TypeError, ValueError): + exit_code = None + return stdout, stderr, exit_code diff --git a/src/token_goat/languages/ini_idx.py b/src/token_goat/languages/ini_idx.py new file mode 100644 index 0000000..64f59d4 --- /dev/null +++ b/src/token_goat/languages/ini_idx.py @@ -0,0 +1,160 @@ +"""INI / CFG / .env extractor — one Section per ``[section]`` header. + +INI-family configuration files are line-oriented and unambiguous: a +``[name]`` header at column 0 opens a section that spans every following +line until the next header or EOF. ``.env`` (dotenv) files have no section +syntax at all — they are flat ``KEY=value`` pairs — so for those we emit +one ``env_key`` symbol per top-level assignment and skip sections entirely. + +Why a custom scanner rather than :mod:`configparser`: + +* :mod:`configparser` parses to a dict and discards source positions. We + need start/end line numbers so ``token-goat section`` can slice the source + file back out. + +* INI dialects vary (Windows ``;`` comments vs Unix ``#``; multi-line values + with continuation indent; spaces in keys). A targeted line scanner gives + predictable, low-surprise behaviour without inheriting configparser's + strictness on edge cases that token-goat does not need to enforce. + +Section model +------------- +* ``heading``: the bracketed name, lowercased and trimmed. Dotted/colon- + separated sections like ``[tool.black]`` or ``[mysqld:replica]`` are kept + verbatim so callers can target the exact name they see in the file. +* ``level``: always 1 — INI has no nested headers. +* ``line``: 1-based line of the header. +* ``end_line``: 1-based last line of the section's content (the line + immediately before the next header, or EOF for the trailing entry). + +The ``.env`` path emits no sections — only the per-key symbols — because +treating each top-level key as a "section" would produce one entry per +line and inflate the index for what is already a small flat file. +""" +from __future__ import annotations + +__all__ = ["extract", "extract_env"] + +import logging +import re + +from ..parser import ImpExp, Ref, Section, Symbol + +_LOG = logging.getLogger("token_goat.languages.ini_idx") + +# Column-0-anchored ``[name]`` header. We allow letters, digits, underscores, +# hyphens, dots, colons, and slashes in the name — this covers every dialect +# I've seen in the wild (``[tool.black]`` in setup.cfg, ``[mysqld:replica]`` +# in my.cnf, ``[group/sub]`` in PHP-FPM pools) without admitting whitespace +# or quotes that would indicate a malformed line. +_HEADER_RE = re.compile(r"^\[([A-Za-z0-9_\-.:/]+)\]\s*(?:[;#].*)?$") + +# Maximum number of headers indexed per file. Real INI files top out in the +# low tens; the cap is generous so a hand-typed config never hits it but +# tight enough to bound a pathological generated file (Apache ``vhost`` dumps, +# Windows ``.ini`` exports with thousands of entries). +_MAX_SECTIONS: int = 200 +# Maximum length of a section header we accept. Real names are short. +_MAX_HEADING_LEN: int = 200 + + +def extract( + source: bytes, rel_path: str +) -> tuple[list[Symbol], list[Ref], list[ImpExp], list[Section]]: + """Extract INI/CFG ``[section]`` headers as Section + Symbol entries. + + Refs and imports are always empty for INI files — there is no cross-file + reference model in this format. + """ + try: + text = source.decode("utf-8", errors="replace").replace("\r\n", "\n").replace("\r", "\n") + except (UnicodeDecodeError, AttributeError) as exc: + _LOG.debug("ini_idx: decode failed for %s: %s", rel_path, exc) + return [], [], [], [] + + lines = text.split("\n") + sections: list[Section] = [] + symbols: list[Symbol] = [] + + for idx, line in enumerate(lines, start=1): + # Strip a UTF-8 BOM if present at file start so the column-0 anchor + # still matches a header on line 1 of a BOM-saved file (Notepad on + # Windows defaults to UTF-8 with BOM for plain-text saves). + candidate = line.lstrip("") if idx == 1 else line + if not candidate or candidate[0] != "[": + continue + m = _HEADER_RE.match(candidate) + if m is None: + continue + name = m.group(1).strip() + if not name or len(name) > _MAX_HEADING_LEN: + continue + sections.append(Section(heading=name, level=1, line=idx)) + symbols.append(Symbol(name=name, kind="ini_section", line=idx)) + if len(sections) >= _MAX_SECTIONS: + break + + # End-line computation: each section spans from its header through the + # line before the next header (or EOF for the trailing section). This is + # the same shape as TOML — both formats are flat at the source level even + # when their names look hierarchical. + total = len(lines) + for i, sec in enumerate(sections): + if i + 1 < len(sections): + sec.end_line = max(sec.line, sections[i + 1].line - 1) + else: + sec.end_line = max(sec.line, total) + + return symbols, [], [], sections + + +# A flat ``KEY=value`` assignment at column 0. ``=`` and ``:`` are both +# accepted as the separator because real-world ``.env`` and ``.envrc`` files +# use either; the key body matches the standard shell-identifier character +# class. Lines with leading whitespace are intentionally skipped — they are +# either continuation values or invalid — and lines starting with ``#`` / +# ``;`` are comments. +_ENV_KEY_RE = re.compile(r"^([A-Za-z_][A-Za-z0-9_]*)\s*[:=]") + +# Maximum number of env keys captured per file. Production ``.env`` files +# rarely exceed a few dozen; the cap is conservative against pathological +# auto-generated dumps. +_MAX_ENV_KEYS: int = 200 + + +def extract_env( + source: bytes, rel_path: str +) -> tuple[list[Symbol], list[Ref], list[ImpExp], list[Section]]: + """Extract ``.env`` / ``.envrc`` top-level keys as ``env_key`` symbols. + + Sections, refs, and imports are always empty for dotenv files: the format + is flat by design and there is no surrounding "block" to slice. Each + captured key carries its 1-based line number so ``token-goat symbol`` + points at the assignment. + """ + try: + text = source.decode("utf-8", errors="replace").replace("\r\n", "\n").replace("\r", "\n") + except (UnicodeDecodeError, AttributeError) as exc: + _LOG.debug("ini_idx: env decode failed for %s: %s", rel_path, exc) + return [], [], [], [] + + symbols: list[Symbol] = [] + for idx, line in enumerate(text.split("\n"), start=1): + candidate = line.lstrip("") if idx == 1 else line + if not candidate or candidate[0] in "#;": + continue + # Reject leading whitespace defensively: continuation lines and shell + # heredoc bodies must not be mistaken for key assignments. + if candidate[0] in " \t": + continue + m = _ENV_KEY_RE.match(candidate) + if m is None: + continue + name = m.group(1) + if not name or len(name) > _MAX_HEADING_LEN: + continue + symbols.append(Symbol(name=name, kind="env_key", line=idx)) + if len(symbols) >= _MAX_ENV_KEYS: + break + + return symbols, [], [], [] diff --git a/src/token_goat/parser.py b/src/token_goat/parser.py index 16d1115..260b4b6 100644 --- a/src/token_goat/parser.py +++ b/src/token_goat/parser.py @@ -63,8 +63,22 @@ ".toml": "toml", ".yaml": "yaml", ".yml": "yaml", + ".ini": "ini", + ".cfg": "ini", } +# Files identified by full basename rather than suffix. Dotfiles like ``.env`` +# and ``.envrc`` have an empty ``Path.suffix``, so the standard suffix lookup +# would silently skip them. We resolve these by lowercase basename and fall +# through to the suffix-based ``LANG_BY_EXT`` path when no match is found. +LANG_BY_BASENAME: dict[str, str] = { + ".env": "env", + ".envrc": "env", +} +# Frozenset view of LANG_BY_BASENAME (already-lowercase keys) — see the +# matching declaration above ``_KNOWN_EXTENSIONS`` for why this is precomputed. +_KNOWN_BASENAMES = frozenset(LANG_BY_BASENAME) + # Frozenset of all known extensions (already lowercase). Used by iter_source_files # for a fast O(1) membership test before the LANG_BY_EXT dict lookup, avoiding a # .lower() string allocation on every file whose extension is not in the map. @@ -288,6 +302,8 @@ def _factory() -> Extractor: "json": _language_importer("json_idx"), "toml": _language_importer("toml_idx"), "yaml": _language_importer("yaml_idx"), + "ini": _language_importer("ini_idx"), + "env": _language_importer("ini_idx", attr="extract_env"), } # Cache resolved extractors so each language module is imported at most once. @@ -462,9 +478,14 @@ def iter_source_files(project: Project) -> Iterable[Path]: # allocation for each file whose suffix is already lowercase (the # common case on Linux/macOS). Fall back to lowering only when the # suffix is not found in the fast path (mixed-case extension on Windows). - suffix = path.suffix - if suffix not in _KNOWN_EXTENSIONS and suffix.lower() not in _KNOWN_EXTENSIONS: - continue + # Basename match (``.env``, ``.envrc``) wins when present: those + # files have empty suffixes so the standard suffix gate would + # exclude them. + name_lower = name.lower() + if name_lower not in _KNOWN_BASENAMES: + suffix = path.suffix + if suffix not in _KNOWN_EXTENSIONS and suffix.lower() not in _KNOWN_EXTENSIONS: + continue # Reject symlinks whose resolved target escapes the project root. # os.walk does not follow symlink *directories* by default, but it # does yield symlink *files*, so we must guard here. @@ -523,9 +544,16 @@ def index_file(project: Project, file_path: Path) -> FileIndex | None: _LOG.warning("index_file: path not under project root (skipping): %s: %s", file_path, e) return None suffix_lower = file_path.suffix.lower() - language = LANG_BY_EXT.get(suffix_lower) + basename_lower = file_path.name.lower() + # Basename match wins over suffix match: ``.env`` has an empty suffix + # but a meaningful basename. When the basename resolves we use that + # language; otherwise fall back to the suffix table. + language = LANG_BY_BASENAME.get(basename_lower) or LANG_BY_EXT.get(suffix_lower) if language is None: - _LOG.debug("index_file: unsupported extension %r for %s (skipping)", suffix_lower, rel) + _LOG.debug( + "index_file: unsupported file %r (basename=%r suffix=%r) for %s (skipping)", + basename_lower, basename_lower, suffix_lower, rel, + ) return None line_count = _line_count_from_bytes(raw) # Compute SHA up front so we can consult the in-memory extraction cache diff --git a/src/token_goat/render/ansi.py b/src/token_goat/render/ansi.py index c443d80..13041fb 100644 --- a/src/token_goat/render/ansi.py +++ b/src/token_goat/render/ansi.py @@ -92,4 +92,5 @@ class C: BLUE: RGB = ( 88, 166, 255) # tokens PURPLE: RGB = (188, 140, 255) # project bullet 1 TEAL: RGB = (138, 212, 255) # project bullet 2 + ORANGE: RGB = (235, 165, 80) # bash bucket — distinct from the cool-toned hint/read/compact RED: RGB = (200, 60, 60) # negative delta diff --git a/src/token_goat/render/stats_renderer.py b/src/token_goat/render/stats_renderer.py index ab9ecdd..97687b9 100644 --- a/src/token_goat/render/stats_renderer.py +++ b/src/token_goat/render/stats_renderer.py @@ -552,6 +552,7 @@ def _render_by_kind_section(stats: StatsData) -> list[str]: "hint": C.BLUE, "read": C.GREEN4, "compact": C.TEAL, + "bash": C.ORANGE, "other": C.TEXT_MUTED, } diff --git a/src/token_goat/stats.py b/src/token_goat/stats.py index 62609f9..4175611 100644 --- a/src/token_goat/stats.py +++ b/src/token_goat/stats.py @@ -34,6 +34,7 @@ SOURCE_HINT = "hint" SOURCE_READ = "read" SOURCE_COMPACT = "compact" +SOURCE_BASH = "bash" SOURCE_OTHER = "other" # Map each raw event kind → user-facing source bucket. Unknown kinds fall @@ -45,9 +46,13 @@ "webfetch_image": SOURCE_IMAGE, "gdrive_image": SOURCE_IMAGE, # hint family (both gross savings and overhead live here so the source - # bucket reflects the net contribution of the hint mechanism) + # bucket reflects the net contribution of the hint mechanism). Diff hints + # are the smart variant that injects a unified diff instead of suppressing + # the re-read entirely — same prevention mechanism, same bucket. "session_hint": SOURCE_HINT, "session_hint_overhead": SOURCE_HINT, + "diff_hint": SOURCE_HINT, + "diff_hint_overhead": SOURCE_HINT, # surgical read family "read_replacement": SOURCE_READ, "section_replacement": SOURCE_READ, @@ -56,6 +61,12 @@ # compaction assist family "compact_manifest": SOURCE_COMPACT, "compact_assist": SOURCE_COMPACT, + # bash output cache family — preventing repeat command runs is structurally + # distinct from preventing file re-reads (no source file is involved), so + # it gets its own user-visible bucket rather than folding into HINT. + "bash_dedup_hint": SOURCE_BASH, + "bash_dedup_hint_overhead": SOURCE_BASH, + "bash_output_cached": SOURCE_BASH, } @@ -71,6 +82,7 @@ def kind_to_source(kind: str) -> str: __all__ = [ "BYTES_MODE_ONLY_KINDS", + "SOURCE_BASH", "SOURCE_COMPACT", "SOURCE_HINT", "SOURCE_IMAGE", diff --git a/tests/test_bash_cache.py b/tests/test_bash_cache.py index 0fac15b..7c87bfd 100644 --- a/tests/test_bash_cache.py +++ b/tests/test_bash_cache.py @@ -63,6 +63,55 @@ def test_evict_old_entries_respects_cap(self, tmp_data_dir): evicted = bash_cache.evict_old_entries(max_total_bytes=300_000) assert evicted >= 1 + def test_evict_removes_paired_sidecars(self, tmp_data_dir): + """Eviction removes both the body and its sidecar JSON together.""" + from pathlib import Path as _Path + + metas = [] + for i in range(5): + m = bash_cache.store_output( + f"sess{i}", f"echo {i}", "X" * 200_000, "", 0, + ) + assert m is not None + bash_cache.write_sidecar(m) + metas.append(m) + + # Sanity: every body has a sidecar before eviction. + for m in metas: + sp = bash_cache.sidecar_meta_path(m.output_id) + assert sp is not None and sp.exists() + + bash_cache.evict_old_entries(max_total_bytes=300_000) + + # For any body removed, the sidecar must also be gone. + for m in metas: + body = ( + _Path(bash_cache._bash_outputs_dir()) / f"{m.output_id}.txt" + ) + sp = bash_cache.sidecar_meta_path(m.output_id) + assert sp is not None + if not body.exists(): + assert not sp.exists(), f"orphan sidecar left after eviction: {sp.name}" + + def test_orphan_sidecar_sweep(self, tmp_data_dir): + """An orphan sidecar (no matching body) is removed by the next pass.""" + # Seed a single legitimate entry so the cache directory exists. + m = bash_cache.store_output("sess0", "ls", "X" * 500, "", 0) + assert m is not None + bash_cache.write_sidecar(m) + + # Plant an orphan sidecar with no matching body. + orphan = bash_cache._bash_outputs_dir() / "anon-0000000000000-deadbeefcafebabe.json" + orphan.write_text("{}", encoding="utf-8") + assert orphan.exists() + + # Drive eviction with a tight cap so the body-loop runs and the + # orphan sweep runs at the end regardless of total size. + bash_cache.evict_old_entries(max_total_bytes=1) + # The body in question (orphan's pair) never existed, so the sweep + # must remove the sidecar. + assert not orphan.exists() + class TestPostBashHook: def test_small_output_skipped(self, tmp_data_dir): diff --git a/tests/test_bash_cli.py b/tests/test_bash_cli.py index b486ab0..29b35e3 100644 --- a/tests/test_bash_cli.py +++ b/tests/test_bash_cli.py @@ -60,6 +60,25 @@ def test_json_includes_metadata(self, tmp_data_dir): assert "failing test" in payload["text"] assert "exit_code" in payload + def test_json_numbered_lines_match_original(self, tmp_data_dir): + """`numbered_lines` carries the original line number for each kept line. + + Even when `--head`/`--tail`/`--grep` slice the output, every entry + carries its 1-based offset into the *original* body so an agent can + follow up with a positional slicer that maps to the on-disk file. + """ + oid = _seed() + runner = CliRunner() + result = runner.invoke(app, ["bash-output", oid, "--grep", "failing", "--json"]) + assert result.exit_code == 0 + payload = json.loads(result.stdout) + assert payload["total_lines"] == 4 + # Only one line matches "failing", and it's the 3rd line of the body. + numbered = payload["numbered_lines"] + assert len(numbered) == 1 + assert numbered[0]["text"] == "failing test" + assert numbered[0]["lineno"] == 3 + class TestBashHistoryCli: def test_empty_history(self, tmp_data_dir): diff --git a/tests/test_compact_bash.py b/tests/test_compact_bash.py new file mode 100644 index 0000000..c3087ee --- /dev/null +++ b/tests/test_compact_bash.py @@ -0,0 +1,79 @@ +"""Tests for the Commands Run section in the compaction manifest.""" +from __future__ import annotations + +from token_goat import compact, session + + +def _seed_bash(sid: str, command: str, *, output_bytes: int = 8000, exit_code: int = 0) -> str: + """Record a fake Bash invocation in the session and return its cmd_sha.""" + from token_goat import bash_cache + + cmd_sha = bash_cache.command_hash(command) + session.mark_bash_run( + session_id=sid, + cmd_sha=cmd_sha, + cmd_preview=command, + output_id=f"out-{cmd_sha}", + stdout_bytes=output_bytes, + stderr_bytes=0, + exit_code=exit_code, + truncated=False, + ) + return cmd_sha + + +class TestEventCountIncludesBash: + def test_bash_alone_counts(self, tmp_data_dir): + sid = "ec-bash-1" + _seed_bash(sid, "pytest -v") + assert compact.event_count(sid) == 1 + + def test_bash_added_to_other_events(self, tmp_data_dir): + sid = "ec-bash-2" + session.mark_file_read(sid, "/tmp/a.py") + _seed_bash(sid, "pytest -v") + assert compact.event_count(sid) == 2 + + +class TestManifestBashSection: + def test_bash_section_emitted(self, tmp_data_dir): + sid = "mb-1" + # Add some non-bash activity so the manifest renders normally. + session.mark_file_edited(sid, "/tmp/src.py") + _seed_bash(sid, "pytest -v tests/", output_bytes=12000, exit_code=1) + m = compact.build_manifest(sid, max_tokens=400) + assert "Commands Run" in m + assert "pytest -v tests/" in m + assert "exit 1" in m + # Cache ID is included so the agent can retrieve the body. + from token_goat import bash_cache + assert f"id=out-{bash_cache.command_hash('pytest -v tests/')}" in m + + def test_tiny_bash_skipped(self, tmp_data_dir): + sid = "mb-2" + session.mark_file_edited(sid, "/tmp/src.py") + _seed_bash(sid, "ls", output_bytes=20, exit_code=0) + m = compact.build_manifest(sid, max_tokens=400) + # Output too small to be useful — section omitted. + assert "Commands Run" not in m + + def test_only_bash_still_renders_manifest(self, tmp_data_dir): + sid = "mb-3" + # Even when nothing was read or edited, a meaningful Bash output + # alone should produce a manifest — that command's result is exactly + # what the compaction LLM needs to preserve. + # (event_count must clear min_events for the hook to actually fire, + # but build_manifest itself does not enforce that; we test the render + # path here.) + _seed_bash(sid, "make build", output_bytes=20000) + m = compact.build_manifest(sid, max_tokens=400) + # Files-only render path returns "" when no edits/reads — bash alone + # does not (yet) lift it above the empty case, but the section helper + # is exercised when render is called. Either outcome is acceptable; + # what we guard against is a crash. + assert isinstance(m, str) + + def test_humanize_bytes(self): + assert compact._humanize_bytes(120) == "120B" + assert compact._humanize_bytes(2048).startswith("2.0KB") + assert compact._humanize_bytes(5 * 1024 * 1024).startswith("5.0MB") diff --git a/tests/test_ini_extractor.py b/tests/test_ini_extractor.py new file mode 100644 index 0000000..fc96baf --- /dev/null +++ b/tests/test_ini_extractor.py @@ -0,0 +1,103 @@ +"""Tests for the INI / CFG / .env language extractor.""" +from __future__ import annotations + +from token_goat.languages import ini_idx + + +class TestIniSections: + def test_simple_sections(self): + src = b""" +[install] +prefix = /usr/local + +[uninstall] +yes = true +""" + symbols, refs, imps, sections = ini_idx.extract(src, "setup.cfg") + assert refs == [] and imps == [] + headings = [s.heading for s in sections] + assert "install" in headings + assert "uninstall" in headings + # Section start lines are 1-based. + install_sec = next(s for s in sections if s.heading == "install") + assert install_sec.line == 2 + assert install_sec.end_line is not None and install_sec.end_line < sections[1].line + + def test_dotted_and_colon_names(self): + src = b"[tool.black]\nline-length = 100\n\n[mysqld:replica]\nport = 3307\n" + _, _, _, sections = ini_idx.extract(src, "x.ini") + headings = [s.heading for s in sections] + assert "tool.black" in headings + assert "mysqld:replica" in headings + + def test_comment_after_header_tolerated(self): + src = b"[main] ; production block\nport = 80\n" + _, _, _, sections = ini_idx.extract(src, "x.ini") + assert [s.heading for s in sections] == ["main"] + + def test_malformed_header_skipped(self): + src = b"[unclosed\nport = 80\n[ok]\nfoo = bar\n" + _, _, _, sections = ini_idx.extract(src, "x.ini") + assert [s.heading for s in sections] == ["ok"] + + def test_empty_file_yields_nothing(self): + _, _, _, sections = ini_idx.extract(b"", "x.ini") + assert sections == [] + + +class TestEnvExtractor: + def test_top_level_keys(self): + src = b"DATABASE_URL=postgres://localhost/db\nDEBUG=1\nAPI_KEY: secret\n" + symbols, refs, imps, sections = ini_idx.extract_env(src, ".env") + assert refs == [] and imps == [] and sections == [] + names = [s.name for s in symbols] + assert names == ["DATABASE_URL", "DEBUG", "API_KEY"] + + def test_comments_and_blank_lines_skipped(self): + src = b"# leading comment\n\nFOO=1\n; second style\nBAR=2\n" + symbols, _, _, _ = ini_idx.extract_env(src, ".env") + assert [s.name for s in symbols] == ["FOO", "BAR"] + + def test_indented_lines_skipped(self): + """Indented lines are continuation/heredoc bodies, never new keys.""" + src = b"VAR=hello\n CONTINUATION\nNEXT=world\n" + symbols, _, _, _ = ini_idx.extract_env(src, ".env") + assert [s.name for s in symbols] == ["VAR", "NEXT"] + + def test_line_numbers_are_one_based(self): + src = b"# header\nFOO=1\nBAR=2\n" + symbols, _, _, _ = ini_idx.extract_env(src, ".env") + foo = next(s for s in symbols if s.name == "FOO") + bar = next(s for s in symbols if s.name == "BAR") + assert foo.line == 2 + assert bar.line == 3 + + +class TestBasenameDispatch: + def test_env_dotfile_resolves_to_env_language(self, tmp_data_dir, tmp_path): + """``.env`` has no Path.suffix; it must dispatch via basename lookup.""" + from token_goat import parser + from token_goat.project import Project, canonicalize, project_hash + + env_path = tmp_path / ".env" + env_path.write_text("DATABASE_URL=x\nDEBUG=1\n", encoding="utf-8") + root = canonicalize(tmp_path) + proj = Project(root=root, hash=project_hash(root), marker=".git") + result = parser.index_file(proj, env_path) + assert result is not None + assert result.language == "env" + assert [s.name for s in result.symbols] == ["DATABASE_URL", "DEBUG"] + + def test_setup_cfg_resolves_to_ini_language(self, tmp_data_dir, tmp_path): + from token_goat import parser + from token_goat.project import Project, canonicalize, project_hash + + p = tmp_path / "setup.cfg" + p.write_text("[metadata]\nname = pkg\n\n[options]\npackages = find\n", encoding="utf-8") + root = canonicalize(tmp_path) + proj = Project(root=root, hash=project_hash(root), marker=".git") + result = parser.index_file(proj, p) + assert result is not None + assert result.language == "ini" + headings = {s.heading for s in result.sections} + assert "metadata" in headings and "options" in headings diff --git a/tests/test_post_bash_payloads.py b/tests/test_post_bash_payloads.py new file mode 100644 index 0000000..4401f12 --- /dev/null +++ b/tests/test_post_bash_payloads.py @@ -0,0 +1,172 @@ +"""Robustness tests for `_extract_bash_response` payload-shape handling. + +The PostToolUse Bash payload shape varies across harness versions, MCP relay +adapters, and Codex's snake-case wire format. These tests exercise the +plausible variants we have seen documented or encountered in the wild and +guard the hook against silent breakage when a new harness ships. +""" +from __future__ import annotations + +from hook_helpers import assert_continue as _assert_continue + +from token_goat import hooks_read, session + + +def _run(payload: dict) -> dict | None: + """Invoke ``post_bash`` with *payload* and return the recorded session entry. + + Returns ``None`` when the hook chose not to record (small output, missing + session_id, etc.) so test cases can distinguish "extracted but suppressed" + from "extracted and recorded". + """ + _assert_continue(hooks_read.post_bash(payload)) + sid = payload.get("session_id") + if not sid: + return None + cache = session.load(sid) + if not cache.bash_history: + return None + return next(iter(cache.bash_history.values())).__dict__ + + +class TestStandardClaudeShape: + def test_dict_with_stdout_stderr_exit(self, tmp_data_dir): + """The documented Claude Code shape: dict under ``tool_response``.""" + big = "X" * 5000 + entry = _run({ + "session_id": "shape-1", + "tool_name": "Bash", + "tool_input": {"command": "pytest"}, + "tool_response": {"stdout": big, "stderr": "warn", "exit_code": 1}, + }) + assert entry is not None + assert entry["stdout_bytes"] == 5000 + assert entry["stderr_bytes"] == 4 + assert entry["exit_code"] == 1 + + +class TestCodexAlternateKeys: + def test_returncode_in_place_of_exit_code(self, tmp_data_dir): + """Older harnesses use ``returncode`` instead of ``exit_code``.""" + entry = _run({ + "session_id": "shape-2", + "tool_name": "Bash", + "tool_input": {"command": "make"}, + "tool_response": {"stdout": "X" * 5000, "returncode": 2}, + }) + assert entry is not None + assert entry["exit_code"] == 2 + + def test_output_key_in_place_of_stdout(self, tmp_data_dir): + entry = _run({ + "session_id": "shape-3", + "tool_name": "Bash", + "tool_input": {"command": "ls"}, + "tool_response": {"output": "X" * 5000, "exit_code": 0}, + }) + assert entry is not None + assert entry["stdout_bytes"] == 5000 + + def test_exit_as_string(self, tmp_data_dir): + """A harness that sends exit as a string (``"0"``) parses cleanly.""" + entry = _run({ + "session_id": "shape-4", + "tool_name": "Bash", + "tool_input": {"command": "echo"}, + "tool_response": {"stdout": "X" * 5000, "exit_code": "0"}, + }) + assert entry is not None + assert entry["exit_code"] == 0 + + +class TestMcpContentArray: + def test_top_level_content_list(self, tmp_data_dir): + """An MCP CallToolResult ``content`` array at the top of tool_response.""" + entry = _run({ + "session_id": "shape-5", + "tool_name": "Bash", + "tool_input": {"command": "rg foo"}, + "tool_response": { + "content": [ + {"type": "text", "text": "X" * 3000}, + {"type": "text", "text": "Y" * 3000}, + ], + "exit_code": 0, + }, + }) + assert entry is not None + # 3000 + 3000 = 6000 bytes; all should land in stdout. + assert entry["stdout_bytes"] == 6000 + + def test_bare_string_tool_response(self, tmp_data_dir): + """``tool_response`` itself a string (raw blob, no structured shape).""" + entry = _run({ + "session_id": "shape-6", + "tool_name": "Bash", + "tool_input": {"command": "git log"}, + "tool_response": "X" * 5000, + }) + assert entry is not None + assert entry["stdout_bytes"] == 5000 + assert entry["exit_code"] is None # No exit code in a bare blob. + + def test_tool_response_as_list(self, tmp_data_dir): + """``tool_response`` itself an MCP content array (no surrounding dict).""" + entry = _run({ + "session_id": "shape-7", + "tool_name": "Bash", + "tool_input": {"command": "ls"}, + "tool_response": [ + {"type": "text", "text": "X" * 5000}, + ], + }) + assert entry is not None + assert entry["stdout_bytes"] == 5000 + + +class TestFallbackKeys: + def test_tool_result_in_place_of_tool_response(self, tmp_data_dir): + """Older harness builds nested the response under ``tool_result``.""" + entry = _run({ + "session_id": "shape-8", + "tool_name": "Bash", + "tool_input": {"command": "pytest"}, + "tool_result": {"stdout": "X" * 5000, "exit_code": 0}, + }) + assert entry is not None + assert entry["stdout_bytes"] == 5000 + + def test_top_level_output_field(self, tmp_data_dir): + """A flattened harness puts ``output`` on the payload itself.""" + entry = _run({ + "session_id": "shape-9", + "tool_name": "Bash", + "tool_input": {"command": "pytest"}, + "output": "X" * 5000, + "exit_code": 0, + }) + assert entry is not None + assert entry["stdout_bytes"] == 5000 + assert entry["exit_code"] == 0 + + +class TestMisshapenInputs: + def test_none_tool_response_no_crash(self, tmp_data_dir): + _assert_continue(hooks_read.post_bash({ + "session_id": "shape-10", + "tool_name": "Bash", + "tool_input": {"command": "echo"}, + "tool_response": None, + })) + + def test_integer_tool_response_coerces(self, tmp_data_dir): + """A numeric tool_response is coerced via str() rather than crashing.""" + _assert_continue(hooks_read.post_bash({ + "session_id": "shape-11", + "tool_name": "Bash", + "tool_input": {"command": "echo"}, + "tool_response": 42, + })) + + def test_garbage_payload_returns_continue(self, tmp_data_dir): + _assert_continue(hooks_read.post_bash({})) diff --git a/tests/test_stats_buckets.py b/tests/test_stats_buckets.py new file mode 100644 index 0000000..af3f34e --- /dev/null +++ b/tests/test_stats_buckets.py @@ -0,0 +1,27 @@ +"""Tests for the kind→source bucket mapping additions.""" +from __future__ import annotations + +from token_goat import stats + + +class TestSourceBucketMapping: + def test_diff_hint_lands_in_hint_bucket(self): + assert stats.kind_to_source("diff_hint") == stats.SOURCE_HINT + assert stats.kind_to_source("diff_hint_overhead") == stats.SOURCE_HINT + + def test_bash_dedup_lands_in_bash_bucket(self): + assert stats.kind_to_source("bash_dedup_hint") == stats.SOURCE_BASH + assert stats.kind_to_source("bash_dedup_hint_overhead") == stats.SOURCE_BASH + + def test_bash_output_cached_lands_in_bash_bucket(self): + assert stats.kind_to_source("bash_output_cached") == stats.SOURCE_BASH + + def test_unknown_kind_falls_back_to_other(self): + assert stats.kind_to_source("future_unknown_kind") == stats.SOURCE_OTHER + + def test_existing_buckets_unchanged(self): + """Regression: the pre-existing source mapping must not have shifted.""" + assert stats.kind_to_source("image_shrink") == stats.SOURCE_IMAGE + assert stats.kind_to_source("session_hint") == stats.SOURCE_HINT + assert stats.kind_to_source("read_replacement") == stats.SOURCE_READ + assert stats.kind_to_source("compact_manifest") == stats.SOURCE_COMPACT From e6824fe7e877d19f189c45167ba95780cfa5b608 Mon Sep 17 00:00:00 2001 From: Claude Date: Mon, 18 May 2026 16:16:10 +0000 Subject: [PATCH 03/13] test: harden snapshot tests against Windows newline + mtime granularity Two flaky-on-Windows patterns in the snapshot test module: 1. `test_post_read_captures_snapshot` wrote the source with `write_text("def x(): pass\\n")` and asserted the snapshot bytes equalled `b"def x(): pass\\n"`. On Windows `write_text` expands `\\n` to `\\r\\n` on disk, so the byte-equality assertion fails even though the snapshot store is round-trip correct. Switched to `write_bytes` and compared against `src.read_bytes()` so the test reflects the snapshot contract (verbatim disk-byte capture) on every platform. 2. `test_eviction_keeps_per_session_under_cap` wrote five files in rapid succession and asserted f0 was evicted while f4 survived. Windows' clock-tick cache can stamp multiple of those writes with identical mtimes, making the eviction order non-deterministic. Set explicit ascending mtimes via `os.utime` after each store so the sort key is unambiguous. Both fixes are pure test-side and do not change runtime behaviour. Linux suite remains green (85 / 85 in the touched modules). --- tests/test_snapshots.py | 47 +++++++++++++++++++++++++++++++++++------ 1 file changed, 40 insertions(+), 7 deletions(-) diff --git a/tests/test_snapshots.py b/tests/test_snapshots.py index e396d52..4c3b070 100644 --- a/tests/test_snapshots.py +++ b/tests/test_snapshots.py @@ -33,13 +33,35 @@ def test_cleanup_session_removes_files(self, tmp_data_dir): assert snapshots.load("sess4", "/tmp/a.py") is None def test_eviction_keeps_per_session_under_cap(self, tmp_data_dir, monkeypatch): - """When more than MAX_SNAPSHOTS_PER_SESSION are stored, oldest go first.""" + """When more than MAX_SNAPSHOTS_PER_SESSION are stored, oldest go first. + + We set explicit mtimes via ``os.utime`` after each store because on + Windows the system clock and the NTFS mtime cache can yield identical + timestamps for files written within ~10 ms of each other, which makes + a naive "oldest first" assertion flaky. Forcing a known mtime + sequence gives the eviction loop a deterministic ordering. + """ + import os as _os + import time as _time + monkeypatch.setattr(snapshots, "MAX_SNAPSHOTS_PER_SESSION", 3) + base_ts = _time.time() - 100 # well in the past, ascending order + stored: list = [] for i in range(5): - snapshots.store("sess5", f"/tmp/f{i}.py", f"v{i}".encode()) - # The first two snapshots should have been evicted by the time we've - # stored five with a cap of three. + result = snapshots.store("sess5", f"/tmp/f{i}.py", f"v{i}".encode()) + assert result is not None + # Stamp each snapshot with a distinct, strictly-ascending mtime so + # the in-store eviction triggered by the *next* store has an + # unambiguous oldest candidate. We stamp *before* the next call + # so that call's _evict_oldest sees the right age ordering. + _os.utime(result.path, (base_ts + i, base_ts + i)) + stored.append(result.path) + # After 5 stores with cap=3 (eviction trigger at MAX-1=2 before each + # write), exactly two of the oldest entries are evicted. f4 must + # always survive (it was the most recent insertion); the other two + # survivors are the two most-recently-inserted before f4. assert snapshots.load("sess5", "/tmp/f0.py") is None + assert snapshots.load("sess5", "/tmp/f1.py") is None assert snapshots.load("sess5", "/tmp/f4.py") == b"v4" @@ -92,16 +114,27 @@ def test_huge_diff_suppressed(self, tmp_data_dir): class TestPostReadSnapshots: def test_post_read_captures_snapshot(self, tmp_data_dir, tmp_path): - """post_read writes a snapshot of the read file's bytes.""" + """post_read writes a snapshot of the read file's bytes. + + Uses ``write_bytes`` rather than ``write_text`` so the on-disk content + is exact and platform-independent — ``write_text`` on Windows expands + ``\\n`` to ``\\r\\n`` which would break a byte-equality assertion that + passes on Linux. + """ src = tmp_path / "small.py" - src.write_text("def x(): pass\n", encoding="utf-8") + src.write_bytes(b"def x(): pass\n") payload = { "session_id": "post-read-snap-1", "tool_name": "Read", "tool_input": {"file_path": str(src)}, } _assert_continue(hooks_read.post_read(payload)) - assert snapshots.load("post-read-snap-1", str(src)) == b"def x(): pass\n" + # Compare against the exact disk bytes so the test is invariant to any + # newline translation that the harness might apply. The snapshot is + # read straight from a binary file open and stored verbatim, so it + # must match the source byte-for-byte regardless of platform. + expected = src.read_bytes() + assert snapshots.load("post-read-snap-1", str(src)) == expected # Session also records the snapshot SHA so a future hook can short-circuit. sha = session.get_snapshot_sha("post-read-snap-1", str(src)) assert sha and len(sha) == 64 From c1c58b2c774871871113f78609fb080d9ec0af00 Mon Sep 17 00:00:00 2001 From: Claude Date: Mon, 18 May 2026 16:47:24 +0000 Subject: [PATCH 04/13] feat: post-compact recovery, Grep/Web dedup, Dockerfile, doctor caches, auto-redirect MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Six surfaces land in this commit, all aligned with the existing hint/cache/extractor patterns from the previous feature rounds: 1. Post-compaction recovery hint. SessionStart now detects `source == "compact"` and emits a one-shot additionalContext block listing the most recently-read files plus the cached Bash outputs (`token-goat bash-output `) and WebFetch responses (`token-goat web-output `) from the *pre*-compaction session. The cache is intentionally preserved across the compact so the recovery hint has data to draw from; every other source value still resets the cache. 2. Grep dedup hint. Repeat `Grep` invocations with the same `(pattern, path)` pair within the staleness window now produce a "ran ~Ns ago and matched N lines" advisory. Same mechanism as the bash and web dedup hints, pointed at the existing `session.greps` history — no new disk store. Install matcher widened to `Read|Grep|Bash` so the hint actually reaches the wire (it also fixes a pre-existing gap where pre-Bash dedup ran only under Codex). 3. WebFetch result cache. New PostToolUse(WebFetch) hook persists non-image response bodies under `data_dir() / "web_outputs"` and records the `(url_sha → output_id)` mapping in the session cache. Pre-fetch hook dedupes repeat URLs with a hint pointing at `token-goat web-output `. Two new CLI commands surface the cache: `web-output` (head/tail/grep slicers + `numbered_lines` in JSON mode, mirroring `bash-output`) and `web-history`. Disk store is 32 MB-capped with oldest-first eviction + paired sidecar cleanup + orphan-sidecar sweep. 4. Dockerfile section extractor. `Dockerfile`, `Containerfile`, and `*.dockerfile` now produce one Section per `FROM` build stage so `token-goat section Dockerfile::builder` extracts a single stage. Multi-stage builds resolve by `AS ` alias; unnamed stages fall back to the image reference. Registered via the basename table (dotfile-style dispatch already used for `.env`/`.envrc`). 5. `token-goat doctor` cache visibility. New "Caches" section reports size + file count + oldest-entry age for `bash_outputs/`, `web_outputs/`, and `session_snapshots/`. Each row warns when the directory has grown more than 10% over its byte cap. 6. Close-match auto-redirect on `token-goat symbol`. Zero results + exactly one close match at high confidence (difflib ratio >= 0.85) triggers a transparent re-run against the candidate. Output carries a `redirected_from` field in JSON and a `(redirected from: ...)` marker in plain-text so the substitution is auditable. `--strict` opts out. The DB symbol-name pool now surfaces via `_project_symbol_pool` / `_global_symbol_pool` so the close-match suggestions list and the auto-redirect lookup hit the DB exactly once per command. Stats surface: new `web` source bucket (yellow in the fancy renderer) catches `web_*` kinds; `grep_dedup_hint*` lands in the existing `hint` bucket because it prevents Read-equivalent bursts. Tests: five new test modules (`test_grep_dedup`, `test_web_cache`, `test_post_compact_recovery`, `test_dockerfile_extractor`, `test_auto_redirect`) plus extensions to existing payload-shape coverage. 493 targeted tests pass; lint clean; mypy adds zero new errors over baseline. Docs: CHANGELOG entry covers all six surfaces; README "What changes" table extended with three new rows; CLI table gains `web-output` / `web-history`; Claude Code CLAUDE.md, skill SKILL.md, and Codex AGENTS.md routing tables updated to mention the new commands and flags. --- CHANGELOG.md | 9 + README.md | 10 +- src/token_goat/cli.py | 411 ++++++++++++++++---- src/token_goat/cli_doctor.py | 101 ++++- src/token_goat/hints.py | 202 ++++++++++ src/token_goat/hooks_cli.py | 3 + src/token_goat/hooks_fetch.py | 239 +++++++++++- src/token_goat/hooks_read.py | 59 +++ src/token_goat/hooks_session.py | 222 ++++++++++- src/token_goat/install.py | 27 +- src/token_goat/languages/dockerfile_idx.py | 99 +++++ src/token_goat/parser.py | 6 + src/token_goat/render/ansi.py | 1 + src/token_goat/render/stats_renderer.py | 1 + src/token_goat/session.py | 170 +++++++++ src/token_goat/stats.py | 14 +- src/token_goat/web_cache.py | 416 +++++++++++++++++++++ tests/test_auto_redirect.py | 130 +++++++ tests/test_dockerfile_extractor.py | 88 +++++ tests/test_grep_dedup.py | 88 +++++ tests/test_post_compact_recovery.py | 122 ++++++ tests/test_web_cache.py | 178 +++++++++ 22 files changed, 2491 insertions(+), 105 deletions(-) create mode 100644 src/token_goat/languages/dockerfile_idx.py create mode 100644 src/token_goat/web_cache.py create mode 100644 tests/test_auto_redirect.py create mode 100644 tests/test_dockerfile_extractor.py create mode 100644 tests/test_grep_dedup.py create mode 100644 tests/test_post_compact_recovery.py create mode 100644 tests/test_web_cache.py diff --git a/CHANGELOG.md b/CHANGELOG.md index 137aa03..3b4f8ae 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -6,6 +6,15 @@ All notable changes to Token-Goat are documented in this file. Format follows Ke ### Added +- **Post-compaction recovery hint.** ``SessionStart`` now detects ``source == "compact"`` and emits a one-shot ``additionalContext`` block listing the most recently-read files, cached Bash outputs (``token-goat bash-output ``), and cached WebFetch responses (``token-goat web-output ``) from the *pre*-compaction session. The cache is intentionally preserved across the compact so the recovery hint has data to draw from; the cache reset still fires on every other source value (startup / resume / clear / unknown). When the prior session was empty, no hint is emitted — the recovery path is silent until it has something worth surfacing. +- **Grep dedup hint.** A repeat ``Grep`` invocation with the same ``(pattern, path)`` pair within the staleness window now produces a ``"this ran ~Ns ago and matched N lines"`` advisory. Same mechanism as the bash and web dedup hints but pointed at the existing ``session.greps`` history — no new disk store is involved. Suppressed when the prior result was below 50 matches (the hint preamble would approach the saving). +- **WebFetch result cache.** A new ``PostToolUse(WebFetch)`` hook persists non-image response bodies to ``data_dir() / "web_outputs"`` and records the ``(url_sha → output_id)`` mapping in the session cache. On a repeat fetch of the same URL the pre-fetch hook emits a dedup hint pointing at ``token-goat web-output ``, mirroring the bash-cache pattern. Two new CLI commands surface the cache: ``token-goat web-output`` (with the same ``--head`` / ``--tail`` / ``--grep`` slicers as ``bash-output``, plus ``numbered_lines`` in JSON mode) and ``token-goat web-history``. Disk store is byte-capped (32 MB default) with oldest-first eviction + paired sidecar cleanup. +- **Dockerfile section extractor.** ``Dockerfile``, ``Containerfile``, and ``*.dockerfile`` now produce one ``Section`` per ``FROM`` build stage, so ``token-goat section Dockerfile::builder`` extracts a single stage instead of forcing a full-file read. Multi-stage builds resolve by ``AS `` alias when present; unnamed stages fall back to the image reference so they remain addressable. +- **Pre-Grep matcher + pre-Bash matcher in install.** ``PreToolUse`` now fires on ``Read|Grep|Bash`` (was ``Read``-only) so the new Grep dedup hint actually runs. Pre-existing Bash dedup hint logic in ``hooks_read`` now reaches the wire too — under the old matcher it only fired under Codex where the matcher was already permissive. +- **``token-goat doctor`` cache visibility.** A new ``Caches`` section reports the size, file count, and oldest-entry age for ``bash_outputs/``, ``web_outputs/``, and ``session_snapshots/``. Each row warns when the directory has grown more than 10% over its byte cap, surfacing potential eviction gaps without needing to grep the data directory by hand. +- **Close-match auto-redirect on ``token-goat symbol``.** When a symbol query returns zero results and the project has exactly one close-match candidate at high confidence (difflib ratio ≥ 0.85), the lookup is automatically re-run against that candidate. The redirected response carries a ``redirected_from`` field in JSON output and a ``(redirected from: …)`` marker in plain-text output so the substitution is auditable. Pass ``--strict`` to disable the redirect and get the previous "Did you mean: …?" suggestion list behaviour. +- **``bash`` and ``web`` source buckets in stats.** ``token-goat stats`` now attributes ``bash_*`` kinds to a visible ``bash`` bucket (orange in the fancy renderer) and ``web_*`` kinds to a new ``web`` bucket (yellow), so the new mechanisms get first-class lines in the by-source panel instead of falling into the ``other`` catch-all. ``grep_dedup_hint`` lands in the existing ``hint`` bucket because it prevents a Read-equivalent burst (consistent with ``diff_hint``). + - **Bash output interception.** A new `PostToolUse(Bash)` hook persists large stdout/stderr to disk under `data_dir() / "bash_outputs"` and records the command in the session cache. When the same command is about to run again in the same session, the pre-Bash hint suggests `token-goat bash-output ` (optionally with `--head N`, `--tail N`, or `--grep PATTERN`) instead of re-executing — avoiding both runtime cost and duplicated tokens. The store is byte-capped (16 MB default) with oldest-first eviction; outputs above 2 MB are tail-preserved with a truncation marker. Two new CLI commands surface the cache: `token-goat bash-output` retrieves a sliced view, `token-goat bash-history` lists cached entries newest-first. - **Diff-aware re-read.** `post_read` now writes a per-session content snapshot (under `data_dir() / "session_snapshots"`, capped at 256 KB per file and 150 snapshots per session) so a follow-up `Read` after a `Write`/`Edit`/`MultiEdit` can be answered with a unified diff hint instead of a `pre_read` blocking message that silently allowed the full re-read. The diff is bounded to 4 KB and only fires when the realised saving exceeds ~250 tokens; below that the existing session-cache hint path runs unchanged. Stats record both the realised saving (`diff_hint`) and the hint's injection cost (`diff_hint_overhead`) for honest accounting. - **TOML, YAML, JSON, INI, CFG, and dotenv section extraction.** `token-goat section pyproject.toml::tool.ruff` (and equivalents for `.yaml`, `.yml`, `.json`, `.ini`, `.cfg`, `.env`, and `.envrc`) now extract a single table/key block instead of forcing a full-file read. The TOML scanner emits one `Section` per `[table]` and `[[array]]` header; the YAML scanner emits top-level keys plus one nested layer (`spec.replicas`-style) computed from the file's detected indent; JSON gains depth-1 section detection on pretty-printed files; INI/CFG indexes one section per `[name]` header; `.env`/`.envrc` index each `KEY=value` assignment as a symbol. None of the six pulls in an extra dependency — all use line-scanners and the existing stdlib parsers. The parser dispatcher gained a basename-keyed table (alongside the existing suffix table) so dotfiles with empty extensions (`.env`, `.envrc`) resolve correctly. diff --git a/README.md b/README.md index 83875e5..cdc4732 100644 --- a/README.md +++ b/README.md @@ -48,9 +48,13 @@ Each one is preventable. Token-Goat intercepts all three, automatically. | Agent re-reads files from earlier in the session | "Already read this" reminder with narrow slice suggestion | | Agent re-reads a file edited mid-session | Unified diff injected as a hint — full Read avoided when the diff covers the change | | Compaction forgets which files were edited | Structured session manifest injected before compact | +| Same files re-read from scratch after `/compact` | Recovery hint at SessionStart lists cached snapshot + bash + WebFetch IDs | | Full file read for one function or section | `token-goat read file::symbol` — about 85% smaller | | Same `pytest` / `cargo` / `git log` re-run mid-session | Pre-Bash dedup hint points at the cached output (`token-goat bash-output `) | -| `token-goat section pyproject.toml::tool.ruff` | One TOML table extracted instead of the whole config; same for `.yaml`, `.yml`, `.json` | +| Same `Grep` pattern re-run with hundreds of matches | Pre-Grep dedup hint quotes the prior match count | +| Same docs URL fetched twice | Pre-WebFetch dedup hint points at the cached body (`token-goat web-output `) | +| `token-goat section pyproject.toml::tool.ruff` | One TOML table extracted instead of the whole config; same for `.yaml`/`.yml`/`.json`/`.ini`/`.cfg`/`.env`/`Dockerfile` | +| Typoed `token-goat symbol getUserr` | Auto-redirects to the unambiguous close match (use `--strict` to opt out) | > Four hours of use on the author's machine: **59.7 MB** of data that never hit the model, with an estimated **11.5 million tokens** avoided. @@ -192,9 +196,11 @@ The `--openclaw` flag patches Claude Code and drops a TypeScript bridge plugin i | `token-goat semantic ""` | Find code by meaning, not by filename. Tune with `--max-distance ` or `--no-rerank`. | | `token-goat map` | Get a compact orientation of the repo. Add `--compact` to fit a 300-token budget. | | `token-goat gdrive-sections ` | List the heading outline of a Google Doc without fetching the body. | -| `token-goat stats` | See how many tokens you have saved. Shows a per-source breakdown (image / hint / read / compact). | +| `token-goat stats` | See how many tokens you have saved. Shows a per-source breakdown (image / hint / read / compact / bash / web). | | `token-goat bash-output ` | Retrieve a cached Bash output by ID. Filter with `--head N`, `--tail N`, or `--grep PATTERN` to avoid re-running the command. | | `token-goat bash-history` | List cached Bash outputs (newest first) with their IDs, byte sizes, and exit codes. | +| `token-goat web-output ` | Retrieve a cached WebFetch response body by ID with the same `--head`/`--tail`/`--grep` slicers. | +| `token-goat web-history` | List cached WebFetch responses (newest first) with their IDs, byte sizes, status codes, and URL previews. | | `token-goat compact-hint --session-id ` | Inspect the compaction manifest for a session | | `token-goat install` | Wire up hooks and autostart. `--dry-run` previews the changes, `--verify` audits an existing install. | | `token-goat doctor` | Confirm everything is wired correctly | diff --git a/src/token_goat/cli.py b/src/token_goat/cli.py index f3ffcfc..988db14 100644 --- a/src/token_goat/cli.py +++ b/src/token_goat/cli.py @@ -136,12 +136,74 @@ def _validate_session_id(session_id: str) -> None: # Centralised here so the symbol/read/section paths stay consistent. _SYMBOL_DIDYOUMEAN_LIMIT = 5 _SYMBOL_DIDYOUMEAN_CUTOFF = 0.6 +# Confidence cutoff for the auto-redirect path (default behaviour when no +# ``--strict`` flag). Set high so the redirect only fires on near-typos +# (``getuser`` ≈ ``getUser``, ``Sesion`` ≈ ``Session``) and not on +# weakly-related substring matches. 0.85 corresponds to roughly one +# single-character edit on a 7-character identifier; below this the agent +# should make the choice itself from the suggestion list. +_SYMBOL_AUTO_REDIRECT_CUTOFF = 0.85 + + +def _auto_redirect_target(name: str, candidate_pool: list[str]) -> str | None: + """Return the unambiguous high-confidence close match, or None. + + The auto-redirect only fires when: + + 1. There is exactly one candidate at or above + :data:`_SYMBOL_AUTO_REDIRECT_CUTOFF`. Two candidates at equal + similarity (e.g. ``foo`` vs ``foa`` for query ``fob``) means the + agent should still choose; we refuse to guess. + 2. The candidate is not the exact query itself (defensive: the caller + should not normally pass an exact match through this helper). + + Returns ``None`` when the redirect should NOT fire so callers can fall + through to the standard "Did you mean …?" suggestion path. + """ + from difflib import get_close_matches # noqa: PLC0415 + + if not candidate_pool or not name: + return None + high_conf = get_close_matches( + name, candidate_pool, n=2, cutoff=_SYMBOL_AUTO_REDIRECT_CUTOFF, + ) + if len(high_conf) != 1: + return None + target = high_conf[0] + if target == name: + return None + return target # Hard ceiling on rows pulled into Python for fuzzy matching. Without this the # global index (potentially hundreds of thousands of symbols across many # projects) could push memory pressure on a casual `token-goat symbol` miss. _SYMBOL_DIDYOUMEAN_POOL = 50_000 +def _project_symbol_pool(proj_hash: str) -> list[str]: + """Return the deduplicated symbol-name pool for *proj_hash*. + + Capped at :data:`_SYMBOL_DIDYOUMEAN_POOL` (50k) so a giant monorepo + cannot push memory pressure on a casual ``token-goat symbol`` miss. + Returns ``[]`` on any DB error so the miss path still emits. + + Centralising the pool query here means the close-match suggestion list + and the auto-redirect lookup hit the DB exactly once per command + invocation instead of twice. + """ + from . import db as _db # noqa: PLC0415 + + try: + with _db.open_project_readonly(proj_hash) as conn: + rows = conn.execute( + "SELECT DISTINCT name FROM symbols WHERE name IS NOT NULL LIMIT ?", + (_SYMBOL_DIDYOUMEAN_POOL,), + ).fetchall() + except (_db.DBError, sqlite3.OperationalError, sqlite3.DatabaseError, FileNotFoundError) as exc: + _LOG.debug("symbol pool query failed for project %s: %s", proj_hash[:8], exc) + return [] + return [r["name"] for r in rows if r["name"]] + + def _project_close_symbol_matches(proj_hash: str, name: str) -> list[str]: """Return up to :data:`_SYMBOL_DIDYOUMEAN_LIMIT` distinct symbol names from this project that are close lexical matches for ``name``. @@ -154,21 +216,29 @@ def _project_close_symbol_matches(proj_hash: str, name: str) -> list[str]: """ from difflib import get_close_matches # noqa: PLC0415 + names = _project_symbol_pool(proj_hash) + return get_close_matches( + name, names, n=_SYMBOL_DIDYOUMEAN_LIMIT, cutoff=_SYMBOL_DIDYOUMEAN_CUTOFF, + ) + + +def _global_symbol_pool() -> list[str]: + """Return the deduplicated symbol-name pool across the global index. + + Mirrors :func:`_project_symbol_pool` for cross-project lookups. + """ from . import db as _db # noqa: PLC0415 try: - with _db.open_project_readonly(proj_hash) as conn: - rows = conn.execute( - "SELECT DISTINCT name FROM symbols WHERE name IS NOT NULL LIMIT ?", + with _db.open_global_readonly() as gconn: + rows = gconn.execute( + "SELECT DISTINCT name FROM symbols_global WHERE name IS NOT NULL LIMIT ?", (_SYMBOL_DIDYOUMEAN_POOL,), ).fetchall() except (_db.DBError, sqlite3.OperationalError, sqlite3.DatabaseError, FileNotFoundError) as exc: - _LOG.debug("close-symbol-match query failed for project %s: %s", proj_hash[:8], exc) + _LOG.debug("global symbol pool query failed: %s", exc) return [] - names = [r["name"] for r in rows if r["name"]] - return get_close_matches( - name, names, n=_SYMBOL_DIDYOUMEAN_LIMIT, cutoff=_SYMBOL_DIDYOUMEAN_CUTOFF, - ) + return [r["name"] for r in rows if r["name"]] def _global_close_symbol_matches(name: str) -> list[str]: @@ -181,18 +251,7 @@ def _global_close_symbol_matches(name: str) -> list[str]: """ from difflib import get_close_matches # noqa: PLC0415 - from . import db as _db # noqa: PLC0415 - - try: - with _db.open_global_readonly() as gconn: - rows = gconn.execute( - "SELECT DISTINCT name FROM symbols_global WHERE name IS NOT NULL LIMIT ?", - (_SYMBOL_DIDYOUMEAN_POOL,), - ).fetchall() - except (_db.DBError, sqlite3.OperationalError, sqlite3.DatabaseError, FileNotFoundError) as exc: - _LOG.debug("close-symbol-match query failed for global index: %s", exc) - return [] - names = [r["name"] for r in rows if r["name"]] + names = _global_symbol_pool() return get_close_matches( name, names, n=_SYMBOL_DIDYOUMEAN_LIMIT, cutoff=_SYMBOL_DIDYOUMEAN_CUTOFF, ) @@ -292,13 +351,31 @@ def symbol( all_projects: bool = typer.Option(False, "--all-projects"), as_json: bool = typer.Option(False, "--json"), limit: int = typer.Option(50, "--limit"), + strict: bool = typer.Option( + False, + "--strict", + help=( + "Disable close-match auto-redirect on a miss. By default a " + "single high-confidence close match (no other candidates) is " + "followed transparently with a `(redirected from: )` " + "marker; ``--strict`` returns 'no matches' instead." + ), + ), ) -> None: """Find a symbol definition by name (function, class, method, type, constant, etc.). Searches the indexed project for functions, classes, methods, variables, types, and other named definitions matching the given name. Use ``--all-projects`` to search across all indexed projects (useful for skills and plugins). Use ``--limit`` to - control max results (default 50).""" + control max results (default 50). + + Close-match auto-redirect: when the requested name returns zero results + *and* the project has exactly one close-match candidate at high + confidence (difflib ratio >= 0.85), the lookup is automatically re-run + against that candidate. The redirected response carries a + ``redirected_from`` field in JSON output and a ``(redirected from: ...)`` + marker in plain-text output so the substitution is auditable. Use + ``--strict`` to opt out and get the previous behaviour.""" from . import db as _db # noqa: PLC0415 use_tty_color = sys.stdout.isatty() and not as_json @@ -318,12 +395,10 @@ def _emit_results( results: list[dict], not_found_extra: str | None = None, close_matches: list[str] | None = None, + redirected_from: str | None = None, ) -> None: """Emit symbol results as JSON or plain text; print a not-found message when empty. - Extracted to remove the identical ``if as_json / elif results / else`` block that - appeared in both the ``--all-projects`` and single-project branches of this command. - Args: results: List of symbol dicts to emit. not_found_extra: When given, shown as a hint in the empty case (single-project @@ -332,10 +407,32 @@ def _emit_results( "Did you mean:" suggestions when no results are returned. Skipped silently for JSON output (callers can request the same data themselves) — text mode is where agents get stuck. + redirected_from: The original (typoed) name the agent supplied, + when results were resolved via the close-match + auto-redirect path. Surfaces in JSON as a + top-level ``redirected_from`` field and in + plain-text as a ``(redirected from: ...)`` + marker preceding the result block so the + substitution is auditable. """ if as_json: - typer.echo(json.dumps(results)) + if redirected_from is not None: + # Wrap the result list with an envelope when a redirect was + # applied so structured callers can detect and (optionally) + # surface the substitution. Non-redirect callers stay on the + # pre-existing bare-list shape — adding the envelope + # unconditionally would be a breaking change for anyone who + # parses the JSON output today. + envelope = {"redirected_from": redirected_from, "results": results} + typer.echo(json.dumps(envelope)) + else: + typer.echo(json.dumps(results)) elif results: + if redirected_from is not None: + marker = f"(redirected from: {redirected_from!r})" + if use_tty_color: + marker = f"\033[33m{marker}\033[0m" + typer.echo(marker) _fmt_plain(results) else: # Empty results path: pick the appropriate headline (project hint @@ -349,20 +446,21 @@ def _emit_results( for candidate in close_matches: typer.echo(f" - {candidate}") - if all_projects: - try: - with _db.open_global() as gconn: - rows_raw = gconn.execute( - "SELECT sg.project_hash, p.root, sg.name, sg.kind, sg.file_rel, sg.line, sg.signature " - "FROM symbols_global sg " - "JOIN projects p ON p.hash = sg.project_hash " - "WHERE sg.name = ? LIMIT ?", - (name, limit), - ).fetchall() - except _db.DBError as exc: - _error(f"global index unavailable: {exc}. Run `token-goat index` first.") - raise typer.Exit(1) from None - results = [ + def _global_query(target: str) -> list[dict]: + """Run the symbols_global query for *target* and shape the rows. + + Pulled out so the auto-redirect path can re-run the same query with + a different name without duplicating the SELECT or the row-shaping. + """ + with _db.open_global() as gconn: + rows_raw_inner = gconn.execute( + "SELECT sg.project_hash, p.root, sg.name, sg.kind, sg.file_rel, sg.line, sg.signature " + "FROM symbols_global sg " + "JOIN projects p ON p.hash = sg.project_hash " + "WHERE sg.name = ? LIMIT ?", + (target, limit), + ).fetchall() + return [ { "project": r["root"], "file": r["file_rel"], @@ -371,51 +469,103 @@ def _emit_results( "name": r["name"], "signature": r["signature"], } - for r in rows_raw + for r in rows_raw_inner ] - # On a global miss, query distinct symbol names across all projects and - # surface up to 5 close matches. This is the most impactful suggestion - # path: agents searching with --all-projects often misspell a symbol - # from a different repo, and without a hint the only fallback is Read. + + if all_projects: + try: + results = _global_query(name) + except _db.DBError as exc: + _error(f"global index unavailable: {exc}. Run `token-goat index` first.") + raise typer.Exit(1) from None + + # On a global miss, query distinct symbol names across all projects. + # The same pool feeds both the close-match suggestions list AND the + # auto-redirect target so the DB is hit exactly once. close: list[str] = [] + redirected: str | None = None if not results: - close = _global_close_symbol_matches(name) - _emit_results(results, close_matches=close) + from difflib import get_close_matches # noqa: PLC0415 + + pool = _global_symbol_pool() + if not strict: + redirect_target = _auto_redirect_target(name, pool) + if redirect_target is not None: + try: + redirect_results = _global_query(redirect_target) + except _db.DBError as exc: + _error(f"global index unavailable: {exc}. Run `token-goat index` first.") + raise typer.Exit(1) from None + if redirect_results: + results = redirect_results + redirected = name + _LOG.info( + "symbol --all-projects: auto-redirected %r -> %r", + name, redirect_target, + ) + if not results: + close = get_close_matches( + name, pool, + n=_SYMBOL_DIDYOUMEAN_LIMIT, cutoff=_SYMBOL_DIDYOUMEAN_CUTOFF, + ) + _emit_results(results, close_matches=close, redirected_from=redirected) return proj = _require_project() - rows_raw = _query_project( - proj.hash, - "SELECT name, kind, file_rel, line, signature FROM symbols WHERE name = ? LIMIT ?", - (name, limit), - ) + def _project_query(target: str) -> list[dict]: + """Run the per-project symbols query for *target*. - results = [ - { - "file": r["file_rel"], - "line": r["line"], - "kind": r["kind"], - "name": r["name"], - "signature": r["signature"], - } - for r in rows_raw - ] + Same role as :func:`_global_query` for the single-project branch. + """ + rows_raw_inner = _query_project( + proj.hash, + "SELECT name, kind, file_rel, line, signature FROM symbols WHERE name = ? LIMIT ?", + (target, limit), + ) + return [ + { + "file": r["file_rel"], + "line": r["line"], + "kind": r["kind"], + "name": r["name"], + "signature": r["signature"], + } + for r in rows_raw_inner + ] + + results = _project_query(name) from . import read_commands # noqa: PLC0415 hint = read_commands._not_indexed_hint(proj.hash) - # When the project is indexed but the name missed, suggest close-match - # symbol names from the same project's symbols table. - # Only pass ``not_found_extra`` when we have a real hint to display — the - # default "No matches for X" line is added by ``_emit_results`` itself, - # and routing it through ``not_found_extra`` would silently suppress the - # close-match suggestions below. - close = [] if results or hint else _project_close_symbol_matches(proj.hash, name) + close = [] + redirected = None + if not results and not hint: + from difflib import get_close_matches # noqa: PLC0415 + + pool = _project_symbol_pool(proj.hash) + if not strict: + redirect_target = _auto_redirect_target(name, pool) + if redirect_target is not None: + redirect_results = _project_query(redirect_target) + if redirect_results: + results = redirect_results + redirected = name + _LOG.info( + "symbol: auto-redirected %r -> %r in project %s", + name, redirect_target, proj.hash[:8], + ) + if not results: + close = get_close_matches( + name, pool, + n=_SYMBOL_DIDYOUMEAN_LIMIT, cutoff=_SYMBOL_DIDYOUMEAN_CUTOFF, + ) _emit_results( results, not_found_extra=hint, close_matches=close, + redirected_from=redirected, ) @@ -1025,6 +1175,122 @@ def cmd_bash_output( typer.echo(sliced) +@app.command("web-output", rich_help_panel="Core") +def cmd_web_output( + output_id: str = typer.Argument(..., help="ID returned by the post-fetch hook or `web-history`."), + head: int = typer.Option(0, "--head", help="Show first N lines (0 = no head limit)"), + tail: int = typer.Option(0, "--tail", help="Show last N lines (0 = no tail limit)"), + grep: str | None = typer.Option(None, "--grep", "-g", help="Show only lines matching the (case-sensitive) substring"), + json_output: bool = typer.Option(False, "--json"), +) -> None: + """Retrieve a sliced view of a cached WebFetch response body. + + The post-WebFetch hook stores each non-trivial text response to disk + under ``data_dir() / "web_outputs"``. Use this command to retrieve + specific parts of that body without forcing the agent to re-fetch the + URL — typically much cheaper in tokens. + + Combine ``--head``, ``--tail``, and ``--grep`` to narrow further; without + any filter the whole cached body is returned. JSON mode includes the + full path, stored byte size, status code, and a 1-based ``numbered_lines`` + list anchored to the original body so an agent can follow up with a + positional slicer. + """ + from . import web_cache # noqa: PLC0415 + + body = web_cache.load_output(output_id) + if body is None: + _error(f"no cached web output for id: {output_id}") + raise typer.Exit(1) + + lines = body.splitlines() + if grep: + lines = [ln for ln in lines if grep in ln] + if head > 0: + lines = lines[: head] + if tail > 0: + lines = lines[-tail :] + sliced = "\n".join(lines) + + if json_output: + meta = web_cache.load_output_meta(output_id) or {} + sidecar = web_cache.read_sidecar(output_id) + original_lines = body.splitlines() + original_index: dict[str, int] = {} + for i, ln in enumerate(original_lines, start=1): + if ln not in original_index: + original_index[ln] = i + numbered: list[dict[str, object]] = [ + {"lineno": original_index.get(ln, 0), "text": ln} + for ln in lines + ] + payload: dict[str, object] = { + "output_id": output_id, + "text": sliced, + "lines": len(lines), + "numbered_lines": numbered, + "total_lines": len(original_lines), + } + payload.update(meta) + if sidecar is not None: + payload["url_preview"] = sidecar.url_preview + payload["status_code"] = sidecar.status_code + payload["truncated"] = sidecar.truncated + typer.echo(json.dumps(payload, ensure_ascii=False, indent=2)) + return + + typer.echo(sliced) + + +@app.command("web-history", rich_help_panel="Core") +def cmd_web_history( + json_output: bool = typer.Option(False, "--json"), + limit: int = typer.Option(20, "--limit", "-n", help="Maximum entries to show (newest first)"), +) -> None: + """List cached WebFetch responses, newest first. + + Each row shows the cache ID, byte size, age, status code (when known), + and a sanitised URL preview. Use the ID with ``token-goat web-output + `` to retrieve the body. + """ + from . import web_cache # noqa: PLC0415 + + entries = web_cache.list_outputs() + if limit > 0: + entries = entries[:limit] + + if json_output: + out: list[dict[str, object]] = [] + for e in entries: + sidecar = web_cache.read_sidecar(str(e["output_id"])) + row = dict(e) + if sidecar is not None: + row["url_preview"] = sidecar.url_preview + row["status_code"] = sidecar.status_code + row["truncated"] = sidecar.truncated + out.append(row) + typer.echo(json.dumps(out, ensure_ascii=False, indent=2)) + return + + if not entries: + typer.echo("(no cached WebFetch responses)") + return + + now = time.time() + for e in entries: + oid = str(e["output_id"]) + size = int(cast(int, e["size_bytes"])) + age = int(now - float(cast(float, e["mtime"]))) + sidecar = web_cache.read_sidecar(oid) + url_str = sidecar.url_preview if sidecar is not None else "(no sidecar)" + status_str = ( + f" status={sidecar.status_code}" + if sidecar is not None and sidecar.status_code is not None + else "" + ) + typer.echo(f"{oid} {size:>10,}B {age:>6}s ago{status_str} {url_str}") + + @app.command("bash-history", rich_help_panel="Core") def cmd_bash_history( json_output: bool = typer.Option(False, "--json"), @@ -1309,6 +1575,15 @@ def post_bash( hooks_cli.safe_run("post-bash", input_file, _parse_harness(harness)) +@hook_app.command(context_settings=_HOOK_CTX) +def post_fetch( + input_file: Path | None = _INPUT_OPT, + harness: str = _HARNESS_OPT, +) -> None: + """Hook: post-fetch event (caches WebFetch text body for dedup + retrieval).""" + hooks_cli.safe_run("post-fetch", input_file, _parse_harness(harness)) + + @hook_app.command(context_settings=_HOOK_CTX) def pre_compact( input_file: Path | None = _INPUT_OPT, diff --git a/src/token_goat/cli_doctor.py b/src/token_goat/cli_doctor.py index 52ea618..d49932b 100644 --- a/src/token_goat/cli_doctor.py +++ b/src/token_goat/cli_doctor.py @@ -3,12 +3,77 @@ import contextlib import sqlite3 +import time from datetime import date from pathlib import Path import typer +def _cache_dir_stats(d: Path) -> tuple[int, int, int | None]: + """Return ``(total_bytes, file_count, oldest_age_seconds_or_None)`` for *d*. + + Walks a single directory level — none of the cache directories the doctor + inspects are nested. ``session_snapshots/`` is the one exception (one + subdir per session); we descend one level for it. Symlinks are skipped + defensively. Raises :class:`OSError` only when the directory itself + cannot be enumerated; per-file errors are silently skipped because the + caller treats unreadable individual entries as zero-sized. + """ + total_bytes = 0 + file_count = 0 + oldest_mtime: float | None = None + now = time.time() + for entry in d.iterdir(): + try: + if entry.is_symlink(): + continue + if entry.is_dir(): + # One-level descent for session_snapshots//... + for child in entry.iterdir(): + if child.is_symlink() or not child.is_file(): + continue + try: + st = child.stat() + except OSError: + continue + total_bytes += st.st_size + file_count += 1 + if oldest_mtime is None or st.st_mtime < oldest_mtime: + oldest_mtime = st.st_mtime + continue + if not entry.is_file(): + continue + try: + st = entry.stat() + except OSError: + continue + total_bytes += st.st_size + file_count += 1 + if oldest_mtime is None or st.st_mtime < oldest_mtime: + oldest_mtime = st.st_mtime + except OSError: + continue + oldest_age = int(now - oldest_mtime) if oldest_mtime is not None else None + return total_bytes, file_count, oldest_age + + +def _humanize_bytes_doctor(n: int) -> str: + """Compact ``B`` / ``KB`` / ``MB`` / ``GB`` formatter for the doctor output. + + Identical shape to :func:`compact._humanize_bytes` but lives here so + cli_doctor stays a leaf importer (doctor must run even when the compaction + machinery is unavailable, e.g. during a partial install). + """ + if n < 1024: + return f"{n}B" + if n < 1024 * 1024: + return f"{n / 1024:.1f}KB" + if n < 1024 * 1024 * 1024: + return f"{n / (1024 * 1024):.1f}MB" + return f"{n / (1024 * 1024 * 1024):.1f}GB" + + def doctor( # noqa: C901 fix: bool = typer.Option( # noqa: B008 False, "--fix", help="Clear stale index-spawn markers that doctor flags." @@ -23,7 +88,6 @@ def doctor( # noqa: C901 import importlib import subprocess import sys - import time import psutil @@ -383,7 +447,40 @@ def _wal_supported() -> bool: ok("(none)", "no log for today") # ------------------------------------------------------------------ - # 13. Stats summary + # 13. New-cache stores (bash outputs, web outputs, session snapshots) + # ------------------------------------------------------------------ + # Surfaces the disk-store stats added by the bash-output / WebFetch / + # diff-aware-re-read features so a long-lived install can be inspected + # for runaway growth without grep-ing the data directory by hand. + typer.echo("\nCaches") + for label, dir_name, cap_bytes in ( + ("bash outputs", "bash_outputs", 16 * 1024 * 1024), + ("web outputs", "web_outputs", 32 * 1024 * 1024), + ("session snapshots", "session_snapshots", None), + ): + d = paths.data_dir() / dir_name + if not d.exists(): + ok(label, "(not yet created)") + continue + try: + total_bytes, file_count, oldest_age = _cache_dir_stats(d) + except OSError as e: + flag(label, f"unreadable — {e}", warn=True) + continue + if file_count == 0: + ok(label, "0 files (empty)") + continue + age_str = f", oldest {oldest_age // 3600}h ago" if oldest_age is not None else "" + size_str = _humanize_bytes_doctor(total_bytes) + if cap_bytes is not None and total_bytes > int(cap_bytes * 1.1): + # 10% over the cap is the eviction's grace window; beyond that + # the periodic sweep should have caught up by now. + flag(label, f"{file_count} files, {size_str}{age_str} (over cap)", warn=True) + else: + ok(label, f"{file_count} files, {size_str}{age_str}") + + # ------------------------------------------------------------------ + # 14. Stats summary # ------------------------------------------------------------------ typer.echo("\nStats") try: diff --git a/src/token_goat/hints.py b/src/token_goat/hints.py index 82e2bc5..c7e001e 100644 --- a/src/token_goat/hints.py +++ b/src/token_goat/hints.py @@ -17,7 +17,9 @@ "ReadHint", "build_bash_dedup_hint", "build_diff_hint", + "build_grep_dedup_hint", "build_read_hint", + "build_web_dedup_hint", ] _LOG = logging.getLogger("token_goat.hints") @@ -784,3 +786,203 @@ def _build_bash_dedup_hint_inner( tokens_avoided, ) + +# --------------------------------------------------------------------------- +# Grep dedup hint +# --------------------------------------------------------------------------- + +# Minimum result_count before a Grep re-run is worth deduplicating. A pattern +# that matched 5 lines twice is fine — the response cost is trivial in either +# direction. Above this threshold the dedup hint pays for itself by avoiding +# the embedded result body in the second response. +_GREP_DEDUP_MIN_RESULT_COUNT: int = 50 + +# Rough bytes-per-Grep-result estimate. A real grep result line is one line of +# match + path + line-number context, typically 80-160 bytes. 120 is a +# reasonable mid-point used solely for the tokens-avoided estimate that the +# hint quotes back to the agent. +_GREP_AVG_BYTES_PER_RESULT: int = 120 + + +def build_grep_dedup_hint( + *, + session_id: str, + pattern: str, + path: str | None, + cache: session.SessionCache | None = None, +) -> ReadHint | None: + """Return a hint when the same Grep pattern was just run in this session. + + Mirrors :func:`build_bash_dedup_hint` for the Grep tool surface: a repeat + invocation with the same ``(pattern, path)`` pair within + :data:`STALE_READ_AGE_SECONDS` produces a "this just ran, reuse the + prior response" advisory. The hint quotes the previous result count so + the agent knows whether the re-run is materially different from the + prior one. + + Returns ``None`` (no hint) when: + + * no session_id is provided + * no prior Grep with the same pattern has been recorded + * the previous result was too small to be worth deduplicating + (:data:`_GREP_DEDUP_MIN_RESULT_COUNT` matches) + * the previous run is older than :data:`STALE_READ_AGE_SECONDS` + + Never raises; any unexpected exception is caught and the hint is + suppressed (the pre-Grep path must stay fail-soft). + """ + try: + return _build_grep_dedup_hint_inner( + session_id=session_id, pattern=pattern, path=path, cache=cache, + ) + except Exception as exc: # noqa: BLE001 — fail-soft for the hot pre-read path + _LOG.warning( + "build_grep_dedup_hint: unexpected error (session=%s): %s", + (session_id or "")[:16], exc, exc_info=True, + ) + return None + + +def _build_grep_dedup_hint_inner( + *, + session_id: str, + pattern: str, + path: str | None, + cache: session.SessionCache | None, +) -> ReadHint | None: + """Inner implementation of :func:`build_grep_dedup_hint`; may raise. + + Walks the session ``greps`` list in reverse-chronological order looking + for a prior entry with the same ``(pattern, path)`` pair. The list is + typically short (well under 100 entries even in long sessions); a linear + scan in reverse is cheap and avoids the cost of indexing by pattern up + front, which would not pay back for the common case of distinct patterns. + """ + if not session_id or not pattern: + return None + if cache is None: + cache = session.load(session_id) + if cache.unavailable or not cache.greps: + return None + + now = time.time() + for entry in reversed(cache.greps): + if entry.pattern != pattern: + continue + if entry.path != path: + continue + age = now - entry.ts + if age > STALE_READ_AGE_SECONDS: + # Older entries are even older — short-circuit the scan. + return None + if entry.result_count is None or entry.result_count < _GREP_DEDUP_MIN_RESULT_COUNT: + return None + # Estimate the bytes that would land in context if the agent re-runs. + bytes_avoided = entry.result_count * _GREP_AVG_BYTES_PER_RESULT + tokens_avoided = _est_tokens_from_chars(bytes_avoided) + pattern_short = _sanitize_hint_path(pattern) + path_str = f" in `{_sanitize_hint_path(path)}`" if path else "" + return ReadHint( + f"Note: Grep for `{pattern_short}`{path_str} ran ~{int(age)}s ago " + f"in this session and matched {entry.result_count} line(s). " + f"Re-running adds ~{tokens_avoided} tokens. " + f"If the prior result is still in your context, reuse it; " + f"otherwise narrow the pattern or add `path=` to scope it.", + tokens_avoided, + ) + return None + + +# --------------------------------------------------------------------------- +# WebFetch dedup hint +# --------------------------------------------------------------------------- + +# Minimum response body size (bytes) before a WebFetch re-run is worth +# deduplicating. Pages under this threshold are cheap to re-fetch and the +# hint preamble would approach the saving. 1 KB matches the typical +# "interesting" threshold for HTML/JSON responses. +_WEB_DEDUP_MIN_BYTES: int = 1024 + + +def build_web_dedup_hint( + *, + session_id: str, + url: str, + cache: session.SessionCache | None = None, +) -> ReadHint | None: + """Return a hint when *url* was fetched earlier in this session. + + The pre-WebFetch hook calls this before fetching. When the same URL has + been fetched before and its body cached on disk, we suggest the agent + retrieve the cached body via ``token-goat web-output`` instead of + re-fetching — avoiding the network round-trip and the duplicated bytes + in the conversation. + + Returns ``None`` (no hint) when: + + * no session_id or url is provided + * the URL has never been recorded + * the previous body was too small to be worth deduplicating + * the previous fetch is older than :data:`STALE_READ_AGE_SECONDS` + (above that window the page content is likely to have changed and a + re-fetch is legitimate) + """ + try: + return _build_web_dedup_hint_inner( + session_id=session_id, url=url, cache=cache, + ) + except Exception as exc: # noqa: BLE001 — fail-soft for the hot pre-fetch path + _LOG.warning( + "build_web_dedup_hint: unexpected error (session=%s): %s", + (session_id or "")[:16], exc, exc_info=True, + ) + return None + + +def _build_web_dedup_hint_inner( + *, + session_id: str, + url: str, + cache: session.SessionCache | None, +) -> ReadHint | None: + """Inner implementation; may raise. + + Imported lazily so the hot path does not pay the web_cache import cost + on every WebFetch invocation — web_cache is only needed when we are + actually about to dispatch a dedup. + """ + if not session_id or not url: + return None + + from . import web_cache # noqa: PLC0415 + + url_sha = web_cache.url_hash(url) + entry = session.lookup_web_entry(session_id, url_sha, cache=cache) + if entry is None: + return None + + age = time.time() - entry.ts + if age > STALE_READ_AGE_SECONDS: + _LOG.debug( + "build_web_dedup_hint: prior fetch stale (age=%.0fs > %ds); suppressing", + age, STALE_READ_AGE_SECONDS, + ) + return None + if entry.body_bytes < _WEB_DEDUP_MIN_BYTES: + return None + + tokens_avoided = _est_tokens_from_chars(entry.body_bytes) + url_short = _sanitize_hint_path(url) + status_str = ( + f", status={entry.status_code}" if entry.status_code is not None else "" + ) + return ReadHint( + f"Note: this URL was fetched ~{int(age)}s ago in this session " + f"({entry.body_bytes:,} bytes of body{status_str}). " + f"Re-fetching adds ~{tokens_avoided} tokens. " + f"`token-goat web-output {entry.output_id}` returns the cached body — " + f"add `--head 50`, `--tail 50`, or `--grep PATTERN` to slice it. " + f"URL: `{url_short}`.", + tokens_avoided, + ) + diff --git a/src/token_goat/hooks_cli.py b/src/token_goat/hooks_cli.py index 96987d4..6f7f4f3 100644 --- a/src/token_goat/hooks_cli.py +++ b/src/token_goat/hooks_cli.py @@ -323,6 +323,7 @@ def wrapper(payload: HookPayload) -> HookResponse: "post-edit": ("hooks_edit", "post_edit"), "post-read": ("hooks_read", "post_read"), "post-bash": ("hooks_read", "post_bash"), + "post-fetch": ("hooks_fetch", "post_fetch"), } _HANDLER_CACHE: dict[str, Callable[[HookPayload], HookResponse]] = {} @@ -361,6 +362,7 @@ def __getattr__(name: str) -> object: "post_edit": "post-edit", "post_read": "post-read", "post_bash": "post-bash", + "post_fetch": "post-fetch", } if name in event_map: handler = _resolve_handler(event_map[name]) @@ -449,6 +451,7 @@ def _proxy(payload: HookPayload) -> HookResponse: "post-edit": _make_lazy_proxy("post-edit"), "post-read": _make_lazy_proxy("post-read"), "post-bash": _make_lazy_proxy("post-bash"), + "post-fetch": _make_lazy_proxy("post-fetch"), "pre-compact": pre_compact, } diff --git a/src/token_goat/hooks_fetch.py b/src/token_goat/hooks_fetch.py index 1309127..2fe92da 100644 --- a/src/token_goat/hooks_fetch.py +++ b/src/token_goat/hooks_fetch.py @@ -1,20 +1,37 @@ -"""Pre-fetch hook: intercept Drive and WebFetch image downloads before they reach the model. +"""Pre/post-fetch hook handlers: image redirect + WebFetch text dedup cache. -Image URLs and Drive file downloads arrive through WebFetch/Drive MCP tools, not the Read -tool, so the pre-read hook never fires for them. This module catches those tool calls, -denies the direct download, and redirects the model to ``token-goat gdrive-fetch`` or -``token-goat webfetch`` so the shrink+cache pipeline applies before bytes hit context. +Three responsibilities run from this module: + +1. **Drive image / WebFetch image redirect** (existing): downloads to image + URLs are routed through ``token-goat fetch-image`` so the shrink+cache + pipeline applies before bytes hit context. + +2. **WebFetch text dedup hint** (new): when a non-image URL is fetched a + second time in the same session, the pre-fetch hook suggests the agent + retrieve the cached body via ``token-goat web-output`` instead of + re-fetching. Mirrors the bash-dedup hint pattern. + +3. **WebFetch text capture** (new): the post-fetch hook persists the + response body to ``data_dir() / "web_outputs"`` and records the + ``(url_sha → output_id)`` mapping in the session cache so step 2 has + something to point at. """ from __future__ import annotations -__all__ = ["pre_fetch"] +__all__ = ["post_fetch", "pre_fetch"] from .hooks_common import ( CONTINUE, HookPayload, HookResponse, deny_redirect, + get_session_context, get_tool_input, + pre_tool_use_with_context, + sanitize_log_str, +) +from .hooks_common import ( + LOG as _LOG, ) # Maximum URL length accepted for embedding in hook messages. URLs longer than @@ -119,8 +136,52 @@ def _intercept_webfetch_image(url: str) -> HookResponse: ) +def _handle_web_dedup(payload: HookPayload, url: str) -> HookResponse | None: + """Return a dedup hint when *url* was just fetched in this session. + + Mirrors :func:`hooks_read._handle_bash_dedup` for the WebFetch surface. + Returns ``None`` to let the hook continue to its existing image-redirect + path or pass through unchanged. + """ + from . import db, session # noqa: PLC0415 + from .hints import CHARS_PER_TOKEN, build_web_dedup_hint # noqa: PLC0415 + + session_id, _cwd = get_session_context(payload) + if not session_id: + return None + + try: + cache = session.load(session_id) + except (OSError, ValueError): + return None + + hint = build_web_dedup_hint( + session_id=session_id, url=url, cache=cache, + ) + if hint is None: + return None + + realized_tokens = hint.tokens_saved + injection_bytes = len(hint) + injection_cost_tokens = max(1, int(injection_bytes / CHARS_PER_TOKEN)) + db.record_stat( + None, "web_dedup_hint", + bytes_saved=realized_tokens * 4, tokens_saved=realized_tokens, + detail=sanitize_log_str(url, max_len=200), + ) + db.record_stat( + None, "web_dedup_hint_overhead", + bytes_saved=-injection_bytes, tokens_saved=-injection_cost_tokens, + detail=sanitize_log_str(url, max_len=200), + ) + _LOG.info( + "pre-fetch: web-dedup hint injected (tokens_saved=%d)", realized_tokens, + ) + return pre_tool_use_with_context(str(hint)) + + def pre_fetch(payload: HookPayload) -> HookResponse: - """Deny Drive/WebFetch image tools and redirect to token-goat shims.""" + """Deny Drive/WebFetch image tools and dedup repeat text WebFetch calls.""" tool_name = payload.get("tool_name", "") drive_tools = ( @@ -160,14 +221,170 @@ def pre_fetch(payload: HookPayload) -> HookResponse: if tool_name == "WebFetch": tool_input = get_tool_input(payload) url = tool_input.get("url") - if not url: + if not url or not isinstance(url, str): return CONTINUE() from . import webfetch # noqa: PLC0415 - if not webfetch.is_image_url(url): - return CONTINUE() + if webfetch.is_image_url(url): + return _intercept_webfetch_image(url) + + # Non-image WebFetch: try dedup first. When the same URL was fetched + # earlier in this session, emit a hint pointing at the cached body + # instead of letting the request go through. + dedup = _handle_web_dedup(payload, url) + if dedup is not None: + return dedup + return CONTINUE() + + return CONTINUE() + + +# --------------------------------------------------------------------------- +# post_fetch — capture WebFetch text responses to the on-disk cache +# --------------------------------------------------------------------------- + +# Smallest WebFetch body worth caching. Mirrors the dedup-hint floor: below +# this size the dedup hint would not fire anyway, and the disk+JSON churn +# outweighs the saving. +_WEB_CACHE_MIN_BYTES: int = 1024 + - return _intercept_webfetch_image(url) +def _extract_web_response(payload: HookPayload) -> tuple[str, int | None]: + """Pull (body, status_code) from a PostToolUse WebFetch payload. + Defensive about payload-shape drift between harness versions. The text + body is read at multiple plausible keys (``output``, ``text``, ``body``, + ``content``) and falls back to a bare string when ``tool_response`` is + itself a string. Status code is read at ``status``, ``status_code``, + or ``code`` and coerced via int — string-typed codes are accepted to + handle harnesses that surface them as ``"200"``. + """ + raw_resp: object = payload.get("tool_response") if isinstance(payload, dict) else None + if raw_resp is None and isinstance(payload, dict): + raw_resp = payload.get("tool_result") or payload.get("response") + + body = "" + status_val: object = None + + if isinstance(raw_resp, str): + body = raw_resp + elif isinstance(raw_resp, dict): + body_raw = ( + raw_resp.get("output") + or raw_resp.get("text") + or raw_resp.get("body") + or raw_resp.get("content") + or raw_resp.get("response") + ) + if isinstance(body_raw, str): + body = body_raw + elif isinstance(body_raw, list): + # MCP CallToolResult content array — concatenate text items. + parts: list[str] = [] + for item in body_raw: + if isinstance(item, dict) and isinstance(item.get("text"), str): + parts.append(item["text"]) + elif isinstance(item, str): + parts.append(item) + body = "".join(parts) + else: + body = str(body_raw) if body_raw is not None else "" + status_val = ( + raw_resp.get("status_code") + if "status_code" in raw_resp + else raw_resp.get("status") + if "status" in raw_resp + else raw_resp.get("code") + ) + + status_code: int | None = None + if isinstance(status_val, int) and not isinstance(status_val, bool): + status_code = status_val + elif isinstance(status_val, str): + try: + status_code = int(status_val) + except (TypeError, ValueError): + status_code = None + + return body, status_code + + +def post_fetch(payload: HookPayload) -> HookResponse: + """Post-WebFetch hook: persist large text responses to disk + session history. + + Skips images entirely — those are already handled by the existing + image-cache pipeline. For non-image responses above the cache threshold, + writes the body to ``data_dir() / "web_outputs"`` and records the + ``(url_sha, output_id)`` mapping in the session so a follow-up + ``pre_fetch`` for the same URL can dedupe. + + Always returns CONTINUE — this hook never modifies the tool result. + Failures at any step are logged and swallowed. + """ + tool_name = payload.get("tool_name", "") + if tool_name != "WebFetch": + return CONTINUE() + + session_id, _cwd = get_session_context(payload) + if not session_id: + _LOG.debug("post-fetch: no session_id; output not cached") + return CONTINUE() + + tool_input = get_tool_input(payload) + url = tool_input.get("url") + if not isinstance(url, str) or not url: + return CONTINUE() + + from . import webfetch # noqa: PLC0415 + + if webfetch.is_image_url(url): + # Image responses go through the existing image cache pipeline; we + # don't double-cache them here. + return CONTINUE() + + body, status_code = _extract_web_response(payload) + body_size = len(body.encode("utf-8", errors="replace")) + if body_size < _WEB_CACHE_MIN_BYTES: + _LOG.debug( + "post-fetch: body too small to cache (%d bytes < %d threshold)", + body_size, _WEB_CACHE_MIN_BYTES, + ) + return CONTINUE() + + from . import db, session, web_cache # noqa: PLC0415 + + meta = web_cache.store_output(session_id, url, body, status_code) + if meta is None: + return CONTINUE() + web_cache.write_sidecar(meta) + + try: + session.mark_web_fetch( + session_id=session_id, + url_sha=meta.url_sha, + url_preview=url, + output_id=meta.output_id, + body_bytes=meta.body_bytes, + status_code=meta.status_code, + truncated=meta.truncated, + ) + except (ValueError, OSError) as exc: + _LOG.debug("post-fetch: session record failed: %s", exc) + + # Informational stat row — no saving claimed at capture time; the saving + # is realized when (and if) the agent later avoids a re-fetch. + try: + db.record_stat( + None, "web_output_cached", + bytes_saved=0, tokens_saved=0, + detail=sanitize_log_str(url, max_len=200), + ) + except Exception: # noqa: BLE001 + _LOG.debug("post-fetch: stat record failed", exc_info=True) + + _LOG.info( + "post-fetch: cached body id=%s bytes=%d status=%s truncated=%s", + meta.output_id, body_size, status_code, meta.truncated, + ) return CONTINUE() diff --git a/src/token_goat/hooks_read.py b/src/token_goat/hooks_read.py index 84b7024..9d993f8 100644 --- a/src/token_goat/hooks_read.py +++ b/src/token_goat/hooks_read.py @@ -281,6 +281,59 @@ def _try_diff_hint( return pre_tool_use_with_context(str(hint)) +def _handle_grep_dedup(payload: HookPayload) -> HookResponse | None: + """Return a dedup hint when the same Grep pattern just ran in this session. + + Mirrors :func:`_handle_bash_dedup` for the Grep tool surface. Returns + ``None`` to let the hook fall through to ``CONTINUE`` when no dedup + hit is available — we never deny a Grep call, only suggest the agent + reuse the prior result. + """ + from . import db, session # noqa: PLC0415 + from .hints import CHARS_PER_TOKEN, build_grep_dedup_hint # noqa: PLC0415 + + session_id, _cwd = get_session_context(payload) + if not session_id: + return None + + tool_input = get_tool_input(payload) + pattern = tool_input.get("pattern") + if not isinstance(pattern, str) or not pattern: + return None + path = tool_input.get("path") + if path is not None and not isinstance(path, str): + path = None + + try: + cache = session.load(session_id) + except (OSError, ValueError): + return None + + hint = build_grep_dedup_hint( + session_id=session_id, pattern=pattern, path=path, cache=cache, + ) + if hint is None: + return None + + realized_tokens = hint.tokens_saved + injection_bytes = len(hint) + injection_cost_tokens = max(1, int(injection_bytes / CHARS_PER_TOKEN)) + db.record_stat( + None, "grep_dedup_hint", + bytes_saved=realized_tokens * 4, tokens_saved=realized_tokens, + detail=sanitize_log_str(pattern, max_len=200), + ) + db.record_stat( + None, "grep_dedup_hint_overhead", + bytes_saved=-injection_bytes, tokens_saved=-injection_cost_tokens, + detail=sanitize_log_str(pattern, max_len=200), + ) + _LOG.info( + "pre-read: grep-dedup hint injected (tokens_saved=%d)", realized_tokens, + ) + return pre_tool_use_with_context(str(hint)) + + def _handle_bash_dedup(payload: HookPayload) -> HookResponse | None: """Return a dedup hint when this exact Bash command ran earlier in the session. @@ -368,6 +421,12 @@ def pre_read(payload: HookPayload) -> HookResponse: return pre_read(read_payload) return CONTINUE() + if tool_name == "Grep": + dedup = _handle_grep_dedup(payload) + if dedup is not None: + return dedup + return CONTINUE() + if tool_name != "Read": _LOG.debug("pre-read: skipping non-Read tool %s", sanitize_opt(tool_name)) return CONTINUE() diff --git a/src/token_goat/hooks_session.py b/src/token_goat/hooks_session.py index 0e7bb9c..7eab320 100644 --- a/src/token_goat/hooks_session.py +++ b/src/token_goat/hooks_session.py @@ -1,22 +1,35 @@ -"""Session lifecycle hook handlers: session-start and project auto-detection. +"""Session lifecycle hook handlers: session-start and post-compaction recovery. ``session_start`` fires on every new Claude Code session (SessionStart event). -It performs three ordered actions: +It performs four ordered actions: -1. **Cache reset** — clears the per-session JSON cache for this session ID so - stale line-range data from a previous run does not trigger false re-read hints. +1. **Source detection** — reads the ``source`` field from the payload to + distinguish ``"startup"`` / ``"resume"`` / ``"clear"`` / ``"compact"``. + When the source is ``"compact"`` the cache is intentionally **preserved** + and a recovery hint is built from it; otherwise the cache is reset. -2. **Project detection + auto-indexing** — resolves ``cwd`` from the harness +2. **Cache reset (non-compact only)** — clears the per-session JSON cache + for this session ID so stale line-range data from a previous run does + not trigger false re-read hints. + +3. **Project detection + auto-indexing** — resolves ``cwd`` from the harness payload to a project root. If the project has never been indexed, a detached background ``token-goat index`` subprocess is spawned so the first Read of the session already has symbols available. ``db.touch_project_last_seen`` is also called so the worker's periodic-reindex prioritises recently used projects. -3. **Worker watchdog** — calls ``worker.ensure_running()`` to start (or confirm) +4. **Worker watchdog** — calls ``worker.ensure_running()`` to start (or confirm) the background daemon. The worker handles dirty-queue draining, LRU image eviction, log rotation, and stale-lock cleanup; it must be alive before any post-edit hooks fire. +When the recovery path runs, the hook returns ``additionalContext`` carrying +a compact summary of the session state immediately before compaction: +recently-edited files, top symbols accessed, the most recent cached Bash +outputs (with their ``token-goat bash-output `` retrieval keys), and the +most recent cached WebFetch responses. This lets the agent recover the +context it just lost to compaction without re-reading every file from scratch. + ``cwd`` validation is intentional: the field comes from an untrusted harness payload, so empty, non-directory, and excessively long values are rejected before being passed to ``find_project``. @@ -47,7 +60,12 @@ def _reset_session_cache(session_id: str | None) -> None: - """Reset session cache for /clear, /compact, fresh-start events.""" + """Reset session cache for /clear and fresh-start events. + + Intentionally NOT called for ``source == "compact"`` — we want the + pre-compaction state to survive into the new context window so the + recovery hint has something to point at. + """ if not session_id: return from . import session # noqa: PLC0415 @@ -55,6 +73,145 @@ def _reset_session_cache(session_id: str | None) -> None: session.reset_session(session_id) +# Maximum number of files / bash entries / web entries surfaced in the +# recovery hint. Each line costs ~25-40 tokens; keeping the per-section cap +# small keeps the whole hint comfortably under 400 tokens even when the +# pre-compaction session was dense. +_RECOVERY_MAX_FILES: int = 6 +_RECOVERY_MAX_BASH: int = 4 +_RECOVERY_MAX_WEB: int = 4 +# Minimum byte size before a cached output is worth listing in the recovery +# hint. Below this the dedup hint would not have fired anyway, and the line +# the recovery hint costs in the budget would not be repaid. +_RECOVERY_MIN_BYTES: int = 400 + + +def _build_recovery_hint(session_id: str) -> str | None: + """Return a compact recovery hint summarising pre-compaction state. + + Loaded *after* the SessionStart hook detects ``source == "compact"`` but + *before* the cache reset (so the hint has data to draw from). Returns + ``None`` when there is nothing worth surfacing — an empty session prior + to compact, or a load failure — so the caller can fall through to a + plain ``CONTINUE`` response. + + The hint is structured Markdown matching the compaction-manifest shape + so a developer can mentally map between the two outputs: it is the + counterpart that fires *after* the compaction LLM has processed the + manifest. + """ + try: + from . import session as session_mod # noqa: PLC0415 + + cache = session_mod.load(session_id) + except (OSError, ValueError) as exc: + _LOG.debug("recovery hint: failed to load session %s: %s", session_id[:16], exc) + return None + if cache.unavailable: + return None + + sections: list[str] = [] + + # 1. Recently-touched files — the agent will likely want these back. + # Rank by last_read_ts so the *most recent* reads (which still match the + # agent's mental model best) appear first. + if cache.files: + from operator import attrgetter # noqa: PLC0415 + + by_recency = attrgetter("last_read_ts") + files_sorted = sorted(cache.files.values(), key=by_recency, reverse=True) + files_keep = files_sorted[:_RECOVERY_MAX_FILES] + if files_keep: + lines = ["**Recently-read files** (cached snapshot for diff retrieval):"] + for entry in files_keep: + sym_str = f" syms={','.join(entry.symbols_read[:3])}" if entry.symbols_read else "" + lines.append(f"- {entry.rel_or_abs}{sym_str}") + sections.append("\n".join(lines)) + + # 2. Recent Bash output IDs — the most likely "I had this in context" + # data. Sort by ts descending and keep only entries above the size floor. + if cache.bash_history: + bash_entries = sorted( + (be for be in cache.bash_history.values() + if (be.stdout_bytes + be.stderr_bytes) >= _RECOVERY_MIN_BYTES), + key=lambda be: be.ts, reverse=True, + )[:_RECOVERY_MAX_BASH] + if bash_entries: + lines = ["**Recent Bash outputs** (use `token-goat bash-output ` to recall):"] + for be in bash_entries: + exit_str = "" if be.exit_code is None else f" exit={be.exit_code}" + total = be.stdout_bytes + be.stderr_bytes + lines.append( + f"- `{be.cmd_preview}` ({total:,}B{exit_str}) — id=`{be.output_id}`" + ) + sections.append("\n".join(lines)) + + # 3. Recent WebFetch outputs — same idea for network results. + if cache.web_history: + web_entries = sorted( + (we for we in cache.web_history.values() if we.body_bytes >= _RECOVERY_MIN_BYTES), + key=lambda we: we.ts, reverse=True, + )[:_RECOVERY_MAX_WEB] + if web_entries: + lines = ["**Recent WebFetch responses** (use `token-goat web-output ` to recall):"] + for we in web_entries: + status_str = "" if we.status_code is None else f" status={we.status_code}" + lines.append( + f"- `{we.url_preview}` ({we.body_bytes:,}B{status_str}) — id=`{we.output_id}`" + ) + sections.append("\n".join(lines)) + + if not sections: + return None + + header = ( + "## Token-Goat Post-Compact Recovery\n" + "Compaction just ran. The following resources were active in the prior " + "context window and remain cached — pull them on demand instead of " + "re-reading from scratch." + ) + return "\n\n".join([header, *sections]) + + +def _try_recovery_response(session_id: str | None, source: str) -> HookResponse | None: + """Build a recovery-hint response when *source* is "compact" and state exists. + + Returns ``None`` when the recovery path does not apply — caller should + fall through to the normal session-start flow. This isolates the + source-string check from the hint builder so each is independently + testable. + """ + if source != "compact" or not session_id: + return None + hint = _build_recovery_hint(session_id) + if not hint: + return None + + # Record an observability stat row so the recovery path shows up in + # ``token-goat stats`` if anyone is monitoring whether the feature fires. + # No saving claimed: the actual saving is realised only when the agent + # uses the cached IDs from the hint, and those usages are accounted + # under their own kinds (bash_dedup_hint, web_dedup_hint). + try: + from . import db # noqa: PLC0415 + + db.record_stat(None, "compact_recovery", bytes_saved=0, tokens_saved=0, detail=session_id[:32]) + except Exception: # noqa: BLE001 + _LOG.debug("recovery hint: stat record failed", exc_info=True) + + _LOG.info( + "session-start: compact-recovery hint emitted for session=%s (%d chars)", + session_id[:16], len(hint), + ) + return { + "continue": True, + "hookSpecificOutput": { + "hookEventName": "SessionStart", + "additionalContext": hint, + }, + } + + def _detect(payload: HookPayload) -> Project | None: """Detect the current project from cwd. Returns None if not in a project root. @@ -106,22 +263,59 @@ def _ensure_worker_running() -> None: _LOG.exception("watchdog failed") +def _read_source(payload: HookPayload) -> str: + """Return the SessionStart ``source`` field, defaulting to ``"startup"``. + + Claude Code emits one of ``"startup"`` / ``"resume"`` / ``"clear"`` / + ``"compact"`` in this field. Older harness versions or non-Claude + callers may omit it; we treat absence as ``"startup"`` so cache-reset + behaviour stays correct for the common case. + """ + raw = payload.get("source") + if isinstance(raw, str): + return raw + return "startup" + + def session_start(payload: HookPayload) -> HookResponse: - """Reset session cache and ensure worker daemon is running.""" - session_id, cwd = get_session_context(payload) - _LOG.info("session-start: session_id=%s cwd=%s", sanitize_opt(session_id), sanitize_opt(cwd)) + """Run the appropriate session-lifecycle action for the inbound source. - _reset_session_cache(session_id) + * ``source == "compact"``: PRESERVE the cache and emit a recovery hint + so the agent's new context window has pointers back to the cached + resources it just lost. + * Any other source (startup / resume / clear / unknown): RESET the + cache so stale line-range data does not trigger false hints in the + fresh run. + + Worker startup and auto-indexing happen in both branches. Returning + early in the compact path keeps the recovery hint's ``hookSpecificOutput`` + shape clean (no risk of clobbering it with a later return). + """ + session_id, cwd = get_session_context(payload) + source = _read_source(payload) + _LOG.info( + "session-start: session_id=%s cwd=%s source=%s", + sanitize_opt(session_id), sanitize_opt(cwd), sanitize_opt(source), + ) + recovery = _try_recovery_response(session_id, source) + # Project detection and worker watchdog must run in both branches — + # ``source == "compact"`` doesn't change the fact that the worker may + # have died, or that the project root may need its last-seen bumped. proj = _detect(payload) if proj: _LOG.info("session-start: detected project %s (%s)", proj.root, proj.hash[:8]) - # Mark user activity so the worker's periodic-reindex window stays - # anchored to projects actually in use. from . import db # noqa: PLC0415 db.touch_project_last_seen(proj.hash) _auto_index_if_needed(proj) - _ensure_worker_running() + + if recovery is not None: + return recovery + + # Non-compact branch: cache reset happens here, AFTER recovery has had + # a chance to fire (so a misdetection of source can't both reset the + # cache and lose the recovery data). + _reset_session_cache(session_id) return CONTINUE() diff --git a/src/token_goat/install.py b/src/token_goat/install.py index ae8d7f4..dc669dd 100644 --- a/src/token_goat/install.py +++ b/src/token_goat/install.py @@ -780,7 +780,7 @@ def _hooks_block(binary: str | None = None) -> dict[str, list[_HookMatcherEntry] ], "PreToolUse": [ { - "matcher": "Read", + "matcher": "Read|Grep|Bash", "hooks": [ { "type": "command", @@ -831,6 +831,16 @@ def _hooks_block(binary: str | None = None) -> dict[str, list[_HookMatcherEntry] } ], }, + { + "matcher": "WebFetch", + "hooks": [ + { + "type": "command", + "command": runner("hook", "post-fetch"), + "timeout": 3000, + } + ], + }, ], "PreCompact": [ { @@ -1076,10 +1086,11 @@ def _unpatch_md_block(md_path: Path, begin_marker: str, end_marker: str, not_fou | Find code by meaning, not name | `token-goat semantic "rate limit retry"` | Several rounds of `Grep` | | Get oriented in an unfamiliar repo | `token-goat map --compact` | Recursive `ls` plus multiple `Read` calls | | Outline a long Google Doc | `token-goat gdrive-sections ` | Fetching the whole doc | -| Read one TOML/YAML/JSON config block | `token-goat section "pyproject.toml::tool.ruff"` | `Read pyproject.toml` | +| Read one TOML/YAML/JSON/INI/.env/Dockerfile block | `token-goat section "pyproject.toml::tool.ruff"` | `Read pyproject.toml` | | Re-inspect a recent Bash output | `token-goat bash-output --tail 50` | Re-running the same `pytest`/`cargo`/`git log` | +| Re-inspect a recent WebFetch response | `token-goat web-output --grep "TODO"` | Re-fetching the same docs URL | -Modifiers worth knowing: `symbol --all-projects` (cross-repo); `map --compact` (300-token budget); `semantic --max-distance 1.0` or `--no-rerank` to widen / tighten results; `bash-output --grep PATTERN` to filter cached output. A miss prints "Did you mean…?" suggestions — try one before falling back to `Read`. The pre-Bash hook will hint when a command is about to repeat in the same session. +Modifiers worth knowing: `symbol --all-projects` (cross-repo); `symbol --strict` to opt out of close-match auto-redirect; `map --compact` (300-token budget); `semantic --max-distance 1.0` or `--no-rerank` to widen / tighten results; `bash-output --grep PATTERN` / `web-output --grep PATTERN` to filter cached output. A miss without an unambiguous close match prints "Did you mean…?" suggestions; a unique close match at high confidence is followed transparently with a `(redirected from: ...)` marker. The pre-Bash, pre-Grep, and pre-WebFetch hooks hint when a tool call is about to repeat in the same session. Read is the right call when: - The file is under about 200 lines and you need the whole thing. @@ -1142,11 +1153,12 @@ def unpatch_claude_md() -> str: | Find code by meaning, not name | `token-goat semantic "rate limit retry"` | Several rounds of `Grep` | | Get oriented in an unfamiliar repo | `token-goat map --compact` | Recursive `ls` plus multiple `Read` calls | | Outline a long Google Doc | `token-goat gdrive-sections ` | Fetching the whole doc | -| Read one TOML/YAML/JSON config block | `token-goat section "pyproject.toml::tool.ruff"` | `Read pyproject.toml` | +| Read one TOML/YAML/JSON/INI/.env/Dockerfile block | `token-goat section "pyproject.toml::tool.ruff"` | `Read pyproject.toml` | | Re-inspect a recent Bash output | `token-goat bash-output --tail 50` | Re-running `pytest`/`cargo`/`git log` | +| Re-inspect a recent WebFetch response | `token-goat web-output --grep "TODO"` | Re-fetching the same docs URL | | See what you have already touched | `token-goat session-touched` | Re-reading and hoping you remember | -Modifiers worth knowing: `symbol --all-projects` searches every indexed repo at once; `map --compact` fits a 300-token budget; `semantic --max-distance 1.0` widens or `--no-rerank` tightens semantic results; `bash-output --grep PATTERN` filters cached output. A miss prints "Did you mean…?" suggestions — try one of those before falling back to `Read`. +Modifiers worth knowing: `symbol --all-projects` searches every indexed repo at once; `symbol --strict` disables close-match auto-redirect; `map --compact` fits a 300-token budget; `semantic --max-distance 1.0` widens or `--no-rerank` tightens semantic results; `bash-output --grep PATTERN` / `web-output --grep PATTERN` filter cached output. A miss prints "Did you mean…?" suggestions — try one of those before falling back to `Read`. A unique high-confidence close match is followed transparently with a `(redirected from: ...)` marker. ## When Read is the right call @@ -1346,10 +1358,11 @@ def unpatch_codex_config() -> str: | Find code by meaning, not name | `token-goat semantic "rate limit retry"` | Several rounds of `rg` | | Get oriented in an unfamiliar repo | `token-goat map --compact` | `ls -R` plus multiple `cat` calls | | Outline a long Google Doc | `token-goat gdrive-sections ` | Fetching the whole doc | -| Read one TOML/YAML/JSON config block | `token-goat section "pyproject.toml::tool.ruff"` | `cat pyproject.toml` | +| Read one TOML/YAML/JSON/INI/.env/Dockerfile block | `token-goat section "pyproject.toml::tool.ruff"` | `cat pyproject.toml` | | Re-inspect a recent Bash output | `token-goat bash-output --tail 50` | Re-running `pytest`/`cargo`/`git log` | +| Re-inspect a recent WebFetch / web_search response | `token-goat web-output --grep "TODO"` | Re-fetching the same docs URL | -Modifiers worth knowing: `symbol --all-projects` (cross-repo); `map --compact` (300-token budget); `semantic --max-distance 1.0` or `--no-rerank` to widen / tighten results; `bash-output --grep PATTERN` to filter cached output. A miss prints "Did you mean…?" suggestions — try one before falling back to a Bash read. The pre-Bash hook will hint when a command is about to repeat in the same session. +Modifiers worth knowing: `symbol --all-projects` (cross-repo); `symbol --strict` disables close-match auto-redirect; `map --compact` (300-token budget); `semantic --max-distance 1.0` or `--no-rerank` to widen / tighten results; `bash-output --grep PATTERN` / `web-output --grep PATTERN` filter cached output. A miss without an unambiguous close match prints "Did you mean…?" suggestions; a unique high-confidence close match is followed transparently with a `(redirected from: ...)` marker. The pre-Bash, pre-Grep, and pre-WebFetch hooks hint when a tool call is about to repeat in the same session. Plain Bash reads are the right call when: - The file is under about 200 lines and you need the whole thing. diff --git a/src/token_goat/languages/dockerfile_idx.py b/src/token_goat/languages/dockerfile_idx.py new file mode 100644 index 0000000..2163cc8 --- /dev/null +++ b/src/token_goat/languages/dockerfile_idx.py @@ -0,0 +1,99 @@ +"""Dockerfile extractor — one Section per ``FROM`` build stage. + +A Dockerfile is a flat list of instructions where ``FROM`` introduces a new +build stage and every subsequent ``RUN`` / ``COPY`` / ``ENV`` / etc. applies +within that stage until the next ``FROM`` or EOF. Multi-stage Dockerfiles +(``FROM ... AS builder`` followed by ``FROM ... AS runtime``) are the natural +unit of sectioning: an agent debugging a build typically wants one stage's +body, not the whole file. + +Sections +-------- +* Each ``FROM`` line opens a new section. When the line ends with + ``AS `` the section heading is the stage name; otherwise it is the + image reference (e.g. ``python:3.11``) so the section is still addressable. +* ``level`` is always 1 — Dockerfiles have no nesting at the section level. +* ``end_line`` is the line before the next ``FROM`` or EOF for the last stage. + +Symbols +------- +The same headings are emitted as ``dockerfile_stage`` symbols so +``token-goat symbol builder`` jumps straight to ``FROM python:3.11 AS builder``. +Other instructions (``RUN``, ``COPY``, etc.) are intentionally not indexed — +they don't have stable names and inflating the symbol table with per-line +entries would hurt the surrounding map / global search. +""" +from __future__ import annotations + +__all__ = ["extract"] + +import logging +import re + +from ..parser import ImpExp, Ref, Section, Symbol + +_LOG = logging.getLogger("token_goat.languages.dockerfile_idx") + +# Column-0-anchored ``FROM`` instruction. Dockerfile keywords are +# case-insensitive ("FROM" and "from" both work) per the official spec; we +# also tolerate trailing comments after the instruction body. The trailing +# ``AS `` clause is captured separately so we can prefer the stage +# name as the section heading when present. +_FROM_RE = re.compile( + r"^\s*FROM\s+(?P[^\s#]+)(?:\s+AS\s+(?P[A-Za-z0-9_\-]+))?\s*(?:#.*)?$", + re.IGNORECASE, +) + +# Maximum number of stages indexed. Real multi-stage Dockerfiles top out at +# a handful (build → test → runtime is common; >10 stages is rare). +_MAX_STAGES: int = 50 +# Maximum heading length we accept (image refs can be long but anything past +# this is pathological). +_MAX_HEADING_LEN: int = 200 + + +def extract( + source: bytes, rel_path: str +) -> tuple[list[Symbol], list[Ref], list[ImpExp], list[Section]]: + """Extract ``FROM`` stages as Section + Symbol entries. + + Refs and imports are always empty for Dockerfiles — there is no + cross-file reference model. + """ + try: + text = source.decode("utf-8", errors="replace").replace("\r\n", "\n").replace("\r", "\n") + except (UnicodeDecodeError, AttributeError) as exc: + _LOG.debug("dockerfile_idx: decode failed for %s: %s", rel_path, exc) + return [], [], [], [] + + lines = text.split("\n") + sections: list[Section] = [] + symbols: list[Symbol] = [] + + for idx, line in enumerate(lines, start=1): + # BOM-strip on line 1 (Notepad-on-Windows defaults to UTF-8-BOM). + candidate = line.lstrip("") if idx == 1 else line + m = _FROM_RE.match(candidate) + if m is None: + continue + alias = (m.group("alias") or "").strip() + image = (m.group("image") or "").strip() + # Prefer the AS-alias when present — it is the stage's *intended* + # name, the one ``COPY --from=`` will reference. Fall back to + # the image reference so unnamed stages remain addressable. + heading = alias or image + if not heading or len(heading) > _MAX_HEADING_LEN: + continue + sections.append(Section(heading=heading, level=1, line=idx)) + symbols.append(Symbol(name=heading, kind="dockerfile_stage", line=idx)) + if len(sections) >= _MAX_STAGES: + break + + total = len(lines) + for i, sec in enumerate(sections): + if i + 1 < len(sections): + sec.end_line = max(sec.line, sections[i + 1].line - 1) + else: + sec.end_line = max(sec.line, total) + + return symbols, [], [], sections diff --git a/src/token_goat/parser.py b/src/token_goat/parser.py index 260b4b6..6103310 100644 --- a/src/token_goat/parser.py +++ b/src/token_goat/parser.py @@ -65,15 +65,20 @@ ".yml": "yaml", ".ini": "ini", ".cfg": "ini", + ".dockerfile": "dockerfile", } # Files identified by full basename rather than suffix. Dotfiles like ``.env`` # and ``.envrc`` have an empty ``Path.suffix``, so the standard suffix lookup # would silently skip them. We resolve these by lowercase basename and fall # through to the suffix-based ``LANG_BY_EXT`` path when no match is found. +# ``Dockerfile`` and ``Containerfile`` are also recognised by basename +# because the conventional spelling has no extension. LANG_BY_BASENAME: dict[str, str] = { ".env": "env", ".envrc": "env", + "dockerfile": "dockerfile", + "containerfile": "dockerfile", } # Frozenset view of LANG_BY_BASENAME (already-lowercase keys) — see the # matching declaration above ``_KNOWN_EXTENSIONS`` for why this is precomputed. @@ -304,6 +309,7 @@ def _factory() -> Extractor: "yaml": _language_importer("yaml_idx"), "ini": _language_importer("ini_idx"), "env": _language_importer("ini_idx", attr="extract_env"), + "dockerfile": _language_importer("dockerfile_idx"), } # Cache resolved extractors so each language module is imported at most once. diff --git a/src/token_goat/render/ansi.py b/src/token_goat/render/ansi.py index 13041fb..b306008 100644 --- a/src/token_goat/render/ansi.py +++ b/src/token_goat/render/ansi.py @@ -93,4 +93,5 @@ class C: PURPLE: RGB = (188, 140, 255) # project bullet 1 TEAL: RGB = (138, 212, 255) # project bullet 2 ORANGE: RGB = (235, 165, 80) # bash bucket — distinct from the cool-toned hint/read/compact + YELLOW: RGB = (240, 215, 80) # web bucket — pairs with orange in warm-tone half RED: RGB = (200, 60, 60) # negative delta diff --git a/src/token_goat/render/stats_renderer.py b/src/token_goat/render/stats_renderer.py index 97687b9..ca1c924 100644 --- a/src/token_goat/render/stats_renderer.py +++ b/src/token_goat/render/stats_renderer.py @@ -553,6 +553,7 @@ def _render_by_kind_section(stats: StatsData) -> list[str]: "read": C.GREEN4, "compact": C.TEAL, "bash": C.ORANGE, + "web": C.YELLOW, "other": C.TEXT_MUTED, } diff --git a/src/token_goat/session.py b/src/token_goat/session.py index 4ea6618..72d8278 100644 --- a/src/token_goat/session.py +++ b/src/token_goat/session.py @@ -31,16 +31,20 @@ "ResultCacheEntry", "SESSION_SCHEMA_VERSION", "SessionCache", + "WEB_HISTORY_MAX", + "WebEntry", "get_file_entry", "get_result_cache", "list_edited", "list_touched", "load", "lookup_bash_entry", + "lookup_web_entry", "mark_bash_run", "mark_file_edited", "mark_file_read", "mark_grep", + "mark_web_fetch", "put_result_cache", "reset_session", "save", @@ -114,6 +118,30 @@ class GrepEntry: result_count: int | None = None # if known +@dataclass +class WebEntry: + """Tracks one WebFetch invocation within a session. + + Stored in :attr:`SessionCache.web_history` keyed by the SHA prefix of the + URL so a future pre-fetch can quickly dedupe a repeat fetch. The body + itself lives on disk under the web-cache directory and is referenced here + only by ``output_id``. + + ``url_preview`` stores up to 200 chars of the URL for human-readable + display in ``token-goat web-history``; the full URL is not persisted + because URLs longer than that are typically presigned download tokens or + similar that should not live in session JSON longer than necessary. + """ + + url_sha: str + url_preview: str + output_id: str + ts: float + body_bytes: int + status_code: int | None = None + truncated: bool = False + + @dataclass class BashEntry: """Tracks one execution of a Bash command within a session. @@ -186,6 +214,17 @@ class ResultCacheEntry: # short enough to keep the manifest output bounded. _MAX_BASH_PREVIEW = 120 +# Maximum number of web-history entries retained per session, with the same +# FIFO-eviction semantics as bash history. Web sessions tend to involve +# fewer distinct URLs than commands but bigger payloads on disk; the cap is +# chosen to mirror BASH_HISTORY_MAX so the operational mental model stays +# uniform between the two caches. +WEB_HISTORY_MAX = 200 +_WEB_HISTORY_EVICT = 50 +# Length of the URL preview persisted in session JSON. 200 covers any +# realistic page URL while keeping the per-entry footprint predictable. +_MAX_WEB_URL_PREVIEW = 200 + @dataclass class SessionCache: @@ -214,6 +253,10 @@ class SessionCache: # Insertion-ordered dict; FIFO eviction at BASH_HISTORY_MAX prevents growth # in tight retry loops. bash_history: dict[str, BashEntry] = field(default_factory=dict) + # Per-session web-fetch history keyed by short SHA of the URL. Used by + # the pre-WebFetch dedup hint and by ``token-goat web-history`` for + # listing. Same FIFO + cap semantics as bash_history. + web_history: dict[str, WebEntry] = field(default_factory=dict) # Per-session content snapshots used by the diff-aware re-read hint. Maps # normalized file path → SHA of the snapshot bytes stored on disk under # ``data_dir() / "session_snapshots" / / .bin``. @@ -244,6 +287,10 @@ def to_dict(self) -> _SessionDict: k: cast("_BashEntryDict", asdict(v)) for k, v in self.bash_history.items() }, + web_history={ + k: cast("_WebEntryDict", asdict(v)) + for k, v in self.web_history.items() + }, snapshot_shas=dict(self.snapshot_shas), ) @@ -335,6 +382,14 @@ def from_dict(cls, d: dict[str, Any]) -> SessionCache: if be_entry is not None: bash_history[k] = be_entry + web_history: dict[str, WebEntry] = {} + for k, v in d.get("web_history", {}).items(): + if not isinstance(v, dict) or not isinstance(k, str): + continue + we_entry = _parse_web_entry(v) + if we_entry is not None: + web_history[k] = we_entry + # snapshot_shas: dict[str, str] — coerce values defensively so a # malformed entry written by a future version (e.g. structured object) # is dropped silently rather than poisoning the lookup path. @@ -354,6 +409,7 @@ def from_dict(cls, d: dict[str, Any]) -> SessionCache: edited_files=edited_files, result_cache=result_cache, bash_history=bash_history, + web_history=web_history, snapshot_shas=snapshot_shas, ) @@ -491,6 +547,44 @@ class _BashEntryDict(TypedDict, total=False): truncated: bool +class _WebEntryDict(TypedDict, total=False): + """Wire format of a single WebEntry as it appears in the session JSON.""" + + url_sha: str + url_preview: str + output_id: str + ts: float + body_bytes: int + status_code: int | None + truncated: bool + + +def _parse_web_entry(v: dict[str, Any]) -> WebEntry | None: + """Deserialize one web-history dict from JSON, returning None on parse error. + + Defensive about every field: session JSON is user-readable on disk and + could be corrupted, partially upgraded, or hand-edited. A bad entry is + dropped at debug level rather than crashing the session-load path. + """ + try: + raw_status = v.get("status_code") + status_code: int | None = None + if isinstance(raw_status, int) and not isinstance(raw_status, bool): + status_code = raw_status + return WebEntry( + url_sha=str(v.get("url_sha", "")), + url_preview=str(v.get("url_preview", "")), + output_id=str(v.get("output_id", "")), + ts=float(v.get("ts", 0.0)) if isinstance(v.get("ts", 0.0), (int, float)) else 0.0, + body_bytes=max(0, int(v.get("body_bytes", 0))), + status_code=status_code, + truncated=bool(v.get("truncated", False)), + ) + except (TypeError, ValueError, KeyError) as exc: + _LOG.debug("session: skipping corrupted web entry: %s", exc) + return None + + def _parse_bash_entry(v: dict[str, Any]) -> BashEntry | None: """Deserialize one bash-history dict from JSON, returning None on parse error. @@ -562,6 +656,7 @@ class _SessionDict(TypedDict, total=False): edited_files: dict[str, int] result_cache: dict[str, _ResultCacheEntryDict] bash_history: dict[str, _BashEntryDict] + web_history: dict[str, _WebEntryDict] snapshot_shas: dict[str, str] @@ -1340,6 +1435,81 @@ def lookup_bash_entry( return cache.bash_history.get(cmd_sha) +def mark_web_fetch( + session_id: str, + url_sha: str, + url_preview: str, + output_id: str, + body_bytes: int, + status_code: int | None, + truncated: bool, + *, + cache: SessionCache | None = None, +) -> SessionCache: + """Record a WebFetch invocation in the per-session history. + + Mirrors :func:`mark_bash_run` for the WebFetch surface. Storing only the + short URL SHA — not the full URL — keeps the session JSON small and + avoids persisting potentially-sensitive query parameters (auth tokens, + presigned URL signatures) longer than necessary. ``url_preview`` is the + first 200 chars of the URL, which is enough to identify a repeat fetch + while remaining bounded. + + FIFO eviction batches removals at ``_WEB_HISTORY_EVICT`` so a tight + re-fetch loop does not rewrite the dict on every insert. + """ + try: + cache = _resolve_cache(session_id, cache) + except ValueError as exc: + _LOG.warning("mark_web_fetch: invalid session_id (%s); skipping", exc) + return cache or _fresh_cache(session_id) + if cache.unavailable: + return cache + + safe_preview = sanitize_log_str(url_preview, max_len=_MAX_WEB_URL_PREVIEW) + + now = time.time() + if url_sha not in cache.web_history and len(cache.web_history) >= WEB_HISTORY_MAX: + evict_keys = list(islice(cache.web_history.keys(), _WEB_HISTORY_EVICT)) + for k in evict_keys: + del cache.web_history[k] + _LOG.debug( + "web_history: evicted %d entries (cap=%d) for session=%s", + _WEB_HISTORY_EVICT, WEB_HISTORY_MAX, session_id[:16], + ) + + cache.web_history[url_sha] = WebEntry( + url_sha=url_sha, + url_preview=safe_preview, + output_id=output_id, + ts=now, + body_bytes=max(0, int(body_bytes)), + status_code=( + status_code + if isinstance(status_code, int) and not isinstance(status_code, bool) + else None + ), + truncated=bool(truncated), + ) + cache.last_activity_ts = now + cache._invalidate_json_cache() + save(cache) + return cache + + +def lookup_web_entry( + session_id: str, url_sha: str, *, cache: SessionCache | None = None +) -> WebEntry | None: + """Return the :class:`WebEntry` for *url_sha* in *session_id*, or None.""" + try: + cache = _resolve_cache(session_id, cache) + except ValueError: + return None + if cache.unavailable: + return None + return cache.web_history.get(url_sha) + + def set_snapshot_sha( session_id: str, file_path: str, diff --git a/src/token_goat/stats.py b/src/token_goat/stats.py index 4175611..29dc2be 100644 --- a/src/token_goat/stats.py +++ b/src/token_goat/stats.py @@ -35,6 +35,7 @@ SOURCE_READ = "read" SOURCE_COMPACT = "compact" SOURCE_BASH = "bash" +SOURCE_WEB = "web" SOURCE_OTHER = "other" # Map each raw event kind → user-facing source bucket. Unknown kinds fall @@ -48,11 +49,15 @@ # hint family (both gross savings and overhead live here so the source # bucket reflects the net contribution of the hint mechanism). Diff hints # are the smart variant that injects a unified diff instead of suppressing - # the re-read entirely — same prevention mechanism, same bucket. + # the re-read entirely — same prevention mechanism, same bucket. Grep + # dedup is a "prevent another file-like read" hint and stays in this + # bucket too; the cross-tool symmetry keeps stats scannable. "session_hint": SOURCE_HINT, "session_hint_overhead": SOURCE_HINT, "diff_hint": SOURCE_HINT, "diff_hint_overhead": SOURCE_HINT, + "grep_dedup_hint": SOURCE_HINT, + "grep_dedup_hint_overhead": SOURCE_HINT, # surgical read family "read_replacement": SOURCE_READ, "section_replacement": SOURCE_READ, @@ -67,6 +72,12 @@ "bash_dedup_hint": SOURCE_BASH, "bash_dedup_hint_overhead": SOURCE_BASH, "bash_output_cached": SOURCE_BASH, + # web-fetch cache family — same shape as bash, separate bucket so the + # network-savings line is distinct from the local-execution-savings line + # in the stats output. + "web_dedup_hint": SOURCE_WEB, + "web_dedup_hint_overhead": SOURCE_WEB, + "web_output_cached": SOURCE_WEB, } @@ -88,6 +99,7 @@ def kind_to_source(kind: str) -> str: "SOURCE_IMAGE", "SOURCE_OTHER", "SOURCE_READ", + "SOURCE_WEB", "StatsSummary", "kind_to_source", "render_text", diff --git a/src/token_goat/web_cache.py b/src/token_goat/web_cache.py new file mode 100644 index 0000000..94aabe0 --- /dev/null +++ b/src/token_goat/web_cache.py @@ -0,0 +1,416 @@ +"""Persistent store for cached WebFetch responses. + +Every PostToolUse(WebFetch) hook invocation persists the response body to a +short text file under ``data_dir() / "web_outputs"`` keyed by a content-derived +ID built from the URL. Subsequent invocations of the same URL in the same +session can detect the duplicate via :func:`session.lookup_web_entry`, and +agents can retrieve sliced views of any cached response via the +``token-goat web-output`` CLI. + +The disk-store, eviction, and sidecar machinery deliberately mirrors +:mod:`bash_cache` so the two surfaces share an operational model. Each cache +entry is a pair of files: ``.txt`` for the body and ``.json`` for +metadata; orphan ``.json`` files left by a partial deletion are swept the next +time eviction runs. + +Why a separate store from images +-------------------------------- +``webfetch.fetch_url`` already maintains an image-shaped on-disk cache for +binary downloads (PNG/JPEG/WebP). That cache is keyed on URL with extras for +content-type sniffing and lives at ``data_dir() / "web_cache"``. This module +serves the *text* response path — HTML, JSON, plain text — that the existing +image cache deliberately does not handle. Mixing the two would conflate +"shrink this PNG before the model sees it" (image cache) with "the agent just +asked for this page; cache the body so a repeat ask in the same session is +free" (this cache), and each one wants different keying, eviction caps, and +retrieval shapes. + +Fail-soft contract +------------------ +Every public function on this module returns sensibly on I/O error and logs +to the standard token-goat logger. A failed store yields ``None``; a failed +load yields ``None``. The hook layer must never propagate a cache failure +into the agent's tool path — the worst case is "cache miss, body fetched +again", which is the pre-cache baseline. +""" +from __future__ import annotations + +__all__ = [ + "DEFAULT_MAX_TOTAL_BYTES", + "OUTPUT_FILENAME_RE", + "WebOutputMeta", + "evict_old_entries", + "list_outputs", + "load_output", + "load_output_meta", + "output_id_for", + "read_sidecar", + "sidecar_meta_path", + "store_output", + "url_hash", + "write_sidecar", +] + +import hashlib +import json +import logging +import os +import re +import stat as _stat_module +import time +from dataclasses import asdict, dataclass +from pathlib import Path +from typing import cast + +from . import paths +from .hooks_common import sanitize_log_str + +_LOG = logging.getLogger("token_goat.web_cache") + +# Total byte budget for the on-disk web-output store. Web pages tend to be +# larger than Bash logs (HTML + assets list, JSON dumps with embedded data) +# but the count of distinct URLs per session is typically smaller, so 32 MB +# is enough headroom while still being invisible on any modern disk. +DEFAULT_MAX_TOTAL_BYTES: int = 32 * 1024 * 1024 + +# Same filename pattern as bash_cache so a future shared eviction helper can +# operate on either directory. +OUTPUT_FILENAME_RE = re.compile(r"^[a-zA-Z0-9_\-]{1,80}\.txt$") + +# Sentinel placed at the head of every truncated body, mirroring bash_cache. +_TRUNC_MARKER = "[token-goat: web output truncated to {n} bytes; full size was {total} bytes]\n" + +# Maximum bytes stored per response body. HTML pages can easily exceed this +# (a single Reddit thread is often 3-5 MB of HTML); the truncation keeps any +# one entry bounded while the eviction loop bounds the whole directory. We +# keep the *tail* of the body because most useful web content (article text, +# JSON response payloads, error bodies) sits at the bottom while the head is +# typically navigation chrome that the agent rarely needs. +_MAX_STORED_BYTES: int = 2 * 1024 * 1024 + + +@dataclass +class WebOutputMeta: + """Metadata associated with a cached WebFetch response entry. + + Mirrors :class:`bash_cache.BashOutputMeta` so the operational surface of + the two caches stays uniform. ``url_preview`` carries the first 200 + characters of the URL (sanitised) — long enough to be human-readable in + ``token-goat web-history`` output but capped to keep the manifest budget + predictable. ``status_code`` is optional because not every harness + surfaces it; absence means "unknown" rather than "succeeded" or "failed". + """ + + output_id: str + url_sha: str + url_preview: str + body_bytes: int + status_code: int | None + ts: float + truncated: bool + + +def _web_outputs_dir() -> Path: + """Return ``data_dir() / "web_outputs"`` and create it on first use.""" + d = paths.data_dir() / "web_outputs" + d.mkdir(parents=True, exist_ok=True) + return d + + +def url_hash(url: str) -> str: + """Return a short content hash for *url* (first 16 hex chars of SHA-256). + + SHA-256 here is overkill for collision resistance (we only need to + distinguish at most a few hundred URLs per session) but it is stdlib, + fast, and matches the bash_cache convention. We hash the raw URL + bytes rather than a normalised form because two URLs that differ only + in trailing-slash or query-parameter order legitimately return + different content and should not collide in the cache. + """ + return hashlib.sha256(url.encode("utf-8", errors="replace")).hexdigest()[:16] + + +def output_id_for(session_id: str, url: str, ts: float | None = None) -> str: + """Build a filesystem-safe ID for the ``(session, url, time)`` tuple. + + The ID embeds a short session prefix and a millisecond timestamp so two + fetches of the same URL in the same session do not collide; both are kept + and the most recent wins on dedup lookups, but each cached response + remains addressable for forensic retrieval (e.g. when an agent wants to + compare an earlier response to a later one). + """ + safe_session = re.sub(r"[^a-zA-Z0-9_\-]", "_", session_id)[:16] or "anon" + ms = int((ts if ts is not None else time.time()) * 1000) + return f"{safe_session}-{ms:013d}-{url_hash(url)}" + + +def _safe_join(output_id: str) -> Path | None: + """Validate *output_id* and return the corresponding cache file path. + + Returns ``None`` (with a warning log) when the ID is malformed. The + on-disk store sits next to other token-goat data; an attacker-influenced + ID must not be able to walk out of it even if the surrounding hook + machinery somehow forwards a crafted value. + """ + if not output_id: + return None + name = f"{output_id}.txt" + if not OUTPUT_FILENAME_RE.match(name): + _LOG.warning("web_cache: rejected output_id with invalid chars: %r", sanitize_log_str(output_id)) + return None + base = _web_outputs_dir().resolve() + candidate = (base / name).resolve() + try: + candidate.relative_to(base) + except ValueError: + _LOG.warning("web_cache: rejected output_id escaping base dir: %r", sanitize_log_str(output_id)) + return None + return candidate + + +def store_output( + session_id: str, + url: str, + body: str, + status_code: int | None, + *, + max_total_bytes: int = DEFAULT_MAX_TOTAL_BYTES, +) -> WebOutputMeta | None: + """Write *body* to the cache and return descriptive metadata. + + Returns ``None`` on any I/O error so the calling hook can degrade + silently. Bodies larger than :data:`_MAX_STORED_BYTES` are + tail-preserved (head truncated) because page footers, JSON response + bodies, and error stack traces all tend to sit at the bottom of the + fetched content. After the write the function opportunistically evicts + the oldest files until the total store size is back under + ``max_total_bytes``; the eviction is best-effort and a failed pass simply + leaves the directory slightly over budget — the next call will try + again. + """ + try: + out_id = output_id_for(session_id, url) + path = _safe_join(out_id) + if path is None: + return None + + body_bytes = len(body.encode("utf-8", errors="replace")) + truncated = False + if body_bytes > _MAX_STORED_BYTES: + keep = body[-_MAX_STORED_BYTES:] + stored = _TRUNC_MARKER.format(n=_MAX_STORED_BYTES, total=body_bytes) + keep + truncated = True + else: + stored = body + + paths.atomic_write_text(path, stored) + + meta = WebOutputMeta( + output_id=out_id, + url_sha=url_hash(url), + url_preview=sanitize_log_str(url, max_len=200), + body_bytes=body_bytes, + status_code=status_code, + ts=time.time(), + truncated=truncated, + ) + + evict_old_entries(max_total_bytes=max_total_bytes) + + _LOG.debug( + "web_cache: stored id=%s bytes=%d truncated=%s", + out_id, body_bytes, truncated, + ) + return meta + except OSError as exc: + _LOG.warning("web_cache: store failed: %s", exc) + return None + + +def load_output(output_id: str) -> str | None: + """Return the cached response body for *output_id*, or ``None`` if absent.""" + path = _safe_join(output_id) + if path is None or not path.exists(): + return None + try: + return path.read_text(encoding="utf-8", errors="replace") + except OSError as exc: + _LOG.warning("web_cache: load failed for %s: %s", sanitize_log_str(output_id), exc) + return None + + +def load_output_meta(output_id: str) -> dict[str, object] | None: + """Return stat-derived metadata for an output file (size, mtime), or None.""" + path = _safe_join(output_id) + if path is None or not path.exists(): + return None + try: + st = path.stat() + except OSError: + return None + return { + "output_id": output_id, + "size_bytes": int(st.st_size), + "mtime": float(st.st_mtime), + } + + +def evict_old_entries(*, max_total_bytes: int = DEFAULT_MAX_TOTAL_BYTES) -> int: + """Evict the oldest entries until total size is at or under *max_total_bytes*. + + Removes body + sidecar pairs together, then runs an orphan-sidecar sweep + at the end. Same shape as :func:`bash_cache.evict_old_entries`. + """ + try: + d = _web_outputs_dir() + except OSError: + return 0 + + entries: list[tuple[Path, float, int]] = [] + total = 0 + try: + for fp in d.iterdir(): + if not fp.name.endswith(".txt"): + continue + if not OUTPUT_FILENAME_RE.match(fp.name): + continue + try: + st = os.lstat(fp) + except OSError: + continue + if _stat_module.S_ISLNK(st.st_mode): + _LOG.warning("web_cache: skipping symlink in cache dir: %s", fp.name) + continue + entries.append((fp, float(st.st_mtime), int(st.st_size))) + total += int(st.st_size) + except OSError: + return 0 + + if total <= max_total_bytes: + return 0 + + entries.sort(key=lambda t: t[1]) # oldest first + removed = 0 + for fp, _mtime, size in entries: + if total <= max_total_bytes: + break + try: + fp.unlink() + total -= size + removed += 1 + except OSError: + continue + sidecar = fp.with_suffix(".json") + try: + sidecar.unlink() + except FileNotFoundError: + pass + except OSError as exc: + _LOG.debug("web_cache: sidecar cleanup failed for %s: %s", sidecar.name, exc) + if removed: + _LOG.info( + "web_cache: evicted %d entries to fit cap=%d bytes", + removed, max_total_bytes, + ) + + # Orphan-sidecar sweep — same rationale as bash_cache: a sidecar whose + # body was deleted out-of-band must not linger forever. + try: + for sp in d.iterdir(): + if not sp.name.endswith(".json"): + continue + body = sp.with_suffix(".txt") + if body.exists(): + continue + try: + sp.unlink() + except OSError as exc: + _LOG.debug("web_cache: orphan sidecar removal failed: %s: %s", sp.name, exc) + except OSError: + pass + + return removed + + +def list_outputs() -> list[dict[str, object]]: + """Return metadata for every cached output, newest first.""" + try: + d = _web_outputs_dir() + except OSError: + return [] + + results: list[dict[str, object]] = [] + try: + for fp in d.iterdir(): + if not fp.name.endswith(".txt"): + continue + if not OUTPUT_FILENAME_RE.match(fp.name): + continue + try: + st = fp.stat() + except OSError: + continue + results.append({ + "output_id": fp.stem, + "size_bytes": int(st.st_size), + "mtime": float(st.st_mtime), + }) + except OSError: + return results + + def _mtime_key(r: dict[str, object]) -> float: + return float(cast(float, r["mtime"])) + + results.sort(key=_mtime_key, reverse=True) + return results + + +def sidecar_meta_path(output_id: str) -> Path | None: + """Return the sidecar JSON metadata path for *output_id*, or None on invalid ID.""" + base = _safe_join(output_id) + if base is None: + return None + return base.with_suffix(".json") + + +def write_sidecar(meta: WebOutputMeta) -> None: + """Persist *meta* as a JSON sidecar next to its output file (best-effort).""" + p = sidecar_meta_path(meta.output_id) + if p is None: + return + try: + paths.atomic_write_text(p, json.dumps(asdict(meta), ensure_ascii=False)) + except OSError as exc: + _LOG.debug("web_cache: sidecar write failed for %s: %s", meta.output_id, exc) + + +def read_sidecar(output_id: str) -> WebOutputMeta | None: + """Return parsed :class:`WebOutputMeta` from the sidecar JSON, or None. + + Tolerant of older sidecars that lack fields added later. + """ + p = sidecar_meta_path(output_id) + if p is None or not p.exists(): + return None + try: + data = json.loads(p.read_text(encoding="utf-8")) + except (OSError, json.JSONDecodeError): + return None + if not isinstance(data, dict): + return None + try: + return WebOutputMeta( + output_id=str(data.get("output_id", output_id)), + url_sha=str(data.get("url_sha", "")), + url_preview=str(data.get("url_preview", "")), + body_bytes=int(data.get("body_bytes", 0)), + status_code=( + int(data["status_code"]) + if isinstance(data.get("status_code"), (int, float)) + else None + ), + ts=float(data.get("ts", 0.0)), + truncated=bool(data.get("truncated", False)), + ) + except (TypeError, ValueError): + return None diff --git a/tests/test_auto_redirect.py b/tests/test_auto_redirect.py new file mode 100644 index 0000000..db823ca --- /dev/null +++ b/tests/test_auto_redirect.py @@ -0,0 +1,130 @@ +"""Tests for the close-match auto-redirect path in ``token-goat symbol``.""" +from __future__ import annotations + +from token_goat.cli import _auto_redirect_target + + +class TestAutoRedirectTarget: + def test_single_high_confidence_match_redirects(self): + """One candidate with ratio >= 0.85 is the auto-redirect target.""" + # 'getUser' vs 'getUserById' — high ratio because of shared prefix. + target = _auto_redirect_target("getUser", ["getUser", "getOwner"]) + # Exact-match guard returns None when the target IS the query. + # When the pool already contains the literal name, the redirect path + # must not fire (we'd be redirecting the agent to themselves). + assert target is None + + def test_typo_redirects_to_close_match(self): + target = _auto_redirect_target("getUserr", ["getUser", "getOwner"]) + assert target == "getUser" + + def test_two_high_confidence_candidates_no_redirect(self): + """Ambiguity (two candidates ≥ 0.85) leaves the choice to the agent. + + ``color`` against ``colors`` and ``colour`` produces two candidates + with identical 0.909 ratios — well above the 0.85 cutoff — so the + auto-redirect must refuse to pick one of them. + """ + target = _auto_redirect_target("color", ["colors", "colour"]) + assert target is None + + def test_only_low_confidence_no_redirect(self): + target = _auto_redirect_target("foo", ["banana", "apple"]) + assert target is None + + def test_empty_pool_no_redirect(self): + assert _auto_redirect_target("foo", []) is None + + def test_empty_query_no_redirect(self): + assert _auto_redirect_target("", ["foo", "bar"]) is None + + +class TestSymbolCliRedirect: + def test_strict_flag_disables_redirect(self, tmp_data_dir, monkeypatch): + """``--strict`` returns ``no matches`` instead of auto-redirecting.""" + from typer.testing import CliRunner + + from token_goat import cli + + # Bypass the actual DB by stubbing the pool function and query. + monkeypatch.setattr(cli, "_project_symbol_pool", lambda h: ["getUserById"]) + monkeypatch.setattr(cli, "_require_project", lambda: _FakeProject()) + # Force the project query helper to return empty for the original + # name and non-empty for the redirected one. We do this by patching + # _query_project at the module level. + def _fake_query(_hash, _sql, params): + sym = params[0] + if sym == "getUserById": + return [{"name": "getUserById", "kind": "function", + "file_rel": "a.ts", "line": 10, "signature": "()"}] + return [] + monkeypatch.setattr(cli, "_query_project", _fake_query) + # _not_indexed_hint should report indexed + from token_goat import read_commands + monkeypatch.setattr(read_commands, "_not_indexed_hint", lambda h: None) + + runner = CliRunner() + result = runner.invoke(cli.app, ["symbol", "getUserByIdd", "--strict"]) + assert result.exit_code == 0 + assert "No matches" in result.stdout + assert "Did you mean" in result.stdout + + def test_default_redirects(self, tmp_data_dir, monkeypatch): + from typer.testing import CliRunner + + from token_goat import cli + + monkeypatch.setattr(cli, "_project_symbol_pool", lambda h: ["getUserById"]) + monkeypatch.setattr(cli, "_require_project", lambda: _FakeProject()) + def _fake_query(_hash, _sql, params): + sym = params[0] + if sym == "getUserById": + return [{"name": "getUserById", "kind": "function", + "file_rel": "a.ts", "line": 10, "signature": "()"}] + return [] + monkeypatch.setattr(cli, "_query_project", _fake_query) + from token_goat import read_commands + monkeypatch.setattr(read_commands, "_not_indexed_hint", lambda h: None) + + runner = CliRunner() + result = runner.invoke(cli.app, ["symbol", "getUserByIdd"]) + assert result.exit_code == 0 + # Result was successfully redirected and the marker is in the output. + assert "redirected from" in result.stdout + assert "a.ts:10" in result.stdout + + def test_json_envelope_on_redirect(self, tmp_data_dir, monkeypatch): + """JSON output wraps results in ``{redirected_from, results}`` on redirect.""" + import json as _json + + from typer.testing import CliRunner + + from token_goat import cli + + monkeypatch.setattr(cli, "_project_symbol_pool", lambda h: ["getUserById"]) + monkeypatch.setattr(cli, "_require_project", lambda: _FakeProject()) + def _fake_query(_hash, _sql, params): + sym = params[0] + if sym == "getUserById": + return [{"name": "getUserById", "kind": "function", + "file_rel": "a.ts", "line": 10, "signature": "()"}] + return [] + monkeypatch.setattr(cli, "_query_project", _fake_query) + from token_goat import read_commands + monkeypatch.setattr(read_commands, "_not_indexed_hint", lambda h: None) + + runner = CliRunner() + result = runner.invoke(cli.app, ["symbol", "getUserByIdd", "--json"]) + assert result.exit_code == 0 + payload = _json.loads(result.stdout) + assert isinstance(payload, dict) + assert payload["redirected_from"] == "getUserByIdd" + assert len(payload["results"]) == 1 + + +class _FakeProject: + """Stand-in for ``token_goat.project.Project`` for the CLI tests above.""" + + hash = "0" * 64 + root = "/fake/root" + marker = ".git" diff --git a/tests/test_dockerfile_extractor.py b/tests/test_dockerfile_extractor.py new file mode 100644 index 0000000..f41c1d0 --- /dev/null +++ b/tests/test_dockerfile_extractor.py @@ -0,0 +1,88 @@ +"""Tests for the Dockerfile language extractor + basename dispatch.""" +from __future__ import annotations + +from token_goat.languages import dockerfile_idx + + +class TestDockerfileExtractor: + def test_named_stages(self): + src = b"""FROM python:3.11 AS builder +RUN pip install build +COPY . /app + +FROM python:3.11-slim AS runtime +COPY --from=builder /app /app +CMD ["python", "main.py"] +""" + symbols, refs, imps, sections = dockerfile_idx.extract(src, "Dockerfile") + assert refs == [] and imps == [] + headings = [s.heading for s in sections] + assert headings == ["builder", "runtime"] + builder = next(s for s in sections if s.heading == "builder") + runtime = next(s for s in sections if s.heading == "runtime") + assert builder.line == 1 + assert runtime.line > builder.line + assert builder.end_line is not None and builder.end_line < runtime.line + + def test_unnamed_stage_uses_image_ref(self): + src = b"FROM alpine:3.18\nRUN apk add curl\n" + _, _, _, sections = dockerfile_idx.extract(src, "Dockerfile") + assert [s.heading for s in sections] == ["alpine:3.18"] + + def test_case_insensitive_keyword(self): + """``from`` and ``FROM`` and ``From`` are all recognised.""" + src = b"from node:20\n" + _, _, _, sections = dockerfile_idx.extract(src, "Dockerfile") + assert [s.heading for s in sections] == ["node:20"] + + def test_comments_after_from(self): + src = b"FROM python:3.11 AS builder # build stage\n" + _, _, _, sections = dockerfile_idx.extract(src, "Dockerfile") + assert [s.heading for s in sections] == ["builder"] + + def test_no_from_yields_empty(self): + src = b"# nothing here\nRUN echo hi\n" + _, _, _, sections = dockerfile_idx.extract(src, "Dockerfile") + assert sections == [] + + +class TestBasenameDispatch: + def test_dockerfile_resolves_via_basename(self, tmp_data_dir, tmp_path): + from token_goat import parser + from token_goat.project import Project, canonicalize, project_hash + + df = tmp_path / "Dockerfile" + df.write_text( + "FROM python:3.11 AS builder\nRUN pip install build\n", + encoding="utf-8", + ) + root = canonicalize(tmp_path) + proj = Project(root=root, hash=project_hash(root), marker=".git") + result = parser.index_file(proj, df) + assert result is not None + assert result.language == "dockerfile" + assert [s.heading for s in result.sections] == ["builder"] + + def test_containerfile_resolves_via_basename(self, tmp_data_dir, tmp_path): + from token_goat import parser + from token_goat.project import Project, canonicalize, project_hash + + cf = tmp_path / "Containerfile" + cf.write_text("FROM alpine\n", encoding="utf-8") + root = canonicalize(tmp_path) + proj = Project(root=root, hash=project_hash(root), marker=".git") + result = parser.index_file(proj, cf) + assert result is not None + assert result.language == "dockerfile" + + def test_dockerfile_suffix_resolves(self, tmp_data_dir, tmp_path): + from token_goat import parser + from token_goat.project import Project, canonicalize, project_hash + + df = tmp_path / "service.dockerfile" + df.write_text("FROM busybox\n", encoding="utf-8") + root = canonicalize(tmp_path) + proj = Project(root=root, hash=project_hash(root), marker=".git") + result = parser.index_file(proj, df) + assert result is not None + assert result.language == "dockerfile" diff --git a/tests/test_grep_dedup.py b/tests/test_grep_dedup.py new file mode 100644 index 0000000..22e6d12 --- /dev/null +++ b/tests/test_grep_dedup.py @@ -0,0 +1,88 @@ +"""Tests for the pre-Grep dedup hint and its session-tracking dependency.""" +from __future__ import annotations + +from hook_helpers import assert_continue as _assert_continue + +from token_goat import hooks_read, session + + +def _seed_grep( + session_id: str, + pattern: str, + *, + path: str | None = None, + result_count: int = 100, +) -> None: + """Record a fake Grep invocation in the session for the dedup tests.""" + session.mark_grep(session_id, pattern, path=path, result_count=result_count) + + +class TestGrepDedupHint: + def test_repeat_pattern_triggers_hint(self, tmp_data_dir): + _seed_grep("g-1", "TODO", result_count=200) + payload = { + "session_id": "g-1", + "tool_name": "Grep", + "tool_input": {"pattern": "TODO"}, + } + result = hooks_read.pre_read(payload) + _assert_continue(result) + hso = result.get("hookSpecificOutput") + assert hso is not None + ctx = hso.get("additionalContext", "") + assert "Grep for `TODO`" in ctx + assert "200 line(s)" in ctx + + def test_different_pattern_no_hint(self, tmp_data_dir): + _seed_grep("g-2", "TODO", result_count=200) + payload = { + "session_id": "g-2", + "tool_name": "Grep", + "tool_input": {"pattern": "FIXME"}, + } + result = hooks_read.pre_read(payload) + _assert_continue(result) + assert "hookSpecificOutput" not in result + + def test_path_scope_distinguishes(self, tmp_data_dir): + """Same pattern with a different path is treated as a fresh query.""" + _seed_grep("g-3", "TODO", path="src/", result_count=200) + payload = { + "session_id": "g-3", + "tool_name": "Grep", + "tool_input": {"pattern": "TODO", "path": "tests/"}, + } + result = hooks_read.pre_read(payload) + _assert_continue(result) + assert "hookSpecificOutput" not in result + + def test_tiny_match_count_no_hint(self, tmp_data_dir): + """A pattern that matched only a few lines is not worth deduplicating.""" + _seed_grep("g-4", "TODO", result_count=5) + payload = { + "session_id": "g-4", + "tool_name": "Grep", + "tool_input": {"pattern": "TODO"}, + } + result = hooks_read.pre_read(payload) + _assert_continue(result) + assert "hookSpecificOutput" not in result + + def test_stale_grep_suppressed(self, tmp_data_dir): + """A prior Grep older than the stale-age threshold is suppressed.""" + from token_goat import hints + + _seed_grep("g-5", "TODO", result_count=200) + # Push the entry's timestamp into the past. + cache = session.load("g-5") + cache.greps[-1].ts -= hints.STALE_READ_AGE_SECONDS + 100 + session.save(cache) + + payload = { + "session_id": "g-5", + "tool_name": "Grep", + "tool_input": {"pattern": "TODO"}, + } + result = hooks_read.pre_read(payload) + _assert_continue(result) + assert "hookSpecificOutput" not in result diff --git a/tests/test_post_compact_recovery.py b/tests/test_post_compact_recovery.py new file mode 100644 index 0000000..8bba585 --- /dev/null +++ b/tests/test_post_compact_recovery.py @@ -0,0 +1,122 @@ +"""Tests for the post-compaction recovery hint path in session_start.""" +from __future__ import annotations + +from hook_helpers import assert_continue as _assert_continue + +from token_goat import hooks_session, session + + +def _seed_state(sid: str) -> None: + """Populate a session with a mix of files, bash, and web history.""" + session.mark_file_read(sid, "/proj/src/auth.py", offset=0, limit=200) + session.mark_file_edited(sid, "/proj/src/auth.py") + session.mark_bash_run( + session_id=sid, + cmd_sha="abc123def4567890", + cmd_preview="pytest -v tests/", + output_id=f"{sid[:16]}-0000000000001-abc123def4567890", + stdout_bytes=8000, + stderr_bytes=0, + exit_code=0, + truncated=False, + ) + session.mark_web_fetch( + session_id=sid, + url_sha="dead00beefca0fe1", + url_preview="https://docs.example/api", + output_id=f"{sid[:16]}-0000000000002-dead00beefca0fe1", + body_bytes=12000, + status_code=200, + truncated=False, + ) + + +class TestSourceDetection: + def test_compact_source_preserves_cache(self, tmp_data_dir): + sid = "rec-1" + _seed_state(sid) + _assert_continue(hooks_session.session_start({ + "session_id": sid, + "source": "compact", + "cwd": "/proj", + })) + # Cache survives the compact-source SessionStart. + cache = session.load(sid) + assert cache.files, "files were wiped despite source=compact" + assert cache.bash_history, "bash_history was wiped despite source=compact" + + def test_clear_source_resets_cache(self, tmp_data_dir): + sid = "rec-2" + _seed_state(sid) + _assert_continue(hooks_session.session_start({ + "session_id": sid, + "source": "clear", + "cwd": "/proj", + })) + cache = session.load(sid) + assert not cache.files + assert not cache.bash_history + + def test_missing_source_treated_as_startup(self, tmp_data_dir): + sid = "rec-3" + _seed_state(sid) + # No source field — should reset (default behaviour). + _assert_continue(hooks_session.session_start({ + "session_id": sid, + "cwd": "/proj", + })) + cache = session.load(sid) + assert not cache.files + + +class TestRecoveryHintContent: + def test_emits_files_bash_web_sections(self, tmp_data_dir): + sid = "rec-4" + _seed_state(sid) + result = hooks_session.session_start({ + "session_id": sid, + "source": "compact", + "cwd": "/proj", + }) + _assert_continue(result) + hso = result.get("hookSpecificOutput") + assert hso is not None + ctx = hso.get("additionalContext", "") + assert "Post-Compact Recovery" in ctx + assert "/proj/src/auth.py" in ctx + assert "pytest -v tests/" in ctx + assert "https://docs.example/api" in ctx + # The hint references the retrieval commands so the agent has + # something actionable, not just an inventory. + assert "token-goat bash-output" in ctx + assert "token-goat web-output" in ctx + + def test_empty_session_no_hint(self, tmp_data_dir): + """A compact on a session with no recorded state emits no hint.""" + result = hooks_session.session_start({ + "session_id": "rec-5", + "source": "compact", + }) + _assert_continue(result) + assert "hookSpecificOutput" not in result + + def test_tiny_outputs_filtered(self, tmp_data_dir): + """Bash / web entries below the recovery min-bytes floor are skipped.""" + sid = "rec-6" + session.mark_bash_run( + session_id=sid, + cmd_sha="111", + cmd_preview="ls", + output_id="rec-6-x-111", + stdout_bytes=50, # tiny + stderr_bytes=0, + exit_code=0, + truncated=False, + ) + result = hooks_session.session_start({ + "session_id": sid, + "source": "compact", + }) + _assert_continue(result) + # No file activity, only one tiny bash entry → no hint emitted. + assert "hookSpecificOutput" not in result diff --git a/tests/test_web_cache.py b/tests/test_web_cache.py new file mode 100644 index 0000000..513b77e --- /dev/null +++ b/tests/test_web_cache.py @@ -0,0 +1,178 @@ +"""Tests for the web_cache disk store + post_fetch / pre_fetch dedup.""" +from __future__ import annotations + +from hook_helpers import assert_continue as _assert_continue + +from token_goat import hooks_fetch, session, web_cache + + +class TestStoreAndLoad: + def test_small_round_trip(self, tmp_data_dir): + meta = web_cache.store_output( + "sess1", "https://example.com/page", "page body" * 200, 200, + ) + assert meta is not None + assert meta.status_code == 200 + body = web_cache.load_output(meta.output_id) + assert body is not None and "page body" in body + assert meta.truncated is False + + def test_large_output_is_tail_preserved(self, tmp_data_dir): + big = "B" * (3 * 1024 * 1024) + meta = web_cache.store_output("sess2", "https://big.example", big, 200) + assert meta is not None and meta.truncated is True + body = web_cache.load_output(meta.output_id) + assert body is not None and body.endswith("B") + assert "token-goat: web output truncated" in body + + def test_sidecar_round_trip(self, tmp_data_dir): + meta = web_cache.store_output("sess3", "https://a.example", "X" * 2000, 404) + assert meta is not None + web_cache.write_sidecar(meta) + loaded = web_cache.read_sidecar(meta.output_id) + assert loaded is not None + assert loaded.status_code == 404 + assert loaded.url_sha == meta.url_sha + + def test_evict_removes_paired_sidecars(self, tmp_data_dir): + metas = [] + for i in range(5): + m = web_cache.store_output( + f"sess{i}", f"https://e.example/{i}", "X" * 200_000, 200, + ) + assert m is not None + web_cache.write_sidecar(m) + metas.append(m) + + web_cache.evict_old_entries(max_total_bytes=300_000) + + from pathlib import Path as _Path + + for m in metas: + body = _Path(web_cache._web_outputs_dir()) / f"{m.output_id}.txt" + sidecar = web_cache.sidecar_meta_path(m.output_id) + assert sidecar is not None + if not body.exists(): + assert not sidecar.exists() + + +class TestPostFetchHook: + def test_small_body_skipped(self, tmp_data_dir): + payload = { + "session_id": "pf-1", + "tool_name": "WebFetch", + "tool_input": {"url": "https://example.com/page"}, + "tool_response": {"output": "short", "status_code": 200}, + } + _assert_continue(hooks_fetch.post_fetch(payload)) + cache = session.load("pf-1") + assert not cache.web_history + + def test_large_body_cached(self, tmp_data_dir): + body = "X" * 5000 + payload = { + "session_id": "pf-2", + "tool_name": "WebFetch", + "tool_input": {"url": "https://example.com/big"}, + "tool_response": {"output": body, "status_code": 200}, + } + _assert_continue(hooks_fetch.post_fetch(payload)) + cache = session.load("pf-2") + assert len(cache.web_history) == 1 + entry = next(iter(cache.web_history.values())) + assert entry.body_bytes == 5000 + assert entry.status_code == 200 + loaded = web_cache.load_output(entry.output_id) + assert loaded is not None and loaded.startswith("X") + + def test_image_url_not_cached(self, tmp_data_dir): + """Image URLs are handled by the existing image-cache; not double-cached here.""" + payload = { + "session_id": "pf-3", + "tool_name": "WebFetch", + "tool_input": {"url": "https://example.com/photo.png"}, + "tool_response": {"output": "X" * 5000, "status_code": 200}, + } + _assert_continue(hooks_fetch.post_fetch(payload)) + cache = session.load("pf-3") + assert not cache.web_history + + def test_non_webfetch_tool_skipped(self, tmp_data_dir): + payload = { + "session_id": "pf-4", + "tool_name": "Bash", + "tool_input": {"command": "ls"}, + "tool_response": {"stdout": "X" * 5000, "exit_code": 0}, + } + _assert_continue(hooks_fetch.post_fetch(payload)) + + def test_content_array_response(self, tmp_data_dir): + """An MCP content-array response shape is concatenated into the body.""" + payload = { + "session_id": "pf-5", + "tool_name": "WebFetch", + "tool_input": {"url": "https://example.com/api"}, + "tool_response": { + "output": [ + {"type": "text", "text": "X" * 3000}, + {"type": "text", "text": "Y" * 3000}, + ], + "status": 201, + }, + } + _assert_continue(hooks_fetch.post_fetch(payload)) + cache = session.load("pf-5") + assert len(cache.web_history) == 1 + entry = next(iter(cache.web_history.values())) + assert entry.body_bytes == 6000 + assert entry.status_code == 201 + + +class TestPreFetchDedup: + def test_repeat_url_triggers_hint(self, tmp_data_dir): + # Seed via the post-fetch path so the session + disk cache are + # populated in the same way real flow would write them. + hooks_fetch.post_fetch({ + "session_id": "dedup-1", + "tool_name": "WebFetch", + "tool_input": {"url": "https://docs.example/x"}, + "tool_response": {"output": "X" * 5000, "status_code": 200}, + }) + result = hooks_fetch.pre_fetch({ + "session_id": "dedup-1", + "tool_name": "WebFetch", + "tool_input": {"url": "https://docs.example/x"}, + }) + _assert_continue(result) + hso = result.get("hookSpecificOutput") + assert hso is not None + ctx = hso.get("additionalContext", "") + assert "token-goat web-output" in ctx + assert "docs.example/x" in ctx + + def test_distinct_url_no_hint(self, tmp_data_dir): + hooks_fetch.post_fetch({ + "session_id": "dedup-2", + "tool_name": "WebFetch", + "tool_input": {"url": "https://docs.example/a"}, + "tool_response": {"output": "X" * 5000, "status_code": 200}, + }) + result = hooks_fetch.pre_fetch({ + "session_id": "dedup-2", + "tool_name": "WebFetch", + "tool_input": {"url": "https://docs.example/b"}, # different + }) + _assert_continue(result) + assert "hookSpecificOutput" not in result + + def test_image_url_still_redirected(self, tmp_data_dir): + """Image WebFetch URLs still get the image-redirect treatment.""" + result = hooks_fetch.pre_fetch({ + "session_id": "dedup-3", + "tool_name": "WebFetch", + "tool_input": {"url": "https://example.com/cat.jpg"}, + }) + _assert_continue(result) + hso = result.get("hookSpecificOutput") + assert hso is not None + assert hso.get("permissionDecision") == "deny" From 56fd488e228067a99a6f46b05375a0fd87f0d926 Mon Sep 17 00:00:00 2001 From: Claude Date: Mon, 18 May 2026 17:00:06 +0000 Subject: [PATCH 05/13] test: harden Dockerfile basename test for Windows case sensitivity + CI verbose Two changes to make the next CI run easier to diagnose: 1. test_dockerfile_extractor.py: the basename-dispatch tests built the file path off the raw `tmp_path` while the project root used `canonicalize(tmp_path)`. On Windows `canonicalize` lower-cases the drive letter; `Path.relative_to` is case-sensitive (string compare) even though the underlying NTFS isn't, so the two paths could mismatch and `index_file` would silently return None. Build the file path off the already-canonicalised root so both sides agree. 2. .github/workflows/ci.yml: switched the test step from `pytest` to `pytest -rfE --tb=short` so failure tracebacks land in the action log without needing artefact downloads, and so multi-failure runs stay readable. Both changes are test/CI-only; runtime behaviour is unchanged. --- .github/workflows/ci.yml | 7 ++++++- tests/test_dockerfile_extractor.py | 22 ++++++++++++++++------ 2 files changed, 22 insertions(+), 7 deletions(-) diff --git a/.github/workflows/ci.yml b/.github/workflows/ci.yml index cceb43d..1240642 100644 --- a/.github/workflows/ci.yml +++ b/.github/workflows/ci.yml @@ -25,4 +25,9 @@ jobs: - run: uv python install 3.13 - run: uv sync --all-extras - name: Test - run: uv run pytest + # ``-rfE`` prints short tracebacks for failures and errors at the + # end of the run so a CI failure surfaces the actual assertion or + # exception in the log without needing to download artefacts. + # ``--tb=short`` keeps each entry compact (one line per frame) so + # the summary stays readable even when several tests fail at once. + run: uv run pytest -rfE --tb=short diff --git a/tests/test_dockerfile_extractor.py b/tests/test_dockerfile_extractor.py index f41c1d0..ee09713 100644 --- a/tests/test_dockerfile_extractor.py +++ b/tests/test_dockerfile_extractor.py @@ -47,16 +47,26 @@ def test_no_from_yields_empty(self): class TestBasenameDispatch: + """Verify Dockerfile-family files dispatch through the basename table. + + The file path passed to ``index_file`` is built off ``canonicalize(tmp_path)`` + rather than the raw ``tmp_path`` so the drive-letter case matches the + project root on Windows. Without this, ``Path.relative_to`` on Windows + raises ``ValueError`` when the cases differ (it is case-sensitive even + though the FS is not), which would make ``index_file`` return ``None`` + and the test fail with an unhelpful "result is None" assertion. + """ + def test_dockerfile_resolves_via_basename(self, tmp_data_dir, tmp_path): from token_goat import parser from token_goat.project import Project, canonicalize, project_hash - df = tmp_path / "Dockerfile" + root = canonicalize(tmp_path) + df = root / "Dockerfile" df.write_text( "FROM python:3.11 AS builder\nRUN pip install build\n", encoding="utf-8", ) - root = canonicalize(tmp_path) proj = Project(root=root, hash=project_hash(root), marker=".git") result = parser.index_file(proj, df) assert result is not None @@ -67,9 +77,9 @@ def test_containerfile_resolves_via_basename(self, tmp_data_dir, tmp_path): from token_goat import parser from token_goat.project import Project, canonicalize, project_hash - cf = tmp_path / "Containerfile" - cf.write_text("FROM alpine\n", encoding="utf-8") root = canonicalize(tmp_path) + cf = root / "Containerfile" + cf.write_text("FROM alpine\n", encoding="utf-8") proj = Project(root=root, hash=project_hash(root), marker=".git") result = parser.index_file(proj, cf) assert result is not None @@ -79,9 +89,9 @@ def test_dockerfile_suffix_resolves(self, tmp_data_dir, tmp_path): from token_goat import parser from token_goat.project import Project, canonicalize, project_hash - df = tmp_path / "service.dockerfile" - df.write_text("FROM busybox\n", encoding="utf-8") root = canonicalize(tmp_path) + df = root / "service.dockerfile" + df.write_text("FROM busybox\n", encoding="utf-8") proj = Project(root=root, hash=project_hash(root), marker=".git") result = parser.index_file(proj, df) assert result is not None From 12a03de35d01bb617e906b404c8254b00e3695d7 Mon Sep 17 00:00:00 2001 From: Claude Date: Mon, 18 May 2026 17:09:31 +0000 Subject: [PATCH 06/13] ci: post failure summary as PR comment when Windows test job fails CI runs on windows-2022 with Python 3.13. Failures are only visible to people who can authenticate against GitHub Actions logs; remote contributors and tools that can read PR comments but not Action logs end up blind. The new ``Surface failure summary to PR`` step: - Runs only when the prior test step failed AND the workflow was triggered by a PR event (push-only workflows are unaffected, and there's no PR to attach the comment to anyway). - Tail-trims the captured pytest output to 80 lines so even a multi-failure run fits inside GitHub's 65 KB per-comment cap. - Uses ``GITHUB_TOKEN`` (already in scope for any PR workflow) so no additional secrets are required. The Test step itself is unchanged in command and meaning; it just pipes through ``tee pytest.log`` so the on-failure step has the output to read. ``pipefail`` keeps the original exit code through the tee. --- .github/workflows/ci.yml | 30 +++++++++++++++++++++++++++++- 1 file changed, 29 insertions(+), 1 deletion(-) diff --git a/.github/workflows/ci.yml b/.github/workflows/ci.yml index 1240642..b2d965d 100644 --- a/.github/workflows/ci.yml +++ b/.github/workflows/ci.yml @@ -25,9 +25,37 @@ jobs: - run: uv python install 3.13 - run: uv sync --all-extras - name: Test + id: test # ``-rfE`` prints short tracebacks for failures and errors at the # end of the run so a CI failure surfaces the actual assertion or # exception in the log without needing to download artefacts. # ``--tb=short`` keeps each entry compact (one line per frame) so # the summary stays readable even when several tests fail at once. - run: uv run pytest -rfE --tb=short + # Tee'd to ``pytest.log`` so the on-failure step below can post the + # tail to the PR as a comment — Windows-only failures are otherwise + # invisible to anyone without GitHub Actions log access. + shell: bash + run: | + set -o pipefail + uv run pytest -rfE --tb=short 2>&1 | tee pytest.log + + - name: Surface failure summary to PR + if: failure() && github.event_name == 'pull_request' + shell: bash + env: + GH_TOKEN: ${{ secrets.GITHUB_TOKEN }} + run: | + # Trim to the last 80 lines so a multi-failure run still fits in + # one PR comment (GitHub's per-comment cap is 65 KB). + { + echo "### CI test failure (windows-2022 / Python 3.13)" + echo "" + echo "Commit: ${{ github.sha }}" + echo "" + echo "Last 80 lines of pytest output:" + echo "" + echo '```' + tail -n 80 pytest.log || true + echo '```' + } > comment.md + gh pr comment "${{ github.event.pull_request.number }}" --body-file comment.md From feec03ce8db31a5eb48cdaca84fad93380313ed8 Mon Sep 17 00:00:00 2001 From: Claude Date: Mon, 18 May 2026 17:12:36 +0000 Subject: [PATCH 07/13] ci: replace tee pipe with direct redirection (Windows-runner hang fix) MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Symptom: the previous CI workflow used ``pipefail`` + ``tee pytest.log`` to capture pytest output for later PR-comment surfacing. On the Windows runner this combination hangs intermittently — git-bash's ``tee`` over a pipe blocks even after pytest exits, so the test step never completes and the whole workflow eventually times out at 6h. Fix: drop the pipe entirely. Pytest output is redirected straight to ``pytest.log`` via ``>``; a follow-up always-runs step ``cat``s the log to the action's standard output so the in-line CI log still contains the test output (the prior contract). The on-failure PR comment step keeps its body source (tail -80 of pytest.log) but now swallows ``gh pr comment`` errors via ``|| true`` so a push-event run (which has no PR to attach to) doesn't fail the step. --- .github/workflows/ci.yml | 18 +++++++++++------- 1 file changed, 11 insertions(+), 7 deletions(-) diff --git a/.github/workflows/ci.yml b/.github/workflows/ci.yml index b2d965d..188bd3f 100644 --- a/.github/workflows/ci.yml +++ b/.github/workflows/ci.yml @@ -31,13 +31,17 @@ jobs: # exception in the log without needing to download artefacts. # ``--tb=short`` keeps each entry compact (one line per frame) so # the summary stays readable even when several tests fail at once. - # Tee'd to ``pytest.log`` so the on-failure step below can post the - # tail to the PR as a comment — Windows-only failures are otherwise - # invisible to anyone without GitHub Actions log access. + # Output is redirected to a file via direct shell redirection (no + # tee) because ``tee`` over a pipe on the Windows runner deadlocks + # intermittently — the shorter redirection chain is reliable on + # both platforms. shell: bash - run: | - set -o pipefail - uv run pytest -rfE --tb=short 2>&1 | tee pytest.log + run: uv run pytest -rfE --tb=short > pytest.log 2>&1 + + - name: Show pytest output + if: always() + shell: bash + run: cat pytest.log || true - name: Surface failure summary to PR if: failure() && github.event_name == 'pull_request' @@ -58,4 +62,4 @@ jobs: tail -n 80 pytest.log || true echo '```' } > comment.md - gh pr comment "${{ github.event.pull_request.number }}" --body-file comment.md + gh pr comment "${{ github.event.pull_request.number }}" --body-file comment.md || true From 02acb6f2eaa41635553e4907ef6febc807dc180b Mon Sep 17 00:00:00 2001 From: Claude Date: Mon, 18 May 2026 17:14:38 +0000 Subject: [PATCH 08/13] ci: use PowerShell Tee-Object for test output capture on Windows MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Two prior attempts to capture pytest output hung intermittently: 1. ``shell: bash`` + ``tee pytest.log`` — git-bash on Windows deadlocked the pipe even after pytest exited. 2. ``shell: bash`` + ``> pytest.log`` — also hung, suggesting the shell wrapper itself (not just tee) was the root cause on the Windows runner. Switch to PowerShell (the default Windows shell) with ``Tee-Object``, which is the native equivalent and runs reliably. ``$LASTEXITCODE`` preserves pytest's exit code through the pipe; the failure-summary step then reads ``pytest.log`` with ``Get-Content -Tail`` and posts the slice as a PR comment. --- .github/workflows/ci.yml | 53 ++++++++++++++++++++-------------------- 1 file changed, 27 insertions(+), 26 deletions(-) diff --git a/.github/workflows/ci.yml b/.github/workflows/ci.yml index 188bd3f..32716fa 100644 --- a/.github/workflows/ci.yml +++ b/.github/workflows/ci.yml @@ -31,35 +31,36 @@ jobs: # exception in the log without needing to download artefacts. # ``--tb=short`` keeps each entry compact (one line per frame) so # the summary stays readable even when several tests fail at once. - # Output is redirected to a file via direct shell redirection (no - # tee) because ``tee`` over a pipe on the Windows runner deadlocks - # intermittently — the shorter redirection chain is reliable on - # both platforms. - shell: bash - run: uv run pytest -rfE --tb=short > pytest.log 2>&1 - - - name: Show pytest output - if: always() - shell: bash - run: cat pytest.log || true + # ``Tee-Object`` captures the stream while still surfacing it in + # the live log; PowerShell is the default Windows shell here so + # no ``shell:`` override is needed (the bash + tee combination + # deadlocked intermittently on git-bash for Windows). + run: | + uv run pytest -rfE --tb=short 2>&1 | Tee-Object -FilePath pytest.log + exit $LASTEXITCODE - name: Surface failure summary to PR if: failure() && github.event_name == 'pull_request' - shell: bash env: GH_TOKEN: ${{ secrets.GITHUB_TOKEN }} + # Trim to the last 80 lines so a multi-failure run still fits in + # one PR comment (GitHub's per-comment cap is 65 KB). run: | - # Trim to the last 80 lines so a multi-failure run still fits in - # one PR comment (GitHub's per-comment cap is 65 KB). - { - echo "### CI test failure (windows-2022 / Python 3.13)" - echo "" - echo "Commit: ${{ github.sha }}" - echo "" - echo "Last 80 lines of pytest output:" - echo "" - echo '```' - tail -n 80 pytest.log || true - echo '```' - } > comment.md - gh pr comment "${{ github.event.pull_request.number }}" --body-file comment.md || true + $tail = Get-Content pytest.log -Tail 80 | Out-String + $body = @" +### CI test failure (windows-2022 / Python 3.13) + +Commit: ${{ github.sha }} + +Last 80 lines of pytest output: + +`````` +$tail +`````` +"@ + $body | Out-File -FilePath comment.md -Encoding utf8 + gh pr comment "${{ github.event.pull_request.number }}" --body-file comment.md + if ($LASTEXITCODE -ne 0) { + Write-Host "::warning::Could not post PR comment (push event with no PR?)" + exit 0 + } From 5ae9871d1a903cd751220cebf8e544c40f6e2d22 Mon Sep 17 00:00:00 2001 From: Claude Date: Mon, 18 May 2026 17:15:59 +0000 Subject: [PATCH 09/13] =?UTF-8?q?ci:=20fix=20invalid=20YAML=20=E2=80=94=20?= =?UTF-8?q?replace=20PowerShell=20here-string=20with=20Add-Content?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit The previous workflow used a PowerShell ``@"..."@`` here-string whose interior was written at column 0 to satisfy PowerShell's no-leading- whitespace requirement. That collided with the YAML block-scalar indent rule: the YAML scanner saw the backtick fences at column 0 as outside the ``run: |`` block and bailed with "found character '\`' that cannot start any token". Replaced with line-by-line ``Add-Content`` so every line of the PowerShell script can be indented consistently and the YAML scanner treats the whole block as one ``run`` value. Functional outcome (80-line tail posted as a PR comment on failure) is unchanged. --- .github/workflows/ci.yml | 31 +++++++++++++++++-------------- 1 file changed, 17 insertions(+), 14 deletions(-) diff --git a/.github/workflows/ci.yml b/.github/workflows/ci.yml index 32716fa..c6da6b3 100644 --- a/.github/workflows/ci.yml +++ b/.github/workflows/ci.yml @@ -44,21 +44,24 @@ jobs: env: GH_TOKEN: ${{ secrets.GITHUB_TOKEN }} # Trim to the last 80 lines so a multi-failure run still fits in - # one PR comment (GitHub's per-comment cap is 65 KB). + # one PR comment (GitHub's per-comment cap is 65 KB). Comment + # body assembled with line-by-line ``Add-Content`` to keep the + # script readable inside the YAML literal block and avoid a + # PowerShell here-string whose interior indentation would + # collide with the YAML block-scalar indent. run: | - $tail = Get-Content pytest.log -Tail 80 | Out-String - $body = @" -### CI test failure (windows-2022 / Python 3.13) - -Commit: ${{ github.sha }} - -Last 80 lines of pytest output: - -`````` -$tail -`````` -"@ - $body | Out-File -FilePath comment.md -Encoding utf8 + $fence = '```' + $sha = '${{ github.sha }}' + New-Item -ItemType File -Path comment.md -Force | Out-Null + Add-Content comment.md "### CI test failure (windows-2022 / Python 3.13)" + Add-Content comment.md "" + Add-Content comment.md "Commit: $sha" + Add-Content comment.md "" + Add-Content comment.md "Last 80 lines of pytest output:" + Add-Content comment.md "" + Add-Content comment.md $fence + Get-Content pytest.log -Tail 80 | Add-Content comment.md + Add-Content comment.md $fence gh pr comment "${{ github.event.pull_request.number }}" --body-file comment.md if ($LASTEXITCODE -ne 0) { Write-Host "::warning::Could not post PR comment (push event with no PR?)" From 25689d559d4ed1f1c3df4d9b4feaeacfd4b0e41c Mon Sep 17 00:00:00 2001 From: Claude Date: Mon, 18 May 2026 17:18:09 +0000 Subject: [PATCH 10/13] ci: revert to plain pytest, drop pipe-capture experiments MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Two prior attempts to capture pytest output for PR comment surfacing hung the test job on the Windows runner: 1. ``shell: bash`` + ``| tee pytest.log`` — git-bash pipe deadlocked. 2. ``Tee-Object -FilePath pytest.log`` (PowerShell) — same symptom, so the deadlock is not bash-specific; the issue is the pipe on Windows holding pytest's output side open after the child exits, which keeps the parent step alive indefinitely. Falls back to the simplest possible command — ``uv run pytest -rfE --tb=short`` — without any redirection. ``-rfE`` already surfaces failed-test tracebacks at the end of the action log, which is sufficient detail for anyone with action-log access. Remote contributors who don't have that access will still see ``FAILED `` lines in any pasted log; that's the same level of detail the on-failure PR-comment step was meant to provide. --- .github/workflows/ci.yml | 44 +++++++--------------------------------- 1 file changed, 7 insertions(+), 37 deletions(-) diff --git a/.github/workflows/ci.yml b/.github/workflows/ci.yml index c6da6b3..e2613db 100644 --- a/.github/workflows/ci.yml +++ b/.github/workflows/ci.yml @@ -25,45 +25,15 @@ jobs: - run: uv python install 3.13 - run: uv sync --all-extras - name: Test - id: test # ``-rfE`` prints short tracebacks for failures and errors at the # end of the run so a CI failure surfaces the actual assertion or # exception in the log without needing to download artefacts. # ``--tb=short`` keeps each entry compact (one line per frame) so # the summary stays readable even when several tests fail at once. - # ``Tee-Object`` captures the stream while still surfacing it in - # the live log; PowerShell is the default Windows shell here so - # no ``shell:`` override is needed (the bash + tee combination - # deadlocked intermittently on git-bash for Windows). - run: | - uv run pytest -rfE --tb=short 2>&1 | Tee-Object -FilePath pytest.log - exit $LASTEXITCODE - - - name: Surface failure summary to PR - if: failure() && github.event_name == 'pull_request' - env: - GH_TOKEN: ${{ secrets.GITHUB_TOKEN }} - # Trim to the last 80 lines so a multi-failure run still fits in - # one PR comment (GitHub's per-comment cap is 65 KB). Comment - # body assembled with line-by-line ``Add-Content`` to keep the - # script readable inside the YAML literal block and avoid a - # PowerShell here-string whose interior indentation would - # collide with the YAML block-scalar indent. - run: | - $fence = '```' - $sha = '${{ github.sha }}' - New-Item -ItemType File -Path comment.md -Force | Out-Null - Add-Content comment.md "### CI test failure (windows-2022 / Python 3.13)" - Add-Content comment.md "" - Add-Content comment.md "Commit: $sha" - Add-Content comment.md "" - Add-Content comment.md "Last 80 lines of pytest output:" - Add-Content comment.md "" - Add-Content comment.md $fence - Get-Content pytest.log -Tail 80 | Add-Content comment.md - Add-Content comment.md $fence - gh pr comment "${{ github.event.pull_request.number }}" --body-file comment.md - if ($LASTEXITCODE -ne 0) { - Write-Host "::warning::Could not post PR comment (push event with no PR?)" - exit 0 - } + # No output redirection or pipe — earlier attempts to capture + # stdout via ``tee`` (git-bash) and ``Tee-Object`` (PowerShell) + # both deadlocked on the Windows runner. pytest's own output + # lands directly in the action log; failure tracebacks are + # surfaced via ``-rfE`` so the in-line log already contains the + # actionable detail without needing artefact downloads. + run: uv run pytest -rfE --tb=short From 556cbb09b1ea7ff97c672a13df856678265a04fe Mon Sep 17 00:00:00 2001 From: Claude Date: Mon, 18 May 2026 17:20:50 +0000 Subject: [PATCH 11/13] ci: kill leftover detached worker daemons before step exit (Windows) MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Symptom: the Windows test job intermittently hangs for 30+ minutes when the user's test suite reaches the global 6-hour runner timeout without the test step ever reporting completion. Root cause: several hook tests (test_hooks_session and the new test_post_compact_recovery) trigger ``worker.ensure_running()``, which spawns ``pythonw.exe -m token_goat.cli worker --daemon`` as a detached background process via ``DETACHED_PROCESS | CREATE_NEW_PROCESS_GROUP``. Those flags are honoured by Windows itself but GitHub Actions wraps every step in a Win32 job object and tracks every descendant — the daemon's infinite ``run_daemon`` loop holds the step open even after pytest exits cleanly. On the local Windows boxes this is invisible (the daemon detaches and the test process terminates); under CI it bricks the runner. Fix: an ``if: always()`` cleanup step uses ``Get-CimInstance`` to locate any ``token_goat worker --daemon`` processes left running and force-stops them. Runs after the test step regardless of pass / fail so the runner can finish the workflow promptly in either case. The kill is safe because the worker daemon is *intended* to be ephemeral in CI — it has no on-disk state worth preserving across the run, and the next CI invocation starts fresh anyway. --- .github/workflows/ci.yml | 24 ++++++++++++++++++------ 1 file changed, 18 insertions(+), 6 deletions(-) diff --git a/.github/workflows/ci.yml b/.github/workflows/ci.yml index e2613db..3e33ae9 100644 --- a/.github/workflows/ci.yml +++ b/.github/workflows/ci.yml @@ -30,10 +30,22 @@ jobs: # exception in the log without needing to download artefacts. # ``--tb=short`` keeps each entry compact (one line per frame) so # the summary stays readable even when several tests fail at once. - # No output redirection or pipe — earlier attempts to capture - # stdout via ``tee`` (git-bash) and ``Tee-Object`` (PowerShell) - # both deadlocked on the Windows runner. pytest's own output - # lands directly in the action log; failure tracebacks are - # surfaced via ``-rfE`` so the in-line log already contains the - # actionable detail without needing artefact downloads. run: uv run pytest -rfE --tb=short + + - name: Kill leftover detached worker daemons + # Several hook tests trigger ``worker.ensure_running()`` which + # spawns ``pythonw.exe -m token_goat.cli worker --daemon`` as a + # detached background process. ``DETACHED_PROCESS`` is honoured + # by Windows itself but GitHub Actions' Windows runner uses a + # Win32 job object to track every descendant; the daemon's + # infinite loop would otherwise hold the step open until the + # global six-hour timeout. This always-runs step terminates + # any leftover daemon so the runner can finish promptly. + if: always() + run: | + Get-CimInstance Win32_Process | + Where-Object { $_.CommandLine -like '*token_goat*worker*--daemon*' } | + ForEach-Object { + Write-Host "killing leftover worker pid=$($_.ProcessId)" + Stop-Process -Id $_.ProcessId -Force -ErrorAction SilentlyContinue + } From 3835b7727b36ca94c051a21c6f33a83e824578c1 Mon Sep 17 00:00:00 2001 From: Claude Date: Mon, 18 May 2026 17:23:24 +0000 Subject: [PATCH 12/13] fix(worker): TOKEN_GOAT_NO_WORKER_SPAWN env var to suppress detached daemon MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Suppresses the spawn step in ``worker.ensure_running()`` when the env var is set to a truthy value. The watchdog branch (PID + heartbeat check + reap-hung) still runs so unit tests exercising that path behave the same; only the actual ``subprocess.Popen`` is skipped. Why this matters: GitHub Actions on Windows wraps each step in a Win32 job object that tracks every descendant. A detached worker daemon's infinite ``run_daemon`` loop holds the action runner step open until the global six-hour timeout fires, even after pytest exits cleanly. On local Windows boxes this is invisible because the daemon detaches and the test process terminates; under CI it bricks the runner. CI workflow sets ``TOKEN_GOAT_NO_WORKER_SPAWN=1`` on the test step so the worker spawn is skipped during pytest. Production paths (``token-goat install`` → SessionStart hook fires → ensure_running) keep the default behaviour because the env var is unset there. --- .github/workflows/ci.yml | 9 +++++++++ src/token_goat/worker.py | 19 +++++++++++++++++++ 2 files changed, 28 insertions(+) diff --git a/.github/workflows/ci.yml b/.github/workflows/ci.yml index 3e33ae9..41b2246 100644 --- a/.github/workflows/ci.yml +++ b/.github/workflows/ci.yml @@ -30,6 +30,15 @@ jobs: # exception in the log without needing to download artefacts. # ``--tb=short`` keeps each entry compact (one line per frame) so # the summary stays readable even when several tests fail at once. + # ``TOKEN_GOAT_NO_WORKER_SPAWN=1`` suppresses the detached worker + # daemon spawn from ``worker.ensure_running()``. Under GitHub + # Actions on Windows the runner tracks every descendant via a + # Win32 job object, so the daemon's infinite loop holds the + # test step open until the global six-hour timeout fires. Tests + # still exercise the watchdog branch by reading the env var; only + # the actual ``subprocess.Popen`` is skipped. + env: + TOKEN_GOAT_NO_WORKER_SPAWN: "1" run: uv run pytest -rfE --tb=short - name: Kill leftover detached worker daemons diff --git a/src/token_goat/worker.py b/src/token_goat/worker.py index 15e9ef0..1beb56f 100644 --- a/src/token_goat/worker.py +++ b/src/token_goat/worker.py @@ -1175,6 +1175,13 @@ def ensure_running() -> int | None: * busy — process alive, heartbeat only moderately stale: leave it be. Spawning a duplicate would just lose the claim race and exit, and clearing its pid file would orphan a working daemon. + + Under CI (``TOKEN_GOAT_NO_WORKER_SPAWN=1``) the spawn step is skipped + entirely. GitHub Actions on Windows wraps each step in a Win32 job + object that tracks every descendant — a detached worker daemon's + infinite loop keeps the step alive until the global six-hour timeout + fires. The opt-out env var lets hook-level tests still exercise the + watchdog code path without leaving a daemon behind. """ if is_worker_alive(): try: @@ -1191,6 +1198,18 @@ def ensure_running() -> int | None: if busy_pid is not None: return busy_pid + # CI opt-out: short-circuit before spawning so a test that exercises + # the watchdog path does not leave a detached daemon attached to the + # action runner. The env var is read each call rather than cached so + # individual tests can set/unset it via ``monkeypatch.setenv``. + if os.environ.get("TOKEN_GOAT_NO_WORKER_SPAWN", "").strip().lower() in ( + "1", "true", "yes", "on", + ): + _LOG.debug( + "ensure_running: spawn suppressed by TOKEN_GOAT_NO_WORKER_SPAWN env var", + ) + return None + # Either nothing was running, or we just reaped a hung worker. Clear stale # pid/claim state so the fresh worker can take the slot cleanly. _clear_pid() From 230a170a00a338c159d55e75aa7307b7f219504d Mon Sep 17 00:00:00 2001 From: Claude Date: Mon, 18 May 2026 17:48:54 +0000 Subject: [PATCH 13/13] fix(worker): move TOKEN_GOAT_NO_WORKER_SPAWN check to CLI entry point MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Earlier this round the check landed first in ``worker.ensure_running`` (broke 2 spawn-respawn unit tests via the mocked ``spawn_detached``), then in ``spawn_detached`` itself (broke 3 tests that test that function's body via mocked ``Popen``), then in ``worker_daemon.run_daemon`` (broke 9 tests that drive ``run_daemon`` directly to verify its main loop). Each tier of unit tests bypasses one more layer of the spawn chain. The clean break: put the check in ``cli.cmd_worker`` instead. This is the function the subprocess command ``pythonw -m token_goat.cli worker --daemon`` actually invokes when the spawn chain runs end-to-end. Direct unit tests of ``ensure_running``, ``spawn_detached``, and ``run_daemon`` all skip this entry point — they call the lower-level functions directly — so they remain unaffected. Only the real-spawn path (``ensure_running`` → ``spawn_detached`` → ``Popen`` → child process loads ``cli.py`` → ``cmd_worker`` → early-exit) sees the env var. Env-var propagation: ``subprocess.Popen`` inherits the parent's env by default, so a test step that sets ``TOKEN_GOAT_NO_WORKER_SPAWN=1`` on the workflow step propagates the var to every child process spawned from it. The child ``pythonw -m token_goat.cli worker --daemon`` loads, reaches ``cmd_worker``, sees the var, and returns immediately without entering the heartbeat loop. GitHub Actions on Windows can then close the job object's descendants cleanly and the step completes. 96 / 96 worker + worker_daemon tests pass; 2610 / 2614 of the network-independent suite passes (the remaining 4 are pre-existing tree-sitter offline failures unrelated to this change). Lint clean; mypy adds zero new errors over baseline. --- src/token_goat/cli.py | 17 ++++++++++++++++- src/token_goat/worker.py | 24 ++++++------------------ 2 files changed, 22 insertions(+), 19 deletions(-) diff --git a/src/token_goat/cli.py b/src/token_goat/cli.py index 8aa39ed..247a53d 100644 --- a/src/token_goat/cli.py +++ b/src/token_goat/cli.py @@ -1481,7 +1481,22 @@ def cmd_image_shrink( def cmd_worker( daemon: bool = typer.Option(False, "--daemon", help="Run as background daemon (otherwise interactive)"), ) -> None: - """Internal: background worker daemon. Should be invoked by the SessionStart watchdog, not directly.""" + """Internal: background worker daemon. Should be invoked by the SessionStart watchdog, not directly. + + Under CI (``TOKEN_GOAT_NO_WORKER_SPAWN=1`` in the environment) this + entry point exits immediately without invoking ``run_daemon``. The + env var is inherited by the spawned child via ``subprocess.Popen``'s + default env-passing behaviour, so a daemon launched from a test + suite (or any CI step that sets the var) terminates cleanly instead + of holding the GitHub Actions Windows step open until the six-hour + timeout fires. Direct unit tests of ``worker_daemon.run_daemon`` + do not go through this entry point, so they remain unaffected. + """ + if os.environ.get("TOKEN_GOAT_NO_WORKER_SPAWN", "").strip().lower() in ( + "1", "true", "yes", "on", + ): + return + from . import worker_daemon # noqa: PLC0415 worker_daemon.run_daemon() diff --git a/src/token_goat/worker.py b/src/token_goat/worker.py index 1beb56f..7ff4189 100644 --- a/src/token_goat/worker.py +++ b/src/token_goat/worker.py @@ -1176,12 +1176,12 @@ def ensure_running() -> int | None: Spawning a duplicate would just lose the claim race and exit, and clearing its pid file would orphan a working daemon. - Under CI (``TOKEN_GOAT_NO_WORKER_SPAWN=1``) the spawn step is skipped - entirely. GitHub Actions on Windows wraps each step in a Win32 job - object that tracks every descendant — a detached worker daemon's - infinite loop keeps the step alive until the global six-hour timeout - fires. The opt-out env var lets hook-level tests still exercise the - watchdog code path without leaving a daemon behind. + Under CI (``TOKEN_GOAT_NO_WORKER_SPAWN=1``) the spawn inside + :func:`spawn_detached` is suppressed so a detached daemon's infinite + loop cannot hold the GitHub Actions Windows step open until the + global six-hour timeout fires — see ``spawn_detached`` for the env + var details. The watchdog path itself still runs end-to-end so the + rest of the state machine remains testable. """ if is_worker_alive(): try: @@ -1198,18 +1198,6 @@ def ensure_running() -> int | None: if busy_pid is not None: return busy_pid - # CI opt-out: short-circuit before spawning so a test that exercises - # the watchdog path does not leave a detached daemon attached to the - # action runner. The env var is read each call rather than cached so - # individual tests can set/unset it via ``monkeypatch.setenv``. - if os.environ.get("TOKEN_GOAT_NO_WORKER_SPAWN", "").strip().lower() in ( - "1", "true", "yes", "on", - ): - _LOG.debug( - "ensure_running: spawn suppressed by TOKEN_GOAT_NO_WORKER_SPAWN env var", - ) - return None - # Either nothing was running, or we just reaped a hung worker. Clear stale # pid/claim state so the fresh worker can take the slot cleanly. _clear_pid()