diff --git a/.pre-commit-config.yaml b/.pre-commit-config.yaml index d26ae5ec..028118ff 100644 --- a/.pre-commit-config.yaml +++ b/.pre-commit-config.yaml @@ -49,7 +49,10 @@ repos: - id: detect-secrets name: detect secrets args: ['--baseline', '.secrets.baseline'] - exclude: package.lock.json + # explorations/agent-wiki/ holds generated example wiki content + a schema + # doc full of example IDs whose 12-hex guideline content-hashes and session + # UUIDs trip the high-entropy detector; they are identifiers, not secrets. + exclude: 'package.lock.json|^explorations/agent-wiki/' # Plugin render-equality gate — fails if platform-integrations/ has drifted # from plugin-source/. Runs whenever plugin-source/ or the rendered tree diff --git a/.secrets.baseline b/.secrets.baseline index 14ee6fa1..727270c7 100644 --- a/.secrets.baseline +++ b/.secrets.baseline @@ -1,9 +1,9 @@ { "exclude": { - "files": "^.secrets.baseline$|package-lock\\.json$", + "files": "^.secrets.baseline$|package-lock\\.json$|^explorations/agent\\-wiki/", "lines": null }, - "generated_at": "2026-04-29T16:14:59Z", + "generated_at": "2026-06-10T06:41:48Z", "plugins_used": [ { "name": "AWSKeyDetector" @@ -156,11 +156,11 @@ "sandbox/README.md": [ { "hashed_secret": "b792a28a35da9b44fa0ee8a53002e9c238afb1bd", + "is_secret": false, "is_verified": false, - "line_number": 67, + "line_number": 68, "type": "Secret Keyword", - "verified_result": null, - "is_secret": false + "verified_result": null } ], "sandbox/sample.env": [ diff --git a/explorations/agent-wiki/README.md b/explorations/agent-wiki/README.md new file mode 100644 index 00000000..b46679e4 --- /dev/null +++ b/explorations/agent-wiki/README.md @@ -0,0 +1,63 @@ +# agent-wiki + +An exploration in turning agent trajectories into a **reusable, evidence-grounded +wiki** that future agents consult before acting — and the experiments measuring +whether it actually helps. 
+ +The core idea: after an agent finishes a task, distill its trajectory into wiki +pages — episodic **summaries**, atomic **guidelines**, themed **cluster** pages, +and executable **skills** — each linked back to the trajectory that produced it. +A future agent, pointed at the wiki's `AGENTS.md`, retrieves the pages relevant +to its task and applies them instead of re-deriving the recipe. + +## Layout + +``` +explorations/agent-wiki/ +├── skills/ the agent-wiki skill family + the build_agent_wiki.py builder +│ ├── agent-wiki-summarize/ trajectory → episodic summary +│ ├── agent-wiki-extract-guidelines/ trajectory → atomic guidelines +│ ├── agent-wiki-synthesize-skill/ trajectory → executable SKILL.md +│ ├── agent-wiki-consolidate-guidelines/ atomics → themed cluster pages +│ ├── agent-wiki-tasks/ cross-session task-comparison pages +│ ├── agent-wiki-consult/ retrieval-time entry point +│ ├── agent-wiki-ingest/ end-to-end orchestrator (all of the above) +│ └── scripts/build_agent_wiki.py deterministic builder (render-*/catalog) +├── docs/ +│ ├── design.md design & rationale +│ └── schema.md on-disk page/index schema +├── experiments/ the empirical evidence (see RESULTS-SUMMARY.md) +│ ├── RESULTS-SUMMARY.md +│ ├── twobatch-*.md the comparison reports (wiki vs no-wiki; skills vs guidelines; …) +│ ├── pruned-index-hypothesis.md +│ ├── metrics/ per-trial metric rollups (.jsonl) +│ └── harness/ sandbox runner + comparison scripts to reproduce +└── wikis/ worked examples — wikis built by the skills above + ├── wiki-twobatch/ 16-task corpus, guidelines arm + ├── wiki-twobatch-skills/ same corpus, skills-only arm + ├── wiki-twobatch-both/ skills + guidelines + └── wiki-twobatch-pruned/ skills + only no-skill-coverage atomics (delete-on-promote) +``` + +## Reading order + +1. **`docs/design.md`** — what the wiki is and why it's shaped this way. +2. 
**`experiments/RESULTS-SUMMARY.md`** — the running log of findings + (wiki cuts cost ~20% at equal accuracy; skills beat guidelines; pointer + wording is load-bearing; composition matters more than wiki size). +3. **`wikis/wiki-twobatch-skills/`** — open `AGENTS.md`, then `_index.jsonl`, + then any page, to see a real built wiki end-to-end. +4. **`skills/agent-wiki-ingest/SKILL.md`** — how a batch of traces becomes a + wiki in one pass. + +## Scope of this exploration + +These are **benchmark-derived** example wikis (a synthetic 16-task +file-format corpus). The raw per-trial sandbox transcripts and any wikis built from +internal trajectory corpora are intentionally **not** included — only the metric +rollups, the narrative reports, and the benchmark-derived wikis. Source links in +wiki frontmatter are shown in the generic form `trajectories/<session>.json`. + +The skills here are a **standalone reference copy**, runnable via +`skills/scripts/build_agent_wiki.py`; they are not wired into any plugin loader +in this tree. diff --git a/explorations/agent-wiki/docs/design.md b/explorations/agent-wiki/docs/design.md new file mode 100644 index 00000000..83b715df --- /dev/null +++ b/explorations/agent-wiki/docs/design.md @@ -0,0 +1,263 @@ +# Agent-wiki: design & rationale + +*A durable, evidence-grounded knowledge layer mined from an agent's own +trajectories, consulted by future agents at recall-time.* + +This doc explains **why** the agent-wiki is shaped the way it is, **what** +its pieces are, **how** a raw trace becomes a recallable page, and **what +the experiments show**. It is the canonical design statement; for the +operational contracts it links to the recall recipe +([`_default_agents.md`](../skills/scripts/_default_agents.md), +copied into every wiki as `AGENTS.md`), and the empirical log +([`experiments/RESULTS-SUMMARY.md`](../experiments/RESULTS-SUMMARY.md)). + +--- + +## 1. The problem + +Coding agents start every session cold. 
An agent that spent twenty tool +calls last week discovering that a Debian container has no `pip` and +PEP-668 blocks `pip install` will spend twenty tool calls rediscovering it +next week. The knowledge a session produces dies with the session. + +The usual fixes don't hold up: + +- **Hand-authored runbooks** drift from reality and carry no provenance — + you can't tell whether a rule still reflects how the tool behaves, or who + decided it. +- **Raw trajectory stores** keep everything but generalize nothing. They're + too bulky to load at recall-time, and a future agent has to re-derive the + lesson from a transcript instead of reading it. +- **Generic long-term memory** (embed-everything vector stores) is lossy and + unauditable: a retrieved snippet has no chain back to the moment it was + true. + +The goal: a **knowledge layer the agent earns from its own work** — small +enough to consult cheaply, general enough to apply to unseen-but-related +tasks, and auditable down to the transcript that produced each claim. + +## 2. The core idea + +Build a **wiki from agent traces**. Each completed trajectory is distilled +into pages; every page links back to the session it came from. Future agents +**consult the wiki once they know the task they're about to do** — after the +user's request is understood and the task family is clear, before writing +code. + +``` + past sessions the wiki future session +┌──────────────┐ ┌──────────────────┐ ┌──────────────────┐ +│ trajectory A │─┐ │ summaries/ │ │ user states task │ +│ trajectory B │─┼──▶ │ guidelines/ │ ◀─────│ agent reads │ +│ trajectory C │─┘ │ skills/ tasks/ │consult│ _index.jsonl, │ +└──────────────┘ dist.│ _index.jsonl │ │ applies the rule │ + ▲ └──────────────────┘ └──────────────────┘ + └── provenance ──┘ + (each wiki page links back to the trajectory it was distilled from) +``` + +The wiki is **not** a transcript archive and **not** a session-start +preload. 
It's a curated, recall-preferred index of distilled lessons that an +agent pulls from on demand. + +## 3. Design principles + +Each decision below earns its place; the *why* is the point. + +### Provenance is mandatory + +Every page is traceable, in a couple of clicks, to the raw transcript that +produced it: + +``` +guideline.md + ↓ related_summary: +summaries/.md + ↓ sources: +trajectories/.json + ↓ source.transcript_path +~/.../.jsonl (the raw trace) +``` + +Why: a recommendation is only trustworthy if you can audit where it came +from and revise it when the underlying tool behavior changes. Provenance is +what separates this from a generic memory store. Cluster pages aggregate +their members' provenance rather than replacing it. + +### Page kinds, and a retrieval preference order + +The wiki has five page kinds, and `_index.jsonl` sorts them in **recall +preference order**: + +| Kind | What it is | Why it exists | +|---|---|---| +| **cluster** | Themed aggregator over ≥2 atomic guidelines | One consolidated rule instead of N near-duplicate hits | +| **skill** | Callable workflow page + sibling scripts | Directly *executable* — no interpretation needed | +| **guideline** (atomic) | One rule, free-text, trigger-tagged | The base unit; a single distilled lesson | +| **task / subtask** | Cross-session comparison / per-session workstream | Analysis surface, not recall-time advice | +| **summary** | Episodic record of one session | The provenance anchor every other page links to | + +Sort order is `cluster → skill → guideline → task`, so the most +consolidated and most directly-actionable artifacts surface first. The exact +retrieval recipe (parse task → read `_index.jsonl` → filter by tag/trigger → +prefer clusters → read top 2–5) lives in the recall contract; see +[`_default_agents.md`](../skills/scripts/_default_agents.md). 
+ +### Procedural over declarative where possible + +A **guideline** tells a future agent *what to do* ("when pip's module dir is +missing, don't trust `ensurepip`"). A **skill** is a structured workflow page +the agent can *execute* — Overview / When-To-Use / Workflow / optional +sibling scripts it runs via Bash. + +Skills are **recall-preferred over guidelines** because they remove an +interpretation step: the agent reads the SKILL.md and runs the recipe +instead of reconstructing it from advice. §5 shows skills also win on cost. + +### Consolidation + delete-on-promote + +Two cross-trajectory moves keep the recall surface small and non-redundant: + +- **Consolidation** clusters ≥2 atomic guidelines that share a real *rule* + (not merely a topic) into a `<slug>__cluster.md` aggregator. Members stay on + disk with a `superseded_by:` backref — provenance is preserved. +- **Delete-on-promote** (`render-skill --archive-covered` / `render-cluster --archive-members`): when a skill is synthesized + (or a cluster created), the atomics it subsumes are **soft-archived** to + `_archived/`. They leave the recall index but stay auditable on disk; the + `_audit.log` records the move. + +Why: §5's central empirical finding is that **recall quality degrades as the +index grows** — a smaller, non-redundant index helps even on tasks where no +page matches. Consolidation and pruning are how the wiki stays small as it +accumulates traces. + +### Recall-time discipline + +Consult **once you know the task or sub-task** — not at session start (too +vague to match), not as a last resort when stuck (too late). And the +**pointer wording is load-bearing**: a strong-imperative instruction to +consult the wiki gets followed; a soft "you may want to check" gets skipped +(§5, the A/B sweep). The pointer lives in the workspace `CLAUDE.md` / +`AGENTS.md`; placement and wording both matter. + +## 4. 
How a trace becomes a recallable page + +The build pipeline is a sequence of LLM passes, each piping structured JSON +to a deterministic builder +([`build_agent_wiki.py`](../skills/scripts/build_agent_wiki.py)) +that writes the page and maintains the indexes: + +``` +raw trace ─┬─[convert]──▶ normalized JSON + │ + ├─[summarize]─────────▶ summaries/.md render-summary + ├─[extract-guidelines]▶ guidelines/__.md render-guidelines + ├─[synthesize-skill]──▶ skills//SKILL.md render-skill --archive-covered + │ (per trace, above) + ├─[consolidate]───────▶ guidelines/__cluster.md render-cluster + │ (once, cross-corpus) + └─[catalog]───────────▶ _index.jsonl, indexes, backrefs +``` + +| Stage | Skill | Builder subcommand | Scope | +|---|---|---|---| +| Convert | (bob-trace-converter / `normalize_stream_json_transcripts.py`) | — | per trace | +| Summarize | [`agent-wiki-summarize`](../skills/agent-wiki-summarize/SKILL.md) | `render-summary` | per trace | +| Extract guidelines | [`agent-wiki-extract-guidelines`](../skills/agent-wiki-extract-guidelines/SKILL.md) | `render-guidelines` | per trace | +| Synthesize skill | [`agent-wiki-synthesize-skill`](../skills/agent-wiki-synthesize-skill/SKILL.md) | `render-skill` | per trace | +| Consolidate | [`agent-wiki-consolidate-guidelines`](../skills/agent-wiki-consolidate-guidelines/SKILL.md) | `render-cluster` | **cross-corpus, once** | +| Catalog | (any) | `catalog` | bookkeeping | + +**Order matters.** `synthesize-skill` runs *before* `consolidate` so skills +claim recipe-level territory first (and archive the atomics they cover); +consolidation then clusters only the surviving atomics. This matches the +consolidate skill's own rule — don't propose a cluster overlapping a skill's +territory. + +**`catalog` renders; `consolidate` proposes.** A sharp edge worth +internalizing: `catalog` only *materializes* clusters already declared in +`_config.yaml` and refreshes indexes/backrefs. It never *proposes* new +clusters. 
Consolidation is the LLM pass that proposes them. Running `catalog` +and expecting clusters to appear is a mistake — they won't unless +consolidation declared them first. + +### The one-pass entry point + +[`agent-wiki-ingest`](../skills/agent-wiki-ingest/SKILL.md) +orchestrates the whole pipeline end-to-end (convert → bootstrap → summarize +→ extract → synthesize → consolidate → catalog) via subagent fan-out: +summarize runs in parallel (independent file writes), extract and synthesize +run sequentially (they mutate shared index/config state), consolidation runs +once. It exists specifically so the **consolidation pass is never silently +skipped** when ingesting a batch — the failure mode that motivated it. + +### Build patterns + +The same corpus can be turned into a wiki three ways, varying *when* the +wiki is built and *what* the agent sees during each trial (see +[`RESULTS-SUMMARY.md` §3–4](../experiments/RESULTS-SUMMARY.md)): + +- **Open-loop** — trials run against a fixed external wiki; the new wiki is a + study log built from observing them. +- **Closed-loop** — trials mount the wiki being built; it grows trial-by-trial, + so trial N+1 sees what trial N spawned. The only pattern with real + intra-wiki recall data. +- **Retroactive** — the wiki stays empty during all trials, then is built in + one batch afterward. Cleanest pure-recipe corpus. + +The three real-task themes emerge in **all three** patterns — consolidation +is robust to build order. + +## 5. Evidence + +All experiments use the same 16-task corpus, `claude_md_strong` pointer, +3 trials/task. `total_cost_usd` is the ground-truth cost metric (cache reads +bill at ~10% of regular input, so raw token sums overcount). Full tables and +methodology: [`experiments/RESULTS-SUMMARY.md`](../experiments/RESULTS-SUMMARY.md). 
+ +| Finding | Result | Source | +|---|---|---| +| **Wiki vs no wiki** | −20% cost, −38% duration, −43% tool calls, accuracy unchanged (96%) | [twobatch-comparison](../experiments/twobatch-comparison.md) | +| **Pointer wording is load-bearing** | strong-imperative CLAUDE.md 3/3 reads; soft phrasing 1/3 | [RESULTS-SUMMARY §1](../experiments/RESULTS-SUMMARY.md#1-agentsmd-ab-sweep-the-original) | +| **Build pattern is robust** | same 3 clusters emerge open-/closed-/retroactive | [RESULTS-SUMMARY §3–4](../experiments/RESULTS-SUMMARY.md#34-build-pattern-comparison-closed-loop-vs-retroactive) | +| **Skills > guidelines** | skills-only $0.146 vs guidelines $0.17 (−14%), accuracy 98% vs 96% | [twobatch-skills-comparison](../experiments/twobatch-skills-comparison.md) | +| **Composition is non-additive** | skills+guidelines costs +22% vs skills, +5% vs guidelines | [twobatch-fourway-comparison](../experiments/twobatch-fourway-comparison.md) | +| **Composition > size; skills-only still cheapest** | delete-on-promote (corrected index): −3% vs both, +18% vs skills | [twobatch-fiveway-comparison](../experiments/twobatch-fiveway-comparison.md) | + +The throughline across these: + +- **The wiki materially reduces cost at equal accuracy.** Savings come + mainly from fewer tool calls and shorter responses, not from reading fewer + input bytes — the agent reads *more* wiki bytes but acts more directly. +- **A smaller recall surface helps even when nothing matches.** The + skills-only arm beat guidelines-only on tasks where *no skill matched* + (e.g. t2-imports −39%) — evidence that index noise itself costs, which is + why consolidation and delete-on-promote exist. +- **Don't stack page kinds.** Skills + guidelines together is the worst + populated wiki, and pruning the redundant atomics doesn't recover the gap. + Pick procedural-first; let consolidation + archive keep the rest lean. + +## 6. 
Open questions / limitations + +From [`RESULTS-SUMMARY.md`](../experiments/RESULTS-SUMMARY.md)'s open +questions — live, not yet resolved: + +- **Statistical power.** Headline numbers rest on 3 trials/task; per-task + confidence intervals are wide, especially on the two observed regressions + (wav-info, imports). +- **True transfer.** All experiments reuse the same task in build and recall. + A real transfer test (build from tasks Y, recall on task X where X ∈ + family(Y), X ∉ Y) would test whether clusters *generalize* rather than + memorize. +- **Scale.** 16 tasks is small. Does the cost-reduction percentage hold, + grow, or saturate at 50+ tasks and a larger index? +- **Why composition regresses.** The skills+guidelines penalty is + output-token-driven, not read-count-driven — trace-level inspection of why + the agent "says more" when both kinds are present is unresolved. + +## See also + +- [`schema.md`](schema.md) — the on-disk schema reference: directory layout, per-kind frontmatter, links, and the promotion/archival lifecycle. +- [`_default_agents.md`](../skills/scripts/_default_agents.md) — the recall contract copied into every wiki as `AGENTS.md` (page kinds, retrieval recipe, provenance chain). +- [`experiments/RESULTS-SUMMARY.md`](../experiments/RESULTS-SUMMARY.md) — the full empirical log. +- The `agent-wiki-*` skills under [`skills/`](../skills/) and the builder [`build_agent_wiki.py`](../skills/scripts/build_agent_wiki.py). diff --git a/explorations/agent-wiki/docs/schema.md b/explorations/agent-wiki/docs/schema.md new file mode 100644 index 00000000..6fa056db --- /dev/null +++ b/explorations/agent-wiki/docs/schema.md @@ -0,0 +1,483 @@ +# Agent-wiki: on-disk schema reference + +The precise file format of an agent-wiki — directory layout, every page +kind, the load-bearing metadata fields, how pages link, and the lifecycle by +which atomic guidelines get promoted into clusters or archived under skills. 
+ +For the *why* behind this structure, see +[`design.md`](design.md). For the recall-time contract +an agent follows, see +[`_default_agents.md`](../skills/scripts/_default_agents.md) +(copied into each wiki as `AGENTS.md`). The source of truth for everything +below is the builder +[`build_agent_wiki.py`](../skills/scripts/build_agent_wiki.py); +real examples are drawn from the `wiki-twobatch-*` example wikis. + +--- + +## 1. Directory layout + +``` +/ +├── AGENTS.md ← recall contract (bootstrapped from the template) +├── index.md ← human-friendly overview (catalog-generated) +├── _config.yaml ← durable taxonomy: tags, clusters, tasks, overrides +├── _index.jsonl ← agent retrieval index (one row per page) +├── _audit.log ← append-only JSONL log of mutations + recall events +├── _archived/ ← guidelines retired by delete-on-promote +│ └── __.md +├── summaries/ +│ ├── .md ← one episodic summary per session +│ ├── __.md ← arc-split summary (long sessions) +│ └── index.md +├── guidelines/ +│ ├── __.md ← atomic guideline (one rule) +│ ├── __cluster.md ← themed aggregator (recall-preferred) +│ ├── _id_index.json ← guideline id → relpath +│ └── index.md +├── skills/ +│ ├── /SKILL.md ← callable workflow page +│ ├── /scripts/ ← optional sibling scripts (Bash-runnable) +│ ├── _id_index.json ← skill slug → relpath +│ └── index.md +└── tasks/ + ├── __task.md ← cross-session comparison + ├── __subtask.md ← per-session workstream + └── index.md +``` + +**Filename suffixes are the navigation contract.** A page's role is decided +by its suffix, and the tooling relies on it — do not rename: + +| Pattern | Role | +|---|---| +| `__.md` (in `guidelines/`) | atomic guideline; `` = the `id:` | +| `__cluster.md` | cluster aggregator | +| `.md` / `__.md` | summary (single / arc-split) | +| `__task.md` | cross-session task comparison | +| `__subtask.md` | per-session workstream | +| `/SKILL.md` | skill | + +Files prefixed `_` (`_index.jsonl`, `_config.yaml`, `_audit.log`, 
+`_id_index.json`, `_archived/`) are machinery, not content pages. + +--- + +## 2. Page kinds and their frontmatter + +Each page is markdown with YAML frontmatter. Fields are either **authored at +render-time** (written once by the `render-*` pass, stable thereafter) or +**catalog-managed** (recomputed and force-overwritten on every `catalog` +run). The split matters: never hand-edit a catalog-managed field — it will be +clobbered on the next `catalog` run. + +### Summary — `summaries/<session_id>.md` + +`type: episodic-summary`. The provenance anchor every other page links back +to. One per session (or per arc for long, split sessions). + +| Field | Origin | Meaning | +|---|---|---| +| `session_id`, `agent`, `model`, `goal`, `outcome` | render | session identity + one-line goal + success/partial/failure | +| `duration_seconds`, `tools_used`, `sources` | render | wall-clock, tool names, provenance paths (normalized JSON + raw transcript) | +| `recalled_guidelines` | render | guidelines the session saw, each `{id, title, status, evidence?}` | +| `arc`, `sibling_summaries` | render | only on arc-split sessions | +| `tags`, `tool_calls`, `errors`, `dead_end_paths`, `wiki_consulted` | **catalog** | computed from the normalized trajectory | +| `contributed_guidelines`, `contributed_skills` | **catalog** | reverse links — pages this session produced | +| `input_tokens`, `cache_creation_input_tokens`, `cache_read_input_tokens`, `output_tokens`, `total_cost_usd` | **catalog** | token + cost metrics (omitted when zero) | +| `verified_at` | **catalog** | date of last catalog run | + +```yaml +--- +type: episodic-summary +session_id: <uuid> +agent: bob +model: premium +goal: One sentence describing what the user asked for. 
+outcome: success +duration_seconds: 40.3 +tools_used: [execute_command, attempt_completion] +sources: + - trajectories/<session>-openai-chat-completions.analysis.json + - /path/to/raw/session.json +# ── below: catalog-managed ── +tags: [] +tool_calls: 7 +errors: 0 +wiki_consulted: false +contributed_guidelines: [<id>, ...] +contributed_skills: [<slug>, ...] +total_cost_usd: 0.18 +verified_at: 2026-06-09 +--- +``` + +### Atomic guideline — `guidelines/<slug>__<id>.md` + +`type: guideline` (also `workflow` / `script` / `command-template`). One +reusable rule. `<id>` is a 12-hex content hash and equals the `id:`. + +| Field | Origin | Meaning | +|---|---|---| +| `id`, `type` | render | content-hash id; page kind | +| `trigger` | render | situational context when the rule applies | +| `agent` | render | source agent (`bob`, `claude-code`, …); defaults to `claude-code` | +| `tags` | render, then **catalog** | topical tags; catalog re-syncs from `_config.yaml` | +| `sources`, `related_summary` | render | provenance: normalized JSON path + the summary page | +| `cluster`, `superseded_by` | **catalog** | set when this atomic is a cluster member | +| `verified_at` | **catalog** | date of last catalog run | + +The body carries the rule prose, an optional `## Rationale`, a +`## Sources` footer, and a catalog-injected `## Used by` section listing +sessions that recalled it. + +```yaml +--- +id: 84ed6cf26387 +type: guideline +trigger: Need to put a multi-line script inside a running Docker container before executing it. +agent: claude-code +tags: [docker, heredoc, shell, scripting, example] +sources: + - trajectories/df2b08e4-openai-chat-completions.analysis.json +related_summary: summaries/df2b08e4-7853-47ec-9c46-fee4b0a33eb7.md +verified_at: 2026-06-09 +cluster: container-boundary-one-shot__cluster.md # ← stamped by catalog +superseded_by: container-boundary-one-shot__cluster.md # ← stamped by catalog +--- +``` + +### Cluster — `guidelines/<slug>__cluster.md` + +`type: cluster`, `id: cluster:<slug>`. 
A themed aggregator over ≥2 atomic +guidelines that share a rule. **Regenerated whole on every catalog run** from +the membership declared in `_config.yaml`; always `priority: high`. + +```yaml +--- +type: cluster +slug: container-boundary-one-shot +title: Cross the host/container boundary in one docker exec +tags: [docker, container, shell, io] +verified_at: 2026-06-09 +members: + - id: 84ed6cf26387 + link: heredoc-python-scripts-into-the__84ed6cf26387.md + - id: 6c2bd298dd0d + link: read-in-container-files-via-docker-exec__6c2bd298dd0d.md +priority: high +--- +``` + +Body: description, optional `## Takeaway` (the actionable one-line rule), and +a `## Members` table. Members keep their own pages and provenance — the +cluster aggregates, it doesn't absorb. + +### Skill — `skills/<slug>/SKILL.md` + +`type: skill`, `id: skill:<slug>`. A callable workflow page. Authored once by +`render-skill`; **not touched by catalog**. + +| Field | Meaning | +|---|---| +| `name`, `description`, `trigger` | slug, one-paragraph summary, when-to-use | +| `agent`, `sources`, `related_summary` | source agent + provenance | +| `tags`, `verified_at` | topical tags; render date | + +Body: `## Overview`, optional `## When To Use`, `## Workflow`, `## Sources`. +Optional sibling scripts live under `skills/<slug>/scripts/` (shell scripts +are written with mode `755`). 
+ +```yaml +--- +id: skill:transform-json-with-jq-and-persist-filter-args-yaml +type: skill +name: transform-json-with-jq-and-persist-filter-args-yaml +description: Use a single jq pipeline to filter, reshape, and sort JSON to a target schema … +trigger: "A task gives an input JSON and asks for a transformed output plus a YAML of the jq filter + args …" +agent: bob +sources: + - trajectories/d0e03862-openai-chat-completions.analysis.json +related_summary: summaries/d0e03862-30c5-49b6-9aef-b97dcea57dc0.md +verified_at: 2026-06-09 +tags: [jq, json, yaml, example] +--- +``` + +### Task / subtask — `tasks/__task.md`, `tasks/__subtask.md` + +`task-comparison` pages (`id: task:`) are cross-session comparison +tables, **regenerated each catalog run** from `_config.yaml`'s `tasks.` +definition + the sessions it classifies. `subtask` pages (`id: +subtask:`) are per-session workstream narratives, **authored standalone** +and not regenerated. Both carry `type`, `slug`, `title`, `tags`, +`verified_at`; tasks add `sessions:` (row count), subtasks add +`parent_session_id` / `parent_summary`. + +### id conventions + +- **Atomic guidelines**: a 12-hex content hash (e.g. `84ed6cf26387`); the + filename suffix matches, so id ↔ file round-trips. +- **Everything else**: a kind-prefixed slug — `cluster:`, + `skill:`, `task:`, `subtask:`. + +--- + +## 3. Index, config, and audit files + +### `_index.jsonl` — the retrieval index + +One JSON object per line, one line per cluster / skill / guideline / task / +subtask page. This is what an agent reads at recall-time. Rows are sorted +**clusters → skills → guidelines → tasks → subtasks**, so the most +consolidated and directly-actionable artifacts come first. Common keys: +`kind`, `id`, `title`, `tags`, `trigger`, `summary` (≤240-char snippet), +`link`. 
Per-kind extras: clusters add `members` + `priority: high`; skills +add `priority: high`; guideline rows add `cluster` and (when clustered) +`superseded_by`; task rows add `family`; subtask rows add +`parent_session_id` / `parent_summary`. + +```jsonl +{"kind": "cluster", "id": "cluster:container-boundary-one-shot", "title": "Cross the host/container boundary in one docker exec", "tags": ["docker","container","shell","io"], "trigger": "", "summary": "Benchmark tasks frequently live inside a named Docker container…", "link": "guidelines/container-boundary-one-shot__cluster.md", "members": ["84ed6cf26387","6c2bd298dd0d"], "priority": "high"} +{"kind": "skill", "id": "skill:aggregate-jsonl-records-top-n-by-sum-and-count", "title": "aggregate-jsonl-records-top-n-by-sum-and-count", "tags": ["jsonl","python","aggregation","example"], "trigger": "Task gives a directory of large JSONL files…", "summary": "Aggregate many JSONL files in one streaming Python pass…", "link": "skills/aggregate-jsonl-records-top-n-by-sum-and-count/SKILL.md", "priority": "high"} +{"kind": "guideline", "id": "3c019235c9f8", "title": "Format ISO 8601 to YYYY-MM-DD with split T", "tags": ["jq","iso-8601","date-formatting","example"], "trigger": "Inside a jq filter, you need only the calendar date…", "summary": "…use `(.last_login | split(\"T\")[0])`.", "link": "guidelines/format-iso-8601-to-yyyy-mm-dd-with__3c019235c9f8.md", "cluster": null} +``` + +**Archived guidelines are absent from `_index.jsonl`** — that's what makes +archiving remove a page from recall. + +### `_config.yaml` — the durable taxonomy + +The one authored file that survives catalog regeneration. Structure: + +```yaml +schema_version: 1 +tags: + guideline: + : [tag, tag, ...] # guideline id → tags (drives "By tag" + clustering) +clusters: + : + title: + description: + takeaway: + members: [, ...] # the cluster's atomic members + tags: [tag, ...] +tasks: + : + title: + family: + family_match: { goal_substring: [, ...] 
} + intro: + findings: + tags: [tag, ...] +session_family_overrides: + : { family: , trial: , condition: } +``` + +`tags.guideline` and `clusters` are written by `render-guidelines` / +`render-cluster`; `catalog` reads them back to stamp atomic frontmatter and +regenerate cluster pages. `tasks` + `session_family_overrides` drive +task-comparison classification. + +### `_id_index.json` — id → path + +A flat map in both `guidelines/` and `skills/`, used to resolve backlinks +(e.g. a summary's `contributed_guidelines` ids → file paths). Archiving an +atomic **pops** its entry here (see §5). + +```json +{ "84ed6cf26387": "guidelines/heredoc-python-scripts-into-the__84ed6cf26387.md" } +``` + +### `_audit.log` — append-only mutation + recall log + +One JSON line per event. Three action types: + +```jsonl +{"action": "summary.guideline_use", "session_id": "", "id": "", "status": "followed", "ts": "…Z"} +{"action": "synthesize_skill", "session_id": "", "skill_name": "", "scripts": ["run.sh"], "ts": "…Z"} +{"action": "archive_guideline", "id": "", "reason": "covered_by_skill", "target": "", "src": "guidelines/…md", "dst": "_archived/…md", "ts": "…Z"} +``` + +`reason` is `covered_by_skill` or `covered_by_cluster`. The audit log is the +durable record of promotions/archivals even though archived pages leave the +index. + +--- + +## 4. How files link to each other + +Forward links are **authored at render-time**; reverse links are +**recomputed by catalog** from the forward ones. Forward is the source of +truth. 
+ +``` + ┌──────────────────────────── provenance (forward) ───────────────────────────┐ + ▼ │ + guidelines/__.md ──related_summary:──▶ summaries/.md ──sources:──▶ normalized JSON ──▶ raw transcript + ▲ │ + │ contributed_guidelines: / contributed_skills: (reverse — catalog inverts related_summary) + └──────────────────────────────────────────────┘ + + guidelines/__.md ──cluster: / superseded_by:──▶ guidelines/__cluster.md + ▲ │ + └────────────────────── members: ───────────────────────┘ (bidirectional) + + _id_index.json : ──▶ relpath _index.jsonl : row.link ──▶ page file +``` + +- A **guideline → summary → trajectory** chain makes every rule auditable. +- `catalog` builds **`contributed_guidelines` / `contributed_skills`** on the + summary by inverting all guideline/skill `related_summary:` fields — so the + summary knows what it produced without that being hand-maintained. +- **Cluster ↔ member** is bidirectional: the cluster lists `members:`; each + member is stamped `cluster:` + `superseded_by:`. + +--- + +## 5. Lifecycle: promotion & archival + +``` + render-guidelines + │ + ▼ + ┌──────────────────────────┐ + │ ATOMIC │ + │ guidelines/__ │ + │ in _id_index.json │ + │ in _index.jsonl │ + └──────────────────────────┘ + │ │ + render-cluster │ │ render-skill --archive-covered + (+ catalog) │ │ — or — render-cluster --archive-members + ▼ ▼ + ┌────────────────────────┐ ┌──────────────────────────┐ + │ CLUSTERED │ │ ARCHIVED │ + │ file STAYS in place │ │ file MOVES → _archived/ │ + │ +cluster: +superseded… │ │ popped from _id_index │ + │ still in both indexes │ │ ABSENT from _index.jsonl │ + │ cluster row priority:hi│ │ audit: archive_guideline │ + └────────────────────────┘ │ (unreachable at recall) │ + └──────────────────────────┘ +``` + +### ATOMIC → CLUSTERED + +Authored by declaring the cluster (`render-cluster` writes +`_config.yaml/clusters.` + the `__cluster.md` page). 
On the next +`catalog`, each member atomic is **stamped** `cluster:` and `superseded_by:` +in its frontmatter. The member **file stays in place**, stays in +`_id_index.json`, and stays in `_index.jsonl` (now carrying `superseded_by`). +The cluster gets its own `_index.jsonl` row with `priority: high`. At recall +the cluster is preferred; members remain reachable for their original wording ++ provenance. + +### ATOMIC → ARCHIVED (delete-on-promote) + +When a skill (or cluster) subsumes an atomic, the atomic is **soft-archived**: + +1. file moved `guidelines/__.md` → `_archived/__.md` +2. its entry is **popped** from `guidelines/_id_index.json` +3. an `archive_guideline` line is appended to `_audit.log` +4. on the next catalog it is **not scanned** (it's outside `guidelines/`), so + it disappears from `_index.jsonl` — **unreachable at recall**, still on + disk for audit. Reversal is manual. + +Two triggers: + +| Trigger | Flag | Audit `reason` | +|---|---|---| +| Cluster created | `render-cluster --archive-members` | `covered_by_cluster` | +| Skill synthesized | `render-skill --archive-covered` | `covered_by_skill` | + +### Coverage inference (`--archive-covered`) + +A skill archives an atomic only if `_skill_covers_atomic` returns true via +**any** of three conservative paths (biased toward false-negatives — when in +doubt, the atomic survives): + +1. **Tag-superset** — the atomic's tags ⊆ the skill's tags **and** their + intersection has ≥2 tags outside a `_GENERIC_TAGS` stop-set + (`stdlib`, `parsing`, `agent-behavior`, `binary`, `headers`, …). +2. **Slug-keyword** — a ≥4-char, non-stopword token from the skill slug + appears in the atomic's title. +3. **Format-identifier** — an uppercase (`PNG`, `ZIP`) or CamelCase (`WebP`) + token in the skill description appears in the atomic's title. Catches + family-broad skills whose slug abstracts the format names away. + +### What catalog recomputes vs. 
what's authored once + +| Recomputed every `catalog` (force-replaced) | Authored once at render | +|---|---| +| guideline: `verified_at`, `tags`, `cluster`, `superseded_by`; `## Used by` | guideline: `id`, `type`, `agent`, `trigger`, `sources`, `related_summary`, body | +| summary: `tags`, `tool_calls`, `errors`, `dead_end_paths`, `wiki_consulted`, `contributed_guidelines`, `contributed_skills`, token metrics, `verified_at` | summary: `session_id`, `agent`, `model`, `goal`, `outcome`, `sources`, narrative | +| cluster + task pages (regenerated whole); all `index.md`; `_index.jsonl`; priority tiers | cluster/task definitions in `_config.yaml`; skill pages; subtask pages | + +Archiving is one-way; reversing it means moving the file back and +re-cataloging by hand. + +--- + +## 6. Worked example — one real chain + +Tracing the atomic `heredoc-python-scripts-into-the__84ed6cf26387` through one of the example wikis. + +**(a) The atomic** carries forward links to its summary + its cluster (the +`cluster:`/`superseded_by:` pair was stamped by catalog when the cluster was +declared): + +```yaml +id: 84ed6cf26387 +type: guideline +agent: claude-code +tags: [docker, heredoc, shell, scripting, example] +sources: + - trajectories/df2b08e4-openai-chat-completions.analysis.json +related_summary: summaries/df2b08e4-7853-47ec-9c46-fee4b0a33eb7.md +cluster: container-boundary-one-shot__cluster.md +superseded_by: container-boundary-one-shot__cluster.md +``` + +**(b) Follow `related_summary:`** to the summary — which closes the reverse +loop via the catalog-computed `contributed_guidelines` (and names the raw +transcript under `sources:`): + +```yaml +type: episodic-summary +session_id: df2b08e4-7853-47ec-9c46-fee4b0a33eb7 +agent: bob +goal: Aggregate JSONL records in a Docker container to produce /app/aggregates.json … +sources: + - trajectories/df2b08e4-openai-chat-completions.analysis.json + - /Users/…/.bob/tmp/…/chats/session-2026-06-09T07-11-df2b08e4.json # raw trace 
+contributed_guidelines: [84ed6cf26387] # ← reverse edge +contributed_skills: [aggregate-jsonl-records-top-n-by-sum-and-count] +``` + +**(c) Follow `cluster:`** forward to the aggregator, which lists the atomic +as a member — the bidirectional cluster↔member link: + +```yaml +type: cluster +slug: container-boundary-one-shot +title: Cross the host/container boundary in one docker exec +members: + - id: 84ed6cf26387 + link: heredoc-python-scripts-into-the__84ed6cf26387.md + - id: 6c2bd298dd0d + link: read-in-container-files-via-docker-exec__6c2bd298dd0d.md +priority: high +``` + +One atomic, four hops: **rule → summary → raw trajectory** (provenance), and +**rule ↔ cluster** (consolidation), with the summary's +`contributed_guidelines` closing the loop back to the rule. Every edge is +either authored at render (forward) or recomputed by catalog (reverse). + +--- + +## See also + +- [`design.md`](design.md) — why the wiki is shaped this way (rationale, principles, empirical results). +- [`_default_agents.md`](../skills/scripts/_default_agents.md) — the recall-time contract (`AGENTS.md`). +- [`build_agent_wiki.py`](../skills/scripts/build_agent_wiki.py) — the builder; the implementation of everything above. diff --git a/explorations/agent-wiki/experiments/RESULTS-SUMMARY.md b/explorations/agent-wiki/experiments/RESULTS-SUMMARY.md new file mode 100644 index 00000000..1cb4e8c6 --- /dev/null +++ b/explorations/agent-wiki/experiments/RESULTS-SUMMARY.md @@ -0,0 +1,480 @@ +# Experiment results — running summary + +A consolidated index of empirical findings from the wiki experiments, in +the order they were run. The detailed reports for each experiment live in +their own files under `experiments/`; this is the running tape of what we +learned. + +## 1. AGENTS.md A/B sweep (the original) + +**Question**: does pointing an agent at `wiki-example/AGENTS.md` +alter behavior on a related task it didn't see at training time? 
+ +**Setup**: 4 conditions × 3 trials = 12 sandbox runs of a single utt2 +("what lens model was used for @sample.jpg?"). Conditions varied how the +pointer was delivered (`baseline` / `skill` / `prompt` / `claude_md`). + +**Headline results** (`experiments/results/wiki-consult-20260605T070417Z/`): + +| Condition | read AGENTS.md | cited guideline | outcome match | +| ------------------------------ | -------------- | --------------- | ------------- | +| baseline | 0/3 | 0/3 | 0/3 | +| prompt (per-utterance hint) | 3/3 | 3/3 | 3/3 | +| claude_md (soft phrasing) | 1/3 | 1/3 | 1/3 | +| claude_md_strong (added later) | 3/3 | 3/3 | 3/3 | + +**The two CLAUDE.md variants** (verbatim, from +`tests/e2e/experiment_wiki_consult.py`; `WIKI_NAME` = `wiki-example`): + +- **`claude_md` (weak/soft)**: + > Before non-trivial tasks in this repo, consult `wiki-example/AGENTS.md` for relevant guidelines. + +- **`claude_md_strong` (strong)**: + > Before doing anything else in this workspace, read `wiki-example/AGENTS.md` and follow its retrieval recipe to find guidelines that apply to your task. Cite the guidelines you used. + +What makes the strong variant load-bearing — three differences: + +| Axis | weak | strong | +| --- | --- | --- | +| **Timing** | "before non-trivial tasks" (agent judges what's non-trivial) | "before doing anything else" (unconditional, first action) | +| **Verb** | "consult … for relevant guidelines" (vague) | "read … and follow its retrieval recipe" (imperative + concrete procedure) | +| **Accountability** | none | "Cite the guidelines you used" | + +**Finding**: a strong-imperative pointer in CLAUDE.md performs as well as +a per-utterance prompt hint. A *soft* CLAUDE.md ("Before non-trivial +tasks, consult …") got skipped 2/3 of the time — the hedge ("non-trivial") +lets the agent rationalize skipping. **Wording at the pointer site is +load-bearing.** + +## 2. 
Persistent-pointer mechanism comparison + +**Question**: does it matter where the strong-imperative pointer lives — +in CLAUDE.md, in `--append-system-prompt`, or in a SessionStart hook? + +**Setup**: 3 mechanisms × 3 trials = 9 trials of the same lens-model task. + +**Headline results**: + +| Mechanism | Reads AGENTS.md as Tool 1 | Median runtime | +| ---------------------- | ------------------------- | ----------------- | +| SessionStart hook | 3/3 | **47s** (fastest) | +| claude_md_strong | 3/3 | 52s | +| --append-system-prompt | 3/3 (but Tool 3+) | 63s (slowest) | + +**Finding**: all 3 mechanisms hit the same accuracy. **System-prompt +placement costs ~10–15s of orientation latency** (`ls`, `which exiftool`, +etc.) before the agent reads AGENTS.md. The SessionStart hook places the +pointer above-the-fold, so the agent reads AGENTS.md as Tool 1 with no +orientation pre-amble. + +## 3–4. Build-pattern comparison (closed-loop vs retroactive) + +> **Omitted from this public exploration.** These two experiments compared +> *how* a wiki is built — closed-loop (the wiki grows between trials, each +> trial sees what prior trials spawned) vs retroactive (the wiki stays empty +> during all trials, then is ingested in batch). They ran against internal +> trajectory corpora, so the detailed report and per-trial data are not +> included here. + +**Portable finding**: the same real-task themes emerged in *all* build +patterns (open-loop, closed-loop, retroactive) — +image-format-headers-via-struct, prefer-stdlib-module-for-format, +shell-pipelines-for-line-tasks. Consolidation is robust to build order; what +varies between patterns is meta-content, recall data, and per-task cost. +Closed-loop is the only pattern that accumulates real intra-wiki recall data +(trial N+1 demonstrably reads what trial N spawned); the others need post-hoc +attribution. + +## 5. 
Two-batch wiki-helps experiment + +**Question**: does the wiki *measurably* reduce token cost / duration / +tool calls at equal accuracy, on the same task, with vs without? + +**Setup**: 16 tasks × 3 trials × 2 batches = 96 trials, all +`claude_md_strong`. Batch 1 ran against an empty wiki. Wiki built from +batch 1's trajectories, frozen. Batch 2 ran against the populated wiki. +Same prompts, same workspace seeding — only variable: wiki content. + +**Headline results** (from `experiments/twobatch-comparison.md`): + +| Metric | Batch 1 (empty) | Batch 2 (with wiki) | Δ | +| ------------------------- | --------------: | ------------------: | ----------------------: | +| **Median total cost USD** | $0.21 | $0.17 | **−20%** | +| **Median duration** | 43s | 27s | **−38%** | +| **Median tool calls** | 7 | 4 | **−43%** | +| Median wiki reads | 5 | 3 | −40% | +| Median output tokens | 406 | 268 | −34% | +| Cache-read tokens | — | — | −32% | +| Cache-creation tokens | — | — | +66% (new pages cached) | +| **Aggregate accuracy** | 96% | 96% | unchanged | + +**Per-task highlights**: + +- **Wiki rescued failures on lens-model**: 67% → **100%** accuracy. +- **t8-bmp-info batch-1 trial 1 timed out at 300s**; with-wiki, all 3 + BMP trials completed in 27s median. **11× speedup** on that task. +- **t5-base64 with empty wiki**: 300s timeout. With wiki: 18s, 23s, 20s + (3/3 succeed). The `skip-for-trivial` guideline — recalled — let the + agent short-circuit AGENTS.md's recipe. +- **Two regressions**: t12-wav-info (100% → 67%) and t2-imports + (100% → 67%). One trial each failed in batch 2 — likely the agent + over-applying or misreading a recalled guideline. + +**Finding**: **wiki → faster, cheaper, fewer tools, equal accuracy.** +Per-task `total_cost_usd` is the ground-truth cost metric (cache reads +are billed at ~10% of regular input rate, so the raw token-sum proxy +overcounts). The −20% cost figure is robust to that pricing nuance. 
+ +Detailed report: [`experiments/twobatch-comparison.md`](twobatch-comparison.md). + +## 6. Skills-arm of the wiki-helps experiment + +**Question**: would a wiki populated only with synthesized **skills** +(executable workflow pages) — instead of free-text guidelines — beat +the guidelines arm on the same 16-task corpus? + +**Setup**: identical to twobatch except batch 2 mounted +`wiki-twobatch-skills/`, an empty wiki populated by acting as the +`agent-wiki-synthesize-skill` agent on twobatch's batch-1 transcripts. +Per the skill's own rules (skip if trivial / single command, broad- +trigger names), three skills emerged: + +- `extract-jpeg-exif-camera-optics` (covers t1) +- `read-image-format-dimensions` (covers t6/t7/t8/t9 via magic-byte dispatch) +- `count-csv-rows-with-quoted-fields` (covers t14) + +Other 12 tasks have no matching skill — agent should fall through. + +**Headline results**: + +| | Empty | Guidelines | Skills | Δ vs guidelines | +| --------------------- | -----: | ---------: | -----: | --------------: | +| Median total cost USD | $0.21 | $0.17 | **$0.146** | **−14%** | +| Median output tokens | 406 | 268 | **206** | −23% | +| Median wiki reads | 5 | 3 | **2** | −33% | +| Aggregate accuracy | 96% | 96% | **98%** | +2% | +| Trials | 47/48 | 48/48 | 48/48 | (no timeouts) | + +**Per-task standouts**: + +- **t1-lens-model**: −28% cost. Direct skill match. +- **t2-imports**: −39% cost AND **67% → 100%** accuracy — *no skill matched*, + but the simpler wiki (3 skills, no guidelines) led to a faster path. +- **t3-todos**: −30%; same pattern. +- **skip family** (t2/t3/t5): 89% → 100% accuracy. +- **t14-csv-quoted**: **+18% cost** despite a matching skill — the skill's + overhead exceeded the savings on a 5-row CSV. +- **text family**: +6% (only family where skills hurt — 3 of 4 text tasks + had no matching skill). 
+ +**Finding**: **skills > guidelines on aggregate cost, even where skills +don't match.** A smaller wiki (3 skills, no guideline noise) seems to +help recall on no-skill-match tasks too — the wiki-noise effect is real. + +Detailed report: [`experiments/twobatch-skills-comparison.md`](twobatch-skills-comparison.md). + +## 7. Both-arm: skills + guidelines together (4-way comparison) + +**Question**: does combining skills + guidelines compose additively, or +is there an overhead? + +**Setup**: same 16-task corpus, fourth arm. `wiki-twobatch-both` was +built from twobatch's batch-1 trajectories with BOTH the retroactive +guideline pipeline AND the synthesize-skill pipeline. End state: 47 +summaries + 15 atomics + 3 skills. + +**Headline 4-way aggregate**: + +| | Empty | Guidelines | Skills | Both | Both vs G | Both vs S | +| --------------------- | -----: | ---------: | ---------: | ---------: | --------: | --------: | +| Median total cost USD | $0.21 | $0.17 | **$0.146** | $0.179 | +5% | +22% | +| Median output tokens | 406 | 268 | 206 | 272 | +1% | +32% | +| Median wiki reads | 5 | 3 | 2 | 2 | −33% | = | +| Median guideline reads | 1 | 1 | 0 | 0 | −1 | = | +| Aggregate accuracy | 96% | 96% | 98% | 98% | +2 | = | + +**Per-family `Δ S→B`** (both minus skills, in cost): + +| | Δ | +| ------- | --: | +| text | −1% | +| image | +22% | +| lens-model | +17% | +| archive | +32% | +| skip | +44% | + +**Findings**: + +1. **Composition is non-additive — and slightly punitive.** Both arm is + the most expensive populated wiki: +22% vs skills, +5% vs guidelines. +2. **The penalty is largest on tasks WITHOUT a matching skill.** Skip + family +44%, archive +32%. Adding guidelines on top of skills did + not help where guidelines should have been the primary recall path. +3. **Behavioral signal**: median output tokens 206 → 272 — agent says + more in the both arm. Wiki-reads count is identical (2 + 0). 
Cost + increase isn't from extra reads; it's from longer responses (likely + the agent citing both the skill it used + adjacent guideline context). +4. **t14-csv-quoted: +49% vs guidelines, +26% vs skills** — the most + extreme regression. Having both the CSV skill AND the underlying CSV + guideline available pushed cost higher than either alone. + +**Conclusion**: **less wiki content + targeted (procedural) recall +wins.** Don't pile guidelines on top of skills; pick one or the other. + +Detailed report: [`experiments/twobatch-fourway-comparison.md`](twobatch-fourway-comparison.md). + +## 8. Pruned-arm: delete-on-promote policy (5-way comparison) + +**Question**: §7 closed with the open question "if 'both' loses to +'skills-only', does 'skills + only the no-skill-coverage guidelines' +beat 'skills-only'?" This experiment tests that. + +**Policy added** to the agent-wiki builder: when a cluster is rendered, +archive its member atomics; when a skill is synthesized, archive every +atomic the skill *covers* — inferred via three paths: + +1. **Tag-superset**: skill's tags ⊇ atomic's tags AND ≥2 non-generic + tags shared. +2. **Slug-keyword**: a non-stopword token (≥4 chars) from the skill + slug appears in the atomic's title. +3. **Description-format-token**: an uppercase or CamelCase format identifier (e.g. + `PNG`, `BMP`, `WebP`, `JPEG`) that appears in both the skill's + description and the atomic's title. + +Soft archive: moves to `/_archived/` with an audit +log entry; recall data on archived atomics is discarded. + +**Setup**: same 16-task corpus, same `claude_md_strong` condition. +`wiki-twobatch-pruned/` was built by the same pipeline that built +`wiki-twobatch-both/`, but with `--archive-covered` on each +synthesize-skill call. 
End state: + +- 47 summaries +- 9 surviving atomics (all from no-skill-match tasks: zip, tar, wav, + gzip, jsonl, ini, log, plus the imports/todos/base64 meta-atomics) +- 3 skills (same as skills/both arms) +- **6 archived atomics** (PNG, GIF, BMP, WebP, walk-EXIF-sub-IFD, + use-stdlib-csv-reader) — exactly the atomics covered by the 3 skills + +> **⚠️ Corrected 2026-06-10.** The numbers below are the **re-run** against +> a fixed index. The original §8 (commit `8bcd713`) ran the pruned arm +> against a wiki whose `_index.jsonl` was stale — `render-skill` archived the +> covered atomics but never refreshed the indexes, so the wiki exposed +> **0 skills, 15 guideline rows, and 6 broken links**. Agents never saw the +> skills and chased dangling guideline rows. Commit `2adc67a` fixed the +> builder (refresh indexes + integrity assertion after `render-skill` / +> `render-cluster`); this section reflects the corrected run. The original +> (broken) figures are kept in strikethrough for comparison. +> See [`pruned-index-hypothesis.md`](pruned-index-hypothesis.md). 
+ +**Headline 5-way aggregate** (Pruned = corrected re-run): + +| | Empty | Guidelines | Skills | Both | Pruned | P vs S | P vs B | +| ---------------------- | -----: | ---------: | ---------: | ---------: | ------------------: | -----: | -----: | +| Median total cost USD | $0.21 | $0.17 | **$0.146** | $0.179 | $0.173 (~~$0.181~~) | +18% | −3% | +| Median output tokens | 406 | 268 | **206** | 272 | 226 (~~290~~) | +9% | −17% | +| Median wiki reads | 5 | 3 | 2 | 2 | 2 (~~3~~) | = | = | +| Median guideline reads | 1 | 1 | 0 | 0 | 0 (~~1~~) | = | = | +| Aggregate accuracy | 96% | 96% | 98% | 98% | 98% | = | = | + +**Per-family `Δ` (cost vs skills-only / vs both)** — corrected: + +| Family | B vs S | P vs S | P vs B | +| ---------- | -------: | --------: | -----: | +| lens-model | +17% | +30% | +11% | +| image | +22% | +33% | +9% | +| archive | +32% | +24% | −6% | +| text | −1% | −3% | −3% | +| skip | +44% | +18% | −18% | + +**Findings** (corrected): + +1. **The stale index was a real confound.** Fixing it cut the pruned arm's + median cost $0.181 → **$0.173**, output tokens 290 → **226**, wiki reads + 3 → 2, and **guideline reads 1 → 0**. The broken arm's extra read and + guideline-read were agents following dangling/archived rows that the + correct index no longer exposes. The original "pruning is *worse* than + both" result (+1%) flips to **−3% vs both** once the index is correct. + +2. **But skills-only still wins.** Even corrected, pruned ($0.173) remains + **+18% vs skills-only** ($0.146). The §7 open question still gets a "no": + adding the no-skill-coverage atomics on top of skills does not beat + skills-alone on aggregate cost. + +3. **Pruning still costs on skill-match families, just far less.** Image + +9% vs both (was +28%), lens-model +11% (was +79%). 
The dramatic + skill-match penalty in the original was mostly the broken index; a + smaller residual penalty remains — having sibling atomics in the index + at all is slightly distracting even when a skill is the right answer. + +4. **Pruning genuinely helps no-skill-match families.** Archive −6% vs both, + skip −18% vs both, text −3%. Where there's no skill to fall through to, + the leaner atomic list is a real (and now larger) win. + +5. **Size *is* a lever once you control for index correctness — but a small + one, and composition still dominates.** Corrected pruned (12 index rows) + now sits between skills (12 rows) and both (18 rows), in the expected + order — the earlier "smallest wiki yet most expensive" paradox was an + artifact of the bug, not a real inversion. + +6. **Same-session matcher variant is a wash.** Re-pruning through the + *also*-fixed archive matcher (commit `1272097`, which keeps GIF/BMP/WebP + the old loose matcher wrongly archived cross-session) yields a 12-atomic + wiki. Its full-corpus median is **$0.175** (sum $8.23) — statistically + indistinguishable from the 9-atomic arm. The 3 extra cross-session + atomics cost essentially nothing. + +7. **Both single-trial misses were known-flaky tasks, not regressions.** + 9-atomic missed t2-imports trial-1 (the prompt renders the module name as + a blank placeholder — the agent correctly asked which module); 12-atomic + missed t12-wav-info trial-2 (the same task that flaked to 67% in the + guidelines/skills/both arms). 47/48 each. + +**Operational implication** (revised): the original "don't expect pruning to +reduce cost" was too pessimistic — it was measuring a broken index. With a +correct index, **delete-on-promote is a net positive vs `both`** (−3% +aggregate, −6%/−18% on no-skill-match families) and is sound hygiene. But it +still doesn't beat **skills-only**, which remains the cheapest surface. 
If +cost is the only goal, ship skills-only; if you want to keep authored +guidelines for tasks no skill covers, pruned-on-a-fresh-index is a reasonable +middle and clearly better than stacking everything (`both`). + +Detailed report: [`experiments/twobatch-fiveway-comparison.md`](twobatch-fiveway-comparison.md). + +## Cross-experiment findings + +1. **Wording > placement.** Strong-imperative pointer wording matters + more than which channel delivers it. Soft CLAUDE.md got skipped; any + strong-imperative variant succeeded. + +2. **Same real-task themes emerge regardless of build pattern.** The + 3-cluster set (image-format-headers, stdlib-module, shell-pipelines) + appears in open-loop, closed-loop, and retroactive builds. + **Consolidation is robust.** What varies between builds is meta- + content, recall data, and accuracy/cost on individual tasks. + +3. **Closed-loop is the only build with real intra-wiki recall data.** + Other builds need post-hoc attribution or cross-wiki references. + Empirically demonstrated: trial N+1 reads what trial N spawned. + +4. **The wiki materially reduces cost on identical tasks.** −20% cost, + −38% duration, −43% tool calls in the controlled two-batch A/B at + unchanged accuracy. Effect is largest on tasks where the recipe is + non-obvious without the wiki (lens-model, BMP, base64-with-scope- + warning). + +5. **Cost reduction comes mainly from output tokens and tool-call + reduction**, not from input-token compression. The agent doesn't read + *fewer* bytes when it has the wiki — it reads MORE bytes (cache-creation + on guideline pages goes up). But it produces shorter responses and + takes fewer tool turns. + +6. **Two-batch experiment surfaced two regressions** (wav-info, imports) + where the wiki may have *hurt* accuracy on one trial each. Worth + investigating before scaling — the wiki's value isn't unconditional. + +7. 
**Skills > guidelines on cost.** The skills arm (3 synthesized skills, + no guidelines) beat the guidelines arm by 14% on median cost and + matched or slightly exceeded it on accuracy (98% vs 96%). Largest savings on tasks with a + direct skill match (t1-lens-model −28%) but ALSO on tasks where no + skill matched (t2-imports −39%, t3-todos −30%) — suggesting the + smaller wiki (less to scan) helps recall even when no recall fires. + +8. **Skills + guidelines together is the worst populated wiki.** + Combining the two arms (`wiki-twobatch-both`: same 3 skills + 15 + atomics) costs +22% vs skills and +5% vs guidelines. Composition is + non-additive. Output tokens jump (206 → 272) without a corresponding + reads increase — the agent talks more when both kinds of recall are + available, even though it doesn't read more pages. **Implication: pick + skills OR guidelines, not both.** + +9. **Delete-on-promote beats `both` but not skills-only — and a stale + index nearly hid that.** *(Corrected 2026-06-10, see §8.)* The pruned + arm (3 skills + only the no-skill-coverage atomics) costs **−3% vs + both** and **+18% vs skills** on a correctly-indexed wiki. The + originally-reported +1%/+24% came from a builder bug: `render-skill` + archived atomics without refreshing `_index.jsonl`, so the wiki + exposed 0 skills and agents chased dangling guideline rows (commit + `2adc67a` fixed it). Corrected, pruning *helps* no-skill-match + families (archive −6%, skip −18% vs both) and costs only a small + residual on skill-match ones (image +9%, lens-model +11% vs both, + down from +28%/+79%). **Composition still dominates size, and + skills-only is still cheapest** — but delete-on-promote is a net + positive over stacking everything, not the wash the broken run + suggested. 
+ +## File map + +``` +explorations/agent-wiki/experiments/ +├── RESULTS-SUMMARY.md this file +├── twobatch-comparison.md with-wiki vs without-wiki A/B +├── twobatch-skills-comparison.md 3-way (empty / guidelines / skills) +├── twobatch-fourway-comparison.md 4-way (+ both arm) +├── twobatch-fiveway-comparison.md 5-way (+ pruned arm) +├── pruned-index-hypothesis.md stale-index confound + correction +│ +├── metrics/ per-trial metric rollups (no raw transcripts) +│ ├── twobatch.metrics.jsonl empty (batch-1) + guidelines (batch-2) +│ ├── twobatch-skills.metrics.jsonl +│ ├── twobatch-both.metrics.jsonl +│ └── pruned-fixed-9atomic.metrics.jsonl corrected pruned arm +│ +└── harness/ reproduce-it scripts + ├── experiment_wiki_consult.py sandbox A/B runner + ├── wiki_consult_tasks.yaml the 16-task corpus + ├── extract_trial_metrics.py per-trial token/duration/tool metrics + ├── normalize_stream_json_transcripts.py stream-json → OpenAI chat format + ├── twobatch_compare.py metrics → comparison markdown + ├── threeway_compare.py + skills column + ├── fourway_compare.py + both column + └── fiveway_compare.py + pruned column +``` + +> Raw per-trial transcripts (`results*/.../trial-N.jsonl`) are intentionally +> excluded from this public exploration; only the metric rollups under +> `metrics/` and the narrative reports are included. The comparison scripts +> read those rollups. + +## Open questions worth pursuing + +- **Statistical power.** Headline metrics are based on 3 trials per task. + More trials would tighten the per-task confidence intervals, + particularly on the regression cases (wav-info, imports). +- **Why wav-info and imports regressed.** Single-trial failures could be + noise; could also be the agent following a recalled guideline that + doesn't quite fit. Spot-check those transcripts. +- **Transfer test.** All experiments use the same task in batch 1 and + batch 2. 
A real "transfer" experiment would test wiki-on-task-X with + wiki-built-from-tasks-Y where X ≠ Y but X ∈ family(Y). That tests + whether clusters generalize. +- **Larger corpus.** 16 tasks × 3 trials is a small experiment. + Repeating with a 50-task corpus over more trials would test whether + the cost-reduction percentage scales, regresses, or saturates. +- **Cross-pattern ensembling.** Could a wiki built closed-loop + + retroactive (using the seeding from the former + the per-task + templates from the latter) outperform either pattern alone? +- **Skill granularity sensitivity.** Skills arm used 3 broad skills. + Would 16 narrow per-task skills do better or worse? Issue-260's prior + finding (broad triggers 4/5 vs narrow 2/5) suggests broad wins, but + per-task skills weren't tested on this corpus. +- **Why the both arm regresses on no-skill-match tasks.** Median wiki + reads is identical between skills and both (2 + 0). The cost penalty + is purely output-token-driven. A trace-level inspection of agent + responses on archive/skip-family tasks would reveal whether the agent + is citing nearby guidelines without reading them, or whether the + presence of guidelines in the index is changing how it phrases its + answer. +- **~~Pruning experiment~~ — answered in §8.** Pruned arm (3 skills + + 9 no-skill-coverage atomics) does NOT beat skills-only. Skills-only + still wins on aggregate. The both-arm penalty is composition-driven, + not index-size-driven. + +- **Why does the pruned arm regress on skill-match tasks?** Pruning + should be neutral or positive on tasks WITH a matching skill — the + skill is unaffected and the index is smaller. Yet image and + lens-model families regressed sharply vs both. 
A trace-level + inspection of t1-lens-model trial 1 (which alone cost $0.488 in + pruned vs $0.36 in skills) might reveal whether the agent is + reading the surviving atomics out of curiosity or whether something + about the AGENTS.md / index format changes its decision path. diff --git a/explorations/agent-wiki/experiments/harness/experiment_wiki_consult.py b/explorations/agent-wiki/experiments/harness/experiment_wiki_consult.py new file mode 100644 index 00000000..8682ea1b --- /dev/null +++ b/explorations/agent-wiki/experiments/harness/experiment_wiki_consult.py @@ -0,0 +1,450 @@ +#!/usr/bin/env python3 +# mypy: ignore-errors +# Exploration/reference code — not type-checked to the project standard. +"""A/B experiment: does pointing an agent at AGENTS.md alter its behavior? + +Paired design (utt1 → wiki → utt2): + +- utt1 produces a small focal-length-extraction trajectory (reused from + trajectory data; see Phase A in the plan file). +- wiki-example/ is a fresh single-trajectory wiki built from utt1's + extracted guidelines. It contains AGENTS.md, _index.jsonl, 4 atomic + guidelines, 1 summary. +- utt2 = "what lens model was used for @sample.jpg" — same image, related + but different EXIF field. The wiki should help the agent bridge to + LensModel (tag 0xA434) via the same Exif sub-IFD it documented for + focal length. 
+ +For each condition (baseline, skill, prompt, claude_md), run N trials +in a fresh sandbox container and score three binary signals: + +- read_agents_md: trajectory contains a Read of AGENTS.md +- cited_guideline: agent's final response mentions a guideline title or + a key wiki concept (0xA434, 0x8769, ExifIFD) +- outcome_match: response contains "Google Pixel 4a Rear Wide Camera" + +Usage: + uv run python tests/e2e/experiment_wiki_consult.py \\ + --conditions baseline,skill,prompt,claude_md \\ + --trials 3 +""" + +from __future__ import annotations + +import argparse +import datetime +import json +import os +import re +import shutil +import subprocess +import sys +import time +from pathlib import Path + +import yaml + +REPO_ROOT = Path(__file__).resolve().parents[2] +SANDBOX_IMAGE = "claude-sandbox" +TIMEOUT_SECONDS = 300 +FORWARDED_ENV_VARS = ( + "ANTHROPIC_API_KEY", + "ANTHROPIC_AUTH_TOKEN", + "ANTHROPIC_BASE_URL", + "CLAUDE_MODEL", + "CLAUDE_CODE_DISABLE_EXPERIMENTAL_BETAS", + "CLAUDE_CODE_SKIP_BEDROCK_AUTH", +) +WIKI_NAME = "wiki-example" # default; overridable via --wiki on the CLI + +# Test wiki must already exist; built by Phase A of the plan. +WIKI_SRC = REPO_ROOT / WIKI_NAME + +# Plugins to mount. +PLUGINS_DIR = REPO_ROOT / "platform-integrations" / "claude" / "plugins" + +# Demo workspace (sample.jpg only; no .evolve/entities/ confound). +DEMO_WORKSPACE = REPO_ROOT / "demo" / "workspace" + + +_STRONG_HINT = ( + f"Before doing anything else in this workspace, read `{WIKI_NAME}/AGENTS.md` " + f"and follow its retrieval recipe to find guidelines that apply to your task. " + f"Cite the guidelines you used." 
+) + + +_CODEBASE_SEED = { + "src/__init__.py": "", + "src/parser.py": ( + '"""Parser module for csv and json inputs."""\n' + "\n" + "def parse_csv(text: str) -> list:\n" + " # TODO: handle nested quotes properly\n" + " return [row.split(',') for row in text.splitlines()]\n" + "\n" + "def parse_json(text: str):\n" + " import json\n" + " return json.loads(text)\n" + ), + "src/writer.py": ( + "from .parser import parse_csv\n\ndef write_csv(rows):\n return '\\n'.join(','.join(map(str, r)) for r in rows)\n" + ), + "src/api.py": ( + "from .parser import parse_csv, parse_json\n" + "\n" + "def fetch_and_parse(text: str, fmt: str):\n" + " return parse_csv(text) if fmt == 'csv' else parse_json(text)\n" + ), + "tests/__init__.py": "", + "tests/test_parser.py": ( + "from src.parser import parse_csv\n\ndef test_parse_basic():\n assert parse_csv('a,b\\nc,d') == [['a','b'], ['c','d']]\n" + ), + "README.md": ("# demo\n\nSmall Python project under `src/` with tests under `tests/`.\n"), +} + + +def _seed_codebase(ws: Path) -> None: + for rel, content in _CODEBASE_SEED.items(): + p = ws / rel + p.parent.mkdir(parents=True, exist_ok=True) + p.write_text(content, encoding="utf-8") + + +def _seed_format_group(ws: Path, group: str) -> list[str]: + """Seed image/archive/text format samples via the stdlib generators in + `_format_samples.py`. Group is one of `image-formats`, `archive-formats`, + `text-formats`.""" + from _format_samples import seed_into # local import — script lives next door + + return seed_into(ws, group) + + +def make_workspace(tmp_root: Path, condition: str, seed: str | None = None) -> Path: + """Build a per-run workspace with the wiki + condition-specific setup + + optional task-specific seed (e.g. a small mock python project).""" + ws = tmp_root / "workspace" + shutil.copytree(DEMO_WORKSPACE, ws, ignore=shutil.ignore_patterns(".evolve", "backup", "sandbox-backup")) + # Mount the wiki inside the workspace at the same name the conditions reference. 
def _claude_shell_command(prompt: str, extra_args: list[str]) -> str:
    """Return the shell script handed to `bash -c` to start a headless claude run.

    Every argument is quoted with `shlex`, so prompt text reaches claude
    verbatim. Backticks, `$`, quotes and newlines in the prompt are NOT
    interpreted by bash.

    Args:
        prompt: The user prompt passed to `claude -p`.
        extra_args: Extra claude CLI arguments placed before the fixed flags.

    Returns:
        A single shell-safe command line.
    """
    import shlex  # local import: the block stays self-contained

    argv = [
        "claude",
        *extra_args,
        "--dangerously-skip-permissions",
        "--output-format",
        "stream-json",
        "--verbose",
        "-p",
        prompt,
    ]
    return shlex.join(argv)


def run_sandbox(workspace: Path, prompt: str, condition: str) -> dict:
    """Run a single sandbox session; return {stdout, stderr, returncode, duration_s}.

    Per condition extras:
    - `system_prompt`: pass `--append-system-prompt` with the strong hint.
    - `session_hook`: mount _wiki_hint_plugin which fires a SessionStart
      hook printing the strong hint.

    Other conditions don't pass `--plugin-dir` (avoids the evolve-lite recall
    hook + recall skill confound). Trajectory comes from
    `--output-format stream-json` on stdout (one event per line).

    Args:
        workspace: Host directory bind-mounted at `/workspace`.
        prompt: Prompt text passed to `claude -p`.
        condition: Experiment condition slug; only `session_hook` and
            `system_prompt` change the command line.

    Returns:
        Dict with `returncode`, `stdout`, `stderr` and `duration_s`
        (wall-clock seconds rounded to 2 decimals).

    Raises:
        subprocess.TimeoutExpired: The container ran longer than
            `TIMEOUT_SECONDS`.
    """
    cmd = ["docker", "run", "--rm"]
    # Forward only the variables that are set on the host; `-e VAR` without a
    # value makes docker copy the host value.
    for var in FORWARDED_ENV_VARS:
        if os.environ.get(var):
            cmd += ["-e", var]
    docker_args = ["-v", f"{workspace}:/workspace"]
    claude_extras: list[str] = []
    if condition == "session_hook":
        docker_args += ["-v", f"{_HINT_PLUGIN}:/plugins/_wiki_hint"]
        claude_extras = ["--plugin-dir", "/plugins/_wiki_hint"]
    if condition == "system_prompt":
        claude_extras = ["--append-system-prompt", _STRONG_HINT]
    cmd += docker_args
    # The hint text contains backticks. Quoting it with json.dumps left them
    # inside double quotes, where bash performs command substitution and
    # silently damages the prompt. shlex quoting avoids that.
    cmd += [SANDBOX_IMAGE, "bash", "-c", _claude_shell_command(prompt, claude_extras)]
    started = time.monotonic()  # monotonic: immune to wall-clock adjustments
    proc = subprocess.run(cmd, capture_output=True, text=True, timeout=TIMEOUT_SECONDS)
    elapsed = time.monotonic() - started
    return {
        "returncode": proc.returncode,
        "stdout": proc.stdout,
        "stderr": proc.stderr,
        "duration_s": round(elapsed, 2),
    }
def score(access_paths: list[str], assistant_text: str, task: dict) -> dict:
    """Score one trial on the three binary signals.

    Matching is case-insensitive substring search over the assistant text.

    Args:
        access_paths: Paths the agent touched, as collected from the
            transcript (Read targets and Bash-matched paths).
        assistant_text: Concatenated assistant text for the trial.
        task: Task definition. Reads the optional keys
            `expected_guideline_filenames`, `outcome_match_any` and
            `outcome_match_all`; each may be a list of strings or a single
            bare string.

    Returns:
        Dict with boolean `read_agents_md`, `cited_guideline` and
        `outcome_match`. `outcome_match` is True when the task lists no
        required substrings.
    """

    def needles(key: str) -> list[str]:
        raw = task.get(key) or []
        # A scalar in the task file would otherwise be iterated character by
        # character and match almost any text.
        if isinstance(raw, str):
            raw = [raw]
        # An empty needle is a substring of everything; drop it.
        return [s.lower() for s in raw if s]

    text_lc = assistant_text.lower()
    # 1. read_agents_md — any recorded access whose path mentions AGENTS.md
    read_agents_md = any("AGENTS.md" in p for p in access_paths)
    # 2. cited_guideline — an expected filename OR a wiki concept appears
    cited_filename = any(n in text_lc for n in needles("expected_guideline_filenames"))
    cited_concept = any(n in text_lc for n in needles("outcome_match_any"))
    # 3. outcome_match — every required substring is present
    outcome_match = all(n in text_lc for n in needles("outcome_match_all"))
    return {
        "read_agents_md": read_agents_md,
        "cited_guideline": cited_filename or cited_concept,
        "outcome_match": outcome_match,
    }
+ if args.wiki: + WIKI_NAME = args.wiki + WIKI_SRC = REPO_ROOT / WIKI_NAME + _STRONG_HINT = ( + f"Before doing anything else in this workspace, read `{WIKI_NAME}/AGENTS.md` " + f"and follow its retrieval recipe to find guidelines that apply to your task. " + f"Cite the guidelines you used." + ) + + if not WIKI_SRC.is_dir(): + print(f"error: {WIKI_SRC} does not exist. Run Phase A first.", file=sys.stderr) + return 2 + + # Load tasks (--task may be comma-separated) + tasks_file = REPO_ROOT / "tests" / "e2e" / "wiki_consult_tasks.yaml" + tasks = {t["id"]: t for t in yaml.safe_load(tasks_file.read_text())} + task_ids = [t.strip() for t in args.task.split(",") if t.strip()] + for tid in task_ids: + if tid not in tasks: + print(f"error: task {tid!r} not found in {tasks_file}", file=sys.stderr) + return 2 + + conditions = [c.strip() for c in args.conditions.split(",") if c.strip()] + valid = {"baseline", "skill", "prompt", "claude_md", "claude_md_strong", "system_prompt", "session_hook"} + for c in conditions: + if c not in valid: + print(f"error: unknown condition {c!r}; valid: {sorted(valid)}", file=sys.stderr) + return 2 + + ts = datetime.datetime.now(datetime.timezone.utc).strftime("%Y%m%dT%H%M%SZ") + out_dir = REPO_ROOT / args.out_root / f"wiki-consult-{ts}" + out_dir.mkdir(parents=True, exist_ok=True) + transcripts_dir = out_dir / "transcripts" + + runs_path = out_dir / "runs.jsonl" + runs_f = runs_path.open("w", encoding="utf-8") + + print(f"writing results to {out_dir}", file=sys.stderr) + print(f"conditions: {conditions}, trials: {args.trials}, tasks: {task_ids}", file=sys.stderr) + + summary: dict[tuple[str, str], list[dict]] = {(t, c): [] for t in task_ids for c in conditions} + for tid in task_ids: + task = tasks[tid] + seed = task.get("seed") + for condition in conditions: + for trial in range(1, args.trials + 1): + print(f"\n=== {tid} / {condition} / trial {trial}/{args.trials} ===", file=sys.stderr) + tmp_root = out_dir / "_workspaces" / 
f"{tid}-{condition}-t{trial}" + tmp_root.mkdir(parents=True, exist_ok=True) + ws = make_workspace(tmp_root, condition, seed=seed) + prompt = build_prompt(condition, task["prompt"]) + try: + run = run_sandbox(ws, prompt, condition) + except subprocess.TimeoutExpired: + print(f" ✗ TIMEOUT after {TIMEOUT_SECONDS}s — skipping this trial", file=sys.stderr) + runs_f.write( + json.dumps( + { + "task": tid, + "condition": condition, + "trial": trial, + "duration_s": TIMEOUT_SECONDS, + "returncode": None, + "read_agents_md": False, + "cited_guideline": False, + "outcome_match": False, + "access_paths_n": 0, + "assistant_text_len": 0, + "timed_out": True, + } + ) + + "\n" + ) + runs_f.flush() + if not args.keep_workspaces: + shutil.rmtree(tmp_root, ignore_errors=True) + continue + access_paths, assistant_text, events = parse_stream_json(run["stdout"]) + sig = score(access_paths, assistant_text, task) + row = { + "task": tid, + "condition": condition, + "trial": trial, + "duration_s": run["duration_s"], + "returncode": run["returncode"], + **sig, + "access_paths_n": len(access_paths), + "assistant_text_len": len(assistant_text), + } + runs_f.write(json.dumps(row) + "\n") + runs_f.flush() + summary[(tid, condition)].append(row) + print( + f" read_agents_md={sig['read_agents_md']} " + f"cited_guideline={sig['cited_guideline']} " + f"outcome_match={sig['outcome_match']} " + f"({run['duration_s']:.0f}s)", + file=sys.stderr, + ) + # Stash the stream-json output for spot-checks + dst_dir2 = transcripts_dir / tid / condition + dst_dir2.mkdir(parents=True, exist_ok=True) + (dst_dir2 / f"trial-{trial}.jsonl").write_text(run["stdout"], encoding="utf-8") + if run["returncode"] != 0: + (dst_dir2 / f"trial-{trial}.stderr.txt").write_text(run["stderr"], encoding="utf-8") + if not args.keep_workspaces: + shutil.rmtree(tmp_root, ignore_errors=True) + runs_f.close() + + # Render summary.md (one section per task) + md_lines = [f"# Wiki-consult experiment — {ts}", ""] + for tid in task_ids: + 
task = tasks[tid] + md_lines += [ + f"## Task `{tid}` — {task['prompt']!r}", + "", + f"Trials per condition: **{args.trials}**", + "", + "| Condition | read AGENTS.md | cited guideline | outcome match | median runtime (s) |", + "|------------|:--------------:|:---------------:|:-------------:|-------------------:|", + ] + for condition in conditions: + rows = summary[(tid, condition)] + n = len(rows) + if n == 0: + continue + rd = sum(r["read_agents_md"] for r in rows) + ct = sum(r["cited_guideline"] for r in rows) + om = sum(r["outcome_match"] for r in rows) + durs = sorted(r["duration_s"] for r in rows) + median = durs[n // 2] + md_lines.append(f"| {condition:<10} | {rd}/{n} | {ct}/{n} | {om}/{n} | {median:.0f} |") + md_lines.append("") + md_lines.extend( + [ + "", + "Signals:", + "", + "- **read AGENTS.md**: agent's trajectory contains a `Read` of `AGENTS.md`.", + "- **cited guideline**: agent's text contains an expected guideline filename or wiki concept (e.g. `0xA434`, `0x8769`, `ExifIFD`).", + "- **outcome match**: agent's text contains all required substrings — for the lens-model task, the answer `Google Pixel 4a Rear Wide Camera`.", + "", + f"Runs JSONL: `{runs_path.relative_to(REPO_ROOT)}`", + f"Transcripts: `{transcripts_dir.relative_to(REPO_ROOT)}/`", + ] + ) + (out_dir / "summary.md").write_text("\n".join(md_lines) + "\n", encoding="utf-8") + + print(f"\nwrote {runs_path}", file=sys.stderr) + print(f"wrote {out_dir / 'summary.md'}", file=sys.stderr) + return 0 + + +if __name__ == "__main__": + raise SystemExit(main()) diff --git a/explorations/agent-wiki/experiments/harness/extract_trial_metrics.py b/explorations/agent-wiki/experiments/harness/extract_trial_metrics.py new file mode 100644 index 00000000..0fddd3f5 --- /dev/null +++ b/explorations/agent-wiki/experiments/harness/extract_trial_metrics.py @@ -0,0 +1,146 @@ +#!/usr/bin/env python3 +# mypy: ignore-errors +# Exploration/reference code — not type-checked to the project standard. 
def parse(transcript: Path) -> dict:
    """Extract per-trial metrics from one stream-json transcript file.

    Token counts are summed over the `usage` of every `assistant` event.
    Duration, cost and final text come from the `result` event. Tool calls
    are counted per `tool_use` content block. Wiki reads are detected on
    `Read` file paths and on `Bash` command strings.

    Lines that are blank, not valid JSON, or not JSON objects are skipped,
    so stray non-JSON output in a captured transcript does not abort
    extraction.

    Args:
        transcript: Path to the `.jsonl` transcript (UTF-8, one event per line).

    Returns:
        Flat dict of metrics: session id, duration, cost, token counts, tool
        call count, wiki-read counters and the final text length.
    """
    events: list[dict] = []
    for ln in transcript.read_text(encoding="utf-8").splitlines():
        if not ln.strip():
            continue
        try:
            ev = json.loads(ln)
        except json.JSONDecodeError:
            continue
        if isinstance(ev, dict):
            events.append(ev)

    sid = "?"
    duration_ms = 0
    total_cost_usd = 0.0
    final_text = ""
    in_tokens = cache_creation = cache_read = out_tokens = 0
    tool_calls = 0
    wiki_reads = 0  # Read of AGENTS.md / _index.jsonl / guidelines/*.md
    agents_md_read = False
    index_read = False
    guideline_reads = 0

    for e in events:
        t = e.get("type")
        if t == "system" and e.get("subtype") == "init":
            sid = e.get("session_id") or sid
        elif t == "assistant":
            msg = e.get("message") or {}
            usage = msg.get("usage") or {}
            in_tokens += int(usage.get("input_tokens", 0) or 0)
            cache_creation += int(usage.get("cache_creation_input_tokens", 0) or 0)
            cache_read += int(usage.get("cache_read_input_tokens", 0) or 0)
            out_tokens += int(usage.get("output_tokens", 0) or 0)
            for b in msg.get("content") or []:
                if not isinstance(b, dict):
                    continue
                if b.get("type") == "text":
                    # Last non-empty assistant text wins unless a result event follows.
                    final_text = b.get("text") or final_text
                elif b.get("type") == "tool_use":
                    tool_calls += 1
                    name = b.get("name")
                    inp = b.get("input") or {}
                    if name == "Read":
                        # `or ""` guards an explicit null file_path, as the Bash branch does.
                        fp = inp.get("file_path", "") or ""
                        if "AGENTS.md" in fp:
                            agents_md_read = True
                            wiki_reads += 1
                        elif "_index.jsonl" in fp:
                            index_read = True
                            wiki_reads += 1
                        elif "/guidelines/" in fp and fp.endswith(".md"):
                            guideline_reads += 1
                            wiki_reads += 1
                    elif name == "Bash":
                        # One command can touch several wiki surfaces, so these
                        # checks are independent rather than an elif chain.
                        cmd = inp.get("command", "") or ""
                        if "AGENTS.md" in cmd:
                            agents_md_read = True
                            wiki_reads += 1
                        if "_index.jsonl" in cmd:
                            index_read = True
                            wiki_reads += 1
                        if re.search(r"/guidelines/[\w./-]+\.md", cmd):
                            guideline_reads += 1
                            wiki_reads += 1
        elif t == "result":
            duration_ms = int(e.get("duration_ms") or 0)
            total_cost_usd = float(e.get("total_cost_usd") or 0.0)
            final_text = e.get("result") or final_text

    return {
        "session_id": sid,
        "duration_s": round(duration_ms / 1000, 2),
        "total_cost_usd": total_cost_usd,
        "input_tokens": in_tokens,
        "cache_creation_input_tokens": cache_creation,
        "cache_read_input_tokens": cache_read,
        "output_tokens": out_tokens,
        "billable_tokens_proxy": in_tokens + cache_creation + out_tokens,  # cache reads are cheap
        "tool_calls": tool_calls,
        "wiki_reads_total": wiki_reads,
        "agents_md_read": agents_md_read,
        "index_read": index_read,
        "guideline_reads": guideline_reads,
        "final_text_len": len(final_text or ""),
    }
outcome scoring + final_text = "" + for line in Path(args.transcript).read_text(encoding="utf-8").splitlines(): + if not line.strip(): + continue + e = json.loads(line) + if e.get("type") == "result": + final_text = e.get("result") or "" + break + if e.get("type") == "assistant": + for b in (e.get("message") or {}).get("content") or []: + if isinstance(b, dict) and b.get("type") == "text": + final_text = b.get("text") or final_text + rec["outcome_match"] = score_outcome(final_text, must_all) if must_all else None + + print(json.dumps(rec)) + return 0 + + +if __name__ == "__main__": + raise SystemExit(main()) diff --git a/explorations/agent-wiki/experiments/harness/fiveway_compare.py b/explorations/agent-wiki/experiments/harness/fiveway_compare.py new file mode 100644 index 00000000..1aabd15a --- /dev/null +++ b/explorations/agent-wiki/experiments/harness/fiveway_compare.py @@ -0,0 +1,298 @@ +#!/usr/bin/env python3 +# mypy: ignore-errors +# Exploration/reference code — not type-checked to the project standard. +"""Five-way comparison: empty / guidelines / skills / both / pruned. 
def delta(base, other, kind="num"):
    """Format the change from `base` to `other` for a report table cell.

    Args:
        base: Reference value, or None when the arm has no data.
        other: Value being compared against `base`, or None.
        kind: One of "tokens", "duration", "dollars", "pct" or "num"
            (any other value is treated as "num").

    Returns:
        "—" when either value is None or `base` is 0 (no relative change
        can be computed). Otherwise a signed string: absolute difference
        plus relative change for tokens/duration/dollars, relative change
        only for "pct", absolute difference only for "num".
    """
    if base is None or other is None or base == 0:
        return "—"
    diff = other - base
    pct = diff / base
    # Negative numbers carry their own "-"; only non-negative ones need a sign.
    sign = "+" if diff >= 0 else ""
    if kind == "tokens":
        return f"{sign}{int(diff):,} ({sign}{pct:.0%})"
    if kind == "duration":
        return f"{sign}{diff:.0f}s ({sign}{pct:.0%})"
    if kind == "dollars":
        # Put the sign before the currency symbol: "-$0.0100", not "$-0.0100".
        money = f"-${-diff:.4f}" if diff < 0 else f"+${diff:.4f}"
        return f"{money} ({sign}{pct:.0%})"
    if kind == "pct":
        # Relative change of the two fractions, not a percentage-point difference.
        return f"{sign}{pct:.0%}"
    return f"{sign}{diff:.1f}"
all `claude_md_strong` condition. " + "Empty + guidelines arms are twobatch's batch-1 / batch-2. Skills arm " + "is twobatch-skills (3 skills, no guidelines). Both arm is " + "twobatch-both (those same 3 skills + ~15 atomics, no clusters). " + "**Pruned arm** is twobatch-pruned: same 3 skills + only the " + "no-skill-coverage atomics (delete-on-promote policy applied — " + "image-format and CSV atomics archived because their corresponding " + "skills were synthesized)." + ) + md.append("") + + md.append("## Aggregate") + md.append("") + md.append("| Metric | Empty | Guidelines | Skills | Both | Pruned | P vs G | P vs S | P vs B |") + md.append("|---|---:|---:|---:|---:|---:|---:|---:|---:|") + pairs = [ + ("Trials", "len", "num"), + ("Accuracy (mean)", "_acc", "pct"), + ("Median duration", "duration_s", "duration"), + ("Median input tokens", "input_tokens", "tokens"), + ("Median output tokens", "output_tokens", "tokens"), + ("Median total cost USD", "total_cost_usd", "dollars"), + ("Median tool calls", "tool_calls", "num"), + ("Median wiki reads", "wiki_reads_total", "num"), + ("Median guideline reads", "guideline_reads", "num"), + ] + for label, field, kind in pairs: + vals = {} + for a in ARMS: + arm_rows = by_arm[a] + if field == "len": + vals[a] = len(arm_rows) + elif field == "_acc": + vals[a] = acc(arm_rows) + else: + vals[a] = median([r.get(field) for r in arm_rows]) + if field == "len": + md.append( + f"| {label} | {vals['empty']} | {vals['guidelines']} | {vals['skills']} | " + f"{vals['both']} | {vals['pruned']} | " + f"{vals['pruned'] - vals['guidelines']:+d} | " + f"{vals['pruned'] - vals['skills']:+d} | " + f"{vals['pruned'] - vals['both']:+d} |" + ) + else: + md.append( + f"| {label} | {fmt(vals['empty'], kind)} | {fmt(vals['guidelines'], kind)} | " + f"{fmt(vals['skills'], kind)} | {fmt(vals['both'], kind)} | " + f"{fmt(vals['pruned'], kind)} | " + f"{delta(vals['guidelines'], vals['pruned'], kind)} | " + f"{delta(vals['skills'], vals['pruned'], 
kind)} | " + f"{delta(vals['both'], vals['pruned'], kind)} |" + ) + md.append("") + + md.append("## By task family") + md.append("") + md.append("Median total_cost_usd. `Δ S→P` = `pruned` minus `skills`; `Δ B→P` = `pruned` minus `both`.") + md.append("") + md.append("| Family | Tasks | E acc | G acc | S acc | B acc | P acc | E $ | G $ | S $ | B $ | P $ | Δ S→P | Δ B→P |") + md.append("|---|---|---:|---:|---:|---:|---:|---:|---:|---:|---:|---:|---:|---:|") + fam_groups: dict[str, list[str]] = defaultdict(list) + for tid, fam in FAMILY.items(): + fam_groups[fam].append(tid) + for fam, tids in fam_groups.items(): + in_fam = {a: [r for r in rows if r["task"] in tids and r["arm"] == a] for a in ARMS} + cs = {a: median([r.get("total_cost_usd") for r in in_fam[a]]) for a in ARMS} + md.append( + f"| {fam} | {len(tids)} | " + f"{fmt(acc(in_fam['empty']), 'pct')} | {fmt(acc(in_fam['guidelines']), 'pct')} | " + f"{fmt(acc(in_fam['skills']), 'pct')} | {fmt(acc(in_fam['both']), 'pct')} | " + f"{fmt(acc(in_fam['pruned']), 'pct')} | " + f"{fmt(cs['empty'], 'dollars')} | {fmt(cs['guidelines'], 'dollars')} | " + f"{fmt(cs['skills'], 'dollars')} | {fmt(cs['both'], 'dollars')} | " + f"{fmt(cs['pruned'], 'dollars')} | " + f"{delta(cs['skills'], cs['pruned'], 'dollars')} | " + f"{delta(cs['both'], cs['pruned'], 'dollars')} |" + ) + md.append("") + + md.append("## Per task — cost USD") + md.append("") + md.append("| Task | E $ | G $ | S $ | B $ | P $ | Δ S→P | Δ B→P |") + md.append("|---|---:|---:|---:|---:|---:|---:|---:|") + for tid in TASK_IDS_ORDER: + if not by_task[tid]: + continue + cs = {a: median([r.get("total_cost_usd") for r in by_task[tid].get(a, [])]) for a in ARMS} + md.append( + f"| `{tid}` | {fmt(cs['empty'], 'dollars')} | {fmt(cs['guidelines'], 'dollars')} | " + f"{fmt(cs['skills'], 'dollars')} | {fmt(cs['both'], 'dollars')} | " + f"{fmt(cs['pruned'], 'dollars')} | " + f"{delta(cs['skills'], cs['pruned'], 'dollars')} | " + f"{delta(cs['both'], cs['pruned'], 'dollars')} 
|" + ) + md.append("") + + md.append("## Per task — accuracy") + md.append("") + md.append("| Task | E acc | G acc | S acc | B acc | P acc |") + md.append("|---|:-:|:-:|:-:|:-:|:-:|") + for tid in TASK_IDS_ORDER: + if not by_task[tid]: + continue + as_ = {a: acc(by_task[tid].get(a, [])) for a in ARMS} + md.append( + f"| `{tid}` | {fmt(as_['empty'], 'pct')} | {fmt(as_['guidelines'], 'pct')} | " + f"{fmt(as_['skills'], 'pct')} | {fmt(as_['both'], 'pct')} | " + f"{fmt(as_['pruned'], 'pct')} |" + ) + md.append("") + md.append("## Notes") + md.append("") + md.append("- Empty + guidelines + skills + both columns reproduce the 4-way comparison.") + md.append( + "- Pruned column is the new arm, testing the **delete-on-promote** policy: " + "when `synthesize-skill` produces a skill, it inferentially archives the " + "atomic guidelines covered by the skill (via tag-superset, slug-keyword, or " + "format-identifier description match). Result: 3 skills + 9 atomics + 6 archived." + ) + md.append( + '- The pruned arm is the experimental answer to the open question "if ' + "'both' loses to 'skills-only', does 'skills + only the no-skill-coverage " + "guidelines' beat 'skills-only'?\" raised in §7 of RESULTS-SUMMARY.md." + ) + md.append("") + md.append("### Correction — Pruned column is the re-run against a fixed index") + md.append("") + md.append( + "The original pruned arm (commit `8bcd713`) ran against a wiki whose " + "`_index.jsonl` was **stale**: `render-skill` archived the covered atomics " + "but never refreshed the indexes, so the wiki exposed **0 skills, 15 " + "guideline rows, 6 broken links**. Agents couldn't see the skills and fell " + "back to dangling guideline rows (original: median $0.181, 290 output " + "tokens, 3 wiki reads, 1 guideline read)." + ) + md.append("") + md.append( + "Commit `2adc67a` fixed the builder to refresh the section indexes + " + "`_index.jsonl` after `render-skill`/`render-cluster` (with an integrity " + "assertion). 
This Pruned column is the full 16-task re-run against the " + "corrected wiki: median **$0.173**, ~225 output tokens, 2 wiki reads, **0** " + "guideline reads. Net: pruned moved from +1% to **-3% vs both** and from " + "+24% to **+18% vs skills**. Skills-only is still cheapest, but the apparent " + '"pruning is worse than both" result was largely the stale-index bug, not ' + "the policy. See `pruned-index-hypothesis.md` for the slice-level diagnosis." + ) + Path(args.out).write_text("\n".join(md) + "\n", encoding="utf-8") + print(f"wrote {args.out}", flush=True) + return 0 + + +if __name__ == "__main__": + raise SystemExit(main()) diff --git a/explorations/agent-wiki/experiments/harness/fourway_compare.py b/explorations/agent-wiki/experiments/harness/fourway_compare.py new file mode 100644 index 00000000..27e526d9 --- /dev/null +++ b/explorations/agent-wiki/experiments/harness/fourway_compare.py @@ -0,0 +1,254 @@ +#!/usr/bin/env python3 +# mypy: ignore-errors +# Exploration/reference code — not type-checked to the project standard. +"""Four-way comparison: empty / guidelines / skills / both. 
def delta(base, other, kind="num"):
    """Format the change from `base` to `other` for a report table cell.

    Args:
        base: Reference value, or None when the arm has no data.
        other: Value being compared against `base`, or None.
        kind: One of "tokens", "duration", "dollars", "pct" or "num"
            (any other value is treated as "num").

    Returns:
        "—" when either value is None or `base` is 0 (no relative change
        can be computed). Otherwise a signed string: absolute difference
        plus relative change for tokens/duration/dollars, relative change
        only for "pct", absolute difference only for "num".
    """
    if base is None or other is None or base == 0:
        return "—"
    diff = other - base
    pct = diff / base
    # Negative numbers carry their own "-"; only non-negative ones need a sign.
    sign = "+" if diff >= 0 else ""
    if kind == "tokens":
        return f"{sign}{int(diff):,} ({sign}{pct:.0%})"
    if kind == "duration":
        return f"{sign}{diff:.0f}s ({sign}{pct:.0%})"
    if kind == "dollars":
        # Put the sign before the currency symbol: "-$0.0100", not "$-0.0100".
        money = f"-${-diff:.4f}" if diff < 0 else f"+${diff:.4f}"
        return f"{money} ({sign}{pct:.0%})"
    if kind == "pct":
        # Relative change of the two fractions, not a percentage-point difference.
        return f"{sign}{pct:.0%}"
    return f"{sign}{diff:.1f}"
+ ) + md.append("") + + md.append("## Aggregate") + md.append("") + md.append("| Metric | Empty | Guidelines | Skills | Both | Both vs G | Both vs S |") + md.append("|---|---:|---:|---:|---:|---:|---:|") + pairs = [ + ("Trials", "len", "num"), + ("Accuracy (mean)", "_acc", "pct"), + ("Median duration", "duration_s", "duration"), + ("Median input tokens", "input_tokens", "tokens"), + ("Median output tokens", "output_tokens", "tokens"), + ("Median total cost USD", "total_cost_usd", "dollars"), + ("Median tool calls", "tool_calls", "num"), + ("Median wiki reads", "wiki_reads_total", "num"), + ("Median guideline reads", "guideline_reads", "num"), + ] + for label, field, kind in pairs: + vals = {} + for a in ARMS: + arm_rows = by_arm[a] + if field == "len": + vals[a] = len(arm_rows) + elif field == "_acc": + vals[a] = acc(arm_rows) + else: + vals[a] = median([r.get(field) for r in arm_rows]) + if field == "len": + md.append( + f"| {label} | {vals['empty']} | {vals['guidelines']} | {vals['skills']} | {vals['both']} | " + f"{vals['both'] - vals['guidelines']:+d} | {vals['both'] - vals['skills']:+d} |" + ) + else: + md.append( + f"| {label} | {fmt(vals['empty'], kind)} | {fmt(vals['guidelines'], kind)} | " + f"{fmt(vals['skills'], kind)} | {fmt(vals['both'], kind)} | " + f"{delta(vals['guidelines'], vals['both'], kind)} | " + f"{delta(vals['skills'], vals['both'], kind)} |" + ) + md.append("") + + md.append("## By task family") + md.append("") + md.append("Median total_cost_usd. 
`Δ G→B` is `both` minus `guidelines`; `Δ S→B` is `both` minus `skills`.") + md.append("") + md.append("| Family | Tasks | E acc | G acc | S acc | B acc | E $ | G $ | S $ | B $ | Δ G→B | Δ S→B |") + md.append("|---|---|---:|---:|---:|---:|---:|---:|---:|---:|---:|---:|") + fam_groups: dict[str, list[str]] = defaultdict(list) + for tid, fam in FAMILY.items(): + fam_groups[fam].append(tid) + for fam, tids in fam_groups.items(): + in_fam = {a: [r for r in rows if r["task"] in tids and r["arm"] == a] for a in ARMS} + cs = {a: median([r.get("total_cost_usd") for r in in_fam[a]]) for a in ARMS} + md.append( + f"| {fam} | {len(tids)} | " + f"{fmt(acc(in_fam['empty']), 'pct')} | {fmt(acc(in_fam['guidelines']), 'pct')} | " + f"{fmt(acc(in_fam['skills']), 'pct')} | {fmt(acc(in_fam['both']), 'pct')} | " + f"{fmt(cs['empty'], 'dollars')} | {fmt(cs['guidelines'], 'dollars')} | " + f"{fmt(cs['skills'], 'dollars')} | {fmt(cs['both'], 'dollars')} | " + f"{delta(cs['guidelines'], cs['both'], 'dollars')} | " + f"{delta(cs['skills'], cs['both'], 'dollars')} |" + ) + md.append("") + + md.append("## Per task — cost USD") + md.append("") + md.append("| Task | E $ | G $ | S $ | B $ | Δ G→B | Δ S→B |") + md.append("|---|---:|---:|---:|---:|---:|---:|") + for tid in TASK_IDS_ORDER: + if not by_task[tid]: + continue + cs = {a: median([r.get("total_cost_usd") for r in by_task[tid].get(a, [])]) for a in ARMS} + md.append( + f"| `{tid}` | {fmt(cs['empty'], 'dollars')} | {fmt(cs['guidelines'], 'dollars')} | " + f"{fmt(cs['skills'], 'dollars')} | {fmt(cs['both'], 'dollars')} | " + f"{delta(cs['guidelines'], cs['both'], 'dollars')} | " + f"{delta(cs['skills'], cs['both'], 'dollars')} |" + ) + md.append("") + + md.append("## Per task — accuracy") + md.append("") + md.append("| Task | E acc | G acc | S acc | B acc |") + md.append("|---|:-:|:-:|:-:|:-:|") + for tid in TASK_IDS_ORDER: + if not by_task[tid]: + continue + as_ = {a: acc(by_task[tid].get(a, [])) for a in ARMS} + md.append( + f"| `{tid}` | 
{fmt(as_['empty'], 'pct')} | {fmt(as_['guidelines'], 'pct')} | " + f"{fmt(as_['skills'], 'pct')} | {fmt(as_['both'], 'pct')} |" + ) + md.append("") + md.append("## Notes") + md.append("") + md.append("- Empty + guidelines columns reproduce twobatch.") + md.append("- Skills column reproduces the skills-arm experiment.") + md.append( + "- Both column is the new arm: same 3 skills + ~15 atomics from " + "twobatch's batch-1 trajectories. No clusters (matching the " + "guidelines arm's structure)." + ) + md.append("- Trivial-recipe tasks (t11-tar, t13-gzip, t15-jsonl, t16-ini, t17-log, t2/t3, t5) have no matching skill in any arm.") + Path(args.out).write_text("\n".join(md) + "\n", encoding="utf-8") + print(f"wrote {args.out}", flush=True) + return 0 + + +if __name__ == "__main__": + raise SystemExit(main()) diff --git a/explorations/agent-wiki/experiments/harness/normalize_stream_json_transcripts.py b/explorations/agent-wiki/experiments/harness/normalize_stream_json_transcripts.py new file mode 100644 index 00000000..dc88301b --- /dev/null +++ b/explorations/agent-wiki/experiments/harness/normalize_stream_json_transcripts.py @@ -0,0 +1,188 @@ +#!/usr/bin/env python3 +# mypy: ignore-errors +# Exploration/reference code — not type-checked to the project standard. +"""Normalize `claude -p --output-format stream-json --verbose` outputs. + +Reads stream-json transcripts emitted by the experiment runners and writes +one OpenAI-chat-completion JSON file per transcript, matching the schema +under trajectories/normalized/. + +Usage: + uv run python scripts/normalize_stream_json_transcripts.py \\ + --in experiments/results/wiki-consult-20260605T153035Z/transcripts \\ + --out trajectories/normalized \\ + --label example-corpus \\ + --user-prompt "what lens model was used for @sample.jpg. 
def _tool_result_text(raw):
    """Flatten a ``tool_result`` content payload into a single string.

    A list keeps only its ``{"type": "text"}`` parts, joined by newlines; a
    string is returned unchanged; anything else is JSON-encoded.
    """
    if isinstance(raw, list):
        return "\n".join(
            part.get("text", "")
            for part in raw
            if isinstance(part, dict) and part.get("type") == "text"
        )
    if isinstance(raw, str):
        return raw
    return json.dumps(raw)


def parse_stream_json_file(path: Path, user_prompt: str) -> dict[str, Any]:
    """Parse one stream-json transcript into the normalized record.

    Args:
        path: Transcript file holding one JSON event per line.
        user_prompt: Text used as the first (user) message, since it is
            supplied by the caller rather than read from the transcript.

    Returns:
        A dict with session metadata, a ``stats`` sub-dict (event, message,
        tool and token counts plus the cost reported by the ``result``
        event) and the reconstructed chat under
        ``openai_chat_completion.messages``.
    """
    events: list[dict] = []
    for line in path.read_text(encoding="utf-8").splitlines():
        if not line.strip():
            continue
        try:
            parsed = json.loads(line)
        except json.JSONDecodeError:
            # Best-effort: a malformed line is dropped rather than failing the file.
            continue
        # Valid JSON that is not an object (e.g. a bare number) cannot be an event
        # and would break the .get() lookups below.
        if isinstance(parsed, dict):
            events.append(parsed)

    init = next((e for e in events if e.get("type") == "system" and e.get("subtype") == "init"), None)
    result = next((e for e in events if e.get("type") == "result"), None)

    # Fall back to the file stem / a generic model name when there is no init event.
    session_id = (init or {}).get("session_id") or path.stem
    model = (init or {}).get("model") or "claude-code"
    duration_ms = (result or {}).get("duration_ms") or 0

    messages: list[dict] = [{"role": "user", "content": user_prompt}]
    tool_calls = 0
    tool_results = 0
    thinking = 0
    tool_counter: Counter[str] = Counter()
    in_tokens = cache_creation = cache_read = out_tokens = 0
    # A multi-block turn arrives as several assistant events that share one
    # message id and repeat the same usage payload (per the Claude Agent SDK
    # cost-tracking docs), so usage is counted once per id. Events without an
    # id are always counted.
    counted_message_ids: set[str] = set()

    for ev in events:
        ev_type = ev.get("type")
        if ev_type == "assistant":
            msg = ev.get("message", {}) or {}
            msg_id = msg.get("id")
            if not isinstance(msg_id, str) or msg_id not in counted_message_ids:
                usage = msg.get("usage") or {}
                in_tokens += int(usage.get("input_tokens", 0) or 0)
                cache_creation += int(usage.get("cache_creation_input_tokens", 0) or 0)
                cache_read += int(usage.get("cache_read_input_tokens", 0) or 0)
                out_tokens += int(usage.get("output_tokens", 0) or 0)
                if isinstance(msg_id, str):
                    counted_message_ids.add(msg_id)
            content = msg.get("content")
            if not isinstance(content, list):
                continue
            for block in content:
                if not isinstance(block, dict):
                    continue
                block_type = block.get("type")
                if block_type == "text":
                    text = block.get("text", "")
                    if text:
                        messages.append({"role": "assistant", "content": text})
                elif block_type == "thinking":
                    # Thinking blocks are counted but not emitted as messages.
                    thinking += 1
                elif block_type == "tool_use":
                    name = block.get("name", "")
                    tool_counter[name] += 1
                    tool_calls += 1
                    messages.append(
                        {
                            "role": "assistant",
                            "tool_calls": [
                                {
                                    "id": block.get("id"),
                                    "type": "function",
                                    "function": {
                                        "name": name,
                                        "arguments": json.dumps(block.get("input") or {}),
                                    },
                                }
                            ],
                        }
                    )
        elif ev_type == "user":
            msg = ev.get("message", {}) or {}
            content = msg.get("content")
            if not isinstance(content, list):
                continue
            for block in content:
                if not isinstance(block, dict) or block.get("type") != "tool_result":
                    continue
                tool_results += 1
                messages.append(
                    {
                        "role": "tool",
                        "tool_call_id": block.get("tool_use_id"),
                        "content": _tool_result_text(block.get("content")),
                    }
                )

    top_tools = [{"tool": t, "count": c} for t, c in tool_counter.most_common(5)]

    return {
        "schema_version": "1",
        "dataset": "claude-transcripts",
        "agent": "claude-code",
        "session_id": session_id,
        "model": model,
        "models": [model],
        "duration_seconds": round(duration_ms / 1000.0, 2),
        "stats": {
            "raw_event_count": len(events),
            "message_count": len(messages),
            "tool_call_count": tool_calls,
            "tool_result_count": tool_results,
            "thinking_block_count": thinking,
            "sidechain_count": 0,
            "top_tools": top_tools,
            "input_tokens": in_tokens,
            "cache_creation_input_tokens": cache_creation,
            "cache_read_input_tokens": cache_read,
            "output_tokens": out_tokens,
            "total_cost_usd": float((result or {}).get("total_cost_usd") or 0.0),
        },
        "openai_chat_completion": {"messages": messages},
        "recalled_guidelines": [],
    }