diff --git a/.pre-commit-config.yaml b/.pre-commit-config.yaml
index d26ae5ec..028118ff 100644
--- a/.pre-commit-config.yaml
+++ b/.pre-commit-config.yaml
@@ -49,7 +49,10 @@ repos:
       - id: detect-secrets
         name: detect secrets
         args: ['--baseline', '.secrets.baseline']
-        exclude: package.lock.json
+        # explorations/agent-wiki/ holds generated example wiki content plus a
+        # schema doc full of example IDs (12-hex guideline content hashes and session
+        # UUIDs) that trip the high-entropy detector; they are identifiers, not secrets.
+        exclude: 'package.lock.json|^explorations/agent-wiki/'
 
   # Plugin render-equality gate — fails if platform-integrations/ has drifted
   # from plugin-source/. Runs whenever plugin-source/ or the rendered tree
diff --git a/.secrets.baseline b/.secrets.baseline
index 14ee6fa1..727270c7 100644
--- a/.secrets.baseline
+++ b/.secrets.baseline
@@ -1,9 +1,9 @@
 {
   "exclude": {
-    "files": "^.secrets.baseline$|package-lock\\.json$",
+    "files": "^.secrets.baseline$|package-lock\\.json$|^explorations/agent\\-wiki/",
     "lines": null
   },
-  "generated_at": "2026-04-29T16:14:59Z",
+  "generated_at": "2026-06-10T06:41:48Z",
   "plugins_used": [
     {
       "name": "AWSKeyDetector"
@@ -156,11 +156,11 @@
     "sandbox/README.md": [
       {
         "hashed_secret": "b792a28a35da9b44fa0ee8a53002e9c238afb1bd",
+        "is_secret": false,
         "is_verified": false,
-        "line_number": 67,
+        "line_number": 68,
         "type": "Secret Keyword",
-        "verified_result": null,
-        "is_secret": false
+        "verified_result": null
       }
     ],
     "sandbox/sample.env": [
diff --git a/explorations/agent-wiki/README.md b/explorations/agent-wiki/README.md
new file mode 100644
index 00000000..b46679e4
--- /dev/null
+++ b/explorations/agent-wiki/README.md
@@ -0,0 +1,63 @@
+# agent-wiki
+
+An exploration in turning agent trajectories into a **reusable, evidence-grounded
+wiki** that future agents consult before acting — and the experiments measuring
+whether it actually helps.
+
+The core idea: after an agent finishes a task, distill its trajectory into wiki
+pages — episodic **summaries**, atomic **guidelines**, themed **cluster** pages,
+and executable **skills** — each linked back to the trajectory that produced it.
+A future agent, pointed at the wiki's `AGENTS.md`, retrieves the pages relevant
+to its task and applies them instead of re-deriving the recipe.
+
+## Layout
+
+```
+explorations/agent-wiki/
+├── skills/            the agent-wiki skill family + the build_agent_wiki.py builder
+│   ├── agent-wiki-summarize/             trajectory → episodic summary
+│   ├── agent-wiki-extract-guidelines/    trajectory → atomic guidelines
+│   ├── agent-wiki-synthesize-skill/      trajectory → executable SKILL.md
+│   ├── agent-wiki-consolidate-guidelines/ atomics → themed cluster pages
+│   ├── agent-wiki-tasks/                 cross-session task-comparison pages
+│   ├── agent-wiki-consult/               retrieval-time entry point
+│   ├── agent-wiki-ingest/                end-to-end orchestrator (all of the above)
+│   └── scripts/build_agent_wiki.py       deterministic builder (render-*/catalog)
+├── docs/
+│   ├── design.md      design & rationale
+│   └── schema.md      on-disk page/index schema
+├── experiments/       the empirical evidence (see RESULTS-SUMMARY.md)
+│   ├── RESULTS-SUMMARY.md
+│   ├── twobatch-*.md  the comparison reports (wiki vs no-wiki; skills vs guidelines; …)
+│   ├── pruned-index-hypothesis.md
+│   ├── metrics/       per-trial metric rollups (.jsonl)
+│   └── harness/       sandbox runner + comparison scripts to reproduce
+└── wikis/             worked examples — wikis built by the skills above
+    ├── wiki-twobatch/            16-task corpus, guidelines arm
+    ├── wiki-twobatch-skills/     same corpus, skills-only arm
+    ├── wiki-twobatch-both/       skills + guidelines
+    └── wiki-twobatch-pruned/     skills + only no-skill-coverage atomics (delete-on-promote)
+```
+
+## Reading order
+
+1. **`docs/design.md`** — what the wiki is and why it's shaped this way.
+2. **`experiments/RESULTS-SUMMARY.md`** — the running tape of findings
+   (wiki cuts cost ~20% at equal accuracy; skills beat guidelines; pointer
+   wording is load-bearing; composition matters more than wiki size).
+3. **`wikis/wiki-twobatch-skills/`** — open `AGENTS.md`, then `_index.jsonl`,
+   then any page, to see a real built wiki end-to-end.
+4. **`skills/agent-wiki-ingest/SKILL.md`** — how a batch of traces becomes a
+   wiki in one pass.
+
+## Scope of this exploration
+
+These are **benchmark-derived** example wikis (a synthetic 16-task
+file-format corpus). The raw per-trial sandbox transcripts and any wikis built from
+internal trajectory corpora are intentionally **not** included — only the metric
+rollups, the narrative reports, and the benchmark-derived wikis. Source links in
+wiki frontmatter are shown in the generic form `trajectories/<session-id>.json`.
+
+The skills here are a **standalone reference copy**, runnable via
+`skills/scripts/build_agent_wiki.py`; they are not wired into any plugin loader
+in this tree.
diff --git a/explorations/agent-wiki/docs/design.md b/explorations/agent-wiki/docs/design.md
new file mode 100644
index 00000000..83b715df
--- /dev/null
+++ b/explorations/agent-wiki/docs/design.md
@@ -0,0 +1,263 @@
+# Agent-wiki: design & rationale
+
+*A durable, evidence-grounded knowledge layer mined from an agent's own
+trajectories, consulted by future agents at recall-time.*
+
+This doc explains **why** the agent-wiki is shaped the way it is, **what**
+its pieces are, **how** a raw trace becomes a recallable page, and **what
+the experiments show**. It is the canonical design statement; for the
+operational contract it links to the recall recipe
+([`_default_agents.md`](../skills/scripts/_default_agents.md),
+copied into every wiki as `AGENTS.md`), and for the full results to the empirical log
+([`experiments/RESULTS-SUMMARY.md`](../experiments/RESULTS-SUMMARY.md)).
+
+---
+
+## 1. The problem
+
+Coding agents start every session cold. An agent that spent twenty tool
+calls last week discovering that a Debian container has no `pip` and
+PEP-668 blocks `pip install` will spend twenty tool calls rediscovering it
+next week. The knowledge a session produces dies with the session.
+
+The usual fixes don't hold up:
+
+- **Hand-authored runbooks** drift from reality and carry no provenance —
+  you can't tell whether a rule still reflects how the tool behaves, or who
+  decided it.
+- **Raw trajectory stores** keep everything but generalize nothing. They're
+  too bulky to load at recall-time, and a future agent has to re-derive the
+  lesson from a transcript instead of reading it.
+- **Generic long-term memory** (embed-everything vector stores) is lossy and
+  unauditable: a retrieved snippet has no chain back to the moment it was
+  true.
+
+The goal: a **knowledge layer the agent earns from its own work** — small
+enough to consult cheaply, general enough to apply to unseen-but-related
+tasks, and auditable down to the transcript that produced each claim.
+
+## 2. The core idea
+
+Build a **wiki from agent traces**. Each completed trajectory is distilled
+into pages; every page links back to the session it came from. Future agents
+**consult the wiki once they know the task they're about to do** — after the
+user's request is understood and the task family is clear, before writing
+code.
+
+```
+ past sessions            the wiki                  future session
+┌──────────────┐      ┌──────────────────┐       ┌──────────────────┐
+│ trajectory A │─┐    │ summaries/       │       │ user states task │
+│ trajectory B │─┼──▶ │ guidelines/      │ ◀─────│ agent reads      │
+│ trajectory C │─┘    │ skills/  tasks/  │consult│ _index.jsonl,    │
+└──────────────┘ dist.│ _index.jsonl     │       │ applies the rule │
+        ▲             └──────────────────┘       └──────────────────┘
+        └── provenance ──┘
+   (each wiki page links back to the trajectory it was distilled from)
+```
+
+The wiki is **not** a transcript archive and **not** a session-start
+preload. It's a curated, recall-preferred index of distilled lessons that an
+agent pulls from on demand.
+
+## 3. Design principles
+
+Each decision below earns its place; the *why* is the point.
+
+### Provenance is mandatory
+
+Every page is traceable, in a couple of clicks, to the raw transcript that
+produced it:
+
+```
+guideline.md
+  ↓ related_summary:
+summaries/<session_id>.md
+  ↓ sources:
+trajectories/<session_id>.json
+  ↓ source.transcript_path
+~/.../<session_id>.jsonl   (the raw trace)
+```
+
+Why: a recommendation is only trustworthy if you can audit where it came
+from and revise it when the underlying tool behavior changes. Provenance is
+what separates this from a generic memory store. Cluster pages aggregate
+their members' provenance rather than replacing it.
+
+### Page kinds, and a retrieval preference order
+
+The wiki has five page kinds; `_index.jsonl` lists every kind except
+summaries, sorted in **recall preference order**:
+
+| Kind | What it is | Why it exists |
+|---|---|---|
+| **cluster** | Themed aggregator over ≥2 atomic guidelines | One consolidated rule instead of N near-duplicate hits |
+| **skill** | Callable workflow page + sibling scripts | Directly *executable* — no interpretation needed |
+| **guideline** (atomic) | One rule, free-text, trigger-tagged | The base unit; a single distilled lesson |
+| **task / subtask** | Cross-session comparison / per-session workstream | Analysis surface, not recall-time advice |
+| **summary** | Episodic record of one session | The provenance anchor every other page links to |
+
+Sort order is `cluster → skill → guideline → task → subtask`, so the most
+consolidated and most directly-actionable artifacts surface first. The exact
+retrieval recipe (parse task → read `_index.jsonl` → filter by tag/trigger →
+prefer clusters → read top 2–5) lives in the recall contract; see
+[`_default_agents.md`](../skills/scripts/_default_agents.md).
+
+### Procedural over declarative where possible
+
+A **guideline** tells a future agent *what to do* ("when pip's module dir is
+missing, don't trust `ensurepip`"). A **skill** is a structured workflow page
+the agent can *execute* — Overview / When-To-Use / Workflow / optional
+sibling scripts it runs via Bash.
+
+Skills are **recall-preferred over guidelines** because they remove an
+interpretation step: the agent reads the SKILL.md and runs the recipe
+instead of reconstructing it from advice. §5 shows skills also win on cost.
+
+### Consolidation + delete-on-promote
+
+Two cross-trajectory moves keep the recall surface small and non-redundant:
+
+- **Consolidation** clusters ≥2 atomic guidelines that share a real *rule*
+  (not merely a topic) into a `__cluster.md` aggregator. Members stay on
+  disk with a `superseded_by:` backref — provenance is preserved.
+- **Delete-on-promote** (`--archive-covered`): when a skill is synthesized
+  (or a cluster created), the atomics it subsumes are **soft-archived** to
+  `_archived/`. They leave the recall index but stay auditable on disk; the
+  `_audit.log` records the move.
+
+Why: §5's central empirical finding is that **recall quality degrades as the
+index grows** — a smaller, non-redundant index helps even on tasks where no
+page matches. Consolidation and pruning are how the wiki stays small as it
+accumulates traces.
+
+### Recall-time discipline
+
+Consult **once you know the task or sub-task** — not at session start (too
+vague to match), not as a last resort when stuck (too late). And the
+**pointer wording is load-bearing**: a strong-imperative instruction to
+consult the wiki gets followed; a soft "you may want to check" gets skipped
+(§5, the A/B sweep). The pointer lives in the workspace `CLAUDE.md` /
+`AGENTS.md`; placement and wording both matter.
+
+## 4. How a trace becomes a recallable page
+
+The build pipeline is a sequence of LLM passes, each piping structured JSON
+to a deterministic builder
+([`build_agent_wiki.py`](../skills/scripts/build_agent_wiki.py))
+that writes the page and maintains the indexes:
+
+```
+raw trace ─┬─[convert]──▶ normalized JSON
+           │
+           ├─[summarize]─────────▶ summaries/<sid>.md        render-summary
+           ├─[extract-guidelines]▶ guidelines/<slug>__<gid>.md  render-guidelines
+           ├─[synthesize-skill]──▶ skills/<slug>/SKILL.md     render-skill --archive-covered
+           │                                                  (per trace, above)
+           ├─[consolidate]───────▶ guidelines/<slug>__cluster.md  render-cluster
+           │                                                  (once, cross-corpus)
+           └─[catalog]───────────▶ _index.jsonl, indexes, backrefs
+```
+
+| Stage | Skill | Builder subcommand | Scope |
+|---|---|---|---|
+| Convert | (bob-trace-converter / `normalize_stream_json_transcripts.py`) | — | per trace |
+| Summarize | [`agent-wiki-summarize`](../skills/agent-wiki-summarize/SKILL.md) | `render-summary` | per trace |
+| Extract guidelines | [`agent-wiki-extract-guidelines`](../skills/agent-wiki-extract-guidelines/SKILL.md) | `render-guidelines` | per trace |
+| Synthesize skill | [`agent-wiki-synthesize-skill`](../skills/agent-wiki-synthesize-skill/SKILL.md) | `render-skill` | per trace |
+| Consolidate | [`agent-wiki-consolidate-guidelines`](../skills/agent-wiki-consolidate-guidelines/SKILL.md) | `render-cluster` | **cross-corpus, once** |
+| Catalog | (any) | `catalog` | bookkeeping |
+
+**Order matters.** `synthesize-skill` runs *before* `consolidate` so skills
+claim recipe-level territory first (and archive the atomics they cover);
+consolidation then clusters only the surviving atomics. This matches the
+consolidate skill's own rule — don't propose a cluster overlapping a skill's
+territory.
+
+**`catalog` renders; `consolidate` proposes.** A sharp edge worth
+internalizing: `catalog` only *materializes* clusters already declared in
+`_config.yaml` and refreshes indexes/backrefs. It never *proposes* new
+clusters. Consolidation is the LLM pass that proposes them. Running `catalog`
+and expecting clusters to appear is a mistake — they won't unless
+consolidation declared them first.
+
+### The one-pass entry point
+
+[`agent-wiki-ingest`](../skills/agent-wiki-ingest/SKILL.md)
+orchestrates the whole pipeline end-to-end (convert → bootstrap → summarize
+→ extract → synthesize → consolidate → catalog) via subagent fan-out:
+summarize runs in parallel (independent file writes), extract and synthesize
+run sequentially (they mutate shared index/config state), consolidation runs
+once. It exists specifically so the **consolidation pass is never silently
+skipped** when ingesting a batch — the failure mode that motivated it.
+
+### Build patterns
+
+The same corpus can be turned into a wiki three ways, varying *when* the
+wiki is built and *what* the agent sees during each trial (see
+[`RESULTS-SUMMARY.md` §3–4](../experiments/RESULTS-SUMMARY.md)):
+
+- **Open-loop** — trials run against a fixed external wiki; the new wiki is a
+  study log built from observing them.
+- **Closed-loop** — trials mount the wiki being built; it grows trial-by-trial,
+  so trial N+1 sees what trial N spawned. The only pattern with real
+  intra-wiki recall data.
+- **Retroactive** — the wiki stays empty during all trials, then is built in
+  one batch afterward. Cleanest pure-recipe corpus.
+
+The three real-task themes emerge in **all three** patterns — consolidation
+is robust to build order.
+
+## 5. Evidence
+
+Unless noted, experiments use the same 16-task corpus, `claude_md_strong` pointer
+(the pointer-wording sweep varies it), and 3 trials/task. `total_cost_usd` is
+the ground-truth cost metric (cache reads bill at ~10% of regular input, so raw
+token sums overcount). Full tables and methodology: [`experiments/RESULTS-SUMMARY.md`](../experiments/RESULTS-SUMMARY.md).
+
+| Finding | Result | Source |
+|---|---|---|
+| **Wiki vs no wiki** | −20% cost, −38% duration, −43% tool calls, accuracy unchanged (96%) | [twobatch-comparison](../experiments/twobatch-comparison.md) |
+| **Pointer wording is load-bearing** | strong-imperative CLAUDE.md 3/3 reads; soft phrasing 1/3 | [RESULTS-SUMMARY §1](../experiments/RESULTS-SUMMARY.md#1-agentsmd-ab-sweep-the-original) |
+| **Build pattern is robust** | same 3 clusters emerge open-/closed-/retroactive | [RESULTS-SUMMARY §3–4](../experiments/RESULTS-SUMMARY.md#34-build-pattern-comparison-closed-loop-vs-retroactive) |
+| **Skills > guidelines** | skills-only $0.146 vs guidelines $0.17 (−14%), accuracy 98% vs 96% | [twobatch-skills-comparison](../experiments/twobatch-skills-comparison.md) |
+| **Composition is non-additive** | skills+guidelines costs +22% vs skills, +5% vs guidelines | [twobatch-fourway-comparison](../experiments/twobatch-fourway-comparison.md) |
+| **Composition > size; skills-only still cheapest** | delete-on-promote (corrected index): −3% vs both, +18% vs skills | [twobatch-fiveway-comparison](../experiments/twobatch-fiveway-comparison.md) |
+
+The throughline across these:
+
+- **The wiki materially reduces cost at equal accuracy.** Savings come
+  mainly from fewer tool calls and shorter responses, not from reading fewer
+  input bytes — the agent reads *more* wiki bytes but acts more directly.
+- **A smaller recall surface helps even when nothing matches.** The
+  skills-only arm beat guidelines-only on tasks where *no skill matched*
+  (e.g. t2-imports −39%) — evidence that index noise itself costs, which is
+  why consolidation and delete-on-promote exist.
+- **Don't stack page kinds.** Skills + guidelines together is the worst
+  populated wiki, and pruning the redundant atomics doesn't recover the gap.
+  Pick procedural-first; let consolidation + archive keep the rest lean.
+
+## 6. Open questions / limitations
+
+From [`RESULTS-SUMMARY.md`](../experiments/RESULTS-SUMMARY.md)'s open
+questions — live, not yet resolved:
+
+- **Statistical power.** Headline numbers rest on 3 trials/task; per-task
+  confidence intervals are wide, especially on the two observed regressions
+  (wav-info, imports).
+- **True transfer.** All experiments reuse the same task in build and recall.
+  A real transfer test (build from tasks Y, recall on task X where X ∈
+  family(Y), X ∉ Y) would test whether clusters *generalize* rather than
+  memorize.
+- **Scale.** 16 tasks is small. Does the cost-reduction percentage hold,
+  grow, or saturate at 50+ tasks and a larger index?
+- **Why composition regresses.** The skills+guidelines penalty is
+  output-token-driven, not read-count-driven — trace-level inspection of why
+  the agent "says more" when both kinds are present is unresolved.
+
+## See also
+
+- [`schema.md`](schema.md) — the on-disk schema reference: directory layout, per-kind frontmatter, links, and the promotion/archival lifecycle.
+- [`_default_agents.md`](../skills/scripts/_default_agents.md) — the recall contract copied into every wiki as `AGENTS.md` (page kinds, retrieval recipe, provenance chain).
+- [`experiments/RESULTS-SUMMARY.md`](../experiments/RESULTS-SUMMARY.md) — the full empirical log.
+- The `agent-wiki-*` skills under [`skills/`](../skills/) and the builder [`build_agent_wiki.py`](../skills/scripts/build_agent_wiki.py).
diff --git a/explorations/agent-wiki/docs/schema.md b/explorations/agent-wiki/docs/schema.md
new file mode 100644
index 00000000..6fa056db
--- /dev/null
+++ b/explorations/agent-wiki/docs/schema.md
@@ -0,0 +1,483 @@
+# Agent-wiki: on-disk schema reference
+
+The precise file format of an agent-wiki — directory layout, every page
+kind, the load-bearing metadata fields, how pages link, and the lifecycle by
+which atomic guidelines get promoted into clusters or archived under skills.
+
+For the *why* behind this structure, see
+[`design.md`](design.md). For the recall-time contract
+an agent follows, see
+[`_default_agents.md`](../skills/scripts/_default_agents.md)
+(copied into each wiki as `AGENTS.md`). The source of truth for everything
+below is the builder
+[`build_agent_wiki.py`](../skills/scripts/build_agent_wiki.py);
+real examples are drawn from the `wiki-twobatch-*` example wikis.
+
+---
+
+## 1. Directory layout
+
+```
+<wiki-root>/
+├── AGENTS.md            ← recall contract (bootstrapped from the template)
+├── index.md             ← human-friendly overview (catalog-generated)
+├── _config.yaml         ← durable taxonomy: tags, clusters, tasks, overrides
+├── _index.jsonl         ← agent retrieval index (one row per page)
+├── _audit.log           ← append-only JSONL log of mutations + recall events
+├── _archived/           ← guidelines retired by delete-on-promote
+│   └── <slug>__<gid>.md
+├── summaries/
+│   ├── <session_id>.md              ← one episodic summary per session
+│   ├── <session_id>__<arc>.md       ← arc-split summary (long sessions)
+│   └── index.md
+├── guidelines/
+│   ├── <slug>__<gid>.md             ← atomic guideline (one rule)
+│   ├── <slug>__cluster.md           ← themed aggregator (recall-preferred)
+│   ├── _id_index.json               ← guideline id → relpath
+│   └── index.md
+├── skills/
+│   ├── <slug>/SKILL.md              ← callable workflow page
+│   ├── <slug>/scripts/<file>        ← optional sibling scripts (Bash-runnable)
+│   ├── _id_index.json               ← skill slug → relpath
+│   └── index.md
+└── tasks/
+    ├── <slug>__task.md              ← cross-session comparison
+    ├── <slug>__subtask.md           ← per-session workstream
+    └── index.md
+```
+
+**Filename suffixes are the navigation contract.** A page's role is decided
+by its suffix, and the tooling relies on it — do not rename:
+
+| Pattern | Role |
+|---|---|
+| `<slug>__<gid>.md` (in `guidelines/`) | atomic guideline; `<gid>` = the `id:` |
+| `<slug>__cluster.md` | cluster aggregator |
+| `<session_id>.md` / `<session_id>__<arc>.md` | summary (single / arc-split) |
+| `<slug>__task.md` | cross-session task comparison |
+| `<slug>__subtask.md` | per-session workstream |
+| `<slug>/SKILL.md` | skill |
+
+Files prefixed `_` (`_index.jsonl`, `_config.yaml`, `_audit.log`,
+`_id_index.json`, `_archived/`) are machinery, not content pages.
+
+---
+
+## 2. Page kinds and their frontmatter
+
+Each page is markdown with YAML frontmatter. Fields are either **authored at
+render-time** (written once by the `render-*` pass, stable thereafter) or
+**catalog-managed** (recomputed and force-overwritten on every `catalog`
+run). The split matters: never hand-edit a catalog-managed field — it will
+be overwritten on the next `catalog` run.
+
+### Summary — `summaries/<session_id>.md`
+
+`type: episodic-summary`. The provenance anchor every other page links back
+to. One per session (or per arc for long, split sessions).
+
+| Field | Origin | Meaning |
+|---|---|---|
+| `session_id`, `agent`, `model`, `goal`, `outcome` | render | session identity + one-line goal + success/partial/failure |
+| `duration_seconds`, `tools_used`, `sources` | render | wall-clock, tool names, provenance paths (normalized JSON + raw transcript) |
+| `recalled_guidelines` | render | guidelines the session saw, each `{id, title, status, evidence?}` |
+| `arc`, `sibling_summaries` | render | only on arc-split sessions |
+| `tags`, `tool_calls`, `errors`, `dead_end_paths`, `wiki_consulted` | **catalog** | computed from the normalized trajectory |
+| `contributed_guidelines`, `contributed_skills` | **catalog** | reverse links — pages this session produced |
+| `input_tokens`, `cache_creation_input_tokens`, `cache_read_input_tokens`, `output_tokens`, `total_cost_usd` | **catalog** | token + cost metrics (omitted when zero) |
+| `verified_at` | **catalog** | date of last catalog run |
+
+```yaml
+---
+type: episodic-summary
+session_id: <uuid>
+agent: bob
+model: premium
+goal: One sentence describing what the user asked for.
+outcome: success
+duration_seconds: 40.3
+tools_used: [execute_command, attempt_completion]
+sources:
+  - trajectories/<sid>-openai-chat-completions.analysis.json
+  - /path/to/raw/session.json
+# ── below: catalog-managed ──
+tags: []
+tool_calls: 7
+errors: 0
+wiki_consulted: false
+contributed_guidelines: [<gid>, ...]
+contributed_skills: [<slug>, ...]
+total_cost_usd: 0.18
+verified_at: 2026-06-09
+---
+```
+
+### Atomic guideline — `guidelines/<slug>__<gid>.md`
+
+`type: guideline` (also `workflow` / `script` / `command-template`). One
+reusable rule. `<gid>` is a 12-hex content hash and equals the `id:`.
+
+| Field | Origin | Meaning |
+|---|---|---|
+| `id`, `type` | render | content-hash id; page kind |
+| `trigger` | render | situational context when the rule applies |
+| `agent` | render | source agent (`bob`, `claude-code`, …); defaults to `claude-code` |
+| `tags` | render, then **catalog** | topical tags; catalog re-syncs from `_config.yaml` |
+| `sources`, `related_summary` | render | provenance: normalized JSON path + the summary page |
+| `cluster`, `superseded_by` | **catalog** | set when this atomic is a cluster member |
+| `verified_at` | **catalog** | date of last catalog run |
+
+The body carries the rule prose, an optional `## Rationale`, a
+`## Sources` footer, and a catalog-injected `## Used by` section listing
+sessions that recalled it.
+
+```yaml
+---
+id: 84ed6cf26387
+type: guideline
+trigger: Need to put a multi-line script inside a running Docker container before executing it.
+agent: claude-code
+tags: [docker, heredoc, shell, scripting, example]
+sources:
+  - trajectories/df2b08e4-openai-chat-completions.analysis.json
+related_summary: summaries/df2b08e4-7853-47ec-9c46-fee4b0a33eb7.md
+verified_at: 2026-06-09
+cluster: container-boundary-one-shot__cluster.md       # ← stamped by catalog
+superseded_by: container-boundary-one-shot__cluster.md  # ← stamped by catalog
+---
+```
+
+### Cluster — `guidelines/<slug>__cluster.md`
+
+`type: cluster`, `id: cluster:<slug>`. A themed aggregator over ≥2 atomic
+guidelines that share a rule. **Regenerated whole on every catalog run** from
+the membership declared in `_config.yaml`; always `priority: high`.
+
+```yaml
+---
+type: cluster
+slug: container-boundary-one-shot
+title: Cross the host/container boundary in one docker exec
+tags: [docker, container, shell, io]
+verified_at: 2026-06-09
+members:
+  - id: 84ed6cf26387
+    link: heredoc-python-scripts-into-the__84ed6cf26387.md
+  - id: 6c2bd298dd0d
+    link: read-in-container-files-via-docker-exec__6c2bd298dd0d.md
+priority: high
+---
+```
+
+Body: description, optional `## Takeaway` (the actionable one-line rule), and
+a `## Members` table. Members keep their own pages and provenance — the
+cluster aggregates, it doesn't absorb.
+
+### Skill — `skills/<slug>/SKILL.md`
+
+`type: skill`, `id: skill:<slug>`. A callable workflow page. Authored once by
+`render-skill`; **not touched by catalog**.
+
+| Field | Meaning |
+|---|---|
+| `name`, `description`, `trigger` | slug, one-paragraph summary, when-to-use |
+| `agent`, `sources`, `related_summary` | source agent + provenance |
+| `tags`, `verified_at` | topical tags; render date |
+
+Body: `## Overview`, optional `## When To Use`, `## Workflow`, `## Sources`.
+Optional sibling scripts live under `skills/<slug>/scripts/` (shell scripts
+are written with mode `755`).
+
+```yaml
+---
+id: skill:transform-json-with-jq-and-persist-filter-args-yaml
+type: skill
+name: transform-json-with-jq-and-persist-filter-args-yaml
+description: Use a single jq pipeline to filter, reshape, and sort JSON to a target schema …
+trigger: "A task gives an input JSON and asks for a transformed output plus a YAML of the jq filter + args …"
+agent: bob
+sources:
+  - trajectories/d0e03862-openai-chat-completions.analysis.json
+related_summary: summaries/d0e03862-30c5-49b6-9aef-b97dcea57dc0.md
+verified_at: 2026-06-09
+tags: [jq, json, yaml, example]
+---
+```
+
+### Task / subtask — `tasks/<slug>__task.md`, `tasks/<slug>__subtask.md`
+
+`task-comparison` pages (`id: task:<slug>`) are cross-session comparison
+tables, **regenerated each catalog run** from `_config.yaml`'s `tasks.<slug>`
+definition + the sessions it classifies. `subtask` pages (`id:
+subtask:<slug>`) are per-session workstream narratives, **authored standalone**
+and not regenerated. Both carry `type`, `slug`, `title`, `tags`,
+`verified_at`; tasks add `sessions:` (row count), subtasks add
+`parent_session_id` / `parent_summary`.
+
+### id conventions
+
+- **Atomic guidelines**: a 12-hex content hash (e.g. `84ed6cf26387`); the
+  filename suffix matches, so id ↔ file round-trips.
+- **Everything else**: a kind-prefixed slug — `cluster:<slug>`,
+  `skill:<slug>`, `task:<slug>`, `subtask:<slug>`.
+
+---
+
+## 3. Index, config, and audit files
+
+### `_index.jsonl` — the retrieval index
+
+One JSON object per line, one line per cluster / skill / guideline / task /
+subtask page. This is what an agent reads at recall-time. Rows are sorted
+**clusters → skills → guidelines → tasks → subtasks**, so the most
+consolidated and directly-actionable artifacts come first. Common keys:
+`kind`, `id`, `title`, `tags`, `trigger`, `summary` (≤240-char snippet),
+`link`. Per-kind extras: clusters add `members` + `priority: high`; skills
+add `priority: high`; guideline rows add `cluster` and (when clustered)
+`superseded_by`; task rows add `family`; subtask rows add
+`parent_session_id` / `parent_summary`.
+
+```jsonl
+{"kind": "cluster", "id": "cluster:container-boundary-one-shot", "title": "Cross the host/container boundary in one docker exec", "tags": ["docker","container","shell","io"], "trigger": "", "summary": "Benchmark tasks frequently live inside a named Docker container…", "link": "guidelines/container-boundary-one-shot__cluster.md", "members": ["84ed6cf26387","6c2bd298dd0d"], "priority": "high"}
+{"kind": "skill", "id": "skill:aggregate-jsonl-records-top-n-by-sum-and-count", "title": "aggregate-jsonl-records-top-n-by-sum-and-count", "tags": ["jsonl","python","aggregation","example"], "trigger": "Task gives a directory of large JSONL files…", "summary": "Aggregate many JSONL files in one streaming Python pass…", "link": "skills/aggregate-jsonl-records-top-n-by-sum-and-count/SKILL.md", "priority": "high"}
+{"kind": "guideline", "id": "3c019235c9f8", "title": "Format ISO 8601 to YYYY-MM-DD with split T", "tags": ["jq","iso-8601","date-formatting","example"], "trigger": "Inside a jq filter, you need only the calendar date…", "summary": "…use `(.last_login | split(\"T\")[0])`.", "link": "guidelines/format-iso-8601-to-yyyy-mm-dd-with__3c019235c9f8.md", "cluster": null}
+```
+
+**Archived guidelines are absent from `_index.jsonl`** — that's what makes
+archiving remove a page from recall.
+
+### `_config.yaml` — the durable taxonomy
+
+The one authored file that survives catalog regeneration. Structure:
+
+```yaml
+schema_version: 1
+tags:
+  guideline:
+    <gid>: [tag, tag, ...]      # guideline id → tags (drives "By tag" + clustering)
+clusters:
+  <slug>:
+    title: <string>
+    description: <string>
+    takeaway: <string>
+    members: [<gid>, ...]       # the cluster's atomic members
+    tags: [tag, ...]
+tasks:
+  <slug>:
+    title: <string>
+    family: <string>
+    family_match: { goal_substring: [<substr>, ...] }
+    intro: <string>
+    findings: <string>
+    tags: [tag, ...]
+session_family_overrides:
+  <session_id>: { family: <str|null>, trial: <int|null>, condition: <str|null> }
+```
+
+`tags.guideline` and `clusters` are written by `render-guidelines` /
+`render-cluster`; `catalog` reads them back to stamp atomic frontmatter and
+regenerate cluster pages. `tasks` + `session_family_overrides` drive
+task-comparison classification.
+
+### `_id_index.json` — id → path
+
+A flat map in both `guidelines/` and `skills/`, used to resolve backlinks
+(e.g. a summary's `contributed_guidelines` ids → file paths). Archiving an
+atomic **pops** its entry here (see §5).
+
+```json
+{ "84ed6cf26387": "guidelines/heredoc-python-scripts-into-the__84ed6cf26387.md" }
+```
+
+### `_audit.log` — append-only mutation + recall log
+
+One JSON line per event. Three action types:
+
+```jsonl
+{"action": "summary.guideline_use", "session_id": "<uuid>", "id": "<gid>", "status": "followed", "ts": "…Z"}
+{"action": "synthesize_skill", "session_id": "<uuid>", "skill_name": "<slug>", "scripts": ["run.sh"], "ts": "…Z"}
+{"action": "archive_guideline", "id": "<gid>", "reason": "covered_by_skill", "target": "<slug>", "src": "guidelines/…md", "dst": "_archived/…md", "ts": "…Z"}
+```
+
+`reason` is `covered_by_skill` or `covered_by_cluster`. The audit log is the
+durable record of promotions/archivals even though archived pages leave the
+index.
+
+---
+
+## 4. How files link to each other
+
+Forward links are **authored at render-time**; reverse links are
+**recomputed by catalog** from the forward ones. Forward is the source of
+truth.
+
+```
+            ┌──────────────────────────── provenance (forward) ───────────────────────────┐
+            ▼                                                                               │
+ guidelines/<slug>__<gid>.md ──related_summary:──▶ summaries/<sid>.md ──sources:──▶ normalized JSON ──▶ raw transcript
+            ▲                                              │
+            │   contributed_guidelines: / contributed_skills:  (reverse — catalog inverts related_summary)
+            └──────────────────────────────────────────────┘
+
+ guidelines/<slug>__<gid>.md ──cluster: / superseded_by:──▶ guidelines/<slug>__cluster.md
+            ▲                                                        │
+            └────────────────────── members: ───────────────────────┘   (bidirectional)
+
+ _id_index.json :  <gid> ──▶ relpath          _index.jsonl :  row.link ──▶ page file
+```
+
+- A **guideline → summary → trajectory** chain makes every rule auditable.
+- `catalog` builds **`contributed_guidelines` / `contributed_skills`** on the
+  summary by inverting all guideline/skill `related_summary:` fields — so the
+  summary knows what it produced without that being hand-maintained.
+- **Cluster ↔ member** is bidirectional: the cluster lists `members:`; each
+  member is stamped `cluster:` + `superseded_by:`.
+
+---
+
+## 5. Lifecycle: promotion & archival
+
+```
+                         render-guidelines
+                                │
+                                ▼
+                   ┌──────────────────────────┐
+                   │         ATOMIC           │
+                   │ guidelines/<slug>__<gid> │
+                   │ in _id_index.json        │
+                   │ in _index.jsonl          │
+                   └──────────────────────────┘
+                         │                   │
+        render-cluster   │                   │  render-skill --archive-covered
+        (+ catalog)      │                   │  — or — render-cluster --archive-members
+                         ▼                   ▼
+        ┌────────────────────────┐   ┌──────────────────────────┐
+        │       CLUSTERED        │   │        ARCHIVED          │
+        │ file STAYS in place    │   │ file MOVES → _archived/  │
+        │ +cluster: +superseded… │   │ popped from _id_index    │
+        │ still in both indexes  │   │ ABSENT from _index.jsonl │
+        │ cluster row priority:hi│   │ audit: archive_guideline │
+        └────────────────────────┘   │ (unreachable at recall)  │
+                                      └──────────────────────────┘
+```
+
+### ATOMIC → CLUSTERED
+
+Authored by declaring the cluster (`render-cluster` writes
+`_config.yaml/clusters.<slug>` + the `__cluster.md` page). On the next
+`catalog`, each member atomic is **stamped** `cluster:` and `superseded_by:`
+in its frontmatter. The member **file stays in place**, stays in
+`_id_index.json`, and stays in `_index.jsonl` (now carrying `superseded_by`).
+The cluster gets its own `_index.jsonl` row with `priority: high`. At recall
+the cluster is preferred; members remain reachable for their original wording
++ provenance.
+
+### ATOMIC → ARCHIVED (delete-on-promote)
+
+When a skill (or cluster) subsumes an atomic, the atomic is **soft-archived**:
+
+1. file moved `guidelines/<slug>__<gid>.md` → `_archived/<slug>__<gid>.md`
+2. its entry is **popped** from `guidelines/_id_index.json`
+3. an `archive_guideline` line is appended to `_audit.log`
+4. on the next catalog it is **not scanned** (it's outside `guidelines/`), so
+   it disappears from `_index.jsonl` — **unreachable at recall**, still on
+   disk for audit. Reversal is manual.
+
+Two triggers:
+
+| Trigger | Flag | Audit `reason` |
+|---|---|---|
+| Cluster created | `render-cluster --archive-members` | `covered_by_cluster` |
+| Skill synthesized | `render-skill --archive-covered` | `covered_by_skill` |
+
+### Coverage inference (`--archive-covered`)
+
+A skill archives an atomic only if `_skill_covers_atomic` returns true via
+**any** of three conservative paths (biased toward false-negatives — when in
+doubt, the atomic survives):
+
+1. **Tag-superset** — the atomic's tags ⊆ the skill's tags **and** their
+   intersection has ≥2 tags outside a `_GENERIC_TAGS` stop-set
+   (`stdlib`, `parsing`, `agent-behavior`, `binary`, `headers`, …).
+2. **Slug-keyword** — a ≥4-char, non-stopword token from the skill slug
+   appears in the atomic's title.
+3. **Format-identifier** — an uppercase (`PNG`, `ZIP`) or CamelCase (`WebP`)
+   token in the skill description appears in the atomic's title. Catches
+   family-broad skills whose slug abstracts the format names away.
+
+### What catalog recomputes vs. what's authored once
+
+| Recomputed every `catalog` (force-replaced) | Authored once at render |
+|---|---|
+| guideline: `verified_at`, `tags`, `cluster`, `superseded_by`; `## Used by` | guideline: `id`, `type`, `agent`, `trigger`, `sources`, `related_summary`, body |
+| summary: `tags`, `tool_calls`, `errors`, `dead_end_paths`, `wiki_consulted`, `contributed_guidelines`, `contributed_skills`, token metrics, `verified_at` | summary: `session_id`, `agent`, `model`, `goal`, `outcome`, `sources`, narrative |
+| cluster + task pages (regenerated whole); all `index.md`; `_index.jsonl`; priority tiers | cluster/task definitions in `_config.yaml`; skill pages; subtask pages |
+
+Archiving is one-way; reversing it means moving the file back and
+re-cataloging by hand.
+
+---
+
+## 6. Worked example — one real chain
+
+Tracing the atomic `heredoc-python-scripts-into-the__84ed6cf26387` through one of the example wikis.
+
+**(a) The atomic** carries forward links to its summary + its cluster (the
+`cluster:`/`superseded_by:` pair was stamped by catalog when the cluster was
+declared):
+
+```yaml
+id: 84ed6cf26387
+type: guideline
+agent: claude-code
+tags: [docker, heredoc, shell, scripting, example]
+sources:
+  - trajectories/df2b08e4-openai-chat-completions.analysis.json
+related_summary: summaries/df2b08e4-7853-47ec-9c46-fee4b0a33eb7.md
+cluster: container-boundary-one-shot__cluster.md
+superseded_by: container-boundary-one-shot__cluster.md
+```
+
+**(b) Follow `related_summary:`** to the summary — which closes the reverse
+loop via the catalog-computed `contributed_guidelines` (and names the raw
+transcript under `sources:`):
+
+```yaml
+type: episodic-summary
+session_id: df2b08e4-7853-47ec-9c46-fee4b0a33eb7
+agent: bob
+goal: Aggregate JSONL records in a Docker container to produce /app/aggregates.json …
+sources:
+  - trajectories/df2b08e4-openai-chat-completions.analysis.json
+  - /Users/…/.bob/tmp/…/chats/session-2026-06-09T07-11-df2b08e4.json   # raw trace
+contributed_guidelines: [84ed6cf26387]                                  # ← reverse edge
+contributed_skills: [aggregate-jsonl-records-top-n-by-sum-and-count]
+```
+
+**(c) Follow `cluster:`** forward to the aggregator, which lists the atomic
+as a member — the bidirectional cluster↔member link:
+
+```yaml
+type: cluster
+slug: container-boundary-one-shot
+title: Cross the host/container boundary in one docker exec
+members:
+  - id: 84ed6cf26387
+    link: heredoc-python-scripts-into-the__84ed6cf26387.md
+  - id: 6c2bd298dd0d
+    link: read-in-container-files-via-docker-exec__6c2bd298dd0d.md
+priority: high
+```
+
+One atomic, four hops: **rule → summary → raw trajectory** (provenance), and
+**rule ↔ cluster** (consolidation), with the summary's
+`contributed_guidelines` closing the loop back to the rule. Every edge is
+either authored at render (forward) or recomputed by catalog (reverse).
+
+---
+
+## See also
+
+- [`design.md`](design.md) — why the wiki is shaped this way (rationale, principles, empirical results).
+- [`_default_agents.md`](../skills/scripts/_default_agents.md) — the recall-time contract (`AGENTS.md`).
+- [`build_agent_wiki.py`](../skills/scripts/build_agent_wiki.py) — the builder; the implementation of everything above.
diff --git a/explorations/agent-wiki/experiments/RESULTS-SUMMARY.md b/explorations/agent-wiki/experiments/RESULTS-SUMMARY.md
new file mode 100644
index 00000000..1cb4e8c6
--- /dev/null
+++ b/explorations/agent-wiki/experiments/RESULTS-SUMMARY.md
@@ -0,0 +1,480 @@
+# Experiment results — running summary
+
+A consolidated index of empirical findings from the wiki experiments, in
+the order they were run. The detailed reports for each experiment live in
+their own files under `experiments/`; this is the running tape of what we
+learned.
+
+## 1. AGENTS.md A/B sweep (the original)
+
+**Question**: does pointing an agent at `wiki-example/AGENTS.md`
+alter behavior on a related task it didn't see at training time?
+
+**Setup**: 4 conditions × 3 trials = 12 sandbox runs of a single utt2
+("what lens model was used for @sample.jpg?"). Conditions varied how the
+pointer was delivered (`baseline` / `skill` / `prompt` / `claude_md`).
+
+**Headline results** (`experiments/results/wiki-consult-20260605T070417Z/`):
+
+| Condition                      | read AGENTS.md | cited guideline | outcome match |
+| ------------------------------ | -------------- | --------------- | ------------- |
+| baseline                       | 0/3            | 0/3             | 0/3           |
+| prompt (per-utterance hint)    | 3/3            | 3/3             | 3/3           |
+| claude_md (soft phrasing)      | 1/3            | 1/3             | 1/3           |
+| claude_md_strong (added later) | 3/3            | 3/3             | 3/3           |
+
+**The two CLAUDE.md variants** (verbatim, from
+`tests/e2e/experiment_wiki_consult.py`; `WIKI_NAME` = `wiki-example`):
+
+- **`claude_md` (weak/soft)**:
+  > Before non-trivial tasks in this repo, consult `wiki-example/AGENTS.md` for relevant guidelines.
+
+- **`claude_md_strong` (strong)**:
+  > Before doing anything else in this workspace, read `wiki-example/AGENTS.md` and follow its retrieval recipe to find guidelines that apply to your task. Cite the guidelines you used.
+
+What makes the strong variant load-bearing — three differences:
+
+| Axis | weak | strong |
+| --- | --- | --- |
+| **Timing** | "before non-trivial tasks" (agent judges what's non-trivial) | "before doing anything else" (unconditional, first action) |
+| **Verb** | "consult … for relevant guidelines" (vague) | "read … and follow its retrieval recipe" (imperative + concrete procedure) |
+| **Accountability** | none | "Cite the guidelines you used" |
+
+**Finding**: a strong-imperative pointer in CLAUDE.md performs as well as
+a per-utterance prompt hint. A *soft* CLAUDE.md ("Before non-trivial
+tasks, consult …") got skipped 2/3 of the time — the hedge ("non-trivial")
+lets the agent rationalize skipping. **Wording at the pointer site is
+load-bearing.**
+
+## 2. Persistent-pointer mechanism comparison
+
+**Question**: does it matter where the strong-imperative pointer lives —
+in CLAUDE.md, in `--append-system-prompt`, or in a SessionStart hook?
+
+**Setup**: 3 mechanisms × 3 trials = 9 trials of the same lens-model task.
+
+**Headline results**:
+
+| Mechanism              | Reads AGENTS.md as Tool 1 | Median runtime    |
+| ---------------------- | ------------------------- | ----------------- |
+| SessionStart hook      | 3/3                       | **47s** (fastest) |
+| claude_md_strong       | 3/3                       | 52s               |
+| --append-system-prompt | 3/3 (but Tool 3+)         | 63s (slowest)     |
+
+**Finding**: all 3 mechanisms hit the same accuracy. **System-prompt
+placement costs ~10–15s of orientation latency** (`ls`, `which exiftool`,
+etc.) before the agent reads AGENTS.md. The SessionStart hook places the
+pointer above-the-fold, so the agent reads AGENTS.md as Tool 1 with no
+orientation pre-amble.
+
+## 3–4. Build-pattern comparison (closed-loop vs retroactive)
+
+> **Omitted from this public exploration.** These two experiments compared
+> *how* a wiki is built — closed-loop (the wiki grows between trials, each
+> trial sees what prior trials spawned) vs retroactive (the wiki stays empty
+> during all trials, then is ingested in batch). They ran against internal
+> trajectory corpora, so the detailed report and per-trial data are not
+> included here.
+
+**Portable finding**: the same real-task themes emerged in *all* build
+patterns (open-loop, closed-loop, retroactive) —
+image-format-headers-via-struct, prefer-stdlib-module-for-format,
+shell-pipelines-for-line-tasks. Consolidation is robust to build order; what
+varies between patterns is meta-content, recall data, and per-task cost.
+Closed-loop is the only pattern that accumulates real intra-wiki recall data
+(trial N+1 demonstrably reads what trial N spawned); the others need post-hoc
+attribution.
+
+## 5. Two-batch wiki-helps experiment
+
+**Question**: does the wiki *measurably* reduce token cost / duration /
+tool calls at equal accuracy, on the same task, with vs without?
+
+**Setup**: 16 tasks × 3 trials × 2 batches = 96 trials, all
+`claude_md_strong`. Batch 1 ran against an empty wiki. Wiki built from
+batch 1's trajectories, frozen. Batch 2 ran against the populated wiki.
+Same prompts, same workspace seeding — only variable: wiki content.
+
+**Headline results** (from `experiments/twobatch-comparison.md`):
+
+| Metric                    | Batch 1 (empty) | Batch 2 (with wiki) |                       Δ |
+| ------------------------- | --------------: | ------------------: | ----------------------: |
+| **Median total cost USD** |           $0.21 |               $0.17 |                **−20%** |
+| **Median duration**       |             43s |                 27s |                **−38%** |
+| **Median tool calls**     |               7 |                   4 |                **−43%** |
+| Median wiki reads         |               5 |                   3 |                    −40% |
+| Median output tokens      |             406 |                 268 |                    −34% |
+| Cache-read tokens         |               — |                   — |                    −32% |
+| Cache-creation tokens     |               — |                   — | +66% (new pages cached) |
+| **Aggregate accuracy**    |             96% |                 96% |               unchanged |
+
+**Per-task highlights**:
+
+- **Wiki rescued failures on lens-model**: 67% → **100%** accuracy.
+- **t8-bmp-info batch-1 trial 1 timed out at 300s**; with-wiki, all 3
+  BMP trials completed in 27s median. **11× speedup** on that task.
+- **t5-base64 with empty wiki**: 300s timeout. With wiki: 18s, 23s, 20s
+  (3/3 succeed). The `skip-for-trivial` guideline — recalled — let the
+  agent short-circuit AGENTS.md's recipe.
+- **Two regressions**: t12-wav-info (100% → 67%) and t2-imports
+  (100% → 67%). One trial each failed in batch 2 — likely the agent
+  over-applying or misreading a recalled guideline.
+
+**Finding**: **wiki → faster, cheaper, fewer tools, equal accuracy.**
+Per-task `total_cost_usd` is the ground-truth cost metric (cache reads
+are billed at ~10% of regular input rate, so the raw token-sum proxy
+overcounts). The −20% cost figure is robust to that pricing nuance.
+
+Detailed report: [`experiments/twobatch-comparison.md`](twobatch-comparison.md).
+
+## 6. Skills-arm of the wiki-helps experiment
+
+**Question**: would a wiki populated only with synthesized **skills**
+(executable workflow pages) — instead of free-text guidelines — beat
+the guidelines arm on the same 16-task corpus?
+
+**Setup**: identical to twobatch except batch 2 mounted
+`wiki-twobatch-skills/`, an empty wiki populated by acting as the
+`agent-wiki-synthesize-skill` agent on twobatch's batch-1 transcripts.
+Per the skill's own rules (skip if trivial / single command, broad-
+trigger names), three skills emerged:
+
+- `extract-jpeg-exif-camera-optics` (covers t1)
+- `read-image-format-dimensions` (covers t6/t7/t8/t9 via magic-byte dispatch)
+- `count-csv-rows-with-quoted-fields` (covers t14)
+
+The other 10 tasks have no matching skill — the agent should fall through.
+
+**Headline results**:
+
+|                       |  Empty | Guidelines | Skills | Δ vs guidelines |
+| --------------------- | -----: | ---------: | -----: | --------------: |
+| Median total cost USD |  $0.21 |      $0.17 | **$0.146** |        **−14%** |
+| Median output tokens  |    406 |        268 |    **206** |             −23% |
+| Median wiki reads     |      5 |          3 |      **2** |             −33% |
+| Aggregate accuracy    |   96%  |        96% |    **98%** |             +2% |
+| Trials                | 47/48  |      48/48 |   48/48 |       (no timeouts) |
+
+**Per-task standouts**:
+
+- **t1-lens-model**: −28% cost. Direct skill match.
+- **t2-imports**: −39% cost AND **67% → 100%** accuracy — *no skill matched*,
+  but the simpler wiki (3 skills, no guidelines) led to a faster path.
+- **t3-todos**: −30%; same pattern.
+- **skip family** (t2/t3/t5): 89% → 100% accuracy.
+- **t14-csv-quoted**: **+18% cost** despite a matching skill — the skill's
+  overhead exceeded the savings on a 5-row CSV.
+- **text family**: +6% (only family where skills hurt — 3 of 4 text tasks
+  had no matching skill).
+
+**Finding**: **skills > guidelines on aggregate cost, even where skills
+don't match.** A smaller wiki (3 skills, no guideline noise) seems to
+help recall on no-skill-match tasks too — the wiki-noise effect is real.
+
+Detailed report: [`experiments/twobatch-skills-comparison.md`](twobatch-skills-comparison.md).
+
+## 7. Both-arm: skills + guidelines together (4-way comparison)
+
+**Question**: does combining skills + guidelines compose additively, or
+is there an overhead?
+
+**Setup**: same 16-task corpus, fourth arm. `wiki-twobatch-both` was
+built from twobatch's batch-1 trajectories with BOTH the retroactive
+guideline pipeline AND the synthesize-skill pipeline. End state: 47
+summaries + 15 atomics + 3 skills.
+
+**Headline 4-way aggregate**:
+
+|                       |  Empty | Guidelines |     Skills |       Both | Both vs G | Both vs S |
+| --------------------- | -----: | ---------: | ---------: | ---------: | --------: | --------: |
+| Median total cost USD |  $0.21 |      $0.17 | **$0.146** |     $0.179 |       +5% |     +22% |
+| Median output tokens  |    406 |        268 |        206 |        272 |        +1% |     +32% |
+| Median wiki reads     |      5 |          3 |          2 |          2 |       −33% |       =  |
+| Median guideline reads |     1 |          1 |          0 |          0 |       −1   |       =  |
+| Aggregate accuracy    |    96% |        96% |       98%  |        98% |       +2  |       =  |
+
+**Per-family `Δ S→B`** (both minus skills, in cost):
+
+|         |  Δ |
+| ------- | --: |
+| text    | −1% |
+| image   | +22% |
+| lens-model | +17% |
+| archive | +32% |
+| skip    | +44% |
+
+**Findings**:
+
+1. **Composition is non-additive — and slightly punitive.** Both arm is
+   the most expensive populated wiki: +22% vs skills, +5% vs guidelines.
+2. **The penalty is largest on tasks WITHOUT a matching skill.** Skip
+   family +44%, archive +32%. Adding guidelines on top of skills did
+   not help where guidelines should have been the primary recall path.
+3. **Behavioral signal**: median output tokens 206 → 272 — agent says
+   more in the both arm. Wiki-reads count is identical (2 + 0). Cost
+   increase isn't from extra reads; it's from longer responses (likely
+   the agent citing both the skill it used + adjacent guideline context).
+4. **t14-csv-quoted: +49% vs guidelines, +26% vs skills** — the most
+   extreme regression. Having both the CSV skill AND the underlying CSV
+   guideline available pushed cost higher than either alone.
+
+**Conclusion**: **less wiki content + targeted (procedural) recall
+wins.** Don't pile guidelines on top of skills; pick one or the other.
+
+Detailed report: [`experiments/twobatch-fourway-comparison.md`](twobatch-fourway-comparison.md).
+
+## 8. Pruned-arm: delete-on-promote policy (5-way comparison)
+
+**Question**: §7 closed with the open question "if 'both' loses to
+'skills-only', does 'skills + only the no-skill-coverage guidelines'
+beat 'skills-only'?" This experiment tests that.
+
+**Policy added** to the agent-wiki builder: when a cluster is rendered,
+archive its member atomics; when a skill is synthesized, archive every
+atomic the skill *covers* — inferred via three paths:
+
+1. **Tag-superset**: skill's tags ⊇ atomic's tags AND ≥2 non-generic
+   tags shared.
+2. **Slug-keyword**: a non-stopword token (≥4 chars) from the skill
+   slug appears in the atomic's title.
+3. **Description-format-token**: an uppercase or CamelCase format
+   identifier (e.g. `PNG`, `BMP`, `WebP`, `JPEG`) that appears in both the
+   skill's description and the atomic's title.
+
+Soft archive: moves to `<wiki>/_archived/<filename>` with an audit
+log entry; recall data on archived atomics is discarded.
+
+**Setup**: same 16-task corpus, same `claude_md_strong` condition.
+`wiki-twobatch-pruned/` was built by the same pipeline that built
+`wiki-twobatch-both/`, but with `--archive-covered` on each
+synthesize-skill call. End state:
+
+- 47 summaries
+- 9 surviving atomics (all from no-skill-match tasks: zip, tar, wav,
+  gzip, jsonl, ini, log, plus the imports/todos/base64 meta-atomics)
+- 3 skills (same as skills/both arms)
+- **6 archived atomics** (PNG, GIF, BMP, WebP, walk-EXIF-sub-IFD,
+  use-stdlib-csv-reader) — exactly the atomics covered by the 3 skills
+
+> **⚠️ Corrected 2026-06-10.** The numbers below are the **re-run** against
+> a fixed index. The original §8 (commit `8bcd713`) ran the pruned arm
+> against a wiki whose `_index.jsonl` was stale — `render-skill` archived the
+> covered atomics but never refreshed the indexes, so the wiki exposed
+> **0 skills, 15 guideline rows, and 6 broken links**. Agents never saw the
+> skills and chased dangling guideline rows. Commit `2adc67a` fixed the
+> builder (refresh indexes + integrity assertion after `render-skill` /
+> `render-cluster`); this section reflects the corrected run. The original
+> (broken) figures are kept in strikethrough for comparison.
+> See [`pruned-index-hypothesis.md`](pruned-index-hypothesis.md).
+
+**Headline 5-way aggregate** (Pruned = corrected re-run):
+
+|                        |  Empty | Guidelines |     Skills |       Both |              Pruned | P vs S | P vs B |
+| ---------------------- | -----: | ---------: | ---------: | ---------: | ------------------: | -----: | -----: |
+| Median total cost USD  |  $0.21 |      $0.17 | **$0.146** |     $0.179 | $0.173 (~~$0.181~~)  |   +18% |    −3% |
+| Median output tokens   |    406 |        268 |    **206** |        272 |     226 (~~290~~)    |    +9% |   −17% |
+| Median wiki reads      |      5 |          3 |        2   |        2   |       2 (~~3~~)      |      = |      = |
+| Median guideline reads |      1 |          1 |        0   |        0   |       0 (~~1~~)      |      = |      = |
+| Aggregate accuracy     |    96% |        96% |       98%  |       98%  |       98%            |     =  |     =  |
+
+**Per-family `Δ` (cost vs skills-only / vs both)** — corrected:
+
+| Family     |   B vs S |    P vs S | P vs B |
+| ---------- | -------: | --------: | -----: |
+| lens-model |     +17% |      +30% |   +11% |
+| image      |     +22% |      +33% |    +9% |
+| archive    |     +32% |      +24% |    −6% |
+| text       |      −1% |      −3%  |    −3% |
+| skip       |     +44% |      +18% |   −18% |
+
+**Findings** (corrected):
+
+1. **The stale index was a real confound.** Fixing it cut the pruned arm's
+   median cost $0.181 → **$0.173**, output tokens 290 → **226**, wiki reads
+   3 → 2, and **guideline reads 1 → 0**. The broken arm's extra read and
+   guideline-read were agents following dangling/archived rows that the
+   correct index no longer exposes. The original "pruning is *worse* than
+   both" result (+1%) flips to **−3% vs both** once the index is correct.
+
+2. **But skills-only still wins.** Even corrected, pruned ($0.173) remains
+   **+18% vs skills-only** ($0.146). The §7 open question still gets a "no":
+   adding the no-skill-coverage atomics on top of skills does not beat
+   skills-alone on aggregate cost.
+
+3. **Pruning still costs on skill-match families, just far less.** Image
+   +9% vs both (was +28%), lens-model +11% (was +79%). The dramatic
+   skill-match penalty in the original was mostly the broken index; a
+   smaller residual penalty remains — having sibling atomics in the index
+   at all is slightly distracting even when a skill is the right answer.
+
+4. **Pruning genuinely helps no-skill-match families.** Archive −6% vs both,
+   skip −18% vs both, text −3%. Where there's no skill to fall through to,
+   the leaner atomic list is a real (and now larger) win.
+
+5. **Size *is* a lever once you control for index correctness — but a small
+   one, and composition still dominates.** Corrected pruned (12 index rows)
+   now sits between skills (3 rows) and both (18 rows), in the expected
+   order — the earlier "smallest wiki yet most expensive" paradox was an
+   artifact of the bug, not a real inversion.
+
+6. **Same-session matcher variant is a wash.** Re-pruning through the
+   *also*-fixed archive matcher (commit `1272097`, which keeps the GIF/BMP/WebP
+   atomics that the old loose matcher wrongly archived cross-session) yields a 12-atomic
+   wiki. Its full-corpus median is **$0.175** (sum $8.23) — statistically
+   indistinguishable from the 9-atomic arm. The 3 extra cross-session
+   atomics cost essentially nothing.
+
+7. **Both single-trial misses were known-flaky tasks, not regressions.**
+   9-atomic missed t2-imports trial-1 (the prompt renders the module name as
+   a blank placeholder — the agent correctly asked which module); 12-atomic
+   missed t12-wav-info trial-2 (the same task that flaked to 67% in the
+   guidelines/skills/both arms). 47/48 each.
+
+**Operational implication** (revised): the original "don't expect pruning to
+reduce cost" was too pessimistic — it was measuring a broken index. With a
+correct index, **delete-on-promote is a net positive vs `both`** (−3%
+aggregate, −6%/−18% on no-skill-match families) and is sound hygiene. But it
+still doesn't beat **skills-only**, which remains the cheapest surface. If
+cost is the only goal, ship skills-only; if you want to keep authored
+guidelines for tasks no skill covers, pruned-on-a-fresh-index is a reasonable
+middle and clearly better than stacking everything (`both`).
+
+Detailed report: [`experiments/twobatch-fiveway-comparison.md`](twobatch-fiveway-comparison.md).
+
+## Cross-experiment findings
+
+1. **Wording > placement.** Strong-imperative pointer wording matters
+   more than which channel delivers it. Soft CLAUDE.md got skipped; any
+   strong-imperative variant succeeded.
+
+2. **Same real-task themes emerge regardless of build pattern.** The
+   3-cluster set (image-format-headers, stdlib-module, shell-pipelines)
+   appears in open-loop, closed-loop, and retroactive builds.
+   **Consolidation is robust.** What varies between builds is meta-
+   content, recall data, and accuracy/cost on individual tasks.
+
+3. **Closed-loop is the only build with real intra-wiki recall data.**
+   Other builds need post-hoc attribution or cross-wiki references.
+   Empirically demonstrated: trial N+1 reads what trial N spawned.
+
+4. **The wiki materially reduces cost on identical tasks.** −20% cost,
+   −38% duration, −43% tool calls in the controlled two-batch A/B at
+   unchanged accuracy. Effect is largest on tasks where the recipe is
+   non-obvious without the wiki (lens-model, BMP, base64-with-scope-
+   warning).
+
+5. **Cost reduction comes mainly from output tokens and tool-call
+   reduction**, not from input-token compression. The agent doesn't read
+   *fewer* bytes when it has the wiki — it reads MORE bytes (cache-creation
+   on guideline pages goes up). But it produces shorter responses and
+   takes fewer tool turns.
+
+6. **Two-batch experiment surfaced two regressions** (wav-info, imports)
+   where the wiki may have *hurt* accuracy on one trial each. Worth
+   investigating before scaling — the wiki's value isn't unconditional.
+
+7. **Skills > guidelines on cost.** The skills arm (3 synthesized skills,
+   no guidelines) beat the guidelines arm by 14% on median cost and
+   edged it on accuracy (98% vs 96%). Largest savings are on tasks with a
+   direct skill match (t1-lens-model −28%) but ALSO on tasks where no
+   skill matched (t2-imports −39%, t3-todos −30%) — suggesting the
+   smaller wiki (less to scan) helps recall even when no recall fires.
+
+8. **Skills + guidelines together is the worst populated wiki.**
+   Combining the two arms (`wiki-twobatch-both`: same 3 skills + 15
+   atomics) costs +22% vs skills and +5% vs guidelines. Composition is
+   non-additive. Output tokens jump (206 → 272) without a corresponding
+   reads increase — the agent talks more when both kinds of recall are
+   available, even though it doesn't read more pages. **Implication: pick
+   skills OR guidelines, not both.**
+
+9. **Delete-on-promote beats `both` but not skills-only — and a stale
+   index nearly hid that.** *(Corrected 2026-06-10, see §8.)* The pruned
+   arm (3 skills + only the no-skill-coverage atomics) costs **−3% vs
+   both** and **+18% vs skills** on a correctly-indexed wiki. The
+   originally-reported +1%/+24% came from a builder bug: `render-skill`
+   archived atomics without refreshing `_index.jsonl`, so the wiki
+   exposed 0 skills and agents chased dangling guideline rows (commit
+   `2adc67a` fixed it). Corrected, pruning *helps* no-skill-match
+   families (archive −6%, skip −18% vs both) and costs only a small
+   residual on skill-match ones (image +9%, lens-model +11% vs both,
+   down from +28%/+79%). **Composition still dominates size, and
+   skills-only is still cheapest** — but delete-on-promote is a net
+   positive over stacking everything, not the wash the broken run
+   suggested.
+
+## File map
+
+```
+explorations/agent-wiki/experiments/
+├── RESULTS-SUMMARY.md                     this file
+├── twobatch-comparison.md                 with-wiki vs without-wiki A/B
+├── twobatch-skills-comparison.md          3-way (empty / guidelines / skills)
+├── twobatch-fourway-comparison.md         4-way (+ both arm)
+├── twobatch-fiveway-comparison.md         5-way (+ pruned arm)
+├── pruned-index-hypothesis.md             stale-index confound + correction
+│
+├── metrics/                               per-trial metric rollups (no raw transcripts)
+│   ├── twobatch.metrics.jsonl             empty (batch-1) + guidelines (batch-2)
+│   ├── twobatch-skills.metrics.jsonl
+│   ├── twobatch-both.metrics.jsonl
+│   └── pruned-fixed-9atomic.metrics.jsonl corrected pruned arm
+│
+└── harness/                               reproduce-it scripts
+    ├── experiment_wiki_consult.py         sandbox A/B runner
+    ├── wiki_consult_tasks.yaml            the 16-task corpus
+    ├── extract_trial_metrics.py           per-trial token/duration/tool metrics
+    ├── normalize_stream_json_transcripts.py  stream-json → OpenAI chat format
+    ├── twobatch_compare.py                metrics → comparison markdown
+    ├── threeway_compare.py                + skills column
+    ├── fourway_compare.py                 + both column
+    └── fiveway_compare.py                 + pruned column
+```
+
+> Raw per-trial transcripts (`results*/.../trial-N.jsonl`) are intentionally
+> excluded from this public exploration; only the metric rollups under
+> `metrics/` and the narrative reports are included. The comparison scripts
+> read those rollups.
+
+## Open questions worth pursuing
+
+- **Statistical power.** Headline metrics are based on 3 trials per task.
+  More trials would tighten the per-task confidence intervals,
+  particularly on the regression cases (wav-info, imports).
+- **Why wav-info and imports regressed.** Single-trial failures could be
+  noise; could also be the agent following a recalled guideline that
+  doesn't quite fit. Spot-check those transcripts.
+- **Transfer test.** All experiments use the same task in batch 1 and
+  batch 2. A real "transfer" experiment would test wiki-on-task-X with
+  wiki-built-from-tasks-Y where X ≠ Y but X ∈ family(Y). That tests
+  whether clusters generalize.
+- **Larger corpus.** 16 tasks × 3 trials is a small experiment.
+  Repeating with a 50-task corpus over more trials would test whether
+  the cost-reduction percentage scales, regresses, or saturates.
+- **Cross-pattern ensembling.** Could a wiki built closed-loop +
+  retroactive (using the seeding from the former + the per-task
+  templates from the latter) outperform either pattern alone?
+- **Skill granularity sensitivity.** Skills arm used 3 broad skills.
+  Would 16 narrow per-task skills do better or worse? Issue-260's prior
+  finding (broad triggers 4/5 vs narrow 2/5) suggests broad wins, but
+  per-task skills weren't tested on this corpus.
+- **Why the both arm regresses on no-skill-match tasks.** Median wiki
+  reads is identical between skills and both (2 + 0). The cost penalty
+  is purely output-token-driven. A trace-level inspection of agent
+  responses on archive/skip-family tasks would reveal whether the agent
+  is citing nearby guidelines without reading them, or whether the
+  presence of guidelines in the index is changing how it phrases its
+  answer.
+- **~~Pruning experiment~~ — answered in §8.** Pruned arm (3 skills +
+  9 no-skill-coverage atomics) does NOT beat skills-only. Skills-only
+  still wins on aggregate. The both-arm penalty is composition-driven,
+  not index-size-driven.
+
+- **Why does the pruned arm regress on skill-match tasks?** Pruning
+  should be neutral or positive on tasks WITH a matching skill — the
+  skill is unaffected and the index is smaller. Yet image and lens-model
+  still regress vs both (+9%/+11% corrected; +28%/+79% originally). A trace-level
+  inspection of t1-lens-model trial 1 (which alone cost $0.488 in
+  pruned vs $0.36 in skills) might reveal whether the agent is
+  reading the surviving atomics out of curiosity or whether something
+  about the AGENTS.md / index format changes its decision path.
diff --git a/explorations/agent-wiki/experiments/harness/experiment_wiki_consult.py b/explorations/agent-wiki/experiments/harness/experiment_wiki_consult.py
new file mode 100644
index 00000000..8682ea1b
--- /dev/null
+++ b/explorations/agent-wiki/experiments/harness/experiment_wiki_consult.py
@@ -0,0 +1,450 @@
+#!/usr/bin/env python3
+# mypy: ignore-errors
+# Exploration/reference code — not type-checked to the project standard.
+"""A/B experiment: does pointing an agent at AGENTS.md alter its behavior?
+
+Paired design (utt1 → wiki → utt2):
+
+- utt1 produces a small focal-length-extraction trajectory (reused from
+  trajectory data; see Phase A in the plan file).
+- wiki-example/ is a fresh single-trajectory wiki built from utt1's
+  extracted guidelines. It contains AGENTS.md, _index.jsonl, 4 atomic
+  guidelines, 1 summary.
+- utt2 = "what lens model was used for @sample.jpg" — same image, related
+  but different EXIF field. The wiki should help the agent bridge to
+  LensModel (tag 0xA434) via the same Exif sub-IFD it documented for
+  focal length.
+
+For each condition (baseline, skill, prompt, claude_md), run N trials
+in a fresh sandbox container and score three binary signals:
+
+- read_agents_md: trajectory contains a Read (or a Bash cat/grep/…) of AGENTS.md
+- cited_guideline: agent's final response mentions a guideline title or
+  a key wiki concept (0xA434, 0x8769, ExifIFD)
+- outcome_match: response contains "Google Pixel 4a Rear Wide Camera"
+
+Usage:
+    uv run python tests/e2e/experiment_wiki_consult.py \\
+        --conditions baseline,skill,prompt,claude_md \\
+        --trials 3
+"""
+
+from __future__ import annotations
+
+import argparse
+import datetime
+import json
+import os
+import re
+import shutil
+import subprocess
+import sys
+import time
+from pathlib import Path
+
+import yaml
+
+REPO_ROOT = Path(__file__).resolve().parents[2]  # grandparent of this file's directory. NOTE(review): confirm it holds demo/, tests/ and the wiki
+SANDBOX_IMAGE = "claude-sandbox"  # image name handed to `docker run`
+TIMEOUT_SECONDS = 300  # per-trial limit passed to subprocess.run
+FORWARDED_ENV_VARS = (  # forwarded with `-e NAME` when set to a non-empty value on the host
+    "ANTHROPIC_API_KEY",
+    "ANTHROPIC_AUTH_TOKEN",
+    "ANTHROPIC_BASE_URL",
+    "CLAUDE_MODEL",
+    "CLAUDE_CODE_DISABLE_EXPERIMENTAL_BETAS",
+    "CLAUDE_CODE_SKIP_BEDROCK_AUTH",
+)
+WIKI_NAME = "wiki-example"  # default; overridable via --wiki on the CLI
+
+# Test wiki must already exist (built by Phase A of the plan); it is copied into every workspace.
+WIKI_SRC = REPO_ROOT / WIKI_NAME
+
+# Plugins to mount. NOTE(review): not referenced anywhere else in this module.
+PLUGINS_DIR = REPO_ROOT / "platform-integrations" / "claude" / "plugins"
+
+# Demo workspace (sample.jpg only; no .evolve/entities/ confound).
+DEMO_WORKSPACE = REPO_ROOT / "demo" / "workspace"
+
+
+_STRONG_HINT = (  # captures WIKI_NAME at import time; main() rebuilds it when --wiki is given
+    f"Before doing anything else in this workspace, read `{WIKI_NAME}/AGENTS.md` "
+    f"and follow its retrieval recipe to find guidelines that apply to your task. "
+    f"Cite the guidelines you used."
+)
+
+
+_CODEBASE_SEED = {  # relative path -> file text; written under the workspace by _seed_codebase()
+    "src/__init__.py": "",
+    "src/parser.py": (
+        '"""Parser module for csv and json inputs."""\n'
+        "\n"
+        "def parse_csv(text: str) -> list:\n"
+        "    # TODO: handle nested quotes properly\n"
+        "    return [row.split(',') for row in text.splitlines()]\n"
+        "\n"
+        "def parse_json(text: str):\n"
+        "    import json\n"
+        "    return json.loads(text)\n"
+    ),
+    "src/writer.py": (
+        "from .parser import parse_csv\n\ndef write_csv(rows):\n    return '\\n'.join(','.join(map(str, r)) for r in rows)\n"
+    ),
+    "src/api.py": (
+        "from .parser import parse_csv, parse_json\n"
+        "\n"
+        "def fetch_and_parse(text: str, fmt: str):\n"
+        "    return parse_csv(text) if fmt == 'csv' else parse_json(text)\n"
+    ),
+    "tests/__init__.py": "",
+    "tests/test_parser.py": (
+        "from src.parser import parse_csv\n\ndef test_parse_basic():\n    assert parse_csv('a,b\\nc,d') == [['a','b'], ['c','d']]\n"
+    ),
+    "README.md": ("# demo\n\nSmall Python project under `src/` with tests under `tests/`.\n"),
+}
+
+
+def _seed_codebase(ws: Path) -> None:
+    """Write every `_CODEBASE_SEED` entry under `ws`, creating parent directories as needed."""
+    for rel_path, body in _CODEBASE_SEED.items():
+        (ws / rel_path).parent.mkdir(parents=True, exist_ok=True)
+        (ws / rel_path).write_text(body, encoding="utf-8")
+
+
+def _seed_format_group(ws: Path, group: str) -> list[str]:
+    """Populate `ws` with the sample files of one format group.
+
+    `group` is `image-formats`, `archive-formats` or `text-formats`; the work
+    is delegated to `seed_into` from the sibling `_format_samples.py`."""
+    import _format_samples  # imported lazily — the script lives next door
+    return _format_samples.seed_into(ws, group)
+
+
+def make_workspace(tmp_root: Path, condition: str, seed: str | None = None) -> Path:
+    """Assemble the per-run workspace under `tmp_root` and return its path.
+
+    Copies the demo workspace (minus `.evolve`, `backup`, `sandbox-backup`),
+    mounts the wiki as `WIKI_NAME`, writes the condition's CLAUDE.md if it
+    has one, and applies the optional task seed."""
+    workspace = tmp_root / "workspace"
+    skip = shutil.ignore_patterns(".evolve", "backup", "sandbox-backup")
+    shutil.copytree(DEMO_WORKSPACE, workspace, ignore=skip)
+    shutil.copytree(WIKI_SRC, workspace / WIKI_NAME)
+    claude_md_text = {
+        "claude_md": f"Before non-trivial tasks in this repo, consult `{WIKI_NAME}/AGENTS.md` for relevant guidelines.\n",
+        "claude_md_strong": _STRONG_HINT + "\n",
+    }.get(condition)  # None for every other condition: no CLAUDE.md is written
+    if claude_md_text is not None:
+        (workspace / "CLAUDE.md").write_text(claude_md_text, encoding="utf-8")
+    if seed == "codebase":
+        _seed_codebase(workspace)
+    elif seed in ("image-formats", "archive-formats", "text-formats"):
+        _seed_format_group(workspace, seed)
+    return workspace
+
+
+def build_prompt(condition: str, base_prompt: str) -> str:
+    """Prefix `base_prompt` for the `skill` and `prompt` conditions; return it unchanged otherwise."""
+    if condition == "prompt":
+        return _STRONG_HINT + " " + base_prompt
+    lead = "Use any skills that may help. " if condition == "skill" else ""
+    return lead + base_prompt
+
+
+_HINT_PLUGIN = REPO_ROOT / "tests" / "e2e" / "_wiki_hint_plugin"  # mounted at /plugins/_wiki_hint for the session_hook condition
+
+
+def run_sandbox(workspace: Path, prompt: str, condition: str) -> dict:
+    """Run a single sandbox session; return {stdout, stderr, returncode, duration_s}.
+
+    Per condition extras:
+    - `system_prompt`: pass `--append-system-prompt` with the strong hint.
+    - `session_hook`:  mount _wiki_hint_plugin which fires a SessionStart
+      hook printing the strong hint.
+
+    Other conditions don't pass `--plugin-dir` (avoids the evolve-lite recall
+    hook + recall skill confound). Trajectory comes from
+    `--output-format stream-json` on stdout (one event per line).
+    subprocess.TimeoutExpired propagates once TIMEOUT_SECONDS is exceeded.
+    """
+    import shlex  # local import: only this function needs shell quoting
+
+    cmd = ["docker", "run", "--rm"]
+    for var in FORWARDED_ENV_VARS:
+        if os.environ.get(var):
+            cmd += ["-e", var]
+    cmd += ["-v", f"{workspace}:/workspace"]
+    claude_extras = ""
+    if condition == "session_hook":
+        cmd += ["-v", f"{_HINT_PLUGIN}:/plugins/_wiki_hint"]
+        claude_extras = "--plugin-dir /plugins/_wiki_hint "
+    if condition == "system_prompt":
+        claude_extras = f"--append-system-prompt {shlex.quote(_STRONG_HINT)} "
+    # shlex.quote, not json.dumps: inside the double quotes json.dumps emits,
+    # bash would still expand the hint's backticks (and any `$`) as commands.
+    quoted_prompt = shlex.quote(prompt)
+    claude_cmd = f"claude {claude_extras}--dangerously-skip-permissions --output-format stream-json --verbose -p {quoted_prompt}"
+    cmd += [SANDBOX_IMAGE, "bash", "-c", claude_cmd]
+    t0 = time.time()
+    proc = subprocess.run(cmd, capture_output=True, text=True, timeout=TIMEOUT_SECONDS)
+    return {
+        "returncode": proc.returncode,
+        "stdout": proc.stdout,
+        "stderr": proc.stderr,
+        "duration_s": round(time.time() - t0, 2),
+    }
+
+
+def parse_stream_json(stdout: str) -> tuple[list[str], str, list[dict]]:
+    """Parse `claude -p --output-format stream-json --verbose` output.
+
+    Returns (wiki_access_paths, assistant_text, all_events).
+
+    `wiki_access_paths` holds every non-empty `file_path` passed to the Read
+    tool plus, for Bash commands, each AGENTS.md or `<WIKI_NAME>/...` path that
+    follows cat/less/head/tail/more/grep/sed — agents often `cat` wiki files
+    instead of using Read. Non-JSON and non-object lines are skipped.
+    """
+    access_paths: list[str] = []
+    chunks: list[str] = []
+    events: list[dict] = []
+    # Built per call from the current WIKI_NAME so a --wiki override is honoured.
+    bash_pat = re.compile(
+        r"\b(?:cat|less|head|tail|more|grep|sed)\b[^|;]*?(\S*?(?:AGENTS\.md|" + re.escape(WIKI_NAME) + r"/[A-Za-z0-9_./-]+))",
+    )
+    for line in stdout.splitlines():
+        line = line.strip()
+        if not line:
+            continue
+        try:
+            event = json.loads(line)
+        except json.JSONDecodeError:
+            continue
+        if not isinstance(event, dict):  # valid JSON, but not an event object
+            continue
+        events.append(event)
+        if event.get("type") != "assistant":
+            continue
+        content = (event.get("message") or {}).get("content")
+        if isinstance(content, str):
+            chunks.append(content)
+            continue
+        # Missing or non-list content yields no blocks to inspect.
+        for b in content if isinstance(content, list) else []:
+            if not isinstance(b, dict):
+                continue
+            if b.get("type") == "text":
+                chunks.append(b.get("text", ""))
+            elif b.get("type") == "tool_use":
+                name = b.get("name")
+                inp = b.get("input") or {}
+                if name == "Read" and inp.get("file_path"):
+                    access_paths.append(inp["file_path"])
+                elif name == "Bash":
+                    command = inp.get("command") or ""
+                    access_paths.extend(m.group(1) for m in bash_pat.finditer(command))
+    return access_paths, "\n".join(chunks), events
+
+
+def score(access_paths: list[str], assistant_text: str, task: dict) -> dict:
+    """Derive the three boolean trial signals; text matching is case-insensitive.
+
+    read_agents_md: an accessed path contains "AGENTS.md". cited_guideline: any
+    `expected_guideline_filenames` / `outcome_match_any` entry is in the text.
+    outcome_match: every `outcome_match_all` entry is (True if there are none)."""
+    haystack = assistant_text.lower()
+
+    def hits(key: str):
+        return (needle.lower() in haystack for needle in task.get(key) or [])
+
+    saw_agents_md = any("AGENTS.md" in path for path in access_paths)
+    by_filename = any(hits("expected_guideline_filenames"))
+    by_concept = any(hits("outcome_match_any"))
+    return {
+        "read_agents_md": saw_agents_md,
+        "cited_guideline": by_filename or by_concept,
+        "outcome_match": all(hits("outcome_match_all")),
+    }
+
+
+def main(argv: list[str] | None = None) -> int:
+    """Run every task x condition x trial, log rows to runs.jsonl, write summary.md; return the exit code."""
+    # Declare upfront because --wiki may rebind these later in this function.
+    global WIKI_NAME, WIKI_SRC, _STRONG_HINT
+    import statistics  # local import: only the summary table below needs it
+    parser = argparse.ArgumentParser()
+    # `skill` condition is omitted: the agent-wiki/ family is not registered
+    # as a plugin skill in evolve-lite's plugin.json (which only declares
+    # ./skills/evolve-lite/). Loading the plugin to register it would also
+    # pull in the recall hook + recall skill, which confound the test.
+    parser.add_argument(
+        "--conditions",
+        default="baseline,prompt,claude_md",
+        help="Comma-separated condition slugs. "
+        "Available: baseline, prompt, claude_md, claude_md_strong, "
+        "system_prompt, session_hook. (skill condition deferred — "
+        "agent-wiki/* not registered as plugin skills.)",
+    )
+    parser.add_argument("--trials", type=int, default=3, help="Trials per condition")
+    parser.add_argument("--task", default="t1-lens-model", help="Task id (or comma-separated task ids) from wiki_consult_tasks.yaml")
+    parser.add_argument("--wiki", default=None, help=f"Wiki dir to mount at /workspace/<name>/. Default: {WIKI_NAME}")
+    parser.add_argument("--out-root", default="experiments/results", help="Where to write the results dir")
+    parser.add_argument("--keep-workspaces", action="store_true", help="Don't delete per-run workspaces (debug)")
+    args = parser.parse_args(argv)
+
+    # Allow --wiki to override the module-level constants. _STRONG_HINT is
+    # already a module global that captures WIKI_NAME at import time, so
+    # rebuild it whenever we override.
+    if args.wiki:
+        WIKI_NAME = args.wiki
+        WIKI_SRC = REPO_ROOT / WIKI_NAME
+        _STRONG_HINT = (
+            f"Before doing anything else in this workspace, read `{WIKI_NAME}/AGENTS.md` "
+            f"and follow its retrieval recipe to find guidelines that apply to your task. "
+            f"Cite the guidelines you used."
+        )
+
+    if not WIKI_SRC.is_dir():
+        print(f"error: {WIKI_SRC} does not exist. Run Phase A first.", file=sys.stderr)
+        return 2
+
+    # Load tasks (--task may be comma-separated)
+    tasks_file = REPO_ROOT / "tests" / "e2e" / "wiki_consult_tasks.yaml"
+    tasks = {t["id"]: t for t in yaml.safe_load(tasks_file.read_text(encoding="utf-8"))}
+    task_ids = [t.strip() for t in args.task.split(",") if t.strip()]
+    for tid in task_ids:
+        if tid not in tasks:
+            print(f"error: task {tid!r} not found in {tasks_file}", file=sys.stderr)
+            return 2
+
+    conditions = [c.strip() for c in args.conditions.split(",") if c.strip()]
+    valid = {"baseline", "skill", "prompt", "claude_md", "claude_md_strong", "system_prompt", "session_hook"}
+    for c in conditions:
+        if c not in valid:
+            print(f"error: unknown condition {c!r}; valid: {sorted(valid)}", file=sys.stderr)
+            return 2
+
+    ts = datetime.datetime.now(datetime.timezone.utc).strftime("%Y%m%dT%H%M%SZ")
+    out_dir = REPO_ROOT / args.out_root / f"wiki-consult-{ts}"
+    out_dir.mkdir(parents=True, exist_ok=True)
+    transcripts_dir = out_dir / "transcripts"
+
+    runs_path = out_dir / "runs.jsonl"
+    print(f"writing results to {out_dir}", file=sys.stderr)
+    print(f"conditions: {conditions}, trials: {args.trials}, tasks: {task_ids}", file=sys.stderr)
+
+    summary: dict[tuple[str, str], list[dict]] = {(t, c): [] for t in task_ids for c in conditions}
+    with runs_path.open("w", encoding="utf-8") as runs_f:  # closed even if a trial raises part-way
+        for tid in task_ids:
+            task = tasks[tid]
+            seed = task.get("seed")
+            for condition in conditions:
+                for trial in range(1, args.trials + 1):
+                    print(f"\n=== {tid} / {condition} / trial {trial}/{args.trials} ===", file=sys.stderr)
+                    tmp_root = out_dir / "_workspaces" / f"{tid}-{condition}-t{trial}"
+                    tmp_root.mkdir(parents=True, exist_ok=True)
+                    ws = make_workspace(tmp_root, condition, seed=seed)
+                    prompt = build_prompt(condition, task["prompt"])
+                    try:
+                        run = run_sandbox(ws, prompt, condition)
+                    except subprocess.TimeoutExpired:
+                        print(f"  ✗ TIMEOUT after {TIMEOUT_SECONDS}s — skipping this trial", file=sys.stderr)
+                        runs_f.write(
+                            json.dumps(
+                                {
+                                    "task": tid,
+                                    "condition": condition,
+                                    "trial": trial,
+                                    "duration_s": TIMEOUT_SECONDS,
+                                    "returncode": None,
+                                    "read_agents_md": False,
+                                    "cited_guideline": False,
+                                    "outcome_match": False,
+                                    "access_paths_n": 0,
+                                    "assistant_text_len": 0,
+                                    "timed_out": True,
+                                }
+                            )
+                            + "\n"
+                        )
+                        runs_f.flush()
+                        if not args.keep_workspaces:
+                            shutil.rmtree(tmp_root, ignore_errors=True)
+                        continue  # logged in runs.jsonl but left out of the summary table
+                    access_paths, assistant_text, events = parse_stream_json(run["stdout"])
+                    sig = score(access_paths, assistant_text, task)
+                    row = {
+                        "task": tid,
+                        "condition": condition,
+                        "trial": trial,
+                        "duration_s": run["duration_s"],
+                        "returncode": run["returncode"],
+                        **sig,
+                        "access_paths_n": len(access_paths),
+                        "assistant_text_len": len(assistant_text),
+                    }
+                    runs_f.write(json.dumps(row) + "\n")
+                    runs_f.flush()
+                    summary[(tid, condition)].append(row)
+                    print(
+                        f"  read_agents_md={sig['read_agents_md']}  "
+                        f"cited_guideline={sig['cited_guideline']}  "
+                        f"outcome_match={sig['outcome_match']}  "
+                        f"({run['duration_s']:.0f}s)",
+                        file=sys.stderr,
+                    )
+                    # Stash the stream-json output for spot-checks
+                    dst_dir2 = transcripts_dir / tid / condition
+                    dst_dir2.mkdir(parents=True, exist_ok=True)
+                    (dst_dir2 / f"trial-{trial}.jsonl").write_text(run["stdout"], encoding="utf-8")
+                    if run["returncode"] != 0:
+                        (dst_dir2 / f"trial-{trial}.stderr.txt").write_text(run["stderr"], encoding="utf-8")
+                    if not args.keep_workspaces:
+                        shutil.rmtree(tmp_root, ignore_errors=True)
+
+    # Render summary.md (one section per task)
+    md_lines = [f"# Wiki-consult experiment — {ts}", ""]
+    for tid in task_ids:
+        task = tasks[tid]
+        md_lines += [
+            f"## Task `{tid}` — {task['prompt']!r}",
+            "",
+            f"Trials per condition: **{args.trials}**",
+            "",
+            "| Condition  | read AGENTS.md | cited guideline | outcome match | median runtime (s) |",
+            "|------------|:--------------:|:---------------:|:-------------:|-------------------:|",
+        ]
+        for condition in conditions:
+            rows = summary[(tid, condition)]
+            n = len(rows)
+            if n == 0:
+                continue
+            rd = sum(r["read_agents_md"] for r in rows)
+            ct = sum(r["cited_guideline"] for r in rows)
+            om = sum(r["outcome_match"] for r in rows)
+            durs = [r["duration_s"] for r in rows]
+            median = statistics.median(durs)  # true median, also for an even number of rows
+            md_lines.append(f"| {condition:<10} | {rd}/{n} | {ct}/{n} | {om}/{n} | {median:.0f} |")
+        md_lines.append("")
+    md_lines.extend(
+        [
+            "",
+            "Signals:",
+            "",
+            "- **read AGENTS.md**: agent's trajectory contains a `Read` of `AGENTS.md`.",
+            "- **cited guideline**: agent's text contains an expected guideline filename or wiki concept (e.g. `0xA434`, `0x8769`, `ExifIFD`).",
+            "- **outcome match**: agent's text contains all required substrings — for the lens-model task, the answer `Google Pixel 4a Rear Wide Camera`.",
+            "",
+            f"Runs JSONL: `{runs_path.relative_to(REPO_ROOT)}`",
+            f"Transcripts: `{transcripts_dir.relative_to(REPO_ROOT)}/`",
+        ]
+    )
+    (out_dir / "summary.md").write_text("\n".join(md_lines) + "\n", encoding="utf-8")
+
+    print(f"\nwrote {runs_path}", file=sys.stderr)
+    print(f"wrote {out_dir / 'summary.md'}", file=sys.stderr)
+    return 0
+
+
+if __name__ == "__main__":
+    raise SystemExit(main())
diff --git a/explorations/agent-wiki/experiments/harness/extract_trial_metrics.py b/explorations/agent-wiki/experiments/harness/extract_trial_metrics.py
new file mode 100644
index 00000000..0fddd3f5
--- /dev/null
+++ b/explorations/agent-wiki/experiments/harness/extract_trial_metrics.py
@@ -0,0 +1,146 @@
+#!/usr/bin/env python3
+# mypy: ignore-errors
+# Exploration/reference code — not type-checked to the project standard.
+"""Extract per-trial metrics from a stream-json transcript.
+
+Pulls token counts from `assistant.usage` events + the terminal `result`
+event. Counts tool calls and wiki-page reads. Used by the two-batch
+experiment to build the with-wiki vs without-wiki comparison.
+
+Usage:
+    uv run python scripts/extract_trial_metrics.py \\
+        --transcript path/to/trial-1.jsonl --task t6-png-dim --batch 1 \\
+        --trial 1 --condition claude_md_strong [--outcome-match-all '...']
+
+Emits one JSON object on stdout. Pipe to a .jsonl file for aggregation.
+"""
+
+from __future__ import annotations
+
+import argparse
+import json
+import re
+from pathlib import Path
+
+
+def parse(transcript: Path) -> dict:
+    """Aggregate per-trial metrics from one stream-json transcript.
+
+    Token usage is summed over `assistant` events; duration, cost and final text
+    come from `result`. Blank, non-JSON and non-object lines are skipped."""
+    sid, final_text = "?", ""
+    duration_ms, total_cost_usd = 0, 0.0
+    in_tokens = cache_creation = cache_read = out_tokens = 0
+    tool_calls = wiki_reads = guideline_reads = 0  # wiki_reads: AGENTS.md / _index.jsonl / guidelines/*.md
+    agents_md_read = index_read = False
+    for ln in transcript.read_text(encoding="utf-8").splitlines():
+        try:
+            e = json.loads(ln) if ln.strip() else None
+        except json.JSONDecodeError:  # stray non-JSON output captured with the stream
+            continue
+        t = e.get("type") if isinstance(e, dict) else None  # non-object values match no branch
+        if t == "system" and e.get("subtype") == "init":
+            sid = e.get("session_id") or sid
+        elif t == "assistant":
+            usage = (e.get("message") or {}).get("usage") or {}
+            in_tokens += int(usage.get("input_tokens", 0) or 0)
+            cache_creation += int(usage.get("cache_creation_input_tokens", 0) or 0)
+            cache_read += int(usage.get("cache_read_input_tokens", 0) or 0)
+            out_tokens += int(usage.get("output_tokens", 0) or 0)
+            for b in (e.get("message") or {}).get("content") or []:
+                if not isinstance(b, dict):
+                    continue
+                if b.get("type") == "text":
+                    final_text = b.get("text") or final_text
+                elif b.get("type") == "tool_use":
+                    tool_calls += 1
+                    name = b.get("name")
+                    inp = b.get("input") or {}
+                    if name == "Read":
+                        fp = inp.get("file_path", "")
+                        if "AGENTS.md" in fp:
+                            agents_md_read = True
+                            wiki_reads += 1
+                        elif "_index.jsonl" in fp:
+                            index_read = True
+                            wiki_reads += 1
+                        elif "/guidelines/" in fp and fp.endswith(".md"):
+                            guideline_reads += 1
+                            wiki_reads += 1
+                    elif name == "Bash":
+                        cmd = inp.get("command", "") or ""
+                        if "AGENTS.md" in cmd:
+                            agents_md_read = True
+                            wiki_reads += 1
+                        if "_index.jsonl" in cmd:
+                            index_read = True
+                            wiki_reads += 1
+                        if re.search(r"/guidelines/[\w./-]+\.md", cmd):
+                            guideline_reads += 1
+                            wiki_reads += 1
+        elif t == "result":
+            duration_ms = int(e.get("duration_ms") or 0)
+            total_cost_usd = float(e.get("total_cost_usd") or 0.0)
+            final_text = e.get("result") or final_text
+
+    return {
+        "session_id": sid,
+        "duration_s": round(duration_ms / 1000, 2),
+        "total_cost_usd": total_cost_usd,
+        "input_tokens": in_tokens,
+        "cache_creation_input_tokens": cache_creation,
+        "cache_read_input_tokens": cache_read,
+        "output_tokens": out_tokens,
+        "billable_tokens_proxy": in_tokens + cache_creation + out_tokens,  # cache reads are cheap
+        "tool_calls": tool_calls,
+        "wiki_reads_total": wiki_reads,
+        "agents_md_read": agents_md_read,
+        "index_read": index_read,
+        "guideline_reads": guideline_reads,
+        "final_text_len": len(final_text or ""),
+    }
+
+
+def score_outcome(text: str, must_all: list[str]) -> bool:
+    haystack = (text or "").lower()  # falsy `text` (None, "") is matched as the empty string
+    return not any(needle.lower() not in haystack for needle in must_all)  # case-insensitive "contains all"
+
+
+def main() -> int:
+    """CLI entry point: print one JSON metrics record for a transcript to stdout; returns 0."""
+    ap = argparse.ArgumentParser()
+    ap.add_argument("--transcript", required=True)
+    ap.add_argument("--task", required=True)
+    ap.add_argument("--batch", required=True)
+    ap.add_argument("--condition", default="claude_md_strong")
+    ap.add_argument("--trial", required=True)
+    ap.add_argument("--outcome-match-all", default="", help="Comma-separated must-all-substrings for outcome_match")
+    args = ap.parse_args()
+
+    transcript = Path(args.transcript)
+    rec = parse(transcript)
+    rec.update(task=args.task, batch=int(args.batch), condition=args.condition, trial=int(args.trial))
+
+    needles = [part.strip() for part in args.outcome_match_all.split(",") if part.strip()]
+    # Score the first `result` event's text; without one, the last assistant text block.
+    answer = ""
+    for raw in transcript.read_text(encoding="utf-8").splitlines():
+        if not raw.strip():
+            continue
+        event = json.loads(raw)
+        kind = event.get("type")
+        if kind == "result":
+            answer = event.get("result") or ""
+            break
+        if kind == "assistant":
+            for block in (event.get("message") or {}).get("content") or []:
+                if isinstance(block, dict) and block.get("type") == "text":
+                    answer = block.get("text") or answer
+    rec["outcome_match"] = score_outcome(answer, needles) if needles else None
+
+    print(json.dumps(rec))
+    return 0
+
+
+if __name__ == "__main__":
+    raise SystemExit(main())
diff --git a/explorations/agent-wiki/experiments/harness/fiveway_compare.py b/explorations/agent-wiki/experiments/harness/fiveway_compare.py
new file mode 100644
index 00000000..1aabd15a
--- /dev/null
+++ b/explorations/agent-wiki/experiments/harness/fiveway_compare.py
@@ -0,0 +1,298 @@
+#!/usr/bin/env python3
+# mypy: ignore-errors
+# Exploration/reference code — not type-checked to the project standard.
+"""Five-way comparison: empty / guidelines / skills / both / pruned.
+
+Reads four metrics files:
+  ../metrics/twobatch.metrics.jsonl          (twobatch — batch 1 = empty, batch 2 = guidelines)
+  ../metrics/twobatch-skills.metrics.jsonl   (skills arm)
+  ../metrics/twobatch-both.metrics.jsonl     (both arm)
+  ../metrics/pruned-fixed-9atomic.metrics.jsonl   (pruned arm: skills + only no-skill-coverage atomics)
+"""
+
+from __future__ import annotations
+
+import argparse
+import json
+import statistics
+from collections import defaultdict
+from pathlib import Path
+
+REPO = Path(__file__).resolve().parents[1]  # parent of this file's directory. NOTE(review): unused in the visible part of the module
+
+TASK_IDS_ORDER = [  # NOTE(review): presumably the row order for per-task output — its use is outside the visible code
+    "t1-lens-model",
+    "t6-png-dim",
+    "t7-gif-dim",
+    "t8-bmp-info",
+    "t9-webp-dim",
+    "t10-zip-list",
+    "t11-tar-list",
+    "t12-wav-info",
+    "t13-gzip-dec",
+    "t14-csv-quoted",
+    "t15-jsonl-kinds",
+    "t16-ini-key",
+    "t17-log-errors",
+    "t2-imports",
+    "t3-todos",
+    "t5-base64",
+]
+
+FAMILY = {  # task id -> family label used to group the "By task family" table
+    "t1-lens-model": "lens-model",
+    "t6-png-dim": "image",
+    "t7-gif-dim": "image",
+    "t8-bmp-info": "image",
+    "t9-webp-dim": "image",
+    "t10-zip-list": "archive",
+    "t11-tar-list": "archive",
+    "t12-wav-info": "archive",
+    "t13-gzip-dec": "archive",
+    "t14-csv-quoted": "text",
+    "t15-jsonl-kinds": "text",
+    "t16-ini-key": "text",
+    "t17-log-errors": "text",
+    "t2-imports": "skip",
+    "t3-todos": "skip",
+    "t5-base64": "skip",
+}
+
+ARMS = ("empty", "guidelines", "skills", "both", "pruned")  # arm labels, in table-column order
+
+
+def median(xs):
+    present = list(filter(lambda v: v is not None, xs))  # None entries are dropped before taking the median
+    return None if not present else statistics.median(present)
+
+
+def acc(rs):
+    scored = [bool(r["outcome_match"]) for r in rs if r.get("outcome_match") is not None]  # unscored rows are excluded
+    return scored.count(True) / len(scored) if scored else None
+
+
+def fmt(x, kind="num"):
+    """Render a metric for a table cell; None becomes an em dash and unknown kinds use one decimal."""
+    if x is None:
+        return "—"
+    renderers = {
+        "tokens": lambda v: f"{int(v):,}",
+        "dollars": lambda v: f"${v:.4f}",
+        "duration": lambda v: f"{v:.0f}s",
+        "pct": lambda v: f"{v:.0%}",
+    }
+    render = renderers.get(kind, lambda v: f"{v:.1f}")
+    return render(x)
+
+
+def delta(base, other, kind="num"):
+    """Signed `other - base` (plus relative change) for a table cell; "—" if a value is None or base is 0."""
+    if base is None or other is None or base == 0:
+        return "—"
+    diff = other - base
+    sign = "+" if diff >= 0 else ""
+    rel = f"{sign}{diff / base:.0%}"
+    if kind == "tokens":
+        return f"{sign}{int(diff):,} ({rel})"
+    if kind == "duration":
+        return f"{sign}{diff:.0f}s ({rel})"
+    if kind == "dollars":
+        # The sign precedes the currency symbol: "-$0.0100", not "$-0.0100".
+        return f"{'+' if diff >= 0 else '-'}${abs(diff):.4f} ({rel})"
+    return rel if kind == "pct" else f"{sign}{diff:.1f}"
+
+
+def main() -> int:
+    ap = argparse.ArgumentParser()
+    ap.add_argument("--twobatch-metrics", default="../metrics/twobatch.metrics.jsonl")
+    ap.add_argument("--skills-metrics", default="../metrics/twobatch-skills.metrics.jsonl")
+    ap.add_argument("--both-metrics", default="../metrics/twobatch-both.metrics.jsonl")
+    # Corrected pruned arm: re-run against a fixed (index-refreshed) wiki.
+    # The original experiments/results-twobatch-pruned/ ran against a stale
+    # index (0 skills exposed, 6 broken links) — see the Correction note.
+    ap.add_argument("--pruned-metrics", default="../metrics/pruned-fixed-9atomic.metrics.jsonl")
+    ap.add_argument("--out", default="experiments/twobatch-fiveway-comparison.md")
+    args = ap.parse_args()
+
+    rows: list[dict] = []
+    for line in Path(args.twobatch_metrics).read_text().splitlines():
+        if not line.strip():
+            continue
+        r = json.loads(line)
+        r["arm"] = "empty" if r["batch"] == 1 else "guidelines"
+        rows.append(r)
+    for arm, path in (("skills", args.skills_metrics), ("both", args.both_metrics), ("pruned", args.pruned_metrics)):
+        p = Path(path)
+        if not p.exists():
+            continue
+        for line in p.read_text().splitlines():
+            if not line.strip():
+                continue
+            r = json.loads(line)
+            r["arm"] = arm
+            rows.append(r)
+
+    by_task: dict[str, dict[str, list[dict]]] = defaultdict(lambda: defaultdict(list))
+    for r in rows:
+        by_task[r["task"]][r["arm"]].append(r)
+
+    by_arm = {a: [r for r in rows if r["arm"] == a] for a in ARMS}
+
+    md: list[str] = []
+    md.append("# Five-way wiki-helps comparison: empty / guidelines / skills / both / pruned")
+    md.append("")
+    md.append(
+        "Same 16-task corpus, five arms, all `claude_md_strong` condition. "
+        "Empty + guidelines arms are twobatch's batch-1 / batch-2. Skills arm "
+        "is twobatch-skills (3 skills, no guidelines). Both arm is "
+        "twobatch-both (those same 3 skills + ~15 atomics, no clusters). "
+        "**Pruned arm** is twobatch-pruned: same 3 skills + only the "
+        "no-skill-coverage atomics (delete-on-promote policy applied — "
+        "image-format and CSV atomics archived because their corresponding "
+        "skills were synthesized)."
+    )
+    md.append("")
+
+    md.append("## Aggregate")
+    md.append("")
+    md.append("| Metric | Empty | Guidelines | Skills | Both | Pruned | P vs G | P vs S | P vs B |")
+    md.append("|---|---:|---:|---:|---:|---:|---:|---:|---:|")
+    pairs = [
+        ("Trials", "len", "num"),
+        ("Accuracy (mean)", "_acc", "pct"),
+        ("Median duration", "duration_s", "duration"),
+        ("Median input tokens", "input_tokens", "tokens"),
+        ("Median output tokens", "output_tokens", "tokens"),
+        ("Median total cost USD", "total_cost_usd", "dollars"),
+        ("Median tool calls", "tool_calls", "num"),
+        ("Median wiki reads", "wiki_reads_total", "num"),
+        ("Median guideline reads", "guideline_reads", "num"),
+    ]
+    for label, field, kind in pairs:
+        vals = {}
+        for a in ARMS:
+            arm_rows = by_arm[a]
+            if field == "len":
+                vals[a] = len(arm_rows)
+            elif field == "_acc":
+                vals[a] = acc(arm_rows)
+            else:
+                vals[a] = median([r.get(field) for r in arm_rows])
+        if field == "len":
+            md.append(
+                f"| {label} | {vals['empty']} | {vals['guidelines']} | {vals['skills']} | "
+                f"{vals['both']} | {vals['pruned']} | "
+                f"{vals['pruned'] - vals['guidelines']:+d} | "
+                f"{vals['pruned'] - vals['skills']:+d} | "
+                f"{vals['pruned'] - vals['both']:+d} |"
+            )
+        else:
+            md.append(
+                f"| {label} | {fmt(vals['empty'], kind)} | {fmt(vals['guidelines'], kind)} | "
+                f"{fmt(vals['skills'], kind)} | {fmt(vals['both'], kind)} | "
+                f"{fmt(vals['pruned'], kind)} | "
+                f"{delta(vals['guidelines'], vals['pruned'], kind)} | "
+                f"{delta(vals['skills'], vals['pruned'], kind)} | "
+                f"{delta(vals['both'], vals['pruned'], kind)} |"
+            )
+    md.append("")
+
+    md.append("## By task family")
+    md.append("")
+    md.append("Median total_cost_usd. `Δ S→P` = `pruned` minus `skills`; `Δ B→P` = `pruned` minus `both`.")
+    md.append("")
+    md.append("| Family | Tasks | E acc | G acc | S acc | B acc | P acc | E $ | G $ | S $ | B $ | P $ | Δ S→P | Δ B→P |")
+    md.append("|---|---|---:|---:|---:|---:|---:|---:|---:|---:|---:|---:|---:|---:|")
+    fam_groups: dict[str, list[str]] = defaultdict(list)
+    for tid, fam in FAMILY.items():
+        fam_groups[fam].append(tid)
+    for fam, tids in fam_groups.items():
+        in_fam = {a: [r for r in rows if r["task"] in tids and r["arm"] == a] for a in ARMS}
+        cs = {a: median([r.get("total_cost_usd") for r in in_fam[a]]) for a in ARMS}
+        md.append(
+            f"| {fam} | {len(tids)} | "
+            f"{fmt(acc(in_fam['empty']), 'pct')} | {fmt(acc(in_fam['guidelines']), 'pct')} | "
+            f"{fmt(acc(in_fam['skills']), 'pct')} | {fmt(acc(in_fam['both']), 'pct')} | "
+            f"{fmt(acc(in_fam['pruned']), 'pct')} | "
+            f"{fmt(cs['empty'], 'dollars')} | {fmt(cs['guidelines'], 'dollars')} | "
+            f"{fmt(cs['skills'], 'dollars')} | {fmt(cs['both'], 'dollars')} | "
+            f"{fmt(cs['pruned'], 'dollars')} | "
+            f"{delta(cs['skills'], cs['pruned'], 'dollars')} | "
+            f"{delta(cs['both'], cs['pruned'], 'dollars')} |"
+        )
+    md.append("")
+
+    md.append("## Per task — cost USD")
+    md.append("")
+    md.append("| Task | E $ | G $ | S $ | B $ | P $ | Δ S→P | Δ B→P |")
+    md.append("|---|---:|---:|---:|---:|---:|---:|---:|")
+    for tid in TASK_IDS_ORDER:
+        if not by_task[tid]:
+            continue
+        cs = {a: median([r.get("total_cost_usd") for r in by_task[tid].get(a, [])]) for a in ARMS}
+        md.append(
+            f"| `{tid}` | {fmt(cs['empty'], 'dollars')} | {fmt(cs['guidelines'], 'dollars')} | "
+            f"{fmt(cs['skills'], 'dollars')} | {fmt(cs['both'], 'dollars')} | "
+            f"{fmt(cs['pruned'], 'dollars')} | "
+            f"{delta(cs['skills'], cs['pruned'], 'dollars')} | "
+            f"{delta(cs['both'], cs['pruned'], 'dollars')} |"
+        )
+    md.append("")
+
+    md.append("## Per task — accuracy")
+    md.append("")
+    md.append("| Task | E acc | G acc | S acc | B acc | P acc |")
+    md.append("|---|:-:|:-:|:-:|:-:|:-:|")
+    for tid in TASK_IDS_ORDER:
+        if not by_task[tid]:
+            continue
+        as_ = {a: acc(by_task[tid].get(a, [])) for a in ARMS}
+        md.append(
+            f"| `{tid}` | {fmt(as_['empty'], 'pct')} | {fmt(as_['guidelines'], 'pct')} | "
+            f"{fmt(as_['skills'], 'pct')} | {fmt(as_['both'], 'pct')} | "
+            f"{fmt(as_['pruned'], 'pct')} |"
+        )
+    md.append("")
+    md.append("## Notes")
+    md.append("")
+    md.append("- Empty + guidelines + skills + both columns reproduce the 4-way comparison.")
+    md.append(
+        "- Pruned column is the new arm, testing the **delete-on-promote** policy: "
+        "when `synthesize-skill` produces a skill, it inferentially archives the "
+        "atomic guidelines covered by the skill (via tag-superset, slug-keyword, or "
+        "format-identifier description match). Result: 3 skills + 9 atomics + 6 archived."
+    )
+    md.append(
+        '- The pruned arm is the experimental answer to the open question "if '
+        "'both' loses to 'skills-only', does 'skills + only the no-skill-coverage "
+        "guidelines' beat 'skills-only'?\" raised in §7 of RESULTS-SUMMARY.md."
+    )
+    md.append("")
+    md.append("### Correction — Pruned column is the re-run against a fixed index")
+    md.append("")
+    md.append(
+        "The original pruned arm (commit `8bcd713`) ran against a wiki whose "
+        "`_index.jsonl` was **stale**: `render-skill` archived the covered atomics "
+        "but never refreshed the indexes, so the wiki exposed **0 skills, 15 "
+        "guideline rows, 6 broken links**. Agents couldn't see the skills and fell "
+        "back to dangling guideline rows (original: median $0.181, 290 output "
+        "tokens, 3 wiki reads, 1 guideline read)."
+    )
+    md.append("")
+    md.append(
+        "Commit `2adc67a` fixed the builder to refresh the section indexes + "
+        "`_index.jsonl` after `render-skill`/`render-cluster` (with an integrity "
+        "assertion). This Pruned column is the full 16-task re-run against the "
+        "corrected wiki: median **$0.173**, ~225 output tokens, 2 wiki reads, **0** "
+        "guideline reads. Net: pruned moved from +1% to **-3% vs both** and from "
+        "+24% to **+18% vs skills**. Skills-only is still cheapest, but the apparent "
+        '"pruning is worse than both" result was largely the stale-index bug, not '
+        "the policy. See `pruned-index-hypothesis.md` for the slice-level diagnosis."
+    )
+    Path(args.out).write_text("\n".join(md) + "\n", encoding="utf-8")
+    print(f"wrote {args.out}", flush=True)
+    return 0
+
+
+if __name__ == "__main__":  # run only when executed as a script, not on import
+    raise SystemExit(main())  # main()'s return value becomes the process exit status
diff --git a/explorations/agent-wiki/experiments/harness/fourway_compare.py b/explorations/agent-wiki/experiments/harness/fourway_compare.py
new file mode 100644
index 00000000..27e526d9
--- /dev/null
+++ b/explorations/agent-wiki/experiments/harness/fourway_compare.py
@@ -0,0 +1,254 @@
+#!/usr/bin/env python3
+# mypy: ignore-errors
+# Exploration/reference code — not type-checked to the project standard.
+"""Four-way comparison: empty / guidelines / skills / both.
+
+Reads three metrics files:
+  ../metrics/twobatch.metrics.jsonl          (twobatch — batch 1 = empty, batch 2 = guidelines)
+  ../metrics/twobatch-skills.metrics.jsonl   (skills arm)
+  ../metrics/twobatch-both.metrics.jsonl     (both arm)
+"""
+
+from __future__ import annotations
+
+import argparse
+import json
+import statistics
+from collections import defaultdict
+from pathlib import Path
+
+REPO = Path(__file__).resolve().parents[1]  # parent of this script's directory; not referenced elsewhere in this file
+
+TASK_IDS_ORDER = [  # task ids in the row order used by the per-task tables
+    "t1-lens-model",
+    "t6-png-dim",
+    "t7-gif-dim",
+    "t8-bmp-info",
+    "t9-webp-dim",
+    "t10-zip-list",
+    "t11-tar-list",
+    "t12-wav-info",
+    "t13-gzip-dec",
+    "t14-csv-quoted",
+    "t15-jsonl-kinds",
+    "t16-ini-key",
+    "t17-log-errors",
+    "t2-imports",
+    "t3-todos",
+    "t5-base64",
+]
+
+FAMILY = {  # task id -> family label; groups rows in the "By task family" table
+    "t1-lens-model": "lens-model",
+    "t6-png-dim": "image",
+    "t7-gif-dim": "image",
+    "t8-bmp-info": "image",
+    "t9-webp-dim": "image",
+    "t10-zip-list": "archive",
+    "t11-tar-list": "archive",
+    "t12-wav-info": "archive",
+    "t13-gzip-dec": "archive",
+    "t14-csv-quoted": "text",
+    "t15-jsonl-kinds": "text",
+    "t16-ini-key": "text",
+    "t17-log-errors": "text",
+    "t2-imports": "skip",
+    "t3-todos": "skip",
+    "t5-base64": "skip",
+}
+
+ARMS = ("empty", "guidelines", "skills", "both")  # arm keys that every table in main() iterates over
+
+
+def median(xs):  # median of the non-None entries of xs; None when none remain
+    present = [v for v in xs if v is not None]
+    return None if not present else statistics.median(present)
+
+
+def acc(rs):  # share of rows whose "outcome_match" is truthy, ignoring rows where it is None or missing
+    judged = [v for v in (row.get("outcome_match") for row in rs) if v is not None]
+    return None if not judged else len([v for v in judged if v]) / len(judged)
+
+
+def fmt(x, kind="num"):
+    """Render a metric value for a markdown table cell according to kind.
+
+    None becomes an em dash; an unrecognised kind falls back to one decimal.
+    """
+    renderers = {
+        "tokens": lambda v: f"{int(v):,}",
+        "dollars": lambda v: f"${v:.4f}",
+        "duration": lambda v: f"{v:.0f}s",
+        "pct": lambda v: f"{v:.0%}",
+    }
+    return "—" if x is None else renderers.get(kind, lambda v: f"{v:.1f}")(x)
+
+
+def delta(base, other, kind="num"):
+    """Format ``other - base`` for a table cell ("—" if a side is None or base is 0).
+
+    "tokens", "duration" and "dollars" add the change relative to base in parentheses;
+    "pct" shows only that relative change; any other kind shows one decimal place.
+    """
+    if not (base is not None and other is not None and base != 0):
+        return "—"
+    change = other - base
+    plus = "+" if change >= 0 else ""
+    rel = f"{plus}{change / base:.0%}"
+    if kind in ("tokens", "duration", "dollars"):
+        amount = f"{int(change):,}" if kind == "tokens" else f"{change:.0f}s" if kind == "duration" else f"${change:.4f}"
+        return f"{plus}{amount} ({rel})"
+    return rel if kind == "pct" else f"{plus}{change:.1f}"
+
+
+def main() -> int:
+    """Merge the per-arm metrics JSONL files into the four-way markdown report at --out; returns 0."""
+    ap = argparse.ArgumentParser()
+    ap.add_argument("--twobatch-metrics", default="../metrics/twobatch.metrics.jsonl")
+    ap.add_argument("--skills-metrics", default="../metrics/twobatch-skills.metrics.jsonl")
+    ap.add_argument("--both-metrics", default="../metrics/twobatch-both.metrics.jsonl")
+    ap.add_argument("--out", default="experiments/twobatch-fourway-comparison.md")
+    args = ap.parse_args()
+
+    rows: list[dict] = []
+    # Decode as UTF-8 explicitly (the report below is written as UTF-8 too) instead of the locale default.
+    for line in Path(args.twobatch_metrics).read_text(encoding="utf-8").splitlines():
+        if line.strip():  # tolerate blank lines
+            r = json.loads(line)
+            r["arm"] = "empty" if r["batch"] == 1 else "guidelines"
+            rows.append(r)
+    for arm, path in (("skills", args.skills_metrics), ("both", args.both_metrics)):
+        p = Path(path)
+        if not p.exists():  # these two arms are optional: a missing file contributes no rows
+            continue
+        for line in p.read_text(encoding="utf-8").splitlines():
+            if line.strip():
+                r = json.loads(line)
+                r["arm"] = arm
+                rows.append(r)
+
+    by_task: dict[str, dict[str, list[dict]]] = defaultdict(lambda: defaultdict(list))  # task id -> arm -> rows
+    for r in rows:
+        by_task[r["task"]][r["arm"]].append(r)
+
+    by_arm = {a: [r for r in rows if r["arm"] == a] for a in ARMS}
+
+    md: list[str] = []  # report lines, joined with newlines when written
+    md.append("# Four-way wiki-helps comparison: empty / guidelines / skills / both")
+    md.append("")
+    md.append(
+        "Same 16-task corpus, four arms, all `claude_md_strong` condition. "
+        "Empty + guidelines arms are twobatch's batch-1 / batch-2. Skills arm "
+        "is twobatch-skills (3 skills, no guidelines). Both arm is "
+        "twobatch-both (those same 3 skills + ~15 atomics, no clusters)."
+    )
+    md.append("")
+
+    md.append("## Aggregate")
+    md.append("")
+    md.append("| Metric | Empty | Guidelines | Skills | Both | Both vs G | Both vs S |")
+    md.append("|---|---:|---:|---:|---:|---:|---:|")
+    pairs = [  # (row label, metric field or pseudo-field, fmt/delta kind)
+        ("Trials", "len", "num"),
+        ("Accuracy (mean)", "_acc", "pct"),
+        ("Median duration", "duration_s", "duration"),
+        ("Median input tokens", "input_tokens", "tokens"),
+        ("Median output tokens", "output_tokens", "tokens"),
+        ("Median total cost USD", "total_cost_usd", "dollars"),
+        ("Median tool calls", "tool_calls", "num"),
+        ("Median wiki reads", "wiki_reads_total", "num"),
+        ("Median guideline reads", "guideline_reads", "num"),
+    ]
+    for label, field, kind in pairs:
+        vals = {}
+        for a in ARMS:
+            arm_rows = by_arm[a]
+            if field == "len":
+                vals[a] = len(arm_rows)
+            elif field == "_acc":
+                vals[a] = acc(arm_rows)
+            else:
+                vals[a] = median([r.get(field) for r in arm_rows])
+        if field == "len":  # trial counts are plain ints, shown with a signed integer difference
+            md.append(
+                f"| {label} | {vals['empty']} | {vals['guidelines']} | {vals['skills']} | {vals['both']} | "
+                f"{vals['both'] - vals['guidelines']:+d} | {vals['both'] - vals['skills']:+d} |"
+            )
+        else:
+            md.append(
+                f"| {label} | {fmt(vals['empty'], kind)} | {fmt(vals['guidelines'], kind)} | "
+                f"{fmt(vals['skills'], kind)} | {fmt(vals['both'], kind)} | "
+                f"{delta(vals['guidelines'], vals['both'], kind)} | "
+                f"{delta(vals['skills'], vals['both'], kind)} |"
+            )
+    md.append("")
+
+    md.append("## By task family")
+    md.append("")
+    md.append("Median total_cost_usd. `Δ G→B` is `both` minus `guidelines`; `Δ S→B` is `both` minus `skills`.")
+    md.append("")
+    md.append("| Family | Tasks | E acc | G acc | S acc | B acc | E $ | G $ | S $ | B $ | Δ G→B | Δ S→B |")
+    md.append("|---|---|---:|---:|---:|---:|---:|---:|---:|---:|---:|---:|")
+    fam_groups: dict[str, list[str]] = defaultdict(list)  # family label -> task ids
+    for tid, fam in FAMILY.items():
+        fam_groups[fam].append(tid)
+    for fam, tids in fam_groups.items():
+        in_fam = {a: [r for r in rows if r["task"] in tids and r["arm"] == a] for a in ARMS}
+        cs = {a: median([r.get("total_cost_usd") for r in in_fam[a]]) for a in ARMS}
+        md.append(
+            f"| {fam} | {len(tids)} | "
+            f"{fmt(acc(in_fam['empty']), 'pct')} | {fmt(acc(in_fam['guidelines']), 'pct')} | "
+            f"{fmt(acc(in_fam['skills']), 'pct')} | {fmt(acc(in_fam['both']), 'pct')} | "
+            f"{fmt(cs['empty'], 'dollars')} | {fmt(cs['guidelines'], 'dollars')} | "
+            f"{fmt(cs['skills'], 'dollars')} | {fmt(cs['both'], 'dollars')} | "
+            f"{delta(cs['guidelines'], cs['both'], 'dollars')} | "
+            f"{delta(cs['skills'], cs['both'], 'dollars')} |"
+        )
+    md.append("")
+
+    md.append("## Per task — cost USD")
+    md.append("")
+    md.append("| Task | E $ | G $ | S $ | B $ | Δ G→B | Δ S→B |")
+    md.append("|---|---:|---:|---:|---:|---:|---:|")
+    for tid in TASK_IDS_ORDER:
+        if not by_task[tid]:  # no rows for this task in any arm
+            continue
+        cs = {a: median([r.get("total_cost_usd") for r in by_task[tid].get(a, [])]) for a in ARMS}
+        md.append(
+            f"| `{tid}` | {fmt(cs['empty'], 'dollars')} | {fmt(cs['guidelines'], 'dollars')} | "
+            f"{fmt(cs['skills'], 'dollars')} | {fmt(cs['both'], 'dollars')} | "
+            f"{delta(cs['guidelines'], cs['both'], 'dollars')} | "
+            f"{delta(cs['skills'], cs['both'], 'dollars')} |"
+        )
+    md.append("")
+
+    md.append("## Per task — accuracy")
+    md.append("")
+    md.append("| Task | E acc | G acc | S acc | B acc |")
+    md.append("|---|:-:|:-:|:-:|:-:|")
+    for tid in TASK_IDS_ORDER:
+        if not by_task[tid]:
+            continue
+        as_ = {a: acc(by_task[tid].get(a, [])) for a in ARMS}
+        md.append(
+            f"| `{tid}` | {fmt(as_['empty'], 'pct')} | {fmt(as_['guidelines'], 'pct')} | "
+            f"{fmt(as_['skills'], 'pct')} | {fmt(as_['both'], 'pct')} |"
+        )
+    md.append("")
+    md.append("## Notes")
+    md.append("")
+    md.append("- Empty + guidelines columns reproduce twobatch.")
+    md.append("- Skills column reproduces the skills-arm experiment.")
+    md.append(
+        "- Both column is the new arm: same 3 skills + ~15 atomics from "
+        "twobatch's batch-1 trajectories. No clusters (matching the "
+        "guidelines arm's structure)."
+    )
+    md.append("- Trivial-recipe tasks (t11-tar, t13-gzip, t15-jsonl, t16-ini, t17-log, t2/t3, t5) have no matching skill in any arm.")
+    Path(args.out).write_text("\n".join(md) + "\n", encoding="utf-8")
+    print(f"wrote {args.out}", flush=True)
+    return 0
+
+
+if __name__ == "__main__":  # run only when executed as a script, not on import
+    raise SystemExit(main())  # main()'s return value becomes the process exit status
diff --git a/explorations/agent-wiki/experiments/harness/normalize_stream_json_transcripts.py b/explorations/agent-wiki/experiments/harness/normalize_stream_json_transcripts.py
new file mode 100644
index 00000000..dc88301b
--- /dev/null
+++ b/explorations/agent-wiki/experiments/harness/normalize_stream_json_transcripts.py
@@ -0,0 +1,188 @@
+#!/usr/bin/env python3
+# mypy: ignore-errors
+# Exploration/reference code — not type-checked to the project standard.
+"""Normalize `claude -p --output-format stream-json --verbose` outputs.
+
+Reads stream-json transcripts emitted by the experiment runners and writes
+one OpenAI-chat-completion JSON file per transcript, matching the schema
+under trajectories/normalized/.
+
+Usage:
+    uv run python scripts/normalize_stream_json_transcripts.py \\
+        --in  experiments/results/wiki-consult-20260605T153035Z/transcripts \\
+        --out trajectories/normalized \\
+        --label example-corpus \\
+        --user-prompt "what lens model was used for @sample.jpg. use exif metadata" \\
+        --trial-prefix wiki-consult
+"""
+
+from __future__ import annotations
+
+import argparse
+import json
+from collections import Counter
+from pathlib import Path
+from typing import Any
+
+
+def parse_stream_json_file(path: Path, user_prompt: str) -> dict[str, Any]:
+    """Parse one stream-json transcript file into the normalized record.
+
+    Args:
+        path: Transcript file holding one JSON event per line.
+        user_prompt: Text stored as the first (user) message of the output.
+
+    Returns:
+        A dict with session metadata, a ``stats`` block (event, tool-call and
+        token counts plus cost) and ``openai_chat_completion.messages``.
+
+    Blank lines, lines that are not valid JSON, and JSON values that are not
+    objects are skipped, so a stray line cannot abort the whole transcript.
+    """
+    events: list[dict] = []
+    for line in path.read_text(encoding="utf-8").splitlines():
+        if not line.strip():
+            continue
+        try:
+            event = json.loads(line)
+        except json.JSONDecodeError:
+            continue
+        if isinstance(event, dict):  # everything below calls .get() on each event
+            events.append(event)
+
+    # Session id and model come from the first system/init event; duration and cost from the first result event.
+    init = next((e for e in events if e.get("type") == "system" and e.get("subtype") == "init"), None)
+    result = next((e for e in events if e.get("type") == "result"), None)
+
+    session_id = (init or {}).get("session_id") or path.stem
+    model = (init or {}).get("model") or "claude-code"
+    duration_ms = (result or {}).get("duration_ms") or 0
+
+    messages: list[dict] = [{"role": "user", "content": user_prompt}]
+    tool_calls = 0
+    tool_results = 0
+    thinking = 0
+    tool_counter: Counter[str] = Counter()
+    in_tokens = cache_creation = cache_read = out_tokens = 0
+
+    for ev in events:
+        if ev.get("type") == "assistant":
+            msg = ev.get("message", {}) or {}
+            usage = msg.get("usage") or {}
+            in_tokens += int(usage.get("input_tokens", 0) or 0)
+            cache_creation += int(usage.get("cache_creation_input_tokens", 0) or 0)
+            cache_read += int(usage.get("cache_read_input_tokens", 0) or 0)
+            out_tokens += int(usage.get("output_tokens", 0) or 0)
+            content = msg.get("content")
+            if not isinstance(content, list):
+                continue
+            for b in content:
+                if not isinstance(b, dict):
+                    continue
+                t = b.get("type")
+                if t == "text":
+                    text = b.get("text", "")
+                    if text:
+                        messages.append({"role": "assistant", "content": text})
+                elif t == "thinking":
+                    thinking += 1
+                elif t == "tool_use":
+                    name = b.get("name", "")
+                    tool_counter[name] += 1
+                    tool_calls += 1
+                    call = {
+                        "id": b.get("id"),
+                        "type": "function",
+                        "function": {"name": name, "arguments": json.dumps(b.get("input") or {})},
+                    }
+                    messages.append({"role": "assistant", "tool_calls": [call]})
+        elif ev.get("type") == "user":
+            msg = ev.get("message", {}) or {}
+            content = msg.get("content")
+            if not isinstance(content, list):
+                continue
+            for b in content:
+                if not isinstance(b, dict):
+                    continue
+                if b.get("type") == "tool_result":
+                    tool_results += 1
+                    raw = b.get("content")
+                    if isinstance(raw, list):
+                        text_parts = [c.get("text", "") for c in raw if isinstance(c, dict) and c.get("type") == "text"]
+                        text = "\n".join(text_parts)
+                    elif isinstance(raw, str):
+                        text = raw
+                    else:
+                        text = json.dumps(raw)
+                    messages.append({"role": "tool", "tool_call_id": b.get("tool_use_id"), "content": text})
+
+    top_tools = [{"tool": t, "count": c} for t, c in tool_counter.most_common(5)]
+
+    return {
+        "schema_version": "1",
+        "dataset": "claude-transcripts",
+        "agent": "claude-code",
+        "session_id": session_id,
+        "model": model,
+        "models": [model],
+        "duration_seconds": round(duration_ms / 1000.0, 2),
+        "stats": {
+            "raw_event_count": len(events),
+            "message_count": len(messages),
+            "tool_call_count": tool_calls,
+            "tool_result_count": tool_results,
+            "thinking_block_count": thinking,
+            "sidechain_count": 0,
+            "top_tools": top_tools,
+            "input_tokens": in_tokens,
+            "cache_creation_input_tokens": cache_creation,
+            "cache_read_input_tokens": cache_read,
+            "output_tokens": out_tokens,
+            "total_cost_usd": float((result or {}).get("total_cost_usd") or 0.0),
+        },
+        "openai_chat_completion": {"messages": messages},
+        "recalled_guidelines": [],
+    }
+
+
+def main() -> int:
+    """Normalize each *.jsonl transcript under --in (or the single file given) into <out>/<label>/items/; returns 0."""
+    ap = argparse.ArgumentParser()
+    ap.add_argument("--in", dest="in_dir", required=True, help="Dir containing <condition>/trial-N.jsonl files (or a single file).")
+    ap.add_argument("--out", default="trajectories/normalized", help="Output root.")
+    ap.add_argument("--label", required=True, help="Label subdir under --out (becomes <out>/<label>/items/).")
+    ap.add_argument("--user-prompt", required=True, help="The utt2 text the agent received (rebuilt as message[0]).")
+    ap.add_argument("--trial-prefix", default="trial", help="Prefix used in trial_id.")
+    args = ap.parse_args()
+
+    in_root = Path(args.in_dir).resolve()
+    out_root = Path(args.out).resolve() / args.label / "items"
+    out_root.mkdir(parents=True, exist_ok=True)
+
+    cwd = Path.cwd()
+    files = [in_root] if in_root.is_file() else sorted(in_root.rglob("*.jsonl"))
+
+    written = 0
+    for f in files:
+        rec = parse_stream_json_file(f, args.user_prompt)
+        # condition is the parent directory name; trial id from filename
+        condition = f.parent.name
+        trial_name = f.stem  # 'trial-1', 'trial-2', etc.
+        rec["trial_id"] = f"{args.trial_prefix}-{condition}-{trial_name}_{rec['session_id']}"
+        rec["source"] = {
+            "transcript_path": str(f.relative_to(cwd)) if f.is_relative_to(cwd) else str(f),
+            "session_id": rec["session_id"],
+            "condition": condition,
+            "trial": trial_name,
+        }
+        out_path = out_root / f"{condition}__{trial_name}__{rec['session_id']}.json"
+        out_path.write_text(json.dumps(rec, indent=2) + "\n", encoding="utf-8")
+        written += 1
+        # out_path can lie outside the working directory (e.g. an absolute --out), so guard relative_to().
+        print(f"  wrote {out_path.relative_to(cwd) if out_path.is_relative_to(cwd) else out_path}")
+    print(f"normalized {written} transcript(s) → {out_root}")
+    return 0
+
+
+if __name__ == "__main__":  # run only when executed as a script, not on import
+    raise SystemExit(main())  # main()'s return value becomes the process exit status
diff --git a/explorations/agent-wiki/experiments/harness/threeway_compare.py b/explorations/agent-wiki/experiments/harness/threeway_compare.py
new file mode 100644
index 00000000..ffc66467
--- /dev/null
+++ b/explorations/agent-wiki/experiments/harness/threeway_compare.py
@@ -0,0 +1,251 @@
+#!/usr/bin/env python3
+# mypy: ignore-errors
+# Exploration/reference code — not type-checked to the project standard.
+"""Three-way comparison of empty / guidelines / skills arms on the same task corpus.
+
+Reads:
+  ../metrics/twobatch.metrics.jsonl          (twobatch — batch 1 = empty, batch 2 = guidelines)
+  ../metrics/twobatch-skills.metrics.jsonl   (this experiment — skills arm)
+
+Emits a markdown report with aggregate, per-family, and per-task tables.
+"""
+
+from __future__ import annotations
+
+import argparse
+import json
+import statistics
+from collections import defaultdict
+from pathlib import Path
+
+REPO = Path(__file__).resolve().parents[1]  # parent of this script's directory; not referenced elsewhere in this file
+
+TASK_IDS_ORDER = [  # task ids in the row order used by the "Per task" table
+    "t1-lens-model",
+    "t6-png-dim",
+    "t7-gif-dim",
+    "t8-bmp-info",
+    "t9-webp-dim",
+    "t10-zip-list",
+    "t11-tar-list",
+    "t12-wav-info",
+    "t13-gzip-dec",
+    "t14-csv-quoted",
+    "t15-jsonl-kinds",
+    "t16-ini-key",
+    "t17-log-errors",
+    "t2-imports",
+    "t3-todos",
+    "t5-base64",
+]
+
+FAMILY = {  # task id -> family label; groups rows in the "By task family" table
+    "t1-lens-model": "lens-model",
+    "t6-png-dim": "image",
+    "t7-gif-dim": "image",
+    "t8-bmp-info": "image",
+    "t9-webp-dim": "image",
+    "t10-zip-list": "archive",
+    "t11-tar-list": "archive",
+    "t12-wav-info": "archive",
+    "t13-gzip-dec": "archive",
+    "t14-csv-quoted": "text",
+    "t15-jsonl-kinds": "text",
+    "t16-ini-key": "text",
+    "t17-log-errors": "text",
+    "t2-imports": "skip",
+    "t3-todos": "skip",
+    "t5-base64": "skip",
+}
+
+
+def median(xs):  # median of the non-None entries of xs; None when none remain
+    present = [v for v in xs if v is not None]
+    return None if not present else statistics.median(present)
+
+
+def acc(rs):  # share of rows whose "outcome_match" is truthy, ignoring rows where it is None or missing
+    judged = [v for v in (row.get("outcome_match") for row in rs) if v is not None]
+    return None if not judged else len([v for v in judged if v]) / len(judged)
+
+
+def fmt(x, kind="num"):
+    """Render a metric value for a markdown table cell according to kind.
+
+    None becomes an em dash; an unrecognised kind falls back to one decimal.
+    """
+    renderers = {
+        "tokens": lambda v: f"{int(v):,}",
+        "dollars": lambda v: f"${v:.4f}",
+        "duration": lambda v: f"{v:.0f}s",
+        "pct": lambda v: f"{v:.0%}",
+    }
+    return "—" if x is None else renderers.get(kind, lambda v: f"{v:.1f}")(x)
+
+
+def delta(base, other, kind="num"):
+    """Format ``other - base`` for a table cell, sign first.
+
+    Returns "—" if a side is None or base is 0. "tokens", "duration" and
+    "dollars" add the change relative to base in parentheses; "pct" shows only
+    that relative change; any other kind shows the difference to one decimal.
+    """
+    if not (base is not None and other is not None and base != 0):
+        return "—"
+    change = other - base
+    plus = "+" if change >= 0 else ""
+    rel = f"{plus}{change / base:.0%}"
+    if kind in ("tokens", "duration", "dollars"):
+        amount = f"{int(change):,}" if kind == "tokens" else f"{change:.0f}s" if kind == "duration" else f"${change:.4f}"
+        return f"{plus}{amount} ({rel})"
+    return rel if kind == "pct" else f"{plus}{change:.1f}"
+
+
+def main() -> int:
+    """Merge the twobatch and skills metrics JSONL files into the three-way markdown report at --out; returns 0."""
+    ap = argparse.ArgumentParser()
+    ap.add_argument("--twobatch-metrics", default="../metrics/twobatch.metrics.jsonl")
+    ap.add_argument("--skills-metrics", default="../metrics/twobatch-skills.metrics.jsonl")
+    ap.add_argument("--out", default="experiments/twobatch-skills-comparison.md")
+    args = ap.parse_args()
+
+    # Load: twobatch's batch 1 = empty arm; batch 2 = guidelines arm.
+    # Files are decoded as UTF-8 explicitly, matching how the report is written, not via the locale default.
+    rows: list[dict] = []
+    for line in Path(args.twobatch_metrics).read_text(encoding="utf-8").splitlines():
+        if line.strip():  # tolerate blank lines
+            r = json.loads(line)
+            r["arm"] = "empty" if r["batch"] == 1 else "guidelines"
+            rows.append(r)
+    # Skills arm: every row gets arm="skills".
+    skills_path = Path(args.skills_metrics)
+    if skills_path.exists():  # optional: a missing file contributes no rows
+        for line in skills_path.read_text(encoding="utf-8").splitlines():
+            if line.strip():
+                r = json.loads(line)
+                r["arm"] = "skills"
+                rows.append(r)
+
+    by_task: dict[str, dict[str, list[dict]]] = defaultdict(lambda: defaultdict(list))  # task id -> arm -> rows
+    for r in rows:
+        by_task[r["task"]][r["arm"]].append(r)
+
+    md: list[str] = []  # report lines, joined with newlines when written
+    md.append("# Three-way wiki-helps comparison: empty vs guidelines vs skills")
+    md.append("")
+    md.append(
+        "Same 16-task corpus, three arms, all `claude_md_strong` condition. "
+        "Empty + guidelines arms are the existing twobatch experiment's "
+        "batch-1 / batch-2. Skills arm is the new run against "
+        "`wiki-twobatch-skills/`, populated from twobatch's batch-1 "
+        "trajectories via `agent-wiki-synthesize-skill`."
+    )
+    md.append("")
+
+    by_arm = {
+        "empty": [r for r in rows if r["arm"] == "empty"],
+        "guidelines": [r for r in rows if r["arm"] == "guidelines"],
+        "skills": [r for r in rows if r["arm"] == "skills"],
+    }
+
+    md.append("## Aggregate (3 trials × 16 tasks per arm)")
+    md.append("")
+    md.append("| Metric | Empty | Guidelines | Skills | Skills vs guidelines |")
+    md.append("|---|---:|---:|---:|---:|")
+    pairs = [  # (row label, metric field or pseudo-field, fmt/delta kind)
+        ("Trials", "len", "num"),
+        ("Accuracy (mean)", "_acc", "pct"),
+        ("Median duration", "duration_s", "duration"),
+        ("Median input tokens", "input_tokens", "tokens"),
+        ("Median output tokens", "output_tokens", "tokens"),
+        ("Median total cost USD", "total_cost_usd", "dollars"),
+        ("Median tool calls", "tool_calls", "num"),
+        ("Median wiki reads", "wiki_reads_total", "num"),
+        ("Median guideline reads", "guideline_reads", "num"),
+    ]
+    for label, field, kind in pairs:
+        if field == "len":  # trial counts are plain ints, shown with a signed integer difference
+            vals = {a: len(by_arm[a]) for a in ("empty", "guidelines", "skills")}
+            md.append(f"| {label} | {vals['empty']} | {vals['guidelines']} | {vals['skills']} | {vals['skills'] - vals['guidelines']:+d} |")
+            continue
+        if field == "_acc":
+            vals = {a: acc(by_arm[a]) for a in ("empty", "guidelines", "skills")}
+        else:
+            vals = {a: median([r.get(field) for r in by_arm[a]]) for a in ("empty", "guidelines", "skills")}
+        md.append(
+            f"| {label} | {fmt(vals['empty'], kind)} | {fmt(vals['guidelines'], kind)} | "
+            f"{fmt(vals['skills'], kind)} | {delta(vals['guidelines'], vals['skills'], kind)} |"
+        )
+    md.append("")
+
+    md.append("## By task family")
+    md.append("")
+    md.append("Median per-trial within each family. Skills column shows Δ vs guidelines.")
+    md.append("")
+    md.append(
+        "| Family | Tasks | E acc | G acc | S acc | E dur | G dur | S dur | E tokens | G tokens | S tokens | E $ | G $ | S $ | Skills Δ$ |"
+    )
+    md.append("|---|---|---:|---:|---:|---:|---:|---:|---:|---:|---:|---:|---:|---:|---:|")
+    fam_groups: dict[str, list[str]] = defaultdict(list)  # family label -> task ids
+    for tid, fam in FAMILY.items():
+        fam_groups[fam].append(tid)
+    for fam, tids in fam_groups.items():
+        in_fam = [r for r in rows if r["task"] in tids]
+        e = [r for r in in_fam if r["arm"] == "empty"]
+        g = [r for r in in_fam if r["arm"] == "guidelines"]
+        s = [r for r in in_fam if r["arm"] == "skills"]
+        md.append(
+            f"| {fam} | {len(tids)} tasks | "
+            f"{fmt(acc(e), 'pct')} | {fmt(acc(g), 'pct')} | {fmt(acc(s), 'pct')} | "
+            f"{fmt(median([r.get('duration_s') for r in e]), 'duration')} | "
+            f"{fmt(median([r.get('duration_s') for r in g]), 'duration')} | "
+            f"{fmt(median([r.get('duration_s') for r in s]), 'duration')} | "
+            f"{fmt(median([r.get('billable_tokens_proxy') for r in e]), 'tokens')} | "
+            f"{fmt(median([r.get('billable_tokens_proxy') for r in g]), 'tokens')} | "
+            f"{fmt(median([r.get('billable_tokens_proxy') for r in s]), 'tokens')} | "
+            f"{fmt(median([r.get('total_cost_usd') for r in e]), 'dollars')} | "
+            f"{fmt(median([r.get('total_cost_usd') for r in g]), 'dollars')} | "
+            f"{fmt(median([r.get('total_cost_usd') for r in s]), 'dollars')} | "
+            f"{delta(median([r.get('total_cost_usd') for r in g]), median([r.get('total_cost_usd') for r in s]), 'dollars')} |"
+        )
+    md.append("")
+
+    md.append("## Per task")
+    md.append("")
+    md.append("| Task | E acc | G acc | S acc | E dur | G dur | S dur | E $ | G $ | S $ | Skills Δ$ vs G |")
+    md.append("|---|:-:|:-:|:-:|---:|---:|---:|---:|---:|---:|---:|")
+    for tid in TASK_IDS_ORDER:
+        e = by_task[tid].get("empty", [])
+        g = by_task[tid].get("guidelines", [])
+        s = by_task[tid].get("skills", [])
+        if not (e or g or s):  # no rows for this task in any arm
+            continue
+        md.append(
+            f"| `{tid}` | {fmt(acc(e), 'pct')} | {fmt(acc(g), 'pct')} | {fmt(acc(s), 'pct')} | "
+            f"{fmt(median([r.get('duration_s') for r in e]), 'duration')} | "
+            f"{fmt(median([r.get('duration_s') for r in g]), 'duration')} | "
+            f"{fmt(median([r.get('duration_s') for r in s]), 'duration')} | "
+            f"{fmt(median([r.get('total_cost_usd') for r in e]), 'dollars')} | "
+            f"{fmt(median([r.get('total_cost_usd') for r in g]), 'dollars')} | "
+            f"{fmt(median([r.get('total_cost_usd') for r in s]), 'dollars')} | "
+            f"{delta(median([r.get('total_cost_usd') for r in g]), median([r.get('total_cost_usd') for r in s]), 'dollars')} |"
+        )
+    md.append("")
+    md.append("## Notes")
+    md.append("")
+    md.append("- Empty + guidelines columns reproduce the original twobatch comparison; skills column is new.")
+    md.append(
+        "- 3 skills were synthesized from twobatch's batch-1 trajectories by the "
+        "`agent-wiki-synthesize-skill` skill: `extract-jpeg-exif-camera-optics`, "
+        "`read-image-format-dimensions`, `count-csv-rows-with-quoted-fields`. "
+        "All other tasks in this arm have **no matching skill** — the agent "
+        "should fall through to whatever it'd do on an empty wiki."
+    )
+    md.append("")
+    Path(args.out).write_text("\n".join(md) + "\n", encoding="utf-8")
+    print(f"wrote {args.out}", flush=True)
+    return 0
+
+
+if __name__ == "__main__":  # run only when executed as a script, not on import
+    raise SystemExit(main())  # main()'s return value becomes the process exit status
diff --git a/explorations/agent-wiki/experiments/harness/twobatch_compare.py b/explorations/agent-wiki/experiments/harness/twobatch_compare.py
new file mode 100644
index 00000000..49cd99e7
--- /dev/null
+++ b/explorations/agent-wiki/experiments/harness/twobatch_compare.py
@@ -0,0 +1,253 @@
+#!/usr/bin/env python3
+# mypy: ignore-errors
+# Exploration/reference code — not type-checked to the project standard.
+"""Compare batch-1 (no wiki) and batch-2 (with wiki) metrics from
+the two-batch experiment. Emits a markdown report.
+
+Usage:
+    uv run python experiments/harness/twobatch_compare.py \\
+        --metrics ../metrics/twobatch.metrics.jsonl \\
+        --out experiments/twobatch-comparison.md
+"""
+
+from __future__ import annotations
+
+import argparse
+import json
+import statistics
+from collections import defaultdict
+from pathlib import Path
+
+REPO = Path(__file__).resolve().parents[1]  # NOTE(review): never referenced in this file; resolves to the parent of this script's directory
+
+FAMILY = {  # task id -> family label; drives the grouping of the "By task family" table in main()
+    "t1-lens-model": "lens-model",
+    "t6-png-dim": "image",
+    "t7-gif-dim": "image",
+    "t8-bmp-info": "image",
+    "t9-webp-dim": "image",
+    "t10-zip-list": "archive",
+    "t11-tar-list": "archive",
+    "t12-wav-info": "archive",  # NOTE(review): WAV is audio, yet grouped with archives — presumably mirrors the task's seed group; confirm
+    "t13-gzip-dec": "archive",
+    "t14-csv-quoted": "text",
+    "t15-jsonl-kinds": "text",
+    "t16-ini-key": "text",
+    "t17-log-errors": "text",
+    "t2-imports": "skip",  # NOTE(review): "skip" looks like the bucket for tasks the wiki should not help with — confirm
+    "t3-todos": "skip",
+    "t5-base64": "skip",
+}
+
+
+def median_or_none(xs: list[float]) -> float | None:
+    present = [v for v in xs if v is not None]  # None marks a missing sample; leave it out of the median
+    return None if not present else statistics.median(present)
+
+
+def mean_or_none(xs: list[float]) -> float | None:
+    kept = [v for v in xs if v is not None]  # None marks a missing sample; leave it out of the mean
+    return None if not kept else sum(kept) / len(kept)
+
+
+def fmt(x: float | None, kind: str = "num") -> str:
+    """Render one table cell according to ``kind``; a missing value becomes an em dash."""
+    if x is None:
+        return "—"
+    renderers = {
+        "tokens": lambda v: f"{int(v):,}",
+        "dollars": lambda v: f"${v:.4f}",
+        "duration": lambda v: f"{v:.0f}s",
+        "pct": lambda v: f"{v:.0%}",
+    }
+    # Any other kind (including the default "num") falls back to one decimal place.
+    return renderers.get(kind, lambda v: f"{v:.1f}")(x)
+
+
+def delta_str(b1: float | None, b2: float | None, kind: str = "num") -> str:
+    """Render ``b2 - b1`` and its change relative to ``b1``; "—" when a side is missing or ``b1`` is 0."""
+    if b1 is None or b2 is None or b1 == 0:
+        return "—"
+    diff = b2 - b1
+    sign = "+" if diff >= 0 else "-"  # sign leads the unit, so a cost drop reads "-$0.0100", not "$-0.0100"
+    magnitude = {
+        "tokens": lambda v: f"{int(v):,}",
+        "duration": lambda v: f"{v:.0f}s",
+        "dollars": lambda v: f"${v:.4f}",
+        "pct": lambda v: f"{v * 100:.0f}pp",  # accuracy deltas are percentage points, not raw fractions
+    }.get(kind, lambda v: f"{v:.1f}")(abs(diff))
+    return f"{sign}{magnitude} ({diff / b1:+.0%})"
+
+
+def main() -> int:
+    """Read the ``--metrics`` JSONL and write the markdown comparison report to ``--out``; returns 0."""
+    ap = argparse.ArgumentParser()
+    ap.add_argument("--metrics", required=True)
+    ap.add_argument("--out", required=True)
+    args = ap.parse_args()
+
+    metrics_path = Path(args.metrics)
+    out_path = Path(args.out)
+    # Decode explicitly as UTF-8 (the report below is written the same way) instead of the platform default.
+    rows = [json.loads(ln) for ln in metrics_path.read_text(encoding="utf-8").splitlines() if ln.strip()]
+
+    # by_task: {task_id: {1: [rows], 2: [rows]}}
+    by_task: dict[str, dict[int, list[dict]]] = defaultdict(lambda: defaultdict(list))
+    for r in rows:
+        by_task[r["task"]][r["batch"]].append(r)
+
+    def acc(rs: list[dict]) -> float | None:
+        """Share of truthy ``outcome_match`` among rows that recorded one; None when no row did."""
+        oms = [r.get("outcome_match") for r in rs if r.get("outcome_match") is not None]
+        return (sum(1 for x in oms if x) / len(oms)) if oms else None
+
+    md: list[str] = []
+    md.append("# Two-batch wiki-helps comparison")
+    md.append("")
+    md.append(
+        "**Question**: does a populated wiki reduce token cost / wall-clock at equal-or-better accuracy, vs the same task on an empty wiki?"
+    )
+    md.append("")
+    md.append(
+        "Setup: 16 tasks × 3 trials × 2 batches = 96 sandbox trials, all "
+        "`claude_md_strong`. Batch 1's agent saw an empty wiki. After ingestion "
+        "the wiki was frozen. Batch 2's agent saw the populated wiki."
+    )
+    md.append("")
+
+    # ── Aggregate ──
+    all_b1 = [r for r in rows if r["batch"] == 1]
+    all_b2 = [r for r in rows if r["batch"] == 2]
+    # Report the number of rows actually compared rather than assuming the planned 96.
+    md.append(f"## Aggregate ({len(all_b1) + len(all_b2)} trials)")
+    md.append("")
+    md.append("| Metric | Batch 1 (empty wiki) | Batch 2 (with wiki) | Δ |")
+    md.append("|---|---:|---:|---:|")
+    pairs = [
+        ("Trials", "len", "len", "num"),
+        ("Accuracy (mean)", "outcome_match", "mean", "pct"),
+        ("Median duration", "duration_s", "median", "duration"),
+        ("Median input tokens", "input_tokens", "median", "tokens"),
+        ("Median cache-creation tokens", "cache_creation_input_tokens", "median", "tokens"),
+        ("Median cache-read tokens", "cache_read_input_tokens", "median", "tokens"),
+        ("Median output tokens", "output_tokens", "median", "tokens"),
+        ("Median billable proxy (in+cc+out)", "billable_tokens_proxy", "median", "tokens"),
+        ("Median total cost USD", "total_cost_usd", "median", "dollars"),
+        ("Median tool calls", "tool_calls", "median", "num"),
+        ("Median wiki reads", "wiki_reads_total", "median", "num"),
+        ("Median guideline reads", "guideline_reads", "median", "num"),
+    ]
+    for label, field, agg_op, kind in pairs:
+        if field == "len":
+            v1, v2 = len(all_b1), len(all_b2)
+            md.append(f"| {label} | {v1} | {v2} | {v2 - v1:+d} |")
+            continue
+        if agg_op == "mean":
+            if field == "outcome_match":
+                v1 = acc(all_b1)
+                v2 = acc(all_b2)
+            else:
+                v1 = mean_or_none([r.get(field) for r in all_b1])
+                v2 = mean_or_none([r.get(field) for r in all_b2])
+        else:
+            v1 = median_or_none([r.get(field) for r in all_b1])
+            v2 = median_or_none([r.get(field) for r in all_b2])
+        md.append(f"| {label} | {fmt(v1, kind)} | {fmt(v2, kind)} | {delta_str(v1, v2, kind)} |")
+    md.append("")
+
+    # ── By family ──
+    md.append("## By task family")
+    md.append("")
+    md.append("Median per-trial cost within each family. Δ = batch-2 minus batch-1.")
+    md.append("")
+    md.append("| Family | Tasks | B1 acc | B2 acc | Δ acc | B1 dur | B2 dur | Δ dur | B1 tokens | B2 tokens | Δ tokens |")
+    md.append("|---|---|---:|---:|---:|---:|---:|---:|---:|---:|---:|")
+    fam_groups: dict[str, list[str]] = defaultdict(list)
+    for tid, fam in FAMILY.items():
+        fam_groups[fam].append(tid)
+    for fam, tids in fam_groups.items():
+        b1 = [r for r in rows if r["batch"] == 1 and r["task"] in tids]
+        b2 = [r for r in rows if r["batch"] == 2 and r["task"] in tids]
+        a1 = acc(b1)
+        a2 = acc(b2)
+        d1 = median_or_none([r.get("duration_s") for r in b1])
+        d2 = median_or_none([r.get("duration_s") for r in b2])
+        t1 = median_or_none([r.get("billable_tokens_proxy") for r in b1])
+        t2 = median_or_none([r.get("billable_tokens_proxy") for r in b2])
+        md.append(
+            f"| {fam} | {', '.join(tids)} | {fmt(a1, 'pct')} | {fmt(a2, 'pct')} | "
+            f"{delta_str(a1, a2, 'pct')} | {fmt(d1, 'duration')} | {fmt(d2, 'duration')} | "
+            f"{delta_str(d1, d2, 'duration')} | {fmt(t1, 'tokens')} | {fmt(t2, 'tokens')} | "
+            f"{delta_str(t1, t2, 'tokens')} |"
+        )
+    md.append("")
+
+    # ── Per task ──
+    md.append("## Per task")
+    md.append("")
+    md.append("Median across 3 trials per cell. Token = `billable_tokens_proxy` (input + cache-creation + output; cache reads excluded).")
+    md.append("")
+    md.append("| Task | B1 acc | B2 acc | B1 dur | B2 dur | Δ dur | B1 tokens | B2 tokens | Δ tokens | B1 tools | B2 tools |")
+    md.append("|---|:-:|:-:|---:|---:|---:|---:|---:|---:|---:|---:|")
+    for tid in TASK_IDS_ORDER:
+        b1 = by_task[tid].get(1, [])
+        b2 = by_task[tid].get(2, [])
+        if not b1 and not b2:
+            continue
+        a1 = acc(b1)
+        a2 = acc(b2)
+        d1 = median_or_none([r.get("duration_s") for r in b1])
+        d2 = median_or_none([r.get("duration_s") for r in b2])
+        t1 = median_or_none([r.get("billable_tokens_proxy") for r in b1])
+        t2 = median_or_none([r.get("billable_tokens_proxy") for r in b2])
+        tc1 = median_or_none([r.get("tool_calls") for r in b1])
+        tc2 = median_or_none([r.get("tool_calls") for r in b2])
+        md.append(
+            f"| `{tid}` | {fmt(a1, 'pct')} | {fmt(a2, 'pct')} | "
+            f"{fmt(d1, 'duration')} | {fmt(d2, 'duration')} | {delta_str(d1, d2, 'duration')} | "
+            f"{fmt(t1, 'tokens')} | {fmt(t2, 'tokens')} | {delta_str(t1, t2, 'tokens')} | "
+            f"{fmt(tc1)} | {fmt(tc2)} |"
+        )
+    md.append("")
+
+    md.append("## Notes")
+    md.append("")
+    md.append(
+        "- `billable_tokens_proxy` = `input_tokens + cache_creation_input_tokens + output_tokens` "
+        "(cache reads are very cheap and not directly billed at the same rate)."
+    )
+    md.append(
+        "- A trial that timed out is recorded with `outcome_match=False`, "
+        "`duration_s=300`, all token fields = 0. These bring batch-1 means down "
+        "if they happen."
+    )
+    md.append("- Only `claude_md_strong` was run in this experiment for clean comparison (no condition mixing).")
+    md.append("")
+
+    out_path.write_text("\n".join(md) + "\n", encoding="utf-8")
+    print(f"wrote {out_path}", flush=True)
+    return 0
+
+
+TASK_IDS_ORDER = [  # row order of the "Per task" table; read inside main(), which only runs after this is bound
+    "t1-lens-model",
+    "t6-png-dim",
+    "t7-gif-dim",
+    "t8-bmp-info",
+    "t9-webp-dim",
+    "t10-zip-list",
+    "t11-tar-list",
+    "t12-wav-info",
+    "t13-gzip-dec",
+    "t14-csv-quoted",
+    "t15-jsonl-kinds",
+    "t16-ini-key",
+    "t17-log-errors",
+    "t2-imports",
+    "t3-todos",
+    "t5-base64",
+]
+
+
+if __name__ == "__main__":
+    raise SystemExit(main())  # main() returns 0, which becomes the process exit status
diff --git a/explorations/agent-wiki/experiments/harness/wiki_consult_tasks.yaml b/explorations/agent-wiki/experiments/harness/wiki_consult_tasks.yaml
new file mode 100644
index 00000000..8bd94958
--- /dev/null
+++ b/explorations/agent-wiki/experiments/harness/wiki_consult_tasks.yaml
@@ -0,0 +1,143 @@
+# Tasks for the wiki-consult A/B experiment.
+# Each block defines a prompt + outcome match rules.
+
+- id: t1-lens-model
+  prompt: "what lens model was used for @sample.jpg. use exif metadata"
+  outcome_match_all:
+    - "Google Pixel 4a Rear Wide Camera"
+  outcome_match_any:
+    - "0xA434"
+    - "0x8769"
+    - "ExifIFD"
+    - "Exif sub-IFD"
+  expected_guideline_filenames:
+    - "walk-exififd-via-tag-0x8769-for-camera__29547d9e9042.md"
+    - "use-stdlib-struct-exif-parser-when__967ec7025f31.md"
+    - "find-exif-behind-app1-marker-0xffe1__b20480acbb88.md"
+
+# ─────── Codebase-exploration family (shared "grep a path scope" pattern) ───────
+
+- id: t2-imports
+  prompt: "List every file under /workspace/src or /workspace/tests that imports the `parser` module."
+  outcome_match_all:
+    - "writer.py"
+    - "api.py"
+    - "test_parser.py"
+  outcome_match_any: []
+  expected_guideline_filenames: []
+  seed: codebase
+
+- id: t3-todos
+  prompt: "Find every TODO comment under /workspace/ — show the file and the matching line."
+  outcome_match_all:
+    - "parser.py"
+    - "TODO"
+  outcome_match_any: []
+  expected_guideline_filenames: []
+  seed: codebase
+
+- id: t4-defs
+  prompt: "How many Python `def` function definitions are in /workspace/src/? Reply with just the integer."
+  outcome_match_all:
+    - "4"
+  outcome_match_any: []
+  expected_guideline_filenames: []
+  seed: codebase
+
+# ─────── Disjoint task (no shared pattern; tests skip-when-inapplicable) ───────
+
+- id: t5-base64
+  prompt: "Convert the ASCII string 'Hello, World!' to standard base64. Reply with just the base64 string."
+  outcome_match_all:
+    - "SGVsbG8sIFdvcmxkIQ=="
+  outcome_match_any: []
+  expected_guideline_filenames: []
+
+# ─────── Real-task family: data/file-format parsing ───────
+# Each trial seeds a tiny valid sample of the format under /workspace/.
+# The agent's job is to read specific fields. Outcome scoring is loose —
+# we want a parser-recipe atomic to fall out of each trajectory.
+
+- id: t6-png-dim
+  prompt: "Read the width and height of /workspace/sample.png. Reply as 'WIDTHxHEIGHT'."
+  outcome_match_all: ["100x100"]
+  outcome_match_any: []
+  expected_guideline_filenames: []
+  seed: image-formats
+
+- id: t7-gif-dim
+  prompt: "Read the GIF version string and dimensions from /workspace/sample.gif. Reply on one line."
+  outcome_match_all: ["GIF89a", "50", "30"]
+  outcome_match_any: []
+  expected_guideline_filenames: []
+  seed: image-formats
+
+- id: t8-bmp-info
+  prompt: "Read the width and bit depth of /workspace/sample.bmp."
+  outcome_match_all: ["4", "24"]
+  outcome_match_any: []
+  expected_guideline_filenames: []
+  seed: image-formats
+
+- id: t9-webp-dim
+  prompt: "Read the width and height of /workspace/sample.webp. Reply as 'WIDTHxHEIGHT'."
+  outcome_match_all: ["32x32"]
+  outcome_match_any: []
+  expected_guideline_filenames: []
+  seed: image-formats
+
+- id: t10-zip-list
+  prompt: "List all entry names inside /workspace/sample.zip, one per line."
+  outcome_match_all: ["foo.txt", "bar/baz.txt"]
+  outcome_match_any: []
+  expected_guideline_filenames: []
+  seed: archive-formats
+
+- id: t11-tar-list
+  prompt: "List all entries in /workspace/sample.tar."
+  outcome_match_all: ["alpha.txt", "nested/beta.txt"]
+  outcome_match_any: []
+  expected_guideline_filenames: []
+  seed: archive-formats
+
+- id: t12-wav-info
+  prompt: "What is the sample rate and channel count of /workspace/sample.wav?"
+  outcome_match_all: ["44100", "1"]
+  outcome_match_any: []
+  expected_guideline_filenames: []
+  seed: archive-formats
+
+- id: t13-gzip-dec
+  prompt: "Decompress /workspace/sample.gz and show the first line."
+  outcome_match_all: ["hello"]
+  outcome_match_any: []
+  expected_guideline_filenames: []
+  seed: archive-formats
+
+- id: t14-csv-quoted
+  prompt: "How many rows in /workspace/data.csv contain a quoted comma in any field? Reply with just the integer."
+  outcome_match_all: ["2"]
+  outcome_match_any: []
+  expected_guideline_filenames: []
+  seed: text-formats
+
+- id: t15-jsonl-kinds
+  prompt: "Count the distinct values of the 'kind' field across /workspace/events.jsonl. Reply with just the integer."
+  outcome_match_all: ["3"]
+  outcome_match_any: []
+  expected_guideline_filenames: []
+  seed: text-formats
+
+- id: t16-ini-key
+  prompt: "Read the value of [server].port from /workspace/config.ini."
+  outcome_match_all: ["8080"]
+  outcome_match_any: []
+  expected_guideline_filenames: []
+  seed: text-formats
+
+- id: t17-log-errors
+  prompt: "How many ERROR lines are in /workspace/app.log? Reply with just the integer."
+  outcome_match_all: ["2"]
+  outcome_match_any: []
+  expected_guideline_filenames: []
+  seed: text-formats
diff --git a/explorations/agent-wiki/experiments/metrics/pruned-fixed-9atomic.metrics.jsonl b/explorations/agent-wiki/experiments/metrics/pruned-fixed-9atomic.metrics.jsonl
new file mode 100644
index 00000000..79af8713
--- /dev/null
+++ b/explorations/agent-wiki/experiments/metrics/pruned-fixed-9atomic.metrics.jsonl
@@ -0,0 +1,48 @@
+{"session_id": "7c8a5190-c3c4-4407-b756-01ebef07b311", "duration_s": 34.73, "total_cost_usd": 0.4125532499999999, "input_tokens": 4511, "cache_creation_input_tokens": 117711, "cache_read_input_tokens": 361546, "output_tokens": 238, "billable_tokens_proxy": 122460, "tool_calls": 6, "wiki_reads_total": 2, "agents_md_read": true, "index_read": true, "guideline_reads": 0, "final_text_len": 593, "task": "t1-lens-model", "batch": 5, "condition": "claude_md_strong", "trial": 1, "outcome_match": true}
+{"session_id": "fdae97ce-f68f-44b0-9690-1676880cec46", "duration_s": 26.47, "total_cost_usd": 0.22893075000000002, "input_tokens": 4513, "cache_creation_input_tokens": 22092, "cache_read_input_tokens": 496348, "output_tokens": 217, "billable_tokens_proxy": 26822, "tool_calls": 6, "wiki_reads_total": 2, "agents_md_read": true, "index_read": true, "guideline_reads": 0, "final_text_len": 469, "task": "t1-lens-model", "batch": 5, "condition": "claude_md_strong", "trial": 2, "outcome_match": true}
+{"session_id": "990d60d5-deee-495c-b15c-69c5d18f4479", "duration_s": 27.33, "total_cost_usd": 0.2058895, "input_tokens": 4511, "cache_creation_input_tokens": 21566, "cache_read_input_tokens": 459725, "output_tokens": 204, "billable_tokens_proxy": 26281, "tool_calls": 5, "wiki_reads_total": 2, "agents_md_read": true, "index_read": true, "guideline_reads": 0, "final_text_len": 531, "task": "t1-lens-model", "batch": 5, "condition": "claude_md_strong", "trial": 3, "outcome_match": true}
+{"session_id": "4be1c1f6-4b38-4370-8627-2df1ddf33ca3", "duration_s": 21.68, "total_cost_usd": 0.197542, "input_tokens": 4378, "cache_creation_input_tokens": 23247, "cache_read_input_tokens": 369224, "output_tokens": 205, "billable_tokens_proxy": 27830, "tool_calls": 5, "wiki_reads_total": 2, "agents_md_read": true, "index_read": true, "guideline_reads": 0, "final_text_len": 352, "task": "t6-png-dim", "batch": 5, "condition": "claude_md_strong", "trial": 1, "outcome_match": true}
+{"session_id": "07c82bdc-e20e-4991-bf37-6cdff007606d", "duration_s": 22.88, "total_cost_usd": 0.21491375000000001, "input_tokens": 2926, "cache_creation_input_tokens": 15683, "cache_read_input_tokens": 391841, "output_tokens": 272, "billable_tokens_proxy": 18881, "tool_calls": 6, "wiki_reads_total": 2, "agents_md_read": true, "index_read": true, "guideline_reads": 0, "final_text_len": 254, "task": "t6-png-dim", "batch": 5, "condition": "claude_md_strong", "trial": 2, "outcome_match": true}
+{"session_id": "042a3740-e31b-4d8f-a746-fbef9f152a0c", "duration_s": 23.89, "total_cost_usd": 0.16158, "input_tokens": 2922, "cache_creation_input_tokens": 6829, "cache_read_input_tokens": 319025, "output_tokens": 157, "billable_tokens_proxy": 9908, "tool_calls": 5, "wiki_reads_total": 2, "agents_md_read": true, "index_read": true, "guideline_reads": 0, "final_text_len": 294, "task": "t6-png-dim", "batch": 5, "condition": "claude_md_strong", "trial": 3, "outcome_match": true}
+{"session_id": "9588ae2c-3a9e-4f1b-b02b-125c11243e1f", "duration_s": 29.05, "total_cost_usd": 0.2268165, "input_tokens": 4511, "cache_creation_input_tokens": 25221, "cache_read_input_tokens": 450304, "output_tokens": 218, "billable_tokens_proxy": 29950, "tool_calls": 6, "wiki_reads_total": 2, "agents_md_read": true, "index_read": true, "guideline_reads": 0, "final_text_len": 237, "task": "t7-gif-dim", "batch": 5, "condition": "claude_md_strong", "trial": 1, "outcome_match": true}
+{"session_id": "403e9acf-4119-4d4a-aad9-69402b0f260b", "duration_s": 26.04, "total_cost_usd": 0.21812274999999998, "input_tokens": 4511, "cache_creation_input_tokens": 20783, "cache_read_input_tokens": 454781, "output_tokens": 248, "billable_tokens_proxy": 25542, "tool_calls": 6, "wiki_reads_total": 2, "agents_md_read": true, "index_read": true, "guideline_reads": 0, "final_text_len": 184, "task": "t7-gif-dim", "batch": 5, "condition": "claude_md_strong", "trial": 2, "outcome_match": true}
+{"session_id": "c78ff8ce-11c7-4c21-9504-22001912ea44", "duration_s": 20.63, "total_cost_usd": 0.1834575, "input_tokens": 2922, "cache_creation_input_tokens": 13942, "cache_read_input_tokens": 312071, "output_tokens": 197, "billable_tokens_proxy": 17061, "tool_calls": 5, "wiki_reads_total": 2, "agents_md_read": true, "index_read": true, "guideline_reads": 0, "final_text_len": 290, "task": "t7-gif-dim", "batch": 5, "condition": "claude_md_strong", "trial": 3, "outcome_match": true}
+{"session_id": "ceac0e04-b950-479c-a42c-45f221b13353", "duration_s": 20.72, "total_cost_usd": 0.19524425, "input_tokens": 4378, "cache_creation_input_tokens": 22056, "cache_read_input_tokens": 370045, "output_tokens": 203, "billable_tokens_proxy": 26637, "tool_calls": 5, "wiki_reads_total": 2, "agents_md_read": true, "index_read": true, "guideline_reads": 0, "final_text_len": 394, "task": "t8-bmp-info", "batch": 5, "condition": "claude_md_strong", "trial": 1, "outcome_match": true}
+{"session_id": "7f8c9f3d-61c5-411f-a68f-6e6363d31af0", "duration_s": 28.51, "total_cost_usd": 0.24813849999999998, "input_tokens": 4515, "cache_creation_input_tokens": 21906, "cache_read_input_tokens": 533354, "output_tokens": 267, "billable_tokens_proxy": 26688, "tool_calls": 7, "wiki_reads_total": 2, "agents_md_read": true, "index_read": true, "guideline_reads": 0, "final_text_len": 611, "task": "t8-bmp-info", "batch": 5, "condition": "claude_md_strong", "trial": 2, "outcome_match": true}
+{"session_id": "a789846e-c293-422e-9b70-acca6a416735", "duration_s": 31.18, "total_cost_usd": 0.251444, "input_tokens": 4648, "cache_creation_input_tokens": 22575, "cache_read_input_tokens": 612324, "output_tokens": 286, "billable_tokens_proxy": 27509, "tool_calls": 7, "wiki_reads_total": 2, "agents_md_read": true, "index_read": true, "guideline_reads": 0, "final_text_len": 643, "task": "t8-bmp-info", "batch": 5, "condition": "claude_md_strong", "trial": 3, "outcome_match": true}
+{"session_id": "4d7dc5d1-b845-4aee-84b4-9b2c40279b49", "duration_s": 20.97, "total_cost_usd": 0.18929375, "input_tokens": 2922, "cache_creation_input_tokens": 16820, "cache_read_input_tokens": 308898, "output_tokens": 131, "billable_tokens_proxy": 19873, "tool_calls": 5, "wiki_reads_total": 2, "agents_md_read": true, "index_read": true, "guideline_reads": 0, "final_text_len": 216, "task": "t9-webp-dim", "batch": 5, "condition": "claude_md_strong", "trial": 1, "outcome_match": true}
+{"session_id": "a0073b77-3806-417c-a1f8-650db98a995a", "duration_s": 18.81, "total_cost_usd": 0.1834305, "input_tokens": 4378, "cache_creation_input_tokens": 17580, "cache_read_input_tokens": 374566, "output_tokens": 202, "billable_tokens_proxy": 22160, "tool_calls": 5, "wiki_reads_total": 2, "agents_md_read": true, "index_read": true, "guideline_reads": 0, "final_text_len": 229, "task": "t9-webp-dim", "batch": 5, "condition": "claude_md_strong", "trial": 2, "outcome_match": true}
+{"session_id": "b599db26-e160-45d6-80d6-78f5e4c3a59a", "duration_s": 18.18, "total_cost_usd": 0.18285675, "input_tokens": 2922, "cache_creation_input_tokens": 13923, "cache_read_input_tokens": 312074, "output_tokens": 140, "billable_tokens_proxy": 16985, "tool_calls": 5, "wiki_reads_total": 2, "agents_md_read": true, "index_read": true, "guideline_reads": 0, "final_text_len": 302, "task": "t9-webp-dim", "batch": 5, "condition": "claude_md_strong", "trial": 3, "outcome_match": true}
+{"session_id": "fb38dbe0-dd4d-4e74-822d-bf339e7d17f7", "duration_s": 12.26, "total_cost_usd": 0.1425305, "input_tokens": 2920, "cache_creation_input_tokens": 18813, "cache_read_input_tokens": 266099, "output_tokens": 230, "billable_tokens_proxy": 21963, "tool_calls": 3, "wiki_reads_total": 2, "agents_md_read": true, "index_read": true, "guideline_reads": 0, "final_text_len": 205, "task": "t10-zip-list", "batch": 5, "condition": "claude_md_strong", "trial": 1, "outcome_match": true}
+{"session_id": "dda2f6ac-cfed-4776-9c2f-7064a412ed1b", "duration_s": 15.08, "total_cost_usd": 0.13581, "input_tokens": 4372, "cache_creation_input_tokens": 13079, "cache_read_input_tokens": 264177, "output_tokens": 108, "billable_tokens_proxy": 17559, "tool_calls": 3, "wiki_reads_total": 2, "agents_md_read": true, "index_read": true, "guideline_reads": 0, "final_text_len": 318, "task": "t10-zip-list", "batch": 5, "condition": "claude_md_strong", "trial": 2, "outcome_match": true}
+{"session_id": "b661f24a-0dc2-4a5e-8e1b-159fa9c22846", "duration_s": 17.46, "total_cost_usd": 0.1629325, "input_tokens": 4378, "cache_creation_input_tokens": 18146, "cache_read_input_tokens": 373430, "output_tokens": 186, "billable_tokens_proxy": 22710, "tool_calls": 4, "wiki_reads_total": 3, "agents_md_read": true, "index_read": true, "guideline_reads": 1, "final_text_len": 257, "task": "t10-zip-list", "batch": 5, "condition": "claude_md_strong", "trial": 3, "outcome_match": true}
+{"session_id": "1b3792b1-e81b-4191-a243-3568ec894d21", "duration_s": 17.58, "total_cost_usd": 0.1751265, "input_tokens": 4380, "cache_creation_input_tokens": 25446, "cache_read_input_tokens": 404240, "output_tokens": 308, "billable_tokens_proxy": 30134, "tool_calls": 4, "wiki_reads_total": 3, "agents_md_read": true, "index_read": true, "guideline_reads": 1, "final_text_len": 646, "task": "t11-tar-list", "batch": 5, "condition": "claude_md_strong", "trial": 1, "outcome_match": true}
+{"session_id": "1c0c6d0e-0cbc-4c5f-8b6d-be0c40ea75c5", "duration_s": 19.9, "total_cost_usd": 0.16556325, "input_tokens": 4378, "cache_creation_input_tokens": 18152, "cache_read_input_tokens": 373351, "output_tokens": 202, "billable_tokens_proxy": 22732, "tool_calls": 4, "wiki_reads_total": 3, "agents_md_read": true, "index_read": true, "guideline_reads": 1, "final_text_len": 567, "task": "t11-tar-list", "batch": 5, "condition": "claude_md_strong", "trial": 2, "outcome_match": true}
+{"session_id": "49f7e63c-2980-4680-ad16-dc964461271c", "duration_s": 18.54, "total_cost_usd": 0.16579149999999998, "input_tokens": 4378, "cache_creation_input_tokens": 18164, "cache_read_input_tokens": 373379, "output_tokens": 191, "billable_tokens_proxy": 22733, "tool_calls": 4, "wiki_reads_total": 3, "agents_md_read": true, "index_read": true, "guideline_reads": 1, "final_text_len": 593, "task": "t11-tar-list", "batch": 5, "condition": "claude_md_strong", "trial": 3, "outcome_match": true}
+{"session_id": "404c51fc-3a39-4444-a707-39ac869632fb", "duration_s": 20.27, "total_cost_usd": 0.17470725, "input_tokens": 4378, "cache_creation_input_tokens": 22675, "cache_read_input_tokens": 369116, "output_tokens": 169, "billable_tokens_proxy": 27222, "tool_calls": 4, "wiki_reads_total": 3, "agents_md_read": true, "index_read": true, "guideline_reads": 1, "final_text_len": 449, "task": "t12-wav-info", "batch": 5, "condition": "claude_md_strong", "trial": 1, "outcome_match": true}
+{"session_id": "d6715f58-a742-47db-8f38-7c48bd11d19b", "duration_s": 22.6, "total_cost_usd": 0.16681724999999997, "input_tokens": 4378, "cache_creation_input_tokens": 18293, "cache_read_input_tokens": 373483, "output_tokens": 168, "billable_tokens_proxy": 22839, "tool_calls": 4, "wiki_reads_total": 3, "agents_md_read": true, "index_read": true, "guideline_reads": 1, "final_text_len": 404, "task": "t12-wav-info", "batch": 5, "condition": "claude_md_strong", "trial": 2, "outcome_match": true}
+{"session_id": "647cf207-0b3a-4cd2-b017-a62f98c29108", "duration_s": 20.38, "total_cost_usd": 0.16735425, "input_tokens": 4378, "cache_creation_input_tokens": 17624, "cache_read_input_tokens": 377078, "output_tokens": 208, "billable_tokens_proxy": 22210, "tool_calls": 4, "wiki_reads_total": 3, "agents_md_read": true, "index_read": true, "guideline_reads": 1, "final_text_len": 412, "task": "t12-wav-info", "batch": 5, "condition": "claude_md_strong", "trial": 3, "outcome_match": true}
+{"session_id": "54566f1b-8504-4c89-8d15-9b606c3fd3a2", "duration_s": 19.94, "total_cost_usd": 0.17040075000000002, "input_tokens": 4378, "cache_creation_input_tokens": 20332, "cache_read_input_tokens": 373741, "output_tokens": 252, "billable_tokens_proxy": 24962, "tool_calls": 4, "wiki_reads_total": 3, "agents_md_read": true, "index_read": true, "guideline_reads": 1, "final_text_len": 309, "task": "t13-gzip-dec", "batch": 5, "condition": "claude_md_strong", "trial": 1, "outcome_match": true}
+{"session_id": "623a1efd-f649-47a9-8088-5264b47bd75f", "duration_s": 18.32, "total_cost_usd": 0.1635035, "input_tokens": 4380, "cache_creation_input_tokens": 20894, "cache_read_input_tokens": 408584, "output_tokens": 292, "billable_tokens_proxy": 25566, "tool_calls": 4, "wiki_reads_total": 3, "agents_md_read": true, "index_read": true, "guideline_reads": 1, "final_text_len": 464, "task": "t13-gzip-dec", "batch": 5, "condition": "claude_md_strong", "trial": 2, "outcome_match": true}
+{"session_id": "1fa789b2-5f6c-497f-81eb-c1f25983b335", "duration_s": 26.12, "total_cost_usd": 0.17715399999999998, "input_tokens": 4382, "cache_creation_input_tokens": 22248, "cache_read_input_tokens": 447081, "output_tokens": 380, "billable_tokens_proxy": 27010, "tool_calls": 4, "wiki_reads_total": 3, "agents_md_read": true, "index_read": true, "guideline_reads": 1, "final_text_len": 411, "task": "t13-gzip-dec", "batch": 5, "condition": "claude_md_strong", "trial": 3, "outcome_match": true}
+{"session_id": "cd32d694-064c-489d-bf8a-47d615b6e85f", "duration_s": 23.04, "total_cost_usd": 0.21673325000000002, "input_tokens": 4380, "cache_creation_input_tokens": 17450, "cache_read_input_tokens": 418607, "output_tokens": 184, "billable_tokens_proxy": 22014, "tool_calls": 6, "wiki_reads_total": 2, "agents_md_read": true, "index_read": true, "guideline_reads": 0, "final_text_len": 161, "task": "t14-csv-quoted", "batch": 5, "condition": "claude_md_strong", "trial": 1, "outcome_match": true}
+{"session_id": "5f483ec6-3805-4d8b-9d19-cd2d25b978e2", "duration_s": 23.78, "total_cost_usd": 0.20795100000000002, "input_tokens": 4378, "cache_creation_input_tokens": 14416, "cache_read_input_tokens": 382292, "output_tokens": 135, "billable_tokens_proxy": 18929, "tool_calls": 6, "wiki_reads_total": 2, "agents_md_read": true, "index_read": true, "guideline_reads": 0, "final_text_len": 191, "task": "t14-csv-quoted", "batch": 5, "condition": "claude_md_strong", "trial": 2, "outcome_match": true}
+{"session_id": "83ab86b2-552d-4685-9594-acc8c0e1d734", "duration_s": 19.0, "total_cost_usd": 0.20568650000000002, "input_tokens": 2924, "cache_creation_input_tokens": 14295, "cache_read_input_tokens": 351494, "output_tokens": 170, "billable_tokens_proxy": 17389, "tool_calls": 6, "wiki_reads_total": 2, "agents_md_read": true, "index_read": true, "guideline_reads": 0, "final_text_len": 187, "task": "t14-csv-quoted", "batch": 5, "condition": "claude_md_strong", "trial": 3, "outcome_match": true}
+{"session_id": "f4495bd6-9475-41d0-9791-5fda16765a3a", "duration_s": 16.71, "total_cost_usd": 0.14635025000000002, "input_tokens": 4378, "cache_creation_input_tokens": 24404, "cache_read_input_tokens": 365519, "output_tokens": 345, "billable_tokens_proxy": 29127, "tool_calls": 3, "wiki_reads_total": 2, "agents_md_read": true, "index_read": true, "guideline_reads": 0, "final_text_len": 116, "task": "t15-jsonl-kinds", "batch": 5, "condition": "claude_md_strong", "trial": 1, "outcome_match": true}
+{"session_id": "8d389a49-91b7-4e3b-8121-05cb45374df5", "duration_s": 23.92, "total_cost_usd": 0.16239525, "input_tokens": 2924, "cache_creation_input_tokens": 16640, "cache_read_input_tokens": 346225, "output_tokens": 288, "billable_tokens_proxy": 19852, "tool_calls": 4, "wiki_reads_total": 2, "agents_md_read": true, "index_read": true, "guideline_reads": 0, "final_text_len": 112, "task": "t15-jsonl-kinds", "batch": 5, "condition": "claude_md_strong", "trial": 2, "outcome_match": true}
+{"session_id": "20196bad-a3b9-4a5b-a275-28724f8a46cf", "duration_s": 19.55, "total_cost_usd": 0.1627915, "input_tokens": 2924, "cache_creation_input_tokens": 16704, "cache_read_input_tokens": 346419, "output_tokens": 301, "billable_tokens_proxy": 19929, "tool_calls": 4, "wiki_reads_total": 2, "agents_md_read": true, "index_read": true, "guideline_reads": 0, "final_text_len": 112, "task": "t15-jsonl-kinds", "batch": 5, "condition": "claude_md_strong", "trial": 3, "outcome_match": true}
+{"session_id": "15362919-a8c0-4583-8943-42028a337973", "duration_s": 14.63, "total_cost_usd": 0.15113525, "input_tokens": 4451, "cache_creation_input_tokens": 27635, "cache_read_input_tokens": 362432, "output_tokens": 330, "billable_tokens_proxy": 32416, "tool_calls": 4, "wiki_reads_total": 3, "agents_md_read": true, "index_read": true, "guideline_reads": 1, "final_text_len": 284, "task": "t16-ini-key", "batch": 5, "condition": "claude_md_strong", "trial": 1, "outcome_match": true}
+{"session_id": "ec2bfee1-0e27-49f2-bfa0-536e38d5bf25", "duration_s": 14.43, "total_cost_usd": 0.142381, "input_tokens": 4449, "cache_creation_input_tokens": 19510, "cache_read_input_tokens": 335218, "output_tokens": 223, "billable_tokens_proxy": 24182, "tool_calls": 4, "wiki_reads_total": 3, "agents_md_read": true, "index_read": true, "guideline_reads": 1, "final_text_len": 320, "task": "t16-ini-key", "batch": 5, "condition": "claude_md_strong", "trial": 2, "outcome_match": true}
+{"session_id": "c36627a3-ef9d-430b-abc9-711c067b6c6c", "duration_s": 14.3, "total_cost_usd": 0.13484875, "input_tokens": 4376, "cache_creation_input_tokens": 19625, "cache_read_input_tokens": 331474, "output_tokens": 277, "billable_tokens_proxy": 24278, "tool_calls": 3, "wiki_reads_total": 2, "agents_md_read": true, "index_read": true, "guideline_reads": 0, "final_text_len": 198, "task": "t16-ini-key", "batch": 5, "condition": "claude_md_strong", "trial": 3, "outcome_match": true}
+{"session_id": "ed65e658-11f4-4512-89b9-23cf108e7d49", "duration_s": 11.36, "total_cost_usd": 0.14075725, "input_tokens": 2920, "cache_creation_input_tokens": 18856, "cache_read_input_tokens": 266193, "output_tokens": 228, "billable_tokens_proxy": 22004, "tool_calls": 3, "wiki_reads_total": 2, "agents_md_read": true, "index_read": true, "guideline_reads": 0, "final_text_len": 83, "task": "t17-log-errors", "batch": 5, "condition": "claude_md_strong", "trial": 1, "outcome_match": true}
+{"session_id": "8fe66f70-94d5-4d20-8a38-7436feb3a4de", "duration_s": 16.26, "total_cost_usd": 0.1372875, "input_tokens": 2920, "cache_creation_input_tokens": 16303, "cache_read_input_tokens": 269316, "output_tokens": 246, "billable_tokens_proxy": 19469, "tool_calls": 3, "wiki_reads_total": 2, "agents_md_read": true, "index_read": true, "guideline_reads": 0, "final_text_len": 102, "task": "t17-log-errors", "batch": 5, "condition": "claude_md_strong", "trial": 2, "outcome_match": true}
+{"session_id": "763f450a-c185-4957-90e3-4f2b9cd0f5a0", "duration_s": 25.11, "total_cost_usd": 0.13775300000000001, "input_tokens": 2922, "cache_creation_input_tokens": 16558, "cache_read_input_tokens": 307753, "output_tokens": 296, "billable_tokens_proxy": 19776, "tool_calls": 3, "wiki_reads_total": 2, "agents_md_read": true, "index_read": true, "guideline_reads": 0, "final_text_len": 1, "task": "t17-log-errors", "batch": 5, "condition": "claude_md_strong", "trial": 3, "outcome_match": true}
+{"session_id": "cba7a332-7455-4b77-bd58-e7d5d2c1bad4", "duration_s": 24.0, "total_cost_usd": 0.1672185, "input_tokens": 4376, "cache_creation_input_tokens": 25206, "cache_read_input_tokens": 327516, "output_tokens": 243, "billable_tokens_proxy": 29825, "tool_calls": 3, "wiki_reads_total": 2, "agents_md_read": true, "index_read": true, "guideline_reads": 0, "final_text_len": 543, "task": "t2-imports", "batch": 5, "condition": "claude_md_strong", "trial": 1, "outcome_match": false}
+{"session_id": "47122d6c-350e-4bdb-9c6d-1dd2758cc533", "duration_s": 30.43, "total_cost_usd": 0.16308475, "input_tokens": 5828, "cache_creation_input_tokens": 25532, "cache_read_input_tokens": 321936, "output_tokens": 298, "billable_tokens_proxy": 31658, "tool_calls": 4, "wiki_reads_total": 2, "agents_md_read": true, "index_read": true, "guideline_reads": 0, "final_text_len": 1474, "task": "t2-imports", "batch": 5, "condition": "claude_md_strong", "trial": 2, "outcome_match": true}
+{"session_id": "0a0196bf-e2b4-489b-84eb-7db8f117cbd4", "duration_s": 43.35, "total_cost_usd": 0.229501, "input_tokens": 4648, "cache_creation_input_tokens": 24874, "cache_read_input_tokens": 607385, "output_tokens": 434, "billable_tokens_proxy": 29956, "tool_calls": 5, "wiki_reads_total": 2, "agents_md_read": true, "index_read": true, "guideline_reads": 0, "final_text_len": 1135, "task": "t2-imports", "batch": 5, "condition": "claude_md_strong", "trial": 3, "outcome_match": true}
+{"session_id": "e3e2d412-34bc-40ce-a255-fe95fbf5e11c", "duration_s": 37.0, "total_cost_usd": 0.213483, "input_tokens": 4384, "cache_creation_input_tokens": 30810, "cache_read_input_tokens": 482384, "output_tokens": 362, "billable_tokens_proxy": 35556, "tool_calls": 4, "wiki_reads_total": 2, "agents_md_read": true, "index_read": true, "guideline_reads": 0, "final_text_len": 664, "task": "t3-todos", "batch": 5, "condition": "claude_md_strong", "trial": 1, "outcome_match": true}
+{"session_id": "b197bceb-4321-4544-98fb-bbe0904b6e79", "duration_s": 25.58, "total_cost_usd": 0.1806155, "input_tokens": 4380, "cache_creation_input_tokens": 18024, "cache_read_input_tokens": 415155, "output_tokens": 315, "billable_tokens_proxy": 22719, "tool_calls": 4, "wiki_reads_total": 2, "agents_md_read": true, "index_read": true, "guideline_reads": 0, "final_text_len": 546, "task": "t3-todos", "batch": 5, "condition": "claude_md_strong", "trial": 2, "outcome_match": true}
+{"session_id": "71596a05-560e-4fcf-8435-b92b247b2615", "duration_s": 31.93, "total_cost_usd": 0.192006, "input_tokens": 4384, "cache_creation_input_tokens": 23670, "cache_read_input_tokens": 487267, "output_tokens": 381, "billable_tokens_proxy": 28435, "tool_calls": 4, "wiki_reads_total": 3, "agents_md_read": true, "index_read": true, "guideline_reads": 1, "final_text_len": 608, "task": "t3-todos", "batch": 5, "condition": "claude_md_strong", "trial": 3, "outcome_match": true}
+{"session_id": "bca18d77-e9a5-448f-889f-3af8cd85f126", "duration_s": 14.26, "total_cost_usd": 0.09264325000000001, "input_tokens": 2912, "cache_creation_input_tokens": 10894, "cache_read_input_tokens": 121638, "output_tokens": 130, "billable_tokens_proxy": 13936, "tool_calls": 1, "wiki_reads_total": 1, "agents_md_read": true, "index_read": false, "guideline_reads": 0, "final_text_len": 175, "task": "t5-base64", "batch": 5, "condition": "claude_md_strong", "trial": 1, "outcome_match": true}
+{"session_id": "92eeb94c-3a8f-4809-87b1-2bdf80e96650", "duration_s": 24.35, "total_cost_usd": 0.128421, "input_tokens": 2918, "cache_creation_input_tokens": 8526, "cache_read_input_tokens": 231927, "output_tokens": 207, "billable_tokens_proxy": 11651, "tool_calls": 3, "wiki_reads_total": 1, "agents_md_read": true, "index_read": false, "guideline_reads": 0, "final_text_len": 152, "task": "t5-base64", "batch": 5, "condition": "claude_md_strong", "trial": 2, "outcome_match": true}
+{"session_id": "015b692d-b0f6-4811-a5d7-56a8cd4d01e6", "duration_s": 10.59, "total_cost_usd": 0.07611175, "input_tokens": 2912, "cache_creation_input_tokens": 7206, "cache_read_input_tokens": 124612, "output_tokens": 140, "billable_tokens_proxy": 10258, "tool_calls": 1, "wiki_reads_total": 1, "agents_md_read": true, "index_read": false, "guideline_reads": 0, "final_text_len": 150, "task": "t5-base64", "batch": 5, "condition": "claude_md_strong", "trial": 3, "outcome_match": true}
diff --git a/explorations/agent-wiki/experiments/metrics/twobatch-both.metrics.jsonl b/explorations/agent-wiki/experiments/metrics/twobatch-both.metrics.jsonl
new file mode 100644
index 00000000..c1d6bf76
--- /dev/null
+++ b/explorations/agent-wiki/experiments/metrics/twobatch-both.metrics.jsonl
@@ -0,0 +1,48 @@
+{"session_id": "f34f3996-c546-4642-8d7e-a801a17a4c52", "duration_s": 46.84, "total_cost_usd": 0.39758299999999996, "input_tokens": 4382, "cache_creation_input_tokens": 123870, "cache_read_input_tokens": 362823, "output_tokens": 295, "billable_tokens_proxy": 128547, "tool_calls": 5, "wiki_reads_total": 2, "agents_md_read": true, "index_read": true, "guideline_reads": 0, "final_text_len": 688, "task": "t1-lens-model", "batch": 4, "condition": "claude_md_strong", "trial": 1, "outcome_match": true}
+{"session_id": "821a2e8b-94c4-4693-8314-784a67615601", "duration_s": 29.21, "total_cost_usd": 0.188162, "input_tokens": 4382, "cache_creation_input_tokens": 27690, "cache_read_input_tokens": 458521, "output_tokens": 338, "billable_tokens_proxy": 32410, "tool_calls": 4, "wiki_reads_total": 2, "agents_md_read": true, "index_read": true, "guideline_reads": 0, "final_text_len": 632, "task": "t1-lens-model", "batch": 4, "condition": "claude_md_strong", "trial": 2, "outcome_match": true}
+{"session_id": "3b8035a9-2b5b-40dd-b814-9d4c24a2f618", "duration_s": 26.29, "total_cost_usd": 0.20712825, "input_tokens": 5965, "cache_creation_input_tokens": 33559, "cache_read_input_tokens": 482637, "output_tokens": 318, "billable_tokens_proxy": 39842, "tool_calls": 6, "wiki_reads_total": 2, "agents_md_read": true, "index_read": true, "guideline_reads": 0, "final_text_len": 714, "task": "t1-lens-model", "batch": 4, "condition": "claude_md_strong", "trial": 3, "outcome_match": true}
+{"session_id": "8f0243bf-b468-47a3-9316-9e52bfd6edd6", "duration_s": 20.09, "total_cost_usd": 0.15646474999999999, "input_tokens": 4374, "cache_creation_input_tokens": 24600, "cache_read_input_tokens": 293285, "output_tokens": 202, "billable_tokens_proxy": 29176, "tool_calls": 3, "wiki_reads_total": 2, "agents_md_read": true, "index_read": true, "guideline_reads": 0, "final_text_len": 357, "task": "t10-zip-list", "batch": 4, "condition": "claude_md_strong", "trial": 1, "outcome_match": true}
+{"session_id": "b466fb8f-53b1-487a-9bbb-d3fc0398a07e", "duration_s": 19.7, "total_cost_usd": 0.15006325, "input_tokens": 4376, "cache_creation_input_tokens": 24915, "cache_read_input_tokens": 333178, "output_tokens": 317, "billable_tokens_proxy": 29608, "tool_calls": 3, "wiki_reads_total": 2, "agents_md_read": true, "index_read": true, "guideline_reads": 0, "final_text_len": 337, "task": "t10-zip-list", "batch": 4, "condition": "claude_md_strong", "trial": 2, "outcome_match": true}
+{"session_id": "ba78ef87-433c-4990-be93-5931b256448d", "duration_s": 24.32, "total_cost_usd": 0.146872, "input_tokens": 2920, "cache_creation_input_tokens": 21133, "cache_read_input_tokens": 270769, "output_tokens": 236, "billable_tokens_proxy": 24289, "tool_calls": 3, "wiki_reads_total": 2, "agents_md_read": true, "index_read": true, "guideline_reads": 0, "final_text_len": 276, "task": "t10-zip-list", "batch": 4, "condition": "claude_md_strong", "trial": 3, "outcome_match": true}
+{"session_id": "b941bbba-30ae-441a-bdb5-0d9350c66eba", "duration_s": 121.56, "total_cost_usd": 0.18677925, "input_tokens": 4376, "cache_creation_input_tokens": 20949, "cache_read_input_tokens": 343672, "output_tokens": 111, "billable_tokens_proxy": 25436, "tool_calls": 4, "wiki_reads_total": 3, "agents_md_read": true, "index_read": true, "guideline_reads": 1, "final_text_len": 684, "task": "t11-tar-list", "batch": 4, "condition": "claude_md_strong", "trial": 1, "outcome_match": true}
+{"session_id": "4b5d9415-2545-4676-8c2b-f3898e0da2a6", "duration_s": 31.37, "total_cost_usd": 0.178017, "input_tokens": 4378, "cache_creation_input_tokens": 21664, "cache_read_input_tokens": 378575, "output_tokens": 190, "billable_tokens_proxy": 26232, "tool_calls": 4, "wiki_reads_total": 3, "agents_md_read": true, "index_read": true, "guideline_reads": 1, "final_text_len": 559, "task": "t11-tar-list", "batch": 4, "condition": "claude_md_strong", "trial": 2, "outcome_match": true}
+{"session_id": "870716ff-1e21-4d0f-9830-86f07c16bacb", "duration_s": 24.48, "total_cost_usd": 0.17988025, "input_tokens": 4380, "cache_creation_input_tokens": 26438, "cache_read_input_tokens": 414094, "output_tokens": 289, "billable_tokens_proxy": 31107, "tool_calls": 4, "wiki_reads_total": 3, "agents_md_read": true, "index_read": true, "guideline_reads": 1, "final_text_len": 531, "task": "t11-tar-list", "batch": 4, "condition": "claude_md_strong", "trial": 3, "outcome_match": true}
+{"session_id": "4a659fe5-09e5-439b-88a7-b90e4ef0573f", "duration_s": 34.45, "total_cost_usd": 0.185446, "input_tokens": 4376, "cache_creation_input_tokens": 21074, "cache_read_input_tokens": 343685, "output_tokens": 102, "billable_tokens_proxy": 25552, "tool_calls": 4, "wiki_reads_total": 3, "agents_md_read": true, "index_read": true, "guideline_reads": 1, "final_text_len": 367, "task": "t12-wav-info", "batch": 4, "condition": "claude_md_strong", "trial": 1, "outcome_match": true}
+{"session_id": "f63769e4-ee4a-4de4-a3e2-442cbbf806a4", "duration_s": 24.31, "total_cost_usd": 0.17732474999999998, "input_tokens": 4380, "cache_creation_input_tokens": 26387, "cache_read_input_tokens": 414074, "output_tokens": 269, "billable_tokens_proxy": 31036, "tool_calls": 4, "wiki_reads_total": 3, "agents_md_read": true, "index_read": true, "guideline_reads": 1, "final_text_len": 343, "task": "t12-wav-info", "batch": 4, "condition": "claude_md_strong", "trial": 2, "outcome_match": false}
+{"session_id": "5a5f5215-d544-43ca-b00d-8c1b2f1c6da1", "duration_s": 31.84, "total_cost_usd": 0.18220625, "input_tokens": 4380, "cache_creation_input_tokens": 26568, "cache_read_input_tokens": 414285, "output_tokens": 284, "billable_tokens_proxy": 31232, "tool_calls": 4, "wiki_reads_total": 3, "agents_md_read": true, "index_read": true, "guideline_reads": 1, "final_text_len": 497, "task": "t12-wav-info", "batch": 4, "condition": "claude_md_strong", "trial": 3, "outcome_match": true}
+{"session_id": "5a5eb7d1-0be6-488d-a08e-3350c67f52a1", "duration_s": 34.72, "total_cost_usd": 0.16076925, "input_tokens": 4376, "cache_creation_input_tokens": 29305, "cache_read_input_tokens": 328695, "output_tokens": 314, "billable_tokens_proxy": 33995, "tool_calls": 3, "wiki_reads_total": 2, "agents_md_read": true, "index_read": true, "guideline_reads": 0, "final_text_len": 659, "task": "t13-gzip-dec", "batch": 4, "condition": "claude_md_strong", "trial": 1, "outcome_match": true}
+{"session_id": "36ab8fdc-e2e1-4edd-86eb-8b20730d2e27", "duration_s": 38.77, "total_cost_usd": 0.1877265, "input_tokens": 4382, "cache_creation_input_tokens": 27416, "cache_read_input_tokens": 453966, "output_tokens": 380, "billable_tokens_proxy": 32178, "tool_calls": 4, "wiki_reads_total": 3, "agents_md_read": true, "index_read": true, "guideline_reads": 1, "final_text_len": 349, "task": "t13-gzip-dec", "batch": 4, "condition": "claude_md_strong", "trial": 2, "outcome_match": true}
+{"session_id": "ac10e879-c74e-4bd0-802f-e8702df38e34", "duration_s": 33.21, "total_cost_usd": 0.17253524999999997, "input_tokens": 4376, "cache_creation_input_tokens": 17896, "cache_read_input_tokens": 346509, "output_tokens": 132, "billable_tokens_proxy": 22404, "tool_calls": 4, "wiki_reads_total": 3, "agents_md_read": true, "index_read": true, "guideline_reads": 1, "final_text_len": 315, "task": "t13-gzip-dec", "batch": 4, "condition": "claude_md_strong", "trial": 3, "outcome_match": true}
+{"session_id": "5b45fea7-9709-4a4a-9f52-b7f78c987f83", "duration_s": 29.69, "total_cost_usd": 0.18435825, "input_tokens": 2924, "cache_creation_input_tokens": 25252, "cache_read_input_tokens": 350119, "output_tokens": 319, "billable_tokens_proxy": 28495, "tool_calls": 4, "wiki_reads_total": 2, "agents_md_read": true, "index_read": true, "guideline_reads": 0, "final_text_len": 229, "task": "t14-csv-quoted", "batch": 4, "condition": "claude_md_strong", "trial": 1, "outcome_match": true}
+{"session_id": "f800b07a-199b-4904-977a-0e999d230005", "duration_s": 34.41, "total_cost_usd": 0.22669375, "input_tokens": 2928, "cache_creation_input_tokens": 22917, "cache_read_input_tokens": 436761, "output_tokens": 340, "billable_tokens_proxy": 26185, "tool_calls": 6, "wiki_reads_total": 2, "agents_md_read": true, "index_read": true, "guideline_reads": 0, "final_text_len": 207, "task": "t14-csv-quoted", "batch": 4, "condition": "claude_md_strong", "trial": 2, "outcome_match": true}
+{"session_id": "dba07065-aad8-4795-b540-9e7477832815", "duration_s": 36.31, "total_cost_usd": 0.22354475000000001, "input_tokens": 2926, "cache_creation_input_tokens": 19136, "cache_read_input_tokens": 399348, "output_tokens": 252, "billable_tokens_proxy": 22314, "tool_calls": 6, "wiki_reads_total": 2, "agents_md_read": true, "index_read": true, "guideline_reads": 0, "final_text_len": 287, "task": "t14-csv-quoted", "batch": 4, "condition": "claude_md_strong", "trial": 3, "outcome_match": true}
+{"session_id": "da6f83bc-2b0c-4fea-91d6-faa6bfe80efa", "duration_s": 23.94, "total_cost_usd": 0.15432875, "input_tokens": 2922, "cache_creation_input_tokens": 24375, "cache_read_input_tokens": 307984, "output_tokens": 266, "billable_tokens_proxy": 27563, "tool_calls": 3, "wiki_reads_total": 2, "agents_md_read": true, "index_read": true, "guideline_reads": 0, "final_text_len": 109, "task": "t15-jsonl-kinds", "batch": 4, "condition": "claude_md_strong", "trial": 1, "outcome_match": true}
+{"session_id": "d44154f0-2fb3-4a99-9c83-49e465e112b6", "duration_s": 34.44, "total_cost_usd": 0.14822225, "input_tokens": 2922, "cache_creation_input_tokens": 21557, "cache_read_input_tokens": 310958, "output_tokens": 272, "billable_tokens_proxy": 24751, "tool_calls": 3, "wiki_reads_total": 2, "agents_md_read": true, "index_read": true, "guideline_reads": 0, "final_text_len": 95, "task": "t15-jsonl-kinds", "batch": 4, "condition": "claude_md_strong", "trial": 2, "outcome_match": true}
+{"session_id": "80c47b42-5562-41e7-b978-7509f17469f8", "duration_s": 22.24, "total_cost_usd": 0.148372, "input_tokens": 4376, "cache_creation_input_tokens": 24954, "cache_read_input_tokens": 333340, "output_tokens": 271, "billable_tokens_proxy": 29601, "tool_calls": 3, "wiki_reads_total": 2, "agents_md_read": true, "index_read": true, "guideline_reads": 0, "final_text_len": 158, "task": "t15-jsonl-kinds", "batch": 4, "condition": "claude_md_strong", "trial": 3, "outcome_match": true}
+{"session_id": "21c82a09-e2d0-4224-a89b-fa5ae91b4bd9", "duration_s": 19.93, "total_cost_usd": 0.15534575, "input_tokens": 4376, "cache_creation_input_tokens": 29273, "cache_read_input_tokens": 328720, "output_tokens": 256, "billable_tokens_proxy": 33905, "tool_calls": 3, "wiki_reads_total": 2, "agents_md_read": true, "index_read": true, "guideline_reads": 0, "final_text_len": 297, "task": "t16-ini-key", "batch": 4, "condition": "claude_md_strong", "trial": 1, "outcome_match": true}
+{"session_id": "f81b5b51-be6a-4b37-8bfd-9ac3309e1f5f", "duration_s": 17.62, "total_cost_usd": 0.15208850000000002, "input_tokens": 4378, "cache_creation_input_tokens": 30156, "cache_read_input_tokens": 368352, "output_tokens": 288, "billable_tokens_proxy": 34822, "tool_calls": 4, "wiki_reads_total": 3, "agents_md_read": true, "index_read": true, "guideline_reads": 1, "final_text_len": 240, "task": "t16-ini-key", "batch": 4, "condition": "claude_md_strong", "trial": 2, "outcome_match": true}
+{"session_id": "06aeac92-9899-4b25-998e-de7c9d41f130", "duration_s": 32.58, "total_cost_usd": 0.153356, "input_tokens": 4378, "cache_creation_input_tokens": 30271, "cache_read_input_tokens": 368545, "output_tokens": 330, "billable_tokens_proxy": 34979, "tool_calls": 4, "wiki_reads_total": 3, "agents_md_read": true, "index_read": true, "guideline_reads": 1, "final_text_len": 340, "task": "t16-ini-key", "batch": 4, "condition": "claude_md_strong", "trial": 3, "outcome_match": true}
+{"session_id": "9ed3f248-5b0d-40fe-af2d-b012726ce2b5", "duration_s": 11.29, "total_cost_usd": 0.083054, "input_tokens": 4366, "cache_creation_input_tokens": 10524, "cache_read_input_tokens": 151981, "output_tokens": 192, "billable_tokens_proxy": 15082, "tool_calls": 2, "wiki_reads_total": 1, "agents_md_read": true, "index_read": false, "guideline_reads": 0, "final_text_len": 1, "task": "t17-log-errors", "batch": 4, "condition": "claude_md_strong", "trial": 1, "outcome_match": true}
+{"session_id": "f5905106-63c1-45a6-90a9-33bd727c095c", "duration_s": 23.37, "total_cost_usd": 0.1466545, "input_tokens": 2920, "cache_creation_input_tokens": 21380, "cache_read_input_tokens": 270868, "output_tokens": 230, "billable_tokens_proxy": 24530, "tool_calls": 3, "wiki_reads_total": 2, "agents_md_read": true, "index_read": true, "guideline_reads": 0, "final_text_len": 131, "task": "t17-log-errors", "batch": 4, "condition": "claude_md_strong", "trial": 2, "outcome_match": true}
+{"session_id": "306a666f-d302-4e30-b89f-f9ab1dea5438", "duration_s": 17.6, "total_cost_usd": 0.1455965, "input_tokens": 2920, "cache_creation_input_tokens": 21319, "cache_read_input_tokens": 270858, "output_tokens": 230, "billable_tokens_proxy": 24469, "tool_calls": 3, "wiki_reads_total": 2, "agents_md_read": true, "index_read": true, "guideline_reads": 0, "final_text_len": 91, "task": "t17-log-errors", "batch": 4, "condition": "claude_md_strong", "trial": 3, "outcome_match": true}
+{"session_id": "30467a66-a45c-42dc-9aea-698a0b42bff0", "duration_s": 81.37, "total_cost_usd": 0.248, "input_tokens": 4646, "cache_creation_input_tokens": 33247, "cache_read_input_tokens": 576646, "output_tokens": 374, "billable_tokens_proxy": 38267, "tool_calls": 5, "wiki_reads_total": 2, "agents_md_read": true, "index_read": true, "guideline_reads": 0, "final_text_len": 1003, "task": "t2-imports", "batch": 4, "condition": "claude_md_strong", "trial": 1, "outcome_match": true}
+{"session_id": "d0b5f72a-8688-4fac-819c-51690b919be7", "duration_s": 66.38, "total_cost_usd": 0.25303675000000003, "input_tokens": 4648, "cache_creation_input_tokens": 31134, "cache_read_input_tokens": 622620, "output_tokens": 442, "billable_tokens_proxy": 36224, "tool_calls": 5, "wiki_reads_total": 2, "agents_md_read": true, "index_read": true, "guideline_reads": 0, "final_text_len": 985, "task": "t2-imports", "batch": 4, "condition": "claude_md_strong", "trial": 2, "outcome_match": true}
+{"session_id": "2cba2824-288b-400d-ae4e-72f0bf96bcd6", "duration_s": 35.53, "total_cost_usd": 0.185786, "input_tokens": 4380, "cache_creation_input_tokens": 32824, "cache_read_input_tokens": 407354, "output_tokens": 280, "billable_tokens_proxy": 37484, "tool_calls": 4, "wiki_reads_total": 2, "agents_md_read": true, "index_read": true, "guideline_reads": 0, "final_text_len": 810, "task": "t2-imports", "batch": 4, "condition": "claude_md_strong", "trial": 3, "outcome_match": true}
+{"session_id": "d8bed576-c419-47d0-af6f-ed6ae8f2cfd7", "duration_s": 42.16, "total_cost_usd": 0.21774075, "input_tokens": 4382, "cache_creation_input_tokens": 32639, "cache_read_input_tokens": 450081, "output_tokens": 349, "billable_tokens_proxy": 37370, "tool_calls": 4, "wiki_reads_total": 2, "agents_md_read": true, "index_read": true, "guideline_reads": 0, "final_text_len": 607, "task": "t3-todos", "batch": 4, "condition": "claude_md_strong", "trial": 1, "outcome_match": true}
+{"session_id": "7242f03f-a36b-4530-b939-1682d3d97e23", "duration_s": 44.47, "total_cost_usd": 0.20307600000000003, "input_tokens": 4382, "cache_creation_input_tokens": 27662, "cache_read_input_tokens": 454645, "output_tokens": 344, "billable_tokens_proxy": 32388, "tool_calls": 4, "wiki_reads_total": 2, "agents_md_read": true, "index_read": true, "guideline_reads": 0, "final_text_len": 593, "task": "t3-todos", "batch": 4, "condition": "claude_md_strong", "trial": 2, "outcome_match": true}
+{"session_id": "6fc8930c-4f9d-444e-b9c8-90ff4afa626d", "duration_s": 56.69, "total_cost_usd": 0.21921975, "input_tokens": 4384, "cache_creation_input_tokens": 31486, "cache_read_input_tokens": 495590, "output_tokens": 436, "billable_tokens_proxy": 36306, "tool_calls": 4, "wiki_reads_total": 2, "agents_md_read": true, "index_read": true, "guideline_reads": 0, "final_text_len": 816, "task": "t3-todos", "batch": 4, "condition": "claude_md_strong", "trial": 3, "outcome_match": true}
+{"session_id": "3c3fcf22-d280-4388-9763-bf016723767f", "duration_s": 29.79, "total_cost_usd": 0.14193649999999997, "input_tokens": 2920, "cache_creation_input_tokens": 16987, "cache_read_input_tokens": 259657, "output_tokens": 259, "billable_tokens_proxy": 20166, "tool_calls": 3, "wiki_reads_total": 2, "agents_md_read": true, "index_read": true, "guideline_reads": 0, "final_text_len": 353, "task": "t5-base64", "batch": 4, "condition": "claude_md_strong", "trial": 1, "outcome_match": true}
+{"session_id": "d6eca1b6-33ce-47e0-8411-9e78155e3048", "duration_s": 10.79, "total_cost_usd": 0.0736735, "input_tokens": 2912, "cache_creation_input_tokens": 7204, "cache_read_input_tokens": 124584, "output_tokens": 140, "billable_tokens_proxy": 10256, "tool_calls": 1, "wiki_reads_total": 1, "agents_md_read": true, "index_read": false, "guideline_reads": 0, "final_text_len": 154, "task": "t5-base64", "batch": 4, "condition": "claude_md_strong", "trial": 2, "outcome_match": true}
+{"session_id": "39a452a1-fd78-4eba-9c9a-8189fc9d6b98", "duration_s": 30.93, "total_cost_usd": 0.1292145, "input_tokens": 2916, "cache_creation_input_tokens": 16592, "cache_read_input_tokens": 195558, "output_tokens": 218, "billable_tokens_proxy": 19726, "tool_calls": 2, "wiki_reads_total": 2, "agents_md_read": true, "index_read": true, "guideline_reads": 0, "final_text_len": 173, "task": "t5-base64", "batch": 4, "condition": "claude_md_strong", "trial": 3, "outcome_match": true}
+{"session_id": "21d922dd-65bc-4a10-9403-cedcc5055dc3", "duration_s": 19.86, "total_cost_usd": 0.14497825, "input_tokens": 4370, "cache_creation_input_tokens": 21048, "cache_read_input_tokens": 222234, "output_tokens": 275, "billable_tokens_proxy": 25693, "tool_calls": 3, "wiki_reads_total": 2, "agents_md_read": true, "index_read": true, "guideline_reads": 0, "final_text_len": 428, "task": "t6-png-dim", "batch": 4, "condition": "claude_md_strong", "trial": 1, "outcome_match": true}
+{"session_id": "713be0e1-a503-437e-af04-0d16fd67f8f3", "duration_s": 37.45, "total_cost_usd": 0.1811355, "input_tokens": 4380, "cache_creation_input_tokens": 27180, "cache_read_input_tokens": 414688, "output_tokens": 273, "billable_tokens_proxy": 31833, "tool_calls": 4, "wiki_reads_total": 2, "agents_md_read": true, "index_read": true, "guideline_reads": 0, "final_text_len": 431, "task": "t6-png-dim", "batch": 4, "condition": "claude_md_strong", "trial": 2, "outcome_match": true}
+{"session_id": "efc0dc4c-fd97-43a4-bad8-6a551bac01a2", "duration_s": 29.91, "total_cost_usd": 0.17778549999999999, "input_tokens": 4378, "cache_creation_input_tokens": 22279, "cache_read_input_tokens": 379004, "output_tokens": 154, "billable_tokens_proxy": 26811, "tool_calls": 4, "wiki_reads_total": 2, "agents_md_read": true, "index_read": true, "guideline_reads": 0, "final_text_len": 392, "task": "t6-png-dim", "batch": 4, "condition": "claude_md_strong", "trial": 3, "outcome_match": true}
+{"session_id": "b85a12ff-cdfb-4e98-9ae7-fe3a02c55f23", "duration_s": 31.63, "total_cost_usd": 0.20920949999999996, "input_tokens": 4378, "cache_creation_input_tokens": 21928, "cache_read_input_tokens": 385391, "output_tokens": 137, "billable_tokens_proxy": 26443, "tool_calls": 5, "wiki_reads_total": 2, "agents_md_read": true, "index_read": true, "guideline_reads": 0, "final_text_len": 311, "task": "t7-gif-dim", "batch": 4, "condition": "claude_md_strong", "trial": 1, "outcome_match": true}
+{"session_id": "b06ea191-a71b-45fb-96d0-f95efb8ed09e", "duration_s": 32.86, "total_cost_usd": 0.17364325, "input_tokens": 2920, "cache_creation_input_tokens": 17482, "cache_read_input_tokens": 276449, "output_tokens": 224, "billable_tokens_proxy": 20626, "tool_calls": 4, "wiki_reads_total": 2, "agents_md_read": true, "index_read": true, "guideline_reads": 0, "final_text_len": 302, "task": "t7-gif-dim", "batch": 4, "condition": "claude_md_strong", "trial": 2, "outcome_match": true}
+{"session_id": "90987872-c5b0-4ba8-bf68-b7c7a15f77a1", "duration_s": 31.7, "total_cost_usd": 0.17294525, "input_tokens": 2920, "cache_creation_input_tokens": 17432, "cache_read_input_tokens": 276359, "output_tokens": 215, "billable_tokens_proxy": 20567, "tool_calls": 4, "wiki_reads_total": 2, "agents_md_read": true, "index_read": true, "guideline_reads": 0, "final_text_len": 316, "task": "t7-gif-dim", "batch": 4, "condition": "claude_md_strong", "trial": 3, "outcome_match": true}
+{"session_id": "93d83eed-c0a1-45e3-ae6c-87d0cabece1e", "duration_s": 38.0, "total_cost_usd": 0.22888424999999998, "input_tokens": 4517, "cache_creation_input_tokens": 38352, "cache_read_input_tokens": 572296, "output_tokens": 394, "billable_tokens_proxy": 43263, "tool_calls": 6, "wiki_reads_total": 3, "agents_md_read": true, "index_read": true, "guideline_reads": 1, "final_text_len": 582, "task": "t8-bmp-info", "batch": 4, "condition": "claude_md_strong", "trial": 1, "outcome_match": true}
+{"session_id": "a793c743-53c4-46f7-b107-614ed4f8a55f", "duration_s": 28.42, "total_cost_usd": 0.19196175, "input_tokens": 4216, "cache_creation_input_tokens": 29034, "cache_read_input_tokens": 387416, "output_tokens": 279, "billable_tokens_proxy": 33529, "tool_calls": 5, "wiki_reads_total": 3, "agents_md_read": true, "index_read": true, "guideline_reads": 1, "final_text_len": 368, "task": "t8-bmp-info", "batch": 4, "condition": "claude_md_strong", "trial": 2, "outcome_match": true}
+{"session_id": "c125d0ff-a2ba-4641-8cc1-dc2bf65ade9e", "duration_s": 30.43, "total_cost_usd": 0.1931195, "input_tokens": 4384, "cache_creation_input_tokens": 33492, "cache_read_input_tokens": 492600, "output_tokens": 348, "billable_tokens_proxy": 38224, "tool_calls": 5, "wiki_reads_total": 3, "agents_md_read": true, "index_read": true, "guideline_reads": 1, "final_text_len": 670, "task": "t8-bmp-info", "batch": 4, "condition": "claude_md_strong", "trial": 3, "outcome_match": true}
+{"session_id": "c42785a8-622e-40ce-85f4-a3cb1b246e1c", "duration_s": 27.58, "total_cost_usd": 0.19513650000000002, "input_tokens": 4382, "cache_creation_input_tokens": 32792, "cache_read_input_tokens": 457877, "output_tokens": 303, "billable_tokens_proxy": 37477, "tool_calls": 5, "wiki_reads_total": 3, "agents_md_read": true, "index_read": true, "guideline_reads": 1, "final_text_len": 376, "task": "t9-webp-dim", "batch": 4, "condition": "claude_md_strong", "trial": 1, "outcome_match": true}
+{"session_id": "65fe0d39-4cdd-4c22-a91b-ca2dbc5f7cf2", "duration_s": 23.35, "total_cost_usd": 0.17579925, "input_tokens": 2924, "cache_creation_input_tokens": 23330, "cache_read_input_tokens": 351974, "output_tokens": 206, "billable_tokens_proxy": 26460, "tool_calls": 4, "wiki_reads_total": 2, "agents_md_read": true, "index_read": true, "guideline_reads": 0, "final_text_len": 264, "task": "t9-webp-dim", "batch": 4, "condition": "claude_md_strong", "trial": 2, "outcome_match": true}
+{"session_id": "78c85ba2-5345-4782-bb8c-5eee6f3467de", "duration_s": 23.93, "total_cost_usd": 0.179563, "input_tokens": 4380, "cache_creation_input_tokens": 27043, "cache_read_input_tokens": 414580, "output_tokens": 252, "billable_tokens_proxy": 31675, "tool_calls": 4, "wiki_reads_total": 2, "agents_md_read": true, "index_read": true, "guideline_reads": 0, "final_text_len": 423, "task": "t9-webp-dim", "batch": 4, "condition": "claude_md_strong", "trial": 3, "outcome_match": true}
diff --git a/explorations/agent-wiki/experiments/metrics/twobatch-skills.metrics.jsonl b/explorations/agent-wiki/experiments/metrics/twobatch-skills.metrics.jsonl
new file mode 100644
index 00000000..2e73aaaa
--- /dev/null
+++ b/explorations/agent-wiki/experiments/metrics/twobatch-skills.metrics.jsonl
@@ -0,0 +1,48 @@
+{"session_id": "dfe1f472-c695-4b0d-8bc0-a2ce6d7d8ea0", "duration_s": 26.54, "total_cost_usd": 0.36058025, "input_tokens": 4380, "cache_creation_input_tokens": 111438, "cache_read_input_tokens": 311781, "output_tokens": 192, "billable_tokens_proxy": 116010, "tool_calls": 5, "wiki_reads_total": 2, "agents_md_read": true, "index_read": true, "guideline_reads": 0, "final_text_len": 442, "task": "t1-lens-model", "batch": 3, "condition": "claude_md_strong", "trial": 1, "outcome_match": true}
+{"session_id": "86290a1d-aa2c-4a78-8727-0ff843696ba6", "duration_s": 26.89, "total_cost_usd": 0.176281, "input_tokens": 4378, "cache_creation_input_tokens": 11596, "cache_read_input_tokens": 375617, "output_tokens": 110, "billable_tokens_proxy": 16084, "tool_calls": 5, "wiki_reads_total": 2, "agents_md_read": true, "index_read": true, "guideline_reads": 0, "final_text_len": 567, "task": "t1-lens-model", "batch": 3, "condition": "claude_md_strong", "trial": 2, "outcome_match": true}
+{"session_id": "54130ce4-603b-48c0-91c8-c7c865a7464f", "duration_s": 35.54, "total_cost_usd": 0.17442225, "input_tokens": 4378, "cache_creation_input_tokens": 11581, "cache_read_input_tokens": 375572, "output_tokens": 128, "billable_tokens_proxy": 16087, "tool_calls": 5, "wiki_reads_total": 2, "agents_md_read": true, "index_read": true, "guideline_reads": 0, "final_text_len": 481, "task": "t1-lens-model", "batch": 3, "condition": "claude_md_strong", "trial": 3, "outcome_match": true}
+{"session_id": "c4e1fd1b-214f-44a7-9e43-d736ffe53a43", "duration_s": 29.02, "total_cost_usd": 0.14115875, "input_tokens": 4374, "cache_creation_input_tokens": 13192, "cache_read_input_tokens": 294569, "output_tokens": 187, "billable_tokens_proxy": 17753, "tool_calls": 3, "wiki_reads_total": 2, "agents_md_read": true, "index_read": true, "guideline_reads": 0, "final_text_len": 526, "task": "t10-zip-list", "batch": 3, "condition": "claude_md_strong", "trial": 1, "outcome_match": true}
+{"session_id": "351bd7a9-ec89-4d72-8d09-8e762a7debd0", "duration_s": 41.56, "total_cost_usd": 0.13210650000000002, "input_tokens": 4374, "cache_creation_input_tokens": 10236, "cache_read_input_tokens": 297558, "output_tokens": 199, "billable_tokens_proxy": 14809, "tool_calls": 3, "wiki_reads_total": 2, "agents_md_read": true, "index_read": true, "guideline_reads": 0, "final_text_len": 394, "task": "t10-zip-list", "batch": 3, "condition": "claude_md_strong", "trial": 2, "outcome_match": true}
+{"session_id": "b2926dff-aad4-4ab7-88eb-b8c86dab6bc0", "duration_s": 21.94, "total_cost_usd": 0.13444075, "input_tokens": 2922, "cache_creation_input_tokens": 13883, "cache_read_input_tokens": 298935, "output_tokens": 246, "billable_tokens_proxy": 17051, "tool_calls": 3, "wiki_reads_total": 2, "agents_md_read": true, "index_read": true, "guideline_reads": 0, "final_text_len": 477, "task": "t10-zip-list", "batch": 3, "condition": "claude_md_strong", "trial": 3, "outcome_match": true}
+{"session_id": "e10d90f6-f73c-4830-bbc5-847495fc00e1", "duration_s": 37.94, "total_cost_usd": 0.14507875, "input_tokens": 4376, "cache_creation_input_tokens": 18399, "cache_read_input_tokens": 324956, "output_tokens": 280, "billable_tokens_proxy": 23055, "tool_calls": 3, "wiki_reads_total": 2, "agents_md_read": true, "index_read": true, "guideline_reads": 0, "final_text_len": 661, "task": "t11-tar-list", "batch": 3, "condition": "claude_md_strong", "trial": 1, "outcome_match": true}
+{"session_id": "139453c8-4d93-430c-95aa-cc38c2f4b243", "duration_s": 22.03, "total_cost_usd": 0.12860149999999998, "input_tokens": 2920, "cache_creation_input_tokens": 10038, "cache_read_input_tokens": 266836, "output_tokens": 199, "billable_tokens_proxy": 13157, "tool_calls": 3, "wiki_reads_total": 2, "agents_md_read": true, "index_read": true, "guideline_reads": 0, "final_text_len": 616, "task": "t11-tar-list", "batch": 3, "condition": "claude_md_strong", "trial": 2, "outcome_match": true}
+{"session_id": "dec60af0-21b5-40f6-acc5-574910d0361e", "duration_s": 38.36, "total_cost_usd": 0.13422275, "input_tokens": 4376, "cache_creation_input_tokens": 13931, "cache_read_input_tokens": 329404, "output_tokens": 263, "billable_tokens_proxy": 18570, "tool_calls": 3, "wiki_reads_total": 2, "agents_md_read": true, "index_read": true, "guideline_reads": 0, "final_text_len": 663, "task": "t11-tar-list", "batch": 3, "condition": "claude_md_strong", "trial": 3, "outcome_match": true}
+{"session_id": "df2134e6-bd7c-49bb-936f-1f87769976be", "duration_s": 24.44, "total_cost_usd": 0.177611, "input_tokens": 4382, "cache_creation_input_tokens": 22236, "cache_read_input_tokens": 435347, "output_tokens": 346, "billable_tokens_proxy": 26964, "tool_calls": 4, "wiki_reads_total": 2, "agents_md_read": true, "index_read": true, "guideline_reads": 0, "final_text_len": 737, "task": "t12-wav-info", "batch": 3, "condition": "claude_md_strong", "trial": 1, "outcome_match": true}
+{"session_id": "6222236b-2d5e-4327-8cf8-8e0fc2df55a2", "duration_s": 30.21, "total_cost_usd": 0.16059725, "input_tokens": 2924, "cache_creation_input_tokens": 13002, "cache_read_input_tokens": 341768, "output_tokens": 230, "billable_tokens_proxy": 16156, "tool_calls": 4, "wiki_reads_total": 2, "agents_md_read": true, "index_read": true, "guideline_reads": 0, "final_text_len": 323, "task": "t12-wav-info", "batch": 3, "condition": "claude_md_strong", "trial": 2, "outcome_match": true}
+{"session_id": "d0b42304-e230-4a72-988f-6599e34ef2a0", "duration_s": 21.78, "total_cost_usd": 0.13493425, "input_tokens": 4376, "cache_creation_input_tokens": 13959, "cache_read_input_tokens": 329403, "output_tokens": 271, "billable_tokens_proxy": 18606, "tool_calls": 3, "wiki_reads_total": 2, "agents_md_read": true, "index_read": true, "guideline_reads": 0, "final_text_len": 598, "task": "t12-wav-info", "batch": 3, "condition": "claude_md_strong", "trial": 3, "outcome_match": false}
+{"session_id": "eed51cba-2ce8-4cca-ac51-5058e9af34ad", "duration_s": 22.11, "total_cost_usd": 0.13270375, "input_tokens": 4374, "cache_creation_input_tokens": 12945, "cache_read_input_tokens": 294412, "output_tokens": 197, "billable_tokens_proxy": 17516, "tool_calls": 3, "wiki_reads_total": 2, "agents_md_read": true, "index_read": true, "guideline_reads": 0, "final_text_len": 394, "task": "t13-gzip-dec", "batch": 3, "condition": "claude_md_strong", "trial": 1, "outcome_match": true}
+{"session_id": "ea2f4816-a73f-4652-8f96-e9864228b6e2", "duration_s": 29.35, "total_cost_usd": 0.12696525, "input_tokens": 4376, "cache_creation_input_tokens": 13739, "cache_read_input_tokens": 329352, "output_tokens": 277, "billable_tokens_proxy": 18392, "tool_calls": 3, "wiki_reads_total": 2, "agents_md_read": true, "index_read": true, "guideline_reads": 0, "final_text_len": 357, "task": "t13-gzip-dec", "batch": 3, "condition": "claude_md_strong", "trial": 2, "outcome_match": true}
+{"session_id": "3f41f068-84f0-494e-9d5a-c2bc655d1356", "duration_s": 36.16, "total_cost_usd": 0.12179374999999999, "input_tokens": 4374, "cache_creation_input_tokens": 9913, "cache_read_input_tokens": 297359, "output_tokens": 174, "billable_tokens_proxy": 14461, "tool_calls": 3, "wiki_reads_total": 2, "agents_md_read": true, "index_read": true, "guideline_reads": 0, "final_text_len": 270, "task": "t13-gzip-dec", "batch": 3, "condition": "claude_md_strong", "trial": 3, "outcome_match": true}
+{"session_id": "10ec91b5-1016-4c2c-a242-ae3654091262", "duration_s": 48.65, "total_cost_usd": 0.17755024999999997, "input_tokens": 4378, "cache_creation_input_tokens": 13332, "cache_read_input_tokens": 371461, "output_tokens": 216, "billable_tokens_proxy": 17926, "tool_calls": 5, "wiki_reads_total": 2, "agents_md_read": true, "index_read": true, "guideline_reads": 0, "final_text_len": 175, "task": "t14-csv-quoted", "batch": 3, "condition": "claude_md_strong", "trial": 1, "outcome_match": true}
+{"session_id": "f8a505d9-e2a1-44d8-a41a-e73c4a30059e", "duration_s": 22.64, "total_cost_usd": 0.19152175, "input_tokens": 2924, "cache_creation_input_tokens": 10587, "cache_read_input_tokens": 344330, "output_tokens": 179, "billable_tokens_proxy": 13690, "tool_calls": 6, "wiki_reads_total": 2, "agents_md_read": true, "index_read": true, "guideline_reads": 0, "final_text_len": 250, "task": "t14-csv-quoted", "batch": 3, "condition": "claude_md_strong", "trial": 2, "outcome_match": true}
+{"session_id": "a70126a6-6363-4f86-b98d-800a98b307bf", "duration_s": 28.66, "total_cost_usd": 0.16939875000000001, "input_tokens": 2922, "cache_creation_input_tokens": 10285, "cache_read_input_tokens": 306757, "output_tokens": 152, "billable_tokens_proxy": 13359, "tool_calls": 5, "wiki_reads_total": 2, "agents_md_read": true, "index_read": true, "guideline_reads": 0, "final_text_len": 210, "task": "t14-csv-quoted", "batch": 3, "condition": "claude_md_strong", "trial": 3, "outcome_match": true}
+{"session_id": "44c5d8a0-f54f-446b-b283-20d6a2354975", "duration_s": 41.84, "total_cost_usd": 0.16850075, "input_tokens": 2924, "cache_creation_input_tokens": 15940, "cache_read_input_tokens": 338809, "output_tokens": 274, "billable_tokens_proxy": 19138, "tool_calls": 4, "wiki_reads_total": 2, "agents_md_read": true, "index_read": true, "guideline_reads": 0, "final_text_len": 136, "task": "t15-jsonl-kinds", "batch": 3, "condition": "claude_md_strong", "trial": 1, "outcome_match": true}
+{"session_id": "2e872e6e-0378-43af-bfb6-25d1df16878a", "duration_s": 131.68, "total_cost_usd": 0.17372075, "input_tokens": 2928, "cache_creation_input_tokens": 16240, "cache_read_input_tokens": 413315, "output_tokens": 333, "billable_tokens_proxy": 19501, "tool_calls": 5, "wiki_reads_total": 2, "agents_md_read": true, "index_read": true, "guideline_reads": 0, "final_text_len": 289, "task": "t15-jsonl-kinds", "batch": 3, "condition": "claude_md_strong", "trial": 2, "outcome_match": true}
+{"session_id": "942bab93-fd74-4298-abb7-7c91d17e7c4e", "duration_s": 27.34, "total_cost_usd": 0.134731, "input_tokens": 2920, "cache_creation_input_tokens": 10406, "cache_read_input_tokens": 267227, "output_tokens": 219, "billable_tokens_proxy": 13545, "tool_calls": 3, "wiki_reads_total": 2, "agents_md_read": true, "index_read": true, "guideline_reads": 0, "final_text_len": 183, "task": "t15-jsonl-kinds", "batch": 3, "condition": "claude_md_strong", "trial": 3, "outcome_match": true}
+{"session_id": "6d3c7d3a-b3f0-42f3-913b-f0193ebf8426", "duration_s": 26.38, "total_cost_usd": 0.13945225, "input_tokens": 4376, "cache_creation_input_tokens": 18325, "cache_read_input_tokens": 324991, "output_tokens": 274, "billable_tokens_proxy": 22975, "tool_calls": 3, "wiki_reads_total": 2, "agents_md_read": true, "index_read": true, "guideline_reads": 0, "final_text_len": 396, "task": "t16-ini-key", "batch": 3, "condition": "claude_md_strong", "trial": 1, "outcome_match": true}
+{"session_id": "58d92a16-4815-468c-b981-f7cfd55cffb7", "duration_s": 38.21, "total_cost_usd": 0.18534624999999996, "input_tokens": 4517, "cache_creation_input_tokens": 16477, "cache_read_input_tokens": 550134, "output_tokens": 463, "billable_tokens_proxy": 21457, "tool_calls": 5, "wiki_reads_total": 3, "agents_md_read": true, "index_read": true, "guideline_reads": 1, "final_text_len": 525, "task": "t16-ini-key", "batch": 3, "condition": "claude_md_strong", "trial": 2, "outcome_match": true}
+{"session_id": "77ebb350-ade6-4935-a0f4-48cd58d49122", "duration_s": 26.06, "total_cost_usd": 0.12386275, "input_tokens": 2920, "cache_creation_input_tokens": 9406, "cache_read_input_tokens": 267736, "output_tokens": 230, "billable_tokens_proxy": 12556, "tool_calls": 3, "wiki_reads_total": 2, "agents_md_read": true, "index_read": true, "guideline_reads": 0, "final_text_len": 281, "task": "t16-ini-key", "batch": 3, "condition": "claude_md_strong", "trial": 3, "outcome_match": true}
+{"session_id": "26f1d62a-9c1c-42fc-bcc5-c0922b2159b1", "duration_s": 30.3, "total_cost_usd": 0.1396995, "input_tokens": 4376, "cache_creation_input_tokens": 13805, "cache_read_input_tokens": 330926, "output_tokens": 302, "billable_tokens_proxy": 18483, "tool_calls": 3, "wiki_reads_total": 2, "agents_md_read": true, "index_read": true, "guideline_reads": 0, "final_text_len": 1, "task": "t17-log-errors", "batch": 3, "condition": "claude_md_strong", "trial": 1, "outcome_match": true}
+{"session_id": "451fd893-646b-4bfc-8e92-13359d197422", "duration_s": 20.11, "total_cost_usd": 0.13178225, "input_tokens": 2922, "cache_creation_input_tokens": 10837, "cache_read_input_tokens": 303831, "output_tokens": 285, "billable_tokens_proxy": 14044, "tool_calls": 3, "wiki_reads_total": 2, "agents_md_read": true, "index_read": true, "guideline_reads": 0, "final_text_len": 337, "task": "t17-log-errors", "batch": 3, "condition": "claude_md_strong", "trial": 2, "outcome_match": true}
+{"session_id": "593b6f4a-aa79-4c04-a5d8-300b2e327b25", "duration_s": 22.39, "total_cost_usd": 0.11096025, "input_tokens": 4374, "cache_creation_input_tokens": 16760, "cache_read_input_tokens": 289952, "output_tokens": 320, "billable_tokens_proxy": 21454, "tool_calls": 4, "wiki_reads_total": 2, "agents_md_read": true, "index_read": true, "guideline_reads": 0, "final_text_len": 222, "task": "t17-log-errors", "batch": 3, "condition": "claude_md_strong", "trial": 3, "outcome_match": true}
+{"session_id": "8824bea3-b6a9-4e1f-a240-1f56182606a1", "duration_s": 25.28, "total_cost_usd": 0.14911875000000002, "input_tokens": 4376, "cache_creation_input_tokens": 14389, "cache_read_input_tokens": 331620, "output_tokens": 264, "billable_tokens_proxy": 19029, "tool_calls": 3, "wiki_reads_total": 2, "agents_md_read": true, "index_read": true, "guideline_reads": 0, "final_text_len": 572, "task": "t2-imports", "batch": 3, "condition": "claude_md_strong", "trial": 1, "outcome_match": true}
+{"session_id": "980191e6-8ac9-4c48-94e6-fea240a262a0", "duration_s": 31.72, "total_cost_usd": 0.14118175, "input_tokens": 2920, "cache_creation_input_tokens": 11072, "cache_read_input_tokens": 267668, "output_tokens": 173, "billable_tokens_proxy": 14165, "tool_calls": 3, "wiki_reads_total": 2, "agents_md_read": true, "index_read": true, "guideline_reads": 0, "final_text_len": 563, "task": "t2-imports", "batch": 3, "condition": "claude_md_strong", "trial": 2, "outcome_match": true}
+{"session_id": "ab48bec1-354c-4d9c-8c24-0ad70abc14ec", "duration_s": 43.72, "total_cost_usd": 0.21294575000000002, "input_tokens": 4648, "cache_creation_input_tokens": 18946, "cache_read_input_tokens": 589881, "output_tokens": 448, "billable_tokens_proxy": 24042, "tool_calls": 5, "wiki_reads_total": 2, "agents_md_read": true, "index_read": true, "guideline_reads": 0, "final_text_len": 998, "task": "t2-imports", "batch": 3, "condition": "claude_md_strong", "trial": 3, "outcome_match": true}
+{"session_id": "212b2428-0fe3-4f6d-8b5c-524d034e0710", "duration_s": 38.47, "total_cost_usd": 0.19270700000000002, "input_tokens": 4517, "cache_creation_input_tokens": 20220, "cache_read_input_tokens": 544681, "output_tokens": 469, "billable_tokens_proxy": 25206, "tool_calls": 5, "wiki_reads_total": 2, "agents_md_read": true, "index_read": true, "guideline_reads": 0, "final_text_len": 564, "task": "t3-todos", "batch": 3, "condition": "claude_md_strong", "trial": 1, "outcome_match": true}
+{"session_id": "b65150d9-b839-4c3e-add9-2a06fc61558c", "duration_s": 30.95, "total_cost_usd": 0.13480875, "input_tokens": 4378, "cache_creation_input_tokens": 14114, "cache_read_input_tokens": 365922, "output_tokens": 309, "billable_tokens_proxy": 18801, "tool_calls": 3, "wiki_reads_total": 2, "agents_md_read": true, "index_read": true, "guideline_reads": 0, "final_text_len": 815, "task": "t3-todos", "batch": 3, "condition": "claude_md_strong", "trial": 2, "outcome_match": true}
+{"session_id": "ccf42820-6c8a-4bc3-b8df-bb9f929ee972", "duration_s": 34.69, "total_cost_usd": 0.16128725, "input_tokens": 4382, "cache_creation_input_tokens": 13497, "cache_read_input_tokens": 436746, "output_tokens": 388, "billable_tokens_proxy": 18267, "tool_calls": 4, "wiki_reads_total": 1, "agents_md_read": true, "index_read": false, "guideline_reads": 0, "final_text_len": 453, "task": "t3-todos", "batch": 3, "condition": "claude_md_strong", "trial": 3, "outcome_match": true}
+{"session_id": "2d188f50-aab4-4b01-9aad-b442653ed912", "duration_s": 28.14, "total_cost_usd": 0.12845774999999998, "input_tokens": 2916, "cache_creation_input_tokens": 13158, "cache_read_input_tokens": 193656, "output_tokens": 194, "billable_tokens_proxy": 16268, "tool_calls": 2, "wiki_reads_total": 2, "agents_md_read": true, "index_read": true, "guideline_reads": 0, "final_text_len": 216, "task": "t5-base64", "batch": 3, "condition": "claude_md_strong", "trial": 1, "outcome_match": true}
+{"session_id": "dc77b929-8d05-47f2-ad87-989a9a3a7ec1", "duration_s": 30.26, "total_cost_usd": 0.120713, "input_tokens": 2920, "cache_creation_input_tokens": 8368, "cache_read_input_tokens": 266804, "output_tokens": 223, "billable_tokens_proxy": 11511, "tool_calls": 3, "wiki_reads_total": 2, "agents_md_read": true, "index_read": true, "guideline_reads": 0, "final_text_len": 20, "task": "t5-base64", "batch": 3, "condition": "claude_md_strong", "trial": 2, "outcome_match": true}
+{"session_id": "96607445-092e-4882-97b4-8f5531f65fd3", "duration_s": 27.48, "total_cost_usd": 0.107248, "input_tokens": 2916, "cache_creation_input_tokens": 9312, "cache_read_input_tokens": 195692, "output_tokens": 202, "billable_tokens_proxy": 12430, "tool_calls": 2, "wiki_reads_total": 2, "agents_md_read": true, "index_read": true, "guideline_reads": 0, "final_text_len": 190, "task": "t5-base64", "batch": 3, "condition": "claude_md_strong", "trial": 3, "outcome_match": true}
+{"session_id": "b133337c-a1b2-451e-9234-d45c52b52851", "duration_s": 15.37, "total_cost_usd": 0.15595925, "input_tokens": 4376, "cache_creation_input_tokens": 15060, "cache_read_input_tokens": 331532, "output_tokens": 70, "billable_tokens_proxy": 19506, "tool_calls": 4, "wiki_reads_total": 2, "agents_md_read": true, "index_read": true, "guideline_reads": 0, "final_text_len": 299, "task": "t6-png-dim", "batch": 3, "condition": "claude_md_strong", "trial": 1, "outcome_match": true}
+{"session_id": "f0ad51b3-7ae7-4acb-9573-c0405525069f", "duration_s": 24.94, "total_cost_usd": 0.14677675, "input_tokens": 4376, "cache_creation_input_tokens": 11118, "cache_read_input_tokens": 335877, "output_tokens": 127, "billable_tokens_proxy": 15621, "tool_calls": 4, "wiki_reads_total": 2, "agents_md_read": true, "index_read": true, "guideline_reads": 0, "final_text_len": 236, "task": "t6-png-dim", "batch": 3, "condition": "claude_md_strong", "trial": 2, "outcome_match": true}
+{"session_id": "32b497dd-143b-4a92-a0fb-f22ac509c7a1", "duration_s": 30.21, "total_cost_usd": 0.14866475, "input_tokens": 4378, "cache_creation_input_tokens": 14765, "cache_read_input_tokens": 367863, "output_tokens": 211, "billable_tokens_proxy": 19354, "tool_calls": 4, "wiki_reads_total": 2, "agents_md_read": true, "index_read": true, "guideline_reads": 0, "final_text_len": 296, "task": "t6-png-dim", "batch": 3, "condition": "claude_md_strong", "trial": 3, "outcome_match": true}
+{"session_id": "17f3e41f-8389-453c-8bff-c62afe5affd5", "duration_s": 27.75, "total_cost_usd": 0.15377725, "input_tokens": 4376, "cache_creation_input_tokens": 14003, "cache_read_input_tokens": 332510, "output_tokens": 75, "billable_tokens_proxy": 18454, "tool_calls": 4, "wiki_reads_total": 2, "agents_md_read": true, "index_read": true, "guideline_reads": 0, "final_text_len": 272, "task": "t7-gif-dim", "batch": 3, "condition": "claude_md_strong", "trial": 1, "outcome_match": true}
+{"session_id": "1b2b7c19-ed51-4de5-9efb-4dabe6493180", "duration_s": 22.77, "total_cost_usd": 0.1462565, "input_tokens": 4376, "cache_creation_input_tokens": 11137, "cache_read_input_tokens": 335813, "output_tokens": 138, "billable_tokens_proxy": 15651, "tool_calls": 4, "wiki_reads_total": 2, "agents_md_read": true, "index_read": true, "guideline_reads": 0, "final_text_len": 168, "task": "t7-gif-dim", "batch": 3, "condition": "claude_md_strong", "trial": 2, "outcome_match": true}
+{"session_id": "0f98d129-6a80-4f4b-81b0-025af91596e3", "duration_s": 49.29, "total_cost_usd": 0.14595675, "input_tokens": 4376, "cache_creation_input_tokens": 11071, "cache_read_input_tokens": 335724, "output_tokens": 127, "billable_tokens_proxy": 15574, "tool_calls": 4, "wiki_reads_total": 2, "agents_md_read": true, "index_read": true, "guideline_reads": 0, "final_text_len": 248, "task": "t7-gif-dim", "batch": 3, "condition": "claude_md_strong", "trial": 3, "outcome_match": true}
+{"session_id": "f06d866d-9ec8-426e-8bb6-bc557baee0d7", "duration_s": 25.7, "total_cost_usd": 0.18049574999999998, "input_tokens": 4380, "cache_creation_input_tokens": 19312, "cache_read_input_tokens": 400709, "output_tokens": 168, "billable_tokens_proxy": 23860, "tool_calls": 5, "wiki_reads_total": 2, "agents_md_read": true, "index_read": true, "guideline_reads": 0, "final_text_len": 480, "task": "t8-bmp-info", "batch": 3, "condition": "claude_md_strong", "trial": 1, "outcome_match": true}
+{"session_id": "267a3d60-a490-4a78-8fd1-61a1aa4dfff6", "duration_s": 40.81, "total_cost_usd": 0.14628325, "input_tokens": 2920, "cache_creation_input_tokens": 9944, "cache_read_input_tokens": 268819, "output_tokens": 159, "billable_tokens_proxy": 13023, "tool_calls": 4, "wiki_reads_total": 2, "agents_md_read": true, "index_read": true, "guideline_reads": 0, "final_text_len": 404, "task": "t8-bmp-info", "batch": 3, "condition": "claude_md_strong", "trial": 2, "outcome_match": true}
+{"session_id": "706689cf-05b0-4ca0-a884-37d1ca2e3a5e", "duration_s": 23.27, "total_cost_usd": 0.1507965, "input_tokens": 4376, "cache_creation_input_tokens": 11182, "cache_read_input_tokens": 335856, "output_tokens": 106, "billable_tokens_proxy": 15664, "tool_calls": 4, "wiki_reads_total": 2, "agents_md_read": true, "index_read": true, "guideline_reads": 0, "final_text_len": 462, "task": "t8-bmp-info", "batch": 3, "condition": "claude_md_strong", "trial": 3, "outcome_match": true}
+{"session_id": "5f12f661-fb48-4531-9e78-449cc062d6b2", "duration_s": 24.95, "total_cost_usd": 0.15474275, "input_tokens": 2922, "cache_creation_input_tokens": 14046, "cache_read_input_tokens": 302371, "output_tokens": 121, "billable_tokens_proxy": 17089, "tool_calls": 4, "wiki_reads_total": 2, "agents_md_read": true, "index_read": true, "guideline_reads": 0, "final_text_len": 281, "task": "t9-webp-dim", "batch": 3, "condition": "claude_md_strong", "trial": 1, "outcome_match": true}
+{"session_id": "247bd87b-7c5d-4594-82f4-841af3c4aae0", "duration_s": 25.63, "total_cost_usd": 0.14665050000000002, "input_tokens": 4376, "cache_creation_input_tokens": 11133, "cache_read_input_tokens": 335873, "output_tokens": 123, "billable_tokens_proxy": 15632, "tool_calls": 4, "wiki_reads_total": 2, "agents_md_read": true, "index_read": true, "guideline_reads": 0, "final_text_len": 223, "task": "t9-webp-dim", "batch": 3, "condition": "claude_md_strong", "trial": 2, "outcome_match": true}
+{"session_id": "2c6316fe-3f3d-4038-be20-a4a8a321c157", "duration_s": 27.2, "total_cost_usd": 0.143769, "input_tokens": 2920, "cache_creation_input_tokens": 9900, "cache_read_input_tokens": 268874, "output_tokens": 122, "billable_tokens_proxy": 12942, "tool_calls": 4, "wiki_reads_total": 2, "agents_md_read": true, "index_read": true, "guideline_reads": 0, "final_text_len": 259, "task": "t9-webp-dim", "batch": 3, "condition": "claude_md_strong", "trial": 3, "outcome_match": true}
diff --git a/explorations/agent-wiki/experiments/metrics/twobatch.metrics.jsonl b/explorations/agent-wiki/experiments/metrics/twobatch.metrics.jsonl
new file mode 100644
index 00000000..2935e72f
--- /dev/null
+++ b/explorations/agent-wiki/experiments/metrics/twobatch.metrics.jsonl
@@ -0,0 +1,95 @@
+{"session_id": "e45c7a47-c30a-438d-9961-7ba3da638f6b", "duration_s": 28.22, "total_cost_usd": 0.2193035, "input_tokens": 4517, "cache_creation_input_tokens": 12975, "cache_read_input_tokens": 545088, "output_tokens": 318, "billable_tokens_proxy": 17810, "tool_calls": 7, "wiki_reads_total": 5, "agents_md_read": true, "index_read": true, "guideline_reads": 1, "final_text_len": 330, "task": "t10-zip-list", "batch": 1, "condition": "claude_md_strong", "trial": 1, "outcome_match": true}
+{"session_id": "681cc195-1fcb-4d4f-be7a-fba72c90fe11", "duration_s": 29.09, "total_cost_usd": 0.20837625, "input_tokens": 4779, "cache_creation_input_tokens": 20663, "cache_read_input_tokens": 615892, "output_tokens": 467, "billable_tokens_proxy": 25909, "tool_calls": 8, "wiki_reads_total": 5, "agents_md_read": true, "index_read": true, "guideline_reads": 1, "final_text_len": 361, "task": "t10-zip-list", "batch": 1, "condition": "claude_md_strong", "trial": 3, "outcome_match": true}
+{"session_id": "07699c4b-b5a6-4249-b904-7dfa7f8afb8c", "duration_s": 39.98, "total_cost_usd": 0.20986875, "input_tokens": 4517, "cache_creation_input_tokens": 13025, "cache_read_input_tokens": 544870, "output_tokens": 406, "billable_tokens_proxy": 17948, "tool_calls": 7, "wiki_reads_total": 5, "agents_md_read": true, "index_read": true, "guideline_reads": 1, "final_text_len": 330, "task": "t10-zip-list", "batch": 1, "condition": "claude_md_strong", "trial": 2, "outcome_match": true}
+{"session_id": "ae7be18d-661a-4b5c-b87c-0b32f977ecb4", "duration_s": 50.76, "total_cost_usd": 0.20279124999999998, "input_tokens": 2932, "cache_creation_input_tokens": 13210, "cache_read_input_tokens": 478626, "output_tokens": 374, "billable_tokens_proxy": 16516, "tool_calls": 6, "wiki_reads_total": 5, "agents_md_read": true, "index_read": true, "guideline_reads": 1, "final_text_len": 1, "task": "t14-csv-quoted", "batch": 1, "condition": "claude_md_strong", "trial": 1, "outcome_match": true}
+{"session_id": "ebfd2680-30e6-4c92-a59a-9d27b9e171a1", "duration_s": 39.35, "total_cost_usd": 0.21604625, "input_tokens": 4517, "cache_creation_input_tokens": 12869, "cache_read_input_tokens": 545336, "output_tokens": 452, "billable_tokens_proxy": 17838, "tool_calls": 7, "wiki_reads_total": 4, "agents_md_read": true, "index_read": true, "guideline_reads": 1, "final_text_len": 1, "task": "t14-csv-quoted", "batch": 1, "condition": "claude_md_strong", "trial": 3, "outcome_match": true}
+{"session_id": "91dc82e5-c142-4f34-b79c-097a5a4291a1", "duration_s": 35.45, "total_cost_usd": 0.21022125000000003, "input_tokens": 2932, "cache_creation_input_tokens": 9632, "cache_read_input_tokens": 481565, "output_tokens": 372, "billable_tokens_proxy": 12936, "tool_calls": 7, "wiki_reads_total": 5, "agents_md_read": true, "index_read": true, "guideline_reads": 1, "final_text_len": 155, "task": "t14-csv-quoted", "batch": 1, "condition": "claude_md_strong", "trial": 2, "outcome_match": true}
+{"session_id": "d6fff729-25ad-49eb-a1eb-8123eb33fb7d", "duration_s": 36.97, "total_cost_usd": 0.221664, "input_tokens": 4517, "cache_creation_input_tokens": 11922, "cache_read_input_tokens": 543060, "output_tokens": 359, "billable_tokens_proxy": 16798, "tool_calls": 7, "wiki_reads_total": 5, "agents_md_read": true, "index_read": true, "guideline_reads": 1, "final_text_len": 621, "task": "t11-tar-list", "batch": 1, "condition": "claude_md_strong", "trial": 1, "outcome_match": true}
+{"session_id": "ee28b25d-66ca-4b92-9d64-671c3fded93b", "duration_s": 46.45, "total_cost_usd": 0.21444549999999998, "input_tokens": 4519, "cache_creation_input_tokens": 12446, "cache_read_input_tokens": 579107, "output_tokens": 450, "billable_tokens_proxy": 17415, "tool_calls": 7, "wiki_reads_total": 5, "agents_md_read": true, "index_read": true, "guideline_reads": 1, "final_text_len": 642, "task": "t11-tar-list", "batch": 1, "condition": "claude_md_strong", "trial": 3, "outcome_match": true}
+{"session_id": "4a74dec5-4e18-4bde-bfbf-5b334eb57847", "duration_s": 40.27, "total_cost_usd": 0.21395800000000004, "input_tokens": 4517, "cache_creation_input_tokens": 10160, "cache_read_input_tokens": 548246, "output_tokens": 349, "billable_tokens_proxy": 15026, "tool_calls": 7, "wiki_reads_total": 5, "agents_md_read": true, "index_read": true, "guideline_reads": 1, "final_text_len": 465, "task": "t11-tar-list", "batch": 1, "condition": "claude_md_strong", "trial": 2, "outcome_match": true}
+{"session_id": "3d4ee0d1-d2fa-42a6-9457-14659dde7e89", "duration_s": 35.82, "total_cost_usd": 0.20506050000000003, "input_tokens": 2928, "cache_creation_input_tokens": 14095, "cache_read_input_tokens": 406257, "output_tokens": 298, "billable_tokens_proxy": 17321, "tool_calls": 6, "wiki_reads_total": 5, "agents_md_read": true, "index_read": true, "guideline_reads": 1, "final_text_len": 337, "task": "t5-base64", "batch": 1, "condition": "claude_md_strong", "trial": 1, "outcome_match": true}
+{"session_id": "2c386b8b-286d-40ed-b1fe-8c13e00b2ec9", "duration_s": 42.71, "total_cost_usd": 0.18722850000000002, "input_tokens": 2930, "cache_creation_input_tokens": 9491, "cache_read_input_tokens": 445721, "output_tokens": 350, "billable_tokens_proxy": 12771, "tool_calls": 6, "wiki_reads_total": 5, "agents_md_read": true, "index_read": true, "guideline_reads": 1, "final_text_len": 20, "task": "t5-base64", "batch": 1, "condition": "claude_md_strong", "trial": 3, "outcome_match": true}
+{"session_id": "547c9941-cbf5-4466-95f4-f7aff85f97dd", "duration_s": 36.95, "total_cost_usd": 0.20590025, "input_tokens": 2930, "cache_creation_input_tokens": 9320, "cache_read_input_tokens": 447326, "output_tokens": 325, "billable_tokens_proxy": 12575, "tool_calls": 7, "wiki_reads_total": 5, "agents_md_read": true, "index_read": true, "guideline_reads": 1, "final_text_len": 20, "task": "t5-base64", "batch": 1, "condition": "claude_md_strong", "trial": 2, "outcome_match": true}
+{"session_id": "1b560d86-8c35-497a-907d-075eba07683b", "duration_s": 71.36, "total_cost_usd": 0.2122525, "input_tokens": 4517, "cache_creation_input_tokens": 8940, "cache_read_input_tokens": 545572, "output_tokens": 340, "billable_tokens_proxy": 13797, "tool_calls": 7, "wiki_reads_total": 5, "agents_md_read": true, "index_read": true, "guideline_reads": 1, "final_text_len": 260, "task": "t8-bmp-info", "batch": 1, "condition": "claude_md_strong", "trial": 3, "outcome_match": true}
+{"session_id": "dd0259ee-4935-4d32-9728-55a9599c4945", "duration_s": 87.08, "total_cost_usd": 0.377762, "input_tokens": 2930, "cache_creation_input_tokens": 73282, "cache_read_input_tokens": 380923, "output_tokens": 308, "billable_tokens_proxy": 76520, "tool_calls": 6, "wiki_reads_total": 5, "agents_md_read": true, "index_read": true, "guideline_reads": 1, "final_text_len": 388, "task": "t8-bmp-info", "batch": 1, "condition": "claude_md_strong", "trial": 2, "outcome_match": true}
+{"session_id": "a14b6e18-4ac4-4fa5-83d0-df703ce86d47", "duration_s": 43.52, "total_cost_usd": 0.20914549999999998, "input_tokens": 4517, "cache_creation_input_tokens": 20554, "cache_read_input_tokens": 539359, "output_tokens": 435, "billable_tokens_proxy": 25506, "tool_calls": 6, "wiki_reads_total": 4, "agents_md_read": true, "index_read": true, "guideline_reads": 0, "final_text_len": 234, "task": "t15-jsonl-kinds", "batch": 1, "condition": "claude_md_strong", "trial": 1, "outcome_match": true}
+{"session_id": "8665c475-2e79-4974-972c-6ee04cfd5050", "duration_s": 36.88, "total_cost_usd": 0.22407125, "input_tokens": 2934, "cache_creation_input_tokens": 13197, "cache_read_input_tokens": 521856, "output_tokens": 363, "billable_tokens_proxy": 16494, "tool_calls": 7, "wiki_reads_total": 4, "agents_md_read": true, "index_read": true, "guideline_reads": 0, "final_text_len": 1, "task": "t15-jsonl-kinds", "batch": 1, "condition": "claude_md_strong", "trial": 3, "outcome_match": true}
+{"session_id": "6a9f8fb5-d6ec-4e48-be5d-5da07ae23c09", "duration_s": 49.3, "total_cost_usd": 0.23738425, "input_tokens": 4783, "cache_creation_input_tokens": 14199, "cache_read_input_tokens": 691955, "output_tokens": 537, "billable_tokens_proxy": 19519, "tool_calls": 8, "wiki_reads_total": 5, "agents_md_read": true, "index_read": true, "guideline_reads": 1, "final_text_len": 285, "task": "t15-jsonl-kinds", "batch": 1, "condition": "claude_md_strong", "trial": 2, "outcome_match": true}
+{"session_id": "01984e90-b2c4-434a-8f87-bdb654ae10f6", "duration_s": 37.16, "total_cost_usd": 0.19703325, "input_tokens": 4386, "cache_creation_input_tokens": 17288, "cache_read_input_tokens": 500432, "output_tokens": 375, "billable_tokens_proxy": 22049, "tool_calls": 6, "wiki_reads_total": 5, "agents_md_read": true, "index_read": true, "guideline_reads": 1, "final_text_len": 226, "task": "t6-png-dim", "batch": 1, "condition": "claude_md_strong", "trial": 1, "outcome_match": true}
+{"session_id": "6369eb12-c1ac-4057-9dd2-3a14eed675fc", "duration_s": 43.28, "total_cost_usd": 0.18716225, "input_tokens": 4384, "cache_creation_input_tokens": 8936, "cache_read_input_tokens": 476257, "output_tokens": 356, "billable_tokens_proxy": 13676, "tool_calls": 6, "wiki_reads_total": 5, "agents_md_read": true, "index_read": true, "guideline_reads": 1, "final_text_len": 265, "task": "t6-png-dim", "batch": 1, "condition": "claude_md_strong", "trial": 3, "outcome_match": true}
+{"session_id": "b2181f4a-04f8-4d70-a1b5-22bc843b837d", "duration_s": 64.04, "total_cost_usd": 0.21102474999999998, "input_tokens": 2934, "cache_creation_input_tokens": 9880, "cache_read_input_tokens": 518016, "output_tokens": 397, "billable_tokens_proxy": 13211, "tool_calls": 7, "wiki_reads_total": 5, "agents_md_read": true, "index_read": true, "guideline_reads": 1, "final_text_len": 9, "task": "t6-png-dim", "batch": 1, "condition": "claude_md_strong", "trial": 2, "outcome_match": true}
+{"session_id": "d204b47c-4afa-4311-a957-5bc74caaa9a5", "duration_s": 40.14, "total_cost_usd": 0.19239974999999998, "input_tokens": 2932, "cache_creation_input_tokens": 12392, "cache_read_input_tokens": 478493, "output_tokens": 403, "billable_tokens_proxy": 15727, "tool_calls": 6, "wiki_reads_total": 5, "agents_md_read": true, "index_read": true, "guideline_reads": 1, "final_text_len": 140, "task": "t17-log-errors", "batch": 1, "condition": "claude_md_strong", "trial": 1, "outcome_match": true}
+{"session_id": "8a3d2dae-f732-4a25-a2d9-9ca17a08898a", "duration_s": 49.18, "total_cost_usd": 0.187619, "input_tokens": 4388, "cache_creation_input_tokens": 12910, "cache_read_input_tokens": 545019, "output_tokens": 471, "billable_tokens_proxy": 17769, "tool_calls": 6, "wiki_reads_total": 5, "agents_md_read": true, "index_read": true, "guideline_reads": 1, "final_text_len": 1, "task": "t17-log-errors", "batch": 1, "condition": "claude_md_strong", "trial": 3, "outcome_match": true}
+{"session_id": "d0b56fce-edcf-4cf0-8d5f-c8ea3ea98560", "duration_s": 45.82, "total_cost_usd": 0.2350455, "input_tokens": 2936, "cache_creation_input_tokens": 10040, "cache_read_input_tokens": 557853, "output_tokens": 398, "billable_tokens_proxy": 13374, "tool_calls": 8, "wiki_reads_total": 5, "agents_md_read": true, "index_read": true, "guideline_reads": 1, "final_text_len": 141, "task": "t17-log-errors", "batch": 1, "condition": "claude_md_strong", "trial": 2, "outcome_match": true}
+{"session_id": "b4c7fc80-7e6e-4e26-8fc8-cd85685661f7", "duration_s": 33.48, "total_cost_usd": 0.22028350000000002, "input_tokens": 4517, "cache_creation_input_tokens": 13053, "cache_read_input_tokens": 545275, "output_tokens": 315, "billable_tokens_proxy": 17885, "tool_calls": 7, "wiki_reads_total": 5, "agents_md_read": true, "index_read": true, "guideline_reads": 1, "final_text_len": 319, "task": "t13-gzip-dec", "batch": 1, "condition": "claude_md_strong", "trial": 1, "outcome_match": true}
+{"session_id": "9b8fe3f5-78ac-4952-b77b-5162ebf65c34", "duration_s": 41.42, "total_cost_usd": 0.16637899999999997, "input_tokens": 4515, "cache_creation_input_tokens": 2275, "cache_read_input_tokens": 515206, "output_tokens": 373, "billable_tokens_proxy": 7163, "tool_calls": 6, "wiki_reads_total": 4, "agents_md_read": true, "index_read": true, "guideline_reads": 1, "final_text_len": 365, "task": "t13-gzip-dec", "batch": 1, "condition": "claude_md_strong", "trial": 3, "outcome_match": true}
+{"session_id": "c4b44887-497b-4f2b-bba3-ece4589a309a", "duration_s": 37.37, "total_cost_usd": 0.21254249999999997, "input_tokens": 4521, "cache_creation_input_tokens": 13451, "cache_read_input_tokens": 615619, "output_tokens": 462, "billable_tokens_proxy": 18434, "tool_calls": 7, "wiki_reads_total": 5, "agents_md_read": true, "index_read": true, "guideline_reads": 1, "final_text_len": 411, "task": "t13-gzip-dec", "batch": 1, "condition": "claude_md_strong", "trial": 2, "outcome_match": true}
+{"session_id": "ad328a24-1c53-495e-ade1-4e3ae74e8302", "duration_s": 49.07, "total_cost_usd": 0.19947874999999998, "input_tokens": 4644, "cache_creation_input_tokens": 12079, "cache_read_input_tokens": 506961, "output_tokens": 381, "billable_tokens_proxy": 17104, "tool_calls": 6, "wiki_reads_total": 4, "agents_md_read": true, "index_read": true, "guideline_reads": 1, "final_text_len": 362, "task": "t12-wav-info", "batch": 1, "condition": "claude_md_strong", "trial": 1, "outcome_match": true}
+{"session_id": "80806dc8-dfcc-4351-8922-6517845f4f69", "duration_s": 57.87, "total_cost_usd": 0.22215924999999997, "input_tokens": 4521, "cache_creation_input_tokens": 14638, "cache_read_input_tokens": 617474, "output_tokens": 522, "billable_tokens_proxy": 19681, "tool_calls": 7, "wiki_reads_total": 5, "agents_md_read": true, "index_read": true, "guideline_reads": 1, "final_text_len": 534, "task": "t12-wav-info", "batch": 1, "condition": "claude_md_strong", "trial": 3, "outcome_match": true}
+{"session_id": "7da018f1-2b2e-4d9f-94b1-36bbcc9d8592", "duration_s": 38.82, "total_cost_usd": 0.208831, "input_tokens": 4517, "cache_creation_input_tokens": 16374, "cache_read_input_tokens": 543921, "output_tokens": 421, "billable_tokens_proxy": 21312, "tool_calls": 6, "wiki_reads_total": 5, "agents_md_read": true, "index_read": true, "guideline_reads": 1, "final_text_len": 383, "task": "t12-wav-info", "batch": 1, "condition": "claude_md_strong", "trial": 2, "outcome_match": true}
+{"session_id": "cd008bd4-19ca-4d40-9be7-395a96649c8d", "duration_s": 156.25, "total_cost_usd": 0.66167125, "input_tokens": 4932, "cache_creation_input_tokens": 130405, "cache_read_input_tokens": 1000353, "output_tokens": 877, "billable_tokens_proxy": 136214, "tool_calls": 12, "wiki_reads_total": 5, "agents_md_read": true, "index_read": true, "guideline_reads": 1, "final_text_len": 1249, "task": "t1-lens-model", "batch": 1, "condition": "claude_md_strong", "trial": 1, "outcome_match": true}
+{"session_id": "06826630-0a10-42fe-8b5e-b575898b0c0e", "duration_s": 101.47, "total_cost_usd": 0.36814824999999995, "input_tokens": 4660, "cache_creation_input_tokens": 20814, "cache_read_input_tokens": 824773, "output_tokens": 516, "billable_tokens_proxy": 25990, "tool_calls": 9, "wiki_reads_total": 5, "agents_md_read": true, "index_read": true, "guideline_reads": 1, "final_text_len": 1567, "task": "t1-lens-model", "batch": 1, "condition": "claude_md_strong", "trial": 3, "outcome_match": false}
+{"session_id": "821fe2d0-8cc1-42cb-9ead-84964202146d", "duration_s": 76.37, "total_cost_usd": 0.27928224999999995, "input_tokens": 4783, "cache_creation_input_tokens": 14567, "cache_read_input_tokens": 690776, "output_tokens": 528, "billable_tokens_proxy": 19878, "tool_calls": 8, "wiki_reads_total": 5, "agents_md_read": true, "index_read": true, "guideline_reads": 1, "final_text_len": 660, "task": "t1-lens-model", "batch": 1, "condition": "claude_md_strong", "trial": 2, "outcome_match": true}
+{"session_id": "cef11f5d-967d-47c9-a7ab-88a032752197", "duration_s": 53.82, "total_cost_usd": 0.25592800000000004, "input_tokens": 4787, "cache_creation_input_tokens": 19897, "cache_read_input_tokens": 760092, "output_tokens": 534, "billable_tokens_proxy": 25218, "tool_calls": 8, "wiki_reads_total": 5, "agents_md_read": true, "index_read": true, "guideline_reads": 1, "final_text_len": 468, "task": "t3-todos", "batch": 1, "condition": "claude_md_strong", "trial": 1, "outcome_match": true}
+{"session_id": "15cf7094-83dd-48eb-aabe-3fa85929a0af", "duration_s": 42.43, "total_cost_usd": 0.22527500000000003, "input_tokens": 4781, "cache_creation_input_tokens": 15670, "cache_read_input_tokens": 654911, "output_tokens": 541, "billable_tokens_proxy": 20992, "tool_calls": 7, "wiki_reads_total": 3, "agents_md_read": true, "index_read": true, "guideline_reads": 0, "final_text_len": 426, "task": "t3-todos", "batch": 1, "condition": "claude_md_strong", "trial": 3, "outcome_match": true}
+{"session_id": "be1b6928-4de2-4cb0-9ff2-365026efe35c", "duration_s": 56.31, "total_cost_usd": 0.24562999999999996, "input_tokens": 4785, "cache_creation_input_tokens": 14690, "cache_read_input_tokens": 726336, "output_tokens": 501, "billable_tokens_proxy": 19976, "tool_calls": 8, "wiki_reads_total": 5, "agents_md_read": true, "index_read": true, "guideline_reads": 1, "final_text_len": 434, "task": "t3-todos", "batch": 1, "condition": "claude_md_strong", "trial": 2, "outcome_match": true}
+{"session_id": "4c526ddf-ce1a-41d0-9068-40eaeddf8f21", "duration_s": 35.55, "total_cost_usd": 0.21047425, "input_tokens": 4519, "cache_creation_input_tokens": 16409, "cache_read_input_tokens": 573019, "output_tokens": 406, "billable_tokens_proxy": 21334, "tool_calls": 7, "wiki_reads_total": 5, "agents_md_read": true, "index_read": true, "guideline_reads": 1, "final_text_len": 321, "task": "t16-ini-key", "batch": 1, "condition": "claude_md_strong", "trial": 1, "outcome_match": true}
+{"session_id": "9ee55246-9c17-4262-ac53-21419467273f", "duration_s": 35.53, "total_cost_usd": 0.189132, "input_tokens": 4519, "cache_creation_input_tokens": 13642, "cache_read_input_tokens": 578357, "output_tokens": 457, "billable_tokens_proxy": 18618, "tool_calls": 6, "wiki_reads_total": 5, "agents_md_read": true, "index_read": true, "guideline_reads": 1, "final_text_len": 350, "task": "t16-ini-key", "batch": 1, "condition": "claude_md_strong", "trial": 3, "outcome_match": true}
+{"session_id": "35e5e49e-63bf-41e6-8939-63d4e22c1021", "duration_s": 33.98, "total_cost_usd": 0.18546825, "input_tokens": 2932, "cache_creation_input_tokens": 9601, "cache_read_input_tokens": 479869, "output_tokens": 352, "billable_tokens_proxy": 12885, "tool_calls": 6, "wiki_reads_total": 5, "agents_md_read": true, "index_read": true, "guideline_reads": 1, "final_text_len": 329, "task": "t16-ini-key", "batch": 1, "condition": "claude_md_strong", "trial": 2, "outcome_match": true}
+{"session_id": "c41d3516-1299-46fc-9771-7ef044980ea8", "duration_s": 41.37, "total_cost_usd": 0.23471749999999997, "input_tokens": 4519, "cache_creation_input_tokens": 18223, "cache_read_input_tokens": 576461, "output_tokens": 406, "billable_tokens_proxy": 23148, "tool_calls": 7, "wiki_reads_total": 5, "agents_md_read": true, "index_read": true, "guideline_reads": 1, "final_text_len": 185, "task": "t9-webp-dim", "batch": 1, "condition": "claude_md_strong", "trial": 1, "outcome_match": true}
+{"session_id": "7b4920e1-c28d-428b-9691-b39b43e88d29", "duration_s": 71.08, "total_cost_usd": 0.28735700000000003, "input_tokens": 3327, "cache_creation_input_tokens": 11685, "cache_read_input_tokens": 630502, "output_tokens": 479, "billable_tokens_proxy": 15491, "tool_calls": 9, "wiki_reads_total": 5, "agents_md_read": true, "index_read": true, "guideline_reads": 1, "final_text_len": 179, "task": "t9-webp-dim", "batch": 1, "condition": "claude_md_strong", "trial": 3, "outcome_match": true}
+{"session_id": "9ac6d785-ebae-4873-9f6f-030e70914381", "duration_s": 69.87, "total_cost_usd": 0.26038174999999997, "input_tokens": 2934, "cache_creation_input_tokens": 17122, "cache_read_input_tokens": 524574, "output_tokens": 402, "billable_tokens_proxy": 20458, "tool_calls": 7, "wiki_reads_total": 5, "agents_md_read": true, "index_read": true, "guideline_reads": 1, "final_text_len": 306, "task": "t9-webp-dim", "batch": 1, "condition": "claude_md_strong", "trial": 2, "outcome_match": true}
+{"session_id": "5c0737e6-f786-4ef7-9e8b-bdbf3a6d3545", "duration_s": 58.09, "total_cost_usd": 0.21413574999999999, "input_tokens": 2932, "cache_creation_input_tokens": 11541, "cache_read_input_tokens": 476588, "output_tokens": 353, "billable_tokens_proxy": 14826, "tool_calls": 7, "wiki_reads_total": 5, "agents_md_read": true, "index_read": true, "guideline_reads": 1, "final_text_len": 148, "task": "t7-gif-dim", "batch": 1, "condition": "claude_md_strong", "trial": 1, "outcome_match": true}
+{"session_id": "096b0a12-024f-4329-bf50-fa518f19bdc2", "duration_s": 56.52, "total_cost_usd": 0.22736725000000002, "input_tokens": 2934, "cache_creation_input_tokens": 9477, "cache_read_input_tokens": 520334, "output_tokens": 441, "billable_tokens_proxy": 12852, "tool_calls": 8, "wiki_reads_total": 5, "agents_md_read": true, "index_read": true, "guideline_reads": 1, "final_text_len": 21, "task": "t7-gif-dim", "batch": 1, "condition": "claude_md_strong", "trial": 3, "outcome_match": false}
+{"session_id": "be755eea-765f-4d13-9cb8-d065fe5e4ea5", "duration_s": 28.7, "total_cost_usd": 0.18607449999999998, "input_tokens": 2930, "cache_creation_input_tokens": 9400, "cache_read_input_tokens": 444687, "output_tokens": 350, "billable_tokens_proxy": 12680, "tool_calls": 6, "wiki_reads_total": 5, "agents_md_read": true, "index_read": true, "guideline_reads": 1, "final_text_len": 166, "task": "t7-gif-dim", "batch": 1, "condition": "claude_md_strong", "trial": 2, "outcome_match": true}
+{"session_id": "6c2a7f4f-6e68-4515-9807-86ed692ca0a3", "duration_s": 75.6, "total_cost_usd": 0.34509924999999997, "input_tokens": 4797, "cache_creation_input_tokens": 24927, "cache_read_input_tokens": 951119, "output_tokens": 742, "billable_tokens_proxy": 30466, "tool_calls": 10, "wiki_reads_total": 6, "agents_md_read": true, "index_read": true, "guideline_reads": 1, "final_text_len": 1374, "task": "t2-imports", "batch": 1, "condition": "claude_md_strong", "trial": 1, "outcome_match": true}
+{"session_id": "32d9db42-19b0-4c28-a515-3a5104dfd514", "duration_s": 63.7, "total_cost_usd": 0.28174525, "input_tokens": 4789, "cache_creation_input_tokens": 18619, "cache_read_input_tokens": 806005, "output_tokens": 581, "billable_tokens_proxy": 23989, "tool_calls": 8, "wiki_reads_total": 4, "agents_md_read": true, "index_read": true, "guideline_reads": 1, "final_text_len": 981, "task": "t2-imports", "batch": 1, "condition": "claude_md_strong", "trial": 3, "outcome_match": true}
+{"session_id": "69ed0b71-80d1-4e37-962d-0f744ea781ca", "duration_s": 52.63, "total_cost_usd": 0.25298424999999997, "input_tokens": 4785, "cache_creation_input_tokens": 20315, "cache_read_input_tokens": 732344, "output_tokens": 537, "billable_tokens_proxy": 25637, "tool_calls": 7, "wiki_reads_total": 4, "agents_md_read": true, "index_read": true, "guideline_reads": 0, "final_text_len": 1080, "task": "t2-imports", "batch": 1, "condition": "claude_md_strong", "trial": 2, "outcome_match": true}
+{"session_id": "310b9593-fd34-4345-b2c7-f55978080daa", "duration_s": 22.42, "total_cost_usd": 0.177683, "input_tokens": 4380, "cache_creation_input_tokens": 27544, "cache_read_input_tokens": 405322, "output_tokens": 318, "billable_tokens_proxy": 32242, "tool_calls": 4, "wiki_reads_total": 3, "agents_md_read": true, "index_read": true, "guideline_reads": 1, "final_text_len": 233, "task": "t10-zip-list", "batch": 2, "condition": "claude_md_strong", "trial": 1, "outcome_match": true}
+{"session_id": "58e26ddd-de2b-4489-bc7e-5c0bfdf7d10f", "duration_s": 29.23, "total_cost_usd": 0.14146525, "input_tokens": 4376, "cache_creation_input_tokens": 21597, "cache_read_input_tokens": 330961, "output_tokens": 308, "billable_tokens_proxy": 26281, "tool_calls": 3, "wiki_reads_total": 2, "agents_md_read": true, "index_read": true, "guideline_reads": 0, "final_text_len": 318, "task": "t10-zip-list", "batch": 2, "condition": "claude_md_strong", "trial": 3, "outcome_match": true}
+{"session_id": "cf6c5570-f230-4938-a8bb-9d5589aa9bd7", "duration_s": 25.41, "total_cost_usd": 0.16665149999999998, "input_tokens": 4378, "cache_creation_input_tokens": 19732, "cache_read_input_tokens": 377749, "output_tokens": 227, "billable_tokens_proxy": 24337, "tool_calls": 4, "wiki_reads_total": 3, "agents_md_read": true, "index_read": true, "guideline_reads": 1, "final_text_len": 229, "task": "t10-zip-list", "batch": 2, "condition": "claude_md_strong", "trial": 2, "outcome_match": true}
+{"session_id": "887c518d-4c5f-463a-93b2-1e0f3ad019e6", "duration_s": 22.27, "total_cost_usd": 0.15011724999999998, "input_tokens": 2920, "cache_creation_input_tokens": 21267, "cache_read_input_tokens": 265590, "output_tokens": 268, "billable_tokens_proxy": 24455, "tool_calls": 3, "wiki_reads_total": 2, "agents_md_read": true, "index_read": true, "guideline_reads": 0, "final_text_len": 183, "task": "t14-csv-quoted", "batch": 2, "condition": "claude_md_strong", "trial": 1, "outcome_match": true}
+{"session_id": "fd1e67a0-f2c4-4ad4-9f61-d2f7593cd2fa", "duration_s": 18.55, "total_cost_usd": 0.15242275, "input_tokens": 2920, "cache_creation_input_tokens": 22001, "cache_read_input_tokens": 264967, "output_tokens": 279, "billable_tokens_proxy": 25200, "tool_calls": 3, "wiki_reads_total": 2, "agents_md_read": true, "index_read": true, "guideline_reads": 0, "final_text_len": 118, "task": "t14-csv-quoted", "batch": 2, "condition": "claude_md_strong", "trial": 3, "outcome_match": true}
+{"session_id": "c8cc1c47-a657-4121-a72d-94d13b867831", "duration_s": 25.29, "total_cost_usd": 0.14563075, "input_tokens": 4420, "cache_creation_input_tokens": 21741, "cache_read_input_tokens": 331012, "output_tokens": 328, "billable_tokens_proxy": 26489, "tool_calls": 3, "wiki_reads_total": 2, "agents_md_read": true, "index_read": true, "guideline_reads": 0, "final_text_len": 103, "task": "t14-csv-quoted", "batch": 2, "condition": "claude_md_strong", "trial": 2, "outcome_match": true}
+{"session_id": "5d74b0fc-8492-4486-85bf-2bb8f61fad38", "duration_s": 23.94, "total_cost_usd": 0.17728875, "input_tokens": 4378, "cache_creation_input_tokens": 23441, "cache_read_input_tokens": 370036, "output_tokens": 215, "billable_tokens_proxy": 28034, "tool_calls": 4, "wiki_reads_total": 3, "agents_md_read": true, "index_read": true, "guideline_reads": 1, "final_text_len": 536, "task": "t11-tar-list", "batch": 2, "condition": "claude_md_strong", "trial": 1, "outcome_match": true}
+{"session_id": "550426a1-88b8-40e2-a3b5-a6a4e88f32f6", "duration_s": 25.02, "total_cost_usd": 0.17305925, "input_tokens": 4380, "cache_creation_input_tokens": 23068, "cache_read_input_tokens": 409593, "output_tokens": 296, "billable_tokens_proxy": 27744, "tool_calls": 4, "wiki_reads_total": 3, "agents_md_read": true, "index_read": true, "guideline_reads": 1, "final_text_len": 723, "task": "t11-tar-list", "batch": 2, "condition": "claude_md_strong", "trial": 3, "outcome_match": true}
+{"session_id": "66f07563-596f-4edb-b117-e3f4a08dbcc4", "duration_s": 30.32, "total_cost_usd": 0.1721055, "input_tokens": 4380, "cache_creation_input_tokens": 23119, "cache_read_input_tokens": 409711, "output_tokens": 308, "billable_tokens_proxy": 27807, "tool_calls": 4, "wiki_reads_total": 3, "agents_md_read": true, "index_read": true, "guideline_reads": 1, "final_text_len": 617, "task": "t11-tar-list", "batch": 2, "condition": "claude_md_strong", "trial": 2, "outcome_match": true}
+{"session_id": "db6d5880-acd2-497c-b6d4-944211c1460f", "duration_s": 16.68, "total_cost_usd": 0.12893575, "input_tokens": 2916, "cache_creation_input_tokens": 17130, "cache_read_input_tokens": 191718, "output_tokens": 192, "billable_tokens_proxy": 20238, "tool_calls": 2, "wiki_reads_total": 2, "agents_md_read": true, "index_read": true, "guideline_reads": 0, "final_text_len": 269, "task": "t5-base64", "batch": 2, "condition": "claude_md_strong", "trial": 1, "outcome_match": true}
+{"session_id": "05452062-5b98-4ab5-973d-c9034bcb40c3", "duration_s": 19.22, "total_cost_usd": 0.12258775, "input_tokens": 2916, "cache_creation_input_tokens": 14314, "cache_read_input_tokens": 194726, "output_tokens": 206, "billable_tokens_proxy": 17436, "tool_calls": 2, "wiki_reads_total": 2, "agents_md_read": true, "index_read": true, "guideline_reads": 0, "final_text_len": 216, "task": "t5-base64", "batch": 2, "condition": "claude_md_strong", "trial": 3, "outcome_match": true}
+{"session_id": "5dabdfd9-2e46-4a16-807a-991072d25e2d", "duration_s": 21.89, "total_cost_usd": 0.12660425, "input_tokens": 2916, "cache_creation_input_tokens": 14270, "cache_read_input_tokens": 194742, "output_tokens": 210, "billable_tokens_proxy": 17396, "tool_calls": 2, "wiki_reads_total": 2, "agents_md_read": true, "index_read": true, "guideline_reads": 0, "final_text_len": 270, "task": "t5-base64", "batch": 2, "condition": "claude_md_strong", "trial": 2, "outcome_match": true}
+{"session_id": "63025467-c3c4-4749-b8c6-0dd1402a8cdc", "duration_s": 26.71, "total_cost_usd": 0.1806455, "input_tokens": 4378, "cache_creation_input_tokens": 23640, "cache_read_input_tokens": 370220, "output_tokens": 190, "billable_tokens_proxy": 28208, "tool_calls": 4, "wiki_reads_total": 3, "agents_md_read": true, "index_read": true, "guideline_reads": 1, "final_text_len": 524, "task": "t8-bmp-info", "batch": 2, "condition": "claude_md_strong", "trial": 1, "outcome_match": true}
+{"session_id": "7dd18422-7db1-45a2-b48a-2b66e66ee7c4", "duration_s": 20.83, "total_cost_usd": 0.17230724999999997, "input_tokens": 4378, "cache_creation_input_tokens": 19293, "cache_read_input_tokens": 374692, "output_tokens": 179, "billable_tokens_proxy": 23850, "tool_calls": 4, "wiki_reads_total": 3, "agents_md_read": true, "index_read": true, "guideline_reads": 1, "final_text_len": 412, "task": "t8-bmp-info", "batch": 2, "condition": "claude_md_strong", "trial": 3, "outcome_match": true}
+{"session_id": "3f5aaf04-fd05-473a-b6aa-8aecbef4df9a", "duration_s": 34.79, "total_cost_usd": 0.17037975, "input_tokens": 4378, "cache_creation_input_tokens": 19171, "cache_read_input_tokens": 374572, "output_tokens": 175, "billable_tokens_proxy": 23724, "tool_calls": 4, "wiki_reads_total": 3, "agents_md_read": true, "index_read": true, "guideline_reads": 1, "final_text_len": 392, "task": "t8-bmp-info", "batch": 2, "condition": "claude_md_strong", "trial": 2, "outcome_match": true}
+{"session_id": "63a0fb78-edb2-4068-8962-abaf2bd7bb52", "duration_s": 27.09, "total_cost_usd": 0.1469225, "input_tokens": 2922, "cache_creation_input_tokens": 21454, "cache_read_input_tokens": 304672, "output_tokens": 285, "billable_tokens_proxy": 24661, "tool_calls": 3, "wiki_reads_total": 2, "agents_md_read": true, "index_read": true, "guideline_reads": 0, "final_text_len": 73, "task": "t15-jsonl-kinds", "batch": 2, "condition": "claude_md_strong", "trial": 1, "outcome_match": true}
+{"session_id": "10d7edb4-236e-47e8-ab99-b2fc8fd05dd9", "duration_s": 28.15, "total_cost_usd": 0.164087, "input_tokens": 2924, "cache_creation_input_tokens": 19749, "cache_read_input_tokens": 347402, "output_tokens": 210, "billable_tokens_proxy": 22883, "tool_calls": 4, "wiki_reads_total": 3, "agents_md_read": true, "index_read": true, "guideline_reads": 1, "final_text_len": 109, "task": "t15-jsonl-kinds", "batch": 2, "condition": "claude_md_strong", "trial": 3, "outcome_match": true}
+{"session_id": "30f104e7-3f42-46ec-8149-a9dacb04a7d2", "duration_s": 21.56, "total_cost_usd": 0.14012975, "input_tokens": 2920, "cache_creation_input_tokens": 18325, "cache_read_input_tokens": 268690, "output_tokens": 202, "billable_tokens_proxy": 21447, "tool_calls": 3, "wiki_reads_total": 2, "agents_md_read": true, "index_read": true, "guideline_reads": 0, "final_text_len": 73, "task": "t15-jsonl-kinds", "batch": 2, "condition": "claude_md_strong", "trial": 2, "outcome_match": true}
+{"session_id": "95789913-f235-48b2-ba33-a2031409aabf", "duration_s": 26.88, "total_cost_usd": 0.17568925, "input_tokens": 2922, "cache_creation_input_tokens": 18910, "cache_read_input_tokens": 309244, "output_tokens": 197, "billable_tokens_proxy": 22029, "tool_calls": 4, "wiki_reads_total": 3, "agents_md_read": true, "index_read": true, "guideline_reads": 1, "final_text_len": 213, "task": "t6-png-dim", "batch": 2, "condition": "claude_md_strong", "trial": 1, "outcome_match": true}
+{"session_id": "b8c37236-bbf4-45ce-bbfa-e5c69a21ea3b", "duration_s": 41.24, "total_cost_usd": 0.17086675, "input_tokens": 4380, "cache_creation_input_tokens": 23291, "cache_read_input_tokens": 409976, "output_tokens": 269, "billable_tokens_proxy": 27940, "tool_calls": 4, "wiki_reads_total": 3, "agents_md_read": true, "index_read": true, "guideline_reads": 1, "final_text_len": 196, "task": "t6-png-dim", "batch": 2, "condition": "claude_md_strong", "trial": 3, "outcome_match": true}
+{"session_id": "11109d6d-59a5-4732-a707-3b2351d3be9c", "duration_s": 21.75, "total_cost_usd": 0.17250475, "input_tokens": 4380, "cache_creation_input_tokens": 23395, "cache_read_input_tokens": 409964, "output_tokens": 318, "billable_tokens_proxy": 28093, "tool_calls": 4, "wiki_reads_total": 3, "agents_md_read": true, "index_read": true, "guideline_reads": 1, "final_text_len": 196, "task": "t6-png-dim", "batch": 2, "condition": "claude_md_strong", "trial": 2, "outcome_match": true}
+{"session_id": "defdcb93-5496-40ec-966a-f2c06aed52d7", "duration_s": 30.94, "total_cost_usd": 0.14994000000000002, "input_tokens": 4376, "cache_creation_input_tokens": 26204, "cache_read_input_tokens": 326635, "output_tokens": 325, "billable_tokens_proxy": 30905, "tool_calls": 3, "wiki_reads_total": 2, "agents_md_read": true, "index_read": true, "guideline_reads": 0, "final_text_len": 147, "task": "t17-log-errors", "batch": 2, "condition": "claude_md_strong", "trial": 1, "outcome_match": true}
+{"session_id": "32ed1407-01f5-415a-aa62-7c054f26e69a", "duration_s": 27.03, "total_cost_usd": 0.144607, "input_tokens": 4378, "cache_creation_input_tokens": 22237, "cache_read_input_tokens": 370205, "output_tokens": 395, "billable_tokens_proxy": 27010, "tool_calls": 3, "wiki_reads_total": 2, "agents_md_read": true, "index_read": true, "guideline_reads": 0, "final_text_len": 139, "task": "t17-log-errors", "batch": 2, "condition": "claude_md_strong", "trial": 3, "outcome_match": true}
+{"session_id": "44c8da03-7579-4ec3-9966-8c89a9abb52b", "duration_s": 19.94, "total_cost_usd": 0.145328, "input_tokens": 4378, "cache_creation_input_tokens": 21970, "cache_read_input_tokens": 370051, "output_tokens": 356, "billable_tokens_proxy": 26704, "tool_calls": 3, "wiki_reads_total": 2, "agents_md_read": true, "index_read": true, "guideline_reads": 0, "final_text_len": 100, "task": "t17-log-errors", "batch": 2, "condition": "claude_md_strong", "trial": 2, "outcome_match": true}
+{"session_id": "d3495194-a4ec-420c-8c97-b664e7799c55", "duration_s": 36.12, "total_cost_usd": 0.17531950000000002, "input_tokens": 2924, "cache_creation_input_tokens": 22596, "cache_read_input_tokens": 344052, "output_tokens": 301, "billable_tokens_proxy": 25821, "tool_calls": 4, "wiki_reads_total": 3, "agents_md_read": true, "index_read": true, "guideline_reads": 1, "final_text_len": 344, "task": "t13-gzip-dec", "batch": 2, "condition": "claude_md_strong", "trial": 1, "outcome_match": true}
+{"session_id": "6074dab4-c1cc-48e2-a945-204e524ee457", "duration_s": 22.4, "total_cost_usd": 0.16632875, "input_tokens": 4378, "cache_creation_input_tokens": 19646, "cache_read_input_tokens": 377656, "output_tokens": 205, "billable_tokens_proxy": 24229, "tool_calls": 4, "wiki_reads_total": 3, "agents_md_read": true, "index_read": true, "guideline_reads": 1, "final_text_len": 352, "task": "t13-gzip-dec", "batch": 2, "condition": "claude_md_strong", "trial": 3, "outcome_match": true}
+{"session_id": "173e8140-db33-4de6-ba43-e4b2f15a4b22", "duration_s": 36.95, "total_cost_usd": 0.165017, "input_tokens": 4376, "cache_creation_input_tokens": 15723, "cache_read_input_tokens": 342549, "output_tokens": 93, "billable_tokens_proxy": 20192, "tool_calls": 4, "wiki_reads_total": 3, "agents_md_read": true, "index_read": true, "guideline_reads": 1, "final_text_len": 320, "task": "t13-gzip-dec", "batch": 2, "condition": "claude_md_strong", "trial": 2, "outcome_match": true}
+{"session_id": "1db4fe39-8430-4b57-b93d-71b412216117", "duration_s": 22.35, "total_cost_usd": 0.17687524999999998, "input_tokens": 4380, "cache_creation_input_tokens": 27548, "cache_read_input_tokens": 405290, "output_tokens": 281, "billable_tokens_proxy": 32209, "tool_calls": 4, "wiki_reads_total": 3, "agents_md_read": true, "index_read": true, "guideline_reads": 1, "final_text_len": 219, "task": "t12-wav-info", "batch": 2, "condition": "claude_md_strong", "trial": 1, "outcome_match": true}
+{"session_id": "776b643a-3348-4fc5-9e7b-90d6f1683a1f", "duration_s": 23.9, "total_cost_usd": 0.16788875000000003, "input_tokens": 4378, "cache_creation_input_tokens": 19160, "cache_read_input_tokens": 374625, "output_tokens": 169, "billable_tokens_proxy": 23707, "tool_calls": 4, "wiki_reads_total": 3, "agents_md_read": true, "index_read": true, "guideline_reads": 1, "final_text_len": 262, "task": "t12-wav-info", "batch": 2, "condition": "claude_md_strong", "trial": 3, "outcome_match": true}
+{"session_id": "6b713e06-88c9-45e6-97f2-8385d7397a7c", "duration_s": 41.18, "total_cost_usd": 0.17020125000000003, "input_tokens": 4380, "cache_creation_input_tokens": 23165, "cache_read_input_tokens": 409740, "output_tokens": 281, "billable_tokens_proxy": 27826, "tool_calls": 4, "wiki_reads_total": 3, "agents_md_read": true, "index_read": true, "guideline_reads": 1, "final_text_len": 318, "task": "t12-wav-info", "batch": 2, "condition": "claude_md_strong", "trial": 2, "outcome_match": false}
+{"session_id": "be995bf9-5596-4c30-b47e-3aaebe882acd", "duration_s": 48.77, "total_cost_usd": 0.24819524999999998, "input_tokens": 4511, "cache_creation_input_tokens": 23473, "cache_read_input_tokens": 459773, "output_tokens": 189, "billable_tokens_proxy": 28173, "tool_calls": 5, "wiki_reads_total": 3, "agents_md_read": true, "index_read": true, "guideline_reads": 1, "final_text_len": 776, "task": "t1-lens-model", "batch": 2, "condition": "claude_md_strong", "trial": 1, "outcome_match": true}
+{"session_id": "f368ec41-a9d0-4710-8c58-25829b3caba2", "duration_s": 37.74, "total_cost_usd": 0.2061225, "input_tokens": 4380, "cache_creation_input_tokens": 22138, "cache_read_input_tokens": 419024, "output_tokens": 297, "billable_tokens_proxy": 26815, "tool_calls": 4, "wiki_reads_total": 3, "agents_md_read": true, "index_read": true, "guideline_reads": 1, "final_text_len": 654, "task": "t1-lens-model", "batch": 2, "condition": "claude_md_strong", "trial": 3, "outcome_match": true}
+{"session_id": "17320910-1f55-44cb-90d6-ffd5ab18cad4", "duration_s": 68.35, "total_cost_usd": 0.24599675, "input_tokens": 4513, "cache_creation_input_tokens": 23128, "cache_read_input_tokens": 499625, "output_tokens": 316, "billable_tokens_proxy": 27957, "tool_calls": 5, "wiki_reads_total": 3, "agents_md_read": true, "index_read": true, "guideline_reads": 1, "final_text_len": 715, "task": "t1-lens-model", "batch": 2, "condition": "claude_md_strong", "trial": 2, "outcome_match": true}
+{"session_id": "849a4816-0c9d-46aa-888c-e077fd310433", "duration_s": 41.56, "total_cost_usd": 0.23050450000000003, "input_tokens": 4386, "cache_creation_input_tokens": 29529, "cache_read_input_tokens": 524637, "output_tokens": 398, "billable_tokens_proxy": 34313, "tool_calls": 5, "wiki_reads_total": 2, "agents_md_read": true, "index_read": true, "guideline_reads": 0, "final_text_len": 561, "task": "t3-todos", "batch": 2, "condition": "claude_md_strong", "trial": 1, "outcome_match": true}
+{"session_id": "01928d05-5786-4205-830f-f322e28a7d8a", "duration_s": 55.2, "total_cost_usd": 0.20612525, "input_tokens": 4384, "cache_creation_input_tokens": 27702, "cache_read_input_tokens": 488560, "output_tokens": 403, "billable_tokens_proxy": 32489, "tool_calls": 4, "wiki_reads_total": 2, "agents_md_read": true, "index_read": true, "guideline_reads": 0, "final_text_len": 688, "task": "t3-todos", "batch": 2, "condition": "claude_md_strong", "trial": 3, "outcome_match": true}
+{"session_id": "160f927a-ced2-48a7-8a00-658f83d6ef5d", "duration_s": 58.67, "total_cost_usd": 0.24508049999999998, "input_tokens": 4519, "cache_creation_input_tokens": 25121, "cache_read_input_tokens": 609385, "output_tokens": 459, "billable_tokens_proxy": 30099, "tool_calls": 6, "wiki_reads_total": 2, "agents_md_read": true, "index_read": true, "guideline_reads": 0, "final_text_len": 686, "task": "t3-todos", "batch": 2, "condition": "claude_md_strong", "trial": 2, "outcome_match": true}
+{"session_id": "4d2efe38-e350-46cd-97c1-1817ddb2d1b6", "duration_s": 18.94, "total_cost_usd": 0.14610175, "input_tokens": 2920, "cache_creation_input_tokens": 21087, "cache_read_input_tokens": 265479, "output_tokens": 227, "billable_tokens_proxy": 24234, "tool_calls": 3, "wiki_reads_total": 2, "agents_md_read": true, "index_read": true, "guideline_reads": 0, "final_text_len": 387, "task": "t16-ini-key", "batch": 2, "condition": "claude_md_strong", "trial": 1, "outcome_match": true}
+{"session_id": "32cecf43-6478-437d-8b16-6c4f41fffcca", "duration_s": 23.79, "total_cost_usd": 0.13794375, "input_tokens": 4376, "cache_creation_input_tokens": 21519, "cache_read_input_tokens": 330934, "output_tokens": 271, "billable_tokens_proxy": 26166, "tool_calls": 3, "wiki_reads_total": 2, "agents_md_read": true, "index_read": true, "guideline_reads": 0, "final_text_len": 249, "task": "t16-ini-key", "batch": 2, "condition": "claude_md_strong", "trial": 3, "outcome_match": true}
+{"session_id": "a6f488a6-a520-4139-b86c-7f68f0e4cf4f", "duration_s": 21.83, "total_cost_usd": 0.1455925, "input_tokens": 4451, "cache_creation_input_tokens": 26066, "cache_read_input_tokens": 365923, "output_tokens": 343, "billable_tokens_proxy": 30860, "tool_calls": 4, "wiki_reads_total": 3, "agents_md_read": true, "index_read": true, "guideline_reads": 1, "final_text_len": 332, "task": "t16-ini-key", "batch": 2, "condition": "claude_md_strong", "trial": 2, "outcome_match": true}
+{"session_id": "3c39fdcc-2601-40a0-9f30-54e9953fa5d5", "duration_s": 22.53, "total_cost_usd": 0.18425100000000003, "input_tokens": 4376, "cache_creation_input_tokens": 19310, "cache_read_input_tokens": 339820, "output_tokens": 94, "billable_tokens_proxy": 23780, "tool_calls": 4, "wiki_reads_total": 3, "agents_md_read": true, "index_read": true, "guideline_reads": 1, "final_text_len": 227, "task": "t9-webp-dim", "batch": 2, "condition": "claude_md_strong", "trial": 1, "outcome_match": true}
+{"session_id": "672195f5-68f1-4722-afdf-42c83869172f", "duration_s": 29.6, "total_cost_usd": 0.17720424999999998, "input_tokens": 4376, "cache_creation_input_tokens": 16337, "cache_read_input_tokens": 342852, "output_tokens": 94, "billable_tokens_proxy": 20807, "tool_calls": 4, "wiki_reads_total": 3, "agents_md_read": true, "index_read": true, "guideline_reads": 1, "final_text_len": 229, "task": "t9-webp-dim", "batch": 2, "condition": "claude_md_strong", "trial": 3, "outcome_match": true}
+{"session_id": "79072bae-6e25-4d4a-b181-e839372351ab", "duration_s": 26.74, "total_cost_usd": 0.17407075, "input_tokens": 4376, "cache_creation_input_tokens": 16192, "cache_read_input_tokens": 342779, "output_tokens": 112, "billable_tokens_proxy": 20680, "tool_calls": 4, "wiki_reads_total": 3, "agents_md_read": true, "index_read": true, "guideline_reads": 1, "final_text_len": 289, "task": "t9-webp-dim", "batch": 2, "condition": "claude_md_strong", "trial": 2, "outcome_match": true}
+{"session_id": "16f03398-cf75-40e0-b6fc-fbaae39eaaa4", "duration_s": 30.91, "total_cost_usd": 0.17360875, "input_tokens": 2920, "cache_creation_input_tokens": 18029, "cache_read_input_tokens": 270357, "output_tokens": 191, "billable_tokens_proxy": 21140, "tool_calls": 4, "wiki_reads_total": 3, "agents_md_read": true, "index_read": true, "guideline_reads": 1, "final_text_len": 249, "task": "t7-gif-dim", "batch": 2, "condition": "claude_md_strong", "trial": 1, "outcome_match": true}
+{"session_id": "77ad054c-e121-408f-9b67-2cfebba060ff", "duration_s": 24.25, "total_cost_usd": 0.1637025, "input_tokens": 2920, "cache_creation_input_tokens": 15070, "cache_read_input_tokens": 273348, "output_tokens": 127, "billable_tokens_proxy": 18117, "tool_calls": 4, "wiki_reads_total": 3, "agents_md_read": true, "index_read": true, "guideline_reads": 1, "final_text_len": 135, "task": "t7-gif-dim", "batch": 2, "condition": "claude_md_strong", "trial": 3, "outcome_match": true}
+{"session_id": "2408c5f4-fd67-4317-95a6-d1466e700d80", "duration_s": 19.25, "total_cost_usd": 0.1625785, "input_tokens": 2920, "cache_creation_input_tokens": 15019, "cache_read_input_tokens": 273301, "output_tokens": 128, "billable_tokens_proxy": 18067, "tool_calls": 4, "wiki_reads_total": 3, "agents_md_read": true, "index_read": true, "guideline_reads": 1, "final_text_len": 152, "task": "t7-gif-dim", "batch": 2, "condition": "claude_md_strong", "trial": 2, "outcome_match": true}
+{"session_id": "10ac2ecf-bf0e-4a61-b49c-60cb28126c88", "duration_s": 50.15, "total_cost_usd": 0.244591, "input_tokens": 4388, "cache_creation_input_tokens": 31854, "cache_read_input_tokens": 568090, "output_tokens": 419, "billable_tokens_proxy": 36661, "tool_calls": 5, "wiki_reads_total": 3, "agents_md_read": true, "index_read": true, "guideline_reads": 1, "final_text_len": 980, "task": "t2-imports", "batch": 2, "condition": "claude_md_strong", "trial": 1, "outcome_match": true}
+{"session_id": "d56bbc0a-23cf-4915-b0aa-cb94b44e3975", "duration_s": 57.67, "total_cost_usd": 0.243616, "input_tokens": 4648, "cache_creation_input_tokens": 27807, "cache_read_input_tokens": 612133, "output_tokens": 477, "billable_tokens_proxy": 32932, "tool_calls": 5, "wiki_reads_total": 2, "agents_md_read": true, "index_read": true, "guideline_reads": 0, "final_text_len": 944, "task": "t2-imports", "batch": 2, "condition": "claude_md_strong", "trial": 3, "outcome_match": true}
+{"session_id": "5ca53ac4-6aa0-4298-80f5-8b968f5c5c23", "duration_s": 31.21, "total_cost_usd": 0.16709125, "input_tokens": 4376, "cache_creation_input_tokens": 22807, "cache_read_input_tokens": 331250, "output_tokens": 231, "billable_tokens_proxy": 27414, "tool_calls": 3, "wiki_reads_total": 2, "agents_md_read": true, "index_read": true, "guideline_reads": 0, "final_text_len": 207, "task": "t2-imports", "batch": 2, "condition": "claude_md_strong", "trial": 2, "outcome_match": false}
diff --git a/explorations/agent-wiki/experiments/pruned-index-hypothesis.md b/explorations/agent-wiki/experiments/pruned-index-hypothesis.md
new file mode 100644
index 00000000..f23c4f5a
--- /dev/null
+++ b/explorations/agent-wiki/experiments/pruned-index-hypothesis.md
@@ -0,0 +1,95 @@
+# Pruned Index Hypothesis Test
+
+Date: 2026-06-09
+
+## Question
+
+Did `wiki-twobatch-pruned` regress because the delete-on-promote policy is bad,
+or because `_index.jsonl` was stale after skill synthesis archived covered
+guidelines?
+
+## Setup
+
+High-signal slice rerun across four arms:
+
+- `wiki-twobatch-skills`
+- `wiki-twobatch-both`
+- `wiki-twobatch-pruned` (original broken index)
+- `wiki-twobatch-pruned-recataloged` (copy of original pruned, then `catalog`)
+
+Tasks:
+
+- `t1-lens-model`
+- `t6-png-dim`
+- `t7-gif-dim`
+- `t8-bmp-info`
+- `t9-webp-dim`
+- `t14-csv-quoted`
+
+Each arm used `claude_md_strong`, 3 trials per task, 18 trials per arm.
+
+Before rerun:
+
+| Wiki | Rows | Kinds | Missing links |
+|---|---:|---|---:|
+| `wiki-twobatch-skills` | 3 | 3 skills | 0 |
+| `wiki-twobatch-both` | 18 | 3 skills, 15 guidelines | 0 |
+| `wiki-twobatch-pruned` | 15 | 15 guidelines, 0 skills | 6 |
+| `wiki-twobatch-pruned-recataloged` | 12 | 3 skills, 9 guidelines | 0 |
+
+## Aggregate Results
+
+All four arms completed 18/18 outcome matches.
+
+| Arm | Median cost | Sum cost | Median output tokens | Median duration | Median tools | Median wiki reads | Median guideline reads | Median skill reads |
+|---|---:|---:|---:|---:|---:|---:|---:|---:|
+| skills | $0.1548 | $3.0366 | 154 | 23.21s | 4 | 2 | 0 | 1 |
+| both | $0.1937 | $3.6701 | 252 | 22.40s | 5 | 2 | 0 | 1 |
+| pruned-broken | $0.2323 | $4.1766 | 320 | 32.45s | 6 | 3 | 1 | 0 |
+| pruned-recataloged | $0.1934 | $3.7721 | 198 | 22.96s | 5 | 2 | 0 | 1 |
+
+## Per-Task Median Cost
+
+| Task | Skills | Both | Pruned broken | Pruned recataloged |
+|---|---:|---:|---:|---:|
+| `t1-lens-model` | $0.1742 | $0.2113 | $0.2915 | $0.2527 |
+| `t6-png-dim` | $0.1491 | $0.1821 | $0.2340 | $0.1881 |
+| `t7-gif-dim` | $0.1438 | $0.1718 | $0.2257 | $0.1824 |
+| `t8-bmp-info` | $0.1565 | $0.2161 | $0.2335 | $0.1939 |
+| `t9-webp-dim` | $0.1478 | $0.1844 | $0.2319 | $0.1829 |
+| `t14-csv-quoted` | $0.1919 | $0.2074 | $0.1561 | $0.2107 |
+
+## Interpretation
+
+The stale-index hypothesis is supported. Recataloging the pruned wiki reduced
+median cost from `$0.2323` to `$0.1934` (-17%), median output tokens from `320`
+to `198` (-38%), median duration from `32.45s` to `22.96s` (-29%), and median
+tool calls from `6` to `5`.
+
+The mechanism is also supported by retrieval behavior. The broken pruned arm
+had no skill reads in any trial because `_index.jsonl` did not expose skills.
+It had a median of 1 guideline read and followed stale rows for archived
+guidelines. The corrected pruned arm had median 1 skill read and 0 guideline
+reads, matching the intended retrieval path.
+
+The broader skills-only conclusion still holds on this slice. Corrected pruned
+roughly ties `both` on median cost (`$0.1934` vs `$0.1937`) and lowers output
+tokens (`198` vs `252`), but it remains more expensive than skills-only
+(`$0.1548`). So there are two effects:
+
+1. The original pruned result was materially confounded by a stale/broken index.
+2. Even after fixing the index, skills-only remains the cheapest retrieval
+   surface for these direct skill-match tasks.
+
+## Artifacts
+
+Metrics:
+
+- `experiments/results-pruned-index-hypothesis/skills/metrics.jsonl`
+- `experiments/results-pruned-index-hypothesis/both/metrics.jsonl`
+- `experiments/results-pruned-index-hypothesis/pruned-broken/metrics.jsonl`
+- `experiments/results-pruned-index-hypothesis/pruned-recataloged/metrics.jsonl`
+
+Corrected wiki copy:
+
+- `wiki-twobatch-pruned-recataloged/`
diff --git a/explorations/agent-wiki/experiments/twobatch-comparison.md b/explorations/agent-wiki/experiments/twobatch-comparison.md
new file mode 100644
index 00000000..589c0918
--- /dev/null
+++ b/explorations/agent-wiki/experiments/twobatch-comparison.md
@@ -0,0 +1,91 @@
+# Two-batch wiki-helps comparison
+
+**Question**: does a populated wiki reduce token cost / wall-clock at equal-or-better accuracy, vs the same task on an empty wiki?
+
+Setup: 16 tasks × 3 trials × 2 batches = 96 sandbox trials, all `claude_md_strong`. Batch 1's agent saw an empty wiki. After ingestion the wiki was frozen (47 summaries → 15 atomics → consolidation; the live state is at `wiki-twobatch/`). Batch 2's agent saw the populated wiki.
+
+## TL;DR
+
+| Headline | Δ |
+|---|---|
+| **Median total cost** ($0.21 → $0.17) | **-20%** |
+| **Median duration** (43s → 27s) | **-38%** |
+| **Median tool calls** (7 → 4) | **-43%** |
+| **Median wiki reads** (5 → 3) | **-40%** |
+| **Median output tokens** (406 → 268) | **-34%** |
+| **Cache-read tokens** (cheap) | -32% |
+| **Cache-creation tokens** (one-shot, agent reads new pages) | +66% |
+| **Aggregate accuracy** | unchanged (96%) |
+
+**With wiki → faster, cheaper, fewer tools, equal accuracy.** The agent's
+recipe path is shorter when the wiki has the recipe.
+
+Two task-level regressions worth noting (both 100% → 67% in batch 2):
+`t12-wav-info` and `t2-imports`. One trial of each failed in batch 2. For
+`t12-wav-info` the likely cause is the agent over-applying or misreading a recalled guideline; the
+failing `t2-imports` trial recorded 0 guideline reads, so its cause lies elsewhere. `t1-lens-model`
+and `t7-gif-dim` went the other way: 67% → 100% (the wiki rescued failing trials).
+
+The `billable_tokens_proxy` row reads "+47%" because it counts
+cache-creation tokens (up 66%) in full and excludes cache-read tokens
+(down 32%) entirely. The actual `total_cost_usd` (Anthropic prices cache reads
+at ~10% of regular input) is the ground truth — and that's down 20%.
+
+## Aggregate (95 trials: 47 + 48)
+
+| Metric | Batch 1 (empty wiki) | Batch 2 (with wiki) | Δ |
+|---|---:|---:|---:|
+| Trials | 47 | 48 | +1 |
+| Accuracy (mean) | 96% | 96% | +0.0 (+0%) |
+| Median duration | 43s | 27s | -17s (-38%) |
+| Median input tokens | 4,517 | 4,378 | -139 (-3%) |
+| Median cache-creation tokens | 13,197 | 21,855 | +8,658 (+66%) |
+| Median cache-read tokens | 545,088 | 367,979 | -177,108 (-32%) |
+| Median output tokens | 406 | 268 | -137 (-34%) |
+| Median billable proxy (in+cc+out) | 17,838 | 26,223 | +8,385 (+47%) |
+| Median total cost USD | $0.2141 | $0.1703 | $-0.0438 (-20%) |
+| Median tool calls | 7.0 | 4.0 | -3.0 (-43%) |
+| Median wiki reads | 5.0 | 3.0 | -2.0 (-40%) |
+| Median guideline reads | 1.0 | 1.0 | +0.0 (+0%) |
+
+## By task family
+
+Mean accuracy, median per-trial duration, and median `billable_tokens_proxy` within each family. Δ = batch-2 minus batch-1.
+
+| Family | Tasks | B1 acc | B2 acc | Δ acc | B1 dur | B2 dur | Δ dur | B1 tokens | B2 tokens | Δ tokens |
+|---|---|---:|---:|---:|---:|---:|---:|---:|---:|---:|
+| lens-model | t1-lens-model | 67% | 100% | +0.3 (+50%) | 101s | 49s | -53s (-52%) | 25,990 | 27,957 | +1,967 (+8%) |
+| image | t6-png-dim, t7-gif-dim, t8-bmp-info, t9-webp-dim | 91% | 100% | +0.1 (+10%) | 58s | 27s | -31s (-54%) | 14,826 | 22,876 | +8,050 (+54%) |
+| archive | t10-zip-list, t11-tar-list, t12-wav-info, t13-gzip-dec | 100% | 92% | -0.1 (-8%) | 39s | 25s | -14s (-36%) | 17,847 | 27,012 | +9,165 (+51%) |
+| text | t14-csv-quoted, t15-jsonl-kinds, t16-ini-key, t17-log-errors | 100% | 100% | +0.0 (+0%) | 40s | 23s | -17s (-42%) | 17,142 | 25,683 | +8,540 (+50%) |
+| skip | t2-imports, t3-todos, t5-base64 | 100% | 89% | -0.1 (-11%) | 53s | 42s | -11s (-21%) | 20,992 | 30,099 | +9,107 (+43%) |
+
+## Per task
+
+Median across 3 trials per cell. Token = `billable_tokens_proxy` (input + cache-creation + output; cache reads excluded).
+
+| Task | B1 acc | B2 acc | B1 dur | B2 dur | Δ dur | B1 tokens | B2 tokens | Δ tokens | B1 tools | B2 tools |
+|---|:-:|:-:|---:|---:|---:|---:|---:|---:|---:|---:|
+| `t1-lens-model` | 67% | 100% | 101s | 49s | -53s (-52%) | 25,990 | 27,957 | +1,967 (+8%) | 9.0 | 5.0 |
+| `t6-png-dim` | 100% | 100% | 43s | 27s | -16s (-38%) | 13,676 | 27,940 | +14,264 (+104%) | 6.0 | 4.0 |
+| `t7-gif-dim` | 67% | 100% | 57s | 24s | -32s (-57%) | 12,852 | 18,117 | +5,265 (+41%) | 7.0 | 4.0 |
+| `t8-bmp-info` | 100% | 100% | 79s | 27s | -53s (-66%) | 45,158 | 23,850 | -21,308 (-47%) | 6.5 | 4.0 |
+| `t9-webp-dim` | 100% | 100% | 70s | 27s | -43s (-62%) | 20,458 | 20,807 | +349 (+2%) | 7.0 | 4.0 |
+| `t10-zip-list` | 100% | 100% | 29s | 25s | -4s (-13%) | 17,948 | 26,281 | +8,333 (+46%) | 7.0 | 4.0 |
+| `t11-tar-list` | 100% | 100% | 40s | 25s | -15s (-38%) | 16,798 | 27,807 | +11,009 (+66%) | 7.0 | 4.0 |
+| `t12-wav-info` | 100% | 67% | 49s | 24s | -25s (-51%) | 19,681 | 27,826 | +8,145 (+41%) | 6.0 | 4.0 |
+| `t13-gzip-dec` | 100% | 100% | 37s | 36s | -1s (-3%) | 17,885 | 24,229 | +6,344 (+35%) | 7.0 | 4.0 |
+| `t14-csv-quoted` | 100% | 100% | 39s | 22s | -17s (-43%) | 16,516 | 25,200 | +8,684 (+53%) | 7.0 | 3.0 |
+| `t15-jsonl-kinds` | 100% | 100% | 44s | 27s | -16s (-38%) | 19,519 | 22,883 | +3,364 (+17%) | 7.0 | 3.0 |
+| `t16-ini-key` | 100% | 100% | 36s | 22s | -14s (-39%) | 18,618 | 26,166 | +7,548 (+41%) | 6.0 | 3.0 |
+| `t17-log-errors` | 100% | 100% | 46s | 27s | -19s (-41%) | 15,727 | 27,010 | +11,283 (+72%) | 6.0 | 3.0 |
+| `t2-imports` | 100% | 67% | 64s | 50s | -14s (-21%) | 25,637 | 32,932 | +7,295 (+28%) | 8.0 | 5.0 |
+| `t3-todos` | 100% | 100% | 54s | 55s | +1s (+3%) | 20,992 | 32,489 | +11,497 (+55%) | 8.0 | 5.0 |
+| `t5-base64` | 100% | 100% | 37s | 19s | -18s (-48%) | 12,771 | 17,436 | +4,665 (+37%) | 6.0 | 2.0 |
+
+## Notes
+
+- `billable_tokens_proxy` = `input_tokens + cache_creation_input_tokens + output_tokens` (cache reads are very cheap and not directly billed at the same rate).
+- A trial that timed out is recorded with `outcome_match=False`, `duration_s=300`, all token fields = 0. These pull a batch's accuracy and token figures down (and push its duration up) if they happen.
+- Only `claude_md_strong` was run in this experiment for clean comparison (no condition mixing).
+
diff --git a/explorations/agent-wiki/experiments/twobatch-fiveway-comparison.md b/explorations/agent-wiki/experiments/twobatch-fiveway-comparison.md
new file mode 100644
index 00000000..2e6afba6
--- /dev/null
+++ b/explorations/agent-wiki/experiments/twobatch-fiveway-comparison.md
@@ -0,0 +1,83 @@
+# Five-way wiki-helps comparison: empty / guidelines / skills / both / pruned
+
+Same 16-task corpus, five arms, all `claude_md_strong` condition. Empty + guidelines arms are twobatch's batch-1 / batch-2. Skills arm is twobatch-skills (3 skills, no guidelines). Both arm is twobatch-both (those same 3 skills + ~15 atomics, no clusters). **Pruned arm** is twobatch-pruned: same 3 skills + only the no-skill-coverage atomics (delete-on-promote policy applied — image-format and CSV atomics archived because their corresponding skills were synthesized).
+
+## Aggregate
+
+| Metric | Empty | Guidelines | Skills | Both | Pruned | P vs G | P vs S | P vs B |
+|---|---:|---:|---:|---:|---:|---:|---:|---:|
+| Trials | 47 | 48 | 48 | 48 | 48 | +0 | +0 | +0 |
+| Accuracy (mean) | 96% | 96% | 98% | 98% | 98% | +2% | +0% | +0% |
+| Median duration | 43s | 27s | 28s | 31s | 21s | -6s (-22%) | -7s (-25%) | -10s (-32%) |
+| Median input tokens | 4,517 | 4,378 | 4,376 | 4,376 | 4,378 | +0 (+0%) | +2 (+0%) | +2 (+0%) |
+| Median output tokens | 406 | 268 | 206 | 272 | 225 | -43 (-16%) | +19 (+9%) | -47 (-17%) |
+| Median total cost USD | $0.2141 | $0.1703 | $0.1463 | $0.1788 | $0.1726 | +$0.0023 (+1%) | +$0.0263 (+18%) | $-0.0062 (-3%) |
+| Median tool calls | 7.0 | 4.0 | 4.0 | 4.0 | 4.0 | +0.0 | +0.0 | +0.0 |
+| Median wiki reads | 5.0 | 3.0 | 2.0 | 2.0 | 2.0 | -1.0 | +0.0 | +0.0 |
+| Median guideline reads | 1.0 | 1.0 | 0.0 | 0.0 | 0.0 | -1.0 | — | — |
+
+## By task family
+
+Median total_cost_usd. `Δ S→P` = `pruned` minus `skills`; `Δ B→P` = `pruned` minus `both`.
+
+| Family | Tasks | E acc | G acc | S acc | B acc | P acc | E $ | G $ | S $ | B $ | P $ | Δ S→P | Δ B→P |
+|---|---|---:|---:|---:|---:|---:|---:|---:|---:|---:|---:|---:|---:|
+| lens-model | 1 | 67% | 100% | 100% | 100% | 100% | $0.3681 | $0.2460 | $0.1763 | $0.2071 | $0.2289 | +$0.0526 (+30%) | +$0.0218 (+11%) |
+| image | 4 | 91% | 100% | 100% | 100% | 100% | $0.2141 | $0.1731 | $0.1477 | $0.1803 | $0.1964 | +$0.0487 (+33%) | +$0.0160 (+9%) |
+| archive | 4 | 100% | 92% | 92% | 92% | 100% | $0.2133 | $0.1712 | $0.1343 | $0.1777 | $0.1663 | +$0.0320 (+24%) | $-0.0114 (-6%) |
+| text | 4 | 100% | 100% | 100% | 100% | 100% | $0.2097 | $0.1459 | $0.1541 | $0.1527 | $0.1487 | $-0.0054 (-3%) | $-0.0040 (-3%) |
+| skip | 3 | 100% | 89% | 100% | 100% | 89% | $0.2456 | $0.2061 | $0.1412 | $0.2031 | $0.1672 | +$0.0260 (+18%) | $-0.0359 (-18%) |
+
+## Per task — cost USD
+
+| Task | E $ | G $ | S $ | B $ | P $ | Δ S→P | Δ B→P |
+|---|---:|---:|---:|---:|---:|---:|---:|
+| `t1-lens-model` | $0.3681 | $0.2460 | $0.1763 | $0.2071 | $0.2289 | +$0.0526 (+30%) | +$0.0218 (+11%) |
+| `t6-png-dim` | $0.1970 | $0.1725 | $0.1487 | $0.1778 | $0.1975 | +$0.0489 (+33%) | +$0.0198 (+11%) |
+| `t7-gif-dim` | $0.2141 | $0.1637 | $0.1463 | $0.1736 | $0.2181 | +$0.0719 (+49%) | +$0.0445 (+26%) |
+| `t8-bmp-info` | $0.2950 | $0.1723 | $0.1508 | $0.1931 | $0.2481 | +$0.0973 (+65%) | +$0.0550 (+28%) |
+| `t9-webp-dim` | $0.2604 | $0.1772 | $0.1467 | $0.1796 | $0.1834 | +$0.0368 (+25%) | +$0.0039 (+2%) |
+| `t10-zip-list` | $0.2099 | $0.1667 | $0.1344 | $0.1501 | $0.1425 | +$0.0081 (+6%) | $-0.0075 (-5%) |
+| `t11-tar-list` | $0.2144 | $0.1731 | $0.1342 | $0.1799 | $0.1658 | +$0.0316 (+24%) | $-0.0141 (-8%) |
+| `t12-wav-info` | $0.2088 | $0.1702 | $0.1606 | $0.1822 | $0.1674 | +$0.0068 (+4%) | $-0.0149 (-8%) |
+| `t13-gzip-dec` | $0.2125 | $0.1663 | $0.1270 | $0.1725 | $0.1704 | +$0.0434 (+34%) | $-0.0021 (-1%) |
+| `t14-csv-quoted` | $0.2102 | $0.1501 | $0.1776 | $0.2235 | $0.2080 | +$0.0304 (+17%) | $-0.0156 (-7%) |
+| `t15-jsonl-kinds` | $0.2241 | $0.1469 | $0.1685 | $0.1484 | $0.1624 | $-0.0061 (-4%) | +$0.0140 (+9%) |
+| `t16-ini-key` | $0.1891 | $0.1456 | $0.1395 | $0.1534 | $0.1424 | +$0.0029 (+2%) | $-0.0110 (-7%) |
+| `t17-log-errors` | $0.1924 | $0.1453 | $0.1318 | $0.1456 | $0.1378 | +$0.0060 (+5%) | $-0.0078 (-5%) |
+| `t2-imports` | $0.2817 | $0.2436 | $0.1491 | $0.2480 | $0.1672 | +$0.0181 (+12%) | $-0.0808 (-33%) |
+| `t3-todos` | $0.2456 | $0.2305 | $0.1613 | $0.2177 | $0.1920 | +$0.0307 (+19%) | $-0.0257 (-12%) |
+| `t5-base64` | $0.2051 | $0.1266 | $0.1207 | $0.1292 | $0.0926 | $-0.0281 (-23%) | $-0.0366 (-28%) |
+
+## Per task — accuracy
+
+| Task | E acc | G acc | S acc | B acc | P acc |
+|---|:-:|:-:|:-:|:-:|:-:|
+| `t1-lens-model` | 67% | 100% | 100% | 100% | 100% |
+| `t6-png-dim` | 100% | 100% | 100% | 100% | 100% |
+| `t7-gif-dim` | 67% | 100% | 100% | 100% | 100% |
+| `t8-bmp-info` | 100% | 100% | 100% | 100% | 100% |
+| `t9-webp-dim` | 100% | 100% | 100% | 100% | 100% |
+| `t10-zip-list` | 100% | 100% | 100% | 100% | 100% |
+| `t11-tar-list` | 100% | 100% | 100% | 100% | 100% |
+| `t12-wav-info` | 100% | 67% | 67% | 67% | 100% |
+| `t13-gzip-dec` | 100% | 100% | 100% | 100% | 100% |
+| `t14-csv-quoted` | 100% | 100% | 100% | 100% | 100% |
+| `t15-jsonl-kinds` | 100% | 100% | 100% | 100% | 100% |
+| `t16-ini-key` | 100% | 100% | 100% | 100% | 100% |
+| `t17-log-errors` | 100% | 100% | 100% | 100% | 100% |
+| `t2-imports` | 100% | 67% | 100% | 100% | 67% |
+| `t3-todos` | 100% | 100% | 100% | 100% | 100% |
+| `t5-base64` | 100% | 100% | 100% | 100% | 100% |
+
+## Notes
+
+- Empty + guidelines + skills + both columns reproduce the 4-way comparison.
+- Pruned column is the new arm, testing the **delete-on-promote** policy: when `synthesize-skill` produces a skill, it inferentially archives the atomic guidelines covered by the skill (via tag-superset, slug-keyword, or format-identifier description match). Result: 3 skills + 9 atomics + 6 archived.
+- The pruned arm is the experimental answer to the open question "if 'both' loses to 'skills-only', does 'skills + only the no-skill-coverage guidelines' beat 'skills-only'?" raised in §7 of RESULTS-SUMMARY.md.
+
+### Correction — Pruned column is the re-run against a fixed index
+
+The original pruned arm (commit `8bcd713`) ran against a wiki whose `_index.jsonl` was **stale**: `render-skill` archived the covered atomics but never refreshed the indexes, so the wiki exposed **0 skills, 15 guideline rows, 6 broken links**. Agents couldn't see the skills and fell back to dangling guideline rows (original: median $0.181, 290 output tokens, 3 wiki reads, 1 guideline read).
+
+Commit `2adc67a` fixed the builder to refresh the section indexes + `_index.jsonl` after `render-skill`/`render-cluster` (with an integrity assertion). This Pruned column is the full 16-task re-run against the corrected wiki: median **$0.173**, ~225 output tokens, 2 wiki reads, **0** guideline reads. Net: pruned moved from +1% to **-3% vs both** and from +24% to **+18% vs skills**. Skills-only is still cheapest, but the apparent "pruning is worse than both" result was largely the stale-index bug, not the policy. See `pruned-index-hypothesis.md` for the slice-level diagnosis.
diff --git a/explorations/agent-wiki/experiments/twobatch-fourway-comparison.md b/explorations/agent-wiki/experiments/twobatch-fourway-comparison.md
new file mode 100644
index 00000000..66293fd2
--- /dev/null
+++ b/explorations/agent-wiki/experiments/twobatch-fourway-comparison.md
@@ -0,0 +1,78 @@
+# Four-way wiki-helps comparison: empty / guidelines / skills / both
+
+Same 16-task corpus, four arms, all `claude_md_strong` condition. Empty + guidelines arms are twobatch's batch-1 / batch-2. Skills arm is twobatch-skills (3 skills, no guidelines). Both arm is twobatch-both (those same 3 skills + ~15 atomics, no clusters).
+
+## Aggregate
+
+| Metric | Empty | Guidelines | Skills | Both | Both vs G | Both vs S |
+|---|---:|---:|---:|---:|---:|---:|
+| Trials | 47 | 48 | 48 | 48 | +0 | +0 |
+| Accuracy (mean) | 96% | 96% | 98% | 98% | +2% | +0% |
+| Median duration | 43s | 27s | 28s | 31s | +4s (+15%) | +3s (+10%) |
+| Median input tokens | 4,517 | 4,378 | 4,376 | 4,376 | -2 (-0%) | +0 (+0%) |
+| Median output tokens | 406 | 268 | 206 | 272 | +4 (+1%) | +66 (+32%) |
+| Median total cost USD | $0.2141 | $0.1703 | $0.1463 | $0.1788 | +$0.0085 (+5%) | +$0.0325 (+22%) |
+| Median tool calls | 7.0 | 4.0 | 4.0 | 4.0 | +0.0 | +0.0 |
+| Median wiki reads | 5.0 | 3.0 | 2.0 | 2.0 | -1.0 | +0.0 |
+| Median guideline reads | 1.0 | 1.0 | 0.0 | 0.0 | -1.0 | — |
+
+## By task family
+
+Median total_cost_usd. `Δ G→B` is `both` minus `guidelines`; `Δ S→B` is `both` minus `skills`.
+
+| Family | Tasks | E acc | G acc | S acc | B acc | E $ | G $ | S $ | B $ | Δ G→B | Δ S→B |
+|---|---|---:|---:|---:|---:|---:|---:|---:|---:|---:|---:|
+| lens-model | 1 | 67% | 100% | 100% | 100% | $0.3681 | $0.2460 | $0.1763 | $0.2071 | $-0.0389 (-16%) | +$0.0308 (+17%) |
+| image | 4 | 91% | 100% | 100% | 100% | $0.2141 | $0.1731 | $0.1477 | $0.1803 | +$0.0073 (+4%) | +$0.0326 (+22%) |
+| archive | 4 | 100% | 92% | 92% | 92% | $0.2133 | $0.1712 | $0.1343 | $0.1777 | +$0.0065 (+4%) | +$0.0433 (+32%) |
+| text | 4 | 100% | 100% | 100% | 100% | $0.2097 | $0.1459 | $0.1541 | $0.1527 | +$0.0069 (+5%) | $-0.0014 (-1%) |
+| skip | 3 | 100% | 89% | 100% | 100% | $0.2456 | $0.2061 | $0.1412 | $0.2031 | $-0.0030 (-1%) | +$0.0619 (+44%) |
+
+## Per task — cost USD
+
+| Task | E $ | G $ | S $ | B $ | Δ G→B | Δ S→B |
+|---|---:|---:|---:|---:|---:|---:|
+| `t1-lens-model` | $0.3681 | $0.2460 | $0.1763 | $0.2071 | $-0.0389 (-16%) | +$0.0308 (+17%) |
+| `t6-png-dim` | $0.1970 | $0.1725 | $0.1487 | $0.1778 | +$0.0053 (+3%) | +$0.0291 (+20%) |
+| `t7-gif-dim` | $0.2141 | $0.1637 | $0.1463 | $0.1736 | +$0.0099 (+6%) | +$0.0274 (+19%) |
+| `t8-bmp-info` | $0.2950 | $0.1723 | $0.1508 | $0.1931 | +$0.0208 (+12%) | +$0.0423 (+28%) |
+| `t9-webp-dim` | $0.2604 | $0.1772 | $0.1467 | $0.1796 | +$0.0024 (+1%) | +$0.0329 (+22%) |
+| `t10-zip-list` | $0.2099 | $0.1667 | $0.1344 | $0.1501 | $-0.0166 (-10%) | +$0.0156 (+12%) |
+| `t11-tar-list` | $0.2144 | $0.1731 | $0.1342 | $0.1799 | +$0.0068 (+4%) | +$0.0457 (+34%) |
+| `t12-wav-info` | $0.2088 | $0.1702 | $0.1606 | $0.1822 | +$0.0120 (+7%) | +$0.0216 (+13%) |
+| `t13-gzip-dec` | $0.2125 | $0.1663 | $0.1270 | $0.1725 | +$0.0062 (+4%) | +$0.0456 (+36%) |
+| `t14-csv-quoted` | $0.2102 | $0.1501 | $0.1776 | $0.2235 | +$0.0734 (+49%) | +$0.0460 (+26%) |
+| `t15-jsonl-kinds` | $0.2241 | $0.1469 | $0.1685 | $0.1484 | +$0.0014 (+1%) | $-0.0201 (-12%) |
+| `t16-ini-key` | $0.1891 | $0.1456 | $0.1395 | $0.1534 | +$0.0078 (+5%) | +$0.0139 (+10%) |
+| `t17-log-errors` | $0.1924 | $0.1453 | $0.1318 | $0.1456 | +$0.0003 (+0%) | +$0.0138 (+10%) |
+| `t2-imports` | $0.2817 | $0.2436 | $0.1491 | $0.2480 | +$0.0044 (+2%) | +$0.0989 (+66%) |
+| `t3-todos` | $0.2456 | $0.2305 | $0.1613 | $0.2177 | $-0.0128 (-6%) | +$0.0565 (+35%) |
+| `t5-base64` | $0.2051 | $0.1266 | $0.1207 | $0.1292 | +$0.0026 (+2%) | +$0.0085 (+7%) |
+
+## Per task — accuracy
+
+| Task | E acc | G acc | S acc | B acc |
+|---|:-:|:-:|:-:|:-:|
+| `t1-lens-model` | 67% | 100% | 100% | 100% |
+| `t6-png-dim` | 100% | 100% | 100% | 100% |
+| `t7-gif-dim` | 67% | 100% | 100% | 100% |
+| `t8-bmp-info` | 100% | 100% | 100% | 100% |
+| `t9-webp-dim` | 100% | 100% | 100% | 100% |
+| `t10-zip-list` | 100% | 100% | 100% | 100% |
+| `t11-tar-list` | 100% | 100% | 100% | 100% |
+| `t12-wav-info` | 100% | 67% | 67% | 67% |
+| `t13-gzip-dec` | 100% | 100% | 100% | 100% |
+| `t14-csv-quoted` | 100% | 100% | 100% | 100% |
+| `t15-jsonl-kinds` | 100% | 100% | 100% | 100% |
+| `t16-ini-key` | 100% | 100% | 100% | 100% |
+| `t17-log-errors` | 100% | 100% | 100% | 100% |
+| `t2-imports` | 100% | 67% | 100% | 100% |
+| `t3-todos` | 100% | 100% | 100% | 100% |
+| `t5-base64` | 100% | 100% | 100% | 100% |
+
+## Notes
+
+- Empty + guidelines columns reproduce twobatch.
+- Skills column reproduces the skills-arm experiment.
+- Both column is the new arm: same 3 skills + ~15 atomics from twobatch's batch-1 trajectories. No clusters (matching the guidelines arm's structure).
+- Trivial-recipe tasks (t11-tar, t13-gzip, t15-jsonl, t16-ini, t17-log, t2/t3, t5) have no matching skill in any arm.
diff --git a/explorations/agent-wiki/experiments/twobatch-skills-comparison.md b/explorations/agent-wiki/experiments/twobatch-skills-comparison.md
new file mode 100644
index 00000000..dcb2f5a1
--- /dev/null
+++ b/explorations/agent-wiki/experiments/twobatch-skills-comparison.md
@@ -0,0 +1,56 @@
+# Three-way wiki-helps comparison: empty vs guidelines vs skills
+
+Same 16-task corpus, three arms, all `claude_md_strong` condition. Empty + guidelines arms are the existing twobatch experiment's batch-1 / batch-2. Skills arm is the new run against `wiki-twobatch-skills/`, populated from twobatch's batch-1 trajectories via `agent-wiki-synthesize-skill`.
+
+## Aggregate (3 trials × 16 tasks per arm)
+
+| Metric | Empty | Guidelines | Skills | Skills vs guidelines |
+|---|---:|---:|---:|---:|
+| Trials | 47 | 48 | 48 | +0 |
+| Accuracy (mean) | 96% | 96% | 98% | +2% |
+| Median duration | 43s | 27s | 28s | +1s (+5%) |
+| Median input tokens | 4,517 | 4,378 | 4,376 | -2 (-0%) |
+| Median output tokens | 406 | 268 | 206 | -62 (-23%) |
+| Median total cost USD | $0.2141 | $0.1703 | $0.1463 | $-0.0240 (-14%) |
+| Median tool calls | 7.0 | 4.0 | 4.0 | +0.0 |
+| Median wiki reads | 5.0 | 3.0 | 2.0 | -1.0 |
+| Median guideline reads | 1.0 | 1.0 | 0.0 | -1.0 |
+
+## By task family
+
+Median per-trial within each family. Skills column shows Δ vs guidelines.
+
+| Family | Tasks | E acc | G acc | S acc | E dur | G dur | S dur | E tokens | G tokens | S tokens | E $ | G $ | S $ | Skills Δ$ |
+|---|---|---:|---:|---:|---:|---:|---:|---:|---:|---:|---:|---:|---:|---:|
+| lens-model | 1 task | 67% | 100% | 100% | 101s | 49s | 27s | 25,990 | 27,957 | 16,087 | $0.3681 | $0.2460 | $0.1763 | $-0.0697 (-28%) |
+| image | 4 tasks | 91% | 100% | 100% | 58s | 27s | 26s | 14,826 | 22,876 | 15,657 | $0.2141 | $0.1731 | $0.1477 | $-0.0253 (-15%) |
+| archive | 4 tasks | 100% | 92% | 92% | 39s | 25s | 29s | 17,847 | 27,012 | 17,634 | $0.2133 | $0.1712 | $0.1343 | $-0.0368 (-22%) |
+| text | 4 tasks | 100% | 100% | 100% | 40s | 23s | 28s | 17,142 | 25,683 | 18,204 | $0.2097 | $0.1459 | $0.1541 | +$0.0082 (+6%) |
+| skip | 3 tasks | 100% | 89% | 100% | 53s | 42s | 31s | 20,992 | 30,099 | 18,267 | $0.2456 | $0.2061 | $0.1412 | $-0.0649 (-32%) |
+
+## Per task
+
+| Task | E acc | G acc | S acc | E dur | G dur | S dur | E $ | G $ | S $ | Skills Δ$ vs G |
+|---|:-:|:-:|:-:|---:|---:|---:|---:|---:|---:|---:|
+| `t1-lens-model` | 67% | 100% | 100% | 101s | 49s | 27s | $0.3681 | $0.2460 | $0.1763 | $-0.0697 (-28%) |
+| `t6-png-dim` | 100% | 100% | 100% | 43s | 27s | 25s | $0.1970 | $0.1725 | $0.1487 | $-0.0238 (-14%) |
+| `t7-gif-dim` | 67% | 100% | 100% | 57s | 24s | 28s | $0.2141 | $0.1637 | $0.1463 | $-0.0174 (-11%) |
+| `t8-bmp-info` | 100% | 100% | 100% | 79s | 27s | 26s | $0.2950 | $0.1723 | $0.1508 | $-0.0215 (-12%) |
+| `t9-webp-dim` | 100% | 100% | 100% | 70s | 27s | 26s | $0.2604 | $0.1772 | $0.1467 | $-0.0306 (-17%) |
+| `t10-zip-list` | 100% | 100% | 100% | 29s | 25s | 29s | $0.2099 | $0.1667 | $0.1344 | $-0.0322 (-19%) |
+| `t11-tar-list` | 100% | 100% | 100% | 40s | 25s | 38s | $0.2144 | $0.1731 | $0.1342 | $-0.0388 (-22%) |
+| `t12-wav-info` | 100% | 67% | 67% | 49s | 24s | 24s | $0.2088 | $0.1702 | $0.1606 | $-0.0096 (-6%) |
+| `t13-gzip-dec` | 100% | 100% | 100% | 37s | 36s | 29s | $0.2125 | $0.1663 | $0.1270 | $-0.0394 (-24%) |
+| `t14-csv-quoted` | 100% | 100% | 100% | 39s | 22s | 29s | $0.2102 | $0.1501 | $0.1776 | +$0.0274 (+18%) |
+| `t15-jsonl-kinds` | 100% | 100% | 100% | 44s | 27s | 42s | $0.2241 | $0.1469 | $0.1685 | +$0.0216 (+15%) |
+| `t16-ini-key` | 100% | 100% | 100% | 36s | 22s | 26s | $0.1891 | $0.1456 | $0.1395 | $-0.0061 (-4%) |
+| `t17-log-errors` | 100% | 100% | 100% | 46s | 27s | 22s | $0.1924 | $0.1453 | $0.1318 | $-0.0135 (-9%) |
+| `t2-imports` | 100% | 67% | 100% | 64s | 50s | 32s | $0.2817 | $0.2436 | $0.1491 | $-0.0945 (-39%) |
+| `t3-todos` | 100% | 100% | 100% | 54s | 55s | 35s | $0.2456 | $0.2305 | $0.1613 | $-0.0692 (-30%) |
+| `t5-base64` | 100% | 100% | 100% | 37s | 19s | 28s | $0.2051 | $0.1266 | $0.1207 | $-0.0059 (-5%) |
+
+## Notes
+
+- Empty + guidelines columns reproduce the original twobatch comparison; skills column is new.
+- 3 skills were synthesized from twobatch's batch-1 trajectories by the `agent-wiki-synthesize-skill` skill: `extract-jpeg-exif-camera-optics`, `read-image-format-dimensions`, `count-csv-rows-with-quoted-fields`. All other tasks in this arm have **no matching skill** — the agent should fall through to whatever it'd do on an empty wiki.
+
diff --git a/explorations/agent-wiki/skills/agent-wiki-consolidate-guidelines/SKILL.md b/explorations/agent-wiki/skills/agent-wiki-consolidate-guidelines/SKILL.md
new file mode 100644
index 00000000..c7231a87
--- /dev/null
+++ b/explorations/agent-wiki/skills/agent-wiki-consolidate-guidelines/SKILL.md
@@ -0,0 +1,106 @@
+---
+name: agent-wiki-consolidate-guidelines
+description: Read all atomic guidelines in wiki-twobatch/guidelines/ and propose themed clusters that group near-duplicates. Writes cluster pages and updates _config.yaml; originals are preserved with a `superseded_by:` backref.
+---
+
+# Agent Wiki — Consolidate Guidelines
+
+## Overview
+
+Spot duplicates and recurring themes across the corpus of atomic
+guidelines. Author cluster pages that aggregate related variants and
+record the membership in `_config.yaml`. **Originals stay** — clusters
+reference them; nothing is moved or merged.
+
+This is the cross-trajectory **pattern-recognition** pass of the
+`agent-wiki` family. Run it after one or more `extract-guidelines`
+sessions when the wiki has accumulated enough atomic guidelines that a
+theme is visible.
+
+## When to run
+
+- After a batch of `extract-guidelines` runs, when you suspect duplicates.
+- When `guidelines/index.md`'s "By tag" section has 3+ entries under the
+  same tag and you want a canonical aggregator page for that theme.
+- When users complain that recall returns N near-identical hits.
+
+## Workflow
+
+### Step 1: Read the corpus
+
+```bash
+uv run python plugin-source/skills/agent-wiki/scripts/build_agent_wiki.py dump-guidelines > /tmp/guidelines.json
+```
+
+Output is a JSON array of `{id, filename, title, trigger, cluster,
+is_cluster_page, content}` for every page in `guidelines/`. The
+`is_cluster_page` flag tells you which entries are existing aggregators
+(`__cluster.md` suffix) — you will be **adding** new clusters, not
+re-deriving existing ones.
+
+Read the file:
+
+```
+Read /tmp/guidelines.json
+```
+
+### Step 2: Decide groupings
+
+For each candidate cluster:
+
+- **Theme**: a one-line statement of the shared idea ("when system EXIF tools are missing, parse JPEG bytes directly with stdlib").
+- **Members**: 2–6 atomic guideline ids that share that theme. Atomic only
+  — never include `is_cluster_page: true` entries.
+- **Tags**: 2–4 short tags that describe the theme.
+
+Rules:
+
+1. **Don't cluster unrelated guidelines just to clean up the listing.** A cluster needs a real shared rule, not a shared topic.
+2. **Don't merge content across atomic pages.** Each atomic page stays whole. The cluster's body summarizes the *theme* and links to members.
+3. **Don't propose a cluster for a single guideline.** Wait for ≥2 members.
+4. **Don't re-author an existing cluster** unless members materially changed. Skip clusters that already exist with the same membership (existing clusters are the `is_cluster_page: true` entries in the Step 1 dump).
+
+### Step 3: For each new cluster, output JSON
+
+```json
+{
+  "slug": "exif-stdlib-fallback",
+  "title": "EXIF stdlib parser fallback",
+  "description": "1-2 paragraphs framing the shared theme.",
+  "takeaway": "1 paragraph: the actionable rule the cluster captures.",
+  "members": ["04474b0794e6", "de04f5adde2e", "4746bf445108"],
+  "tags": ["exif", "stdlib", "fallback"]
+}
+```
+
+Pipe to:
+
+```bash
+echo '<json>' | uv run python plugin-source/skills/agent-wiki/scripts/build_agent_wiki.py render-cluster
+```
+
+The helper:
+
+- Updates the `clusters.<slug>` entry in `wiki-twobatch/_config.yaml`.
+- Writes `guidelines/<slug>__cluster.md` with `priority: high`, member links, snippets pulled from disk.
+
+### Step 4: Refresh indexes
+
+After writing all new cluster pages:
+
+```bash
+uv run python plugin-source/skills/agent-wiki/scripts/build_agent_wiki.py catalog
+```
+
+`catalog` propagates the cluster membership back to atomic pages: each
+member gets `cluster: <slug>__cluster.md` and `superseded_by:
+<slug>__cluster.md` in its frontmatter, and the cluster page is
+re-rendered against current member content.
+
+## Best practices
+
+1. **Write the takeaway first.** If you can't articulate one shared rule in a sentence, the cluster doesn't exist.
+2. **Be conservative.** Two false-positive clusters cost more than two un-clustered duplicates.
+3. **Preserve atomic provenance.** A reader should be able to navigate cluster → member → source trajectory in two clicks.
+4. **Don't re-cluster within an existing cluster.** Sub-themes don't justify nesting.
+5. Always tail-call `catalog` after the cluster loop.
diff --git a/explorations/agent-wiki/skills/agent-wiki-consult/SKILL.md b/explorations/agent-wiki/skills/agent-wiki-consult/SKILL.md
new file mode 100644
index 00000000..ce151879
--- /dev/null
+++ b/explorations/agent-wiki/skills/agent-wiki-consult/SKILL.md
@@ -0,0 +1,124 @@
+---
+name: agent-wiki-consult
+description: Consult an agent-wiki for guidelines relevant to the task at hand. The wiki itself documents how to retrieve from it (AGENTS.md). Use this skill once you know what task or sub-task you're about to do — not at session start.
+---
+
+# Agent Wiki — Consult
+
+## Overview
+
+This skill is a thin wrapper around the wiki's own `AGENTS.md` document. The
+wiki contains evidence-grounded guidelines distilled from agent
+trajectories; `AGENTS.md` is its agent-readable contract for navigation
+and retrieval. This skill tells you to:
+
+1. Find the wiki root.
+2. Read `<wiki-root>/AGENTS.md`.
+3. Follow the recipe described there against the user's current task.
+
+The retrieval logic lives in `AGENTS.md`, not in this skill. That separation
+is intentional: when the wiki's structure or recall heuristics change, edit
+`AGENTS.md` and not this skill.
+
+## When to invoke
+
+Call this skill **once you know the task or sub-task you're about to do**.
+Concretely:
+
+- After the user has stated their request and you have a plan for the next
+  block of work.
+- Before writing non-trivial code in a problem space the wiki may have
+  documented.
+- Mid-task when a new sub-task emerges with its own narrow scope (e.g.
+  "now I need to handle browser auth resumption").
+
+Do **not** invoke at session start (no task to filter against), and do
+not invoke for trivial tasks (typo fix, single-line edit) where the wiki's
+overhead exceeds the work.
+
+## Workflow
+
+### Step 1: Resolve the wiki root
+
+If the user passed a path argument (e.g. `wiki-twobatch`,
+`wiki-twobatch-skills`, or any other path), use it.
+
+Otherwise auto-detect: walk up from the current working directory looking
+for any sibling directory matching `wiki-*` that contains an `_config.yaml`
+file. If multiple are found, prefer the one closest to cwd. If none are
+found, ask the user which wiki to consult.
+
+### Step 2: Read AGENTS.md
+
+```
+Read <wiki-root>/AGENTS.md
+```
+
+This is the contract document. It explains the wiki's structure, the
+filename suffix convention, the `_index.jsonl` schema, and a recommended
+retrieval recipe. Read it in full — it's typically 3–5 KB.
+
+If the file does not exist, run:
+
+```bash
+uv run python plugin-source/skills/agent-wiki/scripts/build_agent_wiki.py \
+  --wiki-root <wiki-root> catalog
+```
+
+Catalog's bootstrap phase will copy the bundled template into place.
+
+### Step 3: Read the retrieval index
+
+```
+Read <wiki-root>/_index.jsonl
+```
+
+One row per guideline / cluster / task / subtask. Schema documented in
+`AGENTS.md`. Rows are sorted clusters-first.
+
+### Step 4: Apply the recipe from AGENTS.md
+
+Per the recipe in `AGENTS.md` (which is advisory, not mandatory):
+
+1. Identify topical tags + keywords from the task description.
+2. Filter `_index.jsonl` rows by tag overlap or `trigger:` substring match.
+3. Prefer cluster rows when a cluster and its members both match.
+4. Read the top 2–5 matching pages in full.
+5. State which guidelines apply (briefly) before acting on them.
+
+Use your judgment for scoring. The wiki does not prescribe a fixed
+algorithm; trust the heuristics in AGENTS.md and the row content.
+
+### Step 5: Surface the matches
+
+Report 2–5 candidate matches to the user (or to your own next-step
+reasoning) with:
+
+- Title
+- One-line summary
+- Relative path inside the wiki
+- Tags
+- Why this match scores high (one phrase)
+
+## Args
+
+This skill accepts:
+
+- An optional path to the wiki root. Examples: `wiki-twobatch`,
+  `wiki-twobatch-skills`, or an absolute path.
+- An optional task description. If omitted, infer from the conversation
+  context.
+
+## Best practices
+
+1. **Don't pre-load.** This skill is on-demand by design. Calling it at
+   session start without a specific task wastes tokens and produces noise.
+2. **Read AGENTS.md every time.** Wikis evolve; the contract may have
+   changed. Caching the contract across invocations is fragile.
+3. **Read clusters before atomics.** Cluster pages reference their members
+   — you usually don't need to read the members directly.
+4. **Cite when you act on a guideline.** Mention the guideline's title +
+   link in your response so the user can audit the recommendation.
+5. **Don't follow guidelines blindly.** If a guideline's `trigger:`
+   doesn't quite match your situation, say so — note the close match and
+   choose your own course.
diff --git a/explorations/agent-wiki/skills/agent-wiki-extract-guidelines/SKILL.md b/explorations/agent-wiki/skills/agent-wiki-extract-guidelines/SKILL.md
new file mode 100644
index 00000000..47c61be5
--- /dev/null
+++ b/explorations/agent-wiki/skills/agent-wiki-extract-guidelines/SKILL.md
@@ -0,0 +1,178 @@
+---
+name: agent-wiki-extract-guidelines
+description: Read a normalized Claude Code trajectory JSON and extract reusable guidelines into wiki-twobatch/guidelines/. Use when mining saved trajectories for reusable lessons.
+---
+
+# Agent Wiki — Extract Guidelines
+
+## Overview
+
+Distill lessons from one session at a time. For each normalized trajectory
+JSON, identify reusable guidelines: reframe failures as proactive
+recommendations, capture concrete artifacts (scripts, command sequences)
+that solved real problems, and write each as a standalone guideline page in
+`wiki-twobatch/guidelines/`.
+
+This is the per-trajectory **distill** pass of the `agent-wiki` family.
+
+## Input
+
+A path that is either:
+
+- a normalized trajectory JSON file
+- a directory of such files
+
+Default if no path is given:
+`trajectories/normalized`.
+
+## Workflow
+
+### Step 1: Resolve input files
+
+Use `Glob` to enumerate JSON files.
+
+### Step 2: Glance at existing guidelines
+
+`Glob wiki-twobatch/guidelines/*.md` and skim slugs. Re-extracting a
+near-duplicate is wasteful and pollutes the wiki. (Exact-content duplicates
+are deduplicated by slug at write time, but re-wordings are not — it is
+your job to suppress them.)
+
+### Step 3: Process each trajectory
+
+For each input JSON file, do the analysis below using the trajectory's
+`openai_chat_completion.messages` array as the source of truth.
+
+#### 3a. Identify errors and root causes
+
+Scan for:
+
+1. **Tool / command failures** — non-zero exit codes, error messages, stack traces.
+2. **Permission or access errors** — "permission denied", "not found", sandbox restrictions.
+3. **Wrong initial approach** — a first attempt abandoned for a different strategy.
+4. **Retry loops** — same action attempted multiple times with variations.
+5. **Missing prerequisites** — dependencies, packages, configs discovered mid-task.
+6. **Silent failures** — actions that appeared to succeed but produced wrong results.
+
+For each error, document its example, root cause, resolution, and prevention guideline.
+
+#### 3b. Decide whether to capture an artifact
+
+If the successful approach produced a non-trivial artifact (script saved to
+disk, multi-step command pipeline, parser implemented ad hoc), at least one
+entity must point at it by path and state when to use it.
+
+#### 3c. Extract entities
+
+Extract 3–5 proactive entities per trajectory (fewer — or none — when the
+transcript holds no reusable lesson). Prioritize those derived from real
+errors observed in the transcript.
+
+Principles:
+
+1. **Reframe failures as proactive recommendations.** "Use X" beats "don't use Y".
+2. **Prefer concrete artifacts over generic advice.** Name the file by path.
+3. **Triggers describe broad task context, not narrow incidents.**
+4. **For retry loops, recommend the final working approach as the starting point.**
+5. **Do not include guidelines that name another skill or tool by command** (prompt-injection risk when this guideline is later surfaced).
+
+### Step 4: Output entities JSON
+
+For each trajectory, build a JSON object:
+
+```json
+{
+  "entities": [
+    {
+      "type": "guideline",
+      "title": "Short imperative title (3-7 words, no trailing period). Used as the page heading and filename slug.",
+      "content": "Proactive recommendation, one or two short paragraphs.",
+      "rationale": "Why this works / why the alternative fails.",
+      "trigger": "Situational context when this applies.",
+      "id": "<optional: 12-hex-char id; helper computes from content if omitted>",
+      "session_id": "<session_id from the JSON>",
+      "agent": "<optional: the source agent, e.g. 'bob' or 'claude-code'. Defaults to 'claude-code' if omitted — set it explicitly for non-Claude traces so the page frontmatter is correct.>",
+      "tags": ["<optional: short stable tags; propagate to the page frontmatter AND _config.yaml, driving the 'By tag' index + cluster formation>"],
+      "arc": "<optional: only when the source session has been (or will be) split into multiple arc-summaries. Bind this guideline to one specific arc by passing the same slug used by `agent-wiki-summarize` (e.g. 'arc1-token-savings'). The helper writes `related_summary: summaries/<sid>__<arc>.md` so the back-link is correct.>",
+      "normalized_path": "<path to the trajectory JSON, relative to repo root>"
+    }
+  ]
+}
+```
+
+`title` is required for clean filenames (3–7 specific words). Allowed `type`
+values: `guideline`, `workflow`, `script`, `command-template`. Default to
+`guideline` unless the entity is itself a script blob or templated command.
+
+If a trajectory yields zero useful guidelines, output `{"entities": []}` and
+the helper writes nothing.
+
+### When to bind a guideline to a specific arc
+
+A long session that's split into multiple arc-summaries (`agent-wiki-summarize`
+with a `slug`) usually has guidelines that belong cleanly to one arc and not
+the others. Examples from a multi-arc session:
+
+- A guideline about "split runner from results across PRs" came from the
+  token-savings arc → `arc: "arc1-token-savings"`.
+- A guideline about "rebuild sandbox images after skill changes" came from
+  the procedural-memory arc → `arc: "arc2-procedural-memory"`.
+
+Set `arc` per entity. If you don't, the helper writes
+`related_summary: summaries/<sid>.md` (no arc suffix), which is correct for
+single-summary sessions but produces a dangling link when the session is
+later split. The `catalog` pass auto-repairs dangling links by picking the
+lexicographically first arc and printing a warning to stderr, but the right
+time to bind is at extraction.
+
+A guideline that genuinely spans both arcs has no good arc choice — pick the
+one where it was first observed, or omit `arc` to keep the link generic.
+
+### Step 5: Pipe to the helper
+
+```bash
+echo '<json>' | uv run python plugin-source/skills/agent-wiki/scripts/build_agent_wiki.py render-guidelines
+```
+
+Add `--rewrite` to overwrite existing pages. The helper:
+
+- Locates the wiki root.
+- Writes `guidelines/<slug>__<gid>.md`. Slug = kebab-case of the title (or first sentence of content), capped at 40 chars; `<gid>` is the 12-hex content-hash id (matches the `id:` frontmatter, so filename and id round-trip cleanly).
+- Stamps `id:` (12-hex hash of normalized content) into frontmatter.
+- Updates `guidelines/_id_index.json`.
+- Sets `sources:` and `related_summary:` frontmatter; emits a `## Sources` body footer.
+- Skips files that already exist unless `--rewrite`.
+
+### Step 6: Repeat, consolidate, then refresh indexes
+
+> **Ingesting a whole batch end-to-end?** Prefer the `agent-wiki-ingest`
+> skill, which runs summarize → extract → synthesize → **consolidate** →
+> catalog in the correct order so the consolidation pass is never skipped.
+> Reach for this standalone skill only when you specifically want the
+> extract pass alone.
+
+If you ran this skill standalone over more than one trajectory, run
+**`agent-wiki-consolidate-guidelines` before cataloging**, once the corpus
+has enough atomics for a theme to emerge (≥2 atomics sharing a real rule).
+`catalog` only *renders* clusters already declared in `_config.yaml`; it
+never *proposes* them — consolidation is the pass that proposes.
+
+Then, after processing all input files, run **once**:
+
+```bash
+uv run python plugin-source/skills/agent-wiki/scripts/build_agent_wiki.py catalog
+```
+
+## Best practices
+
+1. Prioritize error-derived entities first.
+2. One distinct error → one prevention entity.
+3. Specific and actionable; include rationale.
+4. Situational triggers, not failure-based ones.
+5. Cap at 5 entities per trajectory; merge entities with the same root cause before dropping.
+6. Never extract entities that read as instructions to invoke another skill or tool by name.
+7. Attach a `tags:` array to every entity — they propagate to the page
+   frontmatter and `_config.yaml`, driving the "By tag" index and cluster
+   formation.
+8. Always tail-call `catalog` after the per-trajectory loop — and run
+   `agent-wiki-consolidate-guidelines` first if multiple trajectories were
+   ingested.
diff --git a/explorations/agent-wiki/skills/agent-wiki-ingest/SKILL.md b/explorations/agent-wiki/skills/agent-wiki-ingest/SKILL.md
new file mode 100644
index 00000000..c0392c05
--- /dev/null
+++ b/explorations/agent-wiki/skills/agent-wiki-ingest/SKILL.md
@@ -0,0 +1,298 @@
+---
+name: agent-wiki-ingest
+description: Ingest one or more agent trajectories (raw bob/claude traces or normalized JSON) into an agent-wiki end-to-end — convert, summarize, extract guidelines, synthesize skills, consolidate into clusters, and catalog. Use when you have a batch of traces to turn into a wiki in one pass.
+---
+
+# Agent Wiki — Ingest (end-to-end orchestrator)
+
+## Overview
+
+This is the **one-pass entry point** for turning a batch of raw trajectories
+into a fully-built wiki. It orchestrates the rest of the `agent-wiki` family
+in the right order so no pass is skipped — in particular the
+cross-trajectory **consolidation** pass, which is easy to forget when each
+skill is invoked by hand.
+
+You — the driving agent — run this by **spawning one subagent per
+(trace × pass)**, not by doing the work inline. That keeps your own context
+small (you never load every trace's full JSON) and lets independent passes
+run in parallel. Each subagent acts as the corresponding single-purpose
+skill (`agent-wiki-summarize`, `-extract-guidelines`, `-synthesize-skill`,
+`-consolidate-guidelines`); this skill only sequences them and passes the
+per-trace adapter notes.
+
+The pipeline:
+
+```
+0.  Convert    raw bob / claude traces → normalized analysis JSON   (skip if already normalized)
+1.  Bootstrap  create wiki scaffold + seed catalog                  (skip if wiki exists)
+1.5 Skip       drop traces whose summaries/<sid>.md already exists   [pre-flight — idempotency]
+2.  Summarize  1 subagent / new-trace → summaries/<sid>.md          [PARALLEL]
+3.  Extract    1 subagent / new-trace → guidelines/*.md (+tags)     [SEQUENTIAL]
+4.  Synthesize 1 subagent / new-trace → skills/<slug>/ --archive-covered  [SEQUENTIAL]
+5.  Consolidate 1 subagent over the whole corpus → cluster pages    [SINGLE — MANDATORY]
+6.  Catalog    final bookkeeping → indexes, used-by, priority       [you run this directly]
+```
+
+**Idempotent by default.** Re-running on the same source dir reprocesses
+nothing: Step 1.5 filters out every trace that already has a summary page,
+so Steps 2–4 only touch genuinely new traces. The consolidate + catalog tail
+always runs (it's cheap and self-idempotent). To force a redo of an already-
+ingested trace, keep it in the list and pass `--rewrite` to its `render-*`
+calls.
+
+**Why this order.** `synthesize-skill` runs *before* `consolidate-guidelines`
+so skills claim recipe-level territory first (and archive the atomics they
+cover via `--archive-covered`); consolidation then clusters only the
+*surviving* atomics. This matches the consolidate skill's own rule — "don't
+propose clusters that overlap a skill's territory."
+
+**Why parallel vs sequential.** Summarize writes one independent file per
+trace (`summaries/<sid>.md`) → safe to parallelize. Extract and synthesize
+both mutate shared state (`guidelines/_id_index.json`, `skills/_id_index.json`,
+`_config.yaml`, and the `_archived/` moves) → run them **one trace at a
+time** to avoid lost-update races.
+
+## Input
+
+One of:
+
+- a list of trace file paths
+- a directory of traces (the skill globs it)
+- already-normalized analysis JSON files
+
+…plus a target `--wiki-root` (e.g. `wiki-twobatch-skills`).
+
+### Detecting trace shape (Step 0 dispatch)
+
+Read the top-level JSON keys of each input to classify it:
+
+| Shape | Signature | Conversion |
+|---|---|---|
+| **bob session JSON** | top-level `sessionId` + `messages` | `bob-trace-converter` |
+| **claude stream-json** | JSONL lines with `{"type":"system"/"assistant"/"result"}` | `normalize_stream_json_transcripts.py` |
+| **normalized analysis JSON** | top-level `model` + `messages` + `metadata.id` | pass through (no conversion) |
+
+## Step 0 — Convert
+
+Write converted output under a stable corpus dir:
+`trajectories/normalized/<label>/items/`.
+
+**bob session JSON:**
+```bash
+NODE_OPTIONS='' node ~/.claude/skills/bob-trace-converter/scripts/convert_bob_trace.mjs \
+  <trace.json> --out-dir trajectories/normalized/<label>/items --format both
+```
+> The `NODE_OPTIONS=''` prefix is required — some shells inject a `--require`
+> preload that breaks a bare `node` invocation. Strip it for this call.
+
+The converter writes three files per trace; the ingest pipeline consumes the
+`*-openai-chat-completions.analysis.json` one.
+
+**claude stream-json:**
+```bash
+uv run python scripts/normalize_stream_json_transcripts.py \
+  --in <transcripts-dir> --out trajectories/normalized \
+  --label <label> --user-prompt "<the task prompt>"
+```
+
+**Already normalized:** skip — use the path as-is.
+
+Collect the resulting list of analysis-JSON paths; this is the trace set the
+rest of the pipeline iterates.
+
+## Step 1 — Bootstrap the wiki
+
+If `<wiki-root>/_index.jsonl` does **not** exist:
+
+```bash
+mkdir -p <wiki-root>/{summaries,guidelines,tasks,skills}
+uv run python plugin-source/skills/agent-wiki/scripts/build_agent_wiki.py \
+  --wiki-root <wiki-root> catalog
+```
+
+The first `catalog` seeds `AGENTS.md` and `_config.yaml` from the bundled
+defaults and writes empty indexes. Skip this whole step if the wiki already
+exists — you're appending to it.
+
+### Piping JSON to the helper — avoid `echo`
+
+Every `render-*` subcommand reads JSON on stdin. The `echo '<json>' | …`
+form in the per-pass skills **breaks when the payload has multi-line
+`content`/`narrative` fields** (literal newlines become invalid control
+characters in the shell-quoted string). Tell every subagent to write its
+payload to a temp file and `cat` it instead:
+
+```bash
+cat /tmp/ingest-payload.json | uv run python plugin-source/skills/agent-wiki/scripts/build_agent_wiki.py --wiki-root <wiki-root> render-guidelines
+```
+
+## Step 1.5 — Skip already-processed traces (pre-flight)
+
+This is what makes re-running the skill on the same source dir cheap. The
+helper's `render-*` subcommands skip-if-exists, but only *after* a subagent
+has already read the trace and synthesized its output — so the LLM cost is
+already spent. Filter **before** spawning any subagent.
+
+For each normalized trace, read its `session_id` — it lives at
+`metadata.id` (bob-converted analysis JSON) **or** top-level `session_id`
+(claude-normalized). If `<wiki-root>/summaries/<sid>.md` already exists, the
+trace was ingested on a prior run → drop it from the work-list. The
+surviving **new-trace list** is what Steps 2–4 iterate.
+
+Compute the new-trace list and log what was skipped (never let a silent
+no-op masquerade as success):
+
+```bash
+for f in <trace-glob>; do
+  sid=$(uv run python -c "import json,sys; d=json.load(open(sys.argv[1])); print(d.get('session_id') or d.get('metadata',{}).get('id',''))" "$f")
+  if [ -n "$sid" ] && [ -f "<wiki-root>/summaries/$sid.md" ]; then
+    echo "skip (already ingested): $sid  $f"
+  else
+    echo "NEW: $sid  $f"
+  fi
+done
+```
+
+The `NEW:` lines are the work-list for Steps 2/3/4. If every trace is
+skipped, that's fine — jump straight to Steps 5–6 (the tail always runs).
+
+**Override.** To force reprocessing of an already-ingested trace, keep it in
+the work-list and pass `--rewrite` to its `render-*` calls (the helper
+overwrites instead of skipping).
+
+## Step 2 — Summarize (parallel subagents)
+
+Spawn **one subagent per new-trace** (from Step 1.5's work-list), **all in
+parallel**. Each acts as
+`agent-wiki-summarize` (point it at that skill's SKILL.md). In each subagent
+prompt include:
+
+- the analysis-JSON path and the `--wiki-root`
+- the trace's **agent** (`bob`, `claude-code`, …) — it must set `agent:`
+  accordingly, not hardcode `claude-code`
+- the bob field-mapping adapter notes (only if the trace came from bob):
+  `session_id` ← `metadata.id`; `model` ← top-level `model`; tool calls live
+  in `messages[i].content[j]` blocks with `type: "tool_use"`;
+  `transcript_path` ← `metadata.source_file`; `recalled_guidelines` is empty
+  for a freshly-built wiki
+- **do NOT run `catalog`** — the orchestrator runs it once at the end
+
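+Each subagent writes its summary JSON to a per-session temp file (the subagents run in parallel, so the path must not be shared; avoid `echo`, which breaks on multi-line `narrative` fields) and pipes it to:
+```bash
+cat /tmp/ingest-summary-<sid>.json | uv run python plugin-source/skills/agent-wiki/scripts/build_agent_wiki.py --wiki-root <wiki-root> render-summary
+```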
+
+## Step 3 — Extract guidelines (sequential subagents)
+
+Spawn **one subagent per new-trace, one at a time** (wait for each before
+starting the next — they share `guidelines/_id_index.json` and
+`_config.yaml`). Each acts as `agent-wiki-extract-guidelines`. In each
+prompt:
+
+- the analysis-JSON path, `--wiki-root`, `agent`, and bob adapter notes
+- the list of **existing guideline slugs** (from prior traces this run) so it
+  suppresses near-duplicates
+- instruct it to attach a `tags:` array to every entity (`render-guidelines`
+  propagates them to both the `.md` frontmatter and `_config.yaml`, where
+  they drive the "By tag" index and cluster formation)
+- instruct it to set `"agent": "<source>"` on every entity. The
+  extract-guidelines entity schema does **not** list `agent` as a field, so
+  the subagent must add it explicitly; otherwise the page defaults to
+  `agent: claude-code` even for bob traces.
+- skip `arc:` for single-summary sessions
+- **do NOT run `catalog`**
+
+Pipe via a temp file (not `echo` — multi-line `content` fields break the shell-quoted form):
+```bash
+cat /tmp/ingest-payload.json | uv run python plugin-source/skills/agent-wiki/scripts/build_agent_wiki.py --wiki-root <wiki-root> render-guidelines
+```
+
+## Step 4 — Synthesize skills (sequential subagents)
+
+Spawn **one subagent per new-trace, one at a time** (shared `skills/_id_index.json`
+plus `_archived/` moves). Each acts as `agent-wiki-synthesize-skill`. In each
+prompt:
+
+- the analysis-JSON path, `--wiki-root`, `agent`, bob adapter notes
+- the list of **existing skill slugs** so it doesn't re-author one
+- tell it to **decide promote-vs-skip** per that skill's "When To Use" rubric
+  (trivial single-command recipes → skip and emit nothing)
+- when promoting, pipe with `--archive-covered` so the atomics the skill
+  subsumes are soft-archived:
+  ```bash
+  cat /tmp/skill-payload.json | uv run python plugin-source/skills/agent-wiki/scripts/build_agent_wiki.py --wiki-root <wiki-root> render-skill --archive-covered
+  ```
+  `--archive-covered` is safe to run blind: the matcher only archives an
+  atomic from *another* trajectory when the skill's tags are a true superset
+  (≥2 non-generic shared tags). The weak lexical heuristics (a slug word or
+  format token appearing in the atomic's title) fire only for atomics from
+  the **same trajectory** this skill was synthesized from, so a skill can no
+  longer reach across into an unrelated trace's atomic on a coincidental
+  word like "python" or "csv".
+- **do NOT run `catalog`**
+
+## Step 5 — Consolidate (single subagent — MANDATORY)
+
+**This pass is not optional.** It is the step most easily forgotten when the
+family is invoked by hand, and it is the whole reason this orchestrator
+exists. Always run it, even on a small corpus — the subagent's own judgment
+returns zero clusters when nothing qualifies, which is the correct outcome
+for a tiny or heterogeneous corpus.
+
+**Run it even when Step 1.5 skipped every trace.** A re-run that ingests no
+new traces still benefits from a consolidation pass over the existing
+corpus — it can form clusters that an earlier run missed. Steps 5 and 6 are
+the always-on tail; only Steps 2–4 are gated on the new-trace list.
+
+Spawn **one** subagent acting as `agent-wiki-consolidate-guidelines` over the
+whole surviving-atomic corpus. In its prompt:
+
+- the `--wiki-root`
+- instruct it to run `dump-guidelines` first, then propose clusters
+- remind it: a cluster needs ≥2 atomic members sharing a real **rule** (not
+  just a topic); don't propose clusters overlapping a skill's territory (the
+  skill is already the canonical aggregator)
+- **do NOT run `catalog`** — the orchestrator runs it next
+
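+Each cluster is written to a temp file (not passed through `echo`, which breaks on multi-line fields) and piped via:
+```bash
+cat /tmp/ingest-cluster.json | uv run python plugin-source/skills/agent-wiki/scripts/build_agent_wiki.py --wiki-root <wiki-root> render-cluster
+```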
+
+## Step 6 — Catalog (you run this directly)
+
+One final bookkeeping pass:
+```bash
+uv run python plugin-source/skills/agent-wiki/scripts/build_agent_wiki.py --wiki-root <wiki-root> catalog
+```
+
+This regenerates `_index.jsonl`, the section indexes, the priority table,
+the "By tag" and used-by sections, and propagates `cluster:` /
+`superseded_by:` backrefs onto clustered atomics.
+
+## Report
+
+After Step 6, report the end-state counts: summaries, surviving atomics,
+clusters, skills, archived atomics — and call out any trace that produced no
+guidelines or no skill (trivial recipes), plus any cluster proposals that
+were considered and rejected.
+
+## Best practices
+
+1. **Consolidation is mandatory.** Step 5 always runs. The cluster subagent
+   self-skips individual clusters; the *pass* never skips.
+2. **One subagent per (trace × pass).** Don't batch multiple traces into one
+   subagent — it bloats context and muddies provenance.
+3. **Parallel only for summarize.** Extract, synthesize, and consolidate all
+   touch shared index/config state — keep them sequential.
+4. **Subagents never `catalog`.** Only the orchestrator does, once, at the
+   end. A mid-run catalog wastes work and can race with in-flight writes.
+5. **Pass `agent:` through.** Bob traces are `bob`, not `claude-code`. The
+   summarize and extract subagents must stamp the right source.
+6. **Tags on every guideline.** They drive the "By tag" index and future
+   cluster formation; an untagged atomic is invisible to tag-based recall.
+7. **Idempotent by default.** Step 1.5 skips any trace that already has a
+   `summaries/<sid>.md`, so re-running on the same source dir reprocesses
+   nothing. Use `--rewrite` on the `render-*` calls to force a redo of a
+   specific trace.
diff --git a/explorations/agent-wiki/skills/agent-wiki-summarize/SKILL.md b/explorations/agent-wiki/skills/agent-wiki-summarize/SKILL.md
new file mode 100644
index 00000000..08faef9a
--- /dev/null
+++ b/explorations/agent-wiki/skills/agent-wiki-summarize/SKILL.md
@@ -0,0 +1,185 @@
+---
+name: agent-wiki-summarize
+description: Read a normalized Claude Code trajectory JSON and write an episodic summary page to wiki-twobatch/summaries/. Use when summarizing one or more saved trajectories into the agent wiki.
+---
+
+# Agent Wiki — Summarize Trajectory
+
+## Overview
+
+Witness one session at a time. For each normalized trajectory JSON, author a
+1–3 paragraph narrative + key turns + (when present) a classification of
+each recalled guideline as `followed | ignored | harmful | contradicted`,
+with an evidence quote wherever the status requires one.
+
+This is the per-trajectory **witness** pass of the `agent-wiki` family.
+It writes one page per session and tail-calls the bookkeeping `catalog`
+subcommand so indexes stay fresh.
+
+## Input
+
+A path that is either:
+
+- a normalized trajectory JSON file
+- a directory of such files (recurse one level into `<label>/items/`)
+
+Default if no path is given:
+`trajectories/normalized`.
+
+## Workflow
+
+### Step 1: Resolve input files
+
+Use `Glob` to enumerate `*.json`. Accept either a single file, a flat dir
+of files, or a `normalized/` root with `<label>/items/` subdirs.
+
+### Step 2: Glance at existing summaries
+
+`Glob wiki-twobatch/summaries/*.md` so you can skip-if-exists per session
+without re-doing LLM work. Skip is the default; pass `--rewrite` (forwarded
+to the helper below) to overwrite.
+
+### Step 3: For each trajectory JSON
+
+Read the file. The fields you need:
+
+- `session_id`, `agent`, `model`, `started_at`/`ended_at`/`duration_seconds`
+- `stats.top_tools` (for `tools_used`)
+- `source.transcript_path`
+- `openai_chat_completion.messages`
+- `recalled_guidelines` (top-level; may be empty/missing)
+
+If `wiki-twobatch/summaries/<session_id>.md` already exists and the
+user did not request `--rewrite`, skip to the next file.
+
+Otherwise synthesize a summary as a JSON object:
+
+```json
+{
+  "session_id":      "<from JSON>",
+  "slug":            "<optional; for splitting a long session into multiple arc-summaries (e.g. 'arc1-token-savings'). When present, filename becomes <sid>__<slug>.md and frontmatter gains `arc:` plus a `sibling_summaries:` list of co-summaries from the same session.>",
+  "agent":           "<from JSON, default 'claude-code'>",
+  "model":           "<from JSON>",
+  "goal":            "<one short sentence describing what the user asked for>",
+  "outcome":         "success | partial | failure",
+  "duration_seconds": <number from JSON>,
+  "tools_used":      ["<from stats.top_tools, name only>", "..."],
+  "narrative":       "<1-3 paragraphs: what happened, what worked, what didn't>",
+  "key_turns":       ["<one short bullet per pivotal step>", "..."],
+  "normalized_path": "<path to the JSON, relative to repo root>",
+  "transcript_path": "<from source.transcript_path>",
+  "recalled_guidelines": [
+    {
+      "id":       "<12-hex-char id of the guideline that was used in this session>",
+      "title":    "<a short label, 3-7 words>",
+      "status":   "followed | ignored | harmful | contradicted",
+      "evidence": "<verbatim quote ≤200 chars; required for followed/harmful/contradicted>"
+    }
+  ]
+}
+```
+
+Rules of thumb:
+
+- `goal` is one sentence; pull from the first user message.
+- `outcome` is your judgement.
+- `narrative` is short (≤ ~250 words). No fluff.
+- `key_turns` is 3–6 bullets at most. Each one sentence.
+- Skip `recalled_guidelines` entirely if no guidelines were available or used.
+- Quotes must be verbatim (thinking / assistant text / tool_use args / tool_result content); ≤200 chars; ellipsize with `…` if cut.
+
+### How `recalled_guidelines` is populated
+
+The `recalled_guidelines` field captures **every wiki guideline the agent
+saw in this session**. Scan the trajectory for the agent reading guideline
+files from a wiki dir — `<wiki-root>/guidelines/<slug>__<gid>.md`
+or `<wiki-root>/guidelines/<slug>__cluster.md` — either via the `Read`
+tool or via Bash `cat`/`less`/`grep`. Extract each file's id from its YAML
+frontmatter (`id: <12-hex>`) so the row links to the wiki's
+`_id_index.json`.
+
+Don't double-count: if the agent reads the same guideline file twice,
+emit one row.
+
+### Status vocabulary (4-way)
+
+You judge the status from **trajectory evidence**, not the agent's
+self-report:
+
+- **`followed`** — the agent acted on the guideline and the action
+  produced the intended result. Required `evidence`: a verbatim quote
+  showing the agent applied the rule (citation, paraphrase that triggered
+  a tool call, or a tool call whose form matches the guideline's
+  prescription).
+- **`ignored`** — the agent read the guideline file but never acted on
+  it. No `evidence` needed; default for guidelines that landed in context
+  without effect.
+- **`harmful`** — the agent acted on the guideline and it led astray:
+  wasted tool calls, wrong path, retracted decision, or surfaced a wrong
+  answer that had to be corrected. Required `evidence`: a verbatim quote
+  showing the bad outcome that followed application.
+- **`contradicted`** — the agent saw the guideline and deliberately did
+  the opposite (disagreed with the rule). Required `evidence`: a verbatim
+  quote where the agent's action contradicts the guideline's prescription.
+
+Default to `ignored` when uncertain. Don't assign `followed`, `harmful`, or
+`contradicted` without a verbatim evidence quote — those carry signal value
+only when backed by trajectory text.
+
+### Step 4: Pipe the JSON to the helper
+
+```bash
+echo '<json>' | uv run python plugin-source/skills/agent-wiki/scripts/build_agent_wiki.py render-summary
+```
+
+Add `--rewrite` to overwrite an existing page. The helper:
+
+- Locates the wiki root (existing `wiki-twobatch/` ancestor, or creates
+  one next to the nearest `.git/` ancestor).
+- Writes `summaries/<session_id>.md` with frontmatter, body, and a `## Sources` footer.
+- Resolves each `recalled_guidelines[].id` against `guidelines/_id_index.json` for backlinks.
+- Appends one `<wiki-root>/_audit.log` line per recalled guideline.
+- Skips if the page already exists unless `--rewrite`.
+
+### Step 5: Refresh indexes
+
+After processing all input files, run **once**:
+
+```bash
+uv run python plugin-source/skills/agent-wiki/scripts/build_agent_wiki.py catalog
+```
+
+This regenerates `index.md`, section indexes, `_index.jsonl`, and enriches
+summary frontmatter with `tool_calls`, `errors`, `recall_used`,
+`contributed_guidelines`, `tags`, `verified_at`. No LLM cost.
+
+## Splitting long sessions into arc-summaries
+
+If a single session has multiple distinct arcs (different sub-projects, a
+clear topic shift, separate PRs landing from one transcript), emit one
+summary JSON *per arc* and pass a `slug` on each. The slug becomes the
+arc identifier and the filename suffix:
+
+- `summaries/<session_id>.md` — single-arc default.
+- `summaries/<session_id>__arc1-token-savings.md`,
+  `summaries/<session_id>__arc2-procedural-memory.md` — split.
+
+Each arc-summary's frontmatter still carries the full `session_id`, plus
+`arc: <slug>` and a `sibling_summaries:` list pointing at the other
+arc-files for the same session. Readers can navigate the whole session via
+the sibling list. The summaries `index.md` shows split sessions in their
+own section at the top.
+
+For workstreams finer-grained than an arc (one specific cross-cutting effort
+within one arc, e.g. "split runner from results across PRs"), use the
+subtask path of the sibling skill `agent-wiki-tasks`
+(`tasks/<slug>__subtask.md`) rather than a third level of summaries.
+
+## Best practices
+
+1. One summary file per `(session_id, arc)` pair. Without `slug`, default
+   to one summary per session. Pass `--rewrite` to overwrite an existing
+   page deterministically.
+2. Don't hallucinate fields — leave them out if missing in the source JSON.
+3. Don't rewrite by default. The wiki accumulates; reruns should be additive.
+4. Always tail-call `catalog` after the per-trajectory loop.
diff --git a/explorations/agent-wiki/skills/agent-wiki-synthesize-skill/SKILL.md b/explorations/agent-wiki/skills/agent-wiki-synthesize-skill/SKILL.md
new file mode 100644
index 00000000..827c3280
--- /dev/null
+++ b/explorations/agent-wiki/skills/agent-wiki-synthesize-skill/SKILL.md
@@ -0,0 +1,267 @@
+---
+name: agent-wiki-synthesize-skill
+description: Read a normalized Claude Code trajectory JSON and produce a wiki-resident SKILL.md page that future agents can invoke. Use when a trajectory captured a non-trivial successful workflow worth promoting from a free-text guideline to an executable, callable artifact.
+---
+
+# Agent Wiki — Synthesize Skill
+
+## Overview
+
+Promote a successful workflow from a saved trajectory into an **executable
+agent skill** living inside a wiki at `<wiki>/skills/<slug>/SKILL.md`. The
+output is the procedural counterpart to `agent-wiki-extract-guidelines`'s
+declarative pages: a guideline tells a future agent *what to do*; a synthesized
+skill is a structured workflow page the future agent can read and *execute
+directly*, optionally invoking sibling scripts via Bash.
+
+This is the per-trajectory **promote-to-procedural** pass of the `agent-wiki`
+family. Run it after one or more trajectories captured the same recipe and
+you want future agents to invoke that recipe instead of re-deriving it.
+
+> **Ingesting a whole batch end-to-end?** Prefer the `agent-wiki-ingest`
+> skill, which sequences summarize → extract → synthesize → **consolidate**
+> → catalog. It runs this skill at the right point (after extraction, before
+> consolidation) and guarantees the consolidation pass that clusters the
+> surviving atomics is never skipped. Use this standalone skill only to
+> promote a single trajectory's workflow.
+
+## When To Use
+
+Use this skill when a trajectory captured:
+
+- A **non-trivial successful workflow** — multiple tool calls, with at least
+  one custom script or non-obvious sequence — that produced the answer after
+  trial-and-error. The eventual happy path is worth saving.
+- A **reusable command sequence or script** the agent wrote. Particularly
+  if the agent had to reconstruct it across multiple attempts.
+- A pattern a future agent will hit on a similar-but-not-identical task —
+  parsing a binary format, walking a structured directory, reaching a
+  specific tool fallback.
+
+Skip this skill — let `agent-wiki-extract-guidelines` cover the case with a
+guideline alone — when:
+
+- The workflow is a single trivial command (`grep -c TODO ...`).
+- The path embeds secrets, tokens, or one-off user inputs.
+- A skill with the same trigger already exists in `<wiki>/skills/`.
+- The session ended without reaching a clear successful answer.
+
+## Input
+
+A path that is either:
+
+- a normalized trajectory JSON file
+- a directory of such files
+
+Default if no path is given:
+`trajectories/normalized`.
+
+## Workflow
+
+### Step 1: Resolve input files
+
+Use `Glob` to enumerate JSON files.
+
+### Step 2: Glance at existing skills
+
+`Glob <wiki>/skills/*/SKILL.md` to see what's already there. **Don't
+re-author a skill with the same name** unless the trajectory's recipe
+materially refines or generalizes it.
+
+### Step 3: For each trajectory
+
+Read the file. The fields you need:
+
+- `session_id`, `agent`, `model`
+- `openai_chat_completion.messages` — the source of truth for what happened
+
+Walk the messages and identify:
+
+#### 3a. The successful workflow
+
+The **final, working** tool sequence — the one that produced the answer.
+Distinguish it from the trial-and-error leading up to it. Capture the
+exact tool calls, scripts, or command sequences verbatim.
+
+#### 3b. The trial-and-error context
+
+What didn't work — the dead ends. You'll use this to author a *trigger
+description* so a future agent knows when to reach for this skill **instead
+of** the failing approaches.
+
+#### 3c. Environment assumptions
+
+What was missing or had to be installed (no `exiftool`, `pip install
+Pillow` needed, etc.).
+
+If no clearly successful workflow is in the trajectory, output zero
+skills for it and continue.
+
+### Step 4: Decide a skill name and trigger
+
+The skill **name** must be:
+
+- kebab-case, action-oriented (`extract-jpeg-exif-camera-optics`,
+  `parse-png-dimensions`, `walk-zip-central-directory`)
+- specific enough that a future agent reading just the name can guess
+  what it does
+- not a duplicate of any existing skill in `<wiki>/skills/`
+
+The skill **description** (one line in frontmatter) describes the *task*,
+not the trajectory. Bad: "Solves the lens-model question from session
+07d60d9f." Good: "Read camera-optics fields (lens model, focal length,
+aperture, ISO) from JPEG EXIF using stdlib `struct` when system EXIF
+tools are unavailable."
+
+The **trigger** (frontmatter + `## When To Use`) describes the broad
+task context, not the narrow original request.
+
+### Step 5: Synthesize a JSON object
+
+```json
+{
+  "name": "<kebab-case-name>",
+  "description": "<one-line task description>",
+  "trigger": "<situational context when this applies>",
+  "session_id": "<from JSON>",
+  "normalized_path": "<path to the JSON, relative to repo root>",
+  "related_summary": "summaries/<sid>.md",
+  "agent": "<from JSON, default 'claude-code'>",
+  "tags": ["<2-4 short tags>"],
+  "overview": "<1-2 sentences: what the skill does and when>",
+  "when_to_use": [
+    "<trigger condition 1>",
+    "<trigger condition 2>"
+  ],
+  "workflow_steps": [
+    "<step 1: an instruction to the future agent>",
+    "<step 2: ...>"
+  ],
+  "scripts": [
+    {
+      "name": "<action>.py",
+      "language": "python",
+      "content": "<full script contents>"
+    }
+  ]
+}
+```
+
+Notes on each field:
+
+- **`overview`** — the SKILL.md's `## Overview` section body. Keep it
+  to 1-2 sentences. Don't retell the original session.
+- **`when_to_use`** — a bulleted list of trigger conditions. The
+  future agent matches its current task against these.
+- **`workflow_steps`** — the procedural body. Each step is an
+  instruction the agent will follow. Reference scripts as
+  `` Run `bash <wiki>/skills/<name>/scripts/<file>.sh` `` (the helper
+  resolves `<wiki>` at write time).
+- **`scripts`** — optional. If the workflow needs a non-trivial script,
+  include it here. The helper writes it to
+  `<wiki>/skills/<name>/scripts/<file>` and references it in the
+  workflow body. Keep scripts minimal — strip incidental log lines or
+  one-off args; replace literal file names with positional arguments.
+
+### Step 6: Pipe the JSON to the helper
+
+```bash
+echo '<json>' | uv run python plugin-source/skills/agent-wiki/scripts/build_agent_wiki.py render-skill
+```
+
+Add `--rewrite` to overwrite an existing skill page.
+
+The helper:
+
+- Validates the JSON: `name` must be kebab-case, `description` and
+  `workflow_steps` non-empty; sibling scripts must have `name` matching
+  `^[\w.-]+$`.
+- Writes `<wiki>/skills/<slug>/SKILL.md` with frontmatter (`name`,
+  `description`, `trigger`, `agent`, `sources`, `related_summary`,
+  `tags`, `verified_at`) and body (Overview, When To Use, Workflow,
+  Sources).
+- Writes `<wiki>/skills/<slug>/scripts/<file>` for each script, marks
+  shell scripts executable.
+- Updates `<wiki>/skills/_id_index.json` (skill slug → relpath).
+- Appends `synthesize_skill` to `<wiki>/_audit.log` with session_id +
+  slug.
+- Skips silently if the skill already exists and `--rewrite` was not
+  passed.
+
+### Step 7: Refresh indexes
+
+After processing all trajectories, run **once**:
+
+```bash
+uv run python plugin-source/skills/agent-wiki/scripts/build_agent_wiki.py catalog
+```
+
+This regenerates `<wiki>/skills/index.md`, the section indexes, and
+`_index.jsonl` (which gains a `kind: "skill"` row per skill, sorted
+between `cluster:` and `guideline:` rows). No LLM cost.
+
+## Output structure
+
+```
+<wiki>/skills/
+├── _id_index.json                     skill slug → relpath
+├── index.md                           alphabetical listing (auto-generated)
+└── <slug>/
+    ├── SKILL.md                       the synthesized skill
+    └── scripts/                       optional supporting scripts
+        └── <action>.{sh,py}
+```
+
+The SKILL.md frontmatter shape:
+
+```yaml
+---
+id: skill:<slug>
+type: skill
+name: <kebab-case-name>
+description: <one-line task description>
+trigger: <situational context>
+agent: <agent from the payload; defaults to claude-code>
+sources:
+  - <normalized_path>
+related_summary: summaries/<sid>.md
+verified_at: <date>
+tags: [<tags>]
+---
+```
+
+## Skills vs guidelines vs clusters
+
+- **Guideline** (in `<wiki>/guidelines/`): the agent reads it and *decides*
+  what to do. Free-text advice. Use when the lesson is conceptual.
+- **Cluster** (in `<wiki>/guidelines/<slug>__cluster.md`): an aggregator
+  page grouping related atomics. Recall-preferred over its members.
+- **Skill** (in `<wiki>/skills/<slug>/SKILL.md`): a structured workflow
+  page the agent reads and *executes*. Use when the lesson is a concrete,
+  reusable recipe with a well-defined input/output.
+
+At retrieval time, `_index.jsonl` lists all three kinds. Sort order is
+`cluster` → `skill` → `guideline` → `task` so callable artifacts surface
+first. The agent reads the SKILL.md, follows its Workflow section, and
+invokes any sibling scripts via Bash.
+
+## Best practices
+
+1. **One skill per workflow.** Two unrelated successful workflows in one
+   trajectory → two synthesize calls with different names.
+2. **Cite the trajectory.** The helper records `sources` +
+   `related_summary` automatically; you just need to set `session_id`
+   and `normalized_path` correctly.
+3. **Don't promote one-shots.** A skill is worth synthesizing only if
+   the trigger is plausibly recurring. Single-use trajectories should
+   stay as guidelines (or nothing at all).
+4. **Don't paraphrase failure.** The skill describes what *worked*. If
+   you're tempted to write "this skill avoids the problem where exiftool
+   isn't installed," restate as "uses Pillow / stdlib struct; works in
+   environments without system EXIF tools."
+5. **Keep scripts minimal.** Strip log lines, debug prints, validation
+   that wasn't actually exercised in the trajectory.
+6. **Generality is everything.** A skill named `extract-gps-from-jpeg`
+   will not match a lens-model query. If the trajectory only exercised
+   one EXIF field, name the skill broadly (`extract-jpeg-exif-camera-optics`)
+   so future agents recognize its applicability to siblings.
diff --git a/explorations/agent-wiki/skills/agent-wiki-tasks/SKILL.md b/explorations/agent-wiki/skills/agent-wiki-tasks/SKILL.md
new file mode 100644
index 00000000..2a45234d
--- /dev/null
+++ b/explorations/agent-wiki/skills/agent-wiki-tasks/SKILL.md
@@ -0,0 +1,226 @@
+---
+name: agent-wiki-tasks
+description: Discover task families across summaries and write per-family comparison pages with findings narrative. Updates wiki-twobatch/_config.yaml task definitions and writes tasks/<slug>__task.md.
+---
+
+# Agent Wiki — Task Comparisons
+
+## Overview
+
+Two cognitive moves in one pass:
+
+1. **Discover** — read across all summaries and identify task families
+   (groups of sessions that attempted the same thing across trials and
+   conditions).
+2. **Compare** — for each family, write a `tasks/<slug>__task.md` page with a
+   per-trial table and a findings narrative that calls out the
+   experimental signal.
+
+This is the cross-trajectory **analysis** pass of the `agent-wiki` family.
+
+## When to run
+
+- After enough summaries exist that a comparative pattern is visible
+  (typically ≥3 sessions per family).
+- When the experiment design (e.g. trial × condition matrices) explicitly
+  cries out for a comparison page.
+
+## Workflow
+
+### Step 1: Read the corpus
+
+```bash
+uv run python plugin-source/skills/agent-wiki/scripts/build_agent_wiki.py dump-summaries > /tmp/summaries.json
+```
+
+Output is a JSON array of one row per summary: `{session_id, goal, family,
+trial, condition, tool_calls, errors, recall_used, summary_filename}`.
+`family`, `trial`, `condition` come from existing classification rules —
+they may be null if no rule has matched yet.
+
+Read the file:
+
+```
+Read /tmp/summaries.json
+```
+
+### Step 2: Decide task families
+
+For each candidate task family:
+
+- **Slug**: kebab-case identifier (e.g. `extract-focal-length`).
+- **Family**: short label used to group sessions (often equals slug, but
+  can be looser e.g. `focal-length` for a slug `extract-focal-length`).
+- **Family-match rules**: how a future session gets classified. Currently
+  supported: `goal_substring: [list of substrings]`. A session matches
+  the family if its `goal` contains any substring (case-insensitive).
+- **Tags**: a few short tags.
+- **Intro**: 1–2 sentences setting up the question.
+- **Findings**: 2–5 bullets summarizing what the data shows. **This is
+  the actual product** — a comparison page without findings is just a
+  table.
+
+Rules:
+
+1. **A family needs ≥3 sessions.** Smaller groups should not get their own page.
+2. **Findings must be evidence-grounded.** Cite tool-call counts, error counts, recall-used Y/N from the dump.
+3. **Don't repeat what's in the table.** Findings should explain *why* the metrics differ, not restate them.
+4. **Use overrides** for sessions whose `goal` doesn't auto-match. The override key in `_config.yaml/session_family_overrides` is the session id.
+
+### Step 3: For each family, output JSON
+
+```json
+{
+  "slug": "extract-focal-length",
+  "title": "Extract focal length from JPEG EXIF",
+  "family": "focal-length",
+  "family_match": {
+    "goal_substring": ["focal length"]
+  },
+  "intro": "Question template: *what focal length was used to take @sample.jpg?* FocalLength (tag 0x920A) and FocalLengthIn35mmFilm (tag 0xA405) live in the Exif sub-IFD.",
+  "findings": "**Net signal:** the gap between IFD0/GPS-only scripts and the Exif sub-IFD is the dominant cost. Sessions whose recall pointed at a script that already covered the sub-IFD finished in 2-3 tool calls; sessions that had to write an inline parser took 5+.",
+  "tags": ["exif", "focal-length", "comparison"]
+}
+```
+
+Pipe to:
+
+```bash
+echo '<json>' | uv run python plugin-source/skills/agent-wiki/scripts/build_agent_wiki.py render-task
+```
+
+The helper:
+
+- Updates `_config.yaml/tasks.<slug>` entry.
+- Reads classified sessions; selects those matching `family`.
+- Writes `tasks/<slug>__task.md` with the per-trial table + findings.
+
+### Step 4: Add overrides if needed
+
+If a session that *should* be in a family didn't classify automatically,
+patch `_config.yaml`:
+
+```bash
+echo '{"session_family_overrides": {"<session-id>": {"family": "image-dims", "trial": 0, "condition": "claude_md_strong"}}}' \
+  | uv run python plugin-source/skills/agent-wiki/scripts/build_agent_wiki.py update-config
+```
+
+### Step 5: Subtask pass — mandatory before refresh
+
+Before refreshing indexes, scan the corpus for **subtask candidates**. The
+default reflex of "the dataset is uniform, no subtasks needed" is wrong
+for almost every dataset; even a 30-session benchmark of short workflows
+typically has 4-6 subtask-worthy sessions. See "## Subtasks" below for
+the heuristics + JSON contract + a worked example.
+
+The minimum viable subtask layer for a condition × trial dataset: one
+subtask per condition, anchored in the session that best demonstrates
+that condition's distinctive behavior. Don't write 5 redundant subtasks
+when 1 representative captures the pattern.
+
+### Step 6: Refresh indexes
+
+```bash
+uv run python plugin-source/skills/agent-wiki/scripts/build_agent_wiki.py catalog
+```
+
+This re-reads `_config.yaml`, re-classifies every summary, regenerates
+each `tasks/<slug>__task.md`, scans `tasks/<slug>__subtask.md` files,
+and regenerates `tasks/index.md` and the root `index.md`.
+
+## Subtasks: per-session workstream pages
+
+The `tasks/` directory holds *two* kinds of pages distinguished by filename
+suffix:
+
+- **`<slug>__task.md`** — cross-session task-comparisons (the workflow above).
+- **`<slug>__subtask.md`** — narrative slices of a *single* session.
+
+After Step 4 above (and before the Step 6 refresh), run a **second pass** to scan for subtask candidates.
+Don't skip this just because the dataset is uniform — a 30-session benchmark
+of short workflows still has 4-6 subtask-worthy sessions. The default
+"there are no subtasks worth writing" reflex is wrong for almost every
+dataset.
+
+### When to propose a subtask
+
+Treat each session in the corpus as a potential subtask candidate.
+**Promote** to a subtask page when at least one of these is true:
+
+1. **Exemplar of a condition or arc.** When the corpus has experimental
+   conditions (`no_recall` / `guidelines` / `skill`, or arc-1 / arc-2),
+   pick the session that best demonstrates *that condition's* distinctive
+   behavior — its representative-best, representative-worst, or
+   representative-failure trace — and write a subtask. Aim for one subtask
+   per condition × dataset, not one per session.
+2. **Multi-iteration debug arc.** A session where the agent retried 3+
+   times against the same goal, with each iteration teaching something
+   non-obvious (offset bugs, syntax gotchas, missing prerequisites). The
+   subtask captures the debug walkthrough as a how-to.
+3. **Recall miss / hit pattern.** A session where the recall layer
+   surfaced material that turned out to be wrong, stale, or scope-mismatched
+   — and the agent's recovery path is itself instructive.
+4. **Workstream within a long arc-split session.** When a session has been
+   split into multiple arc-summaries (`<sid>__arcN.md`), each arc usually
+   has 1-3 internal workstreams worth their own subtask page (e.g. "split
+   runner from results", "rebuild sandbox images", "walker fix for late
+   bot batches"). Document each.
+
+### When *not* to write a subtask
+
+- The session is short and atomic — its `key_turns` already captures
+  everything worth capturing.
+- The lesson is already an atomic guideline. (A subtask is a *walkthrough*;
+  a guideline is a *rule*. Same insight, different artifacts.)
+- The session is one of N redundant repetitions of the same pattern. Pick
+  the most illustrative; don't document all 5.
+
+### Output JSON
+
+```json
+{
+  "slug":              "<kebab-case-id, ideally including the source session prefix, e.g. multi-tool-dead-end-stack-66f11622>",
+  "title":             "<short title; mention the session prefix and condition for context>",
+  "parent_session_id": "<session_id>",
+  "parent_summary":    "<filename inside summaries/, e.g. abc123.md or abc123__arc1.md>",
+  "tags":              ["...", "<condition-name>", "<arc-slug>"],
+  "narrative":         "<1-2 paragraphs framing the pattern; reference numerical cost (tool calls, errors, retries) when relevant>",
+  "key_steps":         ["concrete step 1", "concrete step 2", "..."]
+}
+```
+
+Pipe to:
+
+```bash
+echo '<json>' | uv run python plugin-source/skills/agent-wiki/scripts/build_agent_wiki.py render-subtask
+```
+
+Subtask pages are *authored* (not regenerated from `_config.yaml`). The
+`catalog` pass picks them up, lists them in `tasks/index.md` under their
+parent session, and adds rows to `_index.jsonl` with `kind: "subtask"`.
+
+### Worked example: 4 conditions → 4 subtasks
+
+When the dataset has 5 trials × 4 conditions, the simplest non-trivial
+subtask layer is one subtask per condition, anchored in the session that
+best demonstrates that condition's distinctive behavior. Concrete pattern
+from `wiki-twobatch/`:
+
+| Subtask | Condition | What it captures |
+|---|:---:|---|
+| Stdlib EXIF parser walkthrough | `seed` | Canonical stdlib path that *produces* the artifact later sessions recall |
+| Multi-tool dead-end stack | `no_recall` | Worst-case 4-tool exhaustion before stdlib fallback |
+| Recalled script path is stale | `guidelines` | Recall hit but stored paths missing → multi-retry recovery |
+| Skill scope mismatch fallback | `skill` | Synthesized skill wrong for the question; inline anyway |
+
+Pick one representative session per row; don't document every session.
+
+## Best practices
+
+1. **Findings is the product.** No findings → no task page.
+2. **Three sessions minimum** before committing a task family.
+3. **Tag families consistently.** `comparison` tag belongs on every task page.
+4. **Leverage `condition` in your findings narrative** — it's the experimental variable.
+5. **Subtasks need a parent_summary.** A subtask without a parent is just a
+   short note — keep it inline in its parent summary's narrative instead.
+6. Always tail-call `catalog` after any task or subtask loop.
diff --git a/explorations/agent-wiki/skills/scripts/_default_agent_wiki_config.yaml b/explorations/agent-wiki/skills/scripts/_default_agent_wiki_config.yaml
new file mode 100644
index 00000000..06b7e081
--- /dev/null
+++ b/explorations/agent-wiki/skills/scripts/_default_agent_wiki_config.yaml
@@ -0,0 +1,42 @@
+schema_version: 1
+
+# Tags applied to atomic guideline pages, keyed by stable 12-hex content id
+# (the `id:` frontmatter on each guideline page; mirrors `_id_index.json`).
+tags:
+  guideline: {}
+  # When you author guidelines, replace the `{}` above with entries like:
+  #   04474b0794e6: [exif, stdlib, fallback, minimal-env]
+
+# Themed groupings of related atomic guidelines. Members listed here get
+# `cluster:` and `superseded_by:` frontmatter pointing at the cluster page.
+clusters: {}
+  # exif-stdlib-fallback:
+  #   title: EXIF stdlib parser fallback
+  #   description: |
+  #     When system EXIF tools and Python EXIF libraries are all unavailable,
+  #     parse the JPEG bytes directly with stdlib `struct`.
+  #   takeaway: |
+  #     If the first one or two metadata tools fail, switch to a direct
+  #     stdlib parse.
+  #   members: [04474b0794e6, de04f5adde2e, 4746bf445108, 88989680a36a]
+  #   tags: [exif, stdlib, fallback, minimal-env]
+
+# Cross-trajectory comparison pages: one per task family. The `family_match`
+# rules classify summaries; sessions named in `session_family_overrides`
+# override the rules.
+tasks: {}
+  # extract-focal-length:
+  #   title: Extract focal length from JPEG EXIF
+  #   family: focal-length
+  #   family_match:
+  #     goal_substring: [focal length]
+  #   intro: |
+  #     Question template: *what focal length was used to take @sample.jpg?*
+  #   findings: |
+  #     ...
+  #   tags: [exif, focal-length, comparison]
+
+# Optional: pin a session to a specific task family / trial / condition when
+# the family_match rules are insufficient.
+session_family_overrides: {}
+  # 00000000-0000-0000-0000-000000000000: {family: image-dims, trial: 0, condition: claude_md_strong}
diff --git a/explorations/agent-wiki/skills/scripts/_default_agents.md b/explorations/agent-wiki/skills/scripts/_default_agents.md
new file mode 100644
index 00000000..6e490915
--- /dev/null
+++ b/explorations/agent-wiki/skills/scripts/_default_agents.md
@@ -0,0 +1,178 @@
+# AGENTS.md — how an agent should read this wiki
+
+This wiki is **evidence-grounded guidelines distilled from agent
+trajectories**. Every page links back to the trajectory it came from, so any
+recommendation is auditable and revisable.
+
+You — the agent — should consult this wiki **once you know the task or
+sub-task you are about to do**. Not at session start (too vague), not as a
+last resort when stuck (too late). The right moment is after the user states
+their request and you've decided what task family it belongs to, before you
+start writing code.
+
+## When to read me
+
+Trigger conditions, any one of which should prompt a wiki check:
+
+- You're about to author non-trivial code in a problem space the wiki has
+  documented (build a CLI, parse a structured file format, automate a
+  browser flow, design a TUI, run an experiment, ship a PR through review).
+- The user mentions a topic that resembles entries in `_index.jsonl`'s
+  `tags` or `trigger` fields.
+- You're about to make an architectural choice (mode-as-subcommand vs
+  options, env-var vs flag, cluster duplicates vs leave-as-is).
+- A sub-task has been identified (you're now in the middle of a
+  multi-step plan and the next step has its own narrow scope).
+
+Don't read for trivial tasks (typo fix, single-line refactor) or topics
+clearly outside the wiki's scope (the corpus is finite — see
+`guidelines/index.md` for the topical surface).
+
+## Structure
+
+The wiki has four top-level sections, all under the wiki root:
+
+```
+<wiki-root>/
+├── AGENTS.md          ← this file
+├── index.md           ← human-friendly overview
+├── _config.yaml       ← taxonomy: tags, clusters, tasks, family overrides
+├── _index.jsonl       ← agent retrieval index (one row per page)
+├── summaries/
+│   ├── <session_id>.md                       ← single summary per session
+│   └── <session_id>__<arc-slug>.md           ← multi-arc session split
+├── guidelines/
+│   ├── <slug>__<gid>.md                      ← atomic guideline (one rule); `<gid>` matches the `id:` frontmatter
+│   ├── <slug>__cluster.md                    ← themed aggregator (recall-preferred)
+│   └── _id_index.json                        ← guideline id → relpath
+├── skills/
+│   ├── <slug>/SKILL.md                       ← callable workflow page (recall-preferred over guidelines)
+│   ├── <slug>/scripts/<file>                 ← optional supporting scripts (run via Bash)
+│   └── _id_index.json                        ← skill slug → relpath
+└── tasks/
+    ├── <slug>__task.md                       ← cross-session comparison
+    └── <slug>__subtask.md                    ← per-session workstream
+```
+
+**Filename suffixes are the navigation contract.** A page's role is decided
+by its suffix; the wiki's tooling and other agents rely on it. Don't edit
+the suffix.
+
+## The retrieval index — read this first
+
+`_index.jsonl` has one JSON object per line, one line per
+guideline/cluster/skill/task/subtask page. The schema:
+
+```json
+{
+  "kind": "guideline" | "cluster" | "skill" | "task" | "subtask",
+  "id": "<12-hex-char content hash, OR cluster:<slug>, OR skill:<slug>, OR task:<slug>, OR subtask:<slug>>",
+  "title": "<short title>",
+  "tags": ["...", "..."],
+  "trigger": "<situational context when this applies — empty for clusters and tasks>",
+  "summary": "<one-paragraph snippet, ≤240 chars>",
+  "link": "<relative path inside the wiki>",
+  "cluster": "<slug if this guideline is a cluster member, else null>",
+  "superseded_by": "<cluster page name when this atomic is part of a cluster>",
+  "priority": "<\"high\" on cluster rows>",
+  "members": ["<id>", "..."]   // on cluster rows
+}
+```
+
+Rows are sorted **clusters first, then skills, then atomic guidelines, then
+tasks**. Cluster pages are *aggregators* — when a cluster matches your
+query, it references its member atomic guidelines; you usually don't need
+to read the members directly unless you want the original wording or its
+source trajectory.
+
+**Skills** (`kind: "skill"`) live at `<wiki>/skills/<slug>/SKILL.md`.
+They're callable workflow pages: a structured Overview / When To Use /
+Workflow / (optional) supporting scripts under `<slug>/scripts/`. When a
+skill row matches your task, prefer it over a same-trigger guideline —
+the SKILL.md tells you exactly what to do (and may point at sibling
+scripts you can run via Bash). Skills are **recall-preferred over
+guidelines** because they're directly executable; an atomic guideline is
+free-text advice you have to interpret.
+
+## How to retrieve (advisory)
+
+There's no mandated scoring algorithm. A reasonable recipe:
+
+1. **Parse the user's request + your current task plan** for keywords +
+   topical tags.
+2. **Read `_index.jsonl`** end-to-end. It's small (typically 50–200 rows).
+3. **Filter** rows whose `tags` overlap your topical tags, OR whose
+   `trigger` substring-matches your task description.
+4. **Prefer cluster pages** when both a cluster and its members match —
+   the cluster gives you the consolidated rule plus links down. Each
+   member's `superseded_by:` field tells you which cluster supersedes it.
+5. **Read the top 2–5** matches (clusters + standalone atomics not
+   superseded by any matched cluster). For each, follow the `link` and
+   read the page body.
+6. **Decide** which guidelines apply to your current task. State them
+   briefly to the user before acting if helpful, especially when a
+   guideline overrides what they asked for.
+
+Your judgment is the scoring function. Scan the whole index, but open only the few pages that match.
+
+## Provenance
+
+Every page links back to its source. When you cite a guideline in your
+response or stake a non-trivial decision on one, the chain to follow is:
+
+```
+guideline.md
+  ↓ frontmatter `related_summary:`
+summaries/<session_id>[.md or __<arc>.md]
+  ↓ frontmatter `sources:` (normalized JSON path + raw transcript path)
+trajectories/<session_id>.json
+  ↓ source.transcript_path
+~/.claude/projects/.../<session_id>.jsonl
+```
+
+Cluster pages list their member atomic guidelines in their frontmatter
+`members:` list and in the body's "## Members" section. Each member has
+its own provenance — clusters don't replace member-level provenance, they
+aggregate it.
+
+## Worked example
+
+User asks: *"I'm building a CLI tool with two modes (read and write)
+plus a bunch of options. Should each mode be a subcommand or a flag?"*
+
+Procedure:
+
+1. **Task tags**: `cli`, `ux`, `architecture`, `subcommands`.
+2. **Read `_index.jsonl`**. Filter for any row tagged `cli`, `ux`, or
+   `workspace`.
+3. Top hits (hypothetical):
+   - `cluster:multi-subproject-workspace-conventions` (priority high; tags
+     include `workspace`, `cli`, `conventions`).
+   - `474bb2ba1076` "Promote a feature mode to a top-level flag, not an
+     option" (atomic; tags include `cli`, `ux`, `workspace`).
+4. **Prefer the cluster** — it consolidates several conventions including
+   the mode-as-subcommand rule. Read
+   `guidelines/multi-subproject-workspace-conventions__cluster.md`.
+5. **Decide**: this answers the user's question — promote each mode to a
+   subcommand; demote everything else to options under it.
+6. **Cite**: respond with the recommendation and (optionally) link the
+   cluster page.
+
+Total wiki content read: ~3 KB (one cluster page, plus a glance at one
+atomic). Not a session-start preload; consult on-demand once the task is
+clear.
+
+## Bootstrapping notes
+
+If `AGENTS.md` does not exist in a wiki, run
+`uv run python plugin-source/skills/agent-wiki/scripts/build_agent_wiki.py
+--wiki-root <wiki-root> catalog` — the bootstrap pass copies the template
+in. After bootstrap, this file is yours to edit; subsequent catalog runs
+do not overwrite an existing `AGENTS.md`.
+
+## Skill wrapper
+
+`agent-wiki:agent-wiki-consult` is a thin wrapper that asks the agent to
+follow this file's recipe against a given wiki root. Use the skill when
+you want a one-step "consult the wiki" entry point; read this file
+directly when you want to understand the contract.
diff --git a/explorations/agent-wiki/skills/scripts/build_agent_wiki.py b/explorations/agent-wiki/skills/scripts/build_agent_wiki.py
new file mode 100644
index 00000000..be03dafb
--- /dev/null
+++ b/explorations/agent-wiki/skills/scripts/build_agent_wiki.py
@@ -0,0 +1,2702 @@
+#!/usr/bin/env python3
+# mypy: ignore-errors
+# Exploration/reference code — not type-checked to the project standard.
+"""build_agent_wiki.py — single CLI driving the `agent-wiki` family of skills.
+
+Subcommands:
+  render-summary       stdin JSON -> summaries/<sid>.md
+  render-guidelines    stdin JSON -> guidelines/<slug>__<gid>.md (one per entity)
+  render-cluster       stdin JSON -> guidelines/<slug>__cluster.md
+  render-task          stdin JSON -> tasks/<slug>__task.md
+  update-config        stdin JSON patch -> wiki-twobatch/_config.yaml
+  dump-guidelines      stdout: corpus of atomic guidelines as JSON
+  dump-summaries       stdout: corpus of summaries as JSON
+  catalog              no input; refresh indexes, _index.jsonl, summary metric frontmatter
+
+The wiki root is found by walking up from cwd looking for an existing
+`wiki-twobatch/` directory; if none, it's created next to the nearest
+`.git/` ancestor. Pass --wiki-root to override.
+
+`_config.yaml` lives at <wiki_root>/_config.yaml. If absent, catalog copies
+the bundled `_default_agent_wiki_config.yaml` (sibling of this script).
+
+Subcommands that mutate are idempotent: re-emit pages with the same content
+unless `--rewrite` was passed.
+
+This script is the single deterministic helper for the agent-wiki skill
+family — it knows nothing about other plugins. The wiki it produces is
+self-contained under <wiki-root>/.
+"""
+
+from __future__ import annotations
+
+import argparse
+import datetime
+import hashlib
+import json
+import os
+import re
+import sys
+import tempfile
+from pathlib import Path
+from typing import Any
+
+try:
+    import yaml  # type: ignore[import-not-found]
+except ImportError:
+    print("error: PyYAML is required (uv run python ...).", file=sys.stderr)
+    raise
+
+# Default directory name of the wiki root; `find_wiki_root` searches for it
+# while walking up from the starting directory.
+WIKI_DIRNAME = "wiki-twobatch"
+# Sub-directory names under the wiki root, one per page family.
+SUMMARIES_DIR = "summaries"
+GUIDELINES_DIR = "guidelines"
+TASKS_DIR = "tasks"
+SKILLS_DIR = "skills"
+# Index and config file names.
+ID_INDEX_FILENAME = "_id_index.json"
+JSONL_INDEX_FILENAME = "_index.jsonl"
+CONFIG_FILENAME = "_config.yaml"
+# Seed config that `load_config` copies from SCRIPT_DIR when the wiki has no
+# `_config.yaml` yet.
+DEFAULT_CONFIG_NAME = "_default_agent_wiki_config.yaml"
+# Default maximum slug length used by `slugify`.
+SLUG_MAX = 40
+# Accepted `status` values for recalled guidelines; `_normalize_recalled`
+# maps anything else to "ignored".
+ALLOWED_STATUSES = ("followed", "ignored", "contradicted", "harmful")
+# Directory containing this script.
+SCRIPT_DIR = Path(__file__).resolve().parent
+
+
+# ---------------------------------------------------------------------------
+# Path discovery
+# ---------------------------------------------------------------------------
+
+
+def find_wiki_root(start: Path | None = None, override: Path | None = None) -> Path:
+    """Decide which directory is the wiki root.
+
+    Resolution order:
+      1. `override`, resolved, when given.
+      2. `<dir>/WIKI_DIRNAME` for the nearest directory (starting at `start`,
+         default cwd, then its ancestors) that already contains it.
+      3. `<dir>/WIKI_DIRNAME` for the nearest directory containing `.git`.
+      4. `<start>/WIKI_DIRNAME` as the last resort.
+
+    Nothing is created here; the returned path may not exist yet.
+    """
+    if override is not None:
+        return override.resolve()
+
+    origin = (start or Path.cwd()).resolve()
+    # The starting directory followed by every ancestor up to the filesystem root.
+    lineage = (origin, *origin.parents)
+
+    for directory in lineage:
+        candidate = directory / WIKI_DIRNAME
+        if candidate.is_dir():
+            return candidate
+
+    for directory in lineage:
+        if (directory / ".git").exists():
+            return directory / WIKI_DIRNAME
+
+    return origin / WIKI_DIRNAME
+
+
+def load_config(wiki_root: Path) -> dict:
+    """Load `<wiki_root>/_config.yaml`, seeding it from the bundled default if absent.
+
+    When the config file is missing, the default config next to this script
+    is copied into place (creating `wiki_root` if needed). If that seed is
+    missing too, an in-memory default is returned and nothing is written.
+
+    The returned mapping always has `tags.guideline`, `clusters`, `tasks` and
+    `session_family_overrides` as mappings. A section that is absent, or
+    present with a null value, is replaced by an empty dict.
+    """
+    p = wiki_root / CONFIG_FILENAME
+    if not p.exists():
+        seed = SCRIPT_DIR / DEFAULT_CONFIG_NAME
+        if seed.exists():
+            wiki_root.mkdir(parents=True, exist_ok=True)
+            p.write_text(seed.read_text(encoding="utf-8"), encoding="utf-8")
+            print(f"bootstrapped {p} from {seed.name}", file=sys.stderr)
+        else:
+            return {"schema_version": 1, "tags": {"guideline": {}}, "clusters": {}, "tasks": {}, "session_family_overrides": {}}
+    data = yaml.safe_load(p.read_text(encoding="utf-8")) or {}
+    # A section header whose children are all commented out (e.g. a bare
+    # `tasks:`) parses as None. `setdefault` would keep that None, so
+    # normalise both "missing" and "null" to an empty mapping.
+    for section in ("tags", "clusters", "tasks", "session_family_overrides"):
+        if data.get(section) is None:
+            data[section] = {}
+    if data["tags"].get("guideline") is None:
+        data["tags"]["guideline"] = {}
+    return data
+
+
+def save_config(wiki_root: Path, cfg: dict) -> None:
+    """Write `cfg` as YAML to `<wiki_root>/_config.yaml`.
+
+    Parent directories are created as needed. Key order is preserved
+    (`sort_keys=False`) and non-ASCII text is written verbatim.
+    """
+    target = wiki_root / CONFIG_FILENAME
+    target.parent.mkdir(parents=True, exist_ok=True)
+    rendered = yaml.safe_dump(cfg, sort_keys=False, allow_unicode=True)
+    target.write_text(rendered, encoding="utf-8")
+
+
+# ---------------------------------------------------------------------------
+# Slug + id helpers
+# ---------------------------------------------------------------------------
+
+
+# Any run of characters other than lowercase ASCII letters and digits.
+_SLUG_NORM = re.compile(r"[^a-z0-9]+")
+
+
+def slugify(text: str, max_len: int = SLUG_MAX) -> str:
+    """Turn `text` into a lowercase, dash-separated slug of at most `max_len` chars.
+
+    Over-long slugs are cut at `max_len`, then trimmed back to the last dash
+    when that dash sits in the second half of the cut, so the slug ends on a
+    word boundary where possible. Returns "guideline" when nothing usable
+    remains.
+    """
+    slug = _SLUG_NORM.sub("-", (text or "").lower()).strip("-")
+    if len(slug) <= max_len:
+        return slug or "guideline"
+
+    head = slug[:max_len]
+    boundary = head.rfind("-")
+    if boundary >= max_len // 2:
+        head = head[:boundary]
+    return head.rstrip("-") or "guideline"
+
+
+def session_prefix(session_id: str | None) -> str:
+    """Return the first 8 alphanumeric characters of `session_id`.
+
+    Deprecated: filenames now suffix the guideline content-hash id, not the
+    session-id prefix. Retained for one release in case external callers
+    still reference it; unused internally.
+
+    Falls back to "unknown" when the id is empty or has no alphanumerics.
+    """
+    alnum = re.sub(r"[^A-Za-z0-9]", "", session_id) if session_id else ""
+    return alnum[:8] or "unknown"
+
+
+# Sentence boundary: terminal punctuation followed by whitespace or
+# end-of-text, or a newline.
+_SENTENCE_END = re.compile(r"[.!?](?=\s|$)|\n")
+
+
+def first_sentence(text: str) -> str:
+    """Return the first sentence (or first line) of `text`, stripped.
+
+    Terminal punctuation is kept; a newline boundary is dropped. Text with no
+    boundary is returned whole, and empty or None input yields "".
+    """
+    stripped = (text or "").strip()
+    boundary = _SENTENCE_END.search(stripped) if stripped else None
+    if boundary is None:
+        return stripped
+    cut = boundary.start() if boundary.group() == "\n" else boundary.end()
+    return stripped[:cut].strip()
+
+
+def compute_entity_id(content: str) -> str:
+    """Return a 12-hex-char id for `content`.
+
+    The id is the first 12 hex digits of the SHA-1 of the text after
+    lowercasing and collapsing all whitespace runs to single spaces, so case
+    and spacing differences do not change it.
+    """
+    tokens = (content or "").lower().split()
+    digest = hashlib.sha1(" ".join(tokens).encode("utf-8"))
+    return digest.hexdigest()[:12]
+
+
+# ---------------------------------------------------------------------------
+# YAML scalar / frontmatter
+# ---------------------------------------------------------------------------
+
+
+def _is_bare_flow_item(item: Any) -> bool:
+    """Return True when `item` can be written unquoted inside a YAML flow list."""
+    if not isinstance(item, str) or not item:
+        # Non-strings take the quoted path; an empty string written bare
+        # would vanish from the list.
+        return False
+    if any(ch in item for ch in (",", " ", "\t", "\n", "[", "]", "{", "}")):
+        return False
+    if item.endswith(":"):
+        return False
+    return not item.startswith(("-", "?", "!", "&", "*", '"', "'", "#", "|", ">", "@", "%", "`"))
+
+
+def yaml_scalar(v: Any) -> str:
+    """Render `v` as a single-line YAML value.
+
+    Lists become flow sequences: items are written bare when every item is a
+    plain string that is safe unquoted; otherwise every string item is
+    JSON-quoted and other items go through `str()`. Booleans become
+    `true`/`false`, numbers use `str()`, and any other value is stringified
+    and JSON-quoted when it is empty, contains `:`, `#` or a newline, or
+    starts with a YAML indicator character.
+    """
+    if isinstance(v, list):
+        if not v:
+            return "[]"
+        if all(_is_bare_flow_item(x) for x in v):
+            return "[" + ", ".join(v) + "]"
+        return "[" + ", ".join(json.dumps(x, ensure_ascii=False) if isinstance(x, str) else str(x) for x in v) + "]"
+    # bool is checked before int because bool is a subclass of int.
+    if isinstance(v, bool):
+        return "true" if v else "false"
+    if isinstance(v, (int, float)):
+        return str(v)
+    s = str(v)
+    # An empty string must be quoted, otherwise `key: ` parses as null.
+    if not s or any(ch in s for ch in (":", "#", "\n")) or s.startswith(("-", "?", "!", "&", "*", "{", "[", '"', "'")):
+        return json.dumps(s, ensure_ascii=False)
+    return s
+
+
+def split_frontmatter(text: str):
+    """Split a page into `(frontmatter, body)`.
+
+    The page must start with a `---` line, and the frontmatter ends at the
+    next line that is exactly `---`. LF and CRLF line endings are both
+    accepted, the frontmatter may be empty, and the closing `---` may be the
+    last line of the text with no trailing newline.
+
+    Returns `(None, text)` when there is no opening or no closing delimiter.
+    Otherwise returns the frontmatter without delimiters (trailing whitespace
+    stripped) and everything after the closing delimiter line.
+    """
+    if text.startswith("---\n"):
+        opener_len = 4
+    elif text.startswith("---\r\n"):
+        opener_len = 5
+    else:
+        return None, text
+    # Search from the first character after the opening line, so a closing
+    # `---` directly after it (empty frontmatter) is still found.
+    closer = re.compile(r"^---\r?$", re.MULTILINE).search(text, opener_len)
+    if closer is None:
+        return None, text
+    fm = text[opener_len : closer.start()].rstrip()
+    body_start = closer.end()
+    # Skip the newline that ends the closing delimiter line, if present.
+    if text.startswith("\n", body_start):
+        body_start += 1
+    return fm, text[body_start:]
+
+
+def has_top_level_key(fm: str, key: str) -> bool:
+    """Return True when some line of `fm` starts, at column 0, with `key` followed by `:`."""
+    pattern = re.compile(rf"^{re.escape(key)}\s*:", re.MULTILINE)
+    return pattern.search(fm) is not None
+
+
+def replace_or_append_field(fm: str, key: str, line: str) -> str:
+    """Replace the top-level field `key` in `fm` with `line`, or append `line`.
+
+    Only the first occurrence of the field is replaced. When the key is
+    absent, `line` is appended after stripping trailing whitespace from `fm`.
+    """
+    # Match the header line PLUS any immediately-following indented child
+    # lines (orphans from a previous block-list form). Without the child
+    # match, transitioning a value from block-list to inline would leave
+    # `  - <item>` lines stranded under the new inline header.
+    pat = re.compile(
+        rf"^{re.escape(key)}:.*(?:\n[ \t]+.*)*$",
+        re.MULTILINE,
+    )
+    if pat.search(fm):
+        # Use a callable replacement so `line` is inserted verbatim. A string
+        # replacement is a template in which backslash escapes are
+        # processed, which would corrupt JSON-quoted values such as
+        # "a\nb" or raise on sequences like `\u`.
+        return pat.sub(lambda _match: line, fm, count=1)
+    return fm.rstrip() + "\n" + line
+
+
+def append_if_missing(fm: str, key: str, line: str) -> str:
+    """Append `line` to `fm` unless `key` is already a top-level field; existing fields are left untouched."""
+    return fm if has_top_level_key(fm, key) else f"{fm.rstrip()}\n{line}"
+
+
+def upsert_fields(text: str, additions: dict, *, force_replace: tuple[str, ...] = ()) -> str:
+    """Add YAML frontmatter fields to a page, replacing only the forced ones.
+
+    Each key in `additions` is appended when missing. A key that already
+    exists is overwritten only if it is listed in `force_replace`; otherwise
+    the existing value is kept. A page without frontmatter gets a new
+    frontmatter block holding all of `additions`.
+    """
+    fm, body = split_frontmatter(text)
+    if fm is None:
+        header = ["---"]
+        for key, value in additions.items():
+            header += _emit_yaml_field(key, value)
+        header += ["---", ""]
+        return "\n".join(header) + body
+
+    for key, value in additions.items():
+        rendered = _emit_yaml_field(key, value)
+        forced = key in force_replace
+        if len(rendered) == 1:
+            # Single-line value: overwrite when forced, else add only if absent.
+            updater = replace_or_append_field if forced else append_if_missing
+            fm = updater(fm, key, rendered[0])
+            continue
+        # Multi-line block list.
+        if not has_top_level_key(fm, key):
+            fm = fm.rstrip() + "\n" + "\n".join(rendered)
+        elif forced:
+            fm = _replace_block_field(fm, key, rendered)
+        # Otherwise the key exists and is not forced: keep it as is.
+    return "---\n" + fm + "\n---\n" + body
+
+
+def _emit_yaml_field(key: str, value: Any) -> list[str]:
+    """Render one frontmatter field as a list of YAML lines.
+
+    Returns a single `key: value` line for scalars, empty lists, and lists of
+    short (< 60 chars) comma-free strings, which are written as an inline
+    flow list. Any other list becomes a block list: a `key:` header followed
+    by one `  - ` entry per item. Dict items are written as a mapping with
+    one key per line.
+    """
+    if isinstance(value, list):
+        if not value:
+            return [f"{key}: []"]
+        if all(isinstance(x, str) for x in value) and all("," not in x and len(x) < 60 for x in value):
+            return [f"{key}: " + yaml_scalar(value)]
+        # block list (strings or dicts)
+        out = [f"{key}:"]
+        for item in value:
+            if isinstance(item, dict):
+                first = True
+                for kk, vv in item.items():
+                    prefix = "  - " if first else "    "
+                    out.append(f"{prefix}{kk}: {yaml_scalar(vv)}")
+                    first = False
+            else:
+                # Quote through yaml_scalar: a raw item containing `: `,
+                # ` #`, a newline or a leading indicator character would be
+                # parsed as something other than the intended string.
+                out.append(f"  - {yaml_scalar(item)}")
+        return out
+    return [f"{key}: {yaml_scalar(value)}"]
+
+
+def _replace_block_field(fm: str, key: str, lines: list[str]) -> str:
+    """Replace a top-level block field (key: + indented children).
+
+    The existing field runs from its `key:` line up to the next line that
+    starts with a non-whitespace character, or to the end of `fm`. When the
+    key is absent, `lines` are appended instead.
+    """
+    pat = re.compile(
+        rf"^{re.escape(key)}:.*?(?=^\S|\Z)",
+        re.MULTILINE | re.DOTALL,
+    )
+    repl = "\n".join(lines) + "\n"
+    if pat.search(fm):
+        # Use a callable replacement so `repl` is inserted verbatim. A string
+        # replacement is a template in which backslash escapes are
+        # processed, which would corrupt JSON-quoted items.
+        return pat.sub(lambda _match: repl, fm, count=1)
+    return fm.rstrip() + "\n" + "\n".join(lines)
+
+
+# ---------------------------------------------------------------------------
+# Subcommand: render-summary
+# ---------------------------------------------------------------------------
+
+
+# Fields that must be present and non-empty in the render-summary payload.
+SUMMARY_REQUIRED = ("session_id", "narrative", "normalized_path")
+
+
+def cmd_render_summary(args) -> int:
+    """Render one summary page from a JSON object read on stdin.
+
+    Writes `<wiki_root>/summaries/<sid>.md`, or `<sid>__<slug>.md` when the
+    payload carries a `slug` or `arc` field. An existing file is left alone
+    unless `args.rewrite` is set. One audit record is appended for each
+    recalled guideline in the payload.
+
+    Returns 0 on success or skip, and 2 on invalid JSON, a non-object
+    payload, or missing required fields.
+    """
+    try:
+        data = json.load(sys.stdin)
+    except json.JSONDecodeError as exc:
+        print(f"error: invalid JSON on stdin: {exc}", file=sys.stderr)
+        return 2
+    if not isinstance(data, dict):
+        print("error: top-level JSON must be an object", file=sys.stderr)
+        return 2
+    missing = [k for k in SUMMARY_REQUIRED if not data.get(k)]
+    if missing:
+        print(f"error: missing required field(s): {', '.join(missing)}", file=sys.stderr)
+        return 2
+
+    wiki_root = find_wiki_root(override=args.wiki_root)
+    out_dir = wiki_root / SUMMARIES_DIR
+    out_dir.mkdir(parents=True, exist_ok=True)
+
+    session_id = str(data["session_id"])
+    safe = re.sub(r"[^A-Za-z0-9._-]+", "-", session_id).strip("-") or "session"
+    # Optional `slug`/`arc` field splits a single session into multiple summary
+    # files. Filename pattern: `<sid>__<slug>.md`. Without a slug, fall back to
+    # the historical `<sid>.md` (single-summary-per-session) shape.
+    # Coerce to str so a numeric value such as `"arc": 1` does not crash on
+    # `.strip()`.
+    arc_slug = str(data.get("slug") or data.get("arc") or "").strip()
+    if arc_slug:
+        arc_slug = slugify(arc_slug, max_len=50)
+        out_name = f"{safe}__{arc_slug}.md"
+    else:
+        out_name = f"{safe}.md"
+    out_path = out_dir / out_name
+
+    if out_path.exists() and not args.rewrite:
+        print(f"skip (exists): {out_path}")
+        return 0
+
+    # load id_index for backlink resolution
+    id_index = _load_id_index(wiki_root)
+    recalled = _normalize_recalled(data.get("recalled_guidelines"), id_index)
+    # discover sibling arc summaries for this session_id (other files matching
+    # `<sid>__*.md`). Excluded: this file itself.
+    siblings = sorted(p.name for p in out_dir.glob(f"{safe}__*.md") if p.name != out_name)
+    out_path.write_text(_render_summary_md(data, recalled, arc_slug=arc_slug, siblings=siblings), encoding="utf-8")
+    print(f"wrote: {out_path}")
+
+    if recalled:
+        for r in recalled:
+            _audit_append(
+                wiki_root,
+                {
+                    "action": "summary.guideline_use",
+                    "session_id": session_id,
+                    "id": r["id"],
+                    "status": r["status"],
+                },
+            )
+        print(f"audit: {len(recalled)} line(s) appended to {wiki_root}/_audit.log")
+    return 0
+
+
+def _normalize_recalled(items: Any, id_index: dict[str, str]) -> list[dict]:
+    """Clean up the ``recalled_guidelines`` list of a summary payload.
+
+    Args:
+        items: Raw payload value; anything that is not a list yields ``[]``.
+        id_index: Mapping of guideline id to page path, used to fill ``link``.
+
+    Returns:
+        One dict per usable entry with the keys ``id``, ``title``, ``status``,
+        ``evidence`` and ``link``. Entries that are not dicts or have no id are
+        dropped. A blank or unrecognised status becomes ``"ignored"``, a blank
+        title falls back to the id, and ``link`` is ``None`` when the id is not
+        in ``id_index``.
+    """
+    if not isinstance(items, list):
+        return []
+
+    normalized: list[dict] = []
+    for raw in items:
+        if not isinstance(raw, dict):
+            continue
+        gid = (raw.get("id") or "").strip()
+        if not gid:
+            continue
+        status = (raw.get("status") or "").strip().lower() or "ignored"
+        if status not in ALLOWED_STATUSES:
+            status = "ignored"
+        title = (raw.get("title") or "").strip() or gid
+        evidence = (raw.get("evidence") or "").strip()
+        normalized.append(
+            {
+                "id": gid,
+                "title": title,
+                "status": status,
+                "evidence": evidence,
+                "link": id_index.get(gid),
+            }
+        )
+    return normalized
+
+
+def _render_summary_md(summary: dict, recalled: list[dict], arc_slug: str = "", siblings: list[str] | None = None) -> str:
+    """Build the Markdown text (frontmatter plus body) of an episodic summary.
+
+    Args:
+        summary: Summary payload; the keys read here are ``session_id``,
+            ``agent``, ``model``, ``goal``, ``outcome``, ``duration_seconds``,
+            ``tools_used``, ``normalized_path``, ``transcript_path``,
+            ``narrative`` and ``key_turns``.
+        recalled: Normalized recalled-guideline entries (``id``, ``title``,
+            ``status``, optional ``evidence`` and ``link``).
+        arc_slug: Arc identifier recorded as ``arc:`` when non-empty.
+        siblings: File names of other summaries of the same session.
+
+    Returns:
+        The page text, lines joined with ``\\n``.
+    """
+    norm_path = summary.get("normalized_path")
+    transcript = summary.get("transcript_path")
+    # The raw transcript is listed only when it differs from the normalized file.
+    show_transcript = bool(transcript) and transcript != norm_path
+    sources: list[str] = []
+    if norm_path:
+        sources.append(norm_path)
+    if show_transcript:
+        sources.append(transcript)
+
+    # --- frontmatter -------------------------------------------------------
+    front = ["---", "type: episodic-summary"]
+    front.extend(
+        f"{field}: {yaml_scalar(summary[field])}"
+        for field in ("session_id", "agent", "model", "goal", "outcome")
+        if summary.get(field) is not None
+    )
+    if arc_slug:
+        front.append(f"arc: {yaml_scalar(arc_slug)}")
+    duration = summary.get("duration_seconds")
+    if duration is not None:
+        front.append(f"duration_seconds: {duration}")
+    tool_names = summary.get("tools_used") or []
+    if tool_names:
+        rendered_tools = ", ".join(yaml_scalar(tool) for tool in tool_names)
+        front.append("tools_used: [" + rendered_tools + "]")
+    if sources:
+        front.append("sources:")
+        front.extend(f"  - {src}" for src in sources)
+    if siblings:
+        front.append("sibling_summaries:")
+        front.extend(f"  - {name}" for name in siblings)
+    if recalled:
+        front.append("recalled_guidelines:")
+        for rec in recalled:
+            front += [
+                f"  - id: {rec['id']}",
+                f"    title: {yaml_scalar(rec['title'])}",
+                f"    status: {rec['status']}",
+            ]
+            if rec.get("evidence"):
+                front.append(f"    evidence: {yaml_scalar(rec['evidence'])}")
+            if rec.get("link"):
+                front.append(f"    link: {rec['link']}")
+    front += ["---", ""]
+
+    # --- body --------------------------------------------------------------
+    heading = summary.get("goal") or f"Session {summary.get('session_id', '')}"
+    body: list[str] = [f"# {heading}", ""]
+    narrative = (summary.get("narrative") or "").strip()
+    if narrative:
+        body += [narrative, ""]
+    key_turns = summary.get("key_turns") or []
+    if key_turns:
+        body += ["## Key turns", ""]
+        body.extend(f"- {turn}" for turn in key_turns)
+        body.append("")
+    if recalled:
+        body += ["## Recalled guidelines", ""]
+        for rec in recalled:
+            label = rec["title"]
+            if rec.get("link"):
+                # Summaries sit one directory below the wiki root.
+                label = f"[{label}](../{rec['link']})"
+            bullet = f"- **{rec['status']}** — {label}"
+            if rec.get("evidence"):
+                bullet += f' — "{rec["evidence"]}"'
+            body.append(bullet)
+        body.append("")
+    if sources:
+        body += ["## Sources", ""]
+        if norm_path:
+            body.append(f"- [normalized JSON]({norm_path})")
+        if show_transcript:
+            body.append(f"- raw transcript: `{transcript}`")
+        body.append("")
+    return "\n".join(front + body)
+
+
+def _load_id_index(wiki_root: Path) -> dict[str, str]:
+    """Load the guidelines id index, best effort.
+
+    Returns:
+        The parsed JSON object, or ``{}`` when the index file does not exist,
+        cannot be read, is not valid JSON, or holds something other than an
+        object.
+    """
+    index_file = wiki_root / GUIDELINES_DIR / ID_INDEX_FILENAME
+    if index_file.exists():
+        try:
+            loaded = json.loads(index_file.read_text(encoding="utf-8"))
+        except (OSError, json.JSONDecodeError):
+            return {}
+        if isinstance(loaded, dict):
+            return loaded
+    return {}
+
+
+def _audit_append(wiki_root: Path, entry: dict) -> None:
+    """Append ``entry`` as one JSON line to ``<wiki_root>/_audit.log``.
+
+    A ``ts`` field holding the current UTC time (ISO 8601 with a ``Z`` suffix)
+    is set on a copy of ``entry``, replacing any ``ts`` it already had; the
+    caller's dict is not modified. The log's directory is created when missing.
+    """
+    log_path = wiki_root / "_audit.log"
+    log_path.parent.mkdir(parents=True, exist_ok=True)
+    stamp = datetime.datetime.now(datetime.timezone.utc).isoformat().replace("+00:00", "Z")
+    record = dict(entry)
+    record["ts"] = stamp
+    with log_path.open("a", encoding="utf-8") as handle:
+        handle.write(json.dumps(record, ensure_ascii=False) + "\n")
+
+
+# Directory, relative to the wiki root, that archived atomic guidelines move into.
+_ARCHIVED_DIR = "_archived"
+
+# Tags carried by so many atomics that sharing them says nothing about topic.
+# They are subtracted before counting the tags a skill and an atomic have in
+# common, so an overlap made up *only* of these is never treated as coverage.
+_GENERIC_TAGS = set(
+    "stdlib parsing agent-behavior contract fallback-avoidance wiki-pointer "
+    "wiki-scope applicability operator-side agent-side binary headers".split()
+)
+
+
+def _archive_atomic(wiki_root: Path, gid: str, reason: str, target_slug: str) -> bool:
+    """Move an atomic guideline page into ``<wiki_root>/_archived/``.
+
+    Besides moving the file, the gid is dropped from the guidelines id index
+    and an ``archive_guideline`` entry is appended to the audit log.
+
+    Args:
+        wiki_root: Root directory of the wiki.
+        gid: Id of the guideline to archive.
+        reason: Recorded verbatim as ``reason`` in the audit entry.
+        target_slug: Recorded as ``target`` in the audit entry (the page that
+            now covers the guideline).
+
+    Returns:
+        True when the page was moved. False, with nothing changed on disk, when
+        the id index is missing, unreadable or not a JSON object, when ``gid``
+        is not in it, or when the indexed path is not an existing file.
+    """
+    # Shared loader: it yields {} for a missing, unreadable, malformed or
+    # non-object index, so each of those cases exits through the membership
+    # test instead of crashing on `in` / `.pop()` against a list, string or
+    # number parsed from a corrupted index file.
+    idx = _load_id_index(wiki_root)
+    if gid not in idx:
+        return False
+    rel = idx.pop(gid)
+    # A non-string value cannot be joined onto the wiki root as a path.
+    if not isinstance(rel, str):
+        return False
+    src = wiki_root / rel
+    if not src.is_file():
+        return False
+    dst_dir = wiki_root / _ARCHIVED_DIR
+    dst_dir.mkdir(parents=True, exist_ok=True)
+    dst = dst_dir / src.name
+    src.rename(dst)
+    # Persist the index only after the move succeeded.
+    idx_path = wiki_root / GUIDELINES_DIR / ID_INDEX_FILENAME
+    idx_path.write_text(json.dumps(idx, indent=2, sort_keys=True) + "\n", encoding="utf-8")
+    _audit_append(
+        wiki_root,
+        {
+            "action": "archive_guideline",
+            "id": gid,
+            "reason": reason,
+            "target": target_slug,
+            "src": rel,
+            "dst": f"{_ARCHIVED_DIR}/{src.name}",
+        },
+    )
+    return True
+
+
+# Words too common in slugs and titles to count as evidence of coverage when
+# they are the only thing matched. A skill whose slug contains "python" must
+# not archive every atomic that merely mentions Python in its title: these
+# words turn up across unrelated guidelines, so matching one carries no signal.
+_GENERIC_SLUG_TOKENS = set(
+    "read extract count list via from the for with of a an and or to on in by into "
+    "python python3 script scripts scripting file files data command commands "
+    "run running write writing system use using fix fixing parse parsing json "
+    "install installing verify container containers docker".split()
+)
+
+
+def _session_of_atomic(info: dict) -> str:
+    """Return the session id an atomic guideline was extracted from.
+
+    The id is read out of ``info["related_summary"]``, which has the form
+    ``summaries/<sid>.md`` or ``summaries/<sid>__<arc>.md``.
+
+    Returns:
+        The ``<sid>`` part, or ``""`` when ``info`` is empty or the field is
+        missing or does not have that form.
+    """
+    related = (info or {}).get("related_summary") or ""
+    match = re.search(r"summaries/([^/]+?)(?:__[^/]+)?\.md\s*$", related)
+    if match is None:
+        return ""
+    return match.group(1)
+
+
+def _skill_covers_atomic(
+    skill_tags: set[str],
+    skill_slug: str,
+    atomic_tags: set[str],
+    atomic_title: str,
+    skill_description: str = "",
+    *,
+    same_session: bool = False,
+) -> bool:
+    """Decide whether a skill makes an atomic guideline redundant.
+
+    The answer is True when any of three checks succeeds; they are tried from
+    the strongest signal to the weakest:
+
+    1. **Tag superset** (allowed across trajectories): every tag of the atomic
+       is also a tag of the skill, and at least two of those shared tags are
+       outside `_GENERIC_TAGS`.
+
+    2. **Slug keyword** (same session only): a dash-separated token of the
+       skill slug that is at least 4 characters long and not listed in
+       `_GENERIC_SLUG_TOKENS` occurs in the lower-cased atomic title.
+
+    3. **Format token** (same session only): an all-caps word of three or more
+       letters, or a CamelCase identifier such as "WebP", found in the skill
+       description occurs (case-insensitively) in the atomic title. The words
+       AND, OR, THE and USE are ignored.
+
+    Checks 2 and 3 rest on a single shared word, so they run only when
+    ``same_session`` is True, i.e. when the atomic came out of the trajectory
+    the skill was synthesized from. For any other atomic only check 1 can
+    succeed, which keeps a skill from claiming an unrelated trace's atomics
+    because of an incidental word match.
+
+    The bias is deliberate: a missed match only leaves an atomic in place,
+    whereas a wrong match archives a guideline that was still needed.
+    """
+    tags = atomic_tags or set()
+
+    # Check 1: tag superset with two or more non-generic shared tags.
+    if tags and tags <= skill_tags:
+        distinctive = (skill_tags & tags) - _GENERIC_TAGS
+        if len(distinctive) >= 2:
+            return True
+
+    # The remaining checks are lexical and trusted only within one trajectory.
+    if not same_session:
+        return False
+
+    lowered_title = atomic_title.lower()
+
+    # Check 2: a distinctive slug token inside the title.
+    if any(
+        part in lowered_title
+        for part in skill_slug.split("-")
+        if part not in _GENERIC_SLUG_TOKENS and len(part) >= 4
+    ):
+        return True
+
+    # Check 3: a format/identifier token shared by description and title.
+    emphasis_words = {"AND", "OR", "THE", "USE"}
+    return any(
+        found.lower() in lowered_title
+        for found in re.findall(r"\b([A-Z]{3,}|[A-Z][a-z]+[A-Z][a-zP]+)\b", skill_description or "")
+        if found not in emphasis_words
+    )
+
+
+# ---------------------------------------------------------------------------
+# Subcommand: render-guidelines
+# ---------------------------------------------------------------------------
+
+
+def cmd_render_guidelines(args) -> int:
+    """Render one guideline page per entity in the JSON payload read on stdin.
+
+    The payload is an object with an ``entities`` list. Entities that are not
+    dicts or have blank ``content`` are ignored. Each remaining entity becomes
+    ``<wiki_root>/GUIDELINES_DIR/<slug>__<id>.md``; its tags are stored in the
+    wiki config and its path is merged into the id index.
+
+    Args:
+        args: Parsed CLI namespace; ``wiki_root``, ``rewrite``, ``session_id``
+            and ``normalized_path`` are read.
+
+    Returns:
+        2 when stdin is not valid JSON or not an object; 0 otherwise, including
+        when there are no entities to write.
+    """
+    try:
+        payload = json.load(sys.stdin)
+    except json.JSONDecodeError as exc:
+        print(f"error: invalid JSON on stdin: {exc}", file=sys.stderr)
+        return 2
+    if not isinstance(payload, dict):
+        print("error: top-level JSON must be an object with `entities`", file=sys.stderr)
+        return 2
+    entities = payload.get("entities") or []
+    if not (isinstance(entities, list) and entities):
+        print("no entities provided; nothing to write", file=sys.stderr)
+        return 0
+
+    wiki_root = find_wiki_root(override=args.wiki_root)
+    guidelines_dir = wiki_root / GUIDELINES_DIR
+    guidelines_dir.mkdir(parents=True, exist_ok=True)
+
+    written = 0
+    skipped = 0
+    index_updates: dict[str, str] = {}
+    cfg = load_config(wiki_root)
+    tags_by_id = cfg.setdefault("tags", {}).setdefault("guideline", {})
+    cfg_changed = False
+
+    for entity in entities:
+        if not isinstance(entity, dict):
+            continue
+        content = (entity.get("content") or "").strip()
+        if not content:
+            continue
+        # Per-entity values win over the command-line defaults.
+        sid = entity.get("session_id") or args.session_id
+        norm = entity.get("normalized_path") or entity.get("trajectory") or args.normalized_path
+        eid = (entity.get("id") or "").strip() or compute_entity_id(content)
+        slug = slugify(entity.get("slug") or entity.get("title") or content)
+        # The file name ends in the guideline id rather than the session id: it
+        # then mirrors the `id:` frontmatter field, and two guidelines from one
+        # session never share a suffix. The session is still recoverable from
+        # `related_summary:` and the `## Sources` footer of the page.
+        page = guidelines_dir / f"{slug}__{eid}.md"
+        index_entry = f"{GUIDELINES_DIR}/{page.name}"
+
+        # Store tags in the config so the catalog's "By tag" table can see
+        # them. Tags already recorded for an id are kept unless --rewrite.
+        raw_tags = entity.get("tags") or []
+        if isinstance(raw_tags, list):
+            clean_tags = [tag for tag in (str(t).strip() for t in raw_tags) if tag]
+            if clean_tags and (eid not in tags_by_id or args.rewrite):
+                tags_by_id[eid] = clean_tags
+                cfg_changed = True
+
+        if page.exists() and not args.rewrite:
+            # An existing page is left alone but still (re)registered.
+            print(f"skip (exists): {page}")
+            skipped += 1
+            index_updates[eid] = index_entry
+        else:
+            page.write_text(_render_guideline_md(entity, norm, sid, eid), encoding="utf-8")
+            index_updates[eid] = index_entry
+            print(f"wrote: {page}")
+            written += 1
+
+    _update_id_index(guidelines_dir, index_updates)
+    if cfg_changed:
+        save_config(wiki_root, cfg)
+    print(f"\nwrote {written}, skipped {skipped}")
+    return 0
+
+
+def _render_guideline_md(entity: dict, normalized_path: str | None, session_id: str | None, eid: str) -> str:
+    """Build the Markdown text (frontmatter plus body) of one guideline page.
+
+    Args:
+        entity: Guideline payload; ``content``, ``rationale``, ``trigger``,
+            ``title``, ``type``, ``arc``, ``agent`` and ``tags`` are read.
+        normalized_path: Path listed under ``sources:`` and in the footer.
+        session_id: Session whose summary the page links back to, if any.
+        eid: Guideline id written to the ``id:`` field.
+
+    Returns:
+        The page text, lines joined with ``\\n``.
+    """
+
+    def text_of(field: str) -> str:
+        """Return a field as stripped text; missing/None becomes ''."""
+        return (entity.get(field) or "").strip()
+
+    content = text_of("content")
+    rationale = text_of("rationale")
+    trigger = text_of("trigger")
+    title = text_of("title") or first_sentence(content) or "Guideline"
+    etype = entity.get("type") or "guideline"
+
+    # An `arc` ties the guideline to one arc summary of a session that was
+    # split into several files (`<sid>__<arc>.md`); without it the link goes to
+    # the single `<sid>.md` summary. No session id means no summary link.
+    arc = text_of("arc")
+    if arc:
+        arc = slugify(arc, max_len=50)
+    if not session_id:
+        summary_basename = ""
+    elif arc:
+        summary_basename = f"{session_id}__{arc}.md"
+    else:
+        summary_basename = f"{session_id}.md"
+
+    front = ["---", f"id: {eid}", f"type: {etype}"]
+    if trigger:
+        front.append(f"trigger: {yaml_scalar(trigger)}")
+    front.append(f"agent: {entity.get('agent') or 'claude-code'}")
+    raw_tags = entity.get("tags") or []
+    if isinstance(raw_tags, list):
+        clean_tags = [tag for tag in (str(t).strip() for t in raw_tags) if tag]
+        if clean_tags:
+            front.append(f"tags: [{', '.join(clean_tags)}]")
+    if normalized_path:
+        front += ["sources:", f"  - {normalized_path}"]
+    if summary_basename:
+        front.append(f"related_summary: {SUMMARIES_DIR}/{summary_basename}")
+    front += ["---", ""]
+
+    body = [f"# {title}", "", content, ""]
+    if rationale:
+        body += ["## Rationale", "", rationale, ""]
+    body += ["## Sources", ""]
+    if summary_basename:
+        body.append(f"- [trajectory summary](../{SUMMARIES_DIR}/{summary_basename})")
+    if normalized_path:
+        body.append(f"- [normalized JSON]({normalized_path})")
+    body.append("")
+    return "\n".join(front + body)
+
+
+def _update_id_index(out_dir: Path, entries: dict[str, str]) -> None:
+    """Merge ``entries`` into the id-index JSON file inside ``out_dir``.
+
+    Nothing happens when ``entries`` is empty. An index that is missing,
+    unreadable, malformed or not a JSON object is treated as empty. The merged
+    mapping is written to a temporary file in ``out_dir`` and then moved over
+    the index with ``os.replace``; the temporary file is removed if anything
+    fails before that, and the error is re-raised.
+    """
+    if not entries:
+        return
+    index_path = out_dir / ID_INDEX_FILENAME
+    merged: dict = {}
+    if index_path.exists():
+        try:
+            loaded = json.loads(index_path.read_text(encoding="utf-8"))
+        except (OSError, json.JSONDecodeError):
+            loaded = {}
+        merged = loaded if isinstance(loaded, dict) else {}
+    merged.update(entries)
+
+    fd, tmp_name = tempfile.mkstemp(dir=out_dir, prefix=ID_INDEX_FILENAME + ".", suffix=".tmp")
+    try:
+        with os.fdopen(fd, "w", encoding="utf-8") as handle:
+            handle.write(json.dumps(merged, indent=2, sort_keys=True))
+        os.replace(tmp_name, index_path)
+    except BaseException:
+        # Do not leave the temporary file behind, whatever interrupted us.
+        if os.path.exists(tmp_name):
+            os.unlink(tmp_name)
+        raise
+
+
+def _missing_jsonl_index_links(wiki_root: Path) -> list[tuple[str, str, str]]:
+    """List the rows of ``<wiki_root>/_index.jsonl`` whose link target is missing.
+
+    Blank lines are skipped, and rows without a truthy ``link`` are not checked.
+
+    Returns:
+        ``(kind, id, link)`` string triples, in file order, for every row whose
+        ``link`` (resolved against ``wiki_root``) does not exist. An absent
+        index file yields ``[]``.
+
+    Raises:
+        RuntimeError: a line is not valid JSON or is not a JSON object; the
+            message starts with ``<index path>:<line number>``.
+    """
+    idx = wiki_root / "_index.jsonl"
+    if not idx.exists():
+        return []
+
+    missing: list[tuple[str, str, str]] = []
+    for line_no, line in enumerate(idx.read_text(encoding="utf-8").splitlines(), start=1):
+        if not line.strip():
+            continue
+        try:
+            row = json.loads(line)
+        except json.JSONDecodeError as exc:
+            raise RuntimeError(f"{idx}:{line_no}: invalid JSON: {exc}") from exc
+        # A list, string or number parses fine but has no `.get`; report it
+        # with its location like any other malformed row rather than letting
+        # an AttributeError escape.
+        if not isinstance(row, dict):
+            raise RuntimeError(f"{idx}:{line_no}: expected a JSON object, got {type(row).__name__}")
+        link = row.get("link")
+        # str() keeps a non-string link (e.g. a number) from breaking the join.
+        if link and not (wiki_root / str(link)).exists():
+            missing.append((str(row.get("kind") or ""), str(row.get("id") or ""), str(link)))
+    return missing
+
+
+def _assert_jsonl_index_integrity(wiki_root: Path) -> None:
+    """Fail loudly when ``_index.jsonl`` links to files that do not exist.
+
+    Raises:
+        RuntimeError: at least one link is missing. The message lists up to ten
+            of them as ``kind:id -> link`` and, beyond that, how many more
+            there are.
+    """
+    broken = _missing_jsonl_index_links(wiki_root)
+    if broken:
+        shown = "; ".join(f"{kind}:{ident} -> {link}" for kind, ident, link in broken[:10])
+        overflow = len(broken) - 10
+        suffix = f"; +{overflow} more" if overflow > 0 else ""
+        raise RuntimeError(f"{wiki_root / '_index.jsonl'} has missing links: {shown}{suffix}")
+
+
+def _refresh_agent_retrieval_indexes(wiki_root: Path) -> None:
+    """Refresh indexes agents use immediately after local page moves/writes.
+
+    Re-scans the atomic guidelines and the skills, rewrites the guidelines
+    index, the skills index and the JSONL index, then checks the JSONL index.
+
+    Raises:
+        RuntimeError: propagated from `_assert_jsonl_index_integrity` when the
+            freshly written JSONL index links to a file that does not exist.
+    """
+    cfg = load_config(wiki_root)
+    today = datetime.date.today().isoformat()
+    # One scan of the atomics feeds both the guidelines index and the JSONL index.
+    g_meta = _scan_atomic_guidelines(wiki_root)
+    _write_guidelines_index(wiki_root, g_meta, cfg, today)
+    _write_skills_index(wiki_root, _scan_skills(wiki_root), today)
+    _write_jsonl_index(wiki_root, cfg, g_meta)
+    # Runs last so it validates the index that was just written.
+    _assert_jsonl_index_integrity(wiki_root)
+
+
+# ---------------------------------------------------------------------------
+# Subcommand: render-cluster
+# ---------------------------------------------------------------------------
+
+
+def cmd_render_cluster(args) -> int:
+    """Record a cluster in the wiki config and render its page.
+
+    The JSON object on stdin must carry a ``slug``; ``title``, ``description``,
+    ``takeaway``, ``members`` and ``tags`` are optional. The page is written to
+    ``<wiki_root>/GUIDELINES_DIR/<slug>__cluster.md``. When ``args`` has a
+    truthy ``archive_members``, each member atomic is archived afterwards. The
+    retrieval indexes are refreshed at the end in every case.
+
+    Returns:
+        2 when stdin is not valid JSON, is not an object, or has no slug;
+        0 on success.
+    """
+    try:
+        payload = json.load(sys.stdin)
+    except json.JSONDecodeError as exc:
+        print(f"error: invalid JSON on stdin: {exc}", file=sys.stderr)
+        return 2
+    if not isinstance(payload, dict):
+        print("error: top-level JSON must be an object", file=sys.stderr)
+        return 2
+    slug = (payload.get("slug") or "").strip()
+    if not slug:
+        print("error: missing slug", file=sys.stderr)
+        return 2
+
+    wiki_root = find_wiki_root(override=args.wiki_root)
+    cfg = load_config(wiki_root)
+
+    # Store (or overwrite) the cluster definition in the config first.
+    members = payload.get("members") or []
+    definition = {
+        "title": payload.get("title") or slug,
+        "description": payload.get("description") or "",
+        "takeaway": payload.get("takeaway") or "",
+        "members": members,
+        "tags": payload.get("tags") or [],
+    }
+    cfg["clusters"][slug] = definition
+    save_config(wiki_root, cfg)
+
+    # Then render the page from what the config now holds.
+    page = wiki_root / GUIDELINES_DIR / f"{slug}__cluster.md"
+    page.parent.mkdir(parents=True, exist_ok=True)
+    page.write_text(_render_cluster_md(slug, cfg["clusters"][slug], wiki_root), encoding="utf-8")
+    print(f"wrote: {page}")
+
+    # Optionally archive the member atomics that the cluster page now stands for.
+    if getattr(args, "archive_members", False):
+        archived = sum(
+            1
+            for gid in members
+            if _archive_atomic(wiki_root, gid, reason="covered_by_cluster", target_slug=slug)
+        )
+        if archived:
+            print(f"  archived {archived} member atomic(s) to {_ARCHIVED_DIR}/")
+    _refresh_agent_retrieval_indexes(wiki_root)
+    print("refreshed: skills/index.md, guidelines/index.md, _index.jsonl")
+    return 0
+
+
+def _render_cluster_md(slug: str, info: dict, wiki_root: Path) -> str:
+    """Build the Markdown text (frontmatter plus body) of a cluster page.
+
+    Args:
+        slug: Cluster slug; also the title when ``info`` has none.
+        info: Cluster definition; ``title``, ``tags``, ``members``,
+            ``description`` and ``takeaway`` are read.
+        wiki_root: Wiki root, used to load the id index and member pages.
+
+    Returns:
+        The page text, lines joined with ``\\n``. Members found in the id index
+        are linked by file name and shown with their trigger, source summary
+        and a snippet; unknown members appear with their bare id only.
+    """
+    today = datetime.date.today().isoformat()
+    member_ids = info.get("members") or []
+    id_index = _load_id_index(wiki_root)
+    heading = info.get("title") or slug
+
+    front = [
+        "---",
+        "type: cluster",
+        f"slug: {slug}",
+        f"title: {yaml_scalar(heading)}",
+        "tags: " + yaml_scalar(info.get("tags") or []),
+        f"verified_at: {today}",
+        "members:",
+    ]
+    for gid in member_ids:
+        target = id_index.get(gid)
+        front.append(f"  - id: {gid}")
+        if target:
+            # The cluster page sits next to the member pages in the guidelines
+            # directory, so the bare file name is the relative link.
+            front.append(f"    link: {Path(target).name}")
+    front += ["priority: high", "---", ""]
+
+    body = [f"# {heading}", "", info.get("description") or "", ""]
+    takeaway = (info.get("takeaway") or "").strip()
+    if takeaway:
+        body += ["## Takeaway", "", takeaway, ""]
+    body += [
+        "## Members",
+        "",
+        "These guidelines are kept as separate pages for full provenance back to "
+        "their source trajectories. The cluster references them; nothing is moved "
+        "or merged.",
+        "",
+    ]
+    for gid in member_ids:
+        target = id_index.get(gid)
+        if target:
+            title, snippet, trigger, related = _read_guideline_meta(wiki_root, target)
+            href = Path(target).name
+        else:
+            title, snippet, trigger, related = gid, "", "", ""
+            href = gid
+        body += [f"### [{title}]({href})", "", f"- **id:** `{gid}`"]
+        if trigger:
+            body.append(f"- **trigger:** {trigger}")
+        if related:
+            body.append(f"- **source:** [{related.replace('summaries/', '')[:14]}](../{related})")
+        if snippet:
+            body += ["", f"> {snippet}"]
+        body.append("")
+    return "\n".join(front + body)
+
+
+def _read_guideline_meta(wiki_root: Path, relpath: str) -> tuple[str, str, str, str]:
+    """Pull display metadata out of a guideline page.
+
+    Returns:
+        ``(title, snippet, trigger, related)``. ``title`` is the first
+        ``# heading`` of the body (``relpath`` when there is none), ``snippet``
+        the first paragraph after it, cut at a word boundary near 300
+        characters, ``trigger`` and ``related`` the ``trigger:`` and
+        ``related_summary:`` frontmatter values. A missing page yields
+        ``(relpath, "", "", "")``.
+    """
+    page = wiki_root / relpath
+    if not page.exists():
+        return relpath, "", "", ""
+    fm, body = split_frontmatter(page.read_text(encoding="utf-8"))
+    fm = fm or ""
+    body = body or ""
+
+    def first_group(pattern: str, text: str, flags: int) -> str | None:
+        """Return group 1 of the first match of ``pattern``, else None."""
+        found = re.search(pattern, text, flags)
+        return found.group(1) if found else None
+
+    heading = first_group(r"^# (.+)$", body, re.MULTILINE)
+    title = relpath if heading is None else heading.strip()
+
+    raw_trigger = first_group(r"^trigger:\s*(.+)$", fm, re.MULTILINE)
+    trigger = "" if raw_trigger is None else raw_trigger.strip()
+    if trigger.startswith('"') and trigger.endswith('"'):
+        # A double-quoted scalar is decoded as a JSON string; if that fails the
+        # raw text is kept as it is.
+        try:
+            trigger = json.loads(trigger)
+        except Exception:
+            pass
+
+    raw_related = first_group(r"^related_summary:\s*(.+)$", fm, re.MULTILINE)
+    related = "" if raw_related is None else raw_related.strip()
+
+    lead = first_group(r"^# .+?\n\n(.+?)(?=\n\n|\n## |\Z)", body, re.S | re.M)
+    snippet = "" if lead is None else lead.replace("\n", " ").strip()
+    if len(snippet) > 300:
+        snippet = snippet[:300].rsplit(" ", 1)[0] + "…"
+    return title, snippet, trigger, related
+
+
+# ---------------------------------------------------------------------------
+# Subcommand: render-task
+# ---------------------------------------------------------------------------
+
+
+def cmd_render_task(args) -> int:
+    """Record a task in the wiki config and render its page.
+
+    The JSON object on stdin must carry a ``slug``; ``title``, ``family``,
+    ``family_match``, ``intro``, ``findings`` and ``tags`` are optional. The
+    page is written to ``<wiki_root>/TASKS_DIR/<slug>__task.md``.
+
+    Args:
+        args: Parsed CLI namespace; ``wiki_root`` is read.
+
+    Returns:
+        2 when stdin is not valid JSON, is not an object, or has no slug;
+        0 on success.
+    """
+    try:
+        data = json.load(sys.stdin)
+    except json.JSONDecodeError as exc:
+        print(f"error: invalid JSON on stdin: {exc}", file=sys.stderr)
+        return 2
+    # Valid JSON that is not an object (a list, string, number, ...) has no
+    # `.get`; reject it with the message the other render-* subcommands use
+    # instead of failing with an AttributeError traceback.
+    if not isinstance(data, dict):
+        print("error: top-level JSON must be an object", file=sys.stderr)
+        return 2
+    slug = (data.get("slug") or "").strip()
+    if not slug:
+        print("error: missing slug", file=sys.stderr)
+        return 2
+
+    wiki_root = find_wiki_root(override=args.wiki_root)
+    cfg = load_config(wiki_root)
+    # Store (or overwrite) the task definition, then render from the config.
+    cfg["tasks"][slug] = {
+        "title": data.get("title") or slug,
+        "family": data.get("family") or slug,
+        "family_match": data.get("family_match") or {},
+        "intro": data.get("intro") or "",
+        "findings": data.get("findings") or "",
+        "tags": data.get("tags") or [],
+    }
+    save_config(wiki_root, cfg)
+
+    out = wiki_root / TASKS_DIR / f"{slug}__task.md"
+    out.parent.mkdir(parents=True, exist_ok=True)
+    out.write_text(_render_task_md(slug, cfg["tasks"][slug], wiki_root, cfg), encoding="utf-8")
+    print(f"wrote: {out}")
+    return 0
+
+
+# ---------------------------------------------------------------------------
+# Subcommand: render-skill
+# ---------------------------------------------------------------------------
+
+
+# Keys that must be present with a truthy value in the render-skill payload.
+SKILL_REQUIRED = ("name", "description", "workflow_steps")
+# Kebab-case: starts with a lowercase letter, then lowercase letters/digits in
+# groups separated by single hyphens.
+_SKILL_NAME_RE = re.compile(r"^[a-z][a-z0-9]*(?:-[a-z0-9]+)*$")
+# Script file names: a word character followed by word characters, dots or
+# hyphens, which leaves no room for a path separator.
+_SKILL_FILENAME_RE = re.compile(r"^[\w][\w.-]*$")
+
+
+def cmd_render_skill(args) -> int:
+    """Render a skill directory from a JSON object read on stdin.
+
+    Steps, in order:
+      1. Validate the payload (`SKILL_REQUIRED` keys, kebab-case ``name``).
+      2. Write ``<wiki_root>/SKILLS_DIR/<name>/SKILL.md`` unless it already
+         exists and ``args.rewrite`` is falsy (in which case nothing else runs).
+      3. Write each valid entry of ``scripts`` into the ``scripts/``
+         subdirectory; ``.sh``/``.bash`` files are made executable.
+      4. Register the skill in the skills id index.
+      5. Append a ``synthesize_skill`` audit entry when a ``session_id`` is given.
+      6. When ``args`` has a truthy ``archive_covered``, archive every atomic
+         guideline that `_skill_covers_atomic` says this skill covers.
+      7. Refresh the retrieval indexes.
+
+    Args:
+        args: Parsed CLI namespace; ``wiki_root`` and ``rewrite`` are read, and
+            ``archive_covered`` if present.
+
+    Returns:
+        2 when stdin is not valid JSON, is not an object, lacks a required
+        field, or the name is not kebab-case; 0 otherwise, including the
+        skip-because-it-exists case.
+    """
+    try:
+        data = json.load(sys.stdin)
+    except json.JSONDecodeError as exc:
+        print(f"error: invalid JSON on stdin: {exc}", file=sys.stderr)
+        return 2
+    if not isinstance(data, dict):
+        print("error: top-level JSON must be an object", file=sys.stderr)
+        return 2
+    missing = [k for k in SKILL_REQUIRED if not data.get(k)]
+    if missing:
+        print(f"error: missing required field(s): {', '.join(missing)}", file=sys.stderr)
+        return 2
+    name = str(data["name"]).strip()
+    if not _SKILL_NAME_RE.match(name):
+        print(f"error: name {name!r} is not kebab-case", file=sys.stderr)
+        return 2
+
+    wiki_root = find_wiki_root(override=args.wiki_root)
+    out_dir = wiki_root / SKILLS_DIR / name
+    skill_md = out_dir / "SKILL.md"
+    # An existing skill is left completely untouched unless --rewrite was given.
+    if skill_md.exists() and not args.rewrite:
+        print(f"skip (exists): {skill_md}")
+        return 0
+
+    out_dir.mkdir(parents=True, exist_ok=True)
+
+    # Author SKILL.md
+    today = datetime.date.today().isoformat()
+    desc = str(data["description"]).strip()
+    trigger = (data.get("trigger") or "").strip()
+    sid = (data.get("session_id") or "").strip()
+    related = (data.get("related_summary") or "").strip()
+    # Without an explicit summary link, point at the session's single summary.
+    if not related and sid:
+        related = f"{SUMMARIES_DIR}/{sid}.md"
+    norm_path = (data.get("normalized_path") or "").strip()
+    agent_id = (data.get("agent") or "claude-code").strip()
+    tags = data.get("tags") or []
+
+    # Frontmatter
+    fm = ["---"]
+    fm.append(f"id: skill:{name}")
+    fm.append("type: skill")
+    fm.append(f"name: {name}")
+    fm.append(f"description: {yaml_scalar(desc)}")
+    if trigger:
+        fm.append(f"trigger: {yaml_scalar(trigger)}")
+    fm.append(f"agent: {agent_id}")
+    if norm_path:
+        fm.append("sources:")
+        fm.append(f"  - {norm_path}")
+    if related:
+        fm.append(f"related_summary: {related}")
+    fm.append(f"verified_at: {today}")
+    if tags:
+        fm.append("tags: [" + ", ".join(yaml_scalar(t) for t in tags) + "]")
+    fm.append("---")
+    fm.append("")
+
+    # Body
+    body: list[str] = []
+    # Default title: the kebab-case name with spaces, title-cased.
+    title = data.get("title") or name.replace("-", " ").title()
+    body.append(f"# {title}")
+    body.append("")
+    # The overview falls back to the description when none is supplied.
+    overview = (data.get("overview") or desc).strip()
+    body.append("## Overview")
+    body.append("")
+    body.append(overview)
+    body.append("")
+
+    when_to_use = data.get("when_to_use") or []
+    if when_to_use:
+        body.append("## When To Use")
+        body.append("")
+        for line in when_to_use:
+            body.append(f"- {line}")
+        body.append("")
+
+    # Workflow steps are rendered as a numbered list starting at 1.
+    workflow = data.get("workflow_steps") or []
+    body.append("## Workflow")
+    body.append("")
+    for i, step in enumerate(workflow, start=1):
+        body.append(f"{i}. {step}")
+    body.append("")
+
+    # Sources footer
+    body.append("## Sources")
+    body.append("")
+    if related:
+        # SKILL.md sits two directories below the wiki root.
+        body.append(f"- [trajectory summary](../../{related})")
+    if norm_path:
+        body.append(f"- [normalized JSON]({norm_path})")
+    body.append("")
+
+    skill_md.write_text("\n".join(fm + body), encoding="utf-8")
+    print(f"wrote: {skill_md}")
+
+    # Sibling scripts: entries that are not dicts, lack a name or content, or
+    # have an invalid file name are skipped.
+    scripts = data.get("scripts") or []
+    scripts_dir = out_dir / "scripts"
+    written_scripts: list[str] = []
+    for s in scripts:
+        if not isinstance(s, dict):
+            continue
+        sname = (s.get("name") or "").strip()
+        scontent = s.get("content")
+        if not sname or not scontent:
+            continue
+        if not _SKILL_FILENAME_RE.match(sname):
+            print(f"warning: skipping invalid script name {sname!r}", file=sys.stderr)
+            continue
+        # The directory is created lazily, only once a script is actually written.
+        scripts_dir.mkdir(parents=True, exist_ok=True)
+        sp = scripts_dir / sname
+        sp.write_text(scontent, encoding="utf-8")
+        # Shell scripts are made executable.
+        if sname.endswith((".sh", ".bash")):
+            sp.chmod(0o755)
+        written_scripts.append(sname)
+        print(f"  + {sp}")
+
+    # Update _id_index.json (an unreadable or malformed index starts over empty)
+    idx_path = wiki_root / SKILLS_DIR / ID_INDEX_FILENAME
+    idx = {}
+    if idx_path.exists():
+        try:
+            idx = json.loads(idx_path.read_text(encoding="utf-8"))
+        except (OSError, json.JSONDecodeError):
+            idx = {}
+    idx[name] = f"{SKILLS_DIR}/{name}/SKILL.md"
+    idx_path.parent.mkdir(parents=True, exist_ok=True)
+    idx_path.write_text(json.dumps(idx, indent=2, sort_keys=True) + "\n", encoding="utf-8")
+
+    # Audit: recorded only when the payload names the source session.
+    if sid:
+        _audit_append(
+            wiki_root,
+            {
+                "action": "synthesize_skill",
+                "session_id": sid,
+                "skill_name": name,
+                "scripts": written_scripts,
+            },
+        )
+        print(f"audit: synthesize_skill recorded for {name}")
+
+    # Archive atomic guidelines this skill covers (delete-on-promote).
+    # Cross-trajectory archival uses only the disciplined tag-superset path;
+    # the weak lexical paths (slug-token, format-token) fire only for atomics
+    # from the SAME trajectory this skill was synthesized from. See
+    # `_skill_covers_atomic` for the rationale.
+    if getattr(args, "archive_covered", False):
+        skill_tags = set(tags or [])
+        archived: list[str] = []
+        # _scan_atomic_guidelines is defined later in the file; the name is
+        # looked up when this function runs, so it is reachable from here.
+        for gid, info in _scan_atomic_guidelines(wiki_root).items():
+            atomic_tags = set(info.get("tags") or [])
+            atomic_title = info.get("title") or ""
+            # Same session only when this skill has a session id and the
+            # atomic's related summary belongs to that session.
+            same_session = bool(sid) and _session_of_atomic(info) == sid
+            if _skill_covers_atomic(skill_tags, name, atomic_tags, atomic_title, desc, same_session=same_session):
+                if _archive_atomic(wiki_root, gid, reason="covered_by_skill", target_slug=name):
+                    archived.append(gid)
+        if archived:
+            print(f"  archived {len(archived)} covered atomic(s): {', '.join(archived)}")
+    _refresh_agent_retrieval_indexes(wiki_root)
+    print("refreshed: skills/index.md, guidelines/index.md, _index.jsonl")
+    return 0
+
+
+# ---------------------------------------------------------------------------
+# Subcommand: render-subtask
+# ---------------------------------------------------------------------------
+
+
+def cmd_render_subtask(args) -> int:
+    """Render a subtask page: a narrative slice within a single session.
+
+    Stdin JSON:
+      {
+        "slug":             "<kebab-case identifier>",
+        "title":            "<short title>",
+        "parent_session_id":"<session_id, full UUID>",
+        "parent_summary":   "<filename of parent summary in summaries/, e.g. abc123__arc1.md>",
+        "tags":             ["..."],
+        "narrative":        "<one-or-two paragraphs>",
+        "key_steps":        ["...", "..."]    # optional
+      }
+
+    At least one of `parent_summary` / `parent_session_id` is required. The
+    page is written to `<wiki>/<TASKS_DIR>/<slug>__subtask.md`.
+
+    Returns 0 on success, or 2 (with the reason on stderr) when stdin is not
+    a JSON object or a required field is missing or invalid.
+    """
+    try:
+        data = json.load(sys.stdin)
+    except json.JSONDecodeError as exc:
+        print(f"error: invalid JSON on stdin: {exc}", file=sys.stderr)
+        return 2
+    # Valid JSON can still be a list/string/number; every lookup below needs
+    # a mapping, so reject anything else instead of failing on `.get`.
+    if not isinstance(data, dict):
+        print("error: top-level JSON must be an object", file=sys.stderr)
+        return 2
+    slug = str(data.get("slug") or "").strip()
+    if not slug:
+        print("error: missing slug", file=sys.stderr)
+        return 2
+    # The slug becomes part of the output filename; a path separator would
+    # let the page be written outside the tasks directory.
+    if "/" in slug or "\\" in slug:
+        print("error: slug must not contain path separators", file=sys.stderr)
+        return 2
+    parent_summary = str(data.get("parent_summary") or "").strip()
+    parent_sid = str(data.get("parent_session_id") or "").strip()
+    if not parent_summary and not parent_sid:
+        print("error: subtask requires parent_summary or parent_session_id", file=sys.stderr)
+        return 2
+    wiki_root = find_wiki_root(override=args.wiki_root)
+    out_dir = wiki_root / TASKS_DIR
+    out_dir.mkdir(parents=True, exist_ok=True)
+    out = out_dir / f"{slug}__subtask.md"
+    today = datetime.date.today().isoformat()
+
+    title = data.get("title") or slug
+    tags = data.get("tags") or []
+    narrative = str(data.get("narrative") or "").strip()
+    key_steps = data.get("key_steps") or []
+    # A bare string would otherwise be iterated one character per bullet.
+    if isinstance(key_steps, str):
+        key_steps = [key_steps]
+
+    fm = ["---", "type: subtask", f"slug: {slug}", f"title: {yaml_scalar(title)}"]
+    if parent_sid:
+        fm.append(f"parent_session_id: {parent_sid}")
+    if parent_summary:
+        fm.append(f"parent_summary: {SUMMARIES_DIR}/{parent_summary}")
+    fm.append("tags: " + yaml_scalar(tags))
+    fm.append(f"verified_at: {today}")
+    fm.append("---")
+    fm.append("")
+
+    body = [f"# {title}", ""]
+    if narrative:
+        body.append(narrative)
+        body.append("")
+    if key_steps:
+        body.append("## Key steps")
+        body.append("")
+        for s in key_steps:
+            body.append(f"- {s}")
+        body.append("")
+    if parent_summary:
+        body.append("## Parent summary")
+        body.append("")
+        body.append(f"- [{parent_summary}](../{SUMMARIES_DIR}/{parent_summary})")
+        body.append("")
+
+    # Every section ends with an empty element, so the joined text ends in "\n".
+    out.write_text("\n".join(fm + body), encoding="utf-8")
+    print(f"wrote: {out}")
+    return 0
+
+
+def _render_task_md(slug: str, info: dict, wiki_root: Path, cfg: dict) -> str:
+    """Build the markdown text of a task-comparison page.
+
+    Sessions come from `_classify_sessions`; only those whose `family` equals
+    `info["family"]` are listed, ordered by (condition, trial). The result is
+    the frontmatter followed by an optional intro, the comparison table and an
+    optional findings section, joined into one string.
+    """
+    stamp = datetime.date.today().isoformat()
+    wanted_family = info.get("family")
+    members = [rec for rec in _classify_sessions(wiki_root, cfg) if rec["family"] == wanted_family]
+    members.sort(key=lambda rec: (rec.get("condition") or "", rec.get("trial") or 0))
+    heading = info.get("title") or slug
+
+    page: list[str] = [
+        "---",
+        "type: task-comparison",
+        f"slug: {slug}",
+        f"title: {yaml_scalar(heading)}",
+        "tags: " + yaml_scalar(info.get("tags") or []),
+        f"verified_at: {stamp}",
+        f"sessions: {len(members)}",
+        "---",
+        "",
+        f"# {heading}",
+        "",
+    ]
+
+    lead = (info.get("intro") or "").strip()
+    if lead:
+        page += [lead, ""]
+
+    page += [
+        "## Comparison",
+        "",
+        "| Trial | Condition | Session | Tool calls | Errors | Wiki used | Contributed guidelines |",
+        "|-------|-----------|---------|-----------:|-------:|:------:|------------------------|",
+    ]
+    for rec in members:
+        session_id = rec["session_id"]
+        # Link to the summary file this record was read from; fall back to
+        # `<session_id>.md` when the record carries no basename.
+        target = rec.get("summary_basename") or f"{session_id}.md"
+        guideline_ids = rec.get("contributed_guidelines") or []
+        cells = (
+            str(rec.get("trial") or "—"),
+            rec.get("condition") or "—",
+            f"[{session_id[:8]}…](../{SUMMARIES_DIR}/{target})",
+            rec.get("tool_calls") or 0,
+            rec.get("errors") or 0,
+            "Y" if rec.get("wiki_consulted") else "—",
+            ", ".join(f"`{g}`" for g in guideline_ids) or "—",
+        )
+        page.append("| " + " | ".join(f"{cell}" for cell in cells) + " |")
+    page.append("")
+
+    conclusions = (info.get("findings") or "").strip()
+    if conclusions:
+        page += ["## Findings", "", conclusions, ""]
+    return "\n".join(page)
+
+
+# ---------------------------------------------------------------------------
+# Session classification (shared helper)
+# ---------------------------------------------------------------------------
+
+
+def _classify_sessions(wiki_root: Path, cfg: dict) -> list[dict]:
+    """Parse every summary page's frontmatter into one record per summary file.
+
+    Walks `<wiki_root>/<SUMMARIES_DIR>/*.md` (skipping `index.md` and files
+    without frontmatter), reads fields with line-anchored regexes, and
+    classifies each session via `_classify_one`.
+
+    Args:
+        wiki_root: Root directory of the wiki.
+        cfg: Loaded config; `session_family_overrides` and `tasks` are read.
+
+    Returns:
+        A list of dicts (sorted by file name) with the keys `session_id`,
+        `arc`, `goal`, `normalized_path`, `family`, `trial`, `condition`, the
+        metric fields (None when absent or malformed), the two
+        `contributed_*` lists, `summary_path` and `summary_basename`. Empty
+        when the summaries directory does not exist.
+    """
+    overrides = cfg.get("session_family_overrides") or {}
+    tasks_cfg = cfg.get("tasks") or {}
+    sessions: list[dict] = []
+    summaries_dir = wiki_root / SUMMARIES_DIR
+    if not summaries_dir.is_dir():
+        return sessions
+
+    # Frontmatter readers. They take the frontmatter text as an argument so
+    # they are defined once instead of being rebuilt for every file.
+    def fm_text(fm: str, key: str) -> str:
+        m = re.search(rf"^{key}:\s*(.+)$", fm, re.MULTILINE)
+        raw = m.group(1).strip() if m else ""
+        # Double-quoted scalars are decoded as JSON strings; anything that
+        # fails to decode is kept verbatim.
+        if raw.startswith('"') and raw.endswith('"'):
+            try:
+                return json.loads(raw)
+            except json.JSONDecodeError:
+                pass
+        return raw
+
+    def fm_int(fm: str, key: str):
+        m = re.search(rf"^{key}:\s*(\d+)\s*$", fm, re.MULTILINE)
+        return int(m.group(1)) if m else None
+
+    def fm_float(fm: str, key: str):
+        m = re.search(rf"^{key}:\s*([\d.]+)\s*$", fm, re.MULTILINE)
+        if not m:
+            return None
+        # `[\d.]+` also matches text like "1.2.3" or "."; treat those as
+        # missing rather than letting float() abort the whole scan.
+        try:
+            return float(m.group(1))
+        except ValueError:
+            return None
+
+    def fm_bool(fm: str, key: str):
+        m = re.search(rf"^{key}:\s*(true|false)\s*$", fm, re.MULTILINE)
+        return m.group(1) == "true" if m else None
+
+    def fm_list(fm: str, key: str) -> list:
+        # inline list: key: [a, b]
+        m = re.search(rf"^{key}:\s*\[(.*?)\]\s*$", fm, re.MULTILINE)
+        if m:
+            return [x.strip() for x in m.group(1).split(",") if x.strip()]
+        # block list: key:\n  - a\n  - b
+        m = re.search(rf"^{key}:\s*\n((?:  - .+\n)+)", fm, re.MULTILINE)
+        if m:
+            return [line[4:].strip() for line in m.group(1).splitlines() if line.startswith("  - ")]
+        return []
+
+    for p in sorted(summaries_dir.glob("*.md")):
+        if p.name == "index.md":
+            continue
+        fm, _ = split_frontmatter(p.read_text(encoding="utf-8"))
+        if fm is None:
+            continue
+        sid_m = re.search(r"^session_id:\s*(.+)$", fm, re.MULTILINE)
+        sid = sid_m.group(1).strip() if sid_m else p.stem
+        goal = fm_text(fm, "goal")
+        # NOTE(review): this collects every two-space-indented `- item` in the
+        # frontmatter, not only entries under a `sources:` key — confirm that
+        # the sources list is always the first block list in a summary.
+        sources = re.findall(r"^  - (\S+)", fm, re.MULTILINE)
+        np = sources[0] if sources else ""
+        path_haystack = " ".join(sources)
+
+        family, trial, condition = _classify_one(sid, goal, path_haystack, overrides, tasks_cfg)
+
+        sessions.append(
+            {
+                "session_id": sid,
+                "arc": fm_text(fm, "arc"),
+                "goal": goal,
+                "normalized_path": np,
+                "family": family,
+                "trial": trial,
+                "condition": condition,
+                # metrics from existing fm (added by catalog)
+                "tool_calls": fm_int(fm, "tool_calls"),
+                "errors": fm_int(fm, "errors"),
+                "wiki_consulted": fm_bool(fm, "wiki_consulted"),
+                "contributed_guidelines": fm_list(fm, "contributed_guidelines"),
+                "contributed_skills": fm_list(fm, "contributed_skills"),
+                "input_tokens": fm_int(fm, "input_tokens"),
+                "cache_creation_input_tokens": fm_int(fm, "cache_creation_input_tokens"),
+                "cache_read_input_tokens": fm_int(fm, "cache_read_input_tokens"),
+                "output_tokens": fm_int(fm, "output_tokens"),
+                "total_cost_usd": fm_float(fm, "total_cost_usd"),
+                "summary_path": p,
+                "summary_basename": p.name,
+            }
+        )
+    return sessions
+
+
+def _classify_one(sid: str, goal: str, np: str, overrides: dict, tasks_cfg: dict) -> tuple[str | None, int | None, str | None]:
+    """Work out (family, trial, condition) for one session.
+
+    An entry for `sid` in `overrides` wins outright. Otherwise the family is
+    the first task whose `family_match.goal_substring` entries occur
+    (case-insensitively) in `goal`, and trial/condition are parsed from a
+    `trial_<n>_<condition>` fragment in `np`. Anything not found is None.
+    """
+    if sid in overrides:
+        pinned = overrides[sid] or {}
+        return pinned.get("family"), pinned.get("trial"), pinned.get("condition")
+
+    goal_lc = goal.lower()
+    family = None
+    for slug, info in tasks_cfg.items():
+        needles = ((info or {}).get("family_match") or {}).get("goal_substring") or []
+        if any(needle.lower() in goal_lc for needle in needles):
+            family = info.get("family") or slug
+        # Only a truthy family ends the search.
+        if family:
+            break
+
+    hit = re.search(r"trial_(\d+)_(seed|no_recall|guidelines|skill)", np)
+    if hit is None:
+        return family, None, None
+    return family, int(hit.group(1)), hit.group(2).replace("_", "-")
+
+
+# ---------------------------------------------------------------------------
+# Subcommand: update-config (patch)
+# ---------------------------------------------------------------------------
+
+
+def cmd_update_config(args) -> int:
+    """Deep-merge a JSON object read from stdin into the wiki config and save it.
+
+    Returns 0 after saving, or 2 (with a message on stderr) when stdin is not
+    valid JSON or its top-level value is not an object.
+    """
+    try:
+        incoming = json.load(sys.stdin)
+    except json.JSONDecodeError as exc:
+        print(f"error: invalid JSON on stdin: {exc}", file=sys.stderr)
+        return 2
+
+    if isinstance(incoming, dict):
+        root = find_wiki_root(override=args.wiki_root)
+        config = load_config(root)
+        _deep_merge(config, incoming)
+        save_config(root, config)
+        print(f"updated: {root / CONFIG_FILENAME}")
+        return 0
+
+    print("error: top-level JSON must be an object", file=sys.stderr)
+    return 2
+
+
+def _deep_merge(dst: dict, src: dict) -> None:
+    """Merge `src` into `dst` in place.
+
+    Where both sides hold a dict under the same key the merge recurses;
+    in every other case the value from `src` replaces the one in `dst`.
+    """
+    for key in src:
+        incoming = src[key]
+        current = dst.get(key)
+        if not (isinstance(incoming, dict) and isinstance(current, dict)):
+            dst[key] = incoming
+            continue
+        _deep_merge(current, incoming)
+
+
+# ---------------------------------------------------------------------------
+# Subcommand: dump-guidelines
+# ---------------------------------------------------------------------------
+
+
+def cmd_dump_guidelines(args) -> int:
+    """Print every guideline page (atomic and cluster) as a JSON array on stdout.
+
+    Each element carries `id`, `filename`, `title`, `trigger`, `cluster`,
+    `is_cluster_page` and `content` (the first paragraph after the H1).
+    Pages without frontmatter and `index.md` are skipped. Prints `[]` when
+    the guidelines directory does not exist. Always returns 0.
+    """
+    wiki_root = find_wiki_root(override=args.wiki_root)
+    out = []
+    g_dir = wiki_root / GUIDELINES_DIR
+    if not g_dir.is_dir():
+        print("[]", end="")
+        return 0
+    for p in sorted(g_dir.glob("*.md")):
+        if p.name == "index.md":
+            continue
+        text = p.read_text(encoding="utf-8")
+        fm, body = split_frontmatter(text)
+        if fm is None:
+            continue
+        body = body or ""
+        gid_m = re.search(r"^id:\s*(\S+)", fm, re.MULTILINE)
+        title_m = re.search(r"^# (.+)$", body, re.MULTILINE)
+        trig_m = re.search(r"^trigger:\s*(.+)$", fm, re.MULTILINE)
+        cluster_m = re.search(r"^cluster:\s*(.+)$", fm, re.MULTILINE)
+        cm = re.search(r"^# .+?\n\n(.+?)(?=\n\n|\n## |\Z)", body, re.S | re.M)
+        if gid_m:
+            gid = gid_m.group(1).strip()
+        else:
+            # Hash the same text `_scan_atomic_guidelines` hashes (first
+            # paragraph, else the whole body) so a page without an explicit
+            # `id:` gets the same fallback id here as in the catalog.
+            gid = compute_entity_id(cm.group(1) if cm else body)
+        out.append(
+            {
+                "id": gid,
+                "filename": p.name,
+                "title": (title_m.group(1).strip() if title_m else ""),
+                "trigger": (trig_m.group(1).strip() if trig_m else ""),
+                "cluster": (cluster_m.group(1).strip() if cluster_m else None),
+                "is_cluster_page": p.name.endswith("__cluster.md"),
+                "content": (cm.group(1).strip() if cm else ""),
+            }
+        )
+    json.dump(out, sys.stdout, ensure_ascii=False, indent=2)
+    sys.stdout.write("\n")
+    return 0
+
+
+# ---------------------------------------------------------------------------
+# Subcommand: dump-summaries
+# ---------------------------------------------------------------------------
+
+
+def cmd_dump_summaries(args) -> int:
+    """Print the classified session summaries as a JSON array on stdout.
+
+    Each element is a subset of the record built by `_classify_sessions`,
+    plus `summary_filename` (the name of the summary file). Always returns 0.
+    """
+    root = find_wiki_root(override=args.wiki_root)
+    copied_keys = (
+        "session_id",
+        "goal",
+        "family",
+        "trial",
+        "condition",
+        "tool_calls",
+        "errors",
+        "wiki_consulted",
+    )
+    records = []
+    for session in _classify_sessions(root, load_config(root)):
+        record = {key: session[key] for key in copied_keys}
+        record["summary_filename"] = session["summary_path"].name
+        records.append(record)
+    json.dump(records, sys.stdout, ensure_ascii=False, indent=2)
+    sys.stdout.write("\n")
+    return 0
+
+
+# ---------------------------------------------------------------------------
+# Subcommand: catalog (the big bookkeeping pass)
+# ---------------------------------------------------------------------------
+
+
+def cmd_catalog(args) -> int:
+    """Run the full bookkeeping pass over the wiki.
+
+    Phases, in order: bootstrap AGENTS.md if absent; enrich atomic guideline
+    frontmatter (tags, cluster links, repaired `related_summary`); regenerate
+    cluster pages; enrich summary frontmatter; refresh `## Used by` sections;
+    regenerate task pages; regenerate the index pages; regenerate and verify
+    `_index.jsonl`. Prints a one-line tally and returns 0.
+    """
+    wiki_root = find_wiki_root(override=args.wiki_root)
+    wiki_root.mkdir(parents=True, exist_ok=True)
+    cfg = load_config(wiki_root)
+    today = datetime.date.today().isoformat()
+
+    # Phase 0: bootstrap AGENTS.md from the bundled template if absent.
+    # Subsequent runs leave a present file alone — the user owns AGENTS.md
+    # after first bootstrap.
+    agents_path = wiki_root / "AGENTS.md"
+    if not agents_path.exists():
+        seed = SCRIPT_DIR / "_default_agents.md"
+        if seed.exists():
+            agents_path.write_text(seed.read_text(encoding="utf-8"), encoding="utf-8")
+            print(f"bootstrapped {agents_path} from {seed.name}", file=sys.stderr)
+
+    # Phase 1: enrich atomic guideline frontmatter from config
+    g_meta = _scan_atomic_guidelines(wiki_root)
+    tag_map = (cfg.get("tags") or {}).get("guideline") or {}
+    cluster_map = {}
+    for slug, info in (cfg.get("clusters") or {}).items():
+        for gid in (info or {}).get("members") or []:
+            cluster_map[gid] = slug
+
+    summaries_dir = wiki_root / SUMMARIES_DIR
+    enriched = 0
+    repaired_links = 0
+    for gid, info in g_meta.items():
+        text = info["path"].read_text(encoding="utf-8")
+        additions: dict[str, Any] = {"verified_at": today}
+        if tag_map.get(gid):
+            additions["tags"] = list(tag_map[gid])
+        cluster_slug = cluster_map.get(gid)
+        if cluster_slug:
+            additions["cluster"] = f"{cluster_slug}__cluster.md"
+            additions["superseded_by"] = f"{cluster_slug}__cluster.md"
+        # Auto-repair dangling `related_summary:` when the linked file is
+        # missing AND the session has arc-split summaries on disk. Picks the
+        # first arc lex-sorted; emits a stderr warning so the user can override
+        # by editing the frontmatter directly.
+        related = (info.get("related_summary") or "").strip()
+        if related and summaries_dir.is_dir():
+            related_basename = Path(related).name
+            if not (summaries_dir / related_basename).exists():
+                # try to find sibling arc-summaries for this session_id
+                sid_stem = related_basename.removesuffix(".md")
+                candidates = sorted(p.name for p in summaries_dir.glob(f"{sid_stem}__*.md"))
+                if candidates:
+                    new_related = f"{SUMMARIES_DIR}/{candidates[0]}"
+                    additions["related_summary"] = new_related
+                    # Keep the in-memory metadata in step with the file:
+                    # later phases invert `related_summary` from g_meta, and
+                    # the stale value would drop this guideline from the
+                    # summary's contributed_guidelines until the next run.
+                    info["related_summary"] = new_related
+                    if len(candidates) > 1:
+                        print(
+                            f"warning: {info['path'].name} pointed at missing {related_basename}; "
+                            f"repaired to {candidates[0]} (one of {len(candidates)} arc-summaries — "
+                            f"override via the frontmatter if a different arc is canonical)",
+                            file=sys.stderr,
+                        )
+                    repaired_links += 1
+        new_text = upsert_fields(text, additions, force_replace=("verified_at", "tags", "cluster", "superseded_by", "related_summary"))
+        # If we repaired related_summary, also fix the body link.
+        if "related_summary" in additions:
+            old_link = f"../{SUMMARIES_DIR}/{Path(related).name}"
+            new_link = f"../{additions['related_summary']}"
+            new_text = new_text.replace(old_link, new_link)
+        if new_text != text:
+            info["path"].write_text(new_text, encoding="utf-8")
+            enriched += 1
+    if repaired_links:
+        print(f"repaired {repaired_links} dangling related_summary link(s)")
+
+    # Phase 2: regenerate cluster pages from config
+    clusters_written = 0
+    for slug, info in (cfg.get("clusters") or {}).items():
+        out = wiki_root / GUIDELINES_DIR / f"{slug}__cluster.md"
+        # Clusters may be configured before any guideline page exists; create
+        # the directory so the write cannot fail on a missing parent.
+        out.parent.mkdir(parents=True, exist_ok=True)
+        out.write_text(_render_cluster_md(slug, info or {}, wiki_root), encoding="utf-8")
+        clusters_written += 1
+
+    # Phase 3: enrich summary frontmatter (metrics + tags + contributed_guidelines).
+    # Must happen BEFORE task pages render — _render_task_md reads enriched
+    # frontmatter (tool_calls, wiki_consulted, contributed_guidelines) via
+    # _classify_sessions; running it post-enrichment avoids stale zeros.
+    enriched_summaries = _enrich_summaries(wiki_root, cfg, g_meta, today)
+
+    # Phase 3b: inject/refresh `## Used by` section on each atomic guideline
+    # by inverting the `recalled_guidelines:` blocks across all summaries.
+    used_by_updated = _inject_used_by_sections(wiki_root, g_meta)
+
+    # Phase 4: regenerate task pages from config (suffix: __task.md)
+    tasks_written = 0
+    for slug, info in (cfg.get("tasks") or {}).items():
+        out = wiki_root / TASKS_DIR / f"{slug}__task.md"
+        out.parent.mkdir(parents=True, exist_ok=True)
+        out.write_text(_render_task_md(slug, info or {}, wiki_root, cfg), encoding="utf-8")
+        tasks_written += 1
+        # Migration: clean up legacy <slug>.md from a prior version of this script.
+        legacy = wiki_root / TASKS_DIR / f"{slug}.md"
+        if legacy.exists() and legacy != out:
+            legacy.unlink()
+
+    # Phase 5: regenerate index pages.
+    # Re-classify summaries here (after the Phase 3 enrichment) so the index
+    # reflects the just-written tool_calls / wiki_consulted /
+    # contributed_guidelines frontmatter rather than pre-enrichment zeros.
+    sessions = _classify_sessions(wiki_root, cfg)
+    _write_root_index(wiki_root, cfg, g_meta, sessions, today)
+    _write_summaries_index(wiki_root, sessions, today)
+    _write_guidelines_index(wiki_root, g_meta, cfg, today)
+    _write_tasks_index(wiki_root, cfg, today)
+    _write_skills_index(wiki_root, _scan_skills(wiki_root), today)
+
+    # Phase 6: regenerate _index.jsonl
+    _write_jsonl_index(wiki_root, cfg, g_meta)
+    _assert_jsonl_index_integrity(wiki_root)
+
+    print(
+        f"catalog: enriched {enriched} guideline(s), wrote {clusters_written} cluster page(s), "
+        f"{tasks_written} task page(s), enriched {enriched_summaries} summary file(s), "
+        f"{used_by_updated} used-by section(s) updated"
+    )
+    return 0
+
+
+def _scan_atomic_guidelines(wiki_root: Path) -> dict[str, dict]:
+    """Collect metadata for every atomic guideline page, keyed by guideline id.
+
+    A page is atomic when it is neither `index.md` nor a `__cluster.md` file.
+    The id is the frontmatter `id:` when present, otherwise it is computed
+    from the first paragraph after the H1 (or the whole body when there is
+    none). Each value holds `path`, `relpath`, `title`, `trigger`,
+    `first_para` (at most 240 characters), `related_summary`, `verified_at`
+    and `tags`.
+
+    Side effect: when at least one guideline was found, the id -> relpath
+    mapping is handed to `_update_id_index`.
+    """
+    found: dict[str, dict] = {}
+    guidelines_dir = wiki_root / GUIDELINES_DIR
+    if not guidelines_dir.is_dir():
+        return found
+
+    def scalar(frontmatter: str, key: str) -> str:
+        hit = re.search(rf"^{key}:\s*(.+)$", frontmatter, re.MULTILINE)
+        return hit.group(1).strip() if hit else ""
+
+    for page in sorted(guidelines_dir.glob("*.md")):
+        if page.name == "index.md" or page.name.endswith("__cluster.md"):
+            continue
+        frontmatter, body = split_frontmatter(page.read_text(encoding="utf-8"))
+        if frontmatter is None:
+            continue
+        body = body or ""
+
+        # First paragraph after the H1; feeds both the fallback id and `first_para`.
+        para_hit = re.search(r"^# .+?\n\n(.+?)(?=\n\n|\n## |\Z)", body, re.S | re.M)
+
+        id_hit = re.search(r"^id:\s*(\S+)", frontmatter, re.MULTILINE)
+        if id_hit:
+            gid = id_hit.group(1).strip()
+        else:
+            gid = compute_entity_id(para_hit.group(1) if para_hit else body)
+
+        heading_hit = re.search(r"^# (.+)$", body, re.MULTILINE)
+        tags_hit = re.search(r"^tags:\s*\[(.*?)\]\s*$", frontmatter, re.MULTILINE)
+        tags: list[str] = []
+        if tags_hit:
+            tags = [piece.strip() for piece in tags_hit.group(1).split(",") if piece.strip()]
+
+        found[gid] = {
+            "path": page,
+            "relpath": f"{GUIDELINES_DIR}/{page.name}",
+            "title": heading_hit.group(1).strip() if heading_hit else page.name,
+            "trigger": scalar(frontmatter, "trigger"),
+            "first_para": (para_hit.group(1).replace("\n", " ").strip() if para_hit else "")[:240],
+            "related_summary": scalar(frontmatter, "related_summary"),
+            "verified_at": scalar(frontmatter, "verified_at"),
+            "tags": tags,
+        }
+
+    # also persist _id_index.json with current state
+    relpaths_by_id = {gid: meta["relpath"] for gid, meta in found.items()}
+    if relpaths_by_id:
+        _update_id_index(guidelines_dir, relpaths_by_id)
+    return found
+
+
+_CONDITION_IN_GOAL_RE = re.compile(r"(?<![A-Za-z0-9_])([a-z][a-z0-9_]+)/trial-\d+", re.IGNORECASE)
+
+
+def _extract_condition(goal: str, fm_data: dict) -> str:
+    """Find the trial condition slug of a summary; '' when none is detectable.
+
+    A non-blank string under `condition` in `fm_data` takes precedence.
+    Failing that, the slug is taken from a `<slug>/trial-N` fragment in the
+    goal text (e.g. 'claude_md_strong/trial-1', 'session_hook/trial-2').
+    """
+    declared = fm_data.get("condition")
+    if isinstance(declared, str):
+        declared = declared.strip()
+        if declared:
+            return declared
+    if isinstance(goal, str):
+        found = _CONDITION_IN_GOAL_RE.search(goal)
+        if found:
+            return found.group(1)
+    return ""
+
+
+def _scan_recalled_guidelines_in_summaries(wiki_root: Path) -> dict[str, list[dict]]:
+    """Build {gid -> [{summary_basename, summary_title, condition, status, evidence}]}
+    by parsing every summary's frontmatter via PyYAML.
+
+    Summaries are skipped when they have no frontmatter, when the frontmatter
+    is not valid YAML, or when it does not parse to a mapping. Entries of
+    `recalled_guidelines` that are not mappings or lack an `id` are ignored;
+    a missing `status` is recorded as "ignored". Returns an empty dict when
+    the summaries directory does not exist.
+    """
+    out: dict[str, list[dict]] = {}
+    summaries_dir = wiki_root / SUMMARIES_DIR
+    if not summaries_dir.is_dir():
+        return out
+    for p in sorted(summaries_dir.glob("*.md")):
+        if p.name == "index.md":
+            continue
+        text = p.read_text(encoding="utf-8")
+        fm, _ = split_frontmatter(text)
+        if fm is None:
+            continue
+        try:
+            data = yaml.safe_load(fm) or {}
+        except yaml.YAMLError:
+            continue
+        # Valid YAML is not necessarily a mapping (it can be a list or a bare
+        # scalar); such a file has no usable fields, so skip it rather than
+        # fail on `.get`.
+        if not isinstance(data, dict):
+            continue
+        rgs = data.get("recalled_guidelines") or []
+        # A scalar here (e.g. a number) is not iterable; treat anything that
+        # is not a list as "no recalls".
+        if not isinstance(rgs, list):
+            rgs = []
+        summary_title = data.get("goal") or p.stem
+        condition = _extract_condition(data.get("goal") or "", data)
+        for entry in rgs:
+            if not isinstance(entry, dict):
+                continue
+            gid = str(entry.get("id") or "").strip()
+            if not gid:
+                continue
+            status = str(entry.get("status") or "ignored").strip().lower()
+            evidence = entry.get("evidence")
+            evidence = evidence.strip() if isinstance(evidence, str) else ""
+            out.setdefault(gid, []).append(
+                {
+                    "summary_basename": p.name,
+                    "summary_title": summary_title,
+                    "condition": condition,
+                    "status": status,
+                    "evidence": evidence,
+                }
+            )
+    return out
+
+
+_USED_BY_RE = re.compile(
+    r"\n## Used by\n.*?(?=\n## |\Z)",
+    re.DOTALL,
+)
+
+
+def _inject_used_by_sections(wiki_root: Path, g_meta: dict) -> int:
+    """Write or refresh the `## Used by` section of every atomic guideline page.
+
+    The section tabulates each summary whose `recalled_guidelines:` names the
+    guideline, with its condition, status and evidence quote. An existing
+    section is removed before the new one is inserted, so re-running does not
+    duplicate it. The section goes immediately before `## Sources`, or at the
+    end of the page when there is no such heading.
+
+    The section is rendered even without recalls — it then reads
+    `_(no recalls yet)_`, which lets readers tell 'never recalled' apart from
+    'old wiki, missing section'. Only `recalled_guidelines:` entries count as
+    a use; contributing the guideline does not.
+
+    Returns the number of guideline pages whose text changed.
+    """
+    recalls_by_gid = _scan_recalled_guidelines_in_summaries(wiki_root)
+    changed = 0
+    for gid, info in g_meta.items():
+        recalls = sorted(recalls_by_gid.get(gid) or [], key=lambda rec: rec["summary_basename"])
+
+        section = ["", "## Used by", ""]
+        if not recalls:
+            section.append("_(no recalls yet)_")
+        else:
+            section.append("| Session | Condition | Status | Evidence |")
+            section.append("|---------|-----------|--------|----------|")
+            for rec in recalls:
+                basename = rec["summary_basename"]
+                label = basename.split(".md")[0][:18]
+                condition = rec.get("condition") or "—"
+                # Pipes and newlines would break the table row.
+                quote = (rec.get("evidence") or "").replace("|", "\\|").replace("\n", " ").strip()
+                if len(quote) > 200:
+                    quote = quote[:197] + "…"
+                evidence_cell = f'"{quote}"' if quote else "—"
+                section.append(f"| [{label}…](../{SUMMARIES_DIR}/{basename}) | `{condition}` | **{rec['status']}** | {evidence_cell} |")
+        rendered = "\n".join(section) + "\n"
+
+        page_path = info["path"]
+        original = page_path.read_text(encoding="utf-8")
+        # Drop any previous section first, then re-insert the fresh one.
+        stripped = _USED_BY_RE.sub("", original)
+        anchor = "\n## Sources"
+        if anchor in stripped:
+            refreshed = stripped.replace(anchor, rendered + anchor, 1)
+        else:
+            refreshed = stripped.rstrip() + rendered
+        if refreshed != original:
+            page_path.write_text(refreshed, encoding="utf-8")
+            changed += 1
+    return changed
+
+
+def _enrich_summaries(wiki_root: Path, cfg: dict, g_meta: dict, today: str) -> int:
+    """Compute metrics from normalized JSON; add `contributed_guidelines:` and
+    `contributed_skills:` from inverted related_summary on guidelines/ + skills/.
+
+    For every summary page (except `index.md` and pages without frontmatter)
+    the tags, metric fields, contribution lists and `verified_at` are
+    upserted into the frontmatter, and a legacy `recall_used:` line is
+    removed. A file is rewritten only when its text actually changed.
+
+    Args:
+        wiki_root: Root directory of the wiki.
+        cfg: Loaded config; `session_family_overrides` and `tasks` are read.
+        g_meta: Guideline metadata keyed by guideline id; only each entry's
+            `related_summary` is used here.
+        today: Value written to `verified_at`.
+
+    Returns:
+        The number of summary files rewritten (0 when the summaries directory
+        does not exist).
+    """
+    # invert: summary_basename (without .md) -> [guideline_ids] from each
+    # guideline's related_summary. The basename keys the lookup so an
+    # arc-summary like `<sid>__arc1.md` only collects guidelines that
+    # explicitly bind to that arc.
+    basename_to_gids: dict[str, list[str]] = {}
+    for gid, info in g_meta.items():
+        rel = info.get("related_summary") or ""
+        m = re.match(rf"^{SUMMARIES_DIR}/(.+)\.md$", rel)
+        if m:
+            basename_to_gids.setdefault(m.group(1), []).append(gid)
+
+    # Same inversion across <wiki>/skills/ — by skill slug, not content hash.
+    basename_to_skills: dict[str, list[str]] = {}
+    for slug, sk in _scan_skills(wiki_root).items():
+        rel = sk.get("related_summary") or ""
+        m = re.match(rf"^{SUMMARIES_DIR}/(.+)\.md$", rel)
+        if m:
+            basename_to_skills.setdefault(m.group(1), []).append(slug)
+
+    repo_root = _repo_root(wiki_root)
+    enriched = 0
+    summaries_dir = wiki_root / SUMMARIES_DIR
+    if not summaries_dir.is_dir():
+        return 0
+    overrides = cfg.get("session_family_overrides") or {}
+    tasks_cfg = cfg.get("tasks") or {}
+    for p in sorted(summaries_dir.glob("*.md")):
+        if p.name == "index.md":
+            continue
+        text = p.read_text(encoding="utf-8")
+        fm, body = split_frontmatter(text)
+        if fm is None:
+            continue
+        sid_m = re.search(r"^session_id:\s*(.+)$", fm, re.MULTILINE)
+        # Fall back to the file stem when the frontmatter has no session_id.
+        sid = sid_m.group(1).strip() if sid_m else p.stem
+        # Every two-space-indented `- item` in the frontmatter is collected;
+        # the first one is used as the path handed to _compute_metrics.
+        # NOTE(review): presumably the `sources:` list comes first — confirm.
+        sources_list = re.findall(r"^  - (\S+)", fm, re.MULTILINE)
+        np_rel = sources_list[0] if sources_list else ""
+        path_haystack = " ".join(sources_list)
+        goal_m = re.search(r"^goal:\s*(.+)$", fm, re.MULTILINE)
+        goal = goal_m.group(1).strip() if goal_m else ""
+        # A double-quoted goal is decoded as a JSON string; if decoding fails
+        # the raw text is kept.
+        if goal.startswith('"') and goal.endswith('"'):
+            try:
+                goal = json.loads(goal)
+            except Exception:
+                pass
+
+        metrics = _compute_metrics(np_rel, repo_root)
+        # Only the family is needed here; trial and condition are discarded.
+        family, _, _ = _classify_one(sid, goal, path_haystack, overrides, tasks_cfg)
+        tags = _summary_tags(goal, path_haystack, family, metrics["wiki_consulted"])
+        # Look up by the summary file's basename (sans .md) so arc-summaries
+        # only collect guidelines that bound to that specific arc.
+        contributed = basename_to_gids.get(p.stem, [])
+        contributed_skills = basename_to_skills.get(p.stem, [])
+
+        additions = {
+            "tags": tags,
+            "tool_calls": metrics["tool_calls"],
+            "errors": metrics["errors"],
+            "dead_end_paths": metrics["dead_end_paths"],
+            "wiki_consulted": metrics["wiki_consulted"],
+            "contributed_guidelines": contributed,
+            "contributed_skills": contributed_skills,
+            "verified_at": today,
+        }
+        # Token + cost fields. Skip whichever are zero — saves clutter on
+        # summaries that lack the data (older normalized JSONs without the
+        # new stats fields, or sessions where the result event had no usage).
+        if metrics["input_tokens"] or metrics["cache_creation_input_tokens"] or metrics["output_tokens"]:
+            additions["input_tokens"] = metrics["input_tokens"]
+            additions["cache_creation_input_tokens"] = metrics["cache_creation_input_tokens"]
+            additions["cache_read_input_tokens"] = metrics["cache_read_input_tokens"]
+            additions["output_tokens"] = metrics["output_tokens"]
+        if metrics["total_cost_usd"]:
+            # Round to 4 decimals so frontmatter stays readable.
+            additions["total_cost_usd"] = round(float(metrics["total_cost_usd"]), 4)
+        new_text = upsert_fields(
+            text,
+            additions,
+            force_replace=(
+                "tags",
+                "tool_calls",
+                "errors",
+                "dead_end_paths",
+                "wiki_consulted",
+                "contributed_guidelines",
+                "contributed_skills",
+                "verified_at",
+                "input_tokens",
+                "cache_creation_input_tokens",
+                "cache_read_input_tokens",
+                "output_tokens",
+                "total_cost_usd",
+            ),
+        )
+        # One-shot migration: drop the legacy `recall_used:` field if present.
+        new_text = re.sub(r"^recall_used:.*\n", "", new_text, count=1, flags=re.MULTILINE)
+        # Only touch the file (and count it) when something actually changed.
+        if new_text != text:
+            p.write_text(new_text, encoding="utf-8")
+            enriched += 1
+    return enriched
+
+
+def _repo_root(wiki_root: Path) -> Path:
+    """Locate the repository that contains the wiki.
+
+    Starting at the wiki's parent directory and moving upwards, returns the
+    first directory that has a `.git` entry. When no ancestor has one, the
+    wiki's parent directory is returned instead.
+    """
+    start = wiki_root.parent
+    for candidate in (start, *start.parents):
+        if (candidate / ".git").exists():
+            return candidate
+    return start
+
+
+def _compute_metrics(np_rel: str, repo_root: Path) -> dict:
+    """Derive per-session metrics from a normalized trajectory JSON file.
+
+    Args:
+        np_rel: Path to the normalized JSON, absolute or relative to
+            ``repo_root``. A falsy value yields the zeroed defaults.
+        repo_root: Base directory used to resolve a relative ``np_rel``.
+
+    Returns:
+        Dict with ``tool_calls``, ``errors``, ``dead_end_paths``,
+        ``wiki_consulted``, the four token counters and ``total_cost_usd``.
+        Every value stays at its zero/False default when the file is missing,
+        unreadable, not valid UTF-8 JSON, or not a JSON object.
+    """
+    out = {
+        "tool_calls": 0,
+        "errors": 0,
+        "dead_end_paths": 0,
+        "wiki_consulted": False,
+        "input_tokens": 0,
+        "cache_creation_input_tokens": 0,
+        "cache_read_input_tokens": 0,
+        "output_tokens": 0,
+        "total_cost_usd": 0.0,
+    }
+    if not np_rel:
+        return out
+    p = Path(np_rel)
+    json_path = p if p.is_absolute() else repo_root / np_rel
+    if not json_path.exists():
+        return out
+    try:
+        d = json.loads(json_path.read_text(encoding="utf-8"))
+    except (OSError, ValueError):
+        # ValueError covers both json.JSONDecodeError and UnicodeDecodeError.
+        return out
+    if not isinstance(d, dict):
+        # A top-level list/scalar carries none of the fields read below.
+        return out
+
+    chat = d.get("openai_chat_completion")
+    raw_msgs = chat.get("messages") if isinstance(chat, dict) else None
+    msgs = [m for m in raw_msgs if isinstance(m, dict)] if isinstance(raw_msgs, list) else []
+    stats = d.get("stats")
+    if not isinstance(stats, dict):
+        stats = {}
+
+    # `or 0` also maps an explicit JSON null to 0 instead of leaking None.
+    out["tool_calls"] = stats.get("tool_call_count") or 0
+    errs = sum(1 for m in msgs if m.get("role") == "tool" and m.get("is_error"))
+    out["errors"] = errs
+    out["dead_end_paths"] = errs
+
+    # Token + cost from stats (preferred; populated by the updated normalizer).
+    # Fallback: walk per-message `usage` blocks. Cost USD has no fallback —
+    # only the original `result` event carries it.
+    token_keys = ("input_tokens", "cache_creation_input_tokens", "cache_read_input_tokens", "output_tokens")
+    if "input_tokens" in stats:
+        for key in token_keys:
+            out[key] = int(stats.get(key) or 0)
+    else:
+        for m in msgs:
+            if m.get("role") != "assistant":
+                continue
+            usage = m.get("usage") or {}
+            if not isinstance(usage, dict):
+                continue
+            for key in token_keys:
+                out[key] += int(usage.get(key) or 0)
+    out["total_cost_usd"] = float(stats.get("total_cost_usd") or 0.0)
+
+    # `wiki_consulted`: did the agent read any wiki guideline/AGENTS.md page?
+    # Detect via Read tool calls or Bash commands containing wiki-shaped paths
+    # (`/AGENTS.md` or `/guidelines/<slug>__<gid>.md`). agent-wiki has
+    # no knowledge of any other recall layer.
+    wiki_path_pat = re.compile(r"AGENTS\.md|guidelines/[A-Za-z0-9_./-]+\.md")
+    for m in msgs:
+        if out["wiki_consulted"]:
+            break  # one hit is enough
+        if m.get("role") != "assistant":
+            continue
+        for tc in m.get("tool_calls") or []:
+            fn = (tc.get("function") if isinstance(tc, dict) else None) or {}
+            if not isinstance(fn, dict):
+                continue
+            args = fn.get("arguments", "")
+            if isinstance(args, str):
+                # Arguments usually arrive as a JSON-encoded string; treat
+                # undecodable payloads as "no arguments" (best effort).
+                try:
+                    args = json.loads(args)
+                except (ValueError, RecursionError):
+                    args = {}
+            if not isinstance(args, dict):
+                continue
+            name = fn.get("name", "")
+            if name == "Read":
+                target = args.get("file_path", "")
+            elif name == "Bash":
+                target = args.get("command", "")
+            else:
+                continue
+            if wiki_path_pat.search(str(target)):
+                out["wiki_consulted"] = True
+                break
+    return out
+
+
+def _summary_tags(goal: str, np_rel: str, family: str | None, wiki_consulted: bool) -> list[str]:
+    """Derive the ordered, de-duplicated tag list for a summary page.
+
+    Tags come from, in emission order: the task ``family`` (when given),
+    keywords found in the lower-cased ``goal`` (plus ``exif`` in the
+    normalized path), the ``trial_<n>_<condition>`` fragment of ``np_rel``,
+    and finally ``wiki-consulted`` when ``wiki_consulted`` is true.
+    """
+    goal_lc = goal.lower()
+    found: list[str] = [family] if family else []
+    if "exif" in goal_lc or "exif" in np_rel.lower():
+        found.append("exif")
+    # Goal keyword(s) -> tag, checked in emission order. A goal containing
+    # "lens model" necessarily contains "lens", so one needle covers both.
+    keyword_rules = (
+        (("focal length",), "focal-length"),
+        (("lens",), "lens-model"),
+        (("where was", "gps"), "gps"),
+        (("synthesize",), "synthesize-skill"),
+    )
+    for needles, tag in keyword_rules:
+        if any(needle in goal_lc for needle in needles):
+            found.append(tag)
+    trial_m = re.search(r"trial_(\d+)_(seed|no_recall|guidelines|skill)", np_rel)
+    if trial_m is not None:
+        number, condition = trial_m.groups()
+        found += [f"trial-{number}", f"condition-{condition.replace('_', '-')}"]
+    if wiki_consulted:
+        found.append("wiki-consulted")
+    # dict preserves insertion order, so this keeps the first occurrence of each tag.
+    return list(dict.fromkeys(found))
+
+
+# ---------------------------------------------------------------------------
+# Index page writers
+# ---------------------------------------------------------------------------
+
+
+def _write_root_index(wiki_root: Path, cfg: dict, g_meta: dict, sessions: list[dict], today: str) -> None:
+    """Write the wiki landing page, ``<wiki_root>/index.md``.
+
+    The page carries section links annotated with page counts, a provenance
+    diagram, and recall/staleness notes.
+
+    Args:
+        wiki_root: Wiki root directory; its name becomes the page heading.
+        cfg: Wiki config mapping; only the sizes of ``clusters`` and ``tasks`` are read.
+        g_meta: Atomic-guideline metadata; only its length is used here.
+        sessions: One dict per summary page; each must carry ``session_id``.
+        today: Date string stamped into ``verified_at`` and the staleness note.
+    """
+    n_clusters = len(cfg.get("clusters") or {})
+    n_tasks = len(cfg.get("tasks") or {})
+    n_subtasks = len(_scan_subtasks(wiki_root / TASKS_DIR))
+    n_atomic = len(g_meta)
+    # Tally pages per session in one pass; a session that owns more than one
+    # page was split into arc-summaries.
+    pages_per_session: dict[str, int] = {}
+    for s in sessions:
+        pages_per_session[s["session_id"]] = pages_per_session.get(s["session_id"], 0) + 1
+    multi_arc_sessions = sum(1 for n in pages_per_session.values() if n > 1)
+    summary_blurb = f"{len(sessions)} pages"
+    if multi_arc_sessions:
+        summary_blurb += f" ({multi_arc_sessions} session(s) split across multiple arc-summaries)"
+    lines = [
+        "---",
+        "type: wiki-index",
+        f"verified_at: {today}",
+        "---",
+        "",
+        f"# {wiki_root.name}",
+        "",
+        "An evidence-grounded wiki of agent trajectories: each lesson links back to "
+        "the trajectory that produced it. Built by the `agent-wiki` skill family from "
+        "normalized agent transcripts.",
+        "",
+        "## Sections",
+        "",
+        f"- [Tasks](tasks/index.md) — `__task.md` cross-session comparisons ({n_tasks}) "
+        f"+ `__subtask.md` per-session workstreams ({n_subtasks})",
+        f"- [Guidelines](guidelines/index.md) — atomic lessons + cluster aggregator pages "
+        f"(suffix `__cluster.md`); cluster pages are recall-preferred ({n_atomic} atomic + {n_clusters} clusters)",
+        f"- [Summaries](summaries/index.md) — episodic summaries ({summary_blurb}). "
+        f"Long sessions may be split into multiple arc-summaries that share a `session_id`.",
+        "",
+        "## How content relates",
+        "",
+        "```",
+        "raw .jsonl  ──normalize──▶  normalized JSON  ──summarize──▶  summary",
+        "                                                                │",
+        "                                                                └──▶  guideline (one or more)  ──cluster──▶  guideline (cluster) page",
+        "                                                                                                              │",
+        "                            task comparison page  ◀───────────────────────────────────────────────────────────┘",
+        "```",
+        "",
+        "Provenance closes via:",
+        "",
+        "- `summary.contributed_guidelines: [id, …]` (outbound)",
+        "- `guideline.related_summary: summaries/<sid>.md` (inbound)",
+        "- `guideline.cluster: <slug>__cluster.md` (themed group)",
+        "- `cluster.members[].link: <member>.md` (preserves originals)",
+        "- `_index.jsonl` at the wiki root for cheap filter+score retrieval",
+        "",
+        "## For agents (recall-time)",
+        "",
+        "Read [_index.jsonl](_index.jsonl) — one row per guideline + cluster page with "
+        "`{id, kind, title, tags, trigger, summary, link}`. Filter by tag, score on "
+        "trigger overlap, then follow `link` for the full content.",
+        "",
+        "## Cluster pages",
+        "",
+        "Cluster pages live in `guidelines/` with the `__cluster.md` suffix. They are "
+        "themed aggregators that reference atomic-guideline siblings — the originals "
+        "stay intact. At recall time clusters are preferred over their members; atomic "
+        "members carry a `superseded_by:` field.",
+        "",
+        "## Staleness",
+        "",
+        f"All pages stamp `verified_at`. Today: **{today}**. Pages without an "
+        "`expires_at` are valid until a follow-up trajectory contradicts them.",
+        "",
+    ]
+    (wiki_root / "index.md").write_text("\n".join(lines), encoding="utf-8")
+
+
+def _write_summaries_index(wiki_root: Path, sessions: list[dict], today: str) -> None:
+    """Write ``<wiki_root>/<SUMMARIES_DIR>/index.md``.
+
+    The page lists sessions that were split into several arc-summaries, then
+    one table per ``(family, condition)`` group with per-session metrics.
+
+    Args:
+        wiki_root: Wiki root directory.
+        sessions: One dict per summary page. Reads ``session_id``,
+            ``summary_path`` (its ``.name`` is the link target),
+            ``summary_basename``, and the optional ``family``, ``condition``,
+            ``arc``, ``trial``, ``tool_calls``, ``errors``, ``wiki_consulted``,
+            ``contributed_guidelines``, ``contributed_skills`` and
+            ``total_cost_usd`` fields.
+        today: Date string stamped into the frontmatter.
+    """
+    grouped: dict[tuple[str, str], list[dict]] = {}
+    for s in sessions:
+        key = (s.get("family") or "other", s.get("condition") or "unknown")
+        grouped.setdefault(key, []).append(s)
+
+    # Sessions split into multiple arc-summaries (same session_id across N files)
+    by_sid: dict[str, list[dict]] = {}
+    for s in sessions:
+        by_sid.setdefault(s["session_id"], []).append(s)
+    multi_arc = {sid: rows for sid, rows in by_sid.items() if len(rows) > 1}
+
+    lines = [
+        "---",
+        "type: section-index",
+        "section: summaries",
+        f"verified_at: {today}",
+        f"count: {len(sessions)}",
+        "---",
+        "",
+        "# Summaries",
+        "",
+        "One episodic summary per trajectory (or per arc, when a long session "
+        "is split into multiple arc-summaries that share a `session_id`). "
+        "See [../tasks/](../tasks/index.md) for cross-session comparisons "
+        "and intra-session subtasks.",
+        "",
+    ]
+    if multi_arc:
+        lines.append("## Sessions split across multiple arc-summaries")
+        lines.append("")
+        lines.append(
+            "These rows share a `session_id` but live in separate files. Each "
+            "carries `arc:` plus a `sibling_summaries:` list pointing at the others."
+        )
+        lines.append("")
+        for sid, rows in sorted(multi_arc.items()):
+            lines.append(f"- **`{sid[:8]}…`** — {len(rows)} arcs:")
+            for r in sorted(rows, key=lambda x: x.get("arc") or x["summary_basename"]):
+                arc = r.get("arc") or "(no arc tag)"
+                lines.append(f"  - [{arc}]({r['summary_basename']})")
+        lines.append("")
+    for key in sorted(grouped.keys()):
+        fam, cond = key
+        rows = sorted(grouped[key], key=lambda x: x.get("trial") or 0)
+        lines.append(f"## `{fam}` / `{cond}` ({len(rows)})")
+        lines.append("")
+        lines.append("| Trial | Session | Arc | Tool calls | Errors | Wiki used | Contributed guidelines | Contributed skills | Cost USD |")
+        lines.append("|------:|---------|-----|-----------:|-------:|:------:|------------------------|--------------------|---------:|")
+        for s in rows:
+            sid = s["session_id"]
+            arc = s.get("arc") or "—"
+            tc = s.get("tool_calls") or 0
+            err = s.get("errors") or 0
+            recall = "Y" if s.get("wiki_consulted") else "—"
+            contrib = ", ".join(f"`{x}`" for x in (s.get("contributed_guidelines") or [])) or "—"
+            skills = ", ".join(f"`{x}`" for x in (s.get("contributed_skills") or [])) or "—"
+            trial = str(s.get("trial") or "—")
+            cost = s.get("total_cost_usd")
+            cost_cell = f"${cost:.4f}" if cost else "—"
+            lines.append(
+                f"| {trial} | [{sid[:8]}…]({s['summary_path'].name}) | {arc} | {tc} | {err} | {recall} | {contrib} | {skills} | {cost_cell} |"
+            )
+        lines.append("")
+    # Create the section directory first (as the tasks and skills index
+    # writers do) so a wiki with no summaries yet does not fail on the write.
+    out_dir = wiki_root / SUMMARIES_DIR
+    out_dir.mkdir(parents=True, exist_ok=True)
+    (out_dir / "index.md").write_text("\n".join(lines), encoding="utf-8")
+
+
+# Display order of priority tiers, most prominent first.
+_PRIORITY_TIERS = ("high", "disputed", "weak", "normal", "low", "unvalidated")
+
+
+def _compute_priority(*, kind: str, is_cluster_member: bool, counts: dict) -> str:
+    """Assign one of the six `_PRIORITY_TIERS` labels to a page.
+
+    The tier is derived, never authored: it depends only on the page kind,
+    whether the page belongs to a cluster, and its recall tallies in `counts`
+    (`followed`, `ignored`, `contradicted`, `harmful`; absent keys count as 0).
+    Rules are checked top-down and the first match wins.
+    """
+    if kind == "cluster":
+        return "high"
+    followed, ignored, contradicted, harmful = (counts.get(status, 0) for status in ("followed", "ignored", "contradicted", "harmful"))
+    if contradicted > 0 or harmful > 0:
+        # Negative evidence: mixed with positives -> investigate; alone -> deprecation candidate.
+        return "disputed" if followed > 0 else "weak"
+    if followed >= 5:
+        return "high"  # strongly validated
+    if is_cluster_member:
+        return "low"  # the cluster page supersedes routine atomics
+    return "unvalidated" if followed + ignored == 0 else "normal"
+
+
+def _render_priority_table(*, g_meta: dict, clusters: dict, tag_map: dict, cluster_for_id: dict, usages: dict, today: str) -> list[str]:
+    """Build the `## Pages, by priority` table. Returns a list of markdown
+    lines (caller appends to its own `lines` buffer).
+
+    Args:
+        g_meta: Atomic guidelines keyed by id; each entry supplies `title`,
+            `relpath` and optionally `trigger` / `verified_at`.
+        clusters: Cluster config keyed by slug (`title`, `tags`); may be empty.
+        tag_map: Guideline id -> list of tags.
+        cluster_for_id: Guideline id -> slug of the cluster it belongs to.
+        usages: Guideline id -> list of recall rows, each carrying `status`.
+        today: Date shown for rows that have no `verified_at` of their own.
+    """
+
+    def cell(text) -> str:
+        # A literal pipe or newline inside a cell would break the table row.
+        return str(text).replace("\n", " ").replace("|", "\\|").strip()
+
+    rows: list[dict] = []
+
+    # Cluster rows
+    for slug, info in (clusters or {}).items():
+        info = info or {}
+        rows.append(
+            {
+                "title": info.get("title") or slug,
+                "link": f"{slug}__cluster.md",
+                "kind": "cluster",
+                "priority": _compute_priority(kind="cluster", is_cluster_member=False, counts={}),
+                "trigger": "—",
+                "tags": ", ".join(info.get("tags") or []) or "—",
+                "cluster": "—",
+                "counts": {s: 0 for s in ALLOWED_STATUSES},
+                "verified_at": today,
+            }
+        )
+
+    # Atomic rows
+    for gid, info in g_meta.items():
+        cs = cluster_for_id.get(gid)
+        recalls = usages.get(gid) or []
+        counts = {s: 0 for s in ALLOWED_STATUSES}
+        for r in recalls:
+            st = r["status"]
+            if st in counts:
+                counts[st] += 1
+        rows.append(
+            {
+                "title": info["title"],
+                "link": Path(info["relpath"]).name,
+                "kind": "atomic",
+                "priority": _compute_priority(
+                    kind="atomic",
+                    is_cluster_member=bool(cs),
+                    counts=counts,
+                ),
+                "trigger": (info.get("trigger") or "—").strip() or "—",
+                "tags": ", ".join(tag_map.get(gid) or []) or "—",
+                "cluster": cs or "—",
+                "counts": counts,
+                "verified_at": info.get("verified_at") or today,
+            }
+        )
+
+    tier_order = {t: i for i, t in enumerate(_PRIORITY_TIERS)}
+    # Unknown tiers (should not happen) sink to the bottom via rank 99.
+    rows.sort(key=lambda r: (tier_order.get(r["priority"], 99), r["title"].lower()))
+
+    out = ["## Pages, by priority", ""]
+    out.append(
+        "Unified roll-up across clusters + atomic guidelines. Priority is "
+        "computed each catalog run from recall counts and cluster membership "
+        "(not authored). Rows sort by tier "
+        "(`high` → `disputed` → `weak` → `normal` → `low` → `unvalidated`), "
+        "then alphabetical within tier."
+    )
+    out.append("")
+    out.append("| Title | Kind | Priority | Trigger | Tags | Cluster | Recall (T / f / i / c / h) | Verified at |")
+    out.append("|-------|------|----------|---------|------|---------|---------------------------:|-------------|")
+    for r in rows:
+        c = r["counts"]
+        total = sum(c.values())
+        recall_cell = (
+            "—" if r["kind"] == "cluster" else (f"{total} / {c['followed']} / {c['ignored']} / {c['contradicted']} / {c['harmful']}")
+        )
+        # Truncate the trigger to keep the row scannable, and only then
+        # pipe-escape it so the cut can never split a `\|` pair.
+        trig = r["trigger"].replace("\n", " ").strip()
+        if len(trig) > 80:
+            trig = trig[:77] + "…"
+        trig = trig.replace("|", "\\|")
+        out.append(
+            f"| [{cell(r['title'])}]({r['link']}) | {r['kind']} | **{r['priority']}** "
+            f"| {trig} | {cell(r['tags'])} | {cell(r['cluster'])} | {recall_cell} | {r['verified_at']} |"
+        )
+    out.append("")
+    return out
+
+
+def _write_guidelines_index(wiki_root: Path, g_meta: dict, cfg: dict, today: str) -> None:
+    """Write ``<wiki_root>/<GUIDELINES_DIR>/index.md``.
+
+    Sections, in order: cluster list, alphabetical atomic guidelines, a
+    by-tag grouping (only tags shared by two or more guidelines), the recall
+    roll-up table, and the unified priority table.
+
+    Args:
+        wiki_root: Wiki root directory.
+        g_meta: Atomic guidelines keyed by id; each entry supplies `title`,
+            `relpath` and `first_para`.
+        cfg: Wiki config; reads `clusters` and `tags.guideline`.
+        today: Date string stamped into the frontmatter and priority table.
+    """
+    clusters = cfg.get("clusters") or {}
+    tag_map = (cfg.get("tags") or {}).get("guideline") or {}
+    by_tag: dict[str, list[tuple[str, dict]]] = {}
+    for gid, info in g_meta.items():
+        for tag in tag_map.get(gid) or ["untagged"]:
+            by_tag.setdefault(tag, []).append((gid, info))
+
+    cluster_for_id = {gid: slug for slug, c in clusters.items() for gid in (c or {}).get("members") or []}
+
+    lines = [
+        "---",
+        "type: section-index",
+        "section: guidelines",
+        f"verified_at: {today}",
+        f"count: {len(g_meta) + len(clusters)}",
+        f"atomic: {len(g_meta)}",
+        f"clusters: {len(clusters)}",
+        "---",
+        "",
+        "# Guidelines",
+        "",
+        "Atomic, trigger-tagged lessons plus aggregator **cluster pages** that "
+        "group related variants. Cluster pages have the suffix `__cluster.md` and "
+        "are recall-preferred — when a cluster and its members both match a query, "
+        "the cluster wins. Members carry a `superseded_by:` field pointing at "
+        "their cluster.",
+        "",
+    ]
+    if clusters:
+        lines.append("## Clusters (prefer these first)")
+        lines.append("")
+        for slug, info in clusters.items():
+            info = info or {}
+            n = len((info.get("members") or []))
+            tags = ", ".join(info.get("tags") or [])
+            lines.append(f"- **[{info.get('title') or slug}]({slug}__cluster.md)** `cluster:{slug}` — `tags: {tags}` ({n} members)")
+        lines.append("")
+    lines.append("## Atomic guidelines, alphabetical")
+    lines.append("")
+    for gid, info in sorted(g_meta.items(), key=lambda x: x[1]["title"]):
+        snippet = info["first_para"]
+        if len(snippet) > 140:
+            # Cut at the last word boundary inside the 140-char window.
+            snippet = snippet[:140].rsplit(" ", 1)[0] + "…"
+        cs = cluster_for_id.get(gid)
+        clink = f" [→ cluster: {cs}]({cs}__cluster.md)" if cs else ""
+        lines.append(f"- **[{info['title']}]({Path(info['relpath']).name})** `{gid}`{clink}")
+        lines.append(f"  - {snippet}")
+    if any(len(v) >= 2 for v in by_tag.values()):
+        lines.append("")
+        lines.append("## By tag")
+        lines.append("")
+        for tag in sorted(by_tag.keys()):
+            members = by_tag[tag]
+            if len(members) < 2:
+                continue
+            lines.append(f"### `{tag}`")
+            lines.append("")
+            for gid, info in sorted(members, key=lambda x: x[1]["title"]):
+                lines.append(f"- [{info['title']}]({Path(info['relpath']).name}) `{gid}`")
+            lines.append("")
+
+    # Both tables below read the same recall data; scan the summaries once
+    # and share the result instead of re-reading the corpus per table.
+    usages = _scan_recalled_guidelines_in_summaries(wiki_root) if (g_meta or clusters) else {}
+
+    # Recall roll-up: per-guideline counts of how many summaries recalled
+    # this rule, broken down by status. Always rendered when there is at
+    # least one atomic guideline (zero-recall rows show all zeros so the
+    # reader can see what's been contributed but not yet validated).
+    if g_meta:
+        lines.append("")
+        lines.append("## Recall roll-up")
+        lines.append("")
+        lines.append(
+            "Cross-summary tally of `recalled_guidelines:` blocks. "
+            "Rows are alphabetical by guideline title. A row of zeros "
+            "means the guideline has been contributed by a session "
+            "but never recalled by another."
+        )
+        lines.append("")
+        lines.append("| Guideline | Total | followed | ignored | contradicted | harmful |")
+        lines.append("|-----------|------:|---------:|--------:|-------------:|--------:|")
+        for gid, info in sorted(g_meta.items(), key=lambda x: x[1]["title"]):
+            rows = usages.get(gid) or []
+            counts = {s: 0 for s in ALLOWED_STATUSES}
+            for r in rows:
+                st = r["status"]
+                if st in counts:
+                    counts[st] += 1
+            total = sum(counts.values())
+            lines.append(
+                f"| [{info['title']}]({Path(info['relpath']).name}) "
+                f"| {total} | {counts['followed']} | {counts['ignored']} "
+                f"| {counts['contradicted']} | {counts['harmful']} |"
+            )
+        lines.append("")
+
+    # ── Pages by priority — unified table across clusters + atomics ──
+    # Lives at the bottom of the page so it doesn't crowd the bullets above.
+    if g_meta or clusters:
+        lines.extend(
+            _render_priority_table(
+                g_meta=g_meta,
+                clusters=clusters,
+                tag_map=tag_map,
+                cluster_for_id=cluster_for_id,
+                usages=usages,
+                today=today,
+            )
+        )
+
+    # Create the section directory first (as the tasks and skills index
+    # writers do) so the write cannot fail on a missing directory.
+    out_dir = wiki_root / GUIDELINES_DIR
+    out_dir.mkdir(parents=True, exist_ok=True)
+    (out_dir / "index.md").write_text("\n".join(lines), encoding="utf-8")
+
+
+def _write_tasks_index(wiki_root: Path, cfg: dict, today: str) -> None:
+    """Write ``<wiki_root>/<TASKS_DIR>/index.md``, creating the directory if needed.
+
+    Lists the cross-session task-comparison pages declared under ``tasks`` in
+    ``cfg``, then the ``__subtask.md`` pages found on disk, grouped by their
+    parent session id.
+    """
+    task_cfg = cfg.get("tasks") or {}
+    out_dir = wiki_root / TASKS_DIR
+    out_dir.mkdir(parents=True, exist_ok=True)
+    subtask_pages = _scan_subtasks(out_dir)
+    page = [
+        "---",
+        "type: section-index",
+        "section: tasks",
+        f"verified_at: {today}",
+        f"task_pages: {len(task_cfg)}",
+        f"subtask_pages: {len(subtask_pages)}",
+        "---",
+        "",
+        "# Tasks",
+        "",
+        "Two kinds of pages live here, distinguished by filename suffix:",
+        "",
+        "- **`__task.md`** — cross-session task-comparisons. Joins all sessions "
+        "that attempted the same task across trials and conditions; defined in "
+        "`_config.yaml` under `tasks:`.",
+        "- **`__subtask.md`** — narrative slices within a single session. Authored standalone; not regenerated from config.",
+        "",
+    ]
+    if task_cfg:
+        page += ["## Task comparisons", ""]
+        for slug, raw in task_cfg.items():
+            entry = raw or {}
+            page.append(f"- **[{entry.get('title') or slug}]({slug}__task.md)** — `{entry.get('family') or slug}` family")
+        page.append("")
+    if subtask_pages:
+        page += ["## Subtasks (per-session workstreams)", ""]
+        # Bucket by parent session; pages without one share "(unknown)".
+        by_session: dict[str, list[dict]] = {}
+        for sub in subtask_pages:
+            by_session.setdefault(sub.get("parent_session_id") or "(unknown)", []).append(sub)
+        for session_id in sorted(by_session):
+            # Long ids are shortened to 12 characters in the heading.
+            heading = session_id[:12] + "…" if len(session_id) > 12 else session_id
+            page += [f"### Session `{heading}`", ""]
+            ordered = sorted(by_session[session_id], key=lambda x: x.get("title") or x.get("slug") or "")
+            page.extend(f"- **[{sub.get('title') or sub.get('slug')}]({sub['filename']})**" for sub in ordered)
+            page.append("")
+    (out_dir / "index.md").write_text("\n".join(page), encoding="utf-8")
+
+
+def _scan_subtasks(tasks_dir: Path) -> list[dict]:
+    """Collect metadata for every ``*__subtask.md`` page in ``tasks_dir``.
+
+    Frontmatter fields are read with line-anchored regexes. Pages whose
+    frontmatter cannot be split off are skipped. Returns an empty list when
+    ``tasks_dir`` is not a directory. Each result dict has ``filename``,
+    ``slug``, ``title``, ``parent_session_id``, ``parent_summary`` and ``tags``.
+    """
+    if not tasks_dir.is_dir():
+        return []
+
+    def grab(pattern: str, frontmatter: str) -> str | None:
+        """Return the stripped first capture of ``pattern``, or None when it does not match."""
+        hit = re.search(pattern, frontmatter, re.MULTILINE)
+        return hit.group(1).strip() if hit else None
+
+    found: list[dict] = []
+    for page in sorted(tasks_dir.glob("*__subtask.md")):
+        fm, _body = split_frontmatter(page.read_text(encoding="utf-8"))
+        if fm is None:
+            continue
+        title = grab(r"^title:\s*(.+)$", fm)
+        slug = grab(r"^slug:\s*(.+)$", fm)
+        parent_sid = grab(r"^parent_session_id:\s*(.+)$", fm)
+        parent_sum = grab(r"^parent_summary:\s*(.+)$", fm)
+        inline_tags = grab(r"^tags:\s*\[(.*?)\]\s*$", fm)
+        if title and title.startswith('"') and title.endswith('"'):
+            # A double-quoted title is decoded as a JSON string; on failure
+            # the raw text is kept as-is.
+            try:
+                title = json.loads(title)
+            except Exception:
+                pass
+        tag_items = inline_tags.split(",") if inline_tags is not None else []
+        found.append(
+            {
+                "filename": page.name,
+                "slug": slug if slug is not None else page.stem.replace("__subtask", ""),
+                "title": title or page.stem,
+                "parent_session_id": parent_sid if parent_sid is not None else "",
+                "parent_summary": parent_sum if parent_sum is not None else "",
+                "tags": [item.strip() for item in tag_items if item.strip()],
+            }
+        )
+    return found
+
+
+def _scan_skills(wiki_root: Path) -> dict[str, dict]:
+    """Return {slug: {path, relpath, name, description, trigger, related_summary,
+    verified_at, tags}} for every wiki skill at <wiki>/skills/<slug>/SKILL.md.
+
+    The slug is the frontmatter `name`, falling back to the directory name.
+    Skill directories are skipped when `SKILL.md` is missing, has no
+    frontmatter, or its frontmatter is not a valid YAML mapping. `tags` is
+    always a list in the result (a scalar string tag is wrapped).
+    """
+    out: dict[str, dict] = {}
+    sk_dir = wiki_root / SKILLS_DIR
+    if not sk_dir.is_dir():
+        return out
+    for sub in sorted(p for p in sk_dir.iterdir() if p.is_dir()):
+        skill_md = sub / "SKILL.md"
+        if not skill_md.is_file():
+            continue
+        text = skill_md.read_text(encoding="utf-8")
+        fm, _body = split_frontmatter(text)
+        if fm is None:
+            continue
+        try:
+            data = yaml.safe_load(fm) or {}
+        except yaml.YAMLError:
+            continue
+        if not isinstance(data, dict):
+            # Frontmatter that parses to a scalar or list has no fields to
+            # read; treat it like unparseable frontmatter.
+            continue
+        tags = data.get("tags") or []
+        if isinstance(tags, str):
+            # `tags: foo` (scalar) — keep the list shape consumers expect.
+            tags = [tags]
+        slug = str(data.get("name") or sub.name).strip()
+        out[slug] = {
+            "path": skill_md,
+            "relpath": f"{SKILLS_DIR}/{sub.name}/SKILL.md",
+            "name": slug,
+            "description": str(data.get("description") or "").strip(),
+            "trigger": str(data.get("trigger") or "").strip(),
+            "related_summary": str(data.get("related_summary") or "").strip(),
+            "verified_at": str(data.get("verified_at") or "").strip(),
+            "tags": tags,
+        }
+    return out
+
+
+def _write_skills_index(wiki_root: Path, skills: dict[str, dict], today: str) -> None:
+    """Write ``<wiki_root>/<SKILLS_DIR>/index.md``, creating the directory if needed.
+
+    Args:
+        wiki_root: Wiki root directory.
+        skills: Skill metadata keyed by slug; reads `description`, `trigger`
+            and `verified_at` from each entry.
+        today: Date string for the frontmatter and for rows lacking `verified_at`.
+    """
+
+    def cell(text: str) -> str:
+        # One table row per line: flatten newlines (multi-line YAML values),
+        # cap at 80 characters, and only then pipe-escape so the cut can
+        # never split a `\|` pair.
+        flat = text.replace("\n", " ").strip()
+        if len(flat) > 80:
+            flat = flat[:77] + "…"
+        return flat.replace("|", "\\|")
+
+    sk_dir = wiki_root / SKILLS_DIR
+    sk_dir.mkdir(parents=True, exist_ok=True)
+    lines: list[str] = [
+        "---",
+        "type: section-index",
+        "section: skills",
+        f"verified_at: {today}",
+        f"count: {len(skills)}",
+        "---",
+        "",
+        "# Skills",
+        "",
+        "Wiki-resident, callable workflow pages. Each `<slug>/SKILL.md` is a "
+        "structured procedural artifact: frontmatter + Overview + When To Use + "
+        "Workflow + (optional) supporting scripts under `<slug>/scripts/`. At "
+        "retrieval time, skills sort between clusters and atomic guidelines in "
+        "`_index.jsonl` — directly callable, recall-preferred over guidelines "
+        "for the same trigger.",
+        "",
+    ]
+    if not skills:
+        lines.append("_(none yet — synthesize one via `agent-wiki-synthesize-skill`)_")
+        lines.append("")
+    else:
+        lines.append("| Skill | Description | Trigger | Verified at |")
+        lines.append("|---|---|---|---|")
+        for slug, info in sorted(skills.items()):
+            trig = cell(info.get("trigger") or "—")
+            desc = cell(info.get("description") or "—")
+            lines.append(f"| **[{slug}]({slug}/SKILL.md)** | {desc} | {trig} | {info.get('verified_at') or today} |")
+        lines.append("")
+    (sk_dir / "index.md").write_text("\n".join(lines), encoding="utf-8")
+
+
+def _write_jsonl_index(wiki_root: Path, cfg: dict, g_meta: dict) -> None:
+    """Write the retrieval index to ``wiki_root / JSONL_INDEX_FILENAME``.
+
+    One JSON object per line, emitted in this order: clusters, skills, atomic
+    guidelines, tasks, subtasks. Every row carries `kind`, `id`, `title`,
+    `tags`, `trigger`, `summary` and `link`, plus kind-specific extras.
+
+    Args:
+        wiki_root: Wiki root directory; skills and subtasks are scanned from it.
+        cfg: Wiki config; reads `clusters`, `tasks` and `tags.guideline`.
+        g_meta: Atomic guidelines keyed by id; each entry supplies `title`,
+            `trigger`, `first_para` and `relpath`.
+    """
+    rows = []
+    clusters = cfg.get("clusters") or {}
+    tag_map = (cfg.get("tags") or {}).get("guideline") or {}
+    # Reverse lookup: guideline id -> slug of the cluster listing it as a member.
+    cluster_for_id = {gid: slug for slug, c in clusters.items() for gid in (c or {}).get("members") or []}
+
+    # clusters first
+    for slug, info in clusters.items():
+        info = info or {}
+        rows.append(
+            {
+                "kind": "cluster",
+                "id": f"cluster:{slug}",
+                "title": info.get("title") or slug,
+                "tags": info.get("tags") or [],
+                "trigger": "",
+                "summary": (info.get("description") or "")[:240],
+                "link": f"{GUIDELINES_DIR}/{slug}__cluster.md",
+                "members": info.get("members") or [],
+                "priority": "high",
+            }
+        )
+
+    # skills (recall-preferred over plain atomics)
+    for slug, info in sorted(_scan_skills(wiki_root).items()):
+        rows.append(
+            {
+                "kind": "skill",
+                "id": f"skill:{slug}",
+                "title": info["name"],
+                "tags": info.get("tags") or [],
+                "trigger": info.get("trigger") or "",
+                "summary": (info.get("description") or "")[:240],
+                "link": info["relpath"],
+                "priority": "high",
+            }
+        )
+
+    # atomic guidelines
+    for gid, info in g_meta.items():
+        snippet = info["first_para"]
+        if len(snippet) > 240:
+            # Cut at the last word boundary inside the 240-char window.
+            snippet = snippet[:240].rsplit(" ", 1)[0] + "…"
+        cs = cluster_for_id.get(gid)
+        row = {
+            "kind": "guideline",
+            "id": gid,
+            "title": info["title"],
+            "tags": tag_map.get(gid) or [],
+            "trigger": info["trigger"],
+            "summary": snippet,
+            "link": info["relpath"],
+            "cluster": cs,
+        }
+        # Only cluster members get a `superseded_by` pointer to their cluster page.
+        if cs:
+            row["superseded_by"] = f"{cs}__cluster.md"
+        rows.append(row)
+
+    # tasks (cross-session comparison pages)
+    for slug, info in (cfg.get("tasks") or {}).items():
+        info = info or {}
+        rows.append(
+            {
+                "kind": "task",
+                "id": f"task:{slug}",
+                "title": info.get("title") or slug,
+                "tags": info.get("tags") or [],
+                "trigger": "",
+                "summary": (info.get("intro") or info.get("findings") or "")[:240],
+                "link": f"{TASKS_DIR}/{slug}__task.md",
+                "family": info.get("family") or slug,
+            }
+        )
+
+    # subtasks (per-session workstreams)
+    subtasks = _scan_subtasks(wiki_root / TASKS_DIR)
+    for st in subtasks:
+        rows.append(
+            {
+                "kind": "subtask",
+                "id": f"subtask:{st['slug']}",
+                "title": st["title"],
+                "tags": st.get("tags") or [],
+                "trigger": "",
+                "summary": "",
+                "link": f"{TASKS_DIR}/{st['filename']}",
+                "parent_session_id": st.get("parent_session_id") or None,
+                "parent_summary": st.get("parent_summary") or None,
+            }
+        )
+
+    p = wiki_root / JSONL_INDEX_FILENAME
+    with p.open("w", encoding="utf-8") as f:
+        for r in rows:
+            # ensure_ascii=False keeps non-ASCII text (e.g. the "…" marker) literal.
+            f.write(json.dumps(r, ensure_ascii=False) + "\n")
+
+
+# ---------------------------------------------------------------------------
+# CLI
+# ---------------------------------------------------------------------------
+
+
+def main(argv: list[str] | None = None) -> int:
+    """CLI entry point: parse ``argv`` and run the selected subcommand.
+
+    Args:
+        argv: Argument list without the program name; ``None`` is passed
+            through to argparse unchanged.
+
+    Returns:
+        Whatever the chosen ``cmd_*`` handler returns.
+    """
+    cli = argparse.ArgumentParser(description="Build/refresh the wiki-twobatch wiki.")
+    cli.add_argument("--wiki-root", type=Path, default=None, help="Override the wiki root directory.")
+    commands = cli.add_subparsers(dest="cmd", required=True)
+
+    # Registration order is the order subcommands appear in `--help`.
+    summary_cmd = commands.add_parser("render-summary", help="stdin JSON -> summaries/<sid>.md")
+    summary_cmd.add_argument("--rewrite", action="store_true")
+
+    guidelines_cmd = commands.add_parser("render-guidelines", help="stdin {entities: [...]} -> guideline pages")
+    guidelines_cmd.add_argument("--rewrite", action="store_true")
+    guidelines_cmd.add_argument("--session-id", default=None)
+    guidelines_cmd.add_argument("--normalized-path", default=None)
+
+    cluster_cmd = commands.add_parser("render-cluster", help="stdin JSON -> guidelines/<slug>__cluster.md (also writes _config)")
+    cluster_cmd.add_argument(
+        "--archive-members",
+        action="store_true",
+        help="Move each member atomic to <wiki>/_archived/ after writing the cluster page (delete-on-promote).",
+    )
+
+    commands.add_parser("render-task", help="stdin JSON -> tasks/<slug>__task.md (also writes _config)")
+    commands.add_parser("render-subtask", help="stdin JSON -> tasks/<slug>__subtask.md (per-session workstream page)")
+
+    skill_cmd = commands.add_parser("render-skill", help="stdin JSON -> skills/<slug>/SKILL.md (+ scripts/)")
+    skill_cmd.add_argument("--rewrite", action="store_true", help="Overwrite an existing skill page.")
+    skill_cmd.add_argument(
+        "--archive-covered",
+        action="store_true",
+        help="After writing the skill, archive any atomic guideline whose tags/title indicate it's covered by this skill.",
+    )
+
+    commands.add_parser("update-config", help="stdin patch -> _config.yaml")
+    commands.add_parser("dump-guidelines", help="stdout: corpus of atomic guidelines as JSON")
+    commands.add_parser("dump-summaries", help="stdout: corpus of summaries as JSON")
+    commands.add_parser("catalog", help="refresh indexes, _index.jsonl, summary frontmatter metrics")
+
+    opts = cli.parse_args(argv)
+    # Subcommand name -> handler; each handler receives the parsed namespace.
+    dispatch = {
+        "render-summary": cmd_render_summary,
+        "render-guidelines": cmd_render_guidelines,
+        "render-cluster": cmd_render_cluster,
+        "render-task": cmd_render_task,
+        "render-subtask": cmd_render_subtask,
+        "render-skill": cmd_render_skill,
+        "update-config": cmd_update_config,
+        "dump-guidelines": cmd_dump_guidelines,
+        "dump-summaries": cmd_dump_summaries,
+        "catalog": cmd_catalog,
+    }
+    run = dispatch[opts.cmd]
+    return run(opts)
+
+
+# Script entry: hand main()'s return value to the interpreter as the exit status.
+if __name__ == "__main__":
+    raise SystemExit(main())
diff --git a/pyproject.toml b/pyproject.toml
index 57472950..e0df408b 100644
--- a/pyproject.toml
+++ b/pyproject.toml
@@ -106,6 +106,9 @@ anyio_mode = "auto"
 [tool.ruff]
 line-length = 140
 indent-width = 4
+# Generated example-wiki content (scripts the wiki builder emitted verbatim from
+# trajectories) is an exploration artifact, not project source — don't lint it.
+extend-exclude = ["explorations/agent-wiki/wikis/"]
 
 [tool.semantic_release]
 allow_zero_version = true
@@ -166,6 +169,7 @@ exclude = [
     "platform-integrations/",
     "plugin-source/",
     "examples/",
+    "explorations/agent-wiki/wikis/",
 ]
 
 [[tool.mypy.overrides]]