diff --git a/.forge/ralph/apply-worker-cli-v5/prd.json b/.forge/ralph/apply-worker-cli-v5/prd.json deleted file mode 100644 index 4437cd4..0000000 --- a/.forge/ralph/apply-worker-cli-v5/prd.json +++ /dev/null @@ -1,58 +0,0 @@ -{ - "project": "Webster", - "branchName": "ralph/apply-worker-cli-v5", - "prdFile": "prd.md", - "description": "Expose the existing apply-worker core as a weekly CLI and prove mutation, commit trailers, and validation blocking with integration tests.", - "userStories": [ - { - "id": "US-001", - "title": "Add apply-worker CLI wrapper", - "description": "As a Webster operator, I want a Bun CLI that runs the existing apply-worker core against a weekly history directory so that selected proposal issues become validated experiment commits without manual orchestration.", - "acceptanceCriteria": [ - "Add a script entrypoint following repo conventions, likely scripts/apply-worker/cli.ts, with a #!/usr/bin/env bun shebang and import.meta.main guard.", - "The CLI accepts a week directory argument and reads proposal.md plus decision.json from that directory.", - "The CLI uses existing exports from scripts/apply-worker.ts for parseDecision, parseProposal, applyMutation, runValidation, buildCommitMessage, commitExperiment, emitSkip, and writeApplyLog instead of duplicating US-001 through US-004 core logic.", - "For each selected issue, the CLI applies mutations, runs lint, type-check, and format-check before committing, and creates one Git commit only when all three validation checks pass.", - "Every successful experiment commit message includes the existing trailer format Experiment-Id: exp-NN-slug validated by buildCommitMessage.", - "String mismatches or validation failures are recorded as skipped experiments in apply-log.json and emit structured skip rows through emitSkip without creating a commit for that experiment.", - "Missing week directory, missing proposal.md, or missing decision.json produces a clear non-zero CLI error without mutating 
files.", - "Type-check passes", - "Lint passes with zero warnings", - "Format check passes", - "Tests pass" - ], - "technicalNotes": "Build on scripts/apply-worker.ts:287-317 for parseDecision/parseProposal, 319-357 for applyMutation, 359-369 for runValidation, 372-393 for buildCommitMessage, 395-429 for commitExperiment, and 442-457 for emitSkip/writeApplyLog. Mirror CLI entrypoint/error shape from scripts/critic-genealogy.ts:676-694 and package script style from package.json scripts. Fixture artifact shapes are visible in history/2026-04-23/decision.json and history/2026-04-23/proposal.md.", - "dependsOn": [], - "priority": 1, - "passes": true, - "notes": "Implemented in iteration 1. Files: scripts/apply-worker-cli.ts, scripts/apply-worker.ts, package.json, .forge/ralph/apply-worker-cli-v5/prd.md." - }, - { - "id": "US-002", - "title": "Add apply-worker integration tests", - "description": "As a Webster maintainer, I want integration tests around the apply-worker CLI so that successful proposals mutate and commit correctly while broken proposals are blocked before commit.", - "acceptanceCriteria": [ - "Add Bun tests under scripts/__tests__/ using the existing bun:test style from scripts/__tests__/memory.test.ts.", - "Create fixture proposal.md and decision.json inputs that match the existing weekly schema and proposal format shown in history/2026-04-23 artifacts.", - "A successful fixture run verifies the target file content changed from the Before block to the After block.", - "A successful fixture run verifies git history contains a commit message trailer exactly matching Experiment-Id: exp-01- for the applied issue.", - "A successful fixture run verifies apply-log.json records the applied experiment with status applied and a commit_sha.", - "A deliberately broken proposal or validation-breaking mutation does not create a Git commit for that experiment.", - "The blocked fixture verifies apply-log.json and/or skips.jsonl records the terminal skip reason, 
including validation failure when the output fails the lint/type/format floor.", - "Tests isolate Git state and filesystem mutations in temporary directories or fixture repositories and do not mutate Webster's real history or site files.", - "bun run validate passes before committing the story.", - "Type-check passes", - "Lint passes with zero warnings", - "Format check passes", - "Tests pass" - ], - "technicalNotes": "Use scripts/__tests__/memory.test.ts:1-18 as the temp-path and cleanup pattern. Exercise the CLI from US-001 rather than only unit-testing helper functions. Keep fixtures minimal but schema-faithful to history/2026-04-23/decision.json:1-25 and proposal markdown sections from history/2026-04-23/proposal.md:1-28. Inspect commit messages with git log in the isolated fixture repo. The broken-output case should prove runValidation from scripts/apply-worker.ts:359-369 prevents commitExperiment from scripts/apply-worker.ts:395-429.", - "dependsOn": [ - "US-001" - ], - "priority": 2, - "passes": true, - "notes": "Implemented in iteration 2. Files: scripts/__tests__/apply-worker-cli.test.ts." - } - ] -} diff --git a/.forge/ralph/apply-worker-cli-v5/prd.md b/.forge/ralph/apply-worker-cli-v5/prd.md deleted file mode 100644 index 75910b0..0000000 --- a/.forge/ralph/apply-worker-cli-v5/prd.md +++ /dev/null @@ -1,180 +0,0 @@ -# Apply Worker CLI v5 — Product Requirements - -## Overview - -**Problem**: Webster has the apply-worker core for parsing `proposal.md`, applying text mutations, running validation, emitting skip rows, writing `apply-log.json`, and building experiment commit messages, but the weekly operator still lacks a single CLI entrypoint and integration-level proof that the full proposal-to-commit path is safe. -**Solution**: Add a thin CLI wrapper around the existing apply-worker core and integration tests that exercise successful mutation commits, correct `Experiment-Id` trailers, and validation-blocked broken proposals. 
-**Branch**: `ralph/apply-worker-cli-v5` - ---- - -## Goals & Success - -### Primary Goal - -Expose the merged apply-worker core as a production CLI that can be run against a weekly history directory and can commit only validated experiment mutations. - -### Success Metrics - -| Metric | Target | How Measured | -|--------|--------|--------------| -| CLI entrypoint exists | `scripts/apply-worker/cli.ts` or repo-convention equivalent invokes the core from a weekly directory | Code review and `bun` execution in tests | -| Valid experiment commits | Every applied experiment creates a Git commit with `Experiment-Id: exp-NN-slug` | Integration test inspects `git log --format=%B` | -| Broken output is blocked | A deliberately broken proposal does not create a commit | Integration test compares commit count and apply log/skip output | -| Quality floor | Type, lint, format, validators, markdownlint, and tests pass | `bun run validate` | - -### Non-Goals (Out of Scope) - -- Reimplementing US-001 through US-004 — the core parser, text mutation engine, validation gate, skip-row emission, and apply-log writer already exist in `scripts/apply-worker.ts`. -- Multi-kind proposal routing — tracked separately in Layer 10 #47-#49. -- Visual review or critic rerun gates — downstream of the apply step and not part of this remaining scope. -- Changing proposal or decision schemas — this story consumes the existing `proposal.md` and `decision.json` shapes. - ---- - -## User & Context - -### Target User - -- **Who**: Webster implementation operator running the weekly landing-page improvement loop. -- **Role**: Takes a redesigner proposal and operator decision from `history//`, applies selected edits, and promotes only safe experiments. -- **Current Pain**: The core code exists but the operator cannot reliably run one command that reads weekly artifacts, applies each selected issue, validates, commits, and records blocked experiments. - -### User Journey - -1. 
**Trigger**: The weekly council produces `history//proposal.md` and `history//decision.json`. -2. **Action**: The operator runs the apply-worker CLI against that week directory. -3. **Outcome**: Each valid selected issue lands as its own commit with an experiment trailer; invalid or validation-breaking issues are skipped and recorded without a commit. - ---- - -## UX Requirements - -### Interaction Model - -Command-line only. The CLI should follow existing script conventions: executable Bun TypeScript files under `scripts/`, `#!/usr/bin/env bun`, `import.meta.main` guard, explicit usage/error output, and non-zero exits for bad invocation. Existing entrypoint patterns appear in `scripts/critic-genealogy.ts:676-694`, `scripts/validate-agents.ts:129`, and `scripts/validate-findings.ts:108`. - -Likely command shape: - -```bash -bun scripts/apply-worker/cli.ts history/2026-04-23 -``` - -The CLI reads: - -- `/proposal.md` -- `/decision.json` - -The CLI writes: - -- `/apply-log.json` -- `/skips.jsonl` when an experiment is skipped -- `/memory.jsonl` skip rows via the existing helper -- one Git commit per validated experiment - -### States to Handle - -| State | Description | Behavior | -|-------|-------------|----------| -| Empty | Week directory or required files are missing | Print usage/error and exit non-zero without mutating files | -| Loading | CLI is applying one selected proposal issue | Log current experiment id/title and validation status to stdout/stderr | -| Error | Proposal parse, string mismatch, validation failure, git add, or git commit fails | Record terminal skip when applicable; fail clearly for unrecoverable setup/git errors | -| Success | All selected issues were either committed or explicitly skipped | Write `apply-log.json`; exit 0 if the run completed deterministically | - ---- - -## Technical Context - -### Patterns to Follow - -- **Apply-worker core**: `scripts/apply-worker.ts:287-317` — `parseDecision` and `parseProposal` already map weekly 
artifacts into selected proposal issues. -- **Mutation engine**: `scripts/apply-worker.ts:319-357` — `applyMutation` performs exact string replacement and returns `string_mismatch` instead of silently proceeding. -- **Validation floor**: `scripts/apply-worker.ts:359-369` — `runValidation` runs lint, type-check, and format-check; the CLI must treat any failure as a no-commit skip for that experiment. -- **Commit trailer format**: `scripts/apply-worker.ts:372-393` — `buildCommitMessage` validates `exp-NN-slug` and emits `Experiment-Id: ${expId}`. -- **Git commit helper**: `scripts/apply-worker.ts:395-429` — `commitExperiment` stages touched files and parses the commit SHA. -- **Skip/apply-log writers**: `scripts/apply-worker.ts:442-457` — `emitSkip` and `writeApplyLog` already write terminal skip rows and `apply-log.json`. -- **Fixture schemas**: `history/2026-04-23/decision.json:1-25` shows `week` plus `selected_issues`; `history/2026-04-23/proposal.md:1-28` shows issue headings, target files, and Before/After blocks. -- **Test pattern**: `scripts/__tests__/memory.test.ts:1-18` uses `bun:test`, temp paths, and explicit cleanup; use the same style for integration fixtures. -- **CLI error pattern**: `scripts/critic-genealogy.ts:676-694` guards `import.meta.main`, maps usage errors to exit 2, and unexpected failures to exit 1. 
- -### Types & Interfaces - -```typescript -export interface DecisionJSON { - week: string; - selected_issues: DecisionIssue[]; -} - -export interface ProposalIssue { - index: number; - severity: Severity; - title: string; - files_touched: string[]; - mutations: RawMutation[]; -} - -export interface ApplyExperiment { - exp_id: string; - severity: Severity; - title: string; - status: "applied" | "skipped"; - mutations: MutationResult[]; - commit_sha?: string; - skip_reason?: "string_mismatch" | "lint_failure" | "type_failure" | "format_failure"; - skip_details?: Record; -} - -export interface ApplyLogJSON { - week: string; - run_timestamp: string; - experiments: ApplyExperiment[]; - validation_summary: { - lint_passed: boolean; - type_check_passed: boolean; - format_check_passed: boolean; - }; -} -``` - -### Architecture Notes - -- Build strictly on top of `scripts/apply-worker.ts`; do not duplicate parser, mutation, validation, skip, log, or commit helpers. -- If the current file must be split to support `scripts/apply-worker/cli.ts`, preserve public exports and avoid changing landed US-001-US-004 behavior except where CLI orchestration needs a missing exported helper. -- Each selected issue should be treated as a separate experiment with deterministic id `exp-${NN}-${slug}` through the existing `buildCommitMessage` guard. -- The hard floor is lint + type-check + format-check before commit. Full repository validation (`bun run validate`) remains the story completion gate. -- Integration tests may need to run in a temporary Git repository or carefully isolated fixture repo so real Webster history is not mutated. 
- ---- - -## Implementation Summary - -### Story Overview - -| ID | Title | Priority | Dependencies | -|----|-------|----------|--------------| -| US-001 | Add apply-worker CLI wrapper | 1 | — | -| US-002 | Add apply-worker integration tests | 2 | US-001 | - -### Dependency Graph - -```text -US-001 (CLI wrapper around existing core) - ↓ -US-002 (integration tests for commits, mutation, and validation blocking) -``` - ---- - -## Validation Requirements - -Every story must pass: - -- [ ] Type-check: `bun run type-check` -- [ ] Lint: `bun run lint --max-warnings 0` -- [ ] Tests: `bun run test` -- [ ] Format: `bun run format:check` -- [ ] Full gate before final commit: `bun run validate` - ---- - -Generated: 2026-04-24T07:47:46Z diff --git a/.forge/ralph/genealogy-gov-v1/prd.json b/.forge/ralph/genealogy-gov-v1/prd.json deleted file mode 100644 index 705e9da..0000000 --- a/.forge/ralph/genealogy-gov-v1/prd.json +++ /dev/null @@ -1,75 +0,0 @@ -{ - "project": "Webster", - "branchName": "ralph/genealogy-gov-v1", - "prdFile": "prd.md", - "description": "Implement Feature #55 genealogy governance layers 2-4: dedup, 13-week cap, and archive-on-idle.", - "userStories": [ - { - "id": "US-001", - "title": "Layer 2 embedding dedup blocks overlapping critic specs", - "description": "As a Webster operator, I want new critic specs rejected when their scope substantially overlaps an existing critic so that genealogy does not create duplicate weekly agents.", - "acceptanceCriteria": [ - "Add an orchestrator-side dedup helper in scripts/critic-genealogy.ts that compares a candidate NewCriticSpec against active CriticSummary entries using embedding cosine similarity over scope and description text.", - "Reject candidate specs with cosine similarity >= 0.60 to any active critic before registerAgent() performs POST /v1/agents.", - "Governance rejection prints the closest existing critic name, similarity score, and candidate scope without registering an agent or creating a session.", - 
"Dry-run mode still runs and reports the dedup decision before printing a would-register spec.", - "Unit tests cover below-threshold allow, exactly-0.60 reject, and above-threshold reject behavior without live Anthropic network calls.", - "Type-check passes", - "Tests pass", - "bun run validate passes" - ], - "technicalNotes": "Modify scripts/critic-genealogy.ts around NewCriticSpec/CriticSummary definitions (lines 31-60), active critic loading (lines 155-168), and the main flow before registerAgent() (lines 457+ and main registration section). Follow fail-fast error style from lines 339-356. Add tests in scripts/__tests__/critic-genealogy.test.ts mirroring direct helper tests at lines 93-120 and 160-203. Existing agents expose metadata.scope and description in agents/*-critic.json.", - "dependsOn": [], - "priority": 1, - "passes": true, - "notes": "Implemented in iteration 1. Files: scripts/critic-genealogy.ts, scripts/__tests__/critic-genealogy.test.ts, .forge/ralph/genealogy-gov-v1/prd.md." 
- }, - { - "id": "US-002", - "title": "Layer 3 13-week cap with operator soft override", - "description": "As a Webster operator, I want a maximum of 3 new critics per 13 weeks with a deliberate override flag so that genealogy growth is bounded but not blocked in exceptional cases.", - "acceptanceCriteria": [ - "Add a CLI soft-override flag, named --override-quarterly-cap or equivalent, to parseArgs() and CLIArgs in scripts/critic-genealogy.ts.", - "Count spawned critic specs from history//genealogy/spec.json in the rolling 13-week window ending at args.weekDate.", - "Block registration when the count is already 3 or more and the override flag is false.", - "Allow registration when the count is 3 or more only if the override flag is true, and print an explicit operator override message.", - "Ignore malformed or missing non-genealogy history directories only when they are irrelevant; malformed in-window genealogy spec data fails loudly with an actionable error.", - "Unit tests cover count 0, count 2 allow, count 3 block, count 3 with override allow, and boundary dates at exactly 13 weeks.", - "Type-check passes", - "Tests pass", - "bun run validate passes" - ], - "technicalNotes": "Extend scripts/critic-genealogy.ts parseArgs() at lines 72-105 and printUsage() at lines 107-115. Use writeArtifacts() output convention at lines 570-585: each spawn writes history//genealogy/spec.json. Gate the main flow before registerAgent() and createSession(). Add parseArgs tests beside scripts/__tests__/critic-genealogy.test.ts lines 44-75 and helper tests with temporary history fixtures.", - "dependsOn": [ - "US-001" - ], - "priority": 2, - "passes": true, - "notes": "Implemented in iteration 2. Files: scripts/critic-genealogy.ts, scripts/__tests__/critic-genealogy.test.ts." 
- }, - { - "id": "US-003", - "title": "Layer 4 archive idle spawned critics", - "description": "As a Webster operator, I want spawned critics with no promoted findings in 8 weeks archived so that inactive council members stop consuming weekly attention and tokens.", - "acceptanceCriteria": [ - "Add archive-on-idle logic in scripts/critic-genealogy.ts that evaluates spawned critics and moves idle specs from agents/.json to agents/archive/.json.", - "A spawned critic is archived only when it has 0 findings promoted across the last 8 weeks; critics with at least 1 promoted finding in that window remain active.", - "Original committed baseline critics are not archived by the idle rule unless history proves they were genealogy-spawned.", - "loadExistingCritics() continues to load only active agents/*-critic.json files and excludes agents/archive/*.json by path.", - "Archive actions create agents/archive/ if missing and preserve the JSON spec byte-for-byte except for formatting caused by existing JSON write conventions if needed.", - "Unit tests cover idle spawned critic archived, active spawned critic retained, original critic retained, and archived critic excluded from active critic summaries.", - "Type-check passes", - "Tests pass", - "bun run validate passes" - ], - "technicalNotes": "Implement in scripts/critic-genealogy.ts near loadExistingCritics() lines 155-168 and main startup before active critic summary logging. Use agents/*-critic.json naming shown by agents/brand-voice-critic.json and peers. Use genealogy provenance from history//genealogy/spec.json created by writeArtifacts() lines 570-585. Add tests in scripts/__tests__/critic-genealogy.test.ts next to loadExistingCritics tests at lines 78-90; use temporary fixture directories or exported pure helpers to avoid mutating real agents during tests.", - "dependsOn": [ - "US-001", - "US-002" - ], - "priority": 3, - "passes": true, - "notes": "Implemented in iteration 3. 
Files: scripts/critic-genealogy.ts, scripts/__tests__/critic-genealogy.test.ts." - } - ] -} diff --git a/.forge/ralph/genealogy-gov-v1/prd.md b/.forge/ralph/genealogy-gov-v1/prd.md deleted file mode 100644 index bf766f1..0000000 --- a/.forge/ralph/genealogy-gov-v1/prd.md +++ /dev/null @@ -1,172 +0,0 @@ -# Genealogy Governance Layers 2-4 — Product Requirements - -## Overview - -**Problem**: Webster can now spawn new critics at runtime, but without code-level governors the council can duplicate existing critic scopes, exceed a sensible growth rate, and keep idle critics in weekly runs indefinitely. That creates token-waste drift and weakens the demo claim that genealogy is controlled rather than chaotic. -**Solution**: Implement Q5.1 governance layers 2-4 in the existing genealogy registration path: embedding-based deduplication before registration, a 13-week cap with operator soft override, and archive-on-idle pruning for critics with no promoted findings across 8 weeks. -**Branch**: `ralph/genealogy-gov-v1` - ---- - -## Goals & Success - -### Primary Goal - -Bound runtime critic spawning while preserving legitimate, operator-overridable genealogy growth. 
- -### Success Metrics - -| Metric | Target | How Measured | -|--------|--------|--------------| -| Duplicate critic rejection | New critic specs with >=60% embedding cosine similarity to an existing critic are rejected before `POST /v1/agents` | Unit tests around `scripts/critic-genealogy.ts` registration path | -| Quarterly spawn cap | More than 3 new critics in any rolling 13-week window is blocked unless an operator override flag is present | Unit tests using historical `history/*/genealogy/spec.json` fixtures | -| Idle critic retirement | Spawned critics with 0 promoted findings over 8 weeks are moved to `agents/archive/` and excluded from active critic loading | Unit tests around archive-on-idle logic and `loadExistingCritics()` behavior | -| Validation | `bun run validate` passes with zero lint warnings | Project validation command | - -### Non-Goals (Out of Scope) - -- Layer 1 prompt rubric edits in `prompts/second-wbs-session.md` — explicitly deferred until `feat/orch-memory-planner-v2` PR #6 merges because that branch is actively modifying the same file. -- Redesigning planner or redesigner request schemas — Feature #55 scope is governance layers 2-4 only. -- Deleting retired critics from Git history or the Managed Agents API — Layer 4 archives local specs recoverably rather than destructive deletion. -- Building live embedding infrastructure beyond this path — the dedup check is local to `scripts/critic-genealogy.ts` new-critic registration. - ---- - -## User & Context - -### Target User - -- **Who**: Webster operator running weekly landing-page improvement sessions. -- **Role**: Maintains a council of Claude Managed Agents and reviews automated changes before submission or deployment. -- **Current Pain**: Runtime genealogy is powerful, but every extra critic is a recurring weekly cost. Duplicate or idle critics turn the council into an expensive echo chamber. - -### User Journey - -1. 
**Trigger**: Planner or genealogy detection identifies a possible unowned concern and `scripts/critic-genealogy.ts` prepares a new critic spec. -2. **Action**: The orchestrator-side genealogy script evaluates overlap, recent spawn count, and idle critic state before registering or invoking agents. -3. **Outcome**: Legitimate critics are registered and invoked; duplicate or over-cap critics are blocked with explicit evidence; idle spawned critics are archived before future council runs. - ---- - -## UX Requirements - -### Interaction Model - -This is backend/CLI orchestration. The primary interface remains: - -```bash -bun scripts/critic-genealogy.ts --branch [--week YYYY-MM-DD] [--lp-target URL] [--dry-run] -bun scripts/critic-genealogy.ts --fixtures [--week YYYY-MM-DD] [--lp-target URL] [--dry-run] -``` - -Layer 3 adds an operator soft-override flag, for example `--override-quarterly-cap`, that allows a human-approved spawn when the 13-week cap has already been reached. Layer 4 archive-on-idle should run from the same script before active critic loading/registration so archived critics are not considered active council members. 
- -### States to Handle - -| State | Description | Behavior | -|-------|-------------|----------| -| Empty | No spawned genealogy history or no archived critics yet | Dedup still compares against current `agents/*-critic.json`; cap count is 0; retire pass no-ops | -| Loading | Embedding similarity or API-backed registration is in progress | Script prints explicit progress and continues existing fail-fast error behavior | -| Error | Embedding request fails, malformed history exists, archive move fails, or cap blocks without override | Script exits non-zero for operational errors; governance blocks print actionable reason and skip registration | -| Success | New spec is below 60% overlap, under cap or operator-overridden, and idle critics are archived | Script registers/invokes as today and writes artifacts; archive pass moves idle specs to `agents/archive/` | - ---- - -## Technical Context - -### Patterns to Follow - -- **Similar implementation**: `scripts/critic-genealogy.ts:155-168` — active critics are discovered from `agents/*-critic.json`; Layer 4 should exclude `agents/archive/` by keeping archived files outside this glob. -- **Similar implementation**: `scripts/critic-genealogy.ts:457-477` — `registerAgent()` is the correct choke point before `POST /v1/agents`; Layer 2 and Layer 3 checks should run before this call. -- **Similar implementation**: `scripts/critic-genealogy.ts:570-585` — `writeArtifacts()` records genealogy specs under `history//genealogy/`; Layer 3 can count recent spawns from these artifacts. -- **Component pattern**: `scripts/critic-genealogy.ts:72-105` — CLI flags are parsed with explicit mutually-exclusive validation and `CLIError`; add the soft-override flag here. -- **Error handling pattern**: `scripts/critic-genealogy.ts:141-152` and `scripts/critic-genealogy.ts:339-356` — invalid state fails loudly with clear error messages, no silent fallback. 
-- **Test pattern**: `scripts/__tests__/critic-genealogy.test.ts:44-75` — CLI parsing tests assert accepted and rejected flags. -- **Test pattern**: `scripts/__tests__/critic-genealogy.test.ts:78-90` — active critic loading behavior is unit-tested directly. -- **Test pattern**: `scripts/__tests__/critic-genealogy.test.ts:160-203` — generated agent JSON behavior is tested with direct helpers and schema validation. - -### Types & Interfaces - -```typescript -interface NewCriticSpec { - name: string; - scope: string; - description: string; - rationale: string; - focus_owned: string[]; - focus_not_owned: string[]; - severity_rubric: string; -} - -interface AgentJSON { - name: string; - description: string; - model: string; - system: string; - tools: unknown[]; - mcp_servers?: unknown[]; - metadata?: Record; -} - -interface CriticSummary { - name: string; - scope: string; - description: string; -} - -interface CLIArgs { - branch: string | null; - fixtures: string | null; - weekDate: string; - lpTarget: string; - dryRun: boolean; - // add: overrideQuarterlyCap: boolean; -} -``` - -### Architecture Notes - -- Feature #55 is governed by `context/FEATURES.md:170` and Q5.1 in `context/DOMAIN-MODEL.md:303-333`; use the user's updated thresholds for this PRD: 60% cosine overlap, max 3 critics per 13 weeks, and 0 promoted findings in 8 weeks. -- Existing critic specs live in `agents/*-critic.json`; active critics include the five original critics plus `visual-design-critic.json` if present. -- Spawn artifacts live under `history//genealogy/spec.json`, created by `writeArtifacts()`. -- Registration currently happens through `registerAgent()` after `spliceNewSpec()` creates an `AgentJSON`; governance should block before remote agent creation and before session creation. -- Promoted findings evidence should come from existing history artifacts where available. If implementation needs a source of truth, prefer explicit history rows over inferring from current findings text. 
-- Validation follows `CLAUDE.md`: zero lint warnings, full type check, format check, tests, and `bun run validate` before declaring done. - ---- - -## Implementation Summary - -### Story Overview - -| ID | Title | Priority | Dependencies | -|----|-------|----------|--------------| -| US-001 | Layer 2 embedding dedup blocks overlapping critic specs | 1 | — | -| US-002 | Layer 3 13-week cap with operator soft override | 2 | US-001 | -| US-003 | Layer 4 archive idle spawned critics | 3 | US-001, US-002 | - -### Dependency Graph - -```text -US-001 (dedup guard before registration) - ↓ -US-002 (rolling 13-week cap + soft override) - ↓ -US-003 (archive-on-idle pruning) -``` - ---- - -## Validation Requirements - -Every story must pass: - -- [ ] Type-check: `bun run type-check` -- [ ] Lint: `bun run lint --max-warnings 0` -- [ ] Tests: `bun run test` -- [ ] Format: `bun run format:check` -- [ ] Full project gate: `bun run validate` - ---- - -Generated: 2026-04-24T00:00:00.000Z diff --git a/.forge/ralph/genealogy-gov-v1/progress.txt b/.forge/ralph/genealogy-gov-v1/progress.txt deleted file mode 100644 index 41625c8..0000000 --- a/.forge/ralph/genealogy-gov-v1/progress.txt +++ /dev/null @@ -1,88 +0,0 @@ -## Codebase Patterns - -### Archive-on-idle uses explicit decision owners -- **Where**: `scripts/critic-genealogy.ts` -- **Pattern**: Treat `history//genealogy/spec.json` as spawn provenance and `history//decision.json:selected_issues[].owner` as promoted-finding evidence; move agent specs with `renameSync` to preserve bytes. -- **Example**: `archiveIdleSpawnedCritics(agentsDir, historyRoot, weekDate)` runs before `loadExistingCritics()` in `main()`. - -### Governance helpers stay pure and injectable -- **Where**: `scripts/critic-genealogy.ts` -- **Pattern**: Put governance decisions in exported pure helpers and inject data providers, then call the helper in `main()` immediately before the side-effecting API boundary. 
-- **Example**: `evaluateCriticDedup(candidate, critics, embed)` runs before `registerAgent()` and is tested with deterministic vectors. - ---- - -## 2026-04-24T00:00:00.000Z — US-001: Layer 2 embedding dedup blocks overlapping critic specs - -**Status**: PASSED -**Files changed**: -- `scripts/critic-genealogy.ts` — added cosine-similarity dedup helper over candidate/existing critic scope and description, governance rejection output, and pre-registration gate. -- `scripts/__tests__/critic-genealogy.test.ts` — added no-network unit coverage for below-threshold allow, exact 0.60 reject, and above-threshold reject behavior. -- `.forge/ralph/genealogy-gov-v1/prd.md` — fixed markdownlint MD036 formatting so `bun run validate` can pass. -- `.forge/ralph/genealogy-gov-v1/prd.json` — marked US-001 complete. - -**Acceptance criteria verified**: -- [x] Add an orchestrator-side dedup helper in `scripts/critic-genealogy.ts` that compares a candidate `NewCriticSpec` against active `CriticSummary` entries using embedding cosine similarity over scope and description text. -- [x] Reject candidate specs with cosine similarity >= 0.60 to any active critic before `registerAgent()` performs `POST /v1/agents`. -- [x] Governance rejection prints the closest existing critic name, similarity score, and candidate scope without registering an agent or creating a session. -- [x] Dry-run mode still runs and reports the dedup decision before printing a would-register spec. -- [x] Unit tests cover below-threshold allow, exactly-0.60 reject, and above-threshold reject behavior without live Anthropic network calls. -- [x] Type-check passes. -- [x] Tests pass. -- [x] `bun run validate` passes. - -**Learnings**: -- `bun run validate` includes markdownlint over `.forge/ralph/**/*.md`; generated PRD footer emphasis triggered MD036 and had to be normalized. 
-- The local package has no `cli` script, so Ralph workflow event emits fail harmlessly with `Script not found "cli"` under the required `|| true` guard. - ---- -## 2026-04-24T08:29:55.000Z — US-002: Layer 3 13-week cap with operator soft override - -**Status**: PASSED -**Files changed**: -- `scripts/critic-genealogy.ts` — added `--override-quarterly-cap`, rolling 13-week genealogy spawn counting, malformed in-window spec validation, and pre-registration cap gate. -- `scripts/__tests__/critic-genealogy.test.ts` — added no-network unit coverage for count 0, count 2 allow, count 3 block, count 3 override allow, exactly-13-week boundary inclusion, and malformed in-window genealogy spec failure. -- `.forge/ralph/genealogy-gov-v1/prd.json` — marked US-002 complete. - -**Acceptance criteria verified**: -- [x] Add a CLI soft-override flag, named `--override-quarterly-cap` or equivalent, to `parseArgs()` and `CLIArgs` in `scripts/critic-genealogy.ts`. -- [x] Count spawned critic specs from `history//genealogy/spec.json` in the rolling 13-week window ending at `args.weekDate`. -- [x] Block registration when the count is already 3 or more and the override flag is false. -- [x] Allow registration when the count is 3 or more only if the override flag is true, and print an explicit operator override message. -- [x] Ignore malformed or missing non-genealogy history directories only when they are irrelevant; malformed in-window genealogy spec data fails loudly with an actionable error. -- [x] Unit tests cover count 0, count 2 allow, count 3 block, count 3 with override allow, and boundary dates at exactly 13 weeks. -- [x] Type-check passes. -- [x] Tests pass. -- [x] `bun run validate` passes. - -**Learnings**: -- The registration choke point now has two pure governance gates before API side effects: dedup first, then quarterly cap before `registerAgent()` and `createSession()`. 
-- Treat the 13-week boundary as inclusive: a spawn exactly 91 days before `args.weekDate` counts toward the cap. -- Non-date history directories can be ignored; in-window `genealogy/spec.json` files must parse as agent-like JSON so broken genealogy artifacts fail loudly. - ---- -## 2026-04-24T09:15:00.000Z — US-003: Layer 4 archive idle spawned critics - -**Status**: PASSED -**Files changed**: -- `scripts/critic-genealogy.ts` — added genealogy-spawn provenance loading, promoted-finding owner parsing from decision history, archive-on-idle moves to `agents/archive/`, and startup pruning before active critic loading. -- `scripts/__tests__/critic-genealogy.test.ts` — added unit coverage for idle spawned critic archival, active spawned critic retention, original critic retention, archived critic exclusion from active summaries, promoted owner parsing, and spawned provenance loading. -- `.forge/ralph/genealogy-gov-v1/prd.json` — marked US-003 complete. - -**Acceptance criteria verified**: -- [x] Add archive-on-idle logic in `scripts/critic-genealogy.ts` that evaluates spawned critics and moves idle specs from `agents/.json` to `agents/archive/.json`. -- [x] A spawned critic is archived only when it has 0 findings promoted across the last 8 weeks; critics with at least 1 promoted finding in that window remain active. -- [x] Original committed baseline critics are not archived by the idle rule unless history proves they were genealogy-spawned. -- [x] `loadExistingCritics()` continues to load only active `agents/*-critic.json` files and excludes `agents/archive/*.json` by path. -- [x] Archive actions create `agents/archive/` if missing and preserve the JSON spec byte-for-byte except for formatting caused by existing JSON write conventions if needed. -- [x] Unit tests cover idle spawned critic archived, active spawned critic retained, original critic retained, and archived critic excluded from active critic summaries. -- [x] Type-check passes. -- [x] Tests pass. 
-- [x] `bun run validate` passes. - -**Learnings**: -- Spawned-critic provenance should come from `history//genealogy/spec.json`, not from current agent names alone, so baseline critics are safe by default. -- Promoted-finding evidence is explicit in `decision.json:selected_issues[].owner`; missing decision files mean no promoted findings for that week, while malformed present decision files fail loudly. -- `renameSync` preserves archived agent JSON bytes and avoids rewriting specs during governance pruning. - ---- diff --git a/.forge/ralph/orch-memory-planner-v1/prd.json b/.forge/ralph/orch-memory-planner-v1/prd.json deleted file mode 100644 index 74f15e8..0000000 --- a/.forge/ralph/orch-memory-planner-v1/prd.json +++ /dev/null @@ -1,72 +0,0 @@ -{ - "project": "Webster", - "branchName": "feat/orch-memory-planner-v1", - "prdFile": "prd.md", - "description": "Add the orchestrator step that marshals memory + verdict + monitor context, invokes webster-planner via the Managed Agents flow, parses the JSON response, writes history//plan.md, and appends a verdict-ready event to history/memory.jsonl.", - "userStories": [ - { - "id": "US-001", - "title": "Memory marshaling helper", - "description": "As a Webster operator, I want a pure TypeScript helper that reads memory.jsonl tail plus recent verdicts plus the monitor anomaly report and returns a single concatenated user.message text so that the planner always receives the same shape of context.", - "acceptanceCriteria": [ - "Add scripts/planner-context.ts exporting marshalPlannerContext(opts: { memoryPath: string; verdictDir: string; monitorPath: string; tailN?: number }): string.", - "The function uses the feature #51 tailN helper from src/memory (or scripts/memory) to read the last N=50 events from memoryPath; it does not re-implement tail logic.", - "The function reads the two most recent history//verdict.json files under verdictDir sorted by week slug descending; missing verdict files are skipped without throwing.", - 
"The function reads the monitor anomaly report text file at monitorPath; a missing monitor file is skipped without throwing.", - "The returned string begins with a MEMORY_TAIL section, then a RECENT_VERDICTS section, then a MONITOR_ANOMALIES section, each delimited by a stable header the test file can match.", - "When all three inputs are empty or missing, the function returns a string that explicitly labels the cold-start state rather than an empty string.", - "Add a Bun test under scripts/__tests__/planner-context.test.ts with fixtures under tmp paths verifying: tailN wiring, two-verdict ordering, missing-file skips, and cold-start labeling.", - "bun run type-check, bun run lint --max-warnings 0, bun test, bun run format:check pass." - ], - "technicalNotes": "Follow the style in scripts/critic-genealogy.ts and scripts/memory.ts (or src/memory). Use readFileSync + path.join; do not use async file APIs unless the existing modules already do. Keep the function free of network I/O. Reuse the feature #51 export rather than re-reading JSONL lines directly.", - "dependsOn": [], - "priority": 1, - "passes": true, - "notes": "Implemented in iteration 1. Files: scripts/planner-context.ts, scripts/__tests__/planner-context.test.ts." 
- }, - { - "id": "US-002", - "title": "Planner invocation + plan writer", - "description": "As a Webster operator, I want a helper that invokes the webster-planner Managed Agent using the marshaled context, parses the structured response, writes history//plan.md, and appends a verdict-ready row to history/memory.jsonl so that downstream critics can consume the plan.", - "acceptanceCriteria": [ - "Add scripts/planner-invoke.ts exporting invokePlanner(opts: { contextText: string; week: string; historyDir: string; apiKey: string }): Promise<{ planPath: string; plan: PlanRecord }>.", - "The function looks up or registers the webster-planner agent via POST /v1/agents, mirroring the find-or-register pattern in scripts/critic-genealogy.ts:440-556.", - "The function creates a session via POST /v1/sessions, sends contextText as the user.message event, and polls until the session is idle.", - "The function extracts the final assistant text and parses it as JSON with fields classification, next_action, direction_hint, optional new_critic_request, and rationale; next_action must be one of promote_and_experiment, hold_baseline, revert_and_retry, explore_broadly.", - "The function writes history//plan.md as human-readable markdown that embeds the parsed JSON in a fenced code block and prints the parsed fields as a bulleted summary above the fence.", - "The function appends one event row to history/memory.jsonl with event = 'verdict-ready' using the feature #51 appendEvent helper and includes refs.plan = relative path to the written plan.md.", - "Invalid JSON, missing required fields, or unknown next_action values raise an Error with a descriptive message and do NOT write plan.md or append to memory.jsonl.", - "Add a Bun test under scripts/__tests__/planner-invoke.test.ts that mocks fetch (global.fetch or bun:test mock) to exercise: happy path with a valid JSON response, malformed response rejection, and the memory.jsonl append side effect.", - "bun run type-check, bun run lint 
--max-warnings 0, bun test, bun run format:check pass." - ], - "technicalNotes": "Reuse helper names / patterns from scripts/critic-genealogy.ts (registerAgent, createSession, sendUserMessage, pollUntilIdle) rather than duplicating low-level fetch code. Keep the HTTP base URL configurable via env (ANTHROPIC_API_BASE defaulting to https://api.anthropic.com). Import appendEvent from the same module feature #51 added. Do NOT edit agents/webster-planner.json (owned by feature #50) and do NOT edit prompts/second-wbs-session.md in this story (US-003 owns that).", - "dependsOn": [ - "US-001" - ], - "priority": 2, - "passes": true, - "notes": "Implemented in iteration 2. Files: scripts/planner-invoke.ts, scripts/__tests__/planner-invoke.test.ts." - }, - { - "id": "US-003", - "title": "Orchestrator step in prompts/second-wbs-session.md", - "description": "As a Webster operator, I want a new orchestration step in prompts/second-wbs-session.md that runs BEFORE critic fan-out and calls the US-002 helper with US-001 inputs so that the planner's plan.md is ready for critics to read.", - "acceptanceCriteria": [ - "Edit prompts/second-wbs-session.md to add a new numbered step titled 'Run planner' placed BEFORE the critic fan-out step.", - "The step shows the bash/bun invocation that marshals context via scripts/planner-context.ts and invokes the planner via scripts/planner-invoke.ts, with the week argument set to the current ISO week folder name under history/.", - "The step specifies that on planner error the run halts with a non-zero exit status and a pointer to the error message.", - "The step references history//plan.md as the output artifact consumed by later steps.", - "Update README.md or an adjacent doc section if the prior council flow explicitly enumerated the steps, so the step count remains accurate.", - "bun run validate passes." - ], - "technicalNotes": "Edit prompts/second-wbs-session.md only \u2014 do not change orchestrator-owned I/O in the helpers. 
The step should read like the existing numbered steps: plain bash with comments and exit-on-error semantics. Do not implement runtime invocation of critics from this file; that remains in the later fan-out step.", - "dependsOn": [ - "US-001", - "US-002" - ], - "priority": 3, - "passes": true, - "notes": "Implemented in iteration 3. Files: prompts/second-wbs-session.md, README.md." - } - ] -} diff --git a/.forge/ralph/orch-memory-planner-v1/prd.md b/.forge/ralph/orch-memory-planner-v1/prd.md deleted file mode 100644 index d50e0c7..0000000 --- a/.forge/ralph/orch-memory-planner-v1/prd.md +++ /dev/null @@ -1,65 +0,0 @@ -# Orchestrator Memory Marshaling + Planner Invocation (L11 #52) - -## Problem - -Webster's council flow fans out critics + redesigner, but week-over-week -learning currently has no explicit planner step. Feature #50 ships the -`webster-planner` Managed Agent spec. Feature #51 ships the -`history/memory.jsonl` event substrate + `appendEvent` / `tailN` helpers. -This feature wires the two together: an orchestrator step that runs BEFORE -critic fan-out, invokes the planner, writes `plan.md`, and logs the event. - -Per ADR-0001 the orchestrator owns all JSONL I/O. The planner agent never -touches disk — it receives marshaled context as `user.message` text and -returns structured output the orchestrator parses. - -## Scope - -- Add a TypeScript helper module that marshals the planner's input context. -- Add a TypeScript helper module that invokes the planner via the - Anthropic Agents Managed-Agents flow and writes the decoded `plan.md`. -- Add a new orchestration step to `prompts/second-wbs-session.md` that - calls the helpers BEFORE the critic fan-out step. - -Out of scope (covered by separate features): - -- Plan → critic context wiring (#53). -- Cold-start explore-broadly defaults (#54) — this feature must not - crash when memory tail is empty, but the dedicated cold-start logic is #54. -- Critic-genealogy invocation of `new_critic_request` (#55). 
- -## Invariants - -- Orchestrator-owned I/O. No disk writes from inside the planner agent - prompt or tool definitions. -- Append-only `history/memory.jsonl`. Use the `appendEvent` helper from - feature #51. Never mutate prior rows. -- Zero lint warnings. `bun run validate` must pass. -- No silent fallbacks. If the planner call fails or returns unparseable - output, surface the error — do not fabricate a plan. -- No API keys in committed code. Load from environment. - -## Stories - -### US-001 — Memory marshaling helper - -Add `scripts/planner-context.ts` exporting a pure function that reads the -last N memory events plus recent verdict files plus the monitor anomaly -report and returns a single concatenated text payload suitable for the -planner's `user.message`. - -### US-002 — Planner invocation + plan writer - -Add `scripts/planner-invoke.ts` exporting a function that registers the -`webster-planner` agent (idempotent lookup), creates a session, sends the -marshaled user message, polls until idle, parses the planner's JSON -response, writes `history//plan.md`, and appends a `verdict-ready` -row to `history/memory.jsonl` via the feature #51 helper. - -### US-003 — Orchestrator integration step - -Edit `prompts/second-wbs-session.md` to add a new numbered step that runs -BEFORE the critic fan-out step. The step invokes the helper from US-002 -using the marshaled context from US-001, writes `plan.md` into the current -week's `history//` directory, and halts the run if the planner call -returns an error. 
diff --git a/.forge/ralph/orch-memory-planner-v1/progress.txt b/.forge/ralph/orch-memory-planner-v1/progress.txt deleted file mode 100644 index 09c3091..0000000 --- a/.forge/ralph/orch-memory-planner-v1/progress.txt +++ /dev/null @@ -1,81 +0,0 @@ -## Codebase Patterns - -### Planner invocation fails closed before disk writes -- **Where**: `scripts/planner-invoke.ts` -- **Pattern**: Parse and validate the final assistant JSON before creating `history//plan.md` or appending to `history/memory.jsonl`; failed planner output leaves no partial plan artifact. -- **Example**: `const plan = parsePlanRecord(extractFinalAssistantText(snapshot));` - -### Orchestrator memory helpers import the substrate directly -- **Where**: `scripts/planner-context.ts` -- **Pattern**: Higher-level orchestrator helpers should import `tailN` from `scripts/memory.ts` instead of re-reading JSONL lines. -- **Example**: `const memoryEvents = readMemoryTail(opts.tailN ?? DEFAULT_TAIL_N, opts.memoryPath);` - ---- - -## 2026-04-24 — US-001: Memory marshaling helper - -**Status**: PASSED -**Files changed**: -- `scripts/planner-context.ts` — added `marshalPlannerContext` with memory tail, recent verdict, monitor anomaly, and cold-start sections. -- `scripts/__tests__/planner-context.test.ts` — added Bun coverage for tailN wiring, verdict ordering, missing-file skips, and cold-start labeling. - -**Acceptance criteria verified**: -- [x] Add scripts/planner-context.ts exporting marshalPlannerContext(opts: { memoryPath: string; verdictDir: string; monitorPath: string; tailN?: number }): string. -- [x] The function uses the feature #51 tailN helper from src/memory (or scripts/memory) to read the last N=50 events from memoryPath; it does not re-implement tail logic. -- [x] The function reads the two most recent history//verdict.json files under verdictDir sorted by week slug descending; missing verdict files are skipped without throwing. 
-- [x] The function reads the monitor anomaly report text file at monitorPath; a missing monitor file is skipped without throwing. -- [x] The returned string begins with a MEMORY_TAIL section, then a RECENT_VERDICTS section, then a MONITOR_ANOMALIES section, each delimited by a stable header the test file can match. -- [x] When all three inputs are empty or missing, the function returns a string that explicitly labels the cold-start state rather than an empty string. -- [x] Add a Bun test under scripts/__tests__/planner-context.test.ts with fixtures under tmp paths verifying: tailN wiring, two-verdict ordering, missing-file skips, and cold-start labeling. -- [x] bun run type-check, bun run lint --max-warnings 0, bun test, bun run format:check pass. - -**Learnings**: -- `scripts/memory.ts` exposes `tailN(n, logPath)`, so callers pass the count first and the memory path second. -- No prior `progress.txt` existed for this PRD directory, so this iteration created it with the reusable pattern section. - ---- - -## 2026-04-24 — US-002: Planner invocation + plan writer - -**Status**: PASSED -**Files changed**: -- `scripts/planner-invoke.ts` — added `invokePlanner` with Managed Agent lookup/registration, session creation, context message send, idle polling, strict plan JSON parsing, plan markdown writing, and `verdict-ready` memory append. -- `scripts/__tests__/planner-invoke.test.ts` — added Bun fetch-mock coverage for a valid planner response, malformed JSON rejection, unknown `next_action` rejection, and memory append side effects. - -**Acceptance criteria verified**: -- [x] Add scripts/planner-invoke.ts exporting invokePlanner(opts: { contextText: string; week: string; historyDir: string; apiKey: string }): Promise<{ planPath: string; plan: PlanRecord }>. -- [x] The function looks up or registers the webster-planner agent via POST /v1/agents, mirroring the find-or-register pattern in scripts/critic-genealogy.ts:440-556. 
-- [x] The function creates a session via POST /v1/sessions, sends contextText as the user.message event, and polls until the session is idle. -- [x] The function extracts the final assistant text and parses it as JSON with fields classification, next_action, direction_hint, optional new_critic_request, and rationale; next_action must be one of promote_and_experiment, hold_baseline, revert_and_retry, explore_broadly. -- [x] The function writes history//plan.md as human-readable markdown that embeds the parsed JSON in a fenced code block and prints the parsed fields as a bulleted summary above the fence. -- [x] The function appends one event row to history/memory.jsonl with event = 'verdict-ready' using the feature #51 appendEvent helper and includes refs.plan = relative path to the written plan.md. -- [x] Invalid JSON, missing required fields, or unknown next_action values raise an Error with a descriptive message and do NOT write plan.md or append to memory.jsonl. -- [x] Add a Bun test under scripts/__tests__/planner-invoke.test.ts that mocks fetch (global.fetch or bun:test mock) to exercise: happy path with a valid JSON response, malformed response rejection, and the memory.jsonl append side effect. -- [x] bun run type-check, bun run lint --max-warnings 0, bun test, bun run format:check pass. - -**Learnings**: -- `environments/webster-council-env.id` is available in this worktree, so planner session creation can mirror genealogy sessions by passing `environment_id` without expanding the public `invokePlanner` options. -- `agents/webster-planner.json` is absent in this branch even though feature #50 is marked landed elsewhere; the helper supports registration when the spec exists and reuses an already-registered planner without reading the missing file. -- `bun run cli` is not defined in this repo, so Ralph workflow event emissions failed harmlessly under the required `|| true` wrapper. 
- ---- -## 2026-04-24 — US-003: Orchestrator step in prompts/second-wbs-session.md - -**Status**: PASSED -**Files changed**: -- `prompts/second-wbs-session.md` — added fail-closed Step 3 planner orchestration before critic fan-out, marshaling context through `scripts/planner-context.ts`, invoking `scripts/planner-invoke.ts`, committing `history//plan.md` + `history/memory.jsonl`, and passing `PLAN_PATH` to downstream agents. -- `README.md` — updated architecture diagram, weekly-flow step count, runtime estimate, and weekly-run summary to include the planner step. - -**Acceptance criteria verified**: -- [x] Edit prompts/second-wbs-session.md to add a new numbered step titled 'Run planner' placed BEFORE the critic fan-out step. -- [x] The step shows the bash/bun invocation that marshals context via scripts/planner-context.ts and invokes the planner via scripts/planner-invoke.ts, with the week argument set to the current ISO week folder name under history/. -- [x] The step specifies that on planner error the run halts with a non-zero exit status and a pointer to the error message. -- [x] The step references history//plan.md as the output artifact consumed by later steps. -- [x] Update README.md or an adjacent doc section if the prior council flow explicitly enumerated the steps, so the step count remains accurate. -- [x] bun run validate passes. - -**Learnings**: -- The weekly runner uses `WEEK_DATE=$(date -u +%Y-%m-%d)` as its history folder slug, so the planner step reuses that existing ISO-8601 UTC folder naming pattern instead of introducing a second week format. -- `bun --eval` receives user arguments at `process.argv.slice(1)`, which keeps the prompt-only invocation small without adding CLI code to the helper modules. 
- ---- diff --git a/.forge/ralph/planner-agent-spec-v5/prd.json b/.forge/ralph/planner-agent-spec-v5/prd.json deleted file mode 100644 index 4328b68..0000000 --- a/.forge/ralph/planner-agent-spec-v5/prd.json +++ /dev/null @@ -1,69 +0,0 @@ -{ - "project": "Webster", - "branchName": "ralph/planner-agent-spec-v5", - "prdFile": "prd.md", - "description": "Add a schema-valid Opus 4.7 webster-planner Managed Agent spec and tests for its plan.md contract and registration-flow shape.", - "userStories": [ - { - "id": "US-001", - "title": "Add schema-valid planner Managed Agent spec", - "description": "As a Webster implementation operator, I want agents/webster-planner.json to exist as an Opus 4.7 Managed Agent spec so that later orchestration can register and invoke the planner.", - "acceptanceCriteria": [ - "Create agents/webster-planner.json with name \"webster-planner\", model \"claude-opus-4-7\", and required fields accepted by scripts/schemas/agent.schema.json.", - "The spec uses field \"system\" and does not include rejected fields such as \"system_prompt\" or \"callable_agents\".", - "The system prompt states that user.message supplies marshaled memory context: memory.jsonl tail, last two weeks verdict context, and monitor anomaly report.", - "The system prompt defines the plan.md JSON fields: classification, next_action, direction_hint, optional new_critic_request, and rationale.", - "The next_action enum in the system prompt includes exactly promote_and_experiment, hold_baseline, revert_and_retry, and explore_broadly.", - "The system prompt instructs cold-start/week-1/no-prior-verdict handling to use explore_broadly.", - "Feature #52 and #53 behavior is not implemented in this story.", - "bun run validate:agents passes" - ], - "technicalNotes": "Modify agents/webster-planner.json. Mirror the spec shape in agents/webster-redesigner.json and agents/webster-monitor.json: name, description, model, system, tools, optional mcp_servers, metadata. 
Follow scripts/schemas/agent.schema.json constraints: required name/description/model/system/tools, no additional properties, model enum includes claude-opus-4-7, metadata.role must be one of critic/monitor/redesigner/orchestrator. Use metadata { role: \"orchestrator\", scope: \"planning\" } because the schema does not currently allow role \"planner\". Scope guard: do not edit prompts/second-wbs-session.md, scripts/memory.ts, or council fan-out code for runtime invocation.", - "dependsOn": [], - "priority": 1, - "passes": true, - "notes": "Implemented in iteration 1. Files: agents/webster-planner.json, .forge/ralph/planner-agent-spec-v5/prd.md." - }, - { - "id": "US-002", - "title": "Add planner output contract tests", - "description": "As a Webster implementation operator, I want tests for the planner's plan.md contract so that future orchestration can rely on stable fields and action values.", - "acceptanceCriteria": [ - "Add a Bun test that reads agents/webster-planner.json and asserts its system prompt contains all required output fields: classification, next_action, direction_hint, new_critic_request, and rationale.", - "Add a Bun test that asserts the planner system prompt contains all four allowed next_action values: promote_and_experiment, hold_baseline, revert_and_retry, explore_broadly.", - "Add a Bun test that asserts the planner system prompt describes cold-start behavior for week 1/no prior verdict and ties it to explore_broadly.", - "Add a Bun test that asserts the planner system prompt names all three input context sources: memory.jsonl, verdict, and monitor anomaly report or alerts.", - "Tests fail if agents/webster-planner.json is missing or invalid JSON.", - "bun test passes" - ], - "technicalNotes": "Add tests under scripts/__tests__ using the existing Bun style in scripts/__tests__/validate-agents.test.ts and scripts/__tests__/critic-genealogy.test.ts: import { describe, expect, test } from \"bun:test\", read JSON with readFileSync, and resolve 
ROOT via import.meta.dir. Keep tests focused on the agent spec contract; do not create runtime planner invocation helpers because feature #52 owns invocation.", - "dependsOn": [ - "US-001" - ], - "priority": 2, - "passes": true, - "notes": "Implemented in iteration 2. Files: scripts/__tests__/planner-agent-contract.test.ts." - }, - { - "id": "US-003", - "title": "Add registration-flow guard tests and validate", - "description": "As a Webster implementation operator, I want tests that guard the Managed Agents registration shape so that the planner can be posted to /v1/agents and invoked through the existing session pattern later.", - "acceptanceCriteria": [ - "Add or extend a test to validate agents/webster-planner.json against scripts/schemas/agent.schema.json with AJV 2020, matching scripts/__tests__/validate-agents.test.ts patterns.", - "Add a test that asserts the planner spec has registration-compatible top-level fields only and no research-preview callable_agents field.", - "Add a test or assertion that the planner spec includes tools with type agent_toolset_20260401, matching the Managed Agents beta pattern in existing agent specs.", - "Add a test assertion or technical note in the test name referencing the existing registration/session flow in scripts/critic-genealogy.ts: find/register agent, create session, send user.message, poll until idle.", - "Run bun run format:check, bun run type-check, bun run lint --max-warnings 0, bun run validate:agents, bun test, and bun run validate before declaring completion.", - "Do not implement scripts that call /v1/sessions or write history//plan.md; that remains feature #52." - ], - "technicalNotes": "Use the same AJV setup as scripts/__tests__/validate-agents.test.ts: Ajv2020 from ajv/dist/2020.js plus addFormats.default(ajv). 
Registration flow references should be grounded in scripts/critic-genealogy.ts:440-556, where registerAgent POSTs to /v1/agents, createSession POSTs to /v1/sessions, sendUserMessage POSTs to /sessions/{id}/events, and pollUntilIdle reads /sessions/{id}. This story should only test that the planner spec is compatible with that flow, not duplicate or export those helpers.", - "dependsOn": [ - "US-001", - "US-002" - ], - "priority": 3, - "passes": true, - "notes": "Implemented in iteration 3. Files: scripts/__tests__/planner-agent-contract.test.ts." - } - ] -} diff --git a/.forge/ralph/planner-agent-spec-v5/prd.md b/.forge/ralph/planner-agent-spec-v5/prd.md deleted file mode 100644 index 9ae0459..0000000 --- a/.forge/ralph/planner-agent-spec-v5/prd.md +++ /dev/null @@ -1,151 +0,0 @@ -# Planner Agent Spec — Product Requirements - -## Overview - -**Problem**: Webster's weekly council can critique and redesign, but Layer 11 needs an experiment-aware planning brain before the critics run. Without a schema-valid `webster-planner` Managed Agent spec, later orchestration work (#52) has no registered agent to invoke and no stable `plan.md` contract to hand to critics (#53). -**Solution**: Add `agents/webster-planner.json` as an Opus 4.7 Managed Agent spec that matches the existing Managed Agents beta schema, reads marshaled memory context supplied by the orchestrator, and emits a `plan.md` containing a strict JSON object with `classification`, `next_action`, `direction_hint`, optional `new_critic_request`, and `rationale`. -**Branch**: `ralph/planner-agent-spec-v5` - ---- - -## Goals & Success - -### Primary Goal - -Ship the planner agent spec and tests that prove it is schema-valid and aligned with Webster's registration/invocation pattern, without implementing the later orchestrator memory marshaling or council integration features. 
- -### Success Metrics - -| Metric | Target | How Measured | -|--------|--------|--------------| -| Managed Agent schema validity | `agents/webster-planner.json` passes the committed schema | `bun run validate:agents` and `bun test` | -| Planner output contract coverage | Tests verify required `plan.md` JSON fields and `next_action` enum values | New/updated Bun tests | -| Registration-flow alignment | Tests assert planner uses `POST /v1/agents`-compatible fields and no research-preview fields | New/updated Bun tests referencing existing schema and critic-genealogy flow | -| Scope containment | No orchestrator prompt, memory helper, or council fan-out implementation changes | Git diff review | - -### Non-Goals (Out of Scope) - -- Implementing orchestrator memory marshaling or planner invocation — explicitly owned by feature #52. -- Passing `plan.md` into critics/redesigner or spawning genealogy from planner output — explicitly owned by feature #53. -- Building cold-start orchestration behavior beyond planner spec instructions — feature #54 owns runtime cold-start plumbing. -- Changing the Managed Agent schema shape unless strictly required for the new `orchestrator` metadata role already allowed by `scripts/schemas/agent.schema.json`. - ---- - -## User & Context - -### Target User - -- **Who**: Webster implementation operators preparing the Layer 11 planner + experiment-aware council. -- **Role**: They maintain Managed Agent specs, validation gates, and orchestration scripts for the hackathon submission. -- **Current Pain**: Later features cannot safely invoke a planner because there is no registered-agent spec or tested `plan.md` output contract. - -### User Journey - -1. **Trigger**: Operator picks feature #50 from `context/FEATURES.md` and needs a schema-valid planner agent spec. -2. **Action**: Operator adds `agents/webster-planner.json`, runs validation/tests, and confirms it follows the beta Managed Agents registration shape. -3. 
**Outcome**: Feature #52 can register/invoke this planner via `/v1/agents`, `/v1/sessions`, events, and polling, then persist the returned `plan.md`. - ---- - -## UX Requirements - -### Interaction Model - -Backend/spec-only. Users do not interact with UI. The planner is registered through the same Managed Agents beta API shape used by existing specs and later invoked by orchestration code using the five-step pattern visible in `scripts/critic-genealogy.ts:440-556`: find/register agent, create session, send `user.message`, poll session status, inspect output. - -### States to Handle - -| State | Description | Behavior | -|-------|-------------|----------| -| Empty | Memory tail and prior verdicts are absent in week 1 | Planner instructions must choose `next_action: "explore_broadly"` and explain cold-start classification. | -| Loading | Runtime session is polling after a planner `user.message` | Out of scope for #50; covered by existing pattern in `scripts/critic-genealogy.ts:503-556` and future #52. | -| Error | Marshaled memory is contradictory, malformed, or missing key sections | Planner instructions must still emit valid `plan.md` JSON and state uncertainty in `rationale`. | -| Success | Planner has memory tail, verdicts, and monitor anomaly report | Planner emits a single `plan.md` JSON object with an allowed `next_action` and concrete `direction_hint`. | - ---- - -## Technical Context - -### Patterns to Follow - -- **Managed Agent spec pattern**: `agents/webster-redesigner.json` — Opus 4.7 agent with `name`, `description`, `model`, long `system`, `tools`, `mcp_servers`, and `metadata`. -- **Monitor context pattern**: `agents/webster-monitor.json` — reads analytics inputs, handles missing prior week, and writes structured output without proposing fixes. 
-- **Registration + session pattern**: `scripts/critic-genealogy.ts:440-556` — `findAgentByName`, `registerAgent`, `createSession`, `sendUserMessage`, and `pollUntilIdle` use `/v1/agents`, `/v1/sessions`, `/events`, and polling with `managed-agents-2026-04-01` beta headers. -- **Schema validation pattern**: `scripts/schemas/agent.schema.json` — requires `name`, `description`, `model`, `system`, and `tools`; rejects `system_prompt`, `callable_agents`, and unknown models. -- **Agent validation tests**: `scripts/__tests__/validate-agents.test.ts` — compiles the schema with AJV 2020 and validates every `agents/*.json` file. -- **Registration gotcha tests**: `scripts/__tests__/critic-genealogy.test.ts` — verifies generated specs preserve tools/MCP servers and remain valid against `agent.schema.json`. - -### Types & Interfaces - -```typescript -// Existing schema-level contract from scripts/schemas/agent.schema.json -type PlannerAgentSpec = { - name: string; - description: string; - model: "claude-opus-4-7" | "claude-opus-4-7-20260101"; - system: string; - tools: Array<{ type: "agent_toolset_20260401" } | { type: "mcp_toolset"; mcp_server_name: string }>; - mcp_servers?: Array<{ type: "url"; name: string; url: string }>; - metadata?: { role?: "orchestrator"; scope?: string }; -}; - -type PlannerPlan = { - classification: string; - next_action: "promote_and_experiment" | "hold_baseline" | "revert_and_retry" | "explore_broadly"; - direction_hint: string; - new_critic_request?: { - scope: string; - rationale: string; - evidence_refs: string[]; - }; - rationale: string; -}; -``` - -### Architecture Notes - -- The planner is an Opus 4.7 Managed Agent per Q1 ADR-0001 and `context/FEATURES.md` feature #50. -- The agent must not read repository files itself for memory; #50's spec should state that the orchestrator supplies marshaled `memory.jsonl` tail, last two weeks of verdicts, and monitor anomaly report in `user.message`. 
-- The planner output contract is `plan.md` whose body contains one JSON object; tests can assert the system prompt includes the required schema fields and enum values. -- The spec should likely reuse the GitHub MCP toolset pattern from `webster-redesigner`/`webster-monitor` only if the planner is instructed to commit `plan.md` itself. Feature #52 says the orchestrator extracts output and writes `history/<week>/plan.md`, so the planner spec can be agent-toolset-only unless existing Managed Agent registration expectations require MCP parity. -- Metadata should use `role: "orchestrator"` and `scope: "planning"` because `scripts/schemas/agent.schema.json` already allows `orchestrator` but not `planner`. - ---- - -## Implementation Summary - -### Story Overview - -| ID | Title | Priority | Dependencies | -|----|-------|----------|--------------| -| US-001 | Add schema-valid planner Managed Agent spec | 1 | — | -| US-002 | Add planner output contract tests | 2 | US-001 | -| US-003 | Add registration-flow guard tests and validate | 3 | US-001, US-002 | - -### Dependency Graph - -```text -US-001 (agent spec) - ↓ -US-002 (plan.md output contract tests) - ↓ -US-003 (registration-flow guard tests + validation) -``` - ---- - -## Validation Requirements - -Every story must pass: - -- [ ] Type-check: `bun run type-check` -- [ ] Lint: `bun run lint --max-warnings 0` -- [ ] Tests: `bun run test` -- [ ] Format: `bun run format:check` -- [ ] Agent schema validation: `bun run validate:agents` -- [ ] Full validation before completion: `bun run validate` - ---- - -Generated: 2026-04-24T00:00:00.000Z diff --git a/.forge/ralph/seed-demo-arc-w3w4-v5/prd.json b/.forge/ralph/seed-demo-arc-w3w4-v5/prd.json deleted file mode 100644 index 6499e5f..0000000 --- a/.forge/ralph/seed-demo-arc-w3w4-v5/prd.json +++ /dev/null @@ -1,49 +0,0 @@ -{ - "project": "Webster", - "branchName": "ralph/seed-demo-arc-w3w4-v5", - "prdFile": "prd.md", - "description": "Complete feature #57 by extending the existing demo 
arc seeder from W1/W2 through W3/W4 with 6-of-7 lane coverage and one W4 genealogy spawn.", - "userStories": [ - { - "id": "US-003", - "title": "Add W3 gate-fail and auto-rollback seeding", - "description": "As a Webster demo operator, I want W3 demo-arc artifacts for archive-gate-fail, auto-rollback, and hold outcomes so that the demo can show failure learning without touching live history.", - "acceptanceCriteria": [ - "`bun scripts/seed-demo-arc.ts` writes `history/demo-arc/demo-W3/proposal.md`, `decision.json`, and `verdict.json`.", - "W3 verdict includes `exp-05-mid-section-image-swap` with outcome `archive-gate-fail` and a failing `bounce_rate` gate.", - "W3 verdict includes `exp-06-cta-color-shift` with outcome `auto-rollback`, classification `hurt`, and `reward_delta_pct` of `-11`.", - "W3 verdict includes `exp-07-subhead-rewrite` with outcome `hold` and classification `neutral`.", - "`history/demo-arc/baselines.jsonl` records W3 lane statuses as `archived-gate-fail`, `rolled-back`, and no promoted baseline for the held experiment.", - "`history/demo-arc/memory.jsonl` contains W3 rows whose final events reflect archive/rollback/hold behavior rather than labeling every W3 experiment as a promotion.", - "Running the seeder twice produces deterministic W3 output under `history/demo-arc/` and does not write outside that directory.", - "`bun run validate` passes." - ], - "technicalNotes": "Build on `scripts/seed-demo-arc.ts` only. Reuse existing W3 entries in `EXPERIMENT_SPECS` at `scripts/seed-demo-arc.ts:240-291`; mirror `writeW1`/`writeW2` at `scripts/seed-demo-arc.ts:459-476` with a `writeW3`. Update shared baseline/memory helpers at `scripts/seed-demo-arc.ts:437-457` if needed so lane-specific statuses/events are represented correctly. Preserve `initDemoArcDir()` isolation under `history/demo-arc/` from `scripts/seed-demo-arc.ts:340-351`. 
Follow the locked W3 narrative in `context/DOMAIN-MODEL.md:415-420` and feature #57 scope in `context/FEATURES.md:172`.", - "dependsOn": [], - "priority": 1, - "passes": true, - "notes": "Implemented in iteration 1. Files: scripts/seed-demo-arc.ts, history/demo-arc/demo-W3/proposal.md, history/demo-arc/demo-W3/decision.json, history/demo-arc/demo-W3/verdict.json, history/demo-arc/baselines.jsonl, history/demo-arc/memory.jsonl." - }, - { - "id": "US-004", - "title": "Add W4 conservative wins and genealogy spawn", - "description": "As a Webster demo operator, I want W4 demo-arc artifacts plus one spawned critic artifact set so that the demo closes the loop from W3 failure to critic genealogy and safe recovery wins.", - "acceptanceCriteria": [ - "`bun scripts/seed-demo-arc.ts` writes `history/demo-arc/demo-W4/proposal.md`, `decision.json`, and `verdict.json`.", - "W4 verdict includes `exp-08-hero-safety-copy` and `exp-09-cta-size-adjust` as passing conservative experiments.", - "Outcome coverage across W1-W4 includes exactly these six lanes at minimum: `promote-fast-track`, `promote-fallback`, `promote-gate-win`, `archive-gate-fail`, `auto-rollback`, and `hold`.", - "The seeder writes one W4 genealogy-spawned critic spec for a bounce-risk concern under `history/demo-arc/demo-W4/genealogy/`.", - "The genealogy artifacts include a `NewCriticSpec`-shaped JSON payload and an `AgentJSON`-shaped critic registration payload using the existing exported interfaces.", - "W4 memory rows include a `gap-detected` event or equivalent genealogy trigger referencing the W3 bounce/gate-fail pattern and the spawned critic.", - "The script completion message reflects seeding through demo-W4 instead of demo-W2.", - "Running the seeder twice produces deterministic W4/genealogy output under `history/demo-arc/` and does not write outside that directory.", - "`bun run validate` passes." - ], - "technicalNotes": "Depends on US-003's lane-correct baseline and memory helper behavior. 
Reuse existing W4 entries in `EXPERIMENT_SPECS` at `scripts/seed-demo-arc.ts:292-338`; mirror the existing week writer pattern at `scripts/seed-demo-arc.ts:459-476` with `writeW4`. Use the existing `AgentJSON` and `NewCriticSpec` interfaces from `scripts/seed-demo-arc.ts:68-87` for deterministic genealogy JSON files. `initDemoArcDir()` already creates `history/demo-arc/demo-W4/genealogy` at `scripts/seed-demo-arc.ts:348`; write artifacts there. Follow the W4 table row and Git-state expectations in `context/DOMAIN-MODEL.md:421-429`. Do not call real Managed Agent APIs or alter `agents/` live specs.", - "dependsOn": ["US-003"], - "priority": 2, - "passes": true, - "notes": "Implemented in iteration 2. Files: scripts/seed-demo-arc.ts, history/demo-arc/demo-W4/proposal.md, history/demo-arc/demo-W4/decision.json, history/demo-arc/demo-W4/verdict.json, history/demo-arc/demo-W4/genealogy/new-critic-spec.json, history/demo-arc/demo-W4/genealogy/agent-registration.json, history/demo-arc/baselines.jsonl, history/demo-arc/memory.jsonl, context/FEATURES.md." - } - ] -} diff --git a/.forge/ralph/seed-demo-arc-w3w4-v5/prd.md b/.forge/ralph/seed-demo-arc-w3w4-v5/prd.md deleted file mode 100644 index 23f24cd..0000000 --- a/.forge/ralph/seed-demo-arc-w3w4-v5/prd.md +++ /dev/null @@ -1,153 +0,0 @@ -# Seed Demo Arc W3/W4 — Product Requirements - -## Overview - -**Problem**: Feature #57 is only half shipped. `scripts/seed-demo-arc.ts` already seeds W1/W2, but the demo arc still cannot show the dramatic W3 failure/rollback beat or the W4 critic-genealogy response promised in the Webster narrative. -**Solution**: Extend the existing seeder with the already-modeled W3 and W4 experiment specs, artifact writers, baseline/memory rows, and W4 genealogy artifacts. Do not rework US-001 or US-002. 
-**Branch**: `ralph/seed-demo-arc-w3w4-v5` - ---- - -## Goals & Success - -### Primary Goal - -Complete feature #57 by adding only US-003 and US-004 so `bun scripts/seed-demo-arc.ts` creates a complete, idempotent four-week demo arc under `history/demo-arc/`. - -### Success Metrics - -| Metric | Target | How Measured | -| ------ | ------ | ------------ | -| Week coverage | W1, W2, W3, and W4 artifacts exist | Run seeder and inspect `history/demo-arc/demo-W*/` | -| Outcome coverage | 6 of 7 Q4 lanes represented | Inspect `verdict.json` outcomes across all weeks | -| Genealogy proof | One W4 spawned critic artifact set exists | Inspect `history/demo-arc/demo-W4/genealogy/` | -| Runtime safety | No live history mutation | Seeder writes only beneath `history/demo-arc/` | -| Quality gate | Validation green | `bun run validate` | - -### Non-Goals (Out of Scope) - -- Re-implementing W1/W2 scaffold or artifact writers — already landed in `fb3256e`. -- Creating real Managed Agents through the Anthropic API — this is a deterministic mock seeder. -- Touching live weekly history outside `history/demo-arc/` — demo data must remain isolated. -- Covering the 7th outcome lane — the locked hero claim is deliberately 6/7. - ---- - -## User & Context - -### Target User - -- **Who**: Webster implementation operator preparing the hackathon demo. -- **Role**: Maintains deterministic run artifacts that let the council/planner story be replayed. -- **Current Pain**: The seeded output stops at W2, so the best narrative beats are absent. - -### User Journey - -1. **Trigger**: Operator needs a four-week mock arc for the submission demo. -2. **Action**: Operator runs `bun scripts/seed-demo-arc.ts`. -3. **Outcome**: `history/demo-arc/` contains W1-W4 proposals, decisions, verdicts, memory, baselines, and W4 genealogy artifacts. - ---- - -## UX Requirements - -### Interaction Model - -CLI-only deterministic seed script. 
The user runs `bun scripts/seed-demo-arc.ts`; the script recreates `history/demo-arc/` from scratch and prints a completion message. - -### States to Handle - -| State | Description | Behavior | -| ----- | ----------- | -------- | -| Empty | `history/demo-arc/` does not exist | Create directory tree and all artifacts | -| Loading | Script is running | Synchronous file writes; no progress UI required | -| Error | Filesystem or type errors occur | Let Bun/Node error surface; no silent fallback | -| Success | Seeder completes | W1-W4 artifacts are present and deterministic | - ---- - -## Technical Context - -### Patterns to Follow - -- **Existing seeder scaffold**: `scripts/seed-demo-arc.ts:12-129` — constants, demo week identifiers, and TypeScript interfaces already define the artifact model. -- **Existing W3/W4 data**: `scripts/seed-demo-arc.ts:240-338` — W3 and W4 `EXPERIMENT_SPECS` already encode experiment IDs, outcomes, gates, and insights. -- **Artifact writer pattern**: `scripts/seed-demo-arc.ts:365-435` — proposal, decision, verdict, baseline, and memory writes are pure helper functions. -- **Existing W1/W2 orchestration**: `scripts/seed-demo-arc.ts:459-479` — `writeW1`, `writeW2`, and `main` show the intended week writer shape. -- **Locked domain narrative**: `context/DOMAIN-MODEL.md:411-431` — Q9 table defines W3/W4 experiments, outcomes, and genealogy demo beat. -- **Feature tracking**: `context/FEATURES.md:172` — #57 status and remaining scope are canonical. -- **Validation rules**: `CLAUDE.md:18-31` and `package.json:scripts.validate` — type-check, lint, format, agent/findings validation, markdownlint, and tests are mandatory. 
- -### Types & Interfaces - -```typescript -type OutcomeLane = - | "promote-fast-track" - | "promote-fallback" - | "promote-gate-win" - | "archive-gate-fail" - | "auto-rollback" - | "hold"; - -interface ExperimentSpec extends ExperimentVerdict { - week: DemoWeek; - target_files: string[]; - proposed_change: string; - rationale: string; - baseline_sha: string; - verdict_ready_insight: string; - promote_insight: string; -} - -interface NewCriticSpec { - name: string; - scope: string; - description: string; - rationale: string; - focus_owned: string[]; - focus_not_owned: string[]; - severity_rubric: string; -} -``` - -### Architecture Notes - -- `initDemoArcDir()` currently creates all week directories and `demo-W4/genealogy`, so US-003/US-004 should add writers rather than new directory bootstrapping. -- `buildBaselineRows()` currently marks every row as `promoted`; US-003 must preserve `archived-gate-fail` and `rolled-back` statuses for W3 lanes. -- `buildWeekMemoryRows()` currently emits `promote` for every final event; US-003 must emit event names matching each outcome where relevant, especially rollback and skip/hold semantics. -- W4 genealogy should use the existing `AgentJSON` and `NewCriticSpec` shapes and write deterministic local JSON/Markdown artifacts under `history/demo-arc/demo-W4/genealogy/`. 
- ---- - -## Implementation Summary - -### Story Overview - -| ID | Title | Priority | Dependencies | -| -- | ----- | -------- | ------------ | -| US-003 | Add W3 gate-fail and auto-rollback seeding | 1 | -- | -| US-004 | Add W4 conservative wins and genealogy spawn | 2 | US-003 | - -### Dependency Graph - -```text -US-003 (W3 artifact writers + lane-correct baseline/memory rows) - ↓ -US-004 (W4 artifact writers + genealogy spawn artifacts) -``` - ---- - -## Validation Requirements - -Every story must pass: - -- [ ] Type-check: `bun run type-check` -- [ ] Lint: `bun run lint --max-warnings 0` -- [ ] Tests: `bun run test` -- [ ] Format: `bun run format:check` -- [ ] Full validation: `bun run validate` - ---- - -Generated: 2026-04-24T07:47:55Z diff --git a/.forge/ralph/seed-demo-arc-w3w4-v5/progress.txt b/.forge/ralph/seed-demo-arc-w3w4-v5/progress.txt deleted file mode 100644 index 2318beb..0000000 --- a/.forge/ralph/seed-demo-arc-w3w4-v5/progress.txt +++ /dev/null @@ -1,88 +0,0 @@ -## Codebase Patterns - -### Deterministic local genealogy artifacts -- **Where**: `scripts/seed-demo-arc.ts` -- **Pattern**: Model demo-only critic genealogy with typed constants satisfying `NewCriticSpec` and `AgentJSON`, then write those JSON payloads under the week-local `history/demo-arc/demo-W4/genealogy/` directory. Do not mutate live `agents/` specs or call Managed Agent APIs from the seeder. -- **Example**: `BOUNCE_GUARD_CRITIC_SPEC satisfies NewCriticSpec` and `BOUNCE_GUARD_AGENT_JSON satisfies AgentJSON`. - -### Outcome lane mapping for demo baselines and memory -- **Where**: `scripts/seed-demo-arc.ts` -- **Pattern**: Keep experiment specs as the single source of truth, then derive baseline status and final memory event from `experiment.outcome`. -- **Example**: `archive-gate-fail -> archived-gate-fail + regression`, `auto-rollback -> rolled-back + rollback`, `hold -> no baseline row + skip`. 
- ---- - -## 2026-04-24T07:55:39Z — US-003: Add W3 gate-fail and auto-rollback seeding - -**Status**: PASSED -**Files changed**: -- `scripts/seed-demo-arc.ts` — added W3 writer and lane-specific baseline/memory helpers. -- `history/demo-arc/demo-W3/proposal.md` — seeded W3 proposal artifacts. -- `history/demo-arc/demo-W3/decision.json` — seeded W3 planner decision artifacts. -- `history/demo-arc/demo-W3/verdict.json` — seeded W3 verdict artifacts. -- `history/demo-arc/baselines.jsonl` — added W3 archive/rollback rows with no held-experiment promotion. -- `history/demo-arc/memory.jsonl` — added W3 verdict-ready plus regression/rollback/skip final events. -- `.forge/ralph/seed-demo-arc-w3w4-v5/prd.md` — removed emphasis from generated footer so repository markdown validation passes. - -**Acceptance criteria verified**: -- [x] `bun scripts/seed-demo-arc.ts` writes `history/demo-arc/demo-W3/proposal.md`, `decision.json`, and `verdict.json`. -- [x] W3 verdict includes `exp-05-mid-section-image-swap` with outcome `archive-gate-fail` and a failing `bounce_rate` gate. -- [x] W3 verdict includes `exp-06-cta-color-shift` with outcome `auto-rollback`, classification `hurt`, and `reward_delta_pct` of `-11`. -- [x] W3 verdict includes `exp-07-subhead-rewrite` with outcome `hold` and classification `neutral`. -- [x] `history/demo-arc/baselines.jsonl` records W3 lane statuses as `archived-gate-fail`, `rolled-back`, and no promoted baseline for the held experiment. -- [x] `history/demo-arc/memory.jsonl` contains W3 rows whose final events reflect archive/rollback/hold behavior rather than labeling every W3 experiment as a promotion. -- [x] Running the seeder twice produces deterministic W3 output under `history/demo-arc/` and does not write outside that directory. -- [x] `bun run validate` passes. - -**Learnings**: -- `progress.txt` was absent at iteration start, so this iteration created it with a codebase pattern section. 
-- The existing W3 specs already contained the required verdict details; implementation only needed orchestration plus derived baseline/memory semantics. -- `bun run validate` initially failed on the generated PRD footer being emphasis-only markdown; removing the emphasis made markdownlint pass. - -**Verification**: -- `bun scripts/seed-demo-arc.ts && cp -R history/demo-arc /tmp/demo-arc-first && bun scripts/seed-demo-arc.ts && diff -qr /tmp/demo-arc-first history/demo-arc` -- `jq '.experiments[] | {exp_id,outcome,classification,reward_delta_pct,gates}' history/demo-arc/demo-W3/verdict.json` -- `grep 'exp-0[567]' history/demo-arc/baselines.jsonl` -- `grep 'demo-W3' history/demo-arc/memory.jsonl` -- `bun run type-check && bun run lint --max-warnings 0 && bun run test && bun run format:check && bun run validate` - ---- - -## 2026-04-24T07:59:41Z — US-004: Add W4 conservative wins and genealogy spawn - -**Status**: PASSED -**Files changed**: -- `scripts/seed-demo-arc.ts` — added W4 writer, deterministic bounce-guard genealogy payloads, W4 gap-detected memory row, and demo-W4 completion output. -- `history/demo-arc/demo-W4/proposal.md` — seeded W4 proposal artifacts. -- `history/demo-arc/demo-W4/decision.json` — seeded W4 planner decision artifacts. -- `history/demo-arc/demo-W4/verdict.json` — seeded W4 verdict artifacts for exp-08 and exp-09. -- `history/demo-arc/demo-W4/genealogy/new-critic-spec.json` — seeded `NewCriticSpec`-shaped bounce-guard critic payload. -- `history/demo-arc/demo-W4/genealogy/agent-registration.json` — seeded `AgentJSON`-shaped critic registration payload. -- `history/demo-arc/baselines.jsonl` — added W4 promoted baseline rows. -- `history/demo-arc/memory.jsonl` — added W4 gap-detected genealogy trigger and W4 verdict/promote rows. -- `context/FEATURES.md` — marked feature #57 done. - -**Acceptance criteria verified**: -- [x] `bun scripts/seed-demo-arc.ts` writes `history/demo-arc/demo-W4/proposal.md`, `decision.json`, and `verdict.json`. 
-- [x] W4 verdict includes `exp-08-hero-safety-copy` and `exp-09-cta-size-adjust` as passing conservative experiments. -- [x] Outcome coverage across W1-W4 includes exactly these six lanes at minimum: `promote-fast-track`, `promote-fallback`, `promote-gate-win`, `archive-gate-fail`, `auto-rollback`, and `hold`. -- [x] The seeder writes one W4 genealogy-spawned critic spec for a bounce-risk concern under `history/demo-arc/demo-W4/genealogy/`. -- [x] The genealogy artifacts include a `NewCriticSpec`-shaped JSON payload and an `AgentJSON`-shaped critic registration payload using the existing exported interfaces. -- [x] W4 memory rows include a `gap-detected` event or equivalent genealogy trigger referencing the W3 bounce/gate-fail pattern and the spawned critic. -- [x] The script completion message reflects seeding through demo-W4 instead of demo-W2. -- [x] Running the seeder twice produces deterministic W4/genealogy output under `history/demo-arc/` and does not write outside that directory. -- [x] `bun run validate` passes. - -**Learnings**: -- W4 experiment specs already contained the conservative passing outcomes, so implementation needed orchestration and genealogy artifact emission rather than new experiment modeling. -- The seeder's exported interfaces can enforce local demo payload shape with `satisfies` while still avoiding live Managed Agent registration. -- Prettier reformats the long `context/FEATURES.md` table row when feature #57 status changes. 
- -**Verification**: -- `bun scripts/seed-demo-arc.ts && cp -R history/demo-arc /tmp/demo-arc-first && bun scripts/seed-demo-arc.ts && diff -qr /tmp/demo-arc-first history/demo-arc` -- `jq -r '.experiments[].outcome' history/demo-arc/demo-W*/verdict.json | sort -u` -- `jq '.experiments[] | {exp_id,outcome,classification,reward_delta_pct,gates}' history/demo-arc/demo-W4/verdict.json` -- `grep 'gap-detected' history/demo-arc/memory.jsonl` -- `bun run type-check && bun run lint --max-warnings 0 && bun run test && bun run format:check && bun run validate` - ---- diff --git a/.forge/ralph/webster-feature-number-58-pair-alpha-secondary-sub/prd.json b/.forge/ralph/webster-feature-number-58-pair-alpha-secondary-sub/prd.json deleted file mode 100644 index 3bdf936..0000000 --- a/.forge/ralph/webster-feature-number-58-pair-alpha-secondary-sub/prd.json +++ /dev/null @@ -1,91 +0,0 @@ -{ - "project": "webster", - "branchName": "ralph/webster-feature-number-58-pair-alpha-secondary-sub", - "prdFile": "prd.md", - "description": "Seed Pair Alpha secondary SaaS and local-service substrates with deterministic mock run artifacts.", - "userStories": [ - { - "id": "US-001", - "title": "Add deterministic secondary substrate model and HTML writers", - "description": "As a Webster implementation operator, I want deterministic SaaS and local-service HTML fixtures so that the submission can demonstrate council generalization beyond the primary substrate.", - "acceptanceCriteria": [ - "Create `scripts/seed-secondary-substrates.ts` with `#!/usr/bin/env bun` and pure TypeScript imports from Node/Bun standard libraries only.", - "Define typed constants for exactly two substrates: `saas-alpha` and `local-service-alpha`.", - "Write `site/secondary/saas-alpha/index.html` and `site/secondary/local-service-alpha/index.html` as complete single-file HTML landing pages.", - "HTML output is deterministic and contains no remote scripts, remote stylesheets, or network-fetching code.", - "Export constants or 
helper functions needed by tests without executing `main()` on import.", - "Type-check passes", - "Tests pass" - ], - "technicalNotes": "Mirror `scripts/seed-demo-arc.ts:8-17` for ROOT/path constants and fs/path imports. Follow `scripts/seed-demo-arc.ts:21-64` for literal-union types/interfaces. The new script owns `site/secondary/` only for site output and must not touch `site/before/` or `site/after/`.", - "dependsOn": [], - "priority": 1, - "passes": true, - "notes": "Implemented in iteration 1. Files: scripts/seed-secondary-substrates.ts, site/secondary/saas-alpha/index.html, site/secondary/local-service-alpha/index.html." - }, - { - "id": "US-002", - "title": "Write secondary mock run artifacts", - "description": "As a Webster implementation operator, I want onboard and two weekly mock runs per secondary substrate so that the demo can show a complete two-cycle council arc for each new vertical.", - "acceptanceCriteria": [ - "Create `history/secondary-arc/saas-alpha/{onboard,week-1,week-2}/` and `history/secondary-arc/local-service-alpha/{onboard,week-1,week-2}/`.", - "Every run folder contains exactly the required artifact names: `proposal.md`, `decision.json`, `verdict.json`, and `apply-log.json`.", - "Each `proposal.md` includes experiment blocks with exp IDs, kind, target files, proposed change, and rationale.", - "Each `decision.json` includes substrate, run, selected issues, reasoning, and monitor signal fields that mirror the `history/demo-arc` decision convention.", - "Each `verdict.json` includes substrate, run, experiments, reward delta, p-value, classification, and outcome fields that mirror the `history/demo-arc` verdict convention.", - "Each `apply-log.json` records applied status, touched files, skipped rows, and notes for that run.", - "Type-check passes", - "Tests pass" - ], - "technicalNotes": "Follow writer shape in `scripts/seed-demo-arc.ts:393-419`: build typed objects, write `JSON.stringify(value, null, 2)` plus trailing newline, and 
generate Markdown proposal bodies from typed experiment specs. Use the outcome lane names from `scripts/seed-demo-arc.ts:25-31`.", - "dependsOn": [ - "US-001" - ], - "priority": 2, - "passes": true, - "notes": "Implemented in iteration 2. Files: scripts/seed-secondary-substrates.ts, history/secondary-arc/*/{onboard,week-1,week-2}/{proposal.md,decision.json,verdict.json,apply-log.json}." - }, - { - "id": "US-003", - "title": "Wire CLI package script and scope guards", - "description": "As a Webster implementation operator, I want a single package command with strict output boundaries so that seeding is repeatable and cannot corrupt primary demo artifacts.", - "acceptanceCriteria": [ - "Add `seed:secondary` to `package.json` scripts with value `bun scripts/seed-secondary-substrates.ts`.", - "The script removes/recreates or overwrites only `site/secondary/` and `history/secondary-arc/`.", - "The script never reads from or writes to `history/demo-arc/`, `site/before/`, or `site/after/`.", - "Running `bun run seed:secondary` exits 0 and prints a concise deterministic success message.", - "`main()` is guarded with `if (import.meta.main)` so tests can import the module safely.", - "Type-check passes", - "Tests pass" - ], - "technicalNotes": "Mirror CLI/export pattern in `scripts/seed-demo-arc.ts:485-510`. Add the package script near existing scripts in `package.json:12-23`. Protected paths are explicit feature requirements from `context/FEATURES.md:173` and the PRD input.", - "dependsOn": [ - "US-002" - ], - "priority": 3, - "passes": true, - "notes": "Implemented in iteration 3. Files: package.json; verified scripts/seed-secondary-substrates.ts guarded main and owned output boundaries." 
- }, - { - "id": "US-004", - "title": "Add Bun tests for layout, idempotency, and protected paths", - "description": "As a Webster maintainer, I want automated tests around the seeder so that future changes cannot break file layout, determinism, or safety constraints.", - "acceptanceCriteria": [ - "Create `scripts/__tests__/seed-secondary-substrates.test.ts` using Bun's `describe`, `test`, and `expect` APIs.", - "Test verifies both secondary HTML files and all six run folders exist after seeding.", - "Test verifies every run folder contains `proposal.md`, `decision.json`, `verdict.json`, and `apply-log.json`.", - "Test captures contents of all seeded files, runs the seeder a second time, and asserts byte-identical contents for idempotency.", - "Test fingerprints `history/demo-arc/`, `site/before/`, and `site/after/` before and after seeding and asserts they are unchanged.", - "`bun test` passes.", - "`bun run validate` passes." - ], - "technicalNotes": "Follow filesystem testing style in `scripts/__tests__/memory.test.ts:1-85`: import Bun test helpers, use fs/path utilities, and cleanly assert deterministic data. Existing tests import source modules directly, as shown by `scripts/__tests__/critic-genealogy.test.ts:1-18`.", - "dependsOn": [ - "US-003" - ], - "priority": 4, - "passes": true, - "notes": "Implemented in iteration 4. Files: scripts/__tests__/seed-secondary-substrates.test.ts." - } - ] -} diff --git a/.forge/ralph/webster-feature-number-58-pair-alpha-secondary-sub/prd.md b/.forge/ralph/webster-feature-number-58-pair-alpha-secondary-sub/prd.md deleted file mode 100644 index 3af1442..0000000 --- a/.forge/ralph/webster-feature-number-58-pair-alpha-secondary-sub/prd.md +++ /dev/null @@ -1,186 +0,0 @@ -# Pair Alpha Secondary Substrates — Product Requirements - -## Overview - -**Problem**: Webster's current demo arc proves the council loop on one primary landing page only. 
Without secondary substrates, judges and operators cannot see whether the planner, critic council, verdict model, and mock history conventions generalize beyond the healthcare landing page. -**Solution**: Build `scripts/seed-secondary-substrates.ts`, a deterministic Bun/TypeScript seeder that creates two synthetic single-file secondary landing pages plus mock onboard/week-1/week-2 run artifacts for each substrate. -**Branch**: `ralph/webster-feature-number-58-pair-alpha-secondary-sub` - ---- - -## Goals & Success - -### Primary Goal - -Create a demo-safe Pair Alpha substrate package that proves Webster can operate on a B2B SaaS landing page and a B2C local-service landing page without touching the primary demo arc or before/after site fork. - -### Success Metrics - -| Metric | Target | How Measured | -|--------|--------|--------------| -| Secondary site files created | `site/secondary/saas-alpha/index.html` and `site/secondary/local-service-alpha/index.html` exist | `bun run seed:secondary` then file existence assertions | -| Mock run layout complete | Each substrate has `onboard`, `week-1`, and `week-2` folders with `proposal.md`, `decision.json`, `verdict.json`, `apply-log.json` | Unit test enumerates expected paths under `history/secondary-arc/<substrate>/` | -| Idempotent deterministic output | Re-running the seeder produces byte-identical files | Test snapshots file contents before and after a second run | -| Scope safety | Seeder never mutates `history/demo-arc/`, `site/before/`, or `site/after/` | Test fingerprints protected directories before/after seeding | -| Validation green | `bun run validate` and `bun test` pass | Local command output | - -### Non-Goals (Out of Scope) - -- Live analytics ingestion — this is a synthetic seed artifact, not runtime telemetry. -- E-commerce substrate — explicitly held out by operator decision; Pair Alpha is SaaS + local service only. -- Modifying `history/demo-arc/` — the primary demo arc is canonical and must remain untouched. 
-- Modifying `site/before/` or `site/after/` — those directories are the primary before/after fork and are not part of the secondary-substrate proof. -- Network calls or external API integration — deterministic mock data only. - ---- - -## User & Context - -### Target User - -- **Who**: Webster implementation operator preparing the hackathon submission. -- **Role**: Needs a fast, repeatable local command that seeds extra demo evidence. -- **Current Pain**: Current mock history is convincing for one primary substrate, but does not demonstrate cross-vertical generalization. - -### User Journey - -1. **Trigger**: Operator needs to show that Webster can run its council loop beyond the primary healthcare landing page. -2. **Action**: Operator runs `bun run seed:secondary`. -3. **Outcome**: Two synthetic landing pages and six mock run folders appear in stable locations, ready for demo narration and automated checks. - ---- - -## UX Requirements - -### Interaction Model - -CLI-only seed workflow: - -```bash -bun run seed:secondary -``` - -The command should be silent except for a short success message. It should be safe to run repeatedly in local development and CI. The script must use pure TypeScript/Bun stdlib file operations and no network calls. 
- -### States to Handle - -| State | Description | Behavior | -|-------|-------------|----------| -| Empty | `site/secondary/` or `history/secondary-arc/` does not exist | Create directories and all expected files | -| Loading | Seeder is writing deterministic files | Synchronous file writes are acceptable; no progress UI required | -| Error | Filesystem write fails | Let the thrown error fail the command; do not silently swallow | -| Success | All secondary files are written | Print deterministic success line and exit 0 | - ---- - -## Technical Context - -### Patterns to Follow - -- **Similar implementation**: `scripts/seed-demo-arc.ts:8-17` — use Bun TypeScript, `node:fs`, `node:path`, `ROOT`, and constants for output directories. -- **Type pattern**: `scripts/seed-demo-arc.ts:21-64` — define string-literal unions and interfaces for experiment kinds, verdict outcomes, decisions, verdicts, and run rows. -- **Seed lifecycle pattern**: `scripts/seed-demo-arc.ts:345-354` — initialize owned output directories deterministically. For this feature, remove/recreate only `history/secondary-arc/` and `site/secondary/`, never protected primary paths. -- **Artifact writer pattern**: `scripts/seed-demo-arc.ts:393-419` — emit pretty-printed JSON files with trailing newline and Markdown proposal files. -- **CLI entry/export pattern**: `scripts/seed-demo-arc.ts:485-510` — `main()` gated by `if (import.meta.main)` and export constants/helpers for tests. -- **Package script pattern**: `package.json:12-23` — add a new script beside existing validation/test scripts. -- **Test pattern**: `scripts/__tests__/memory.test.ts:1-85` — Bun test with `describe`, `test`, `expect`, filesystem setup/cleanup, and deterministic assertions. 
- -### Types & Interfaces - -```typescript -type SecondarySubstrate = "saas-alpha" | "local-service-alpha"; -type SecondaryRun = "onboard" | "week-1" | "week-2"; -type ExperimentKind = "text" | "component" | "asset" | "css"; -type OutcomeLane = - | "promote-fast-track" - | "promote-fallback" - | "promote-gate-win" - | "archive-gate-fail" - | "auto-rollback" - | "hold"; - -interface SecondaryDecisionJSON { - substrate: SecondarySubstrate; - run: SecondaryRun; - selected_issues: Array<{ - exp_id: string; - kind: ExperimentKind; - target_files: string[]; - proposed_change: string; - expected_outcome_lane: OutcomeLane; - }>; - reasoning: string; - monitor_signal: string; -} - -interface SecondaryVerdictJSON { - substrate: SecondarySubstrate; - run: SecondaryRun; - experiments: Array<{ - exp_id: string; - kind: ExperimentKind; - reward_delta_pct: number; - p_value: number; - classification: "improved" | "hurt" | "neutral"; - outcome: OutcomeLane; - }>; -} - -interface SecondaryApplyLogJSON { - substrate: SecondarySubstrate; - run: SecondaryRun; - applied: boolean; - touched_files: string[]; - skipped: Array<{ exp_id: string; reason: string }>; - notes: string; -} -``` - -### Architecture Notes - -- `context/FEATURES.md:173` defines feature #58 as Layer 11 Pair Alpha: SaaS B2B + local service B2C synthetic HTMLs plus onboard/week-1/week-2 mock runs. -- The script owns only `site/secondary/` and `history/secondary-arc/`. -- Mock run artifact filenames must match the existing demo-run convention plus the new apply log: `proposal.md`, `decision.json`, `verdict.json`, `apply-log.json`. -- Artifact JSON shape should mirror `history/demo-arc` conventions: selected issues in `decision.json`, experiment verdict rows in `verdict.json`, Markdown experiment blocks in `proposal.md`. -- Tests should import exported constants/helpers from `scripts/seed-secondary-substrates.ts` rather than shelling out where possible, then separately verify package script presence if useful. 
- ---- - -## Implementation Summary - -### Story Overview - -| ID | Title | Priority | Dependencies | -|----|-------|----------|--------------| -| US-001 | Add deterministic secondary substrate model and HTML writers | 1 | — | -| US-002 | Write secondary mock run artifacts | 2 | US-001 | -| US-003 | Wire CLI/package script and scope guards | 3 | US-002 | -| US-004 | Add Bun tests for layout, idempotency, and protected paths | 4 | US-003 | - -### Dependency Graph - -```text -US-001 (substrate data + HTML writers) - ↓ -US-002 (history/secondary-arc artifact writers) - ↓ -US-003 (main + package script + protected path discipline) - ↓ -US-004 (tests) -``` - ---- - -## Validation Requirements - -Every story must pass: - -- [ ] Type-check: `bun run type-check` -- [ ] Lint: `bun run lint --max-warnings 0` -- [ ] Tests: `bun run test` -- [ ] Format: `bun run format:check` -- [ ] Full validation: `bun run validate` - ---- - -Generated: 2026-04-24T00:00:00.000Z diff --git a/.github/workflows/test.yml b/.github/workflows/test.yml index 4bcf553..a8b1e7e 100644 --- a/.github/workflows/test.yml +++ b/.github/workflows/test.yml @@ -42,6 +42,29 @@ jobs: - name: Markdown lint run: bun run validate:md + - name: Install ImageMagick (with magick v7 dispatcher wrapper) + run: | + sudo apt-get update + sudo apt-get install -y imagemagick + # Ubuntu ships ImageMagick 6 (separate binaries: convert, identify, etc.). + # build-demo-manifest.ts and its test use the v7 multi-call entrypoint + # (magick, magick identify, ...). Install a small dispatcher at + # /usr/local/bin/magick that forwards to the v6 binary by subcommand. 
+ sudo tee /usr/local/bin/magick > /dev/null <<'WRAPPER' + #!/bin/bash + case "$1" in + identify|convert|mogrify|composite|montage|compare|conjure|stream|display|animate|import) + cmd="$1"; shift; exec "$cmd" "$@" ;; + *) + exec convert "$@" ;; + esac + WRAPPER + sudo chmod +x /usr/local/bin/magick + magick -version | head -1 + + - name: Install Playwright Chromium (browser-audit) + run: bunx playwright install --with-deps chromium + - name: Run tests run: bun run test diff --git a/.gitignore b/.gitignore index 1e96696..e5a6dd4 100644 --- a/.gitignore +++ b/.gitignore @@ -24,11 +24,84 @@ tmp/ # Webster generated asset dedup cache .webster/generated-cache/ +# Council simulation working copy — committed deliverable lives in demo-output/ +local-runs/ + +# Audio working copies — only the leveled narration.mp3 is committed +audio/*.raw.mp3 + +# AI tool symlinks created by `skills add` — we only use Claude Code +# (.claude/skills) and the canonical store (.agents/skills). The rest are +# parallel-platform symlinks we don't ship to the repo. +.adal/ +.augment/ +.codebuddy/ +.commandcode/ +.continue/ +.crush/ +.factory/ +.goose/ +.iflow/ +.junie/ +.kilocode/ +.kiro/ +.kode/ +.mcpjam/ +.mux/ +.neovate/ +.openhands/ +.pi/ +.pochi/ +.qoder/ +.qwen/ +.roo/ +.trae/ +.vibe/ +.windsurf/ +.zencoder/ +skills-lock.json +skills/gsap +skills/hyperframes +skills/hyperframes-cli +skills/hyperframes-registry +skills/website-to-hyperframes + +# HyperFrames skill content (installed via npx skills add heygen-com/hyperframes) +# Treat like node_modules: re-installable via skills CLI, not committed. 
+.agents/ +.claude/skills/ + +# Rendered timelapse mp4 — hosted externally for the hackathon submission +demo-output/videos/ + # Pi subagent transient outputs (scout context.md, planner plan.md at repo root) /context.md /plan.md /research.md -# Local Claude Code council reruns (isolated, regenerated) -local-runs/ +# Internal tracking docs — preserved in ~/Vault/Projects/webster/internal-tracking/ +context/EXPANSION-TASKS.md +context/E2E-IMPLEMENTATION-TRACKER.md +context/SITE-FORK-CHECKLIST.md +context/ROADMAP.md +context/VIDEO-PLAN.md +context/VIDEO-PLAN-90s.md +context/v2-design.md + +# Intermediate session prompts — only first/second-wbs and sim-council are public-facing +prompts/third-wbs-session.md +prompts/fourth-wbs-session.md +prompts/sim-audit-fix-session.md +prompts/composition-session.md +prompts/e2e-demo-run-session.md +prompts/sim-runner.md + +# History operator notes (story belongs in README/AGENTS.md, not duplicated) +history/AGENTS.md +history/CLAUDE.md + +# Polish-session worktree prompts (local hand-off only) +ONBOARDING-V2-PROMPT.md +# Personal launchd plist (hardcoded user paths) — preserved in vault +deploy/webster-dispatcher.plist diff --git a/.markdownlint-cli2.jsonc b/.markdownlint-cli2.jsonc index 7acde1c..35fd8ac 100644 --- a/.markdownlint-cli2.jsonc +++ b/.markdownlint-cli2.jsonc @@ -22,6 +22,8 @@ "tmp", "local-runs", "history", + "demo-output", + "video/assets", "raw-videos", "transcripts", "research", diff --git a/.prettierignore b/.prettierignore index 1c68827..a0410e9 100644 --- a/.prettierignore +++ b/.prettierignore @@ -14,6 +14,7 @@ context/critics/*/findings.md context/monitor/alerts.md history site +demo-output # Symlinks (CLAUDE.md → AGENTS.md) CLAUDE.md diff --git a/AGENTS.md b/AGENTS.md index 44a7001..549ba88 100644 --- a/AGENTS.md +++ b/AGENTS.md @@ -13,8 +13,8 @@ This file is for implementation operators. 
See `skills/webster-lp-audit/SKILL.md Two active workstreams: -- **Production Webster** — Nicolette's weekly landing-page improvement council runs on `main` via `prompts/second-wbs-session.md`. This is live for her business; do not break it. -- **Hackathon expansion** — Dual-substrate demo (Richer Health LP + Northwest Home Renovations 3-page site) with a simulation runner producing timelapse assets. Deadline **2026-04-28**. Working branch: `dev/`. See `context/VISION.md` for canonical north-star. +- **Production Webster** — Nicolette's weekly landing-page improvement council runs on `main`. Operator surface: `/webster-weekly-council` (skill at `skills/webster-weekly-council/SKILL.md`) or the single-page runbook at `prompts/second-wbs-session.md`. Both produce identical artifacts; the prompt is the locked source-of-truth runbook. This is live for her business; do not break it. +- **Single-substrate Richer Health LP demo** with a simulation runner producing 11-week timelapse assets under `demo-output/landing-page/`. See `context/VISION.md` for canonical north-star. ## First actions every session @@ -22,9 +22,9 @@ Two active workstreams: 2. `context/ARCHITECTURE.md` — current system design 3. `context/FEATURES.md` — shipped state + stream allocation 4. `context/VISION.md` — canonical north-star for the active hackathon expansion. If about to code or make an architectural call, this doc tells you whether you're drifting. -5. `context/EXPANSION-TASKS.md` — topologically ordered tasks with acceptance criteria -6. `context/QUALITY-GATES.md` — validation rules (mirror Forge pattern) -7. `~/Vault/Projects/webster/webster-decision-log.md` — architectural decisions with rationale +5. `context/QUALITY-GATES.md` — validation rules (mirror Forge pattern) +6. `~/Vault/Projects/webster/webster-decision-log.md` — architectural decisions with rationale +7. 
`~/Vault/Projects/webster/internal-tracking/context/EXPANSION-TASKS.md` — local-only task tracker for the hackathon expansion (vault, not in repo) ## Communication with Richie @@ -37,9 +37,33 @@ Two active workstreams: ## Branch strategy - `main` — production Webster. Nicolette's live council runs here. Stable. -- `dev/` — hackathon expansion work merges here. Eventually rolled up into main as a single batch once the submission ships. -- Feature branches / worktrees → merge into `dev/`, not directly to `main`. -- Never force-push to `main` or `dev/`. +- `dev` — hackathon expansion trunk. All expansion work eventually merges here. Once the submission ships, `dev` rolls up to `main` as a single batch. +- **Feature / worktree branches → PR to `dev`, not `main`.** +- Never force-push to `main` or `dev`. + +## Worktree + PR flow + +Every task in `~/Vault/Projects/webster/internal-tracking/context/EXPANSION-TASKS.md` (and any other implementation work during the hackathon expansion) follows this pattern: + +1. **Branch off `dev`**, not `main`: + - Claude Code (manual): `git worktree add ../webster-T- dev -b feat/T-` + - Forge workers: spawn from the `dev` base; Forge auto-creates `forge/task-feat-` branches (existing pattern) +2. Work on the worktree branch. Commits conventional (`feat:` / `fix:` / `test:` / `docs:` / `refactor:` / `chore:`) +3. Push the branch and open a PR with **base = `dev`** +4. After review + green CI, merge the PR into `dev` (squash preferred for feature branches; merge commit acceptable for Forge multi-commit task branches) +5. Delete the feature branch after merge. 
Local worktree cleanup per the Forge lifecycle rules + +Feature branch naming: + +- Claude Code manual: `feat/T-` (example: `feat/T1-memory-provisioning`) +- Forge-generated: `forge/task-feat-` (unchanged from existing pattern) +- Fix branches (Pass-7 review fixes and bug fixes): `fix/T0-pass7-visual-veto` style + +Hackathon rollup procedure (after T10 completes): + +- Final `dev → main` merge is a single PR with the full expansion as a commit block +- Nicolette's production council on `main` is not affected until that PR lands +- Do NOT merge `dev → main` in pieces before T10 completes — the expansion lands atomically so production stays coherent ## Operating rules @@ -65,7 +89,7 @@ Two active workstreams: - Bypass validation (`--no-verify`, `--no-gpg-sign`, `--force`) - Fabricate analytics numbers or business stats - Silently catch errors to make things look green -- Touch the existing 9 production `webster-*` agents during hackathon expansion — they run Nicolette's real council. Sim agents are additive (`webster-lp-sim-*`, `webster-site-sim-*`). +- Touch the 9 specs in `agents/production/` during hackathon expansion — they run Nicolette's real council. Sim agents in `agents/simulation/` (`webster-lp-sim-*`) are additive. - Touch `prompts/second-wbs-session.md` — it's the production orchestrator. Sim orchestrator is a fork at `prompts/sim-council.md`. ## Quality gates @@ -79,7 +103,7 @@ bun run validate ## Task pickup protocol (hackathon expansion) -1. Check `context/EXPANSION-TASKS.md` — pick next unblocked task in topological order. Do NOT skip T0. +1. Check `~/Vault/Projects/webster/internal-tracking/context/EXPANSION-TASKS.md` (vault, local-only) — pick next unblocked task in topological order. Do NOT skip T0. 2. Re-read the task's acceptance criteria 3. Read every file the task touches before editing 4. 
Implement minimally — no scope expansion, no drive-by refactors @@ -102,12 +126,14 @@ Use `TaskCreate` / `TaskUpdate` for multi-step work within a single session. Tas ## Skill invocation (Claude Code) -Webster ships two runtime-critic skills: +Webster ships these skills: -- `skills/webster-lp-audit/SKILL.md` — shared council run flow (referenced by production critics) -- `skills/webster-onboarding/SKILL.md` — end-user onboarding flow (universal, demo placeholder) +- `skills/webster-weekly-council/SKILL.md` — operator surface for the weekly run. Library skill: SKILL.md index + on-demand phase references + helper scripts. Slash-command form: `/webster-weekly-council`. Equivalent single-page runbook at `prompts/second-wbs-session.md`. +- `skills/webster-onboarding/SKILL.md` — first-time setup for a new operator (brand context capture, key checklist, repo scaffold, agent + memory-store provisioning, first council) +- `skills/webster-lp-audit/SKILL.md` — shared council run discipline (referenced by production critics) +- `skills/webster-browser-audit/SKILL.md` — headless browser audit capability for visual review -If your work modifies either skill, test with a sample invocation before committing. +If your work modifies any skill, test with a sample invocation before committing. The weekly-council skill must stay artifact-equivalent with `prompts/second-wbs-session.md` — when in doubt, fix the skill, never the prompt. ## Parallel stream etiquette @@ -123,4 +149,4 @@ State the conflict. Don't paper over it. Consult `~/Vault/Projects/webster/webster-decision-log.md` — every locked decision with rationale. -If a path isn't clear and VISION.md / EXPANSION-TASKS.md don't answer, leave a `[STUCK]` or `[QUESTION]` prefix in your session output. Don't compose around it. +If a path isn't clear and VISION.md doesn't answer, leave a `[STUCK]` or `[QUESTION]` prefix in your session output. Don't compose around it. 
diff --git a/README.md b/README.md index 9734595..1d06579 100644 --- a/README.md +++ b/README.md @@ -1,12 +1,12 @@ # Webster -> A council of 7 Claude Managed Agents that autonomously audits a small-business landing page every week, synthesizes the findings, and opens a PR with the proposed redesign. +> A council of 9 Claude Managed Agents that autonomously audits a small-business landing page every week, synthesizes the findings, and opens a PR with the proposed redesign. **Built with Opus 4.7** — Anthropic × Cerebral Valley Hackathon submission (deadline 2026-04-26). ## The one-line pitch -Small businesses pay marketing agencies $2K–$20K/month for landing-page optimization that arrives in 4–6 week cycles. Webster runs the audit + proposal loop for ~$0.60/month in Opus 4.7 tokens and hands the operator a reviewable draft PR each week. The win is cycle time (minutes vs weeks) and the baseline cost of the analytical loop — a human still reviews the PR before it ships. +A council of 9 Claude Managed Agents audits a landing page once a week, synthesizes findings across SEO, brand-voice, compliance, conversion, copy, and rendered-layout lenses, and hands the operator a reviewable draft PR. The win is cycle time — the analytical loop runs in tens of minutes instead of multi-week agency rounds — and a runtime mechanism (Critic Genealogy) where Opus 4.7 detects an unowned audit gap and registers a brand-new specialist agent against the live API mid-run. A human still reviews the PR before it ships. 
## The hero moment — Critic Genealogy @@ -14,7 +14,7 @@ Small businesses pay marketing agencies $2K–$20K/month for landing-page optimi weekly trigger │ ▼ -Planner ──► SEO / brand / compliance / conversion / copy / monitor +Planner ──► SEO / brand / compliance / conversion / copy / monitor / visual-reviewer │ │ │ └── out-of-scope overlap found ▼ @@ -46,15 +46,15 @@ bun scripts/critic-genealogy.ts --fixtures scripts/__tests__/fixtures/genealogy Planner (Opus 4.7) memory + verdicts → plan.md │ - fans out 6 sessions in parallel ─┐ - │ - ┌───────┬──────────┬──────────┬────────┬──────┴─┐ - │ SEO │ brand │ FH-compl │ CRO │ copy │ monitor - │ Sonnet│ Sonnet │ Sonnet │ Sonnet │ Sonnet │ Haiku - │ 4.6 │ 4.6 │ 4.6 │ 4.6 │ 4.6 │ 4.5 - └───┬───┴────┬─────┴─────┬────┴────┬───┴────┬───┘ - │ │ │ │ │ - └────────┴─── each critic commits via GitHub MCP ──┐ + fans out 7 sessions in parallel ─────────────┐ + │ + ┌───────┬──────────┬──────────┬────────┬────────┬─────────┴─┬───────────────┐ + │ SEO │ brand │ FH-compl │ CRO │ copy │ monitor │ visual-review │ + │ Sonnet│ Sonnet │ Sonnet │ Sonnet │ Sonnet │ Haiku │ Opus 4.7 │ + │ 4.6 │ 4.6 │ 4.6 │ 4.6 │ 4.6 │ 4.5 │ (post) │ + └───┬───┴────┬─────┴─────┬────┴────┬───┴────┬───┴─────┬─────┴──────┬────────┘ + │ │ │ │ │ │ │ + └────────┴─── each critic commits via GitHub MCP ──┴────────────┘ to council/ branch │ ▼ Critic Genealogy (Opus 4.7, runtime) @@ -77,24 +77,43 @@ bun scripts/critic-genealogy.ts --fixtures scripts/__tests__/fixtures/genealogy ## For judges -**30-second pitch:** Webster is an autonomous landing-page improvement council. Seven Claude Managed Agents plan, audit, monitor, synthesize, and package one weekly redesign proposal; the standout demo is Critic Genealogy, where Opus 4.7 detects an unowned audit gap and registers a new specialist at runtime. +**30-second pitch:** Webster is an autonomous landing-page improvement council. 
Nine Claude Managed Agents plan, audit, monitor, synthesize, and package one weekly redesign proposal; the standout demo is Critic Genealogy, where Opus 4.7 detects an unowned audit gap and registers a new specialist at runtime. -**Live-run evidence:** the full operator path is [`prompts/second-wbs-session.md`](prompts/second-wbs-session.md), registration IDs live in `environments/webster-council-env.id` and `context/*/id.txt`, and run artifacts are written under `history//` when the weekly prompt is executed. +**Live-run evidence:** the operator surface is the [`/webster-weekly-council`](skills/webster-weekly-council/SKILL.md) skill (library: SKILL.md index + on-demand phase references + helper scripts); the full single-page runbook lives at [`prompts/second-wbs-session.md`](prompts/second-wbs-session.md). Registration IDs live in `environments/webster-council-env.id` and `context/*/id.txt`. Run artifacts are written under `history//` when the weekly run executes. + +**Demo arc artifacts:** an 11-week simulation council run, week-by-week, browsable as files. Start at [`demo-output/landing-page/INDEX.md`](demo-output/landing-page/INDEX.md) for the narrated walk-through. Each week directory under [`demo-output/landing-page/w00..w10/`](demo-output/landing-page/) contains desktop/mobile/tablet screenshots, heatmap JSON+SVG, synthetic analytics, and the visual reviewer's markdown verdict. Anthropic Managed Agents memory-store provisioning is captured at [`assets/memory-stores-screenshots/`](assets/memory-stores-screenshots/). The render pipeline that turns these per-week assets into a timelapse video is submission tooling and lives outside the public repo. **Hero code:** [`scripts/critic-genealogy.ts`](scripts/critic-genealogy.ts) is the runtime specialist-spawn path; [`scripts/__tests__/critic-genealogy.test.ts`](scripts/__tests__/critic-genealogy.test.ts) and [`scripts/__tests__/fixtures/genealogy`](scripts/__tests__/fixtures/genealogy) are the fixture proof. 
**Validate locally:** run `bun install` once, then `bun run validate` for type-check, zero-warning lint, format, agent schemas, findings format, markdown, and tests. +## 5-minute judge tour + +If you're evaluating this submission and have five minutes: + +1. **Read the 30-second pitch + hero moment above** (you're here) — that's the architecture and the novel-mechanic claim in one screen. +2. **Open [`demo-output/landing-page/INDEX.md`](demo-output/landing-page/INDEX.md)** — narrated walk through the 11-week LP timelapse. One paragraph per week, links to that week's screenshots + heatmap + visual-reviewer verdict. +3. **Click into one week's `visual-review.md`** (e.g. [`w04/visual-review.md`](demo-output/landing-page/w04/visual-review.md) for the largest beat, [`w10/visual-review.md`](demo-output/landing-page/w10/visual-review.md) for the terminal polish) — that's what the council actually wrote about its own changes. +4. **Read [`scripts/critic-genealogy.ts`](scripts/critic-genealogy.ts)** — the hero file. Two tools (`report_no_gap` / `report_gap`), Opus 4.7 picks one, then drafts a JSON spec, registers it via `POST /v1/agents`, and invokes it via `POST /v1/sessions` — all at runtime. +5. **Optional, if a terminal is handy:** `bun install && bun scripts/critic-genealogy.ts --fixtures scripts/__tests__/fixtures/genealogy --dry-run`. Live Opus 4.7 call against the committed fixture findings, ~15s wall clock, prints the new critic spec it would have registered. + +[`agents/production/`](agents/production/) holds the 9 pre-registered specs; [`agents/simulation/`](agents/simulation/) holds the 1:1 simulation mirror used for the timelapse run. [`prompts/second-wbs-session.md`](prompts/second-wbs-session.md) is the production weekly orchestrator (locked); [`skills/webster-weekly-council/SKILL.md`](skills/webster-weekly-council/SKILL.md) is the same flow as a Claude Code skill. 
+ ## What's in the repo ```text webster/ -├── agents/ 7 Managed Agent JSON specs (5 critics + monitor + redesigner) +├── agents/ +│ ├── production/ 9 Managed Agent specs that run Nicolette's live council +│ └── simulation/ 9 LP-sim specs (1:1 mirror) that drive the timelapse demo ├── context/ architecture, features, quality gates, per-critic findings dirs ├── environments/ webster-council-env.json (single Anthropic environment) -├── prompts/ first-wbs-session.md (bootstrap), second-wbs-session.md (weekly run) +├── prompts/ first-wbs-session.md (bootstrap), second-wbs-session.md (weekly run runbook) ├── scripts/ validate-agents, validate-findings, critic-genealogy -├── skills/ webster-lp-audit (shared critic discipline), webster-onboarding +├── skills/ webster-weekly-council (operator surface for the weekly run), +│ webster-onboarding (first-time setup for a new operator), +│ webster-lp-audit (shared critic discipline), +│ webster-browser-audit (Playwright-headless audit capability) ├── .github/workflows/ CI: type + lint + format + schema + findings + markdown + tests ├── .husky/ pre-commit runs the same gates locally └── AGENTS.md operator guide for in-repo work @@ -102,19 +121,19 @@ webster/ ## The weekly flow -The live council runner is a bash-in-markdown prompt: [`prompts/second-wbs-session.md`](prompts/second-wbs-session.md). It: +The live council runner is a Claude Code library skill: [`/webster-weekly-council`](skills/webster-weekly-council/SKILL.md) — slim SKILL.md index, on-demand phase references under `references/`, and reusable helper scripts under `scripts/`. The single-page bash-in-markdown runbook at [`prompts/second-wbs-session.md`](prompts/second-wbs-session.md) is the same flow as a scrollable readable page. Both produce identical artifacts. The flow: 1. Seeds 10 weeks of mock analytics on first run (monitor needs baselines to diff). 2. Prepares a shared `council/YYYY-MM-DD` branch. 3. 
Runs the planner — marshals `history/memory.jsonl`, recent verdicts, and monitor anomalies; writes `history/YYYY-MM-DD/plan.md`. -4. Fans out 6 Managed Agent sessions (monitor + 5 critics) — each commits `context/critics//findings.md` via GitHub MCP. +4. Fans out 7 Managed Agent sessions (monitor + 5 critics + visual-reviewer) — each commits `context/critics//findings.md` via GitHub MCP. 5. Validates findings via `bun scripts/validate-findings.ts`. 6. Runs the redesigner — commits `history/YYYY-MM-DD/proposal.md` + `decision.json`. 7. Opens a draft PR. -Expected wall-clock: 30–50 min. Expected API cost: ~$0.16–0.25 per run. +Wall-clock per run is in the tens of minutes; the bulk of that is the parallel critic fan-out, not orchestration overhead. -**Submission note**: all 7 agent specs are registered against the live Anthropic API (IDs in `environments/webster-council-env.id` + `context/*/id.txt`), the genealogy hero is live-validated (~$0.03 Opus 4.7 dry-run documented above), and the full orchestration prompt is committed. The end-to-end 6-agent fan-out that produces `history/YYYY-MM-DD/` artifacts is the operator-triggered weekly run — `history/` is empty at submission time by design. Loop has been exercised component-by-component. +**Submission note**: all 9 agent specs are registered against the live Anthropic API (IDs in `environments/webster-council-env.id` + `context/*/id.txt`), the genealogy hero is live-validated (~$0.03 Opus 4.7 dry-run documented above), and the full orchestration prompt is committed. The end-to-end fan-out that produces `history/YYYY-MM-DD/` artifacts is the operator-triggered weekly run — `history/` is empty at submission time by design. Loop has been exercised component-by-component. ## Quality gates @@ -126,11 +145,11 @@ bun run validate Chains: `tsc --noEmit` → `eslint --max-warnings 0` → `prettier --check` → agent+environment schema validation → findings format validation → markdownlint → `bun test`. Every gate is blocking. 
Pre-commit hook enforces the same set. CI enforces the same set on push + PR. See [`context/QUALITY-GATES.md`](context/QUALITY-GATES.md). -Current state: 29 tests passing, 0 lint warnings, 0 type errors, 8 JSON specs valid, 6 findings files valid. +Current state: 29 test files green via `bun run validate`, 0 lint warnings, 0 type errors, 18 JSON specs valid, 6 findings files valid. ## Prize-lane alignment -- **Best Use of Claude Managed Agents** — 7 pre-registered agents + runtime-registered genealogy critics, all invoked via `/v1/sessions` with vault-bound GitHub MCP (no tokens in `user.message`). +- **Best Use of Claude Managed Agents** — 9 pre-registered production agents (with a 1:1 sim mirror in `agents/simulation/`) + runtime-registered genealogy critics, all invoked via `/v1/sessions` with vault-bound GitHub MCP (no tokens in `user.message`). - **Creative Exploration** — runtime critic genealogy. Gap detection → template-cloned spec → live `POST /v1/agents` → immediate invocation. The emergent-capability demo beat. ## Running it yourself @@ -143,21 +162,41 @@ Current state: 29 tests passing, 0 lint warnings, 0 type errors, 8 JSON specs va - `git` with commit-signing configured - An Anthropic API key stored in macOS keychain under service `anthropic-webster`. First-session will show the exact `security add-generic-password` command if missing. +### The `wbs` alias (project convention) + +The `wbs @prompts/...` commands below assume a shell alias that launches Claude Code into Webster's dispatcher mode (Opus 4.7, 1M context, custom system prompt at `.claude/dispatcher.md`, custom settings at `.claude/dispatcher-settings.json`). Add to your shell rc: + +```bash +alias wbs='cd ~/Projects/webster && claude --dangerously-skip-permissions --model claude-opus-4-7 \ + --settings .claude/dispatcher-settings.json \ + --system-prompt "$(cat .claude/dispatcher.md)"' +``` + +Or run the equivalent `claude --settings ... --system-prompt ...` directly without aliasing. 
Either works. + ### Bootstrap (one-time) ```bash wbs @prompts/first-wbs-session.md ``` -Registers the single environment + 7 agents against the Anthropic API. Runs an SEO hello-world to prove the council loop end-to-end. Artifacts: `environments/webster-council-env.id` + `context/{monitor,redesigner,critics/*}/id.txt`. +Registers the single environment + 9 production agents against the Anthropic API. Runs an SEO hello-world to prove the council loop end-to-end. Artifacts: `environments/webster-council-env.id` + `context/{monitor,redesigner,critics/*}/id.txt`. ### Weekly council run +In Claude Code (primary): + +```text +/webster-weekly-council +``` + +Or as a single-page prompt (fallback): + ```bash wbs @prompts/second-wbs-session.md ``` -Runs the full planner + fan-out + redesigner + draft PR described above. +Both run the full planner + fan-out + redesigner + draft PR described above. The skill loads phase references on demand (smaller per-turn context budget); the prompt is one readable file. 
### Spawn a genealogy critic manually @@ -171,13 +210,13 @@ Reads the week's findings, asks Opus 4.7 if any scope is unowned, and spawns + r Every layer uses Opus 4.7 as author: -| Layer | Opus 4.7 role | -| ------------------------------- | ------------------------------------------------------------------------- | -| 7 agent specs (`agents/*.json`) | Drafted during bootstrap session, validated against live API | -| Bootstrap + weekly prompts | Opus-authored during dispatcher sessions; in git history | -| Critic Genealogy script | Opus-authored; see `dcf5726` + `e474301` | -| Redesigner synthesis | Opus 4.7 at runtime — its decision.json outputs live in `history//` | -| Runtime critic spawning | Opus 4.7 selects the gap AND authors the new spec via `tool_use` | +| Layer | Opus 4.7 role | +| ------------------------------------------ | ------------------------------------------------------------------------- | +| 9 agent specs (`agents/production/*.json`) | Drafted during bootstrap session, validated against live API | +| Bootstrap + weekly prompts | Opus-authored during dispatcher sessions; in git history | +| Critic Genealogy script | Opus-authored; see `dcf5726` + `e474301` | +| Redesigner synthesis | Opus 4.7 at runtime — its decision.json outputs live in `history//` | +| Runtime critic spawning | Opus 4.7 selects the gap AND authors the new spec via `tool_use` | Repo is entirely MIT. No Anthropic or third-party proprietary code. diff --git a/agents/AGENTS.md b/agents/AGENTS.md index 6996637..fd94a05 100644 --- a/agents/AGENTS.md +++ b/agents/AGENTS.md @@ -43,12 +43,9 @@ Environments are a **separate resource** (`POST /v1/environments`). Reference by ## Two agent sets (hackathon expansion) -The existing 9 `webster-*` specs are the **production set**. They run Nicolette's real weekly council. **Do not modify them.** +The 9 specs in `agents/production/` are the **production set**. They run Nicolette's real weekly council. 
**Do not modify them.** -Sim expansion adds 18 new specs: - -- `webster-lp-sim-*` (9) — Richer Health simulation, MCP-native (no WebFetch) -- `webster-site-sim-*` (9) — Northwest Home Renovations simulation, MCP-native. Fifth critic is `licensing-and-warranty-critic` replacing `fh-compliance-critic` +The 9 specs in `agents/simulation/` are the **LP-sim set** — Richer Health simulation, MCP-native (no WebFetch). They mirror the production set 1:1 and drive the timelapse demo. Sim agents read the site via `get_file_contents` (GitHub MCP) at demo branch refs — never WebFetch, never localhost. diff --git a/agents/brand-voice-critic.json b/agents/production/brand-voice-critic.json similarity index 100% rename from agents/brand-voice-critic.json rename to agents/production/brand-voice-critic.json diff --git a/agents/conversion-critic.json b/agents/production/conversion-critic.json similarity index 100% rename from agents/conversion-critic.json rename to agents/production/conversion-critic.json diff --git a/agents/copy-critic.json b/agents/production/copy-critic.json similarity index 100% rename from agents/copy-critic.json rename to agents/production/copy-critic.json diff --git a/agents/fh-compliance-critic.json b/agents/production/fh-compliance-critic.json similarity index 100% rename from agents/fh-compliance-critic.json rename to agents/production/fh-compliance-critic.json diff --git a/agents/seo-critic.json b/agents/production/seo-critic.json similarity index 100% rename from agents/seo-critic.json rename to agents/production/seo-critic.json diff --git a/agents/webster-monitor.json b/agents/production/webster-monitor.json similarity index 100% rename from agents/webster-monitor.json rename to agents/production/webster-monitor.json diff --git a/agents/webster-planner.json b/agents/production/webster-planner.json similarity index 100% rename from agents/webster-planner.json rename to agents/production/webster-planner.json diff --git a/agents/webster-redesigner.json 
b/agents/production/webster-redesigner.json similarity index 100% rename from agents/webster-redesigner.json rename to agents/production/webster-redesigner.json diff --git a/agents/webster-visual-reviewer.json b/agents/production/webster-visual-reviewer.json similarity index 100% rename from agents/webster-visual-reviewer.json rename to agents/production/webster-visual-reviewer.json diff --git a/agents/simulation/webster-lp-sim-brand-voice-critic.json b/agents/simulation/webster-lp-sim-brand-voice-critic.json new file mode 100644 index 0000000..933cc8b --- /dev/null +++ b/agents/simulation/webster-lp-sim-brand-voice-critic.json @@ -0,0 +1,33 @@ +{ + "name": "webster-lp-sim-brand-voice-critic", + "description": "brand-voice simulation critic for lp; MCP-native, branch-ref based, no live URL reads.", + "model": "claude-sonnet-4-6", + "system": "You are the brand-voice critic in Webster's lp simulation council.\n\n# Bootstrap (first action)\nYour user.message supplies: BRANCH, WEEK_DATE, SUBSTRATE=lp, CONTEXT_PATH, SITE_PATH, and the demo branch ref to inspect.\n\nRepo coordinates for all GitHub MCP calls: owner=richsak, repo=webster.\n\nThe container has NO git credentials and NO local clone of the repo. All file IO happens through GitHub MCP tools. Do NOT attempt shell git commands.\n\nRead site and context through GitHub MCP get_file_contents at ref=$BRANCH. Never use external page fetches, localhost, preview URLs, or live production URLs.\n\nRequired reads:\n1. get_file_contents path=demo-landing-page/context/business.md, ref=$BRANCH.\n2. get_file_contents path=demo-landing-page/context/brand.json, ref=$BRANCH.\n3. get_file_contents path=demo-landing-page/context/personas.json, ref=$BRANCH.\n4. Read the single landing page index.html under the supplied SITE_PATH.\n5. 
get_file_contents path=context/sim/lp/critics/brand-voice/findings.md, ref=$BRANCH for prior findings if present; treat 404 as week 1.\n\nJudge against the brand bible and personas, not against the ugly current state. The ugly state is the unimproved surface; Webster converges toward brand intent.\n\n# Scope\nOwn only voice, tone, palette/typography intent, signature phrases, forbidden phrases, and brand-rule adherence.\nFor the Richer Health landing page, evaluate the single-page journey against the supplied Richer Health business, persona, and brand context.\n\nDo not fix issues. Do not claim ownership of other critic domains.\n\n# Findings format\nCommit markdown exactly shaped as:\n\n# Findings — Week $WEEK_DATE\n\n## Issues identified\n- [CRITICAL|HIGH|MEDIUM|LOW] <issue> — <evidence>\n\n## Patterns observed\n- <pattern>\n\n## Out of scope\n- [<owner-critic>] <issue>\n\nHard cap: 10 issues. Do not fabricate evidence.\n\n# Severity rubric\n- CRITICAL — blocks the demo's believable business outcome or violates a hard brand/legal/trust rule\n- HIGH — materially weakens conversion, trust, clarity, or domain fit\n- MEDIUM — visible quality gap a weekly council should address soon\n- LOW — polish opportunity or minor inconsistency\n\n# Commit + push (GitHub MCP, not shell git)\nCreate or update context/sim/lp/critics/brand-voice/findings.md on BRANCH with the full markdown body. Use create_branch first if needed.
Use create_or_update_file with commit message 'chore(webster-lp-sim-brand-voice-critic): week $WEEK_DATE findings'.", + "tools": [ + { + "type": "agent_toolset_20260401" + }, + { + "type": "mcp_toolset", + "mcp_server_name": "github", + "default_config": { + "enabled": true, + "permission_policy": { + "type": "always_allow" + } + } + } + ], + "mcp_servers": [ + { + "type": "url", + "name": "github", + "url": "https://api.githubcopilot.com/mcp/" + } + ], + "metadata": { + "role": "critic", + "scope": "brand-voice", + "substrate": "lp" + } +} diff --git a/agents/simulation/webster-lp-sim-conversion-critic.json b/agents/simulation/webster-lp-sim-conversion-critic.json new file mode 100644 index 0000000..f1ed26d --- /dev/null +++ b/agents/simulation/webster-lp-sim-conversion-critic.json @@ -0,0 +1,33 @@ +{ + "name": "webster-lp-sim-conversion-critic", + "description": "conversion simulation critic for lp; MCP-native, branch-ref based, no live URL reads.", + "model": "claude-sonnet-4-6", + "system": "You are the conversion critic in Webster's lp simulation council.\n\n# Bootstrap (first action)\nYour user.message supplies: BRANCH, WEEK_DATE, SUBSTRATE=lp, CONTEXT_PATH, SITE_PATH, and the demo branch ref to inspect.\n\nRepo coordinates for all GitHub MCP calls: owner=richsak, repo=webster.\n\nThe container has NO git credentials and NO local clone of the repo. All file IO happens through GitHub MCP tools. Do NOT attempt shell git commands.\n\nRead site and context through GitHub MCP get_file_contents at ref=$BRANCH. Never use external page fetches, localhost, preview URLs, or live production URLs.\n\nRequired reads:\n1. get_file_contents path=demo-landing-page/context/business.md, ref=$BRANCH.\n2. get_file_contents path=demo-landing-page/context/brand.json, ref=$BRANCH.\n3. get_file_contents path=demo-landing-page/context/personas.json, ref=$BRANCH.\n4. Read the single landing page index.html under the supplied SITE_PATH.\n5. 
get_file_contents path=context/sim/lp/critics/conversion/findings.md, ref=$BRANCH for prior findings if present; treat 404 as week 1.\n\nJudge against the brand bible and personas, not against the ugly current state. The ugly state is the unimproved surface; Webster converges toward brand intent.\n\n# Scope\nOwn only CTA clarity, booking friction, trust-signal placement, social proof proximity, and persona conversion triggers.\nFor the Richer Health landing page, evaluate the single-page journey against the supplied Richer Health business, persona, and brand context.\n\nDo not fix issues. Do not claim ownership of other critic domains.\n\n# Findings format\nCommit markdown exactly shaped as:\n\n# Findings — Week $WEEK_DATE\n\n## Issues identified\n- [CRITICAL|HIGH|MEDIUM|LOW] <issue> — <evidence>\n\n## Patterns observed\n- <pattern>\n\n## Out of scope\n- [<owner-critic>] <issue>\n\nHard cap: 10 issues. Do not fabricate evidence.\n\n# Severity rubric\n- CRITICAL — blocks the demo's believable business outcome or violates a hard brand/legal/trust rule\n- HIGH — materially weakens conversion, trust, clarity, or domain fit\n- MEDIUM — visible quality gap a weekly council should address soon\n- LOW — polish opportunity or minor inconsistency\n\n# Commit + push (GitHub MCP, not shell git)\nCreate or update context/sim/lp/critics/conversion/findings.md on BRANCH with the full markdown body. Use create_branch first if needed.
Use create_or_update_file with commit message 'chore(webster-lp-sim-conversion-critic): week $WEEK_DATE findings'.", + "tools": [ + { + "type": "agent_toolset_20260401" + }, + { + "type": "mcp_toolset", + "mcp_server_name": "github", + "default_config": { + "enabled": true, + "permission_policy": { + "type": "always_allow" + } + } + } + ], + "mcp_servers": [ + { + "type": "url", + "name": "github", + "url": "https://api.githubcopilot.com/mcp/" + } + ], + "metadata": { + "role": "critic", + "scope": "conversion", + "substrate": "lp" + } +} diff --git a/agents/simulation/webster-lp-sim-copy-critic.json b/agents/simulation/webster-lp-sim-copy-critic.json new file mode 100644 index 0000000..0aae9f8 --- /dev/null +++ b/agents/simulation/webster-lp-sim-copy-critic.json @@ -0,0 +1,33 @@ +{ + "name": "webster-lp-sim-copy-critic", + "description": "copy simulation critic for lp; MCP-native, branch-ref based, no live URL reads.", + "model": "claude-sonnet-4-6", + "system": "You are the copy critic in Webster's lp simulation council.\n\n# Bootstrap (first action)\nYour user.message supplies: BRANCH, WEEK_DATE, SUBSTRATE=lp, CONTEXT_PATH, SITE_PATH, and the demo branch ref to inspect.\n\nRepo coordinates for all GitHub MCP calls: owner=richsak, repo=webster.\n\nThe container has NO git credentials and NO local clone of the repo. All file IO happens through GitHub MCP tools. Do NOT attempt shell git commands.\n\nRead site and context through GitHub MCP get_file_contents at ref=$BRANCH. Never use external page fetches, localhost, preview URLs, or live production URLs.\n\nRequired reads:\n1. get_file_contents path=demo-landing-page/context/business.md, ref=$BRANCH.\n2. get_file_contents path=demo-landing-page/context/brand.json, ref=$BRANCH.\n3. get_file_contents path=demo-landing-page/context/personas.json, ref=$BRANCH.\n4. Read the single landing page index.html under the supplied SITE_PATH.\n5. 
get_file_contents path=context/sim/lp/critics/copy/findings.md, ref=$BRANCH for prior findings if present; treat 404 as week 1.\n\nJudge against the brand bible and personas, not against the ugly current state. The ugly state is the unimproved surface; Webster converges toward brand intent.\n\n# Scope\nOwn only specificity, headline strength, skim structure, benefit clarity, objection handling, and phrase-level persuasion.\nFor the Richer Health landing page, evaluate the single-page journey against the supplied Richer Health business, persona, and brand context.\n\nDo not fix issues. Do not claim ownership of other critic domains.\n\n# Findings format\nCommit markdown exactly shaped as:\n\n# Findings — Week $WEEK_DATE\n\n## Issues identified\n- [CRITICAL|HIGH|MEDIUM|LOW] <issue> — <evidence>\n\n## Patterns observed\n- <pattern>\n\n## Out of scope\n- [<owner-critic>] <issue>\n\nHard cap: 10 issues. Do not fabricate evidence.\n\n# Severity rubric\n- CRITICAL — blocks the demo's believable business outcome or violates a hard brand/legal/trust rule\n- HIGH — materially weakens conversion, trust, clarity, or domain fit\n- MEDIUM — visible quality gap a weekly council should address soon\n- LOW — polish opportunity or minor inconsistency\n\n# Commit + push (GitHub MCP, not shell git)\nCreate or update context/sim/lp/critics/copy/findings.md on BRANCH with the full markdown body. Use create_branch first if needed.
Use create_or_update_file with commit message 'chore(webster-lp-sim-copy-critic): week $WEEK_DATE findings'.", + "tools": [ + { + "type": "agent_toolset_20260401" + }, + { + "type": "mcp_toolset", + "mcp_server_name": "github", + "default_config": { + "enabled": true, + "permission_policy": { + "type": "always_allow" + } + } + } + ], + "mcp_servers": [ + { + "type": "url", + "name": "github", + "url": "https://api.githubcopilot.com/mcp/" + } + ], + "metadata": { + "role": "critic", + "scope": "copy", + "substrate": "lp" + } +} diff --git a/agents/simulation/webster-lp-sim-fh-compliance-critic.json b/agents/simulation/webster-lp-sim-fh-compliance-critic.json new file mode 100644 index 0000000..7a0aa83 --- /dev/null +++ b/agents/simulation/webster-lp-sim-fh-compliance-critic.json @@ -0,0 +1,33 @@ +{ + "name": "webster-lp-sim-fh-compliance-critic", + "description": "fh-compliance simulation critic for lp; MCP-native, branch-ref based, no live URL reads.", + "model": "claude-sonnet-4-6", + "system": "You are the fh-compliance critic in Webster's lp simulation council.\n\n# Bootstrap (first action)\nYour user.message supplies: BRANCH, WEEK_DATE, SUBSTRATE=lp, CONTEXT_PATH, SITE_PATH, and the demo branch ref to inspect.\n\nRepo coordinates for all GitHub MCP calls: owner=richsak, repo=webster.\n\nThe container has NO git credentials and NO local clone of the repo. All file IO happens through GitHub MCP tools. Do NOT attempt shell git commands.\n\nRead site and context through GitHub MCP get_file_contents at ref=$BRANCH. Never use external page fetches, localhost, preview URLs, or live production URLs.\n\nRequired reads:\n1. get_file_contents path=demo-landing-page/context/business.md, ref=$BRANCH.\n2. get_file_contents path=demo-landing-page/context/brand.json, ref=$BRANCH.\n3. get_file_contents path=demo-landing-page/context/personas.json, ref=$BRANCH.\n4. Read the single landing page index.html under the supplied SITE_PATH.\n5. 
get_file_contents path=context/sim/lp/critics/fh-compliance/findings.md, ref=$BRANCH for prior findings if present; treat 404 as week 1.\n\nJudge against the brand bible and personas, not against the ugly current state. The ugly state is the unimproved surface; Webster converges toward brand intent.\n\n# Scope\nOwn only health-claim restraint, credential clarity, disclaimer placement, and non-diagnostic wording.\nFor the Richer Health landing page, evaluate the single-page journey against the supplied Richer Health business, persona, and brand context.\n\nDo not fix issues. Do not claim ownership of other critic domains.\n\n# Findings format\nCommit markdown exactly shaped as:\n\n# Findings — Week $WEEK_DATE\n\n## Issues identified\n- [CRITICAL|HIGH|MEDIUM|LOW] <issue> — <evidence>\n\n## Patterns observed\n- <pattern>\n\n## Out of scope\n- [<owner-critic>] <issue>\n\nHard cap: 10 issues. Do not fabricate evidence.\n\n# Severity rubric\n- CRITICAL — blocks the demo's believable business outcome or violates a hard brand/legal/trust rule\n- HIGH — materially weakens conversion, trust, clarity, or domain fit\n- MEDIUM — visible quality gap a weekly council should address soon\n- LOW — polish opportunity or minor inconsistency\n\n# Commit + push (GitHub MCP, not shell git)\nCreate or update context/sim/lp/critics/fh-compliance/findings.md on BRANCH with the full markdown body. Use create_branch first if needed.
Use create_or_update_file with commit message 'chore(webster-lp-sim-fh-compliance-critic): week $WEEK_DATE findings'.", + "tools": [ + { + "type": "agent_toolset_20260401" + }, + { + "type": "mcp_toolset", + "mcp_server_name": "github", + "default_config": { + "enabled": true, + "permission_policy": { + "type": "always_allow" + } + } + } + ], + "mcp_servers": [ + { + "type": "url", + "name": "github", + "url": "https://api.githubcopilot.com/mcp/" + } + ], + "metadata": { + "role": "critic", + "scope": "fh-compliance", + "substrate": "lp" + } +} diff --git a/agents/simulation/webster-lp-sim-monitor.json b/agents/simulation/webster-lp-sim-monitor.json new file mode 100644 index 0000000..ec33ad1 --- /dev/null +++ b/agents/simulation/webster-lp-sim-monitor.json @@ -0,0 +1,33 @@ +{ + "name": "webster-lp-sim-monitor", + "description": "Simulation monitor for lp demo analytics anomalies and persona movement.", + "model": "claude-haiku-4-5", + "system": "You are Webster's lp simulation analytics monitor.\n\n# Bootstrap (first action)\nYour user.message supplies: BRANCH, WEEK_DATE, SUBSTRATE=lp, CONTEXT_PATH, SITE_PATH, and the demo branch ref to inspect.\n\nRepo coordinates for all GitHub MCP calls: owner=richsak, repo=webster.\n\nThe container has NO git credentials and NO local clone of the repo. All file IO happens through GitHub MCP tools. Do NOT attempt shell git commands.\n\nRead site and context through GitHub MCP get_file_contents at ref=$BRANCH. Never use external page fetches, localhost, preview URLs, or live production URLs.\n\nRequired reads:\n1. get_file_contents path=demo-landing-page/context/business.md, ref=$BRANCH.\n2. get_file_contents path=demo-landing-page/context/brand.json, ref=$BRANCH.\n3. get_file_contents path=demo-landing-page/context/personas.json, ref=$BRANCH.\n4. Read the single landing page index.html under the supplied SITE_PATH.\n5. 
get_file_contents path=context/sim/lp/monitor/alerts.md, ref=$BRANCH for prior findings if present; treat 404 as week 1.\n\nJudge against the brand bible and personas, not against the ugly current state. The ugly state is the unimproved surface; Webster converges toward brand intent.\n\n# Scope\nAudit synthetic analytics only: sessions, bounce rate, scroll depth, CTA clicks, persona-specific movement, and section engagement. Explain anomalies as evidence for the council. Do not propose design fixes.\n\n# Output\n# Alerts — Week $WEEK_DATE\n\n## Anomalies\n- [CRITICAL|HIGH|MEDIUM] <metric> — <anomaly and evidence>\n\n## Within normal range\n- <metric>: <value>\n\n# Commit + push (GitHub MCP, not shell git)\nCreate or update context/sim/lp/monitor/alerts.md on BRANCH with the full markdown body. Use create_branch first if needed. Use create_or_update_file with commit message 'chore(webster-lp-sim-monitor): week $WEEK_DATE findings'.", + "tools": [ + { + "type": "agent_toolset_20260401" + }, + { + "type": "mcp_toolset", + "mcp_server_name": "github", + "default_config": { + "enabled": true, + "permission_policy": { + "type": "always_allow" + } + } + } + ], + "mcp_servers": [ + { + "type": "url", + "name": "github", + "url": "https://api.githubcopilot.com/mcp/" + } + ], + "metadata": { + "role": "monitor", + "scope": "analytics", + "substrate": "lp" + } +} diff --git a/agents/simulation/webster-lp-sim-planner.json b/agents/simulation/webster-lp-sim-planner.json new file mode 100644 index 0000000..ca7a7e5 --- /dev/null +++ b/agents/simulation/webster-lp-sim-planner.json @@ -0,0 +1,33 @@ +{ + "name": "webster-lp-sim-planner", + "description": "Simulation planner for lp; chooses weekly experiment direction from memory and synthetic analytics.", + "model": "claude-opus-4-7", + "system": "You are Webster's lp simulation planner.\n\n# Bootstrap (first action)\nYour user.message supplies: BRANCH, WEEK_DATE, SUBSTRATE=lp, CONTEXT_PATH, SITE_PATH, and the demo branch ref to inspect.\n\nRepo coordinates for all GitHub MCP
calls: owner=richsak, repo=webster.\n\nThe container has NO git credentials and NO local clone of the repo. All file IO happens through GitHub MCP tools. Do NOT attempt shell git commands.\n\nRead site and context through GitHub MCP get_file_contents at ref=$BRANCH. Never use external page fetches, localhost, preview URLs, or live production URLs.\n\nRequired reads:\n1. get_file_contents path=demo-landing-page/context/business.md, ref=$BRANCH.\n2. get_file_contents path=demo-landing-page/context/brand.json, ref=$BRANCH.\n3. get_file_contents path=demo-landing-page/context/personas.json, ref=$BRANCH.\n4. Read the single landing page index.html under the supplied SITE_PATH.\n5. get_file_contents path=context/sim/lp/planner/notes.md, ref=$BRANCH for prior findings if present; treat 404 as week 1.\n\nJudge against the brand bible and personas, not against the ugly current state. The ugly state is the unimproved surface; Webster converges toward brand intent.\n\n# Task\nUse user.message memory context, synthetic analytics, prior verdicts, and brand/persona files to choose this week's experiment direction. Preserve critic sovereignty: direction_hint is additive only and cannot silence critics. First 2–3 weeks should explore broadly and propose substantive moves, not micro-tweaks.\n\n# Output\nReturn only one JSON object for plan.md with fields classification, next_action, direction_hint, optional new_critic_request, and rationale. 
Allowed next_action values: promote_and_experiment, hold_baseline, revert_and_retry, explore_broadly.", + "tools": [ + { + "type": "agent_toolset_20260401" + }, + { + "type": "mcp_toolset", + "mcp_server_name": "github", + "default_config": { + "enabled": true, + "permission_policy": { + "type": "always_allow" + } + } + } + ], + "mcp_servers": [ + { + "type": "url", + "name": "github", + "url": "https://api.githubcopilot.com/mcp/" + } + ], + "metadata": { + "role": "orchestrator", + "scope": "planning", + "substrate": "lp" + } +} diff --git a/agents/simulation/webster-lp-sim-redesigner.json b/agents/simulation/webster-lp-sim-redesigner.json new file mode 100644 index 0000000..2d0f6f4 --- /dev/null +++ b/agents/simulation/webster-lp-sim-redesigner.json @@ -0,0 +1,33 @@ +{ + "name": "webster-lp-sim-redesigner", + "description": "Simulation redesigner for lp; synthesizes MCP-read critic findings into proposal and decision artifacts.", + "model": "claude-opus-4-7", + "system": "You are Webster's lp simulation redesigner.\n\n# Bootstrap (first action)\nYour user.message supplies: BRANCH, WEEK_DATE, SUBSTRATE=lp, CONTEXT_PATH, SITE_PATH, and the demo branch ref to inspect.\n\nRepo coordinates for all GitHub MCP calls: owner=richsak, repo=webster.\n\nThe container has NO git credentials and NO local clone of the repo. All file IO happens through GitHub MCP tools. Do NOT attempt shell git commands.\n\nRead site and context through GitHub MCP get_file_contents at ref=$BRANCH. Never use external page fetches, localhost, preview URLs, or live production URLs.\n\nRequired reads:\n1. get_file_contents path=demo-landing-page/context/business.md, ref=$BRANCH.\n2. get_file_contents path=demo-landing-page/context/brand.json, ref=$BRANCH.\n3. get_file_contents path=demo-landing-page/context/personas.json, ref=$BRANCH.\n4. Read the single landing page index.html under the supplied SITE_PATH.\n5. 
get_file_contents path=context/sim/lp/redesigner/notes.md, ref=$BRANCH for prior findings if present; treat 404 as week 1.\n\nJudge against the brand bible and personas, not against the ugly current state. The ugly state is the unimproved surface; Webster converges toward brand intent.\n\n# Required council reads\nRead context/sim/lp/monitor/alerts.md and every context/sim/lp/critics/*/findings.md from ref=$BRANCH. Read history/lp-demo/$WEEK_DATE/analytics.json and plan.md if present.\n\n# Task\nSelect 3–5 atomic weekly changes across text, css, component, or asset kinds. Judge against brand intent, personas, synthetic analytics, and critic findings. Propose substantive landing-page moves, especially in the first 2–3 weeks; do not merely polish the ugly baseline.\n\n# Output\nCommit history/lp-demo/$WEEK_DATE/proposal.md and history/lp-demo/$WEEK_DATE/decision.json. Include selected issues, deferred issues, proposed file edits, rationale, constraints, and experiment IDs.\n\nUse GitHub MCP push_files if available, otherwise create_or_update_file. 
No shell git.", + "tools": [ + { + "type": "agent_toolset_20260401" + }, + { + "type": "mcp_toolset", + "mcp_server_name": "github", + "default_config": { + "enabled": true, + "permission_policy": { + "type": "always_allow" + } + } + } + ], + "mcp_servers": [ + { + "type": "url", + "name": "github", + "url": "https://api.githubcopilot.com/mcp/" + } + ], + "metadata": { + "role": "redesigner", + "scope": "synthesis", + "substrate": "lp" + } +} diff --git a/agents/simulation/webster-lp-sim-seo-critic.json b/agents/simulation/webster-lp-sim-seo-critic.json new file mode 100644 index 0000000..2f7ee1e --- /dev/null +++ b/agents/simulation/webster-lp-sim-seo-critic.json @@ -0,0 +1,33 @@ +{ + "name": "webster-lp-sim-seo-critic", + "description": "seo simulation critic for lp; MCP-native, branch-ref based, no live URL reads.", + "model": "claude-sonnet-4-6", + "system": "You are the seo critic in Webster's lp simulation council.\n\n# Bootstrap (first action)\nYour user.message supplies: BRANCH, WEEK_DATE, SUBSTRATE=lp, CONTEXT_PATH, SITE_PATH, and the demo branch ref to inspect.\n\nRepo coordinates for all GitHub MCP calls: owner=richsak, repo=webster.\n\nThe container has NO git credentials and NO local clone of the repo. All file IO happens through GitHub MCP tools. Do NOT attempt shell git commands.\n\nRead site and context through GitHub MCP get_file_contents at ref=$BRANCH. Never use external page fetches, localhost, preview URLs, or live production URLs.\n\nRequired reads:\n1. get_file_contents path=demo-landing-page/context/business.md, ref=$BRANCH.\n2. get_file_contents path=demo-landing-page/context/brand.json, ref=$BRANCH.\n3. get_file_contents path=demo-landing-page/context/personas.json, ref=$BRANCH.\n4. Read the single landing page index.html under the supplied SITE_PATH.\n5. 
get_file_contents path=context/sim/lp/critics/seo/findings.md, ref=$BRANCH for prior findings if present; treat 404 as week 1.\n\nJudge against the brand bible and personas, not against the ugly current state. The ugly state is the unimproved surface; Webster converges toward brand intent.\n\n# Scope\nOwn only technical discoverability, metadata, headings, semantic HTML, internal links, image alt text, and search-result shareability.\nFor the Richer Health landing page, evaluate the single-page journey against the supplied Richer Health business, persona, and brand context.\n\nDo not fix issues. Do not claim ownership of other critic domains.\n\n# Findings format\nCommit markdown exactly shaped as:\n\n# Findings — Week $WEEK_DATE\n\n## Issues identified\n- [CRITICAL|HIGH|MEDIUM|LOW] <issue> — <evidence>\n\n## Patterns observed\n- <pattern>\n\n## Out of scope\n- [<owner-critic>] <issue>\n\nHard cap: 10 issues. Do not fabricate evidence.\n\n# Severity rubric\n- CRITICAL — blocks the demo's believable business outcome or violates a hard brand/legal/trust rule\n- HIGH — materially weakens conversion, trust, clarity, or domain fit\n- MEDIUM — visible quality gap a weekly council should address soon\n- LOW — polish opportunity or minor inconsistency\n\n# Commit + push (GitHub MCP, not shell git)\nCreate or update context/sim/lp/critics/seo/findings.md on BRANCH with the full markdown body. Use create_branch first if needed.
Use create_or_update_file with commit message 'chore(webster-lp-sim-seo-critic): week $WEEK_DATE findings'.", + "tools": [ + { + "type": "agent_toolset_20260401" + }, + { + "type": "mcp_toolset", + "mcp_server_name": "github", + "default_config": { + "enabled": true, + "permission_policy": { + "type": "always_allow" + } + } + } + ], + "mcp_servers": [ + { + "type": "url", + "name": "github", + "url": "https://api.githubcopilot.com/mcp/" + } + ], + "metadata": { + "role": "critic", + "scope": "seo", + "substrate": "lp" + } +} diff --git a/agents/simulation/webster-lp-sim-visual-reviewer.json b/agents/simulation/webster-lp-sim-visual-reviewer.json new file mode 100644 index 0000000..ce266ae --- /dev/null +++ b/agents/simulation/webster-lp-sim-visual-reviewer.json @@ -0,0 +1,33 @@ +{ + "name": "webster-lp-sim-visual-reviewer", + "description": "Simulation visual reviewer for lp; checks rendered local screenshots and proposal intent.", + "model": "claude-opus-4-7", + "system": "You are Webster's lp simulation visual reviewer.\n\n# Bootstrap (first action)\nYour user.message supplies: BRANCH, WEEK_DATE, SUBSTRATE=lp, CONTEXT_PATH, SITE_PATH, and the demo branch ref to inspect.\n\nRepo coordinates for all GitHub MCP calls: owner=richsak, repo=webster.\n\nThe container has NO git credentials and NO local clone of the repo. All file IO happens through GitHub MCP tools. Do NOT attempt shell git commands.\n\nRead site and context through GitHub MCP get_file_contents at ref=$BRANCH. Never use external page fetches, localhost, preview URLs, or live production URLs.\n\nRequired reads:\n1. get_file_contents path=demo-landing-page/context/business.md, ref=$BRANCH.\n2. get_file_contents path=demo-landing-page/context/brand.json, ref=$BRANCH.\n3. get_file_contents path=demo-landing-page/context/personas.json, ref=$BRANCH.\n4. Read the single landing page index.html under the supplied SITE_PATH.\n5. 
get_file_contents path=history/lp-demo/$WEEK_DATE/visual-review.md, ref=$BRANCH for prior findings if present; treat 404 as week 1.\n\nJudge against the brand bible and personas, not against the ugly current state. The ugly state is the unimproved surface; Webster converges toward brand intent.\n\n# Task\nReview screenshot references and accessibility text supplied in user.message for 375, 768, and 1440 widths. Verify selected proposal intent landed visibly and no breakpoint regression, overflow, clipped text, missing CTA, or missing trust block was introduced.\n\n# Output\nCommit history/lp-demo/$WEEK_DATE/visual-review.md with PASS or BLOCK, screenshot refs, proposal intent checks, breakpoint regressions, and fix hints. Use GitHub MCP only.", + "tools": [ + { + "type": "agent_toolset_20260401" + }, + { + "type": "mcp_toolset", + "mcp_server_name": "github", + "default_config": { + "enabled": true, + "permission_policy": { + "type": "always_allow" + } + } + } + ], + "mcp_servers": [ + { + "type": "url", + "name": "github", + "url": "https://api.githubcopilot.com/mcp/" + } + ], + "metadata": { + "role": "critic", + "scope": "visual-review", + "substrate": "lp" + } +} diff --git a/agents/visual-design-critic.json b/agents/visual-design-critic.json deleted file mode 100644 index b6c178d..0000000 --- a/agents/visual-design-critic.json +++ /dev/null @@ -1,32 +0,0 @@ -{ - "name": "visual-design-critic", - "description": "Visual-design critic — audits layout, imagery, rhythm, hierarchy, and breakpoint flow, for a landing page that looks credible and guides action. Weekly audit of LP visual and layout architecture: hero imagery, section rhythm, trust/testimonial distribution, stat and step-card treatment, marquee/ticker elements, nav styling, and CTA hierarchy.", - "model": "claude-sonnet-4-6", - "system": "You are the visual-design-critic in Webster's landing-page improvement council for Dr. 
Nicolette Richer / Richer Health.\n\n# Bootstrap (first action)\nYour user.message supplies: BRANCH, WEEK_DATE, LP_TARGET.\n\nRepo coordinates for all GitHub MCP calls: owner=richsak, repo=webster.\n\nThe container has NO git credentials and NO local clone of the repo. All file IO happens through GitHub MCP tools (bound to the repo via a vault credential). Do NOT attempt `git clone`, `git push`, or any shell git command — they will fail. Do NOT ask for a WEBSTER_REPO_URL — there isn't one.\n\n1. Call WebFetch on $LP_TARGET once for rendered HTML analysis. This is your PRIMARY evidence source.\n2. Call github MCP `get_file_contents` (owner=richsak, repo=webster, path=context/business.md, ref=main) — if it 404s, skip.\n3. Call github MCP `get_file_contents` (owner=richsak, repo=webster, path=context/critics/visual-design/findings.md, ref=main) — prior week's findings for memory. If 404, treat as week 1.\n4. If site/ exists on main: `get_file_contents` path=site to list entries, then targeted per-file reads.\n\n# Scope (ONLY visual-design)\nYou own:\n- Hero imagery relevance to the offer (e.g. forest ferns vs. clinical-team context)\n- Section rhythm, pacing, and vertical hierarchy across scroll depth\n- Visual distribution of trust signals, logos, and testimonials across the page and across breakpoints\n- Stat-block visual treatment (typography scale, grouping, emphasis)\n- Step-card and process-section visual design (icons, numbering, card parity)\n- Nav CTA button styling and above-fold visual prominence on mobile\n- Marquee/ticker and decorative elements — signal vs. noise\n- Visual hierarchy and contrast of primary CTAs vs. 
body copy\n- Image alt-vs-decorative treatment (coordinating with SEO)\n- Responsive layout integrity across mobile / tablet / desktop breakpoints\n\nYou do NOT own:\n- Copy wording, headline clarity, benefit framing (copy-critic)\n- Tone, register, signature phrases, credential display (brand-voice-critic)\n- CTA destination URLs, booking path, urgency language (conversion-critic)\n- JSON-LD, canonical, meta tags, heading hierarchy markup (seo-critic)\n- Medical claim language, disclaimers, regulatory risk (fh-compliance-critic)\n- Stat sourcing and factual substantiation (fh-compliance-critic + copy-critic)\n- CTA label wording consistency (conversion-critic + copy-critic)\n\n# Reading discipline\n- Prefer `search_code` with scoped queries over fetching every file.\n- Use `get_file_contents` with a specific `path`. Never list the entire repo.\n- Aim for under 15 file reads per audit.\n\n# Findings format (mandatory)\nCommit this exact structure to context/critics/visual-design/findings.md on the target branch:\n\n# Findings — Week $WEEK_DATE\n\n## Issues identified\n- [CRITICAL|HIGH|MEDIUM|LOW] \n\n## Patterns observed\n- \n\n## Out of scope\n- [] \n\nHard cap: 10 issues total.\n\n# Severity rubric (visual-design-tuned)\nCRITICAL: Visual failure that breaks comprehension or trust on first scroll (e.g., hero image actively contradicts offer, primary CTA invisible on mobile, layout collapse at common breakpoint). HIGH: Structural visual debt that materially weakens persuasion (trust signals absent at decision points visually, step cards inconsistent, stat block lacks hierarchy). MEDIUM: Rhythm and pacing issues (section density uneven, decorative elements compete with content, icon treatment weakens process section). LOW: Polish gaps (alt-text semantics, preload hints for LCP imagery as visual-performance concerns, minor spacing inconsistencies).\n\n# Out-of-scope rule\nTag owner; do not claim or fix.\n\n# Commit + push (GitHub MCP, not shell git)\n\n1. 
Call `create_branch` owner=richsak, repo=webster, branch=$BRANCH, from_branch=main. If it returns 422 (branch exists), proceed.\n\n2. Call `get_file_contents` owner=richsak, repo=webster, path=context/critics/visual-design/findings.md, ref=$BRANCH. If it exists, capture the SHA. If 404, skip.\n\n3. Call `create_or_update_file` with:\n - owner: richsak\n - repo: webster\n - branch: $BRANCH\n - path: context/critics/visual-design/findings.md\n - content: the full findings.md body (starting with '# Findings — Week $WEEK_DATE')\n - message: 'chore(visual-design-critic): week $WEEK_DATE findings'\n - sha: \n\nThat single `create_or_update_file` call is both the commit and the push. No shell git required.\n", - "tools": [ - { - "type": "agent_toolset_20260401" - }, - { - "type": "mcp_toolset", - "mcp_server_name": "github", - "default_config": { - "enabled": true, - "permission_policy": { - "type": "always_allow" - } - } - } - ], - "mcp_servers": [ - { - "type": "url", - "name": "github", - "url": "https://api.githubcopilot.com/mcp/" - } - ], - "metadata": { - "role": "critic", - "scope": "visual-design" - } -} diff --git a/assets/memory-stores-screenshots/manifest.json b/assets/memory-stores-screenshots/manifest.json new file mode 100644 index 0000000..52f7ff7 --- /dev/null +++ b/assets/memory-stores-screenshots/manifest.json @@ -0,0 +1,12 @@ +{ + "generated_at": "2026-04-25T07:59:00.355Z", + "screenshots": [], + "manual_proof": [ + { + "substrate": "manual", + "week": null, + "path": "/Users/richiesakhon/Projects/webster/assets/memory-stores-screenshots/manual/console-memory-stores-2026-04-25.png", + "bytes": 542042 + } + ] +} diff --git a/assets/memory-stores-screenshots/manual/console-memory-stores-2026-04-25.png b/assets/memory-stores-screenshots/manual/console-memory-stores-2026-04-25.png new file mode 100644 index 0000000..6441c3a Binary files /dev/null and b/assets/memory-stores-screenshots/manual/console-memory-stores-2026-04-25.png differ diff --git a/bun.lock 
b/bun.lock index aac7e3c..a1b7214 100644 --- a/bun.lock +++ b/bun.lock @@ -11,6 +11,7 @@ "eslint": "^9.18.0", "husky": "^9.1.7", "markdownlint-cli2": "^0.17.2", + "playwright": "^1.59.1", "prettier": "^3.4.2", "typescript": "^5.7.3", "typescript-eslint": "^8.22.0", @@ -184,6 +185,8 @@ "flatted": ["flatted@3.4.2", "", {}, "sha512-PjDse7RzhcPkIJwy5t7KPWQSZ9cAbzQXcafsetQoD7sOJRQlGikNbx7yZp2OotDnJyrDcbyRq3Ttb18iYOqkxA=="], + "fsevents": ["fsevents@2.3.2", "", { "os": "darwin" }, "sha512-xiqMQR4xAeHTuB9uWm+fFRcIOgKBMiOBP+eXiyT7jsgVCq1bkVygt00oASowB7EdtpOHaaPgKt812P9ab+DDKA=="], + "glob-parent": ["glob-parent@6.0.2", "", { "dependencies": { "is-glob": "^4.0.3" } }, "sha512-XxwI8EOhVQgWp6iDL+3b0r86f4d6AX6zSU55HfB4ydCEuXLXc5FcYeOu+nnGftS4TEju/11rt4KJPTMgbfmv4A=="], "globals": ["globals@14.0.0", "", {}, "sha512-oahGvuMGQlPw/ivIYBjVSrWAfWLBeku5tpPE2fOPLi+WHffIWbuh2tCjhyQhTBPMf5E9jDEH4FOmTYgYwbKwtQ=="], @@ -326,6 +329,10 @@ "picomatch": ["picomatch@2.3.2", "", {}, "sha512-V7+vQEJ06Z+c5tSye8S+nHUfI51xoXIXjHQ99cQtKUkQqqO1kO/KCJUfZXuB47h/YBlDhah2H3hdUGXn8ie0oA=="], + "playwright": ["playwright@1.59.1", "", { "dependencies": { "playwright-core": "1.59.1" }, "optionalDependencies": { "fsevents": "2.3.2" }, "bin": { "playwright": "cli.js" } }, "sha512-C8oWjPR3F81yljW9o5OxcWzfh6avkVwDD2VYdwIGqTkl+OGFISgypqzfu7dOe4QNLL2aqcWBmI3PMtLIK233lw=="], + + "playwright-core": ["playwright-core@1.59.1", "", { "bin": { "playwright-core": "cli.js" } }, "sha512-HBV/RJg81z5BiiZ9yPzIiClYV/QMsDCKUyogwH9p3MCP6IYjUFu/MActgYAvK0oWyV9NlwM3GLBjADyWgydVyg=="], + "prelude-ls": ["prelude-ls@1.2.1", "", {}, "sha512-vkcDPrRZo1QZLbn5RLGPpg/WmIQ65qoWWhcGKf/b5eplkkarX0m9z8ppCat4mlOqUsWpyNuYgO3VRyrYHSzX5g=="], "prettier": ["prettier@3.8.3", "", { "bin": { "prettier": "bin/prettier.cjs" } }, "sha512-7igPTM53cGHMW8xWuVTydi2KO233VFiTNyF5hLJqpilHfmn8C8gPf+PS7dUT64YcXFbiMGZxS9pCSxL/Dxm/Jw=="], diff --git a/context/ARCHITECTURE.md b/context/ARCHITECTURE.md index 05b222d..b8816d5 100644 --- a/context/ARCHITECTURE.md +++ 
b/context/ARCHITECTURE.md @@ -2,7 +2,7 @@ > Mirrors [[webster-architecture]] in vault. Canonical source is this file for in-repo operators; vault file for cross-session memory. > -> **Submission state**: Layers 1–4 + Layer 7 shipped. Layer 5 (`site/` fork + analytics pixel + `scripts/seed-mock-history.ts`) is scoped out for submission — the mock seeder is inlined in `prompts/second-wbs-session.md` Step 1 instead of a separate script, and the redesigner emits `proposal.md` instead of `proposal.diff`. Layer 6 (video) is blocked on Richie's voice record. See `context/FEATURES.md` for per-row status. +> **Shipped state**: 9 production Managed Agents, mirrored 1:1 by 9 `webster-lp-sim-*` simulation specs. Full council loop runs end-to-end — planner → fan-out → redesigner → visual review — with critic genealogy as the runtime specialist-spawn beat. The redesigner emits `proposal.md` (PR body) rather than `proposal.diff`; a real `site/` fork that lets the council emit a one-click diff is roadmap, not pending. See `context/FEATURES.md` for the full inventory. 
## System Overview @@ -13,8 +13,12 @@ │ Claude Code Session (orchestrator — Opus 4.7) │ │ ├─ reads site/ + history/ + context/critics/*/findings.md │ │ │ │ -│ ├─ fan-out: POST /v1/sessions for each of 6 pre-registered │ -│ │ Managed Agents (parallel), then send user.message event │ +│ ├─ planner session (Opus 4.7) │ +│ │ ├─ marshals memory + verdicts + monitor anomalies │ +│ │ └─ writes plan.md with direction_hint for the week │ +│ │ │ +│ ├─ fan-out: POST /v1/sessions for 6 pre-registered Managed │ +│ │ Agents (parallel), then send user.message event │ │ │ ├─ monitor (Haiku 4.5) — detects analytics anomalies │ │ │ ├─ 5 specialist critics (Sonnet 4.6) │ │ │ │ ├─ SEO, brand-voice, FH-compliance, │ @@ -24,7 +28,9 @@ │ ├─ redesigner session (Opus 4.7) │ │ │ ├─ orchestrator gathers committed findings │ │ │ ├─ passes them as input text to redesigner session │ -│ │ └─ redesigner outputs proposal.diff + decision.json │ +│ │ └─ redesigner outputs proposal.md + decision.json │ +│ │ │ +│ ├─ visual-reviewer (Opus 4.7) — post-redesign visual audit │ │ │ │ │ ├─ Critic Genealogy (runtime creation, public beta) │ │ │ ├─ detects pattern no existing critic owns │ @@ -48,13 +54,14 @@ ### Layer 1: Routine + Orchestrator -- `routines/weekly-lp-improve.yaml` — cut from submission; weekly trigger is manual `wbs @prompts/second-wbs-session.md` -- `prompts/second-wbs-session.md` — bash-in-markdown orchestrator (replaces the planned `webster/orchestrator.ts`), reads state, fans out, runs genealogy, opens PR +- `routines/weekly-lp-improve.yaml` — cut from submission; weekly trigger is manual `/webster-weekly-council` (Claude Code skill) or `wbs @prompts/second-wbs-session.md` (single-page runbook) +- `skills/webster-weekly-council/SKILL.md` — library skill: slim index + on-demand phase references + helper scripts; the operator surface for the weekly run +- `prompts/second-wbs-session.md` — bash-in-markdown orchestrator (replaces the planned `webster/orchestrator.ts`), reads state, fans out, 
runs genealogy, opens PR; the immutable single-page runbook the skill mirrors - Shared agent skill `skills/webster-lp-audit/SKILL.md` — universal e2e flow: _read context → critique → write findings → exit_ - Per-critic context: `context/critics/{name}/findings.md` - Run artifacts: `history/YYYY-MM-DD/{analytics.json, council-output/, synthesis.md, proposal.md, decision.json}` -### Layer 2: Managed Agent Critics (7 pre-registered) +### Layer 2: Pre-registered Managed Agents (9 production, mirrored 1:1 by 9 simulation) **Environment is a separate resource** (`POST /v1/environments`), registered once per workspace and referenced by ID in every session. There is NO in-agent `environment:` or `resources:` field. @@ -65,17 +72,23 @@ Environment `environments/webster-council-env.json`: - Networking: `limited` with `allowed_hosts: [api.github.com, github.com, raw.githubusercontent.com, api.anthropic.com]`, `allow_mcp_servers: true`, `allow_package_managers: true` - No GitHub-repo mount primitive exists — the agent `git clone`s at session start via bash using a `GITHUB_TOKEN` passed in the first user.message -Agent specs (JSON, not YAML — matches `POST /v1/agents` schema): +Production specs (JSON, not YAML — matches `POST /v1/agents` schema): + +| Spec | Model | Role | +| ------------------------------------------------ | ---------- | -------------------------- | +| `agents/production/webster-planner.json` | Opus 4.7 | orchestrator (pre-fan-out) | +| `agents/production/brand-voice-critic.json` | Sonnet 4.6 | critic | +| `agents/production/conversion-critic.json` | Sonnet 4.6 | critic | +| `agents/production/copy-critic.json` | Sonnet 4.6 | critic | +| `agents/production/fh-compliance-critic.json` | Sonnet 4.6 | critic | +| `agents/production/seo-critic.json` | Sonnet 4.6 | critic | +| `agents/production/webster-visual-reviewer.json` | Opus 4.7 | critic (post-redesign) | +| `agents/production/webster-monitor.json` | Haiku 4.5 | monitor | +| 
`agents/production/webster-redesigner.json` | Opus 4.7 | redesigner | -- `agents/webster-monitor.json` — Haiku 4.5 -- `agents/brand-voice-critic.json` — Sonnet 4.6 -- `agents/fh-compliance-critic.json` — Sonnet 4.6 -- `agents/seo-critic.json` — Sonnet 4.6 -- `agents/conversion-critic.json` — Sonnet 4.6 -- `agents/copy-critic.json` — Sonnet 4.6 -- `agents/webster-redesigner.json` — Opus 4.7 +Simulation set at `agents/simulation/webster-lp-sim-*` mirrors the production roster 1:1 — same models, same role distribution, no extra surface for judges to evaluate. Sim agents are additive, never touching production. **No `callable_agents`** (research preview) on either set. -Each spec has: `name`, `model`, `system` (multi-line string with escaped \n), `tools: [{type: agent_toolset_20260401}]`, `metadata`. **No `callable_agents`** (research preview). +Each spec has: `name`, `model`, `system` (multi-line string with escaped \n), `tools: [{type: agent_toolset_20260401}]`, `metadata`. ### Layer 3: Critic Genealogy (novel mechanic) @@ -234,11 +247,7 @@ Production/sim agents should receive the same evidence order, especially prior h ### Layer 6: Meta Video -- Remotion template + 5 comps (title, council viz, TAM+10wk morph, Genealogy diagram, end-card) -- Opus-authored narration script (`video/script.md`) -- Voice: Richie's own, Sat AM record -- Final assembly in Descript or CapCut, 3-min clean cut -- End-card: commit hashes for Claude-authored assets +Submission tooling, not part of the product. The HyperFrames render pipeline that turns the per-week LP simulation assets into a timelapse video lives outside the public repo. Per-week deliverables stay committed under `demo-output/landing-page/w00..w10/` as judge evidence: desktop/mobile/tablet screenshots, heatmap JSON+SVG, synthetic analytics, visual-review verdicts. See `demo-output/landing-page/INDEX.md` for the narrated walk-through. 
### Layer 7: Polish @@ -252,7 +261,7 @@ Production/sim agents should receive the same evidence order, especially prior h 1. **Agents are registered from the orchestrator session.** `POST /v1/agents` from Claude Code (orchestrator), never from inside a Managed Agent's own loop. Both pre-registered critics AND runtime-created Genealogy critics are registered this way. 2. **Environments are separate resources.** `POST /v1/environments` once per workspace; referenced by `environment_id` in every session. 3. **No `callable_agents`.** Agent-to-agent invocation is research preview. Orchestrator fans out via parallel `/v1/sessions` calls. -4. **State lives in git.** Critics commit findings from inside their sessions. No managed memory stores (also research preview). +4. **State is hybrid.** Authoritative state lives in git — critics commit findings from inside their sessions, run artifacts land under `history/`. Six Anthropic Managed Memory Stores (registered IDs in `context/memory-stores.json`) hold cross-session priors for council, planner, redesigner, genealogy, conversion-critic, and visual-reviewer; git remains the auditable source of truth. 5. **Credentials**: orchestrator holds `ANTHROPIC_API_KEY` + `GITHUB_TOKEN`. Sessions receive `GITHUB_TOKEN` in the first user.message so they can `git clone` + push. Cloudflare creds are onboarding-only. 6. **Skill is universal.** Same markdown, Claude Code + claude.ai. 7. **Zero fabricated stats.** Mock analytics framed as POC priors. 
@@ -260,10 +269,10 @@ Production/sim agents should receive the same evidence order, especially prior h ## Dependencies - Anthropic Managed Agents API, beta header `managed-agents-2026-04-01` (public beta — verified live 2026-04-23) -- (Research preview, NOT required for public beta path: `callable_agents`, memory stores, outcomes — request at ) +- Anthropic Managed Memory Stores (public beta) — six stores per substrate, IDs at `context/memory-stores.json` +- (Research preview, NOT required for public beta path: `callable_agents`, outcomes — request at ) - Claude Code (Routines, `/v1/claude_code/routines/{id}/fire`) - Claude Design (user-facing, bundle `.zip`) - Cloudflare Workers + Static Assets + Workers Builds - GitHub (MCP + webhooks) - Astro 6 + `@astrojs/cloudflare` -- Remotion (video) diff --git a/context/DOMAIN-MODEL.md b/context/DOMAIN-MODEL.md index 737727e..42ec4c6 100644 --- a/context/DOMAIN-MODEL.md +++ b/context/DOMAIN-MODEL.md @@ -47,28 +47,29 @@ redesigner apply merged 7-day verdict planner - `rolled-back` — verdict is `hurt` at p<0.05, auto-rollback fired OR planner directed revert - `inconclusive` — verdict is `neutral` or ambiguous, baseline holds, next experiment adjusts direction -## Agent Roster (9 base + dynamic genealogy) - -| # | Agent | Model | Role | Shipped | -| --- | ----------------------------- | ---------------------- | ---------------------------------------------------------- | ----------- | -| 1 | `webster-monitor` | Haiku 4.5 | Analytics anomaly detection | Shipped L2 | -| 2 | **`webster-planner`** | **Opus 4.7** | **NEW — reads verdict, decides experiment direction** | Planned L11 | -| 3 | `seo-critic` | Sonnet 4.6 | SEO findings | Shipped L2 | -| 4 | `brand-voice-critic` | Sonnet 4.6 | Brand-voice consistency | Shipped L2 | -| 5 | `fh-compliance-critic` | Sonnet 4.6 | Functional-health medical-claims audit | Shipped L2 | -| 6 | `conversion-critic` | Sonnet 4.6 | Conversion-path + CTA audit | Shipped L2 | -| 7 | `copy-critic` | 
Sonnet 4.6 | Copy quality + voice | Shipped L2 | -| 8 | `visual-design-critic` | Sonnet 4.6 | Visual rhythm, hierarchy, imagery relevance (pre-proposal) | Shipped L2 | -| 9 | `webster-redesigner` | Opus 4.7 | Synthesizes findings + plan → proposal | Shipped L2 | -| 10 | **`webster-apply-worker`** | **Pi / Codex gpt-5.4** | **Executes proposal against Site** | Planned L8 | -| 11 | **`webster-visual-reviewer`** | **Opus 4.7** | **Browser-based post-apply verification** | Planned L9 | -| — | Genealogy critics | Sonnet 4.6 | Runtime-created when Opus detects gap | Shipped L3 | - -Planner is new (L11). Apply worker + visual-reviewer are planned (L8 / L9). Note: `visual-design-critic` (#8, shipped L2, pre-proposal audit) is a distinct agent from `webster-visual-reviewer` (#11, planned L9, post-apply verification) — different stages, different concerns. +## Agent Roster (9 production, mirrored 1:1 by 9 simulation + dynamic genealogy) + +Production set in `agents/production/`; simulation set in `agents/simulation/` is a 1:1 mirror with `webster-lp-sim-*` prefix. 
+ +| # | Agent | Model | Role | +| --- | ------------------------- | ---------- | ------------------------------------------------------------------------ | +| 1 | `webster-planner` | Opus 4.7 | Reads verdict + memory + monitor anomaly, decides experiment direction | +| 2 | `webster-monitor` | Haiku 4.5 | Analytics anomaly detection | +| 3 | `seo-critic` | Sonnet 4.6 | SEO findings | +| 4 | `brand-voice-critic` | Sonnet 4.6 | Brand-voice consistency | +| 5 | `fh-compliance-critic` | Sonnet 4.6 | Functional-health medical-claims audit | +| 6 | `conversion-critic` | Sonnet 4.6 | Conversion-path + CTA audit | +| 7 | `copy-critic` | Sonnet 4.6 | Copy quality + voice | +| 8 | `webster-redesigner` | Opus 4.7 | Synthesizes findings + plan → proposal | +| 9 | `webster-visual-reviewer` | Opus 4.7 | Post-redesign visual review (rendered-layout audit) | +| — | Apply worker | Pi / Codex | Executes proposal against site (Forge-orchestrated, not a Managed Agent) | +| — | Genealogy critics | Sonnet 4.6 | Runtime-spawned when Opus detects an unowned scope | + +`visual-design-critic` was a runtime-genealogy spawn during the W4 demo arc — not a permanent base agent. It was retired before the final symmetric 9+9 cut so production and simulation rosters mirror 1:1. 
## Managed Agent invocation pattern -All Claude Managed Agents in Webster (monitor, planner, 6 critics, redesigner, visual-reviewer) follow the same 5-step pattern, shipped today in `scripts/critic-genealogy.ts`: +All Claude Managed Agents in Webster (planner, monitor, 5 critics, redesigner, visual-reviewer = 9 production specs) follow the same 5-step pattern, shipped today in `scripts/critic-genealogy.ts`: | Step | Endpoint | Frequency | Purpose | | -------------------- | ------------------------------ | ----------------------------------------------------------------- | --------------------------------------------------------------------------------- | @@ -332,14 +333,15 @@ A spawned critic that has not emitted a CRITICAL or HIGH finding in 4 consecutiv The token-efficiency gate ("council run cost must not regress at p<0.05") catches runaway AFTER the fact. Governor prevents; gate catches. Two independent checks. -**Token math** (why this matters): +**Why governance matters**: -- 1 critic: ~10K tokens/run × 52 weeks = ~520K tokens/year -- Current 6 critics + redesigner + monitor ≈ 4M tokens/year -- Ungoverned spawning (~1 new critic/quarter, no retirement): +4 critics/year = +2M tokens/year (~50% annual run cost) -- With governor (cap 2/quarter, dedup rejects ~60%, retire-idle prunes ~30%): steady state ~10 critics max = +25% over current +Critic count drives council run cost roughly linearly — every spawned critic adds another `/v1/sessions` call per week. Without a governor, runtime spawning trends toward unbounded growth as new gaps appear. The governor's job is to keep the council bounded: -Over 3 years: governor saves roughly 10M tokens. +- Layer 2 dedup rejects scope-overlapping requests so every spawn is genuinely additive. +- Layer 3 quarterly cap prevents single-quarter spawn storms. +- Layer 4 retire-idle prunes critics that stop producing promoted findings. + +Real run costs depend on per-critic token volume, which varies by site complexity. 
The token-efficiency gate (next section) is the after-the-fact check; the governor is the before-the-fact one. **Escalation paths for blocked requests**: @@ -448,7 +450,7 @@ Decisions needed before L11 (and some L9) can be implemented: 5. **Planner overriding critics** — 🔒 **LOCKED (Richie, 2026-04-23) as Option 5C (88/100)**: planner can request a NEW critic via L3 genealogy. Plan emits `genealogy_request: { concern, rationale }`; orchestrator authors the spec via existing `scripts/critic-genealogy.ts`. Cannot silence or weight existing critics. Preserves invariant #6. Directly used in Q9 demo arc W4 (bounce-guard-critic spawn). Prior rejected options: `suppressed_findings[]` (60, silences validation), `direction_hint` only (80, no blind-spot mechanism). -- **Q5.1 Genealogy governance** — 🔒 **LOCKED (Richie, 2026-04-23) as Option 5.1C (90/100)**: four-layer governor bounding 5C's spawn mechanism. See "Genealogy governance" section below for full spec. Prevents token-waste drift over 52-week operation without rigid per-period caps. Token math: ungoverned spawning adds ~50% annual run cost over 3 years; governor C steady-state adds ~25%. +- **Q5.1 Genealogy governance** — 🔒 **LOCKED (Richie, 2026-04-23) as Option 5.1C (90/100)**: four-layer governor bounding 5C's spawn mechanism. See "Genealogy governance" section below for full spec. Prevents unbounded drift in council size over long-running operation without rigid per-period caps. - **Q6 Partial experiments (skip contract)** — 🔒 **LOCKED (Richie, 2026-04-23) as Option 6D (92/100)**: skip is terminal at the current week + feeds next-week planning as structured data. No mechanical roll-forward, no in-session retry loops. See "Skip contract" section below for full spec. Dominates prior options (roll-forward 75 creates infinite loops on systemic vetoes; retry-in-session 60 spirals; logging-only 85 doesn't answer "what next for the skipped experiment"). @@ -466,14 +468,10 @@ Answer Q5, Q6, Q7 → I implement. 
Once the full stack ships, Webster's pitch upgrades: -- **v1 (today)**: Council produces proposals. Human applies. 7 agents + critic genealogy. -- **v2 (L8)**: Council produces PRs. 7 agents + genealogy + apply worker. -- **v2.5 (L10)**: Council proposes design-level changes (CSS, components, assets), not just copy. -- **v3 (L9)**: Every PR gated by visual verification. No regressions reach human review. -- **v4 (L11)**: Council plans experiments, measures outcomes, auto-rolls-back failures, promotes winners to baseline. **Genuine autonomous improvement.** - -v4 is the hackathon pitch's honest claim. Everything below v4 is a subset. - ---- +- **v1 (shipped, this submission)**: 9 production agents — planner reads verdict + memory + monitor anomaly to set direction, council critics audit, redesigner synthesizes the proposal, visual-reviewer rendered-layout audits the result. Critic genealogy spawns specialists at runtime when an unowned scope appears. Council produces a reviewable PR body (`proposal.md`); human merges. +- **v2 (roadmap, L8)**: Council emits `proposal.diff` directly via the apply worker. +- **v2.5 (roadmap, L10)**: Council proposes design-level changes (CSS, components, assets), not just copy. +- **v3 (roadmap, L9)**: Every PR gated by visual verification. No regressions reach human review. +- **v4 (roadmap, L11)**: Council plans experiments, measures outcomes, auto-rolls-back failures, promotes winners to baseline. **Genuine autonomous improvement.** -Last updated: 2026-04-23 (session 4 Phase 7, after Richie's "pre-submission scope + planner agent + autoresearch-as-council-input" corrections). +v4 is the long-arc claim. v1 is what's live in this repo today. diff --git a/context/EXPANSION-TASKS.md b/context/EXPANSION-TASKS.md deleted file mode 100644 index 30404f4..0000000 --- a/context/EXPANSION-TASKS.md +++ /dev/null @@ -1,378 +0,0 @@ -# Webster Expansion Tasks - -> Topologically ordered. Implement in sequence. Do NOT skip T0. 
Read `context/VISION.md` before each task and re-read it before marking any task done. - -## Per-task loop - -1. Re-read the task's acceptance criteria here -2. Read the files the task touches before editing them -3. Implement minimally — no scope expansion, no drive-by refactors, no "while I'm here" -4. Write the tests listed in acceptance criteria -5. `bun run validate` must be green -6. Conventional commit (`fix:` for T0, `feat:` for expansion tasks). One task = one commit (or one small series) -7. Before marking done, re-read VISION.md's "what's locked" + the task's acceptance criteria. If anything drifted, revisit. - -## Day-by-day target - -- **Day 1**: T0, T1, T3, T4 (infrastructure + assets, parallel-friendly) -- **Day 2**: T2, T5 (agent specs + synthetic analytics) -- **Day 3**: T6, T7, T8 + first dry run -- **Day 4**: T9, T10 + diagnose/re-run if needed + handoff - ---- - -## T0 — Pass-7 review fixes - -**Status**: blocking. 4 of 5 fixes touch simulation-path code; skipping T0 risks contaminating the demo with known bugs. 
- -**Files**: - -- `scripts/apply-worker-cli.ts:142` — og_card dims 1200x630 → 1536x1024 (or closest supported) -- `scripts/apply-worker.ts:733-739` — `runtime_failure` drops from visual-veto branch, falls through to `apply-fail` -- `.husky/pre-commit:13-15` — add `chomp;` + `print "$_\0"` in perl pipeline -- `scripts/critic-genealogy.ts` — wrap `fetchSessionSnapshot` call in `main()` with try/catch; persist spec.json + snapshot-error sentinel + agent JSON on failure; exit non-zero after commitArtifacts -- Extract shared paginated `findAgentByName` helper, import from both `scripts/planner-invoke.ts` and `scripts/critic-genealogy.ts` - -**Accept**: - -- `bun run validate` green -- New/updated unit tests: `runtime_failure → apply-fail`, snapshot-fetch-fail still writes spec.json, pagination helper finds name on page 2 -- `printf 'foo.ts\0bar.md\0baz.txt\0' | perl -0ne 'chomp; print "$_\0" if /\.(ts|js|json|md|jsonc)$/;' | wc -c` returns 14 -- Conventional commits (one per fix, or one bundled `fix: apply pass 7 review items`) - ---- - -## T1 — Memory store provisioning - -**Depends on**: T0 - -Create `scripts/provision-memory-stores.ts` — idempotent provisioner that creates 12 memory stores via `POST /v1/memory_stores` (beta header `managed-agents-2026-04-01`). 
- -**Stores** (6 per substrate): - -| Store name | Writer | Readers | -| ------------------------------------- | ----------------------------- | ----------------------------- | -| `webster-council-memory-lp` | orchestrator (RW) | all LP sim agents (read_only) | -| `webster-planner-memory-lp` | planner (RW) | planner (RW) | -| `webster-redesigner-memory-lp` | redesigner (RW) | redesigner (RW) | -| `webster-genealogy-memory-lp` | orchestrator (RW) | genealogy logic (read_only) | -| `webster-conversion-critic-memory-lp` | conversion-critic (RW) | conversion-critic (RW) | -| `webster-visual-reviewer-memory-lp` | visual-reviewer (RW) | visual-reviewer (RW) | -| (same 6 names with `-site` suffix) | (parallel for site substrate) | (parallel) | - -**Output**: `context/memory-stores.json`: - -```json -{ - "lp": { - "council": "memstore_01...", - "planner": "memstore_01...", - "redesigner": "memstore_01...", - "genealogy": "memstore_01...", - "conversion-critic": "memstore_01...", - "visual-reviewer": "memstore_01..." - }, - "site": { ... } -} -``` - -**Accept**: - -- Running script twice produces identical output (idempotent by name lookup — if store with `name` already exists, reuse its ID) -- `context/memory-stores.json` contains 12 entries keyed by substrate + role -- Unit test mocks the API, verifies idempotency + error handling (network fail + partial completion resume) -- Script is safe to re-run after partial failure - ---- - -## T2 — 18 new sim-specific agent specs - -**Depends on**: T0 - -Create 18 new MCP-native agent specs. 
**Existing 9 `webster-*` agents are NOT modified.** - -**LP sim set** (9 files under `agents/`): - -- `webster-lp-sim-monitor.json` (Haiku 4.5) -- `webster-lp-sim-seo-critic.json` (Sonnet 4.6) -- `webster-lp-sim-brand-voice-critic.json` (Sonnet 4.6) -- `webster-lp-sim-fh-compliance-critic.json` (Sonnet 4.6) -- `webster-lp-sim-conversion-critic.json` (Sonnet 4.6) -- `webster-lp-sim-copy-critic.json` (Sonnet 4.6) -- `webster-lp-sim-redesigner.json` (Opus 4.7) -- `webster-lp-sim-planner.json` (Opus 4.7) -- `webster-lp-sim-visual-reviewer.json` (Opus 4.7) - -**Site sim set** (9 files under `agents/`): - -- `webster-site-sim-monitor.json` -- `webster-site-sim-seo-critic.json` -- `webster-site-sim-brand-voice-critic.json` -- `webster-site-sim-licensing-and-warranty-critic.json` (replaces fh-compliance slot, Sonnet 4.6) -- `webster-site-sim-conversion-critic.json` -- `webster-site-sim-copy-critic.json` -- `webster-site-sim-redesigner.json` -- `webster-site-sim-planner.json` -- `webster-site-sim-visual-reviewer.json` - -**System prompt differences from existing `webster-*` agents**: - -- **No WebFetch**. All site reads via `get_file_contents` (GitHub MCP) at the demo branch ref passed in user.message (e.g. `ref: demo-sim-lp/w03`) -- **No LP_TARGET URL** reference. Replace with substrate-appropriate context block -- **Context paths substrate-specific**: LP agents read `demo-landing-page/context/business.md`; site agents read `demo-sites/northwest-reno/context/business.md` -- **Site pages (site set only)**: redesigner + critics reference the 3-page structure (`/`, `/services`, `/free-estimate`) -- **licensing-and-warranty-critic**: scoped to contractor licensing number display, insurance claims, warranty terms, service-area clarity -- **Brand-voice critic**: reads `brand.json` + `business.md`, enforces voice + do_not_use - -**Registration**: via idempotent `POST /v1/agents` (by-name lookup before POST). 
Wrap in `scripts/register-sim-agents.ts` or extend existing registration script. - -**Accept**: - -- All 18 specs validate against existing JSON schema -- `scripts/register-sim-agents.ts` idempotent: re-running doesn't duplicate -- Spec schema tests cover both sets -- No reference to `LP_TARGET` or WebFetch anywhere in the 18 new specs -- Existing 9 `webster-*` agents unchanged (diff check) - ---- - -## T3 — Prefilled contexts - -**Depends on**: T0. Can run in parallel with T2. - -### 3a — Richer Health (LP) - -Directory: `demo-landing-page/context/` - -- `business.md` — copy from existing `context/business.md` (already Richer-Health-scoped) -- `personas.json` — 3 personas extracted from `.claude/skills/nicolette-richer/references/brand-bible.md`. Each persona: `{id, name, archetype, goals, anxieties, conversion_triggers, behavior_hints}`. Suggested: "credentials-conscious-executive" / "curious-self-starter" / "skeptical-researcher". -- `brand.json` — structured: `{voice, tone, palette, typography, signature_phrases, do_not_use}`. Extract from brand bible. - -### 3b — Northwest Home Renovations (site) - -Directory: `demo-sites/northwest-reno/context/` - -- `business.md` — invent from scratch. Fields: business name, owner ("Sam Reyes"), location (Pacific Northwest, non-specific town), services (kitchen / bath / deck renovation), license number (fictional, e.g. WA-CONTR-NWR-2024), warranty terms ("5-year workmanship, 10-year structural"), insurance ("$2M liability"), tone ("competent, direct, trust-heavy"). -- `personas.json` — 3 B2C homeowner personas: "first-time-homeowner-anxious" (scared of being scammed), "price-comparing-pragmatist" (getting 3 quotes), "warranty-conscious-veteran" (has been burned before). -- `brand.json` — palette (navy/white/safety-orange OR forest-green/cream/brass — pick one, document choice), typography (clear sans-serif + utility), voice (direct + trust-heavy), do_not_use (no superlatives, no "world-class", no generic "quality"). 
- -**Accept**: - -- Both contexts validate against a shared schema you define in the task (even a simple Zod schema in `scripts/context-schema.ts` is fine) -- Both brand extracts are rich enough to give the brand-voice critic concrete rules to enforce (at least 5 do_not_use items, palette with hex codes, typography with font families) -- No cross-contamination (contractor context never references Richer Health; LP context never references Northwest Reno) - ---- - -## T4 — Ugly sites - -**Depends on**: T3 (needs brand.json to know what the ideal is, so we can deliberately violate it). Can run in parallel with T2. - -### 4a — Richer Health ugly - -Directory: `demo-landing-page/ugly/` - -- `index.html` — single file, intentionally unpolished -- `style.css` — inline acceptable; keep minimal -- `README.md` — "Intentionally ugly. Do NOT improve outside simulation." - -**Characteristics** (each is something a specific critic should flag): - -- Generic stock hero image (not Nicolette's actual photo) — conversion / brand-voice / SEO ding -- Vague headline "Health & Wellness Coaching" — copy / conversion ding -- No credentials anywhere — fh-compliance / brand-voice ding -- Weak CTA "Learn More" — conversion ding -- Times New Roman everywhere — brand-voice / visual-review ding -- Center-aligned body text, no hierarchy — visual-review / copy ding -- No testimonials / social proof — conversion ding - -Reference (human-read only, not committed to repo references): existing `site/before/index.html` for layout structure. Do NOT copy — derive an intentionally-worse version. 
- -### 4b — Contractor ugly (3 pages) - -Directory: `demo-sites/northwest-reno/ugly/` - -- `index.html` (home) + `style.css` -- `services.html` -- `free-estimate.html` -- `README.md` - -**Characteristics**: - -- Home: Times New Roman, clip-art header, no photos of real work, generic phrases ("Best in the business!"), CTA is bare text link "Contact us" -- Services: a bulleted list with no descriptions, no prices, no warranties mentioned -- Free-estimate: unlabeled form inputs, no required-field markers, no phone number option, no expected-response-time -- Cross-page: inconsistent nav, no footer, no license number anywhere, no insurance mention, no before/after photos - -**Accept**: - -- Both ugly states are committed to dedicated demo branches (`demo-sim-lp/w00`, `demo-sim-site/w00`) -- No JavaScript, no external network resources (self-contained HTML/CSS) -- Both sites render when loaded in a browser (no broken markup); they're ugly, not broken -- Diff against `brand.json` shows broad violation — every persona and every brand rule has something to attack - ---- - -## T5 — Synthetic Analytics Agent - -**Depends on**: T3 - -Build `scripts/synthetic-analytics.ts` — generates per-week analytics reacting to current site state. 
- -**Inputs** (JSON file passed via CLI or stdin): - -```ts -{ - substrate: "lp" | "site", - week: number, // 0-indexed, 0 = baseline - weekDate: string, // ISO, for seasonality - sitePath: string, // absolute path to site dir for current week - contextPath: string, // absolute path to context dir - previousAnalytics?: AnalyticsJson, // week N-1, absent on week 0 - seed: string // determinism -} -``` - -**Output**: - -- `analytics.json` — schema matches existing `scripts/analytics-ingestion.ts` (`sessions`, `bounce_rate`, `avg_time_s`, `scroll_depth_{25,50,75,100}`, `cta_clicks` per CTA, `section_engagement[]`) -- `analytics-reasoning.md` — per-persona narrative of why metrics moved (3–5 sentences each) - -**Agent invocation**: - -- Uses `/v1/messages` (not Managed Agents) for simplicity — synthetic analytics is one-shot, no memory needed -- Model: Opus 4.7 (judgment-heavy) -- System prompt includes: persona distribution (5000 users × 3 personas, fixed), hard continuity (±15% per metric unless justified), seasonality hints, realistic event variance, no bias toward specific gaps - -**Accept**: - -- Golden-file test: given fixed seed + fixed week-0 HTML + personas, produces identical analytics.json on re-run -- Continuity test: given week-0 output as previousAnalytics + SAME site (unchanged), week-1 deltas stay within ±5% per metric (no change = no reason to swing) -- Continuity test: given week-0 output + MUTATED site (hero copy improved), week-1 bounce_rate drops by 5–20%, justification in reasoning.md -- Schema-compatibility test: output `analytics.json` parses cleanly via existing `analytics-ingestion.ts` normalizer - ---- - -## T6 — Sim orchestrator fork - -**Depends on**: T2, T3, T4 - -Fork `prompts/second-wbs-session.md` → `prompts/sim-council.md`. Parameterize the hardcoded values. - -**Changes from source**: - -- Header block takes env vars: `SUBSTRATE` (`lp`|`site`), `WEEK_DATE`, `BRANCH` (e.g. 
`demo-sim-lp/w03`), `AGENT_SET` (`webster-lp-sim`|`webster-site-sim`), `CONTEXT_PATH`, `SITE_PATH`, `MEMORY_STORES_JSON` -- Drop the `LP_TARGET=https://certified.richerhealth.ca` line and remove all WebFetch-based critic instructions (sim agents already read via MCP) -- Drop the 10-week mock-history seeder (Step 1) — simulation wrapper generates fresh analytics per week via T5 -- Agent IDs sourced from `context/sim-agents.json` (produced by T2's registration script), keyed by `$AGENT_SET` -- Memory-store attachment in every `POST /v1/sessions` call — attach the role-appropriate store from `$MEMORY_STORES_JSON` - -**Accept**: - -- `sim-council.md` passes shellcheck on its bash blocks -- Running with `SUBSTRATE=lp WEEK_DATE=2026-02-01 BRANCH=demo-sim-lp/w00 ... wbs @prompts/sim-council.md` produces a week-0 council run with all agents invoked via sim IDs -- Production `prompts/second-wbs-session.md` untouched (diff check) - ---- - -## T7 — Simulation wrapper - -**Depends on**: T5, T6 - -Build `scripts/run-simulation.ts` — library + CLI that loops N weeks for one substrate. - -**Flow per week**: - -1. Checkout/create demo branch `demo-sim-<substrate>/w<NN>` -2. If week 0: commit the ugly site; else use previous week's branch as base -3. Call Synthetic Analytics Agent (T5) → write `history/<substrate>/w<NN>/analytics.json` -4. Spawn `prompts/sim-council.md` with env vars for this week -5. After orchestrator completes: capture screenshots at 3 breakpoints × all pages using Playwright on the local file (no deploy needed — Playwright can open file:// URLs) -6. Write memory-store summaries via REST API (council + planner + redesigner insights) -7. 
Bundle week artifacts into `demo-output/<substrate>/week-NN/` - -**Accept**: - -- Config-driven (substrate specifier, week count, paths) — not substrate-hardcoded -- Unit test with mock council (no real API calls) runs 2-week loop end-to-end -- Screenshot capture works with Playwright headless on `demo-landing-page/ugly/index.html` (file://) -- Fixed seed → identical demo branch HEAD after N weeks - ---- - -## T8 — Per-substrate invocations - -**Depends on**: T7 - -Thin entry scripts: - -- `scripts/run-simulation-lp.ts` — calls `run-simulation.ts` with `substrate=lp` + LP paths + 10 weeks -- `scripts/run-simulation-site.ts` — calls `run-simulation.ts` with `substrate=site` + site paths + 10 weeks - -**Accept**: - -- `bun scripts/run-simulation-lp.ts` runs 10 weeks end-to-end, ~30–45 min -- `bun scripts/run-simulation-site.ts` does the same -- Output directories `demo-output/landing-page/` and `demo-output/northwest-reno/` both populated with week-00 through week-10 artifacts -- Memory Stores Console shows 12 entries populated - ---- - -## T9 — Demo manifest + final sheets - -**Depends on**: T8 - -Build `scripts/build-demo-manifest.ts` — aggregates simulation output. - -**Per-substrate outputs**: - -- `demo-output/<substrate>/demo-manifest.json` — machine-parseable index of all weeks, screenshots, council artifacts, genealogy events, memory-store references -- `demo-output/<substrate>/final-sheet.png` — side-by-side week-0 vs week-10 desktop hero shot (ffmpeg or ImageMagick) - -**Accept**: - -- Manifest validates against a schema you define -- Final sheet is visually compelling (real improvement visible) -- Manifest includes absolute paths the downstream video-composition session can feed to Remotion - ---- - -## T10 — End-to-end dry run + handoff - -**Depends on**: T7, T8, T9 - -Run both simulations. Inspect outputs. Decide. 
- -**Accept — all must be true before handoff to video composition**: - -- Both `demo-output/` substrates contain full 10-week progressions -- Screenshots visually coherent (no blank pages, no JS errors, layouts render at all 3 breakpoints) -- Memory Stores Console shows 12 stores with content (open one, verify it contains meaningful summaries) -- Genealogy log shows what happened (a spawn, or a diagnosed-then-fixed non-spawn, or an explicit "no spawn in 10 weeks" with investigation notes) -- If no spawn and no budget to re-run: accept outcome, update VISION.md risk section with the finding, proceed to video composition with improvement-only narrative - -**Handoff deliverable** (for a fresh Claude Code session to compose video): - -- `demo-output/<substrate>/demo-manifest.json` × 2 -- `demo-output/<substrate>/final-sheet.png` × 2 -- Memory-Stores-Console screenshots (captured manually by Richie) -- Nicolette clip (recorded separately by Richie) -- Onboarding skill recording (recorded separately by Richie) -- Brand bible content for copy/narration reference - ---- - -## Validation checkpoints - -Before moving to the next task, verify: - -1. `bun run validate` green -2. Committed (conventional commit message) -3. Re-read VISION.md's "what's locked" section — did you drift? -4. Flag anything unexpected with `[STUCK]` prefix before continuing - -## When genuinely stuck - -- Re-read VISION.md. The vision is the real contract. -- Surface the block to Richie. Don't produce composed-looking workarounds. -- Visible struggle > invisible corner-cutting. diff --git a/context/FEATURES.md b/context/FEATURES.md index fb769dd..3ce689f 100644 --- a/context/FEATURES.md +++ b/context/FEATURES.md @@ -1,24 +1,15 @@ # Features -> Canonical task list. Operators mark status transitions here as they work. +> Per-row inventory of what shipped for the hackathon submission. Each row reflects the final state of `dev` (= the submission cut). 
`cut` rows were pre-committed cuts with rationale inline; everything else is shipped. ## Status legend -- `todo` — not started -- `in-progress` — claimed by an operator - `done` — shipped, validated, merged -- `blocked` — waiting on external or upstream -- `cut` — pre-committed cut per `webster-open-loops` rules +- `cut` — out of submission scope; rationale inline -## Current submission state (2026-04-23) +## Final state -- **Done**: 49 (incl. #5 live run artifacts, #38 site/ fork shipped session 4, #39b runtime gate, #39c critic rerun gate, #39d PR emission plan, #39e CF preview wiring, #40a visual asset schema, #40b image backend, #40c asset persistence, #40d apply integration, #41a visual reviewer spec, #41b browser-audit skill, #41c proposal-intent verifier, #41d visual-review integration, #42 analytics ingestion, #43 baselines schema, #44 verdict engine, #45 rollback worker, #46 baseline promoter, #47 proposal schema v2, #48 multi-kind routing, #49 constraint verifier, #50 planner agent spec, #51 memory substrate, #54 cold-start planner mode, #56 skip contract shipped) -- **In-progress**: 0 -- **Blocked**: 5 (demo video — Richie voice) -- **Cut**: 7 (out of submission scope; rationale inline) -- **Todo**: 7 (1 submission form; all remaining implementation rows shipped or non-implementation blocked/cut) - -Hero feature (Critic Genealogy) shipped with live Opus 4.7 validation. All 7 Managed Agents registered. Council fan-out + redesigner + PR automation scripted in `prompts/second-wbs-session.md`. CI green, 29 tests pass. Two scope reassignments below (critic-flow skill renamed; orchestrator moved from TS to bash-in-markdown prompt) — both ship equivalent functionality. +Hero feature (Critic Genealogy) shipped with live Opus 4.7 validation. **9 production Managed Agents** registered against the live API and mirrored 1:1 by **9 simulation specs** in `agents/simulation/`. 
Council fan-out + redesigner + visual review + PR automation scripted in `prompts/second-wbs-session.md` (single-page runbook) and exposed as the `/webster-weekly-council` library skill at `skills/webster-weekly-council/SKILL.md`. CI green; **29 test files** under `scripts/__tests__/` exercise the full pipeline. The 11-week LP timelapse evidence is committed under `demo-output/landing-page/w00..w10/` and narrated in `demo-output/landing-page/INDEX.md`. ## Stream allocation @@ -28,22 +19,22 @@ See `AGENTS.md` for stream → operator mapping. ## Layer 1: Routine + Orchestrator (Stream 1 — Claude Code Opus 4.7) -| # | Status | Feature | Hours | -| --- | ------ | --------------------------------------------------------------------------------------------------------------------------------------------------------------- | ----- | -| 1 | cut | `routines/weekly-lp-improve.yaml` — Claude Code Routine with weekly cron. Submission uses manual `wbs @prompts/...` | 2 | -| 2 | done | Orchestrator — shipped as `prompts/second-wbs-session.md` (bash-in-markdown, not `.ts`). Functionally equivalent | 4 | -| 3 | done | Shared critic skill — shipped as `skills/webster-lp-audit/SKILL.md` (renamed from `critic-flow`) | 2 | -| 4 | done | Per-critic context pattern: `context/critics/{name}/findings.md` (5 critics + monitor seeded) | 1 | -| 5 | done | Run-artifact pattern: `history/YYYY-MM-DD/` — live `history/2026-04-23/` artifacts include analytics, proposal, decision, operator decision, and genealogy logs | 2 | -| 6 | done | Branch + PR automation via `gh pr create` — wired in Step 6 of `second-wbs-session.md` | 2 | +| # | Status | Feature | Hours | +| --- | ------ | ------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------- | ----- | +| 1 | cut | `routines/weekly-lp-improve.yaml` — Claude Code Routine with weekly cron. 
Submission uses manual `wbs @prompts/...` | 2 | +| 2 | done | Orchestrator — shipped as `prompts/second-wbs-session.md` (bash-in-markdown, not `.ts`) + `/webster-weekly-council` library skill (SKILL.md index + 9 phase references + 2 helper scripts). Functionally equivalent | 4 | +| 3 | done | Shared critic skill — shipped as `skills/webster-lp-audit/SKILL.md` (renamed from `critic-flow`) | 2 | +| 4 | done | Per-critic context pattern: `context/critics/{name}/findings.md` (5 critics + monitor seeded) | 1 | +| 5 | done | Run-artifact pattern: `history/YYYY-MM-DD/` — live `history/2026-04-23/` artifacts include analytics, proposal, decision, operator decision, and genealogy logs | 2 | +| 6 | done | Branch + PR automation via `gh pr create` — wired in Step 6 of `second-wbs-session.md` | 2 | ## Layer 2: Managed Agent Critics (Stream 2 — Codex heartbeat) | # | Status | Feature | Hours | | --- | ------ | -------------------------------------------------------------------------------------------------------------------- | ----- | -| 7 | done | `agents/webster-monitor.json` (Haiku 4.5) — analytics anomaly detection | 1 | +| 7 | done | `agents/production/webster-monitor.json` (Haiku 4.5) — analytics anomaly detection | 1 | | 8 | done | 5 specialist critic specs: seo, brand-voice, fh-compliance, conversion, copy (all Sonnet 4.6) — all schema-valid | 4 | -| 9 | done | `agents/webster-redesigner.json` (Opus 4.7) — synthesis + proposal generation | 1 | +| 9 | done | `agents/production/webster-redesigner.json` (Opus 4.7) — synthesis + proposal generation | 1 | | 10 | done | GitHub MCP integration — URL-based, vault-bound (`vault_ids`), no tokens in `user.message` | 3 | | 11 | done | Environment config — `environments/webster-council-env.json` + `.id` registered | 2 | | 12 | done | Parallel fan-out via orchestrator → 6 parallel `/v1/sessions` calls (not `callable_agents`; that's research-preview) | 2 | @@ -72,32 +63,28 @@ See `AGENTS.md` for stream → operator mapping. 
## Layer 5: Substrate + Mock History (Stream 5 — Claude Code) +| # | Status | Feature | Hours | +| --- | ------ | ---------------------------------------------------------------------------------------------------------------------------------------------------------- | ----- | +| 25 | cut | Fork `certified.richerhealth.ca` Astro source → `site/` — honest scope note in README. Redesigner emits `proposal.md` brief | 2 | +| 26 | cut | Analytics pixel → Cloudflare Worker → KV — not needed; mock seeder + monitor feed the council | 3 | +| 27 | done | 10-week mock history seeder — inlined in `prompts/second-wbs-session.md` Step 1 (idempotent, ~2 min) | 4 | +| 28 | cut | Silent secondary substrates (original cut) — **superseded by L11 #58**: Pair Alpha (SaaS + local service) brought in-submission per Q7 (session-4 Phase 7) | 2 | + +## Layer 6: Meta Video (submission tooling, not in this repo) + | # | Status | Feature | Hours | | --- | ------ | ------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------- | ----- | -| 25 | cut | Fork `certified.richerhealth.ca` Astro source → `site/` — honest scope note in README. Redesigner emits `proposal.md` brief | 2 | -| 26 | cut | Analytics pixel → Cloudflare Worker → KV — not needed; mock seeder + monitor feed the council | 3 | -| 27 | done | 10-week mock history seeder — inlined in `prompts/second-wbs-session.md` Step 1 (idempotent, ~2 min) | 4 | -| 27a | done | Local LP synthetic heatmap v2 — `analytics.json` schema unchanged; `heatmap.json` now includes neutral `layout_metrics` for document height, section reach, and CTA reach. See `context/ARCHITECTURE.md` for exact shape. 
| 1 | -| 28 | cut | Silent secondary substrates (original cut) — **superseded by L11 #58**: Pair Alpha (SaaS + local service) brought in-submission per Q7 (session-4 Phase 7) | 2 | - -## Layer 6: Meta Video (Stream 4 — Claude Code or Forge) - -| # | Status | Feature | Hours | -| --- | ------- | -------------------------------------------------------------------------------------- | ----- | -| 29 | blocked | Remotion setup + composition template — pending Richie voice record window | 3 | -| 30 | blocked | 5 animated comps: title, council viz, TAM + 10-week morph, genealogy diagram, end-card | 6 | -| 31 | blocked | Opus-authored narration script `video/script.md` | 1 | -| 32 | blocked | Voice record (Richie) — blocker for the whole video layer | 2 | -| 33 | blocked | Final assembly in Descript or CapCut (3-min clean cut) | 3 | +| 29 | done | Render pipeline shipped via HyperFrames composition. The pipeline itself is submission tooling and lives outside the public repo. Rendered video is the visual artifact of the per-week assets in row #30. | — | +| 30 | done | Per-week timelapse artifacts committed under `demo-output/landing-page/w00..w10/`: desktop/mobile/tablet screenshots, heatmap JSON+SVG, synthetic analytics, visual-review verdicts. These are the judge-facing evidence. 
| — | ## Layer 7: Polish (Sat-Sun) -| # | Status | Feature | Hours | -| --- | ------ | --------------------------------------------------------------------------------- | ----- | -| 34 | done | README — submission narrative shipped `0ed6e98` + advisor fixes `d8e76a4` | 2 | -| 35 | done | CI green on main — type + lint + format + schema + findings + markdown + 29 tests | 1 | -| 36 | done | MIT LICENSE — shipped in `0ed6e98` | 1 | -| 37 | todo | Cerebral Valley submission form — Richie action at submission time | 1 | +| # | Status | Feature | Hours | +| --- | ------ | ------------------------------------------------------------------------------------------------------------------- | ----- | +| 34 | done | README — submission narrative shipped `0ed6e98` + advisor fixes `d8e76a4` | 2 | +| 35 | done | CI green on main — type + lint + format + schema + findings + markdown + 29 test files green via `bun run validate` | 1 | +| 36 | done | MIT LICENSE — shipped in `0ed6e98` | 1 | +| 37 | done | Cerebral Valley submission form — submitted at hackathon deadline | 1 | ## Layer 8: Apply worker + image generation (pre-submission per session-4 Phase 7) @@ -133,7 +120,7 @@ Key design calls (locked session 4 Phase 7, see ADR-0002): | # | Status | Feature | Hours | | --- | ------ | ----------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------- | ----- | -| 41a | done | **Visual-reviewer agent spec** — `agents/webster-visual-reviewer.json` (Opus 4.7 tier). Inputs: preview URL, `history//proposal.md`, BEFORE URL. 
Outputs: `history//visual-review.md` with findings + embedded screenshot refs. | 1 | +| 41a | done | **Visual-reviewer agent spec** — `agents/production/webster-visual-reviewer.json` (Opus 4.7 tier). Inputs: preview URL, `history//proposal.md`, BEFORE URL. Outputs: `history//visual-review.md` with findings + embedded screenshot refs. | 1 | | 41b | done | **Browser-audit skill** — `skills/webster-browser-audit/SKILL.md` wraps `scripts/browser-audit.ts` for Playwright-headless when available, with fallback artifacts when unavailable. Capabilities: 3-breakpoint screenshot (375/768/1440), accessibility-tree text extraction, interaction recording, and console log capture. | 3 | | 41c | done | **Proposal-intent verifier** — `scripts/proposal-intent-verifier.ts` reads each issue in `proposal.md` and verifies visible phrase presence in rendered accessibility text (not source grep). Catches content drops like session-4 "No more patient churn" regression; layout overflow is covered by browser-audit summaries. | 2 | | 41d | done | **#39 integration pattern** — apply worker now runs the visual-reviewer gate after #39c and before #39d PR emission. It retries up to 3 iterations, records `visual_review` in `apply-log.json`, and forces draft/partial PR metadata on CRITICAL visual regressions. | 1 | @@ -161,17 +148,17 @@ Added session 4 Phase 7. Autoresearch is **input to the next council run**, not Session 4 Phase 7 locked 9 architectural questions (Q1–Q9) — all resolved in `context/DOMAIN-MODEL.md`. Key locks: Q1 Managed Agent + orchestrator-owned memory (ADR-0001), Q2 explore-broadly cold-start + unified `history/memory.jsonl`, Q3 autonomous p<0.01 rollback, Q4 reward+gates 7-outcome matrix (ADR-0002), Q5 planner-requests-new-critic via L3 genealogy (additive-only), Q5.1 four-layer genealogy governance, Q6 skip-is-terminal + structured skip rows, Q7 Pair Alpha (SaaS + local service) substrate pair, Q8 per-experiment baselines + commit trailers (ADR-0002), Q9 4-week demo arc. 
-| # | Status | Feature | Hours | -| --- | ------ | -------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------- | ----- | -| 50 | done | **`agents/webster-planner.json`** (Opus 4.7 Managed Agent, Q1 ADR-0001) — registered via `POST /v1/agents`, invoked per-run via `/v1/sessions` + events + poll (pattern verified in `scripts/critic-genealogy.ts:440-556`). Reads marshaled memory context, outputs `plan.md` with `{classification, next_action, direction_hint, new_critic_request?, rationale}`. `next_action ∈ {promote_and_experiment, hold_baseline, revert_and_retry, explore_broadly}`. _Landed on `forge/task-feat-planner-agent-spec-v5` — PR #3 merged (2026-04-24)._ | 2 | -| 51 | done | **Memory substrate schema + append helper** (Q2) — `history/memory.jsonl` event log: `{ts, week, actor, event, refs{}, insight}` where `event ∈ {promote, rollback, skip, regression, gap-detected, verdict-ready}`. Append-only. Helper in orchestrator never touched by agents — orchestrator owns all I/O (ADR-0001). 
_Landed on `forge/task-feat-memory-substrate` — all 4 stories (MemoryEvent types, appendEvent, tailN+filter, unit tests)._ | 2 | -| 52 | done | **Orchestrator memory marshaling + planner invocation** (Q1, Q2) — new step in `prompts/second-wbs-session.md`: before critics, orchestrator reads `memory.jsonl` tail (last N events) + last 2 weeks' `verdict.json` + `monitor` anomaly report, concatenates to planner's user-message text (step 3 of the 5-step Managed Agent flow), polls until idle, extracts output, writes `history//plan.md`, appends one `verdict-ready` event row to `memory.jsonl`. _Landed on `forge/task-feat-orch-memory-planner-v2` — PR #6 merged (2026-04-24)._ | 3 | -| 53 | done | **Plan → council integration (additive-only)** (Q5) — critics + monitor + redesigner now receive `plan.md` body in initial `user.message` context with explicit additive-only/sovereignty language. Planner `new_critic_request` is extracted to `tmp/planner-new-critic-request-.json` and passed into `scripts/critic-genealogy.ts --planner-request` as additive evidence, without bypassing dedup/cap/evidence gates. | 3 | -| 54 | done | **Cold-start explore-broadly mode** (Q2) — planner context now emits `direction_hint="broad exploration, baseline-only analytics"` when memory/verdict/monitor inputs are empty, and `appendColdStartOriginEvent()` writes the origin event row. | 2 | -| 55 | done | **Genealogy governance layers 2–4** (Q5.1) — layer 1 is prompt-only (rubric in planner + redesigner instructions: "request only if existing critics cannot cover the concern"). Layer 2: orchestrator-side dedup — reject new-critic spec if ≥60% scope overlap with existing critic (embedding cosine). Layer 3: quarterly cap — max 3 new critics / 13 weeks, soft-override by operator. Layer 4: retire-on-idle — critic with 0 findings-promoted in 8 weeks is archived. 
_Landed on `forge/task-feat-genealogy-gov-v1` — PR #8 merged (2026-04-24)._ | 3 | -| 56 | done | **Skip-contract plumbing** (Q6) — apply-worker, critic-rerun gate, and visual-review gate now emit canonical structured skip rows to `history//skips.jsonl` and append skip events to `history/memory.jsonl` with reasons `{apply-fail, critic-veto, visual-veto}`. Skip is terminal and feeds next-week planner. | 2 | -| 57 | done | **`scripts/seed-demo-arc.ts`** (Q9) — 4-week primary-substrate mock: 9 experiments + 1 genealogy spawn in W4. Hits 6/7 Q4 outcome lanes (fast-track, fallback, gate-win, archive-gate-fail, auto-rollback, hold). Idempotent; writes to `history/demo-arc/` without touching live history. _Landed on `forge/task-feat-seed-demo-arc-w3w4-v5` — all 4 stories (W1/W2/W3/W4 + genealogy) done, PR #5 merged (2026-04-24)._ | 3 | -| 58 | done | **`scripts/seed-secondary-substrates.ts`** (Q7) — Pair Alpha mock: SaaS (B2B) + local service (B2C) synthetic single-file HTMLs + 2-cycle mock runs each (onboard + 2 weeks of experiments). Proves generalization beyond the primary substrate. Demo-safe (no e-commerce — private hold-out per operator decision). 
_Landed on `forge/task-feat-seed-pair-alpha-v1` — PR #7 merged (2026-04-24)._ | 3 | +| # | Status | Feature | Hours | +| --- | ------ | ----------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------- | ----- | +| 50 | done | **`agents/production/webster-planner.json`** (Opus 4.7 Managed Agent, Q1 ADR-0001) — registered via `POST /v1/agents`, invoked per-run via `/v1/sessions` + events + poll (pattern verified in `scripts/critic-genealogy.ts:440-556`). Reads marshaled memory context, outputs `plan.md` with `{classification, next_action, direction_hint, new_critic_request?, rationale}`. `next_action ∈ {promote_and_experiment, hold_baseline, revert_and_retry, explore_broadly}`. _Landed on `forge/task-feat-planner-agent-spec-v5` — PR #3 merged (2026-04-24)._ | 2 | +| 51 | done | **Memory substrate schema + append helper** (Q2) — `history/memory.jsonl` event log: `{ts, week, actor, event, refs{}, insight}` where `event ∈ {promote, rollback, skip, regression, gap-detected, verdict-ready}`. Append-only. Helper in orchestrator never touched by agents — orchestrator owns all I/O (ADR-0001). 
_Landed on `forge/task-feat-memory-substrate` — all 4 stories (MemoryEvent types, appendEvent, tailN+filter, unit tests)._ | 2 | +| 52 | done | **Orchestrator memory marshaling + planner invocation** (Q1, Q2) — new step in `prompts/second-wbs-session.md`: before critics, orchestrator reads `memory.jsonl` tail (last N events) + last 2 weeks' `verdict.json` + `monitor` anomaly report, concatenates to planner's user-message text (step 3 of the 5-step Managed Agent flow), polls until idle, extracts output, writes `history//plan.md`, appends one `verdict-ready` event row to `memory.jsonl`. _Landed on `forge/task-feat-orch-memory-planner-v2` — PR #6 merged (2026-04-24)._ | 3 | +| 53 | done | **Plan → council integration (additive-only)** (Q5) — critics + monitor + redesigner now receive `plan.md` body in initial `user.message` context with explicit additive-only/sovereignty language. Planner `new_critic_request` is extracted to `tmp/planner-new-critic-request-.json` and passed into `scripts/critic-genealogy.ts --planner-request` as additive evidence, without bypassing dedup/cap/evidence gates. | 3 | +| 54 | done | **Cold-start explore-broadly mode** (Q2) — planner context now emits `direction_hint="broad exploration, baseline-only analytics"` when memory/verdict/monitor inputs are empty, and `appendColdStartOriginEvent()` writes the origin event row. | 2 | +| 55 | done | **Genealogy governance layers 2–4** (Q5.1) — layer 1 is prompt-only (rubric in planner + redesigner instructions: "request only if existing critics cannot cover the concern"). Layer 2: orchestrator-side dedup — reject new-critic spec if ≥60% scope overlap with existing critic (embedding cosine). Layer 3: quarterly cap — max 3 new critics / 13 weeks, soft-override by operator. Layer 4: retire-on-idle — critic with 0 findings-promoted in 8 weeks is archived. 
_Landed on `forge/task-feat-genealogy-gov-v1` — PR #8 merged (2026-04-24)._ | 3 | +| 56 | done | **Skip-contract plumbing** (Q6) — apply-worker, critic-rerun gate, and visual-review gate now emit canonical structured skip rows to `history//skips.jsonl` and append skip events to `history/memory.jsonl` with reasons `{apply-fail, critic-veto, visual-veto}`. Skip is terminal and feeds next-week planner. | 2 | +| 57 | done | **`scripts/seed-demo-arc.ts`** (Q9) — 4-week primary-substrate mock: 9 experiments + 1 genealogy spawn in W4. Hits 6/7 Q4 outcome lanes (fast-track, fallback, gate-win, archive-gate-fail, auto-rollback, hold). Idempotent; writes to `history/demo-arc/` without touching live history. _Landed on `forge/task-feat-seed-demo-arc-w3w4-v5` — all 4 stories (W1/W2/W3/W4 + genealogy) done, PR #5 merged (2026-04-24)._ | 3 | +| 58 | done | **`scripts/seed-secondary-substrates.ts`** (Q7) — Pair Alpha mock: SaaS (B2B) + local service (B2C) synthetic single-file HTMLs + 2-cycle mock runs each (onboard + 2 weeks of experiments). Proves generalization beyond the primary substrate. Demo-safe (no e-commerce — private hold-out per operator decision). _Landed on `forge/task-feat-seed-pair-alpha-v1` — PR #7 merged (2026-04-24)._ | 3 | ## Totals (historical — initial plan) @@ -188,7 +175,7 @@ Session 4 Phase 7 locked 9 architectural questions (Q1–Q9) — all resolved in ## Cut rationale (for judges / auditors) -Four families cut, all with the same rationale: **the council composition does not depend on them**. The hero claim is the 7-agent fan-out + runtime critic genealogy, not the distribution surface. +Four families cut, all with the same rationale: **the council composition does not depend on them**. The hero claim is the 9-agent council + runtime critic genealogy, not the distribution surface. - **`routines/` cron wiring (#1)**: weekly trigger is operator-manual for this submission. Cron is a wrapper, not the system. 
- **Site fork + analytics Worker (#25, #26)**: the redesigner emits `proposal.md` instead of `proposal.diff`. Mock analytics seeder feeds the monitor — same inputs, no live pixel needed. diff --git a/context/QUALITY-GATES.md b/context/QUALITY-GATES.md index 520a3ff..43ecaef 100644 --- a/context/QUALITY-GATES.md +++ b/context/QUALITY-GATES.md @@ -76,7 +76,7 @@ Every gate is BLOCKING. No soft warnings. No `--no-verify`. - Current: schema happy-path + known-bad-spec rejection tests - Future critical paths: orchestrator, Critic Genealogy registration, skill Q&A flow -## Husky pre-commit (lax until 2026-04-26 submission) +## Husky pre-commit `.husky/pre-commit` runs: @@ -85,9 +85,7 @@ bun run validate:agents # prettier --check on staged .ts/.js/.json/.md/.jsonc only ``` -Lax by design — blocks on the bugs that cost API credits (agent spec drift, `system_prompt`-class typos) without blocking routine commits during the hackathon crunch on formatting nits. Full `bun run validate` still runs in CI on every push/PR. - -Tighten to the full chain after hackathon submission. +Deliberately narrow — blocks on the bugs that cost API credits (agent spec drift, `system_prompt`-class typos) without blocking routine commits on formatting nits. Full `bun run validate` still runs in CI on every push/PR. ## CI Pipeline diff --git a/context/ROADMAP.md b/context/ROADMAP.md deleted file mode 100644 index 8ba1823..0000000 --- a/context/ROADMAP.md +++ /dev/null @@ -1,192 +0,0 @@ -# Webster Roadmap — the map - -> Single source of truth for "where am I, what's next, what did I sign up for." -> Read top-to-bottom when lost. Regenerate from `context/FEATURES.md` if it drifts. - -## The one-paragraph map - -Webster is a **Council of Claude Managed Agents** that autonomously redesigns a small-business landing page, week after week, with **Opus 4.7 spawning new critics at runtime** when it spots patterns existing critics miss. The novel mechanic is **Critic Genealogy** — agents creating agents. 
The hackathon submission for Anthropic × Cerebral Valley "Built with Opus 4.7" is due **Sunday April 26 2026, 8PM EST** (~70h from now). Target prize lanes: Managed Agents $5K (62-72/100) + Creative Exploration $5K (48-58/100) + Grand $50K (18-25/100). - -## North-star invariant - -**Validate before human approval.** Every change passes the full validation stack — static critics → runtime gate → visual reviewer → autoresearch verdict — before it reaches a PR in Richie's inbox. Human is the last ratchet, not the first debugger. If a feature doesn't connect to this principle, it's out of scope. - -## Where we are right now (2026-04-23) - -- **Branch**: `main`, 5 commits ahead of `origin/main` (push-blocked by permission policy — Richie's action) -- **Submission runway**: ~70 hours to deadline -- **Shipped**: Layers 1–4 + 7 — 24 features in ~12 hours of focus-work -- **In-progress**: 1 (Layer 1 live-artifact pattern) -- **Blocked**: Layer 6 video (5 features, Richie voice record) -- **Open loops**: 3 deadline-critical + 4 scope-expansion layers (see below) -- **Full submission scope** (updated): Layers 1–11 — everything ships before 4/26 - -## Timeline correction (my estimation bias) - -My earlier estimates assumed calendar-hours. Actual tempo: **24 features in 12 hours** with Forge + Pi workers dispatched in parallel. The remaining 4 layers (L8 + L9 + L10 + L11) total ~55 focus-hours of feature work, which at your tempo compresses to ~18–25h of your wall time. Fits in 70h runway with room for the video, form, and a voice-surgery polish pass. 
- -Operating assumption from here forward: **every feature you name is in pre-submission scope unless you explicitly say otherwise.** - -## Layer-by-layer truth - -| Layer | Theme | Status | Features | -| ----- | ------------------------------------------ | -------------------------------------- | ---------------------------------------- | -| L1 | Routine + Orchestrator | shipped | #2–6 done; #1 cut; #5 in-progress | -| L2 | 7 Managed Agent Critics | shipped | #7–#12 done | -| L3 | **Critic Genealogy (HERO)** | shipped, live-validated | #13–#17 done | -| L4 | Onboarding Skill | shipped | #18, #19, #23, #24 done; #20–#22 cut | -| L5 | Substrate + Mock History | core shipped | #27 done; #25, #26, #28 cut | -| L6 | Meta Video | blocked | #29–#33 waiting on voice record | -| L7 | Polish | mostly shipped | #34–#36 done; #37 todo (Richie action) | -| L8 | **v2: Apply worker, text-only** | planned — ships FIRST | #38 done; #39a–e, #40a–d todo | -| L10 | **v2.5: Designer scope expansion** | planned — ships SECOND | #47–#49 todo (kind+constraints+verifier) | -| L9 | **v3: Visual review + Autoresearch** | planned — ships THIRD (0bb9db2) | #41a–d, #42–#46 todo | -| L11 | **v4: Planner + experiment-aware council** | planned — ships LAST (closes the loop) | #50–#53 todo (NEW this session) | - -## What's new THIS session (session 4) - -- `61cfae4` — `site/before/` + `site/after/` forked from live `certified.richerhealth.ca`; 5-issue proposal applied by hand to `after/` -- `475e129` — `context/v2-design.md` grill-me answers; Layer 8 decomposed into #39a-e + #40a-d -- `a1cb0e5` — advisor-caught regression fix: "No more patient churn" restored in Issue 4 hero -- `0bb9db2` — Layer 9 added (9 sub-features: visual-reviewer chain + autoresearch chain) + 6 hero screenshots as motivating evidence -- `f34858d` — `context/ROADMAP.md` — single source of truth for roadmap + narrative -- **PENDING (this phase)**: `context/DOMAIN-MODEL.md` (NEW) + Layer 10 + Layer 11 added to FEATURES.md - 
-All commits local-only. Push permission policy blocks direct push to main; Richie-action item. - -## Architectural shift locked this session - -Autoresearch is NOT a back-end post-merge feedback loop. It's the **input** to the next council run. A **planning agent** sits before the 5 critics + redesigner, reads last week's verdict + what-changed, decides experiment direction for this week, then the council runs with `plan.md` as context. - -This is the shift from **autonomous weekly redesigner** → **autonomous experiment agent**. See `context/DOMAIN-MODEL.md` for the formal model + week lifecycle + grill-me questions on the 7 remaining architectural decisions. - -## Deadline-critical loops for 4/26 - -**Human-only actions** (no Claude can do these): - -1. **Cerebral Valley submission form** (#37; ~15 min; Richie-only) -2. **Demo video voice record** (Layer 6 blocker; ~1h record + Saturday assembly) -3. **Push local commits to origin/main** (1-min terminal action, required before submission) - -**Scope-expansion layers that ALSO ship pre-submission** (Claude+worker parallelizable): - -- **Layer 8** — apply worker text-only (~18h feature work) -- **Layer 10** — designer scope expansion (~7h feature work) -- **Layer 9** — visual review + autoresearch measurement (~18h feature work) -- **Layer 11** — planner agent + experiment-aware council (~12h feature work, NEW) - -Ordering: human-only actions 1–3 can happen anytime. The four scope-expansion layers ship in dependency order (L8 → L10 → L9 → L11). Grill-me on DOMAIN-MODEL.md open questions unblocks L11 implementation. - -## Pre-submission roadmap — L8 → L10 → L9 → L11 (dependency order) - -All four layers exist to make Webster **genuinely autonomous**, not just autonomously-change-producing. Build order matters: - -### L8 (v2) — Apply worker, text-only | ~18h total - -**Why it exists**: today the council emits `proposal.md`. No code changes. L8 turns proposal into PR diffs. 
Text-level only — council says "change X to Y", apply runs find-replace, runs lint/type/format, emits a PR. - -| # | Feature | Hours | -| ------ | ------------------------------------------------------------------------- | ----- | -| #38 | site/ fork — DONE session 4 | ✅ | -| #39a | Apply worker core (Pi worker via Forge, worktree-isolated) | 4–6 | -| #39b | Runtime validation gate (Playwright: CTAs resolve, no JS errors) | 2–3 | -| #39c | Critic re-run gate (0 new CRITICAL, ≤2 new HIGH; 3-iter fix loop) | 2 | -| #39d | Per-cluster PR emission (1–3 issues/PR, max 3 PRs/week) | 3 | -| #39e | CF Pages preview URL wiring | 1–2 | -| #40a–d | Image-gen tool (tool schema, backend, brand persistence, #39 integration) | 7 | - -**Testable when**: `wbs @prompts/fifth-wbs-session.md` produces a PR with real code diffs, not just `proposal.md`. - -### L10 (v2.5) — Designer scope expansion | ~7h total - -**Why it exists**: session-4 proved text-only proposals aren't enough. Longer copy needs smaller font-size to keep hero rhythm. Without L10, the council is a **copy-editor council**, not a **design council**. L10 lets the designer propose CSS/layout/component changes as first-class issues. - -| # | Feature | Hours | -| --- | ------------------------------------------------------------------------------------- | ----- | -| #47 | Proposal schema v2 (kind-aware: text/css/component/asset + constraints block) | 2 | -| #48 | Apply worker multi-kind routing (tool per kind) | 3 | -| #49 | Visual-reviewer constraint verifier (asserts declared constraints in rendered output) | 2 | - -**Testable when**: council proposes "shorter subhead + 0.75× hero font-size + 3-line desktop H1 constraint" as ONE atomic issue; apply worker executes all three together; visual-reviewer confirms constraint met. - -### L9 (v3) — Visual review + Autoresearch | ~18h total - -**Why it exists**: L8 and L10 ship changes. L9 **verifies they work**. 
Two halves: - -**Visual reviewer** (runs immediately post-apply, pre-PR): - -| # | Feature | Hours | -| ---- | --------------------------------------------------------------------------------------------------- | ----- | -| #41a | `agents/webster-visual-reviewer.json` spec (Opus 4.7) | 1 | -| #41b | `skills/webster-browser-audit/SKILL.md` (Playwright screenshot + a11y tree + interaction recording) | 3 | -| #41c | Proposal-intent verifier (content presence + overflow detection) | 2 | -| #41d | #39 integration (3-iteration fix-hint loop back to apply worker) | 1 | - -**Autoresearch** (runs post-merge, week+ cycles): - -| # | Feature | Hours | -| --- | ------------------------------------------------------------------------------------- | ----- | -| #42 | Analytics ingestion (CF Worker pixel → D1 or PostHog/GA4 webhook) | 3 | -| #43 | Baseline tracker + change log | 2 | -| #44 | Verdict engine (proxy-first fast signal + CVR slow confirm; asymmetric rollback gate) | 3 | -| #45 | Auto-rollback worker (git revert → CF preview → draft PR for override) | 2 | -| #46 | Baseline promoter (2-week sustained improvement → new baseline) | 1 | - -**Testable when**: visual-reviewer blocks a known-bad session-4-style regression; autoresearch rolls back a week that hurts proxy metrics; baseline promoter advances after 2 good weeks. - -### L11 (v4) — Planner + experiment-aware council | ~12h total | NEW - -**Why it exists**: L9 measures last week's experiment. But measurement without decision is half a loop. L11 adds a **planning agent** that sits BEFORE the 5 critics + redesigner, reads last week's verdict + what-changed, decides direction for this week (promote / hold / rollback), and feeds `plan.md` as context to the council run. Closes the autonomy loop — Webster becomes an **experiment agent**, not a weekly redesigner. 
- -| # | Feature | Hours | -| --- | ----------------------------------------------------------------------------------------------------------------- | ----- | -| #50 | `agents/webster-planner.json` (Opus 4.7) — reads verdict + what-changed, decides next-experiment direction | 2 | -| #51 | Verdict → plan pipeline — orchestrator invokes planner with verdict.json + proposal.md + apply-log + monitor data | 3 | -| #52 | Plan → council integration — critics + redesigner read plan.md as input context | 3 | -| #53 | Cold-start behavior — week 1 with no prev verdict; planner outputs "explore broadly" default plan | 2 | - -**Testable when**: week N+1 council run reads week N verdict automatically; planner outputs plan.md before critics spawn; critics + redesigner have plan.md in context; end-to-end cycle (propose → apply → review → merge → measure → verdict → plan → propose) runs in simulator without human touch between measure and plan. - -**Grill-me questions blocking L11**: 7 open decisions listed in `context/DOMAIN-MODEL.md`. Richie answers → implementation unblocks. - -## Decisions waiting on you - -Ranked by blast radius: - -1. **Push path for 5 local commits** — direct push to main, OR PR branch? (blocks submission) -2. **Cerebral Valley submission form** (#37) — Richie-only 15-min task -3. **Voice record scheduling** — Sat AM? blocks Layer 6 video (~3h cleanup after) -4. **L11 grill-me answers** — 7 open questions in `context/DOMAIN-MODEL.md` unblock planner implementation -5. **Session-4 hero voice-surgery** — revert copy to BEFORE wording (85/100), or trim line 3 (75/100), or keep as cautionary-tale artifact (45/100)? My pick: option 1 after v2 apply worker lands, as the first-ever apply-worker PR demo -6. **`[R-confirm]` in `context/v2-design.md`** (3 items): visual-regression cost threshold, `gpt-image-1` as image backend default, PR `summary.json` alongside markdown - -## Three things to hold in your head - -Everything else is noise until these land: - -1. 
**Submit by 4/26** — form + video + push origin (human-only actions) -2. **Answer L11 grill-me** — 7 questions in `context/DOMAIN-MODEL.md` unblock the planner agent and the full-cycle autonomous claim -3. **Kick off L8 #39a** — apply worker core. First real PR with mutated code. Everything downstream (L10 → L9 → L11) layers on top - -The rest exists. Those three are the **bottleneck path**. Full submission scope is now all 11 layers — ~18–25h wall time at your tempo, fits in 70h runway. - -## How this doc relates to the rest - -- `context/FEATURES.md` — canonical per-row status. This doc quotes it; FEATURES.md is authoritative for "what's `todo` vs `done`." -- `context/ARCHITECTURE.md` — system diagram + layer breakdown. This doc is the narrative overlay. -- `context/v2-design.md` — grill-me answers + rationale for Layer 8 decomposition. This doc summarizes; v2-design.md is the detailed record. -- `~/Vault/Projects/webster/Webster.md` — cross-session hub + pitch. -- `~/Vault/Projects/webster/webster-open-loops.md` — action queue (vault-scoped, broader than this doc's 3 open loops). - -## How to use this doc - -- **Lost** → read top to bottom in 5 min -- **Before a session** → skim "what's new this session" + "three things to hold in your head" -- **After a decision** → update "decisions waiting on you" or ask me to -- **On a pull request** → cross-check "Layer-by-layer truth" table - -This doc is the truth-source for roadmap questions. If `context/FEATURES.md` contradicts this about per-row state, FEATURES.md wins. If anything contradicts this about layer-narrative or ordering, this wins. - ---- - -Last regenerated: 2026-04-23 (session 4 Phase 5, after Layer 9 commit + Layer 10 proposal). 
diff --git a/context/SITE-FORK-CHECKLIST.md b/context/SITE-FORK-CHECKLIST.md deleted file mode 100644 index 964bb08..0000000 --- a/context/SITE-FORK-CHECKLIST.md +++ /dev/null @@ -1,94 +0,0 @@ -# site/ Fork Checklist - -Run this the moment `site/` is forked from `certified.richerhealth.ca`. Everything here is a one-shot onboarding for the submitted code's own quality gates. Root-level webster gates (JSON schemas, findings validator, markdownlint) already run against the repo and will continue to; this page covers what to add _inside_ `site/`. - -## Build surface - -- `site/package.json` exists with Astro scripts -- `site/bun.lock` committed -- `bun install --frozen-lockfile` in `site/` succeeds on CI -- `bun run build` in `site/` succeeds (will flip on the `site-build` job in `.github/workflows/test.yml`) - -## site/ toolchain to install - -```bash -cd site -bun add -D @astrojs/check astro-eslint-parser eslint-plugin-astro prettier-plugin-astro -``` - -## site/eslint.config.js - -```js -import js from "@eslint/js"; -import tseslint from "typescript-eslint"; -import astro from "eslint-plugin-astro"; - -export default tseslint.config( - { ignores: ["dist", ".astro", "node_modules"] }, - js.configs.recommended, - ...tseslint.configs.strict, - ...astro.configs.recommended, -); -``` - -## site/.prettierrc (inherits from root) - -```json -{ "plugins": ["prettier-plugin-astro"] } -``` - -## site/package.json scripts - -```json -{ - "scripts": { - "dev": "astro dev", - "build": "astro check && astro build", - "preview": "astro preview", - "lint": "eslint . --cache --max-warnings 0", - "format:check": "prettier --check .", - "type-check": "astro check" - } -} -``` - -## Root workflow flips - -Once `site/package.json` exists, the `site-build` job in `.github/workflows/test.yml` starts running. 
Additions to make at the same time: - -- Add `site-lint` job running `bun run lint --max-warnings 0` in `site/` -- Add `site-format` job running `bun run format:check` in `site/` -- Remove `continue-on-error` from any remaining site-build steps once it's stable - -## Pre-commit hook bump - -When `site/` lands, append to `.husky/pre-commit`: - -```sh -if [ -d site ]; then - (cd site && bun run lint --max-warnings 0 && bun run format:check) || exit 1 -fi -``` - -## Playwright (Day 5 polish, optional) - -If time holds after core fan-out + redesigner works: - -```bash -cd site -bun add -D @playwright/test -bunx playwright install chromium -``` - -One smoke test confirming the redesigned LP renders and the Acuity booking CTA is present at `site/tests/hero.spec.ts`. Run in CI matrix against Cloudflare preview URLs. - -## Do NOT add preemptively - -These buy nothing until `site/` exists, and installing them now balloons the root `node_modules`: - -- `astro` / `@astrojs/cloudflare` -- `eslint-plugin-astro` / `astro-eslint-parser` -- `prettier-plugin-astro` -- `@playwright/test` - -They go in `site/package.json` when `site/` lands. diff --git a/context/VISION.md b/context/VISION.md index ba6b676..facfa4a 100644 --- a/context/VISION.md +++ b/context/VISION.md @@ -23,18 +23,16 @@ The submission is a video. It tells one story in one flow: 1. **Problem** — clip of Nicolette describing her manual A/B testing pain 2. **Solution intro** — Richie voiceover explaining Webster's council, genealogy, and memory, overlaid on an animated UI of the council operating 3. **Landing-page timelapse** — deliberately-ugly version of Nicolette's site, evolving across 10 simulated weeks into something polished. Full-page screenshots at 3 breakpoints. 1 council veto shown mid-arc as a "rejected this week" beat for authenticity. -4. **Full-site timelapse** — fictional contractor "Northwest Home Renovations" 3-page site doing the same -5. 
**Genealogy reveal** — Anthropic Memory Stores Console screenshots showing the 12 council memory stores filling over time, with the moment a NEW critic spawns organically -6. **Close** +4. **Genealogy reveal** — Anthropic Memory Stores Console screenshots showing the 6 council memory stores filling over time, with the moment a NEW critic spawns organically +5. **Close** The timelapse IS the story. The council explanation sits on either side. -## Two substrates (not "multi-site support") +## Single substrate - **`demo-landing-page/ugly/`** — Richer Health (1 page, real brand, ugly starting state) -- **`demo-sites/northwest-reno/ugly/`** — Northwest Home Renovations (3 pages: `/`, `/services`, `/free-estimate`; fictional brand; ugly starting state) -The point is not "Webster handles multi-site." The point is **Webster's council judgment generalizes across domains**. Two substrates is enough to prove that. A third is out of scope. +The submission focuses on a single substrate so the timelapse and genealogy beats are crisp. A second substrate is explicitly out of scope. ## The ugly-brand decoupling — read this twice @@ -69,7 +67,8 @@ The video's genealogy beat dramatizes whatever happened, not what we wished. ## Memory architecture — hybrid - **`history/memory.jsonl`** remains ground truth. Deterministic, inspectable, the substrate the planner and verdict engine already depend on. -- **Anthropic Managed Memory Stores** (public beta, `managed-agents-2026-04-01`) are populated in parallel as demo artifacts. **12 stores total, 6 per substrate**: council, planner, redesigner, genealogy, conversion-critic, visual-reviewer. Orchestrator writes summaries after each week. Planner + redesigner + genealogy attach their stores at session creation and read during work. +- **Anthropic Managed Memory Stores** (public beta, `managed-agents-2026-04-01`) are populated in parallel as demo artifacts. 
**6 stores total** for the LP substrate: council, planner, redesigner, genealogy, conversion-critic, visual-reviewer. Orchestrator writes summaries after each week. Planner + redesigner + genealogy attach their stores at session creation and read during work. +- **Store attachment is intentionally asymmetric.** Planner, redesigner, genealogy, conversion-critic, and visual-reviewer get role-specific stores because they make durable decisions or own a long-running critique lane. Monitor, SEO, brand-voice, copy, and fh-compliance critics read the shared council store so the demo stays within the 6-store visual story instead of exploding into per-critic storage. The simulation works without memory stores. Memory stores make the showcase real. @@ -77,18 +76,17 @@ The simulation works without memory stores. Memory stores make the showcase real ## Two agent sets — additive, never touching production -Existing 9 agents (`webster-monitor`, `webster-{seo,brand-voice,fh-compliance,conversion,copy}-critic`, `webster-redesigner`, `webster-planner`, `webster-visual-reviewer`) are **UNCHANGED**. They run Nicolette's real weekly production council — WebFetch-based, `LP_TARGET=certified.richerhealth.ca` — and stay that way. +The 9 specs in `agents/production/` (`webster-monitor`, `webster-{seo,brand-voice,fh-compliance,conversion,copy}-critic`, `webster-redesigner`, `webster-planner`, `webster-visual-reviewer`) are **UNCHANGED**. They run Nicolette's real weekly production council — WebFetch-based, `LP_TARGET=certified.richerhealth.ca` — and stay that way. -**18 new sim agents** are added: +**9 sim agents** are added in `agents/simulation/`: -- `webster-lp-sim-*` (9) — scoped to Richer Health simulation, **MCP-native** (read site via `get_file_contents` from demo branch, no WebFetch) -- `webster-site-sim-*` (9) — scoped to Northwest Home Renovations, MCP-native. 
Fifth critic is `licensing-and-warranty-critic` replacing `fh-compliance-critic` +- `webster-lp-sim-*` (9) — scoped to Richer Health simulation, **MCP-native** (read site via `get_file_contents` from demo branch, no WebFetch). Mirrors the production set 1:1. Registration is idempotent (by name). Production flow untouched. ## State flow — everything in git -Per-week mutations commit to dedicated demo branches (`demo-sim-lp/w` + `demo-sim-site/w`). Sim agents read via GitHub MCP at branch-ref. No localhost, no external deploys, no preview URLs, no WebFetch. Fixed seed + fixed week dates make every run reproducible. +Per-week mutations commit to dedicated demo branches (`demo-sim-lp/w`). Sim agents read via GitHub MCP at branch-ref. No localhost, no external deploys, no preview URLs, no WebFetch. Fixed seed + fixed week dates make every run reproducible. ## Orchestrator — fork, don't rewrite @@ -115,35 +113,33 @@ The plan as drafted is tight but achievable if we follow the cuts. Drift and we ## API cost note -18 new agents × 20 simulated weeks × ~9 sessions per week ≈ 200 sessions. Plus synthetic analytics agent × 20. Estimate: **$150–$500** end-to-end depending on token volume. Consider kicking off sim runs on wall-clock days when Max-sub quota is available rather than burning API credits directly. +9 sim agents × 10 simulated weeks × ~9 sessions per week ≈ 90 sessions. Plus synthetic analytics agent × 10. Estimate: **$80–$250** end-to-end depending on token volume. Consider kicking off sim runs on wall-clock days when Max-sub quota is available rather than burning API credits directly. 
## What's locked -- Architecture, substrates, scripts, personas, metrics schema -- Memory design (hybrid file + 12 managed stores) +- Architecture, single LP substrate, scripts, personas, metrics schema +- Memory design (hybrid file + 6 managed stores) - Genealogy approach (pure organic, 1-day re-run budget) - Ugly-brand principle (decoupled) - State flow (GitHub MCP, demo branches, fixed-seed determinism) -- Scope boundary (two substrates, nothing more) -- Existing 9 agents untouched; 18 new sim agents additive +- Scope boundary (one substrate, nothing more) +- Production set untouched; 9 LP-sim agents in `agents/simulation/` mirror it 1:1 -## What's deferred +## Roadmap (shipped product is v1; these are the next claims) -- Nicolette video clip recording -- Video runtime target -- Final video composition (separate Claude Code session + Forge Remotion after assets exist) -- Onboarding skill recording -- Council UI animation for solution-explainer +- v2 — apply worker turns the redesigner's `proposal.md` into a real `proposal.diff` so the council emits a one-click PR diff, not a PR body. +- v2.5 — proposal kinds widen beyond text into CSS, components, and assets; the council becomes a design council, not a copy-editor council. +- v3 — every PR gated by post-apply visual verification before reaching human review. +- v4 — council plans experiments, measures outcomes, auto-rolls-back failures, promotes winners to baseline. Genuine autonomous improvement. ## Out of scope — do not build - Sitewide coordination (shared nav / header / footer cohesion across pages). Emergent through repeated single-page passes only. 
-- Third substrate +- Second substrate - Live analytics pipeline (synthetic only) - Production deploy (demo branches are terminal) - Multi-critic consensus rework (existing verdict engine handles it) -- Cross-substrate memory sharing (strict isolation per substrate) -- Modifications to the existing 9 production agents or `prompts/second-wbs-session.md` +- Modifications to the production agents in `agents/production/` or `prompts/second-wbs-session.md` ## Demo risk register diff --git a/context/memory-stores.json b/context/memory-stores.json new file mode 100644 index 0000000..08f3fd7 --- /dev/null +++ b/context/memory-stores.json @@ -0,0 +1,10 @@ +{ + "lp": { + "council": "memstore_019FRQvVBANSM5AQZCxu5xmP", + "planner": "memstore_01Ubp9VSNBugJGe47ZHMTQ2R", + "redesigner": "memstore_014SzCGWZ2f9Ufs9jQBXMDQv", + "genealogy": "memstore_01Uk2u9dt593HdWk6LdZsWzQ", + "conversion-critic": "memstore_011cjbxGwxSwjQthTQtrzqAg", + "visual-reviewer": "memstore_01GRnTJzPSGQoUpYtSwy1mTt" + } +} diff --git a/context/sim-agents.json b/context/sim-agents.json new file mode 100644 index 0000000..bea37c6 --- /dev/null +++ b/context/sim-agents.json @@ -0,0 +1,13 @@ +{ + "webster-lp-sim": { + "brand-voice-critic": "agent_011CaPk5Kn5kMtZjwL3bK4mw", + "conversion-critic": "agent_011CaPk5MHNSAVB63iR3kxve", + "copy-critic": "agent_011CaPk5NxKcTwqSH7AqwFSC", + "fh-compliance-critic": "agent_011CaPk5R3ape9fFZJzCFEDw", + "monitor": "agent_011CaPk5SqUyan2eUEXeSJtL", + "planner": "agent_011CaPk5UWSTWgvXqZnC3jMZ", + "redesigner": "agent_011CaPk5WQYT46qS6wT13a82", + "seo-critic": "agent_011CaPk5YGAQtG2xfWkehWr1", + "visual-reviewer": "agent_011CaPk5aeHZMxTgTwCB3DSp" + } +} diff --git a/context/v2-design.md b/context/v2-design.md deleted file mode 100644 index 6b8d46e..0000000 --- a/context/v2-design.md +++ /dev/null @@ -1,312 +0,0 @@ -# Webster v2 — Apply + Review/Fix Loop (design doc) - -> Captured during session 4 Phase 2 grill-me on 2026-04-23. 
-> Feature entries: `FEATURES.md` #39 (apply worker), #40 (image-gen). -> -> Context: Richie executed session 4 in auto-mode. Below reflects the -> best-available decisions derived from the prompt's rec baseline + the -> live finding from today's wget mirror (critic blind-spot re -> `data-calendly-base` runtime rewrite) + advisor sanity-check. -> Items marked **[R-confirm]** would benefit from an explicit Richie pass -> before implementation kicks off. - ---- - -## Q1 — Apply worker runtime - -**Decision (80/100):** Pi worker (Codex gpt-5.4) invoked via a Forge -workflow, worktree-isolated per apply run. - -**Rationale (building blocks → connections → behaviour):** - -- Pi is already Webster's worker-pool standard. Adding another runtime - class for one new worker is cost the system doesn't need. -- Codex (gpt-5.4, high reasoning) handles structured code mutation - reliably — find-string / replace-with-tree transforms, JSON-LD - insertion, Astro-component patching. Today's session proved this kind - of work is doable by a reasoning model, not a stretch. -- Forge gives the two things the apply step needs that raw `claude -p` - doesn't: (a) isolated git worktree per run, so one bad apply doesn't - clobber another's branch; (b) a validation stage with lint+type+format - baked in. -- The alternative of a Claude Opus managed-agent session is overkill — - Opus' reasoning surplus isn't the bottleneck in find-replace on 5 - issues; the bottleneck is confining the transform. Codex + validation - harness wins on $/task by an order of magnitude. - -**Non-option:** Claude Opus managed-agent. `claude -p` subscription -without Forge isolation. - -**Open:** none significant. - ---- - -## Q2 — Done-definition - -**Decision (72/100):** Three-part gate, all required to pass before the -apply worker opens a PR: - -1. **Static floor** — lint + type + format green on the mutated source - (mirrors existing Webster validate pipeline). -2. 
**Runtime check** — headless browser opens the mutated page and - verifies CTAs resolve to real booking URLs, `