diff --git a/docs/evals/weekly/2026-06-09.md b/docs/evals/weekly/2026-06-09.md new file mode 100644 index 0000000..571b385 --- /dev/null +++ b/docs/evals/weekly/2026-06-09.md @@ -0,0 +1,94 @@ +# ahd eval · swiss-editorial · 2026-06-09T06:14:00.070Z + +```yaml ahd-replay +schema_version: 1 +kind: eval-live +ahd_version: 0.11.0 +ahd_commit: d641dceb6870c4e7845375eb0d0df6694ff454d1 +git_dirty: true +node_version: v20.20.2 +platform: linux-x64 +invoked_at: 2026-06-09T05:48:53.232Z +token: + path: /home/runner/work/ahd/ahd/tokens/swiss-editorial.yml + hash: sha256:380a3d833d94 +brief: + path: briefs/landing.yml + hash: sha256:8b7d42759643 +sampling: + n: 30 + temperature: null + seed: null +models: + - id: "@cf/google/gemma-4-26b-a4b-it" + provider: cloudflare-workers-ai + provider_request_ids: 60 captured + - id: "@cf/meta/llama-4-scout-17b-16e-instruct" + provider: cloudflare-workers-ai + provider_request_ids: 60 captured + - id: "@cf/mistralai/mistral-small-3.1-24b-instruct" + provider: cloudflare-workers-ai + provider_request_ids: 60 captured + - id: "@cf/openai/gpt-oss-120b" + provider: cloudflare-workers-ai + provider_request_ids: 60 captured + - id: "@cf/qwen/qwen3-30b-a3b-fp8" + provider: cloudflare-workers-ai + provider_request_ids: 60 captured +conditions: + requested: [raw, compiled] + effective: [raw, compiled] +``` + +Replay this run: + +```sh +git checkout d641dceb6870 +npm ci && npm run build +/opt/hostedtoolcache/node/20.20.2/x64/bin/node /home/runner/work/ahd/ahd/bin/ahd.js eval-live swiss-editorial --brief briefs/landing.yml --models cf:@cf/google/gemma-4-26b-a4b-it,cf:@cf/meta/llama-4-scout-17b-16e-instruct,cf:@cf/mistralai/mistral-small-3.1-24b-instruct,cf:@cf/openai/gpt-oss-120b,cf:@cf/qwen/qwen3-30b-a3b-fp8 --n 30 --sample-concurrency 6 --out evals --report docs/evals/weekly/2026-06-09.md +``` + +## Run + +- Brief: `briefs/landing.yml` +- Samples per cell: **30** +- Max tokens: 12000 +- Models: + - `@cf/google/gemma-4-26b-a4b-it` (cloudflare-workers-ai) · 
spec `cf:@cf/google/gemma-4-26b-a4b-it` + - `@cf/meta/llama-4-scout-17b-16e-instruct` (cloudflare-workers-ai) · spec `cf:@cf/meta/llama-4-scout-17b-16e-instruct` + - `@cf/mistralai/mistral-small-3.1-24b-instruct` (cloudflare-workers-ai) · spec `cf:@cf/mistralai/mistral-small-3.1-24b-instruct` + - `@cf/openai/gpt-oss-120b` (cloudflare-workers-ai) · spec `cf:@cf/openai/gpt-oss-120b` + - `@cf/qwen/qwen3-30b-a3b-fp8` (cloudflare-workers-ai) · spec `cf:@cf/qwen/qwen3-30b-a3b-fp8` + +## Per-model slop reduction + +| model | raw attempted → scored | compiled attempted → scored | raw mean tells | compiled mean tells | Δ | reduction | +|---|---:|---:|---:|---:|---:|---:| +| `@cf/google/gemma-4-26b-a4b-it` | 30 → 30 | 30 → 30 | 2.67 | 1.23 | 1.43 | 53.8% | +| `@cf/meta/llama-4-scout-17b-16e-instruct` | 30 → 30 | 30 → 30 | 2.00 | 2.00 | 0.00 | 0.0% | +| `@cf/mistralai/mistral-small-3.1-24b-instruct` | 30 → 30 | 30 → 30 | 3.40 | 1.07 | 2.33 | 68.6% | +| `@cf/openai/gpt-oss-120b` | 30 → 30 | 30 → 30 | 3.17 | 0.87 | 2.30 | 72.6% | +| `@cf/qwen/qwen3-30b-a3b-fp8` | 30 → 30 | 30 → 30 | 1.93 | 2.00 | -0.07 | -3.4% | + +## Per-tell frequency (scored samples only) + +| tell | @cf/google/gemma-4-26b-a4b-it/raw | @cf/google/gemma-4-26b-a4b-it/compiled | @cf/meta/llama-4-scout-17b-16e-instruct/raw | @cf/meta/llama-4-scout-17b-16e-instruct/compiled | @cf/mistralai/mistral-small-3.1-24b-instruct/raw | @cf/mistralai/mistral-small-3.1-24b-instruct/compiled | @cf/openai/gpt-oss-120b/raw | @cf/openai/gpt-oss-120b/compiled | @cf/qwen/qwen3-30b-a3b-fp8/raw | @cf/qwen/qwen3-30b-a3b-fp8/compiled | +|---|---:|---:|---:|---:|---:|---:|---:|---:|---:|---:| +| ahd/body-measure | 0% | 0% | 0% | 0% | 0% | 0% | 0% | 0% | 0% | 3% | +| ahd/line-height-per-size | 87% | 0% | 0% | 100% | 53% | 20% | 100% | 0% | 80% | 57% | +| ahd/no-default-grotesque | 0% | 0% | 0% | 0% | 0% | 0% | 0% | 3% | 0% | 0% | +| ahd/no-em-dashes-in-prose | 0% | 0% | 0% | 0% | 0% | 0% | 7% | 0% | 0% | 0% | +| ahd/no-flat-dark-mode | 
7% | 0% | 0% | 0% | 0% | 0% | 0% | 0% | 10% | 0% | +| ahd/radius-hierarchy | 50% | 10% | 0% | 100% | 100% | 0% | 77% | 7% | 23% | 43% | +| ahd/require-named-grid | 0% | 0% | 100% | 0% | 100% | 47% | 30% | 0% | 0% | 0% | +| ahd/require-type-pairing | 23% | 0% | 100% | 0% | 87% | 0% | 53% | 0% | 70% | 0% | +| ahd/tracking-per-size | 0% | 27% | 0% | 0% | 0% | 30% | 0% | 3% | 0% | 0% | +| ahd/weight-variety | 100% | 87% | 0% | 0% | 0% | 10% | 50% | 73% | 10% | 97% | + +## Caveats +- Scoring runs the deterministic AHD linter (38 source-level rules) over every sample that passes a basic HTML sanity check. +- Counts reported per cell: attempted (runs initiated) / errored (API / runtime errors) / extractionFailed (response contained no usable HTML) / scored (linted). A large gap between attempted and scored is a signal that the model is struggling with the instruction, not that it passed the taxonomy. +- Raw condition: the brief is expanded as plain prose (intent + audience + surfaces + mustInclude + mustAvoid) with no AHD system prompt, no style token, no forbidden list. Compiled condition: same brief plus the AHD-compiled system prompt. The only thing that differs between conditions is the AHD intervention. +- Vision-only tells (14 rules in the critic) are not scored in this pipeline; run the critic on rendered screenshots for full taxonomy coverage. +- Tells-per-page is a proxy metric: a thin page has little surface for rules to fire against. Read the Δ alongside the actual rendered HTML, not in isolation. +- Model versions change. See the run manifest for exact canonical model ids. 
\ No newline at end of file diff --git a/docs/evals/weekly/2026-06-09.replay.json b/docs/evals/weekly/2026-06-09.replay.json new file mode 100644 index 0000000..66757ae --- /dev/null +++ b/docs/evals/weekly/2026-06-09.replay.json @@ -0,0 +1,383 @@ +{ + "schema_version": 1, + "kind": "eval-live", + "ahd_version": "0.11.0", + "ahd_commit": "d641dceb6870c4e7845375eb0d0df6694ff454d1", + "git_dirty": true, + "node_version": "v20.20.2", + "platform": "linux-x64", + "invoked_at": "2026-06-09T05:48:53.232Z", + "argv": [ + "/opt/hostedtoolcache/node/20.20.2/x64/bin/node", + "/home/runner/work/ahd/ahd/bin/ahd.js", + "eval-live", + "swiss-editorial", + "--brief", + "briefs/landing.yml", + "--models", + "cf:@cf/google/gemma-4-26b-a4b-it,cf:@cf/meta/llama-4-scout-17b-16e-instruct,cf:@cf/mistralai/mistral-small-3.1-24b-instruct,cf:@cf/openai/gpt-oss-120b,cf:@cf/qwen/qwen3-30b-a3b-fp8", + "--n", + "30", + "--sample-concurrency", + "6", + "--out", + "evals", + "--report", + "docs/evals/weekly/2026-06-09.md" + ], + "token": { + "path": "/home/runner/work/ahd/ahd/tokens/swiss-editorial.yml", + "hash": "sha256:380a3d833d9463dbc681df7465993ab6413d8e77188f55f97359f60dc4b746b1" + }, + "brief": { + "path": "briefs/landing.yml", + "hash": "sha256:8b7d4275964399a91e6ddec525151ba672ed9c4721279a37ec16ab3450493a4c" + }, + "sampling": { + "n": 30, + "temperature": null, + "seed": null + }, + "models": [ + { + "id": "@cf/google/gemma-4-26b-a4b-it", + "provider": "cloudflare-workers-ai", + "provider_request_ids": [ + "a08dd9915cd83457-ORD", + "a08dd9914d846689-ORD", + "a08dd9915a945e0c-ORD", + "a08dd9916b21618a-ORD", + "a08dd9915bccdc1f-ORD", + "a08dd9916c344227-ORD", + "a08dda8e7f9f3457-ORD", + "a08ddac5c9ba5e0c-ORD", + "a08dda931c8f6689-ORD", + "a08ddb7cfa393457-ORD", + "a08ddae5ef91618a-ORD", + "a08ddba0ed785e0c-ORD", + "a08ddaec9be64227-ORD", + "a08ddbbc0be26689-ORD", + "a08ddaeaad95dc1f-ORD", + "a08ddc1a3d233457-ORD", + "a08ddca5b8466689-ORD", + "a08ddc3d8adc5e0c-ORD", + 
"a08ddc83e9684227-ORD", + "a08ddc265c1f618a-ORD", + "a08ddd096a5c3457-ORD", + "a08ddcf4eb0adc1f-ORD", + "a08ddd148d846689-ORD", + "a08ddd3a1aa94227-ORD", + "a08ddd171a585e0c-ORD", + "a08ddd814faa618a-ORD", + "a08dde484c344227-ORD", + "a08dddebdd32dc1f-ORD", + "a08ddddaec6b3457-ORD", + "a08dde34c9776689-ORD", + "a08ddfcb1aef2145-ORD", + "a08ddfcb0c036689-ORD", + "a08ddfcb1c2bdc1f-ORD", + "a08ddfcb1845bc23-ORD", + "a08ddfcb1bac4227-ORD", + "a08ddfcb1b73d300-ORD", + "a08de0afb84b2145-ORD", + "a08de10d0f3abc23-ORD", + "a08de17a7cdd4227-ORD", + "a08de0fbaaf16689-ORD", + "a08de1dade282145-ORD", + "a08de1b70a88d300-ORD", + "a08de1070ac3dc1f-ORD", + "a08de28c1de9bc23-ORD", + "a08de2dbbe3b4227-ORD", + "a08de2fd6c956689-ORD", + "a08de3533cde2145-ORD", + "a08de3edfa32dc1f-ORD", + "a08de3d35dccd300-ORD", + "a08de459bdab4227-ORD", + "a08de42c3a4bbc23-ORD", + "a08de4b1fc4f6689-ORD", + "a08de4e4ba032145-ORD", + "a08de5091edcd300-ORD", + "a08de58cae80bc23-ORD", + "a08de4e4f872dc1f-ORD", + "a08de5f529e16689-ORD", + "a08de66ade33d300-ORD", + "a08de64bc8552145-ORD", + "a08de582595b4227-ORD" + ] + }, + { + "id": "@cf/meta/llama-4-scout-17b-16e-instruct", + "provider": "cloudflare-workers-ai", + "provider_request_ids": [ + "a08de7f5fde64138-ORD", + "a08de7f5f8b57e6b-ORD", + "a08de7f5ec464227-ORD", + "a08de7f5f9c66689-ORD", + "a08de7f5f9256e89-ORD", + "a08de7f5fa81843b-ORD", + "a08de8553dc84138-ORD", + "a08de8561c9c7e6b-ORD", + "a08de8580f584227-ORD", + "a08de858fc0e6e89-ORD", + "a08de8585e1c6689-ORD", + "a08de8598fc3843b-ORD", + "a08de8bb29b74227-ORD", + "a08de8b80c0e4138-ORD", + "a08de8ba69947e6b-ORD", + "a08de8bbcb606e89-ORD", + "a08de8bc195d843b-ORD", + "a08de8bbeb2e6689-ORD", + "a08de915f9ed4227-ORD", + "a08de91758de7e6b-ORD", + "a08de918eae6843b-ORD", + "a08de91728444138-ORD", + "a08de91898436e89-ORD", + "a08de918ec876689-ORD", + "a08de9725de34227-ORD", + "a08de973be437e6b-ORD", + "a08de9754b82843b-ORD", + "a08de9799f874138-ORD", + "a08de97b192c6e89-ORD", + "a08de97bcd316689-ORD", 
+ "a08de9da5f127e6b-ORD", + "a08de9da5ee46689-ORD", + "a08de9da5b51843b-ORD", + "a08de9da59894138-ORD", + "a08de9da5b3d6e89-ORD", + "a08de9da5fbb4227-ORD", + "a08dea4b09696689-ORD", + "a08dea43adf57e6b-ORD", + "a08dea4bcafb4138-ORD", + "a08dea4bbf1d843b-ORD", + "a08dea729f546e89-ORD", + "a08dea75deb24227-ORD", + "a08dead9ef797e6b-ORD", + "a08deae81e07843b-ORD", + "a08deac0390a6689-ORD", + "a08deae4da374138-ORD", + "a08deb07fe084227-ORD", + "a08deb036daf6e89-ORD", + "a08deb509d957e6b-ORD", + "a08deb548fce843b-ORD", + "a08deb5eed996689-ORD", + "a08deb874c684227-ORD", + "a08deb737feb4138-ORD", + "a08deb976a196e89-ORD", + "a08debba3c867e6b-ORD", + "a08debcb5cfa6689-ORD", + "a08debbec951843b-ORD", + "a08debfbab674227-ORD", + "a08dec0aed634138-ORD", + "a08dec3209b96e89-ORD" + ] + }, + { + "id": "@cf/mistralai/mistral-small-3.1-24b-instruct", + "provider": "cloudflare-workers-ai", + "provider_request_ids": [ + "a08decc91ff86e89-ORD", + "a08decc92f766689-ORD", + "a08decc93dacc071-ORD", + "a08decc938f0b6f8-ORD", + "a08decc92a447e6b-ORD", + "a08decc93b07dc1f-ORD", + "a08ded27da156689-ORD", + "a08ded21ae306e89-ORD", + "a08ded2c59e1c071-ORD", + "a08ded37bf6c7e6b-ORD", + "a08ded313e4bb6f8-ORD", + "a08ded451af0dc1f-ORD", + "a08ded7b89546689-ORD", + "a08ded840fc36e89-ORD", + "a08deda2cc25b6f8-ORD", + "a08ded97780cc071-ORD", + "a08deda08d6c7e6b-ORD", + "a08dedc27d92dc1f-ORD", + "a08dedd93b9f6689-ORD", + "a08dedffda02b6f8-ORD", + "a08dedfc7c5a6e89-ORD", + "a08dee0ddcd9c071-ORD", + "a08dee0e58d57e6b-ORD", + "a08dee3b1bee6689-ORD", + "a08dee33585bdc1f-ORD", + "a08dee4db875b6f8-ORD", + "a08dee63a99c6e89-ORD", + "a08dee7bad077e6b-ORD", + "a08dee64da71c071-ORD", + "a08dee971a7d6689-ORD", + "a08def245cd26172-ORD", + "a08def2449ee6689-ORD", + "a08def245e644138-ORD", + "a08def2458b46e89-ORD", + "a08def245b29b834-ORD", + "a08def245f2bdc1f-ORD", + "a08defb458916172-ORD", + "a08deff37bb8b834-ORD", + "a08defcbc8df6689-ORD", + "a08defdf2e074138-ORD", + "a08defe6899f6e89-ORD", + 
"a08df0050ac8dc1f-ORD", + "a08df0bf0e7f6689-ORD", + "a08df09629796172-ORD", + "a08df0ae4d6fb834-ORD", + "a08df0d07aa44138-ORD", + "a08df0fecf5adc1f-ORD", + "a08df0d68fc86e89-ORD", + "a08df15859716689-ORD", + "a08df16d6ab7b834-ORD", + "a08df15b58476172-ORD", + "a08df1bacf324138-ORD", + "a08df1dfbca86e89-ORD", + "a08df1d51ca6dc1f-ORD", + "a08df20f09826689-ORD", + "a08df2418fe36172-ORD", + "a08df23b4f00b834-ORD", + "a08df297f90c4138-ORD", + "a08df2ab8f0c6e89-ORD", + "a08df2bb58acdc1f-ORD" + ] + }, + { + "id": "@cf/openai/gpt-oss-120b", + "provider": "cloudflare-workers-ai", + "provider_request_ids": [ + "a08df3cfecb03457-ORD", + "a08df3cfed756210-ORD", + "a08df3cff9f0e815-ORD", + "a08df3cffaab6172-ORD", + "a08df3cfdae6dc1f-ORD", + "a08df3cfef4a6689-ORD", + "a08df3fe6b973457-ORD", + "a08df40abe6de815-ORD", + "a08df402e9bd6210-ORD", + "a08df41e8d89dc1f-ORD", + "a08df40c9bec6172-ORD", + "a08df421dfdb6689-ORD", + "a08df42feeb13457-ORD", + "a08df439ddf5e815-ORD", + "a08df44f3c00dc1f-ORD", + "a08df4403c9a6210-ORD", + "a08df45c080c6172-ORD", + "a08df46948276689-ORD", + "a08df470ac00e815-ORD", + "a08df46c4ff23457-ORD", + "a08df4894d8e6210-ORD", + "a08df4972a046172-ORD", + "a08df4797f2fdc1f-ORD", + "a08df4a158bd6689-ORD", + "a08df4b6cc6c3457-ORD", + "a08df4b9dc546210-ORD", + "a08df4a79812e815-ORD", + "a08df4cc4ea0dc1f-ORD", + "a08df4c6bc156172-ORD", + "a08df4e05f936689-ORD", + "a08df52698b4dc1f-ORD", + "a08df526bf966e89-ORD", + "a08df5269f396172-ORD", + "a08df5269c656689-ORD", + "a08df526a8b87e6b-ORD", + "a08df526abcabc23-ORD", + "a08df5714fa6dc1f-ORD", + "a08df57b89b76172-ORD", + "a08df57e0cb36689-ORD", + "a08df5748ba96e89-ORD", + "a08df59779dcbc23-ORD", + "a08df5849cb57e6b-ORD", + "a08df5c5dd0fdc1f-ORD", + "a08df5d1fcf26172-ORD", + "a08df5f7bde1bc23-ORD", + "a08df5e3993e6689-ORD", + "a08df5eaefab6e89-ORD", + "a08df6162b887e6b-ORD", + "a08df65ece6cbc23-ORD", + "a08df6750f187e6b-ORD", + "a08df6376e91dc1f-ORD", + "a08df6638d236e89-ORD", + "a08df6618ef16689-ORD", + 
"a08df64f8ed06172-ORD", + "a08df6b1c8a2bc23-ORD", + "a08df6c84df97e6b-ORD", + "a08df6d61a3bdc1f-ORD", + "a08df6da5e546172-ORD", + "a08df6da2ec06689-ORD", + "a08df6d7ac306e89-ORD" + ] + }, + { + "id": "@cf/qwen/qwen3-30b-a3b-fp8", + "provider": "cloudflare-workers-ai", + "provider_request_ids": [ + "a08df73bfc23b834-ORD", + "a08df73bfeef6689-ORD", + "a08df73bf8b3c071-ORD", + "a08df73bf88e6e89-ORD", + "a08df73bfc4b6172-ORD", + "a08df73bfa526210-ORD", + "a08df7802958b834-ORD", + "a08df7aaa9b76689-ORD", + "a08df7f52fe06210-ORD", + "a08df7c61e6cc071-ORD", + "a08df7fb495cb834-ORD", + "a08df7d11c9e6e89-ORD", + "a08df7d74cc76172-ORD", + "a08df8476aa26210-ORD", + "a08df82fb8e56689-ORD", + "a08df889fa466172-ORD", + "a08df8520fe2c071-ORD", + "a08df8758f2e6e89-ORD", + "a08df8b1cda76689-ORD", + "a08df8682dadb834-ORD", + "a08df8d9b8326172-ORD", + "a08df8f46a036e89-ORD", + "a08df8a5a8766210-ORD", + "a08df91c78506689-ORD", + "a08df95c5bf96e89-ORD", + "a08df94cec8b6172-ORD", + "a08df8e2e910c071-ORD", + "a08df92429d6b834-ORD", + "a08df96ffeda6210-ORD", + "a08df9afd8e26689-ORD", + "a08dfa1e5ac17e6b-ORD", + "a08dfa1e587c6e89-ORD", + "a08dfa1e58a8dc1f-ORD", + "a08dfa1e3c486689-ORD", + "a08dfa1e49d0b834-ORD", + "a08dfa1e49874138-ORD", + "a08dfab0ee966e89-ORD", + "a08dfaa86d697e6b-ORD", + "a08dfae7cca8b834-ORD", + "a08dfafb89264138-ORD", + "a08dfabddee8dc1f-ORD", + "a08dfac7fb416689-ORD", + "a08dfb27cf786e89-ORD", + "a08dfb6c5e8ab834-ORD", + "a08dfb550d947e6b-ORD", + "a08dfb8eda21dc1f-ORD", + "a08dfbc588f26689-ORD", + "a08dfb8e2a864138-ORD", + "a08dfbcd9d336e89-ORD", + "a08dfc065c83b834-ORD", + "a08dfc493dae7e6b-ORD", + "a08dfc74fc7c4138-ORD", + "a08dfc525cc3dc1f-ORD", + "a08dfcba3e5b6e89-ORD", + "a08dfcefbaefb834-ORD", + "a08dfd17da004138-ORD", + "a08dfcffee067e6b-ORD", + "a08dfd26fdffdc1f-ORD", + "a08dfc71ae796689-ORD", + "a08dfd7d08096e89-ORD" + ] + } + ], + "conditions": { + "requested": [ + "raw", + "compiled" + ], + "effective": [ + "raw", + "compiled" + ] + } +}