gethelio · olivrg · Jun 19, 2026 · Jun 18, 2026 · Jun 18, 2026
diff --git a/docs/adapter-api.md b/docs/adapter-api.md
@@ -4,12 +4,12 @@
 
 The governance API lives on the **SDK sideband** — the local server on `127.0.0.1:3200` (configurable via `sdk.*`), the same server the Python SDK uses for evidence/context. It is **not** the dashboard sideband (`:3100`, documented in [Sideband API Reference](./sideband-api.md)); the two are different servers with different jobs. Endpoints here:
 
-| Route                        | Purpose                                                                        |
-| ---------------------------- | ------------------------------------------------------------------------------ |
-| `POST /evaluate`             | Decide a tool call. **Side-effect-free** on rate/spend counters.               |
-| `POST /audit`                | Record the outcome of an evaluated call; **consumes** counters. Idempotent.    |
-| `POST /install-scan`         | Evaluate a package/skill install. Observational until install-time rules ship. |
-| `POST /approval/:id/resolve` | Record the resolution of a natively-handled approval.                          |
+| Route                        | Purpose                                                                                        |
+| ---------------------------- | ---------------------------------------------------------------------------------------------- |
+| `POST /evaluate`             | Decide a tool call. **Side-effect-free** on rate/spend counters.                               |
+| `POST /audit`                | Record the outcome of an evaluated call; **consumes** counters. Idempotent.                    |
+| `POST /install-scan`         | Evaluate a package/skill install against `policies.install` (observational when none defined). |
+| `POST /approval/:id/resolve` | Record the resolution of a natively-handled approval.                                          |
 
 ## Why this exists, and what it does not promise
 
@@ -36,7 +36,7 @@ An adapter built on this API **MUST**:
 
 ## Authentication
 
-The governance routes require `Authorization: Bearer <HELIO_ADAPTER_TOKEN>`. This is a **separate token** from the SDK's `HELIO_SDK_TOKEN`: an SDK client cannot drive policy decisions, and an adapter cannot write evidence. Both are generated per boot (and printed to stderr) unless set in the environment. Requests carrying an `Origin` header are refused (browser-forgery guard), and bodies over 1 MiB are rejected with 413.
+The governance routes require `Authorization: Bearer <HELIO_ADAPTER_TOKEN>`. This is a **separate token** from the SDK's `HELIO_SDK_TOKEN`: an SDK client cannot drive policy decisions, and an adapter cannot call the SDK's `POST /evidence`/`/context` routes. The adapter's evidence access is deliberately narrow: it may attach evidence to a call it is auditing, via the optional `evidence` field on `POST /audit` (success-only, bound to that evaluation's own session/tool, subject to the policy allowlist — see [Populating evidence](#populating-evidence)); it cannot write arbitrary evidence to arbitrary sessions. Both tokens are generated per boot (and printed to stderr) unless set in the environment. Requests carrying an `Origin` header are refused (browser-forgery guard), and bodies over 1 MiB are rejected with 413.
 
 If you embed `GovernanceService` directly (instead of running `helio start`), wire an `ApprovalRouter` whenever the policy can emit `require_approval` (explicit rules, `flag_destructive: require_approval`, or `on_tool_drift: require_approval`), otherwise construction and hot-reload fail closed by throwing `GovernanceConfigError` (exported from `@gethelio/proxy`).
 
@@ -90,11 +90,20 @@ The `decision` is an **outcome**, not Helio's internal rule action: a `rate_limi
   "error": "…",            // optional, when status == "error"
   "duration_ms": 412,      // optional
   "result": { },           // optional outcome summary
-  "actual_amount": 0.42    // optional, finite ≥0 — true post-execution spend; overrides the arg-derived amount
+  "actual_amount": 0.42,   // optional, finite ≥0 — true post-execution spend; overrides the arg-derived amount
+  "evidence": [            // optional — see "Populating evidence" below
+    { "evidence_key": "recipient", "evidence_data": { "to": "a@b.com" }, "ttl_seconds": 300 }
+  ]
 }
 
 // Response 201 (fresh) — replays return 200
-{ "ok": true, "audit_record_id": "…" }
+{
+  "ok": true,
+  "audit_record_id": "…",
+  "evidence": [            // present only when the request carried evidence
+    { "evidence_key": "recipient", "stored": true }
+  ]
+}
 ```
 
 Counters are consumed here (not at `/evaluate`), and only when the call actually ran (`success`/`error`, not `not_executed`). `/audit` is **idempotent on `evaluation_id`**: an identical replay returns `200 { already_finalized: true }` with no double-consumption, so a network retry after a lost response is safe. A different payload under the same id is an adapter bug → `409 evaluation_conflict`.
@@ -103,6 +112,17 @@ Counters are consumed here (not at `/evaluate`), and only when the call actually
 
 `actual_amount` must be finite and `>= 0` (`400 invalid_actual_amount` otherwise) and only applies to evaluations whose decision carried a spend rule (`400 no_spend_rule` if sent for any other evaluation).
 
+### Populating evidence
+
+The optional `evidence` array lets an adapter ground a call's outcome — e.g. recording the recipient a `send` tool actually resolved — so a later [evidence-grounded rule](./policies.md) (`evidence.requires`) can enforce on it. This is the **only** way the adapter token writes evidence; the SDK-scoped `POST /evidence` route is not available to it (see [Authentication](#authentication)). Each entry is `{ evidence_key, evidence_data, ttl_seconds? }`. The proxy binds the write to the **pending evaluation's own** `session_id` and `tool_name` — an adapter cannot target another session — and stores it via the same evidence store the SDK path uses.
+
+Rules:
+
+- **Success-only.** Evidence is written only when `status: "success"`. On `error`/`not_executed` it is ignored (a failed tool must not ground later calls).
+- **First-finalize-only.** Evidence is written once, on the first `/audit`; idempotent replays never re-write (no TTL reset).
+- **Every per-entry storage failure is soft — never request-fatal.** For a well-formed request the audit still finalizes `201`; per-entry outcomes are reported in the response `evidence` array as `{ evidence_key, stored, reason? }`. Reasons: `key_not_in_policy_allowlist` (the key is not named by any `evidence.requires` rule), `too_large` (`evidence_data` over 64 KiB), `too_many` (more than 16 entries — the excess is dropped), `no_session` (the evaluation had no `session_id`), `evidence_unavailable` (this deployment runs governance without an evidence store), `closed` (the store is shutting down). **A rejected key is not stored and raises no HTTP error** — check each entry's `stored` flag, because a later grounded `/evaluate` will fail closed — so make sure every key you populate is named by an `evidence.requires` rule. Entries must still be well-formed (non-empty `evidence_key`, `evidence_data` present, `ttl_seconds` a positive integer when given); a malformed entry fails body validation like any other malformed field rather than being soft-dropped.
+- **Idempotency.** Evidence is part of the `/audit` idempotency hash (order-independent): an identical retry replays cleanly, but the same `evaluation_id` with divergent evidence is `409 evaluation_conflict`.
+
 **Other responses:** `404 evaluation_unknown`, `404 evaluation_expired` (the decision aged out — see below), `409 approval_unresolved` (resolve the approval first; **retryable** with short backoff).
 
 ### The crash-TTL and TOCTOU caveats

diff --git a/packages/proxy/src/sideband/governance-api.test.ts b/packages/proxy/src/sideband/governance-api.test.ts
@@ -28,7 +28,9 @@ function setup(opts?: {
   const governance =
     opts?.withGovernance === false
       ? undefined
-      : new GovernanceService({ policy, sweepIntervalMs: 0 })
+      : // Wire the same EvidenceStore into the service, mirroring production
+        // (cli.ts) so the /audit evidence path is exercised end-to-end.
+        new GovernanceService({ policy, sweepIntervalMs: 0, evidenceStore: store })
 
   const app = createSidebandApp(store, {
     token: opts?.tokens ? SDK_TOKEN : undefined,
@@ -253,4 +255,129 @@ describe('sideband governance routes', () => {
     expect(res.status).toBe(200)
     expect(((await res.json()) as { decision: string }).decision).toBe('allow')
   })
+
+  it('accepts an evidence array on /audit and writes it to the store', async () => {
+    const ctx = setup()
+    store = ctx.store
+    governance = ctx.governance ?? null
+
+    const ev = await ctx.post('/evaluate', evalBody({ session_id: 'oc:s1' }))
+    const { evaluation_id } = (await ev.json()) as { evaluation_id: string }
+    const audit = await ctx.post('/audit', {
+      evaluation_id,
+      status: 'success',
+      evidence: [{ evidence_key: 'recipient', evidence_data: { to: 'a@b.com' } }],
+    })
+
+    expect(audit.status).toBe(201)
+    const json = (await audit.json()) as { evidence: { evidence_key: string; stored: boolean }[] }
+    expect(json.evidence).toEqual([{ evidence_key: 'recipient', stored: true }])
+    expect(ctx.store.getEvidence('oc:s1', 'recipient')?.data).toEqual({ to: 'a@b.com' })
+  })
+
+  it('does not reject an over-count evidence array at the route (no .max) — soft-drops in service', async () => {
+    const ctx = setup()
+    store = ctx.store
+    governance = ctx.governance ?? null
+
+    const ev = await ctx.post('/evaluate', evalBody({ session_id: 'oc:s1' }))
+    const { evaluation_id } = (await ev.json()) as { evaluation_id: string }
+    const evidence = Array.from({ length: 17 }, (_, i) => ({
+      evidence_key: `k${String(i)}`,
+      evidence_data: i,
+    }))
+    const audit = await ctx.post('/audit', { evaluation_id, status: 'success', evidence })
+
+    expect(audit.status).toBe(201) // NOT 400 — the route has no .max() refinement
+    const json = (await audit.json()) as {
+      evidence: { evidence_key: string; stored: boolean; reason?: string }[]
+    }
+    expect(json.evidence).toHaveLength(17)
+    expect(json.evidence[16]).toEqual({ evidence_key: 'k16', stored: false, reason: 'too_many' })
+  })
+
+  it('treats divergent evidence under the same evaluation_id as a 409 conflict', async () => {
+    const ctx = setup()
+    store = ctx.store
+    governance = ctx.governance ?? null
+
+    const ev = await ctx.post('/evaluate', evalBody({ session_id: 'oc:s1' }))
+    const { evaluation_id } = (await ev.json()) as { evaluation_id: string }
+
+    const first = await ctx.post('/audit', {
+      evaluation_id,
+      status: 'success',
+      evidence: [{ evidence_key: 'k', evidence_data: 'v1' }],
+    })
+    expect(first.status).toBe(201)
+
+    const divergent = await ctx.post('/audit', {
+      evaluation_id,
+      status: 'success',
+      evidence: [{ evidence_key: 'k', evidence_data: 'v2' }],
+    })
+    expect(divergent.status).toBe(409)
+    expect(((await divergent.json()) as { error: string }).error).toBe('evaluation_conflict')
+  })
+
+  it('treats identical evidence (any entry order) as an idempotent replay', async () => {
+    const ctx = setup()
+    store = ctx.store
+    governance = ctx.governance ?? null
+
+    const ev = await ctx.post('/evaluate', evalBody({ session_id: 'oc:s1' }))
+    const { evaluation_id } = (await ev.json()) as { evaluation_id: string }
+    const a = { evidence_key: 'a', evidence_data: 1 }
+    const b = { evidence_key: 'b', evidence_data: 2 }
+
+    const first = await ctx.post('/audit', { evaluation_id, status: 'success', evidence: [a, b] })
+    expect(first.status).toBe(201)
+
+    // Same entries, reversed order → must hash identically (sorted tuple).
+    const replay = await ctx.post('/audit', { evaluation_id, status: 'success', evidence: [b, a] })
+    expect(replay.status).toBe(200)
+    expect(((await replay.json()) as { already_finalized: boolean }).already_finalized).toBe(true)
+  })
+
+  it('soft-fails evidence with reason closed over HTTP — /audit stays 201, never 503', async () => {
+    const ctx = setup()
+    store = ctx.store
+    governance = ctx.governance ?? null
+
+    const ev = await ctx.post('/evaluate', evalBody({ session_id: 'oc:s1' }))
+    const { evaluation_id } = (await ev.json()) as { evaluation_id: string }
+    ctx.store.close() // standalone /evidence would 503; /audit must not inherit that
+
+    const audit = await ctx.post('/audit', {
+      evaluation_id,
+      status: 'success',
+      evidence: [{ evidence_key: 'k', evidence_data: 'x' }],
+    })
+    expect(audit.status).toBe(201)
+    const json = (await audit.json()) as { evidence: { evidence_key: string; reason?: string }[] }
+    expect(json.evidence).toEqual([{ evidence_key: 'k', stored: false, reason: 'closed' }])
+  })
+
+  it('rejects an over-1MiB /audit body with 413 (gross body limit, not an evidence cap)', async () => {
+    const ctx = setup()
+    store = ctx.store
+    governance = ctx.governance ?? null
+
+    const ev = await ctx.post('/evaluate', evalBody({ session_id: 'oc:s1' }))
+    const { evaluation_id } = (await ev.json()) as { evaluation_id: string }
+
+    const huge = 'x'.repeat(1_200 * 1024) // ~1.2 MiB > the 1 MiB sideband body limit
+    const body = JSON.stringify({
+      evaluation_id,
+      status: 'success',
+      evidence: [{ evidence_key: 'k', evidence_data: huge }],
+    })
+    // Set content-length explicitly — the real Node server sets it, and hono's
+    // bodyLimit checks it to fail fast with 413 before the body is parsed.
+    const res = await ctx.post('/audit', body, {
+      'content-length': String(Buffer.byteLength(body)),
+    })
+    expect(res.status).toBe(413)
+    expect(((await res.json()) as { error: string }).error).toBe('request_body_too_large')
+  })
 })
diff --git a/packages/proxy/src/sideband/governance-api.ts b/packages/proxy/src/sideband/governance-api.ts
@@ -54,13 +54,23 @@ const installScanBody = z.object({
   metadata: metadataSchema,
 })
 
+const evidenceEntrySchema = z.object({
+  evidence_key: z.string().min(1),
+  evidence_data: z.unknown().refine((v) => v !== undefined, { message: 'Required' }),
+  ttl_seconds: z.number().int().positive().optional(),
+})
+
 const auditBody = z.object({
   evaluation_id: z.string().min(1),
   status: z.enum(['success', 'error', 'not_executed']),
   error: z.string().optional(),
   duration_ms: z.number().optional(),
   result: z.unknown().optional(),
   actual_amount: z.number().optional(),
+  // No `.max()` / size refinement here on purpose (issue #11): caps are
+  // enforced per-entry in GovernanceService.populateEvidence as soft-drops, so
+  // an over-cap entry never 400s away the audit row for a call that already ran.
+  evidence: z.array(evidenceEntrySchema).optional(),
 })
 
 const resolveBody = z.object({
@@ -207,10 +217,33 @@ function auditPayloadHash(data: z.infer<typeof auditBody>): string {
     duration_ms: data.duration_ms ?? null,
     result: data.result ?? null,
     actual_amount: data.actual_amount ?? null,
+    evidence: canonicalEvidence(data.evidence),
   }
   return createHash('sha256').update(canonicalize(semantic)).digest('hex')
 }
 
+/**
+ * Order-independent canonical form of the submitted evidence array, for the
+ * idempotency hash (issue #11). Absent or empty input yields `null`; otherwise
+ * entries are normalized (nullish `evidence_data`/`ttl_seconds` → `null`) and
+ * sorted by their `canonicalize` output, so a retry that sends the same facts
+ * in a different order hashes identically, while any divergence in key, data,
+ * or ttl is a `409 evaluation_conflict`. Hashes the *submitted* payload, so the
+ * result is independent of which entries the service later soft-drops.
+ */
+function canonicalEvidence(evidence: z.infer<typeof auditBody>['evidence']): unknown {
+  if (!evidence || evidence.length === 0) return null
+  return evidence
+    .map((e) => ({
+      evidence_key: e.evidence_key,
+      evidence_data: e.evidence_data ?? null,
+      ttl_seconds: e.ttl_seconds ?? null,
+    }))
+    .map((norm) => ({ sortKey: canonicalize(norm), norm }))
+    .sort((a, b) => (a.sortKey < b.sortKey ? -1 : a.sortKey > b.sortKey ? 1 : 0))
+    .map((x) => x.norm)
+}
+
 /** Narrow a numeric status to Hono's accepted status type. */
 function asStatus(status: number): 200 | 201 | 400 | 404 | 409 | 413 | 503 {
   return status as 200 | 201 | 400 | 404 | 409 | 413 | 503