From ac7ab99cb6e1790d0aa9814dac859804d2b97555 Mon Sep 17 00:00:00 2001
From: olivrg <olivrg@gmail.com>
Date: Thu, 18 Jun 2026 16:06:57 +0100
Subject: [PATCH 1/2] feat(proxy): optional evidence payload on POST /audit
 (#11)
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

Lets a hook-based adapter populate evidence-grounding facts on its single
adapter-scoped token, instead of the SDK-scoped POST /evidence — the
remaining core change for the OpenClaw adapter (#11).

- governance-service: add `evidence` to AuditInput and write it via
  EvidenceStore on a successful, first-finalize audit, bound to the pending
  evaluation's own session_id/tool_name (the adapter never supplies them).
  Every per-entry failure is soft (reported, never request-fatal). The caps
  enforced here are soft-drops — `too_many` (>16) / `too_large` (>64 KiB) — and
  so are the store's `key_not_in_policy_allowlist` / `closed` and the service's
  `no_session` / `evidence_unavailable`: all still finalize 201. Critically,
  `closed` is NOT the 503 the standalone /evidence route returns — losing the
  audit row for a call that already ran would be worse than a dropped evidence entry.
- governance-api: add `evidence` to the auditBody schema (no route-level
  `.max()`, so caps stay soft-drops) and fold it into auditPayloadHash,
  order-independent (sorted by the full canonical tuple) so an identical retry
  replays cleanly while divergent evidence is a 409 conflict.

Success-only, first-finalize-only (idempotent replays never re-write).
Tests cover every soft-drop reason, the closed!=503 asymmetry over HTTP, the
gross-body 413, and order-independent idempotency.
---
 .../proxy/src/sideband/governance-api.test.ts | 129 ++++++++++++++-
 packages/proxy/src/sideband/governance-api.ts |  33 ++++
 .../src/sideband/governance-service.test.ts   | 148 ++++++++++++++++++
 .../proxy/src/sideband/governance-service.ts  |  96 +++++++++++-
 4 files changed, 404 insertions(+), 2 deletions(-)

diff --git a/packages/proxy/src/sideband/governance-api.test.ts b/packages/proxy/src/sideband/governance-api.test.ts
index 2c1e2ab..244ebf5 100644
--- a/packages/proxy/src/sideband/governance-api.test.ts
+++ b/packages/proxy/src/sideband/governance-api.test.ts
@@ -28,7 +28,9 @@ function setup(opts?: {
   const governance =
     opts?.withGovernance === false
       ? undefined
-      : new GovernanceService({ policy, sweepIntervalMs: 0 })
+      : // Wire the same EvidenceStore into the service, mirroring production
+        // (cli.ts) so the /audit evidence path is exercised end-to-end.
+        new GovernanceService({ policy, sweepIntervalMs: 0, evidenceStore: store })
 
   const app = createSidebandApp(store, {
     token: opts?.tokens ? SDK_TOKEN : undefined,
@@ -253,4 +255,129 @@ describe('sideband governance routes', () => {
     expect(res.status).toBe(200)
     expect(((await res.json()) as { decision: string }).decision).toBe('allow')
   })
+
+  it('accepts an evidence array on /audit and writes it to the store', async () => {
+    const ctx = setup()
+    store = ctx.store
+    governance = ctx.governance ?? null
+
+    const ev = await ctx.post('/evaluate', evalBody({ session_id: 'oc:s1' }))
+    const { evaluation_id } = (await ev.json()) as { evaluation_id: string }
+    const audit = await ctx.post('/audit', {
+      evaluation_id,
+      status: 'success',
+      evidence: [{ evidence_key: 'recipient', evidence_data: { to: 'a@b.com' } }],
+    })
+
+    expect(audit.status).toBe(201)
+    const json = (await audit.json()) as { evidence: { evidence_key: string; stored: boolean }[] }
+    expect(json.evidence).toEqual([{ evidence_key: 'recipient', stored: true }])
+    expect(ctx.store.getEvidence('oc:s1', 'recipient')?.data).toEqual({ to: 'a@b.com' })
+  })
+
+  it('does not reject an over-count evidence array at the route (no .max) — soft-drops in service', async () => {
+    const ctx = setup()
+    store = ctx.store
+    governance = ctx.governance ?? null
+
+    const ev = await ctx.post('/evaluate', evalBody({ session_id: 'oc:s1' }))
+    const { evaluation_id } = (await ev.json()) as { evaluation_id: string }
+    const evidence = Array.from({ length: 17 }, (_, i) => ({
+      evidence_key: `k${String(i)}`,
+      evidence_data: i,
+    }))
+    const audit = await ctx.post('/audit', { evaluation_id, status: 'success', evidence })
+
+    expect(audit.status).toBe(201) // NOT 400 — the route has no .max() refinement
+    const json = (await audit.json()) as {
+      evidence: { evidence_key: string; stored: boolean; reason?: string }[]
+    }
+    expect(json.evidence).toHaveLength(17)
+    expect(json.evidence[16]).toEqual({ evidence_key: 'k16', stored: false, reason: 'too_many' })
+  })
+
+  it('treats divergent evidence under the same evaluation_id as a 409 conflict', async () => {
+    const ctx = setup()
+    store = ctx.store
+    governance = ctx.governance ?? null
+
+    const ev = await ctx.post('/evaluate', evalBody({ session_id: 'oc:s1' }))
+    const { evaluation_id } = (await ev.json()) as { evaluation_id: string }
+
+    const first = await ctx.post('/audit', {
+      evaluation_id,
+      status: 'success',
+      evidence: [{ evidence_key: 'k', evidence_data: 'v1' }],
+    })
+    expect(first.status).toBe(201)
+
+    const divergent = await ctx.post('/audit', {
+      evaluation_id,
+      status: 'success',
+      evidence: [{ evidence_key: 'k', evidence_data: 'v2' }],
+    })
+    expect(divergent.status).toBe(409)
+    expect(((await divergent.json()) as { error: string }).error).toBe('evaluation_conflict')
+  })
+
+  it('treats identical evidence (any entry order) as an idempotent replay', async () => {
+    const ctx = setup()
+    store = ctx.store
+    governance = ctx.governance ?? null
+
+    const ev = await ctx.post('/evaluate', evalBody({ session_id: 'oc:s1' }))
+    const { evaluation_id } = (await ev.json()) as { evaluation_id: string }
+    const a = { evidence_key: 'a', evidence_data: 1 }
+    const b = { evidence_key: 'b', evidence_data: 2 }
+
+    const first = await ctx.post('/audit', { evaluation_id, status: 'success', evidence: [a, b] })
+    expect(first.status).toBe(201)
+
+    // Same entries, reversed order → must hash identically (sorted tuple).
+    const replay = await ctx.post('/audit', { evaluation_id, status: 'success', evidence: [b, a] })
+    expect(replay.status).toBe(200)
+    expect(((await replay.json()) as { already_finalized: boolean }).already_finalized).toBe(true)
+  })
+
+  it('soft-fails evidence with reason closed over HTTP — /audit stays 201, never 503', async () => {
+    const ctx = setup()
+    store = ctx.store
+    governance = ctx.governance ?? null
+
+    const ev = await ctx.post('/evaluate', evalBody({ session_id: 'oc:s1' }))
+    const { evaluation_id } = (await ev.json()) as { evaluation_id: string }
+    ctx.store.close() // standalone /evidence would 503; /audit must not inherit that
+
+    const audit = await ctx.post('/audit', {
+      evaluation_id,
+      status: 'success',
+      evidence: [{ evidence_key: 'k', evidence_data: 'x' }],
+    })
+    expect(audit.status).toBe(201)
+    const json = (await audit.json()) as { evidence: { evidence_key: string; reason?: string }[] }
+    expect(json.evidence).toEqual([{ evidence_key: 'k', stored: false, reason: 'closed' }])
+  })
+
+  it('rejects an over-1MiB /audit body with 413 (gross body limit, not an evidence cap)', async () => {
+    const ctx = setup()
+    store = ctx.store
+    governance = ctx.governance ?? null
+
+    const ev = await ctx.post('/evaluate', evalBody({ session_id: 'oc:s1' }))
+    const { evaluation_id } = (await ev.json()) as { evaluation_id: string }
+
+    const huge = 'x'.repeat(1_200 * 1024) // ~1.2 MiB > the 1 MiB sideband body limit
+    const body = JSON.stringify({
+      evaluation_id,
+      status: 'success',
+      evidence: [{ evidence_key: 'k', evidence_data: huge }],
+    })
+    // Set content-length explicitly — the real Node server sets it, and hono's
+    // bodyLimit checks it to fail fast with 413 before the body is parsed.
+    const res = await ctx.post('/audit', body, {
+      'content-length': String(Buffer.byteLength(body)),
+    })
+    expect(res.status).toBe(413)
+    expect(((await res.json()) as { error: string }).error).toBe('request_body_too_large')
+  })
 })
diff --git a/packages/proxy/src/sideband/governance-api.ts b/packages/proxy/src/sideband/governance-api.ts
index f93d45d..2b6d52a 100644
--- a/packages/proxy/src/sideband/governance-api.ts
+++ b/packages/proxy/src/sideband/governance-api.ts
@@ -54,6 +54,12 @@ const installScanBody = z.object({
   metadata: metadataSchema,
 })
 
+const evidenceEntrySchema = z.object({
+  evidence_key: z.string().min(1),
+  evidence_data: z.unknown().refine((v) => v !== undefined, { message: 'Required' }),
+  ttl_seconds: z.number().int().positive().optional(),
+})
+
 const auditBody = z.object({
   evaluation_id: z.string().min(1),
   status: z.enum(['success', 'error', 'not_executed']),
@@ -61,6 +67,10 @@ const auditBody = z.object({
   duration_ms: z.number().optional(),
   result: z.unknown().optional(),
   actual_amount: z.number().optional(),
+  // No `.max()` / size refinement here on purpose (issue #11): caps are
+  // enforced per-entry in GovernanceService.populateEvidence as soft-drops, so
+  // an over-cap entry never 400s away the audit row for a call that already ran.
+  evidence: z.array(evidenceEntrySchema).optional(),
 })
 
 const resolveBody = z.object({
@@ -207,10 +217,33 @@ function auditPayloadHash(data: z.infer<typeof auditBody>): string {
     duration_ms: data.duration_ms ?? null,
     result: data.result ?? null,
     actual_amount: data.actual_amount ?? null,
+    evidence: canonicalEvidence(data.evidence),
   }
   return createHash('sha256').update(canonicalize(semantic)).digest('hex')
 }
 
+/**
+ * Order-independent canonical form of the submitted evidence array, for the
+ * idempotency hash (issue #11). An absent or empty array yields `null`. Each
+ * entry is normalized (absent `evidence_data`/`ttl_seconds` → `null`) and the
+ * entries are sorted by the canonical string of their full tuple, so a retry
+ * with the same facts in a different order hashes identically, while any
+ * divergence in key, data, or ttl is a `409 evaluation_conflict`. Built from the
+ * *submitted* payload, so it does not depend on what the service soft-drops.
+ */
+function canonicalEvidence(evidence: z.infer<typeof auditBody>['evidence']): unknown {
+  if (!evidence || evidence.length === 0) return null
+  return evidence
+    .map((e) => ({
+      evidence_key: e.evidence_key,
+      evidence_data: e.evidence_data ?? null,
+      ttl_seconds: e.ttl_seconds ?? null,
+    }))
+    .map((norm) => ({ sortKey: canonicalize(norm), norm }))
+    .sort((a, b) => (a.sortKey < b.sortKey ? -1 : a.sortKey > b.sortKey ? 1 : 0))
+    .map((x) => x.norm)
+}
+
 /** Narrow a numeric status to Hono's accepted status type. */
 function asStatus(status: number): 200 | 201 | 400 | 404 | 409 | 413 | 503 {
   return status as 200 | 201 | 400 | 404 | 409 | 413 | 503
diff --git a/packages/proxy/src/sideband/governance-service.test.ts b/packages/proxy/src/sideband/governance-service.test.ts
index 65e86ae..a35e412 100644
--- a/packages/proxy/src/sideband/governance-service.test.ts
+++ b/packages/proxy/src/sideband/governance-service.test.ts
@@ -927,3 +927,151 @@ describe('GovernanceService.resolveApproval', () => {
     router.close()
   })
 })
+
+// ---------------------------------------------------------------------------
+// /audit evidence population (issue #11)
+// ---------------------------------------------------------------------------
+
+describe('audit evidence', () => {
+  type Outcome = { evidence_key: string; stored: boolean; reason?: string }
+
+  function evalAudit(
+    harness: ServiceHarness,
+    evidence: AuditInput['evidence'],
+    opts?: { status?: AuditInput['status']; sessionId?: string | null; hash?: string },
+  ) {
+    const ev = harness.service.evaluate(
+      evalInput({ session_id: opts?.sessionId === undefined ? 'oc:s1' : opts.sessionId }),
+    )
+    const id = ev.body['evaluation_id'] as string
+    const res = harness.service.audit(
+      auditInput(id, { status: opts?.status ?? 'success', evidence }),
+      opts?.hash ?? 'h',
+    )
+    return { id, res, outcomes: res.body['evidence'] as Outcome[] | undefined }
+  }
+
+  it('writes evidence to the store on a successful audit and reports stored:true', () => {
+    const h = makeService({ withEvidence: true })
+    const { res, outcomes } = evalAudit(h, [
+      { evidence_key: 'recipient', evidence_data: { to: 'a@b.com' } },
+    ])
+
+    expect(res.status).toBe(201)
+    expect(outcomes).toEqual([{ evidence_key: 'recipient', stored: true }])
+    expect(h.evidenceStore?.getEvidence('oc:s1', 'recipient')?.data).toEqual({ to: 'a@b.com' })
+    expect(h.evidenceStore?.getEvidence('oc:s1', 'recipient')?.tool_name).toBe('send')
+  })
+
+  it('ignores evidence when the call errored (success-only)', () => {
+    const h = makeService({ withEvidence: true })
+    const { res, outcomes } = evalAudit(h, [{ evidence_key: 'recipient', evidence_data: 'x' }], {
+      status: 'error',
+    })
+
+    expect(res.status).toBe(201)
+    expect(outcomes).toBeUndefined()
+    expect(h.evidenceStore?.hasEvidence('oc:s1', 'recipient')).toBe(false)
+  })
+
+  it('ignores evidence when status is not_executed', () => {
+    const h = makeService({ withEvidence: true })
+    const { outcomes } = evalAudit(h, [{ evidence_key: 'recipient', evidence_data: 'x' }], {
+      status: 'not_executed',
+    })
+
+    expect(outcomes).toBeUndefined()
+    expect(h.evidenceStore?.hasEvidence('oc:s1', 'recipient')).toBe(false)
+  })
+
+  it('soft-fails an allowlist-rejected key but still finalizes 201', () => {
+    const h = makeService({ withEvidence: true })
+    h.evidenceStore?.setAllowedEvidenceKeys(['allowed'])
+    const { res, outcomes } = evalAudit(h, [{ evidence_key: 'blocked', evidence_data: 'x' }])
+
+    expect(res.status).toBe(201)
+    expect(outcomes).toEqual([
+      { evidence_key: 'blocked', stored: false, reason: 'key_not_in_policy_allowlist' },
+    ])
+    expect(h.evidenceStore?.hasEvidence('oc:s1', 'blocked')).toBe(false)
+  })
+
+  it('soft-drops an oversized entry with reason too_large (no 413)', () => {
+    const h = makeService({ withEvidence: true })
+    const big = 'x'.repeat(70 * 1024)
+    const { res, outcomes } = evalAudit(h, [{ evidence_key: 'big', evidence_data: big }])
+
+    expect(res.status).toBe(201)
+    expect(outcomes).toEqual([{ evidence_key: 'big', stored: false, reason: 'too_large' }])
+    expect(h.evidenceStore?.hasEvidence('oc:s1', 'big')).toBe(false)
+  })
+
+  it('soft-drops entries beyond MAX_EVIDENCE_ENTRIES with reason too_many', () => {
+    const h = makeService({ withEvidence: true })
+    const entries = Array.from({ length: 18 }, (_, i) => ({
+      evidence_key: `k${String(i)}`,
+      evidence_data: i,
+    }))
+    const { res, outcomes } = evalAudit(h, entries)
+
+    expect(res.status).toBe(201)
+    expect(outcomes).toHaveLength(18)
+    expect(outcomes?.slice(0, 16).every((o) => o.stored)).toBe(true)
+    expect(outcomes?.slice(16)).toEqual([
+      { evidence_key: 'k16', stored: false, reason: 'too_many' },
+      { evidence_key: 'k17', stored: false, reason: 'too_many' },
+    ])
+    expect(h.evidenceStore?.hasEvidence('oc:s1', 'k15')).toBe(true)
+    expect(h.evidenceStore?.hasEvidence('oc:s1', 'k16')).toBe(false)
+  })
+
+  it('soft-fails with reason no_session when the evaluation has no session', () => {
+    const h = makeService({ withEvidence: true })
+    const { res, outcomes } = evalAudit(h, [{ evidence_key: 'k', evidence_data: 'x' }], {
+      sessionId: null,
+    })
+
+    expect(res.status).toBe(201)
+    expect(outcomes).toEqual([{ evidence_key: 'k', stored: false, reason: 'no_session' }])
+  })
+
+  it('soft-fails with reason evidence_unavailable when the service has no evidence store', () => {
+    const h = makeService() // no withEvidence → evidenceStore undefined
+    const { res, outcomes } = evalAudit(h, [{ evidence_key: 'k', evidence_data: 'x' }])
+
+    expect(res.status).toBe(201)
+    expect(outcomes).toEqual([{ evidence_key: 'k', stored: false, reason: 'evidence_unavailable' }])
+  })
+
+  it('soft-fails with reason closed when the store is shutting down — still 201, never 503', () => {
+    const h = makeService({ withEvidence: true })
+    h.evidenceStore?.close()
+    const { res, outcomes } = evalAudit(h, [{ evidence_key: 'k', evidence_data: 'x' }])
+
+    expect(res.status).toBe(201)
+    expect(outcomes).toEqual([{ evidence_key: 'k', stored: false, reason: 'closed' }])
+  })
+
+  it('does not re-write evidence on an idempotent replay', () => {
+    const h = makeService({ withEvidence: true })
+    const { id } = evalAudit(h, [{ evidence_key: 'k', evidence_data: 'v' }])
+    const firstExpiry = h.evidenceStore?.getEvidence('oc:s1', 'k')?.expires_at
+
+    h.advance(10_000)
+    const replay = h.service.audit(
+      auditInput(id, { evidence: [{ evidence_key: 'k', evidence_data: 'v' }] }),
+      'h',
+    )
+
+    expect(replay.status).toBe(200)
+    expect(replay.body['already_finalized']).toBe(true)
+    // No re-write → TTL/expiry unchanged despite the 10s advance.
+    expect(h.evidenceStore?.getEvidence('oc:s1', 'k')?.expires_at).toBe(firstExpiry)
+  })
+
+  it('is a no-op (no outcomes) when no evidence is supplied', () => {
+    const h = makeService({ withEvidence: true })
+    const { outcomes } = evalAudit(h, undefined)
+    expect(outcomes).toBeUndefined()
+  })
+})
diff --git a/packages/proxy/src/sideband/governance-service.ts b/packages/proxy/src/sideband/governance-service.ts
index 4fa0f6c..0e1a8aa 100644
--- a/packages/proxy/src/sideband/governance-service.ts
+++ b/packages/proxy/src/sideband/governance-service.ts
@@ -49,6 +49,11 @@ const MAX_PENDING_BYTES = 64 * 1_024 * 1_024
 // Capped here in the service (NOT in the limiters) so the evaluate/audit split can
 // reserve pre-execution and the MCP door is never gated (issue #13).
 const MAX_SENDER_KEYS = 50_000
+// Optional /audit evidence payload (issue #11). Caps are enforced in
+// populateEvidence (NOT route validation) so an over-cap entry soft-drops
+// without discarding the audit row for a call that already ran.
+const MAX_EVIDENCE_ENTRIES = 16
+const MAX_EVIDENCE_BYTES = 64 * 1_024
 
 const SWEEP_INTERVAL_MS = 30_000
 
@@ -99,6 +104,19 @@ export interface InstallScanInput {
   readonly metadata: Record<string, unknown> | null
 }
 
+export interface AuditEvidenceInput {
+  readonly evidence_key: string
+  readonly evidence_data: unknown
+  readonly ttl_seconds?: number
+}
+
+/** Per-entry outcome reported back for each submitted evidence entry. */
+export interface AuditEvidenceOutcome {
+  readonly evidence_key: string
+  readonly stored: boolean
+  readonly reason?: string
+}
+
 export interface AuditInput {
   readonly evaluation_id: string
   readonly status: 'success' | 'error' | 'not_executed'
@@ -106,6 +124,12 @@ export interface AuditInput {
   readonly duration_ms?: number
   readonly result?: unknown
   readonly actual_amount?: number
+  /**
+   * Optional evidence to populate on a successfully-audited call (issue #11). Adapter-scoped,
+   * single-token evidence write: bound to the pending evaluation's session/tool, success-only,
+   * first-finalize-only. Per-entry failures are soft (reported, not fatal) — see audit().
+   */
+  readonly evidence?: ReadonlyArray<AuditEvidenceInput>
 }
 
 export interface ResolveApprovalInput {
@@ -546,6 +570,13 @@ export class GovernanceService {
       this.evidenceStore.recordToolCall(entry.sessionId, entry.toolName, req.status === 'success')
     }
 
+    // Populate evidence (issue #11). Success-only and first-finalize-only — we
+    // are past every tombstone replay return above, so this never re-writes on a
+    // replay. Every per-entry failure is soft (reported below, never
+    // request-fatal): losing the audit row for a call that already ran would be
+    // worse than a dropped evidence entry.
+    const evidenceOutcomes = this.populateEvidence(req, entry)
+
     const auditId = this.writeAudit({
       timestampIso: entry.timestampIso,
       origin: entry.origin,
@@ -577,7 +608,70 @@ export class GovernanceService {
       expiresAtMs: this.now() + this.ttlMs,
     })
 
-    return { status: 201, body: { ok: true, audit_record_id: auditId } }
+    const body: Record<string, unknown> = { ok: true, audit_record_id: auditId }
+    if (evidenceOutcomes) body['evidence'] = evidenceOutcomes
+    return { status: 201, body }
+  }
+
+  /**
+   * Write the optional `/audit` evidence entries for a successful call
+   * (issue #11). Returns per-entry outcomes in request order, or `undefined`
+   * when there is nothing to report (non-success status, or no evidence
+   * supplied). Caps are enforced here, NOT in route validation, so an over-cap
+   * entry soft-drops without discarding the audit row. The first matching check
+   * wins: index at/after `MAX_EVIDENCE_ENTRIES` → `too_many`; canonicalized
+   * `evidence_data` over `MAX_EVIDENCE_BYTES` UTF-8 bytes → `too_large`; no
+   * evidence store → `evidence_unavailable`; sessionless evaluation →
+   * `no_session`; otherwise the store's own rejection reason (e.g.
+   * `key_not_in_policy_allowlist`, `closed`) passes through. None fail the audit.
+   */
+  private populateEvidence(
+    req: AuditInput,
+    entry: PendingEvaluation,
+  ): AuditEvidenceOutcome[] | undefined {
+    if (req.status !== 'success' || !req.evidence || req.evidence.length === 0) {
+      return undefined
+    }
+    const outcomes: AuditEvidenceOutcome[] = []
+    for (let i = 0; i < req.evidence.length; i++) {
+      const e = req.evidence[i]
+      if (!e) continue // defensive narrowing — i is always in range here
+      if (i >= MAX_EVIDENCE_ENTRIES) {
+        outcomes.push({ evidence_key: e.evidence_key, stored: false, reason: 'too_many' })
+        continue
+      }
+      const bytes = Buffer.byteLength(canonicalize(e.evidence_data ?? null), 'utf8')
+      if (bytes > MAX_EVIDENCE_BYTES) {
+        outcomes.push({ evidence_key: e.evidence_key, stored: false, reason: 'too_large' })
+        continue
+      }
+      if (!this.evidenceStore) {
+        // Governance is enabled but no evidence store is wired (evidence disabled in
+        // this deployment) — reported distinctly from a call that has no session.
+        outcomes.push({
+          evidence_key: e.evidence_key,
+          stored: false,
+          reason: 'evidence_unavailable',
+        })
+        continue
+      }
+      if (!entry.sessionId) {
+        outcomes.push({ evidence_key: e.evidence_key, stored: false, reason: 'no_session' })
+        continue
+      }
+      const result = this.evidenceStore.putEvidence(entry.sessionId, {
+        evidence_key: e.evidence_key,
+        data: e.evidence_data,
+        tool_name: entry.toolName,
+        ttl_seconds: e.ttl_seconds,
+      })
+      outcomes.push(
+        result.stored
+          ? { evidence_key: e.evidence_key, stored: true }
+          : { evidence_key: e.evidence_key, stored: false, reason: result.reason },
+      )
+    }
+    return outcomes
   }
 
   // -------------------------------------------------------------------------

From cd11a52753440140d101bf3adcd424f106f1b11c Mon Sep 17 00:00:00 2001
From: olivrg <olivrg@gmail.com>
Date: Thu, 18 Jun 2026 16:08:04 +0100
Subject: [PATCH 2/2] docs(adapter-api): document the /audit evidence field
 (#11)
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

- Add a "Populating evidence" section: array shape, success-only,
  first-finalize-only, per-entry soft-fail outcomes (and their reasons), and
  order-independent idempotency.
- Rewrite the Authentication note that claimed "an adapter cannot write
  evidence" — it now describes the narrow, scoped capability the adapter gains
  via /audit (bound to its own session/tool, allowlist-enforced), without the
  SDK-scoped /evidence route.
- Fix the summary-table /install-scan line that still read "observational" to
  reflect the deny_install enforcement shipped in #13.
---
 docs/adapter-api.md | 38 +++++++++++++++++++++++++++++---------
 1 file changed, 29 insertions(+), 9 deletions(-)

diff --git a/docs/adapter-api.md b/docs/adapter-api.md
index 7ed8341..df5f60b 100644
--- a/docs/adapter-api.md
+++ b/docs/adapter-api.md
@@ -4,12 +4,12 @@
 
 The governance API lives on the **SDK sideband** — the local server on `127.0.0.1:3200` (configurable via `sdk.*`), the same server the Python SDK uses for evidence/context. It is **not** the dashboard sideband (`:3100`, documented in [Sideband API Reference](./sideband-api.md)); the two are different servers with different jobs. Endpoints here:
 
-| Route                        | Purpose                                                                        |
-| ---------------------------- | ------------------------------------------------------------------------------ |
-| `POST /evaluate`             | Decide a tool call. **Side-effect-free** on rate/spend counters.               |
-| `POST /audit`                | Record the outcome of an evaluated call; **consumes** counters. Idempotent.    |
-| `POST /install-scan`         | Evaluate a package/skill install. Observational until install-time rules ship. |
-| `POST /approval/:id/resolve` | Record the resolution of a natively-handled approval.                          |
+| Route                        | Purpose                                                                                        |
+| ---------------------------- | ---------------------------------------------------------------------------------------------- |
+| `POST /evaluate`             | Decide a tool call. **Side-effect-free** on rate/spend counters.                               |
+| `POST /audit`                | Record the outcome of an evaluated call; **consumes** counters. Idempotent.                    |
+| `POST /install-scan`         | Evaluate a package/skill install against `policies.install` (observational when none defined). |
+| `POST /approval/:id/resolve` | Record the resolution of a natively-handled approval.                                          |
 
 ## Why this exists, and what it does not promise
 
@@ -36,7 +36,7 @@ An adapter built on this API **MUST**:
 
 ## Authentication
 
-The governance routes require `Authorization: Bearer <HELIO_ADAPTER_TOKEN>`. This is a **separate token** from the SDK's `HELIO_SDK_TOKEN`: an SDK client cannot drive policy decisions, and an adapter cannot write evidence. Both are generated per boot (and printed to stderr) unless set in the environment. Requests carrying an `Origin` header are refused (browser-forgery guard), and bodies over 1 MiB are rejected with 413.
+The governance routes require `Authorization: Bearer <HELIO_ADAPTER_TOKEN>`. This is a **separate token** from the SDK's `HELIO_SDK_TOKEN`: an SDK client cannot drive policy decisions, and an adapter cannot call the SDK's `POST /evidence`/`/context` routes. The adapter's evidence access is deliberately narrow: it may attach evidence to a call it is auditing, via the optional `evidence` field on `POST /audit` (success-only, bound to that evaluation's own session/tool, subject to the policy allowlist — see [Populating evidence](#populating-evidence)); it cannot write arbitrary evidence to arbitrary sessions. Both tokens are generated per boot (and printed to stderr) unless set in the environment. Requests carrying an `Origin` header are refused (browser-forgery guard), and bodies over 1 MiB are rejected with 413.
 
 If you embed `GovernanceService` directly (instead of running `helio start`), wire an `ApprovalRouter` whenever the policy can emit `require_approval` (explicit rules, `flag_destructive: require_approval`, or `on_tool_drift: require_approval`), otherwise construction and hot-reload fail closed by throwing `GovernanceConfigError` (exported from `@gethelio/proxy`).
 
@@ -90,11 +90,20 @@ The `decision` is an **outcome**, not Helio's internal rule action: a `rate_limi
   "error": "…",            // optional, when status == "error"
   "duration_ms": 412,      // optional
   "result": { },           // optional outcome summary
-  "actual_amount": 0.42    // optional, finite ≥0 — true post-execution spend; overrides the arg-derived amount
+  "actual_amount": 0.42,   // optional, finite ≥0 — true post-execution spend; overrides the arg-derived amount
+  "evidence": [            // optional — see "Populating evidence" below
+    { "evidence_key": "recipient", "evidence_data": { "to": "a@b.com" }, "ttl_seconds": 300 }
+  ]
 }
 
 // Response 201 (fresh) — replays return 200
-{ "ok": true, "audit_record_id": "…" }
+{
+  "ok": true,
+  "audit_record_id": "…",
+  "evidence": [            // present only for a "success" audit that carried evidence
+    { "evidence_key": "recipient", "stored": true }
+  ]
+}
 ```
 
 Counters are consumed here (not at `/evaluate`), and only when the call actually ran (`success`/`error`, not `not_executed`). `/audit` is **idempotent on `evaluation_id`**: an identical replay returns `200 { already_finalized: true }` with no double-consumption, so a network retry after a lost response is safe. A different payload under the same id is an adapter bug → `409 evaluation_conflict`.
@@ -103,6 +112,17 @@ Counters are consumed here (not at `/evaluate`), and only when the call actually
 
 `actual_amount` must be finite and `>= 0` (`400 invalid_actual_amount` otherwise) and only applies to evaluations whose decision carried a spend rule (`400 no_spend_rule` if sent for any other evaluation).
 
+### Populating evidence
+
+The optional `evidence` array lets an adapter ground a call's outcome — e.g. recording the recipient a `send` tool actually resolved — so a later [evidence-grounded rule](./policies.md) (`evidence.requires`) can enforce on it. This is the **only** way the adapter token writes evidence; the SDK-scoped `POST /evidence` route is not available to it (see [Authentication](#authentication)). Each entry is `{ evidence_key, evidence_data, ttl_seconds? }`. The proxy binds the write to the **pending evaluation's own** `session_id` and `tool_name` — an adapter cannot target another session — and stores it via the same evidence store the SDK path uses.
+
+Rules:
+
+- **Success-only.** Evidence is written only when `status: "success"`. On `error`/`not_executed` it is ignored (a failed tool must not ground later calls).
+- **First-finalize-only.** Evidence is written once, on the first `/audit`; idempotent replays never re-write (no TTL reset).
+- **Every per-entry failure is soft — never request-fatal.** A fresh audit still finalizes `201` whatever happens to its evidence; per-entry outcomes are reported in the response `evidence` array as `{ evidence_key, stored, reason? }`. Reasons: `key_not_in_policy_allowlist` (the key is not named by any `evidence.requires` rule), `too_large` (`evidence_data` over 64 KiB), `too_many` (more than 16 entries — the excess is dropped), `no_session` (the evaluation had no `session_id`), `evidence_unavailable` (this deployment runs governance without an evidence store), `closed` (the store is shutting down). **A rejected entry is not stored, and its `stored: false` outcome is the only signal** — a later grounded `/evaluate` will fail closed, so check the outcomes and make sure every key you populate is named by an `evidence.requires` rule.
+- **Idempotency.** Evidence is part of the `/audit` idempotency hash (order-independent): an identical retry replays cleanly, but the same `evaluation_id` with divergent evidence is `409 evaluation_conflict`.
+
 **Other responses:** `404 evaluation_unknown`, `404 evaluation_expired` (the decision aged out — see below), `409 approval_unresolved` (resolve the approval first; **retryable** with short backoff).
 
 ### The crash-TTL and TOCTOU caveats