From ac7ab99cb6e1790d0aa9814dac859804d2b97555 Mon Sep 17 00:00:00 2001 From: olivrg Date: Thu, 18 Jun 2026 16:06:57 +0100 Subject: [PATCH 1/2] feat(proxy): optional evidence payload on POST /audit (#11) MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Lets a hook-based adapter populate evidence-grounding facts on its single adapter-scoped token, instead of the SDK-scoped POST /evidence — the remaining core change for the OpenClaw adapter (#11). - governance-service: add `evidence` to AuditInput and write it via EvidenceStore on a successful, first-finalize audit, bound to the pending evaluation's own session_id/tool_name (the adapter never supplies them). Every per-entry failure is soft (reported, never request-fatal): caps are enforced here as soft-drops — `too_many` (>16) / `too_large` (>64 KiB) — and the store's `key_not_in_policy_allowlist` / `closed`, plus `no_session` / `evidence_unavailable`, all still finalize 201. Critically, `closed` is NOT the 503 the standalone /evidence route returns — losing the audit row for a call that already ran would be worse than a dropped evidence entry. - governance-api: add `evidence` to the auditBody schema (no route-level `.max()`, so caps stay soft-drops) and fold it into auditPayloadHash, order-independent (sorted by the full canonical tuple) so an identical retry replays cleanly while divergent evidence is a 409 conflict. Success-only, first-finalize-only (idempotent replays never re-write). Tests cover every soft-drop reason, the closed!=503 asymmetry over HTTP, the gross-body 413, and order-independent idempotency. 
--- .../proxy/src/sideband/governance-api.test.ts | 129 ++++++++++++++- packages/proxy/src/sideband/governance-api.ts | 33 ++++ .../src/sideband/governance-service.test.ts | 148 ++++++++++++++++++ .../proxy/src/sideband/governance-service.ts | 96 +++++++++++- 4 files changed, 404 insertions(+), 2 deletions(-) diff --git a/packages/proxy/src/sideband/governance-api.test.ts b/packages/proxy/src/sideband/governance-api.test.ts index 2c1e2ab..244ebf5 100644 --- a/packages/proxy/src/sideband/governance-api.test.ts +++ b/packages/proxy/src/sideband/governance-api.test.ts @@ -28,7 +28,9 @@ function setup(opts?: { const governance = opts?.withGovernance === false ? undefined - : new GovernanceService({ policy, sweepIntervalMs: 0 }) + : // Wire the same EvidenceStore into the service, mirroring production + // (cli.ts) so the /audit evidence path is exercised end-to-end. + new GovernanceService({ policy, sweepIntervalMs: 0, evidenceStore: store }) const app = createSidebandApp(store, { token: opts?.tokens ? SDK_TOKEN : undefined, @@ -253,4 +255,129 @@ describe('sideband governance routes', () => { expect(res.status).toBe(200) expect(((await res.json()) as { decision: string }).decision).toBe('allow') }) + + it('accepts an evidence array on /audit and writes it to the store', async () => { + const ctx = setup() + store = ctx.store + governance = ctx.governance ?? 
null + + const ev = await ctx.post('/evaluate', evalBody({ session_id: 'oc:s1' })) + const { evaluation_id } = (await ev.json()) as { evaluation_id: string } + const audit = await ctx.post('/audit', { + evaluation_id, + status: 'success', + evidence: [{ evidence_key: 'recipient', evidence_data: { to: 'a@b.com' } }], + }) + + expect(audit.status).toBe(201) + const json = (await audit.json()) as { evidence: { evidence_key: string; stored: boolean }[] } + expect(json.evidence).toEqual([{ evidence_key: 'recipient', stored: true }]) + expect(ctx.store.getEvidence('oc:s1', 'recipient')?.data).toEqual({ to: 'a@b.com' }) + }) + + it('does not reject an over-count evidence array at the route (no .max) — soft-drops in service', async () => { + const ctx = setup() + store = ctx.store + governance = ctx.governance ?? null + + const ev = await ctx.post('/evaluate', evalBody({ session_id: 'oc:s1' })) + const { evaluation_id } = (await ev.json()) as { evaluation_id: string } + const evidence = Array.from({ length: 17 }, (_, i) => ({ + evidence_key: `k${String(i)}`, + evidence_data: i, + })) + const audit = await ctx.post('/audit', { evaluation_id, status: 'success', evidence }) + + expect(audit.status).toBe(201) // NOT 400 — the route has no .max() refinement + const json = (await audit.json()) as { + evidence: { evidence_key: string; stored: boolean; reason?: string }[] + } + expect(json.evidence).toHaveLength(17) + expect(json.evidence[16]).toEqual({ evidence_key: 'k16', stored: false, reason: 'too_many' }) + }) + + it('treats divergent evidence under the same evaluation_id as a 409 conflict', async () => { + const ctx = setup() + store = ctx.store + governance = ctx.governance ?? 
null + + const ev = await ctx.post('/evaluate', evalBody({ session_id: 'oc:s1' })) + const { evaluation_id } = (await ev.json()) as { evaluation_id: string } + + const first = await ctx.post('/audit', { + evaluation_id, + status: 'success', + evidence: [{ evidence_key: 'k', evidence_data: 'v1' }], + }) + expect(first.status).toBe(201) + + const divergent = await ctx.post('/audit', { + evaluation_id, + status: 'success', + evidence: [{ evidence_key: 'k', evidence_data: 'v2' }], + }) + expect(divergent.status).toBe(409) + expect(((await divergent.json()) as { error: string }).error).toBe('evaluation_conflict') + }) + + it('treats identical evidence (any entry order) as an idempotent replay', async () => { + const ctx = setup() + store = ctx.store + governance = ctx.governance ?? null + + const ev = await ctx.post('/evaluate', evalBody({ session_id: 'oc:s1' })) + const { evaluation_id } = (await ev.json()) as { evaluation_id: string } + const a = { evidence_key: 'a', evidence_data: 1 } + const b = { evidence_key: 'b', evidence_data: 2 } + + const first = await ctx.post('/audit', { evaluation_id, status: 'success', evidence: [a, b] }) + expect(first.status).toBe(201) + + // Same entries, reversed order → must hash identically (sorted tuple). + const replay = await ctx.post('/audit', { evaluation_id, status: 'success', evidence: [b, a] }) + expect(replay.status).toBe(200) + expect(((await replay.json()) as { already_finalized: boolean }).already_finalized).toBe(true) + }) + + it('soft-fails evidence with reason closed over HTTP — /audit stays 201, never 503', async () => { + const ctx = setup() + store = ctx.store + governance = ctx.governance ?? 
null + + const ev = await ctx.post('/evaluate', evalBody({ session_id: 'oc:s1' })) + const { evaluation_id } = (await ev.json()) as { evaluation_id: string } + ctx.store.close() // standalone /evidence would 503; /audit must not inherit that + + const audit = await ctx.post('/audit', { + evaluation_id, + status: 'success', + evidence: [{ evidence_key: 'k', evidence_data: 'x' }], + }) + expect(audit.status).toBe(201) + const json = (await audit.json()) as { evidence: { evidence_key: string; reason?: string }[] } + expect(json.evidence).toEqual([{ evidence_key: 'k', stored: false, reason: 'closed' }]) + }) + + it('rejects an over-1MiB /audit body with 413 (gross body limit, not an evidence cap)', async () => { + const ctx = setup() + store = ctx.store + governance = ctx.governance ?? null + + const ev = await ctx.post('/evaluate', evalBody({ session_id: 'oc:s1' })) + const { evaluation_id } = (await ev.json()) as { evaluation_id: string } + + const huge = 'x'.repeat(1_200 * 1024) // ~1.2 MiB > the 1 MiB sideband body limit + const body = JSON.stringify({ + evaluation_id, + status: 'success', + evidence: [{ evidence_key: 'k', evidence_data: huge }], + }) + // Set content-length explicitly — the real Node server sets it, and hono's + // bodyLimit checks it to fail fast with 413 before the body is parsed. 
+ const res = await ctx.post('/audit', body, { + 'content-length': String(Buffer.byteLength(body)), + }) + expect(res.status).toBe(413) + expect(((await res.json()) as { error: string }).error).toBe('request_body_too_large') + }) }) diff --git a/packages/proxy/src/sideband/governance-api.ts b/packages/proxy/src/sideband/governance-api.ts index f93d45d..2b6d52a 100644 --- a/packages/proxy/src/sideband/governance-api.ts +++ b/packages/proxy/src/sideband/governance-api.ts @@ -54,6 +54,12 @@ const installScanBody = z.object({ metadata: metadataSchema, }) +const evidenceEntrySchema = z.object({ + evidence_key: z.string().min(1), + evidence_data: z.unknown().refine((v) => v !== undefined, { message: 'Required' }), + ttl_seconds: z.number().int().positive().optional(), +}) + const auditBody = z.object({ evaluation_id: z.string().min(1), status: z.enum(['success', 'error', 'not_executed']), @@ -61,6 +67,10 @@ const auditBody = z.object({ duration_ms: z.number().optional(), result: z.unknown().optional(), actual_amount: z.number().optional(), + // No `.max()` / size refinement here on purpose (issue #11): caps are + // enforced per-entry in GovernanceService.populateEvidence as soft-drops, so + // an over-cap entry never 400s away the audit row for a call that already ran. + evidence: z.array(evidenceEntrySchema).optional(), }) const resolveBody = z.object({ @@ -207,10 +217,33 @@ function auditPayloadHash(data: z.infer): string { duration_ms: data.duration_ms ?? null, result: data.result ?? null, actual_amount: data.actual_amount ?? null, + evidence: canonicalEvidence(data.evidence), } return createHash('sha256').update(canonicalize(semantic)).digest('hex') } +/** + * Order-independent canonical form of the submitted evidence array, for the + * idempotency hash (issue #11). 
Entries are normalized (defaults + * filled) and sorted by their full canonical tuple — `(evidence_key, + * evidence_data, ttl_seconds)` — so a retry that sends the same facts in a + * different array order hashes identically, while any divergence in key, data, + * or ttl is a `409 evaluation_conflict`. Hashes the *submitted* payload, so the + * result is independent of which entries the service later soft-drops. + */ +function canonicalEvidence(evidence: z.infer['evidence']): unknown { + if (!evidence || evidence.length === 0) return null + return evidence + .map((e) => ({ + evidence_key: e.evidence_key, + evidence_data: e.evidence_data ?? null, + ttl_seconds: e.ttl_seconds ?? null, + })) + .map((norm) => ({ sortKey: canonicalize(norm), norm })) + .sort((a, b) => (a.sortKey < b.sortKey ? -1 : a.sortKey > b.sortKey ? 1 : 0)) + .map((x) => x.norm) +} + /** Narrow a numeric status to Hono's accepted status type. */ function asStatus(status: number): 200 | 201 | 400 | 404 | 409 | 413 | 503 { return status as 200 | 201 | 400 | 404 | 409 | 413 | 503 diff --git a/packages/proxy/src/sideband/governance-service.test.ts b/packages/proxy/src/sideband/governance-service.test.ts index 65e86ae..a35e412 100644 --- a/packages/proxy/src/sideband/governance-service.test.ts +++ b/packages/proxy/src/sideband/governance-service.test.ts @@ -927,3 +927,151 @@ describe('GovernanceService.resolveApproval', () => { router.close() }) }) + +// --------------------------------------------------------------------------- +// /audit evidence population (issue #11) +// --------------------------------------------------------------------------- + +describe('audit evidence', () => { + type Outcome = { evidence_key: string; stored: boolean; reason?: string } + + function evalAudit( + harness: ServiceHarness, + evidence: AuditInput['evidence'], + opts?: { status?: AuditInput['status']; sessionId?: string | null; hash?: string }, + ) { + const ev = harness.service.evaluate( + evalInput({ session_id: 
opts?.sessionId === undefined ? 'oc:s1' : opts.sessionId }), + ) + const id = ev.body['evaluation_id'] as string + const res = harness.service.audit( + auditInput(id, { status: opts?.status ?? 'success', evidence }), + opts?.hash ?? 'h', + ) + return { id, res, outcomes: res.body['evidence'] as Outcome[] | undefined } + } + + it('writes evidence to the store on a successful audit and reports stored:true', () => { + const h = makeService({ withEvidence: true }) + const { res, outcomes } = evalAudit(h, [ + { evidence_key: 'recipient', evidence_data: { to: 'a@b.com' } }, + ]) + + expect(res.status).toBe(201) + expect(outcomes).toEqual([{ evidence_key: 'recipient', stored: true }]) + expect(h.evidenceStore?.getEvidence('oc:s1', 'recipient')?.data).toEqual({ to: 'a@b.com' }) + expect(h.evidenceStore?.getEvidence('oc:s1', 'recipient')?.tool_name).toBe('send') + }) + + it('ignores evidence when the call errored (success-only)', () => { + const h = makeService({ withEvidence: true }) + const { res, outcomes } = evalAudit(h, [{ evidence_key: 'recipient', evidence_data: 'x' }], { + status: 'error', + }) + + expect(res.status).toBe(201) + expect(outcomes).toBeUndefined() + expect(h.evidenceStore?.hasEvidence('oc:s1', 'recipient')).toBe(false) + }) + + it('ignores evidence when status is not_executed', () => { + const h = makeService({ withEvidence: true }) + const { outcomes } = evalAudit(h, [{ evidence_key: 'recipient', evidence_data: 'x' }], { + status: 'not_executed', + }) + + expect(outcomes).toBeUndefined() + expect(h.evidenceStore?.hasEvidence('oc:s1', 'recipient')).toBe(false) + }) + + it('soft-fails an allowlist-rejected key but still finalizes 201', () => { + const h = makeService({ withEvidence: true }) + h.evidenceStore?.setAllowedEvidenceKeys(['allowed']) + const { res, outcomes } = evalAudit(h, [{ evidence_key: 'blocked', evidence_data: 'x' }]) + + expect(res.status).toBe(201) + expect(outcomes).toEqual([ + { evidence_key: 'blocked', stored: false, reason: 
'key_not_in_policy_allowlist' }, + ]) + expect(h.evidenceStore?.hasEvidence('oc:s1', 'blocked')).toBe(false) + }) + + it('soft-drops an oversized entry with reason too_large (no 413)', () => { + const h = makeService({ withEvidence: true }) + const big = 'x'.repeat(70 * 1024) + const { res, outcomes } = evalAudit(h, [{ evidence_key: 'big', evidence_data: big }]) + + expect(res.status).toBe(201) + expect(outcomes).toEqual([{ evidence_key: 'big', stored: false, reason: 'too_large' }]) + expect(h.evidenceStore?.hasEvidence('oc:s1', 'big')).toBe(false) + }) + + it('soft-drops entries beyond MAX_EVIDENCE_ENTRIES with reason too_many', () => { + const h = makeService({ withEvidence: true }) + const entries = Array.from({ length: 18 }, (_, i) => ({ + evidence_key: `k${String(i)}`, + evidence_data: i, + })) + const { res, outcomes } = evalAudit(h, entries) + + expect(res.status).toBe(201) + expect(outcomes).toHaveLength(18) + expect(outcomes?.slice(0, 16).every((o) => o.stored)).toBe(true) + expect(outcomes?.slice(16)).toEqual([ + { evidence_key: 'k16', stored: false, reason: 'too_many' }, + { evidence_key: 'k17', stored: false, reason: 'too_many' }, + ]) + expect(h.evidenceStore?.hasEvidence('oc:s1', 'k15')).toBe(true) + expect(h.evidenceStore?.hasEvidence('oc:s1', 'k16')).toBe(false) + }) + + it('soft-fails with reason no_session when the evaluation has no session', () => { + const h = makeService({ withEvidence: true }) + const { res, outcomes } = evalAudit(h, [{ evidence_key: 'k', evidence_data: 'x' }], { + sessionId: null, + }) + + expect(res.status).toBe(201) + expect(outcomes).toEqual([{ evidence_key: 'k', stored: false, reason: 'no_session' }]) + }) + + it('soft-fails with reason evidence_unavailable when the service has no evidence store', () => { + const h = makeService() // no withEvidence → evidenceStore undefined + const { res, outcomes } = evalAudit(h, [{ evidence_key: 'k', evidence_data: 'x' }]) + + expect(res.status).toBe(201) + expect(outcomes).toEqual([{ 
evidence_key: 'k', stored: false, reason: 'evidence_unavailable' }]) + }) + + it('soft-fails with reason closed when the store is shutting down — still 201, never 503', () => { + const h = makeService({ withEvidence: true }) + h.evidenceStore?.close() + const { res, outcomes } = evalAudit(h, [{ evidence_key: 'k', evidence_data: 'x' }]) + + expect(res.status).toBe(201) + expect(outcomes).toEqual([{ evidence_key: 'k', stored: false, reason: 'closed' }]) + }) + + it('does not re-write evidence on an idempotent replay', () => { + const h = makeService({ withEvidence: true }) + const { id } = evalAudit(h, [{ evidence_key: 'k', evidence_data: 'v' }]) + const firstExpiry = h.evidenceStore?.getEvidence('oc:s1', 'k')?.expires_at + + h.advance(10_000) + const replay = h.service.audit( + auditInput(id, { evidence: [{ evidence_key: 'k', evidence_data: 'v' }] }), + 'h', + ) + + expect(replay.status).toBe(200) + expect(replay.body['already_finalized']).toBe(true) + // No re-write → TTL/expiry unchanged despite the 10s advance. + expect(h.evidenceStore?.getEvidence('oc:s1', 'k')?.expires_at).toBe(firstExpiry) + }) + + it('is a no-op (no outcomes) when no evidence is supplied', () => { + const h = makeService({ withEvidence: true }) + const { outcomes } = evalAudit(h, undefined) + expect(outcomes).toBeUndefined() + }) +}) diff --git a/packages/proxy/src/sideband/governance-service.ts b/packages/proxy/src/sideband/governance-service.ts index 4fa0f6c..0e1a8aa 100644 --- a/packages/proxy/src/sideband/governance-service.ts +++ b/packages/proxy/src/sideband/governance-service.ts @@ -49,6 +49,11 @@ const MAX_PENDING_BYTES = 64 * 1_024 * 1_024 // Capped here in the service (NOT in the limiters) so the evaluate/audit split can // reserve pre-execution and the MCP door is never gated (issue #13). const MAX_SENDER_KEYS = 50_000 +// Optional /audit evidence payload (issue #11). 
Caps are enforced in +// populateEvidence (NOT route validation) so an over-cap entry soft-drops +// without discarding the audit row for a call that already ran. +const MAX_EVIDENCE_ENTRIES = 16 +const MAX_EVIDENCE_BYTES = 64 * 1_024 const SWEEP_INTERVAL_MS = 30_000 @@ -99,6 +104,19 @@ export interface InstallScanInput { readonly metadata: Record | null } +export interface AuditEvidenceInput { + readonly evidence_key: string + readonly evidence_data: unknown + readonly ttl_seconds?: number +} + +/** Per-entry outcome reported back for each submitted evidence entry. */ +export interface AuditEvidenceOutcome { + readonly evidence_key: string + readonly stored: boolean + readonly reason?: string +} + export interface AuditInput { readonly evaluation_id: string readonly status: 'success' | 'error' | 'not_executed' @@ -106,6 +124,12 @@ export interface AuditInput { readonly duration_ms?: number readonly result?: unknown readonly actual_amount?: number + /** + * Optional evidence to populate on a successfully-audited call (issue #11). Adapter-scoped, single-token evidence write: bound to the pending + * evaluation's session/tool, success-only, first-finalize-only. Every + * per-entry failure is soft (reported, never request-fatal) — see audit(). + */ + readonly evidence?: ReadonlyArray } export interface ResolveApprovalInput { @@ -546,6 +570,13 @@ export class GovernanceService { this.evidenceStore.recordToolCall(entry.sessionId, entry.toolName, req.status === 'success') } + // Populate evidence (issue #11). Success-only, and first-finalize + // only — we are past every tombstone replay return above, so this never + // re-writes on a replay. Every per-entry failure is soft (reported below, + // never request-fatal): losing the audit row for a call that already ran + // would be worse than a dropped evidence entry. 
+ const evidenceOutcomes = this.populateEvidence(req, entry) + const auditId = this.writeAudit({ timestampIso: entry.timestampIso, origin: entry.origin, @@ -577,7 +608,70 @@ export class GovernanceService { expiresAtMs: this.now() + this.ttlMs, }) - return { status: 201, body: { ok: true, audit_record_id: auditId } } + const body: Record = { ok: true, audit_record_id: auditId } + if (evidenceOutcomes) body['evidence'] = evidenceOutcomes + return { status: 201, body } + } + + /** + * Write the optional `/audit` evidence entries for a successful call + * (issue #11), returning a per-entry outcome list — or `undefined` + * when there is nothing to report (non-success status, or no evidence + * supplied). Caps are enforced here, NOT in route validation, so an over-cap + * entry soft-drops without discarding the audit row: entries past + * `MAX_EVIDENCE_ENTRIES` → `too_many`; oversized `evidence_data` → + * `too_large`; no evidence store on the service → `evidence_unavailable`; + * a sessionless evaluation → `no_session`; the store's own rejections + * (`key_not_in_policy_allowlist`, `closed`) pass through as the per-entry + * reason. None of these fail the audit. + */ + private populateEvidence( + req: AuditInput, + entry: PendingEvaluation, + ): AuditEvidenceOutcome[] | undefined { + if (req.status !== 'success' || !req.evidence || req.evidence.length === 0) { + return undefined + } + const outcomes: AuditEvidenceOutcome[] = [] + for (let i = 0; i < req.evidence.length; i++) { + const e = req.evidence[i] + if (!e) continue + if (i >= MAX_EVIDENCE_ENTRIES) { + outcomes.push({ evidence_key: e.evidence_key, stored: false, reason: 'too_many' }) + continue + } + const bytes = Buffer.byteLength(canonicalize(e.evidence_data ?? 
null), 'utf8') + if (bytes > MAX_EVIDENCE_BYTES) { + outcomes.push({ evidence_key: e.evidence_key, stored: false, reason: 'too_large' }) + continue + } + if (!this.evidenceStore) { + // Governance enabled without an evidence store (evidence-only-disabled + // deployment) — distinct from a call that simply has no session. + outcomes.push({ + evidence_key: e.evidence_key, + stored: false, + reason: 'evidence_unavailable', + }) + continue + } + if (!entry.sessionId) { + outcomes.push({ evidence_key: e.evidence_key, stored: false, reason: 'no_session' }) + continue + } + const result = this.evidenceStore.putEvidence(entry.sessionId, { + evidence_key: e.evidence_key, + data: e.evidence_data, + tool_name: entry.toolName, + ttl_seconds: e.ttl_seconds, + }) + outcomes.push( + result.stored + ? { evidence_key: e.evidence_key, stored: true } + : { evidence_key: e.evidence_key, stored: false, reason: result.reason }, + ) + } + return outcomes } // ------------------------------------------------------------------------- From cd11a52753440140d101bf3adcd424f106f1b11c Mon Sep 17 00:00:00 2001 From: olivrg Date: Thu, 18 Jun 2026 16:08:04 +0100 Subject: [PATCH 2/2] docs(adapter-api): document the /audit evidence field (#11) MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit - Add a "Populating evidence" section: array shape, success-only, first-finalize-only, per-entry soft-fail outcomes (and their reasons), and order-independent idempotency. - Rewrite the Authentication note that claimed "an adapter cannot write evidence" — it now describes the narrow, scoped capability the adapter gains via /audit (bound to its own session/tool, allowlist-enforced), without the SDK-scoped /evidence route. - Fix the summary-table /install-scan line that still read "observational" to reflect the deny_install enforcement shipped in #13. 
--- docs/adapter-api.md | 38 +++++++++++++++++++++++++++++--------- 1 file changed, 29 insertions(+), 9 deletions(-) diff --git a/docs/adapter-api.md b/docs/adapter-api.md index 7ed8341..df5f60b 100644 --- a/docs/adapter-api.md +++ b/docs/adapter-api.md @@ -4,12 +4,12 @@ The governance API lives on the **SDK sideband** — the local server on `127.0.0.1:3200` (configurable via `sdk.*`), the same server the Python SDK uses for evidence/context. It is **not** the dashboard sideband (`:3100`, documented in [Sideband API Reference](./sideband-api.md)); the two are different servers with different jobs. Endpoints here: -| Route | Purpose | -| ---------------------------- | ------------------------------------------------------------------------------ | -| `POST /evaluate` | Decide a tool call. **Side-effect-free** on rate/spend counters. | -| `POST /audit` | Record the outcome of an evaluated call; **consumes** counters. Idempotent. | -| `POST /install-scan` | Evaluate a package/skill install. Observational until install-time rules ship. | -| `POST /approval/:id/resolve` | Record the resolution of a natively-handled approval. | +| Route | Purpose | +| ---------------------------- | ---------------------------------------------------------------------------------------------- | +| `POST /evaluate` | Decide a tool call. **Side-effect-free** on rate/spend counters. | +| `POST /audit` | Record the outcome of an evaluated call; **consumes** counters. Idempotent. | +| `POST /install-scan` | Evaluate a package/skill install against `policies.install` (observational when none defined). | +| `POST /approval/:id/resolve` | Record the resolution of a natively-handled approval. | ## Why this exists, and what it does not promise @@ -36,7 +36,7 @@ An adapter built on this API **MUST**: ## Authentication -The governance routes require `Authorization: Bearer `. 
This is a **separate token** from the SDK's `HELIO_SDK_TOKEN`: an SDK client cannot drive policy decisions, and an adapter cannot write evidence. Both are generated per boot (and printed to stderr) unless set in the environment. Requests carrying an `Origin` header are refused (browser-forgery guard), and bodies over 1 MiB are rejected with 413. +The governance routes require `Authorization: Bearer `. This is a **separate token** from the SDK's `HELIO_SDK_TOKEN`: an SDK client cannot drive policy decisions, and an adapter cannot call the SDK's `POST /evidence`/`/context` routes. The adapter's evidence access is deliberately narrow: it may attach evidence to a call it is auditing, via the optional `evidence` field on `POST /audit` (success-only, bound to that evaluation's own session/tool, subject to the policy allowlist — see [Populating evidence](#populating-evidence)); it cannot write arbitrary evidence to arbitrary sessions. Both tokens are generated per boot (and printed to stderr) unless set in the environment. Requests carrying an `Origin` header are refused (browser-forgery guard), and bodies over 1 MiB are rejected with 413. If you embed `GovernanceService` directly (instead of running `helio start`), wire an `ApprovalRouter` whenever the policy can emit `require_approval` (explicit rules, `flag_destructive: require_approval`, or `on_tool_drift: require_approval`), otherwise construction and hot-reload fail closed by throwing `GovernanceConfigError` (exported from `@gethelio/proxy`). 
@@ -90,11 +90,20 @@ The `decision` is an **outcome**, not Helio's internal rule action: a `rate_limi "error": "…", // optional, when status == "error" "duration_ms": 412, // optional "result": { }, // optional outcome summary - "actual_amount": 0.42 // optional, finite ≥0 — true post-execution spend; overrides the arg-derived amount + "actual_amount": 0.42, // optional, finite ≥0 — true post-execution spend; overrides the arg-derived amount + "evidence": [ // optional — see "Populating evidence" below + { "evidence_key": "recipient", "evidence_data": { "to": "a@b.com" }, "ttl_seconds": 300 } + ] } // Response 201 (fresh) — replays return 200 -{ "ok": true, "audit_record_id": "…" } +{ + "ok": true, + "audit_record_id": "…", + "evidence": [ // present only when status was "success" and the request carried evidence + { "evidence_key": "recipient", "stored": true } + ] +} ``` Counters are consumed here (not at `/evaluate`), and only when the call actually ran (`success`/`error`, not `not_executed`). `/audit` is **idempotent on `evaluation_id`**: an identical replay returns `200 { already_finalized: true }` with no double-consumption, so a network retry after a lost response is safe. A different payload under the same id is an adapter bug → `409 evaluation_conflict`. @@ -103,6 +112,17 @@ Counters are consumed here (not at `/evaluate`), and only when the call actually `actual_amount` must be finite and `>= 0` (`400 invalid_actual_amount` otherwise) and only applies to evaluations whose decision carried a spend rule (`400 no_spend_rule` if sent for any other evaluation). +### Populating evidence + +The optional `evidence` array lets an adapter ground a call's outcome — e.g. recording the recipient a `send` tool actually resolved — so a later [evidence-grounded rule](./policies.md) (`evidence.requires`) can enforce on it. This is the **only** way the adapter token writes evidence; the SDK-scoped `POST /evidence` route is not available to it (see [Authentication](#authentication)). 
Each entry is `{ evidence_key, evidence_data, ttl_seconds? }`. The proxy binds the write to the **pending evaluation's own** `session_id` and `tool_name` — an adapter cannot target another session — and stores it via the same evidence store the SDK path uses. + +Rules: + +- **Success-only.** Evidence is written only when `status: "success"`. On `error`/`not_executed` it is ignored and the response carries no `evidence` array (a failed tool must not ground later calls). +- **First-finalize-only.** Evidence is written once, on the first `/audit`; idempotent replays never re-write (no TTL reset). +- **Every per-entry failure is soft — never request-fatal.** An evidence failure never changes the audit's outcome — a fresh audit still finalizes `201`; per-entry outcomes are reported in the response `evidence` array as `{ evidence_key, stored, reason? }`. Reasons: `key_not_in_policy_allowlist` (the key is not named by any `evidence.requires` rule), `too_large` (`evidence_data` over 64 KiB, measured as canonical JSON in UTF-8 bytes), `too_many` (more than 16 entries — only the first 16 in array order are attempted; the rest are dropped), `no_session` (the evaluation had no `session_id`), `evidence_unavailable` (this deployment runs governance without an evidence store), `closed` (the store is shutting down). **A rejected entry is not stored and the request still succeeds**, so check each entry's `stored` flag: a later grounded `/evaluate` that depends on it will fail closed — make sure every key you populate is named by an `evidence.requires` rule. +- **Idempotency.** Evidence is part of the `/audit` idempotency hash (order-independent): an identical retry replays cleanly, but the same `evaluation_id` with divergent evidence is `409 evaluation_conflict`. + **Other responses:** `404 evaluation_unknown`, `404 evaluation_expired` (the decision aged out — see below), `409 approval_unresolved` (resolve the approval first; **retryable** with short backoff). ### The crash-TTL and TOCTOU caveats