From 8938899abd6848b1615c74a674748faee149e498 Mon Sep 17 00:00:00 2001 From: Claude Date: Thu, 18 Jun 2026 21:18:55 +0000 Subject: [PATCH 01/15] feat: structured server-log entries + logger (#178 phase 1) Reshape the admin Server Logs pipeline from raw `lines: string[]` to structured `entries: LogEntry[]` ({ ts, level, msg }) end-to-end, with no new data sources yet (Phase 1 of #178). - contracts: add LogLevel + LogEntry; ServerLogsResponse now carries `entries` instead of `lines` (wire type lives only in contracts/). - backend: logRing holds LogEntry[] via two no-double-capture paths (stdout/stderr tee fallback + pushEntry); new log.ts structured logger writes through the saved original stream so the tee never re-captures it; server.ts banners go through the logger; the misleading `http://localhost:PORT` boot banner now states the port + all-interfaces. - duckdb-service + image-service: symmetric log_ring upgrade (entry dicts, level-aware tee, log_event) and /logs returns `entries`. - homepage: ServerLogsPanel renders timestamp + level badge + msg, color-coded by level. - tests + api-reference/api-contracts docs updated for the new wire shape. Co-Authored-By: Claude Opus 4.8 (1M context) Claude-Session: https://claude.ai/code/session_01HU7JHMbvtUKRfZY98qeWdK --- backend/src/app.ts | 8 +- backend/src/log.ts | 34 ++++++ backend/src/logRing.ts | 90 +++++++++++----- backend/src/server.ts | 24 +++-- backend/tests/admin-server-logs.test.ts | 36 +++++-- backend/tests/log.test.ts | 33 ++++++ backend/tests/logRing.test.ts | 62 +++++++---- contracts/src/index.ts | 36 +++++-- .../08-crosscutting-concepts/api-contracts.md | 28 +++-- docs/api-reference.md | 28 ++--- duckdb-service/routes/logs.py | 4 +- duckdb-service/services/log_ring.py | 100 +++++++++++++----- duckdb-service/tests/test_logs.py | 76 +++++++++---- .../src/__tests__/ServerLogsPanel.test.tsx | 42 ++++++-- homepage/src/components/ServerLogsPanel.tsx | 60 ++++++++--- homepage/src/services/api.ts | 2 +- image-service/app.py | 4 +- image-service/services/log_ring.py | 100 +++++++++++++----- image-service/tests/test_logs.py | 72 +++++++++---- tests/ui/tests/admin-server-logs.spec.ts | 8 +- 20 files changed, 622 insertions(+), 225 deletions(-) create mode 100644 backend/src/log.ts create mode 100644 backend/tests/log.test.ts diff --git a/backend/src/app.ts b/backend/src/app.ts index 7e987f6..859810f 100644 --- a/backend/src/app.ts +++ b/backend/src/app.ts @@ -5,7 +5,7 @@ import { tryParseModuleId } from '@highfive/contracts'; import type { ServerLogsResponse } from '@highfive/contracts'; import { db } from './database'; import { verifyApiKey, getApiKey } from './auth'; -import { getRecentLogLines } from './logRing'; +import { getRecentEntries } from './logRing'; import { SESSION_COOKIE, issueSessionToken, @@ -604,8 +604,8 @@ app.get('/api/admin/logs', requireAdmin, async (req, res) => { : LOG_LINES_DEFAULT; if (service === 'backend') { - const { lines: out, truncated } = getRecentLogLines(lines); - const payload: ServerLogsResponse = { service: 'backend', lines: out, truncated }; + const { entries, truncated } = getRecentEntries(lines); + const payload: ServerLogsResponse = { service: 'backend', entries, truncated }; res.json(payload); return; } @@ -626,7 +626,7 @@ app.get('/api/admin/logs', requireAdmin, async (req, res) => { // Don't forward a drifted wire shape typed as valid: a service that // changed its /logs envelope should surface as a clear 502, not as // `undefined` fields reaching the UI. - if (!payload || typeof payload.service !== 'string' || !Array.isArray(payload.lines)) { + if (!payload || typeof payload.service !== 'string' || !Array.isArray(payload.entries)) { res.status(502).json({ error: `malformed logs response from ${service}` }); return; } diff --git a/backend/src/log.ts b/backend/src/log.ts new file mode 100644 index 0000000..46d2edc --- /dev/null +++ b/backend/src/log.ts @@ -0,0 +1,34 @@ +// Structured logger for the backend (#178). +// +// Each call pushes a `LogEntry` straight into the in-memory ring AND writes a +// formatted human line to the *saved original* stdout/stderr (via logRing's +// `writeStdout`/`writeStderr`), so: +// - the admin Server Logs panel sees a structured `{ ts, level, msg }`, and +// - `docker logs` / PM2 see a readable line, exactly once (the original +// writer bypasses the tee, so the line is not re-captured as a duplicate +// entry). +// +// New call sites — the access-log middleware and converted boot banners — use +// this. Stray `console.*` and third-party output still land in the ring via +// the tee fallback (see logRing.ts). +// +// SECURITY: never pass secrets, auth headers, request bodies, or the admin +// password to these functions — entries are admin-readable and (ADR-022) +// persisted to disk. See accessLog.ts for the redaction rules on request data. + +import type { LogEntry, LogLevel } from '@highfive/contracts'; +import { pushEntry, writeStderr, writeStdout } from './logRing'; + +function emit(level: LogLevel, msg: string): void { + const entry: LogEntry = { ts: new Date().toISOString(), level, msg }; + pushEntry(entry); + const line = `${entry.ts} ${level.toUpperCase()} ${msg}\n`; + if (level === 'error') writeStderr(line); + else writeStdout(line); +} + +export const log = { + info: (msg: string): void => emit('info', msg), + warn: (msg: string): void => emit('warn', msg), + error: (msg: string): void => emit('error', msg), +}; diff --git a/backend/src/logRing.ts b/backend/src/logRing.ts index 947375a..c8ba258 100644 --- a/backend/src/logRing.ts +++ b/backend/src/logRing.ts @@ -1,32 +1,68 @@ -// In-memory ring of the backend's own recent stdout/stderr lines (#171). +// In-memory ring of the backend's own recent log entries (#171, #178). // -// A stdout/stderr *tee*: every write is recorded into a bounded ring AND -// passed through to the real stream, so `docker logs` / PM2 still see -// everything unchanged. This captures `console.*` (which write to -// stdout/stderr) and any direct `process.stdout.write` — i.e. exactly what an -// operator sees in the container log today. Same idea as the ESP `logbuf` -// ring. `GET /api/admin/logs?service=backend` reads this directly. +// Two ingestion paths feed the same bounded ring of structured `LogEntry` +// values, with no double-capture: +// +// 1. A stdout/stderr *tee*: every write that is NOT produced by the +// structured logger is recorded as an entry AND passed through to the +// real stream, so `docker logs` / PM2 still see everything unchanged. +// This captures stray `console.*` and third-party output. Lines are +// wrapped as `{ ts: now, level: stream === 'err' ? 'error' : 'info', msg }`. +// 2. The structured logger (`log.ts`) calls `pushEntry()` directly to store +// a `LogEntry`, then writes its formatted human line to the *saved +// original* stream reference — bypassing the tee so its own output is not +// re-captured as a second entry. +// +// `GET /api/admin/logs?service=backend` reads this directly via +// `getRecentEntries`. // // Caveats (see ADR-021): in-memory, so it resets on process restart (only -// holds lines since startup) and is per-process (a future multi-worker prod -// would have one ring per worker). +// holds entries since startup) and is per-process (a future multi-worker prod +// would have one ring per worker). On-disk persistence + startup backfill +// (ADR-022) is layered on separately. -const MAX_RING_LINES = 2000; +import type { LogEntry, LogLevel } from '@highfive/contracts'; -const ring: string[] = []; +const MAX_RING_ENTRIES = 2000; + +const ring: LogEntry[] = []; let installed = false; +// The real stream writers, captured before the tee replaces them. The +// structured logger writes through these so its output reaches the terminal / +// docker logs without being re-captured by the tee. +let originalStdoutWrite: ((...a: unknown[]) => boolean) | null = null; +let originalStderrWrite: ((...a: unknown[]) => boolean) | null = null; + // A write may not end on a newline, so hold the trailing fragment per stream // until the next write completes the line. const carry: Record<'out' | 'err', string> = { out: '', err: '' }; -function pushLine(line: string): void { - ring.push(line); - if (ring.length > MAX_RING_LINES) { - ring.splice(0, ring.length - MAX_RING_LINES); +function pushEntryInternal(entry: LogEntry): void { + ring.push(entry); + if (ring.length > MAX_RING_ENTRIES) { + ring.splice(0, ring.length - MAX_RING_ENTRIES); } } +/** + * Append a fully-formed structured entry to the ring. Used by the structured + * logger (`log.ts`); does not touch any stream. + */ +export function pushEntry(entry: LogEntry): void { + pushEntryInternal(entry); +} + +/** The saved real `process.stdout.write`, or the live one if the tee isn't installed yet. */ +export function writeStdout(text: string): boolean { + return (originalStdoutWrite ?? process.stdout.write.bind(process.stdout))(text); +} + +/** The saved real `process.stderr.write`, or the live one if the tee isn't installed yet. */ +export function writeStderr(text: string): boolean { + return (originalStderrWrite ?? process.stderr.write.bind(process.stderr))(text); +} + function record(which: 'out' | 'err', chunk: unknown): void { const text = typeof chunk === 'string' ? chunk : Buffer.isBuffer(chunk) ? chunk.toString('utf8') : ''; @@ -35,12 +71,16 @@ function record(which: 'out' | 'err', chunk: unknown): void { const parts = combined.split('\n'); // The last element is the (possibly empty) incomplete tail — carry it over. carry[which] = parts.pop() ?? ''; - for (const line of parts) pushLine(line); + const level: LogLevel = which === 'err' ? 'error' : 'info'; + for (const line of parts) { + pushEntryInternal({ ts: new Date().toISOString(), level, msg: line }); + } } /** - * Wrap process.stdout/stderr `write` so each completed line is also stored in - * the ring. Idempotent. Call once, as early as possible at process start. + * Wrap process.stdout/stderr `write` so each completed line printed outside the + * structured logger is also stored in the ring. Idempotent. Call once, as + * early as possible at process start. */ export function installLogRing(): void { if (installed) return; @@ -49,7 +89,9 @@ export function installLogRing(): void { [process.stdout, 'out'], [process.stderr, 'err'], ] as const) { - const original = stream.write.bind(stream); + const original = stream.write.bind(stream) as (...a: unknown[]) => boolean; + if (which === 'out') originalStdoutWrite = original; + else originalStderrWrite = original; stream.write = ((chunk: unknown, ...args: unknown[]): boolean => { try { record(which, chunk); @@ -57,19 +99,19 @@ export function installLogRing(): void { // Capture must never break real logging. } // Preserve the original variadic (chunk, encoding?, cb?) signature. - return (original as (...a: unknown[]) => boolean)(chunk, ...args); + return original(chunk, ...args); }) as typeof stream.write; } } /** - * The most recent `n` captured lines, chronological (oldest→newest), and + * The most recent `n` captured entries, chronological (oldest→newest), and * whether the ring held more than were returned. */ -export function getRecentLogLines(n: number): { lines: string[]; truncated: boolean } { +export function getRecentEntries(n: number): { entries: LogEntry[]; truncated: boolean } { const count = Math.max(0, Math.min(n, ring.length)); - const lines = count === 0 ? [] : ring.slice(ring.length - count); - return { lines, truncated: ring.length > lines.length }; + const entries = count === 0 ? [] : ring.slice(ring.length - count); + return { entries, truncated: ring.length > entries.length }; } /** Test-only: clear the ring and partial-line carry. */ diff --git a/backend/src/server.ts b/backend/src/server.ts index 8fbfa06..728620e 100644 --- a/backend/src/server.ts +++ b/backend/src/server.ts @@ -3,6 +3,7 @@ import { app } from './app'; import { getApiKey } from './auth'; import { duckdbHealth } from './duckdbClient'; import { isProduction } from './env'; +import { log } from './log'; import { installLogRing } from './logRing'; import { DEFAULT_PORT, resolvePort } from './port'; @@ -14,7 +15,7 @@ installLogRing(); const { port: PORT, warned: portUnsetWarning } = resolvePort(process.env.PORT); if (portUnsetWarning) { - console.warn( + log.warn( `[startup] PORT env var unset or non-numeric — defaulting to ${DEFAULT_PORT}. ` + `Set PORT explicitly in production. See docker-compose.yml for the dev convention.`, ); @@ -23,22 +24,25 @@ if (portUnsetWarning) { async function bootstrap() { try { const health = await duckdbHealth(); - console.log('🗄 DuckDB service reachable:', health); + log.info(`🗄 DuckDB service reachable: ${JSON.stringify(health)}`); } catch (err) { - console.warn('⚠ DuckDB service not reachable:', err); + log.warn(`⚠ DuckDB service not reachable: ${String(err)}`); } app.listen(PORT, () => { - console.log(`🐝 HighFive Backend API running on http://localhost:${PORT}`); + // Don't say "http://localhost" — the process binds all interfaces and on + // prod is reached via nginx, so the localhost prefix is misleading in the + // admin log panel (#178). State the port instead. + log.info(`🐝 HighFive Backend API listening on port ${PORT} (all interfaces)`); // Never print the configured API key in production - it would land - // in Docker logs and any aggregator collecting them. Dev/test only. - // `isProduction()` normalises NODE_ENV across casing/whitespace typos - // so `"Production"` or `"production "` don't accidentally re-enable + // in Docker logs, the admin log panel, and (ADR-022) on disk. Dev/test + // only. `isProduction()` normalises NODE_ENV across casing/whitespace + // typos so `"Production"` or `"production "` don't accidentally re-enable // the print on prod (PR #84 senior-review finding). if (!isProduction()) { - console.log(`🔑 Dev admin key: ${getApiKey()}`); - console.log(` Admin login: POST /api/admin/login {"password":""}`); - console.log(` Or machine credential: X-Admin-Key: ${getApiKey()}`); + log.info(`🔑 Dev admin key: ${getApiKey()}`); + log.info(` Admin login: POST /api/admin/login {"password":""}`); + log.info(` Or machine credential: X-Admin-Key: ${getApiKey()}`); } }); } diff --git a/backend/tests/admin-server-logs.test.ts b/backend/tests/admin-server-logs.test.ts index 2e626a6..443568a 100644 --- a/backend/tests/admin-server-logs.test.ts +++ b/backend/tests/admin-server-logs.test.ts @@ -10,16 +10,20 @@ vi.mock('../src/database', () => ({ })); // Control the backend's own ring without touching real stdout. +const BACKEND_ENTRIES = [ + { ts: '2026-06-18T20:42:55.000Z', level: 'info', msg: 'backend line 1' }, + { ts: '2026-06-18T20:42:56.000Z', level: 'error', msg: 'backend line 2' }, +]; vi.mock('../src/logRing', () => ({ installLogRing: vi.fn(), - getRecentLogLines: vi.fn(() => ({ - lines: ['backend line 1', 'backend line 2'], + getRecentEntries: vi.fn(() => ({ + entries: BACKEND_ENTRIES, truncated: false, })), })); import { app } from '../src/app'; -import { getRecentLogLines } from '../src/logRing'; +import { getRecentEntries } from '../src/logRing'; const KEY = 'hf_dev_key_2026'; @@ -57,14 +61,18 @@ describe('GET /api/admin/logs (#171)', () => { expect(res.status).toBe(200); expect(res.body).toEqual({ service: 'backend', - lines: ['backend line 1', 'backend line 2'], + entries: BACKEND_ENTRIES, truncated: false, }); expect(globalThis.fetch).not.toHaveBeenCalled(); }); it('proxies duckdb-service logs, forwarding X-Admin-Key and the lines param', async () => { - const payload = { service: 'duckdb-service', lines: ['db line'], truncated: false }; + const payload = { + service: 'duckdb-service', + entries: [{ ts: '2026-06-18T20:42:55.000Z', level: 'info', msg: 'db line' }], + truncated: false, + }; (globalThis.fetch as unknown as ReturnType).mockResolvedValue({ ok: true, status: 200, @@ -86,7 +94,7 @@ describe('GET /api/admin/logs (#171)', () => { (globalThis.fetch as unknown as ReturnType).mockResolvedValue({ ok: true, status: 200, - json: async () => ({ service: 'image-service', lines: [], truncated: false }), + json: async () => ({ service: 'image-service', entries: [], truncated: false }), }); await request(app) @@ -123,6 +131,20 @@ describe('GET /api/admin/logs (#171)', () => { .set('X-Admin-Key', KEY); expect(res.status).toBe(502); // The backend ring mock should not have been consulted for a proxied service. - expect(getRecentLogLines).not.toHaveBeenCalled(); + expect(getRecentEntries).not.toHaveBeenCalled(); + }); + + it('returns 502 when the upstream envelope is drifted (no entries array)', async () => { + (globalThis.fetch as unknown as ReturnType).mockResolvedValue({ + ok: true, + status: 200, + // Old shape with `lines` instead of `entries` → must not reach the UI. + json: async () => ({ service: 'duckdb-service', lines: ['x'], truncated: false }), + }); + const res = await request(app) + .get('/api/admin/logs?service=duckdb-service') + .set('X-Admin-Key', KEY); + expect(res.status).toBe(502); + expect(res.body.error).toMatch(/malformed/i); }); }); diff --git a/backend/tests/log.test.ts b/backend/tests/log.test.ts new file mode 100644 index 0000000..b52fabe --- /dev/null +++ b/backend/tests/log.test.ts @@ -0,0 +1,33 @@ +import { describe, it, expect, beforeEach } from 'vitest'; +import { installLogRing, getRecentEntries, __resetLogRingForTest } from '../src/logRing'; +import { log } from '../src/log'; + +// The structured logger pushes one entry AND writes a human line to the saved +// original stream, which bypasses the tee — so a logged message must appear in +// the ring exactly once, not twice (#178). +installLogRing(); + +beforeEach(() => { + __resetLogRingForTest(); +}); + +describe('log (#178 structured logger)', () => { + it('records the level and message it is given', () => { + log.warn('hf-log warn message'); + const entry = getRecentEntries(10).entries.find((e) => e.msg === 'hf-log warn message'); + expect(entry?.level).toBe('warn'); + expect(entry?.ts).toMatch(/^\d{4}-\d{2}-\d{2}T.*Z$/); + }); + + it('does not double-capture: a logged line lands in the ring exactly once', () => { + log.info('hf-log unique-once'); + const count = getRecentEntries(50).entries.filter((e) => e.msg === 'hf-log unique-once').length; + expect(count).toBe(1); + }); + + it('error() produces an error-level entry', () => { + log.error('hf-log boom'); + const entry = getRecentEntries(10).entries.find((e) => e.msg === 'hf-log boom'); + expect(entry?.level).toBe('error'); + }); +}); diff --git a/backend/tests/logRing.test.ts b/backend/tests/logRing.test.ts index b13e6f1..b2a35a4 100644 --- a/backend/tests/logRing.test.ts +++ b/backend/tests/logRing.test.ts @@ -1,47 +1,59 @@ import { describe, it, expect, beforeEach } from 'vitest'; -import { installLogRing, getRecentLogLines, __resetLogRingForTest } from '../src/logRing'; +import { + installLogRing, + getRecentEntries, + pushEntry, + __resetLogRingForTest, +} from '../src/logRing'; // installLogRing wraps process.stdout/stderr.write (idempotent) and passes -// every write through, so test output still appears; the ring also records it. +// every write through, so test output still appears; the ring also records it +// as a structured LogEntry (#178). installLogRing(); beforeEach(() => { __resetLogRingForTest(); }); -describe('logRing (#171)', () => { - it('captures complete lines written to stdout', () => { +const msgs = (n: number) => getRecentEntries(n).entries.map((e) => e.msg); + +describe('logRing (#171/#178)', () => { + it('captures complete lines written to stdout as info entries', () => { process.stdout.write('hf-test alpha\n'); process.stdout.write('hf-test bravo\n'); - const { lines } = getRecentLogLines(10); - expect(lines).toContain('hf-test alpha'); - expect(lines).toContain('hf-test bravo'); + const { entries } = getRecentEntries(10); + const alpha = entries.find((e) => e.msg === 'hf-test alpha'); + expect(alpha).toBeTruthy(); + expect(alpha?.level).toBe('info'); + expect(alpha?.ts).toMatch(/^\d{4}-\d{2}-\d{2}T.*Z$/); // ISO 8601 UTC + expect(msgs(10)).toContain('hf-test bravo'); }); it('buffers a partial line until the newline arrives', () => { process.stdout.write('hf-test par'); - expect(getRecentLogLines(10).lines).not.toContain('hf-test partial'); + expect(msgs(10)).not.toContain('hf-test partial'); process.stdout.write('tial\n'); - expect(getRecentLogLines(10).lines).toContain('hf-test partial'); + expect(msgs(10)).toContain('hf-test partial'); }); - it('returns only the most recent n lines and flags truncation', () => { + it('returns only the most recent n entries and flags truncation', () => { for (let i = 0; i < 5; i++) process.stdout.write(`hf-test line ${i}\n`); - const { lines, truncated } = getRecentLogLines(2); - expect(lines).toEqual(['hf-test line 3', 'hf-test line 4']); + const { entries, truncated } = getRecentEntries(2); + expect(entries.map((e) => e.msg)).toEqual(['hf-test line 3', 'hf-test line 4']); expect(truncated).toBe(true); }); it('reports not-truncated when n covers the whole ring', () => { process.stdout.write('hf-test only\n'); - const { lines, truncated } = getRecentLogLines(100); - expect(lines).toEqual(['hf-test only']); + const { entries, truncated } = getRecentEntries(100); + expect(entries.map((e) => e.msg)).toEqual(['hf-test only']); expect(truncated).toBe(false); }); - it('also captures stderr writes', () => { + it('records stderr writes as error-level entries', () => { process.stderr.write('hf-test err line\n'); - expect(getRecentLogLines(10).lines).toContain('hf-test err line'); + const entry = getRecentEntries(10).entries.find((e) => e.msg === 'hf-test err line'); + expect(entry?.level).toBe('error'); }); it('passes the write through (returns the underlying write result, not undefined)', () => { @@ -49,12 +61,18 @@ describe('logRing (#171)', () => { expect(typeof ret).toBe('boolean'); }); - it('evicts the oldest lines once the ring is over its cap', () => { - const N = 2100; // > MAX_RING_LINES (2000) + it('pushEntry appends a structured entry directly', () => { + pushEntry({ ts: '2026-06-18T20:42:55.000Z', level: 'warn', msg: 'hf-direct' }); + const entry = getRecentEntries(10).entries.find((e) => e.msg === 'hf-direct'); + expect(entry).toEqual({ ts: '2026-06-18T20:42:55.000Z', level: 'warn', msg: 'hf-direct' }); + }); + + it('evicts the oldest entries once the ring is over its cap', () => { + const N = 2100; // > MAX_RING_ENTRIES (2000) for (let i = 0; i < N; i++) process.stdout.write(`hf-cap ${i}\n`); - const { lines } = getRecentLogLines(N); - expect(lines.length).toBeLessThan(N); // bounded — not unbounded growth - expect(lines).toContain(`hf-cap ${N - 1}`); // newest kept - expect(lines).not.toContain('hf-cap 0'); // oldest evicted + const all = msgs(N); + expect(all.length).toBeLessThan(N); // bounded — not unbounded growth + expect(all).toContain(`hf-cap ${N - 1}`); // newest kept + expect(all).not.toContain('hf-cap 0'); // oldest evicted }); }); diff --git a/contracts/src/index.ts b/contracts/src/index.ts index 309989a..7d5c113 100644 --- a/contracts/src/index.ts +++ b/contracts/src/index.ts @@ -366,20 +366,38 @@ export interface MeasurementTimeSeries { buckets: MeasurementBucket[]; } -// Admin-gated server process logs (#171). Each service keeps a bounded -// in-memory ring of its own recent stdout/stderr lines (a stdout tee, same -// idea as the ESP `logbuf`), exposed for `GET /api/admin/logs?service=…&lines=N`. +// Admin-gated server process logs (#171, #178). Each service keeps a bounded +// ring of its own recent log entries (a stdout/stderr tee plus a structured +// logger, same idea as the ESP `logbuf`), exposed for +// `GET /api/admin/logs?service=…&lines=N` and streamed live via +// `GET /api/admin/logs/stream?service=…`. // `nginx` is deliberately absent — it has no app process to host a ring, so -// its logs stay a host/file concern (out of scope for v1). See ADR-021. +// its logs stay a host/file concern (out of scope). See ADR-021/ADR-022. export type ServerLogService = 'backend' | 'duckdb-service' | 'image-service'; +// Severity of a single log entry. Drives the panel's color coding and the +// access-log middleware's status→level mapping (>=500 error, >=400 warn, else +// info). Kept deliberately small — this is operational logging, not a +// general-purpose level taxonomy. +export type LogLevel = 'info' | 'warn' | 'error'; + +// One structured log line. Replaces the former raw `string[]`: every line now +// carries a timestamp and level so the panel can render `ts · level · msg`, +// color-code, filter, and export. The SSE stream emits one `LogEntry` per +// `data:` event; the REST backfill returns an array of them. +export interface LogEntry { + ts: string; // ISO 8601 (UTC), e.g. '2026-06-18T20:42:55.123Z' + level: LogLevel; + msg: string; +} + export interface ServerLogsResponse { service: ServerLogService; - // Raw captured stdout/stderr lines, chronological (oldest→newest), like - // `tail`. In-memory only: resets on process restart, so this is "since the - // process started", not a full history. - lines: string[]; - // True when the ring held more lines than were returned (clipped to the + // Captured log entries, chronological (oldest→newest), like `tail`. Backed + // by an in-memory ring with on-disk persistence (ADR-022), so this survives + // process restart up to the retention bound (30 days / 100 MB). + entries: LogEntry[]; + // True when the ring held more entries than were returned (clipped to the // requested `lines`, itself capped server-side). Lets the UI show a // "showing last N" hint without a separate count. truncated: boolean; diff --git a/docs/08-crosscutting-concepts/api-contracts.md b/docs/08-crosscutting-concepts/api-contracts.md index 54e15ab..c0e1d99 100644 --- a/docs/08-crosscutting-concepts/api-contracts.md +++ b/docs/08-crosscutting-concepts/api-contracts.md @@ -330,28 +330,38 @@ wire JSON are deliberately close enough that the proxy is one `.map((b) => ({ timestamp, value, sampleCount }))` and not a field-by-field transform. -## `ServerLogsResponse` — admin server-log tail (#171) +## `ServerLogsResponse` — admin server-log tail (#171, #178) Served by `GET /api/admin/logs?service=…&lines=N` (backend). Distinct from the -per-module ESP telemetry (`TelemetryEntry`): this is a service's **own** -stdout/stderr. The backend reads its own in-memory ring directly and proxies to -`duckdb-service` / `image-service` internal `/logs` (forwarding `X-Admin-Key`). -The type lives in `contracts/src/index.ts`: +per-module ESP telemetry (`TelemetryEntry`): this is a service's **own** log +output. The backend reads its own ring directly and proxies to `duckdb-service` +/ `image-service` internal `/logs` (forwarding `X-Admin-Key`). The types live in +`contracts/src/index.ts`: ```ts export type ServerLogService = 'backend' | 'duckdb-service' | 'image-service'; +export type LogLevel = 'info' | 'warn' | 'error'; + +export interface LogEntry { + ts: string; // ISO 8601 (UTC) + level: LogLevel; + msg: string; +} + export interface ServerLogsResponse { service: ServerLogService; - lines: string[]; // raw stdout/stderr lines, chronological (oldest→newest) + entries: LogEntry[]; // structured entries, chronological (oldest→newest) truncated: boolean; // ring held more than were returned } ``` Unlike the other wire shapes here there is **no snake → camel mapping**: the -Flask `/logs` routes emit exactly these keys (`service`/`lines`/`truncated`), so -the backend proxies the JSON through verbatim and only the `backend` branch -constructs it locally. `nginx` is intentionally not a `ServerLogService` (no app +Flask `/logs` routes emit exactly these keys (`service`/`entries`/`truncated`, +each entry `ts`/`level`/`msg`), so the backend proxies the JSON through verbatim +and only the `backend` branch constructs it locally. The proxy branch rejects a +drifted envelope (no `entries` array) with `502` rather than letting `undefined` +fields reach the UI. `nginx` is intentionally not a `ServerLogService` (no app ring). Design + caveats: [ADR-021](../09-architecture-decisions/adr-021-admin-server-log-ring.md). ## `ImageUploadsPage` — admin gallery pagination diff --git a/docs/api-reference.md b/docs/api-reference.md index ea7f9bd..02906ff 100644 --- a/docs/api-reference.md +++ b/docs/api-reference.md @@ -255,29 +255,31 @@ GET /api/admin/logs?service=backend|duckdb-service|image-service&lines=N Headers: Cookie: hf_admin_session=… # or X-Admin-Key: ``` -Tails a service's **own** recent stdout/stderr — distinct from §1.5 (per-module -ESP telemetry). Each service keeps an in-memory ring of its log lines (a stdout -tee); the backend serves its own ring and proxies to the two Flask services' -internal `/logs`, forwarding the machine credential. `service` must be one of -the three names (others, incl. `nginx`, return `400`). `lines` defaults to -`200` and is clamped to `[1, 1000]`. Returns `401` without a valid admin -credential, `502` if a proxied service is unreachable. Design + caveats -(in-memory, per-process, nginx not covered): [ADR-021](09-architecture-decisions/adr-021-admin-server-log-ring.md). +Tails a service's **own** recent log entries — distinct from §1.5 (per-module +ESP telemetry). Each service keeps a ring of structured `{ ts, level, msg }` +entries (a stdout/stderr tee plus a structured logger); the backend serves its +own ring and proxies to the two Flask services' internal `/logs`, forwarding the +machine credential. `service` must be one of the three names (others, incl. +`nginx`, return `400`). `lines` defaults to `200` and is clamped to `[1, 1000]`. +Returns `401` without a valid admin credential, `502` if a proxied service is +unreachable or returns a drifted envelope. Design + caveats: +[ADR-021](09-architecture-decisions/adr-021-admin-server-log-ring.md). ```json { "service": "duckdb-service", - "lines": [ - "[heartbeat] mac=aabbccddeeff battery=None rssi=-67 …", - "127.0.0.1 - - [16/Jun/2026 00:04:42] \"POST /heartbeat HTTP/1.1\" 200 -" + "entries": [ + { "ts": "2026-06-18T20:42:55.123Z", "level": "info", "msg": "[heartbeat] mac=aabbccddeeff battery=None rssi=-67 …" }, + { "ts": "2026-06-18T20:42:56.004Z", "level": "info", "msg": "POST /heartbeat 200 3ms" } ], "truncated": false } ``` -`lines` is chronological (oldest→newest, like `tail`); `truncated` is `true` +`entries` is chronological (oldest→newest, like `tail`); each carries an ISO 8601 +`ts`, a `level` (`info` | `warn` | `error`), and the `msg`. `truncated` is `true` when the ring held more than were returned. The TypeScript contract is -`ServerLogsResponse` in [`contracts/src/index.ts`](../contracts/src/index.ts). +`LogEntry` / `ServerLogsResponse` in [`contracts/src/index.ts`](../contracts/src/index.ts). ## 1.5 User location hint (dashboard map) diff --git a/duckdb-service/routes/logs.py b/duckdb-service/routes/logs.py index 4a7d4ed..21eea8c 100644 --- a/duckdb-service/routes/logs.py +++ b/duckdb-service/routes/logs.py @@ -41,7 +41,7 @@ def get_logs(): n = _LINES_DEFAULT n = max(1, min(n, _LINES_CAP)) - lines, truncated = get_recent(n) + entries, truncated = get_recent(n) return jsonify( - {"service": SERVICE_NAME, "lines": lines, "truncated": truncated} + {"service": SERVICE_NAME, "entries": entries, "truncated": truncated} ), 200 diff --git a/duckdb-service/services/log_ring.py b/duckdb-service/services/log_ring.py index 670ca94..7a34db9 100644 --- a/duckdb-service/services/log_ring.py +++ b/duckdb-service/services/log_ring.py @@ -1,35 +1,59 @@ -"""In-memory ring of this service's recent stdout/stderr lines (#171). - -A stdout/stderr *tee* installed at startup: every write is appended to a -bounded deque AND passed through to the real stream, so ``docker logs`` / PM2 -still see everything unchanged. Captures ``print(...)`` (which re-resolves -``sys.stdout`` on every call) and anything else written to the wrapped -streams -- i.e. what the container log shows. Same idea as the ESP ``logbuf`` -ring. Read via this service's internal admin-gated ``GET /logs``. - -Caveats (see ADR-021): in-memory, so it resets on process restart (only holds -lines since startup) and is per-process. ``install()`` runs at import time in -``app.py`` (after the import block, before the app serves traffic), so the -Flask/werkzeug log handlers — constructed lazily at ``app.run`` / first request -— bind to the tee. +"""In-memory ring of this service's recent log entries (#171, #178). + +Two ingestion paths feed the same bounded deque of structured entries +(``{"ts", "level", "msg"}``), with no double-capture: + + 1. A stdout/stderr *tee* installed at startup: every write NOT produced by the + structured logger is wrapped as an entry AND passed through to the real + stream, so ``docker logs`` / PM2 still see everything unchanged. Captures + ``print(...)`` and werkzeug's request lines. stdout lines become + ``info``, stderr lines become ``error``. + 2. The structured logger (``log_event``) appends an entry directly, then + writes its formatted human line to the *saved real* stream (bypassing the + tee, so its own output is not re-captured as a duplicate entry). + +Read via this service's internal admin-gated ``GET /logs`` and streamed live +via ``GET /logs/stream``. + +Caveats (see ADR-021): in-memory, so it resets on process restart and is +per-process. On-disk persistence + startup backfill is layered on separately +(ADR-022). ``install()`` runs at import time in ``app.py`` (after the import +block, before the app serves traffic), so the Flask/werkzeug log handlers — +constructed lazily at ``app.run`` / first request — bind to the tee. """ import sys import threading from collections import deque +from datetime import UTC, datetime -_MAX_RING_LINES = 2000 -_ring: "deque[str]" = deque(maxlen=_MAX_RING_LINES) +_MAX_RING_ENTRIES = 2000 +_ring: "deque[dict]" = deque(maxlen=_MAX_RING_ENTRIES) _lock = threading.Lock() _installed = False _tees: "list[_TeeStream]" = [] # installed tee instances (for test carry reset) +# Saved real streams, so the structured logger can write past the tee. +_real_stdout = None +_real_stderr = None + + +def _now_iso() -> str: + # Millisecond precision, 'Z' suffix — matches the LogEntry contract. + return datetime.now(UTC).isoformat(timespec="milliseconds").replace("+00:00", "Z") + + +def _push(level: str, msg: str) -> None: + with _lock: + _ring.append({"ts": _now_iso(), "level": level, "msg": msg}) + class _TeeStream: - """Wraps a real text stream; mirrors complete lines into the ring.""" + """Wraps a real text stream; mirrors complete lines into the ring as entries.""" - def __init__(self, real): + def __init__(self, real, level): self._real = real + self._level = level self._carry = "" def write(self, text): @@ -48,9 +72,8 @@ def _capture(self, text): parts = combined.split("\n") # The last element is the (possibly empty) incomplete tail. self._carry = parts.pop() - if parts: - with _lock: - _ring.extend(parts) + for line in parts: + _push(self._level, line) def flush(self): self._real.flush() @@ -63,25 +86,48 @@ def __getattr__(self, name): def install(): """Wrap sys.stdout/sys.stderr. Idempotent; call once at process start.""" - global _installed + global _installed, _real_stdout, _real_stderr if _installed: return _installed = True - out = _TeeStream(sys.stdout) - err = _TeeStream(sys.stderr) + _real_stdout = sys.stdout + _real_stderr = sys.stderr + out = _TeeStream(sys.stdout, "info") + err = _TeeStream(sys.stderr, "error") _tees.extend((out, err)) sys.stdout = out sys.stderr = err +def log_event(level: str, msg: str) -> None: + """Append a structured entry AND write a formatted human line to the real + stream (bypassing the tee, so it is not re-captured as a duplicate). + + SECURITY: never pass secrets, auth headers, request bodies, or the admin + password — entries are admin-readable and (ADR-022) persisted to disk. + """ + _push(level, msg) + ts = _ring[-1]["ts"] if _ring else _now_iso() + line = f"{ts} {level.upper()} {msg}\n" + real = _real_stderr if level == "error" else _real_stdout + if real is None: + # Tee not installed yet (e.g. very early startup): fall back to the + # current stream — capture is harmless here. + real = sys.stderr if level == "error" else sys.stdout + try: + real.write(line) + except Exception: + pass + + def get_recent(n): - """Return (lines, truncated): the most recent ``n`` lines oldest→newest, + """Return (entries, truncated): the most recent ``n`` entries oldest→newest, and whether the ring held more than were returned.""" with _lock: items = list(_ring) n = max(0, min(n, len(items))) - lines = items[len(items) - n :] if n else [] - return lines, len(items) > len(lines) + entries = items[len(items) - n :] if n else [] + return entries, len(items) > len(entries) def _reset_for_test(): diff --git a/duckdb-service/tests/test_logs.py b/duckdb-service/tests/test_logs.py index 2ed8108..410f2bc 100644 --- a/duckdb-service/tests/test_logs.py +++ b/duckdb-service/tests/test_logs.py @@ -1,4 +1,4 @@ -"""Tests for the internal admin-gated GET /logs server-log tail (#171).""" +"""Tests for the internal admin-gated GET /logs server-log tail (#171, #178).""" import io @@ -7,40 +7,71 @@ VALID_KEY = "hf_dev_key_2026" # dev fallback resolved when HIGHFIVE_API_KEY unset -def test_tee_captures_complete_lines_and_passes_through(): +def _msgs(entries): + return [e["msg"] for e in entries] + + +def test_tee_captures_complete_lines_as_entries_and_passes_through(): log_ring._reset_for_test() sink = io.StringIO() - tee = log_ring._TeeStream(sink) + tee = log_ring._TeeStream(sink, "info") tee.write("alpha\n") tee.write("partial") # no newline yet — buffered, not a line - lines, _ = log_ring.get_recent(10) - assert "alpha" in lines - assert "partial" not in lines + entries, _ = log_ring.get_recent(10) + assert "alpha" in _msgs(entries) + assert "partial" not in _msgs(entries) tee.write(" done\n") - lines, _ = log_ring.get_recent(10) - assert "partial done" in lines + entries, _ = log_ring.get_recent(10) + assert "partial done" in _msgs(entries) + # Captured lines carry the tee's level and a timestamp. + alpha = next(e for e in entries if e["msg"] == "alpha") + assert alpha["level"] == "info" + assert alpha["ts"].endswith("Z") # The real stream still saw every byte (tee, not intercept). assert sink.getvalue() == "alpha\npartial done\n" +def test_stderr_tee_records_error_level(): + log_ring._reset_for_test() + tee = log_ring._TeeStream(io.StringIO(), "error") + tee.write("boom\n") + entries, _ = log_ring.get_recent(10) + assert entries[-1] == {"ts": entries[-1]["ts"], "level": "error", "msg": "boom"} + + +def test_log_event_appends_entry_and_writes_through(monkeypatch): + log_ring._reset_for_test() + sink = io.StringIO() + # log_event writes to the saved real stream; point it at a sink. + monkeypatch.setattr(log_ring, "_real_stdout", sink) + log_ring.log_event("warn", "GET /modules 200 5ms") + entries, _ = log_ring.get_recent(10) + assert entries[-1]["level"] == "warn" + assert entries[-1]["msg"] == "GET /modules 200 5ms" + # Human line reached the real stream exactly once... + assert sink.getvalue().count("GET /modules 200 5ms") == 1 + # ...and was not re-captured as a duplicate entry. + assert _msgs(entries).count("GET /modules 200 5ms") == 1 + + def test_get_recent_clamps_and_flags_truncation(): log_ring._reset_for_test() for i in range(5): - log_ring._ring.append(f"line {i}") - lines, truncated = log_ring.get_recent(2) - assert lines == ["line 3", "line 4"] + log_ring._push("info", f"line {i}") + entries, truncated = log_ring.get_recent(2) + assert _msgs(entries) == ["line 3", "line 4"] assert truncated is True def test_ring_caps_at_max_and_evicts_oldest(): log_ring._reset_for_test() - total = log_ring._MAX_RING_LINES + 10 + total = log_ring._MAX_RING_ENTRIES + 10 for i in range(total): - log_ring._ring.append(f"L{i}") - lines, truncated = log_ring.get_recent(total) - assert len(lines) == log_ring._MAX_RING_LINES # bounded - assert lines[0] == "L10" # first 10 evicted - assert lines[-1] == f"L{total - 1}" # newest kept + log_ring._push("info", f"L{i}") + entries, truncated = log_ring.get_recent(total) + assert len(entries) == log_ring._MAX_RING_ENTRIES # bounded + assert entries[0]["msg"] == "L10" # first 10 evicted + assert entries[-1]["msg"] == f"L{total - 1}" # newest kept assert truncated is False # asked for >= ring size @@ -52,20 +83,23 @@ def test_logs_requires_admin_key(client): def test_logs_returns_ring_with_valid_key(client): log_ring._reset_for_test() - log_ring._ring.append("duckdb marker line") + log_ring._push("info", "duckdb marker line") resp = client.get("/logs?lines=50", headers={"X-Admin-Key": VALID_KEY}) assert resp.status_code == 200 body = resp.get_json() assert body["service"] == "duckdb-service" - assert "duckdb marker line" in body["lines"] + assert "duckdb marker line" in _msgs(body["entries"]) + # Each entry carries the structured wire shape. + entry = body["entries"][-1] + assert set(entry.keys()) == {"ts", "level", "msg"} assert body["truncated"] is False def test_logs_route_honours_lines_and_reports_truncation(client): log_ring._reset_for_test() for i in range(5): - log_ring._ring.append(f"row {i}") + log_ring._push("info", f"row {i}") resp = client.get("/logs?lines=2", headers={"X-Admin-Key": VALID_KEY}) body = resp.get_json() - assert body["lines"] == ["row 3", "row 4"] + assert _msgs(body["entries"]) == ["row 3", "row 4"] assert body["truncated"] is True diff --git a/homepage/src/__tests__/ServerLogsPanel.test.tsx b/homepage/src/__tests__/ServerLogsPanel.test.tsx index a12c9af..783b0c3 100644 --- a/homepage/src/__tests__/ServerLogsPanel.test.tsx +++ b/homepage/src/__tests__/ServerLogsPanel.test.tsx @@ -5,19 +5,29 @@ import type { ServerLogsResponse } from '@highfive/contracts'; import ServerLogsPanel from '../components/ServerLogsPanel'; import { api } from '../services/api'; -// Pins the #171 admin server-logs view against the real ServerLogsResponse +// Pins the #171/#178 admin server-logs view against the real ServerLogsResponse // wire shape (CLAUDE.md rule 3 — realistic fixture, not a guessed object). // api.getServerLogs is the single boundary, spied so no real fetch fires. const backendLogs: ServerLogsResponse = { service: 'backend', - lines: ['🐝 HighFive Backend API running on http://localhost:3002', '[GET /api/modules] 200'], + entries: [ + { + ts: '2026-06-18T20:42:55.000Z', + level: 'info', + msg: '🐝 HighFive Backend API listening on port 3002 (all interfaces)', + }, + { ts: '2026-06-18T20:42:56.000Z', level: 'info', msg: 'GET /api/modules 200 4ms' }, + ], truncated: false, }; const duckdbLogs: ServerLogsResponse = { service: 'duckdb-service', - lines: ['[heartbeat] mac=aabbccddeeff battery=None', '127.0.0.1 - "GET /health" 200'], + entries: [ + { ts: '2026-06-18T20:43:00.000Z', level: 'info', msg: '[heartbeat] mac=aabbccddeeff battery=None' }, + { ts: '2026-06-18T20:43:01.000Z', level: 'error', msg: 'GET /health 500 12ms' }, + ], truncated: true, }; @@ -29,13 +39,29 @@ afterEach(() => { vi.restoreAllMocks(); }); -describe('ServerLogsPanel (#171)', () => { - it('loads the backend ring on mount and renders its lines', async () => { +describe('ServerLogsPanel (#171/#178)', () => { + it('loads the backend ring on mount and renders structured entries', async () => { render(); await waitFor(() => expect(api.getServerLogs).toHaveBeenCalledWith('backend', 200)); const out = await screen.findByTestId('server-logs-output'); - expect(out.textContent).toContain('HighFive Backend API running'); - expect(out.textContent).toContain('[GET /api/modules] 200'); + expect(out.textContent).toContain('HighFive Backend API listening on port 3002'); + expect(out.textContent).toContain('GET /api/modules 200 4ms'); + // Each entry renders its ISO timestamp and level badge. + expect(out.textContent).toContain('2026-06-18T20:42:55.000Z'); + const rows = screen.getAllByTestId('server-log-entry'); + expect(rows).toHaveLength(2); + expect(rows[0].getAttribute('data-level')).toBe('info'); + }); + + it('color-codes by level (error entries carry data-level=error)', async () => { + (api.getServerLogs as ReturnType).mockResolvedValue(duckdbLogs); + render(); + const out = await screen.findByTestId('server-logs-output'); + expect(out.textContent).toContain('GET /health 500 12ms'); + const errorRow = screen + .getAllByTestId('server-log-entry') + .find((r) => r.getAttribute('data-level') === 'error'); + expect(errorRow).toBeTruthy(); }); it('refetches the selected service when the dropdown changes', async () => { @@ -43,7 +69,7 @@ describe('ServerLogsPanel (#171)', () => { service === 'duckdb-service' ? duckdbLogs : backendLogs, ); render(); - await screen.findByText(/HighFive Backend API running/); + await screen.findByText(/HighFive Backend API listening/); fireEvent.change(screen.getByTestId('log-service-select'), { target: { value: 'duckdb-service' }, diff --git a/homepage/src/components/ServerLogsPanel.tsx b/homepage/src/components/ServerLogsPanel.tsx index 506aa91..603e153 100644 --- a/homepage/src/components/ServerLogsPanel.tsx +++ b/homepage/src/components/ServerLogsPanel.tsx @@ -1,19 +1,27 @@ import { useState, useEffect, useCallback } from 'react'; import { api } from '../services/api'; -import type { ServerLogService } from '../services/api'; +import type { LogEntry, LogLevel, ServerLogService } from '../services/api'; -// Admin-only server-log viewer (#171). Tails a service's own recent -// stdout/stderr via GET /api/admin/logs (the app-level ring in each service). -// English-only, like the rest of AdminPage (operator-facing, no i18n consumer -// in this page — see AdminPage's existing note). See ADR-021. +// Admin-only server-log viewer (#171, #178). Tails a service's own recent +// structured log entries via GET /api/admin/logs (the app-level ring in each +// service). English-only, like the rest of AdminPage (operator-facing, no i18n +// consumer in this page — see AdminPage's existing note). See ADR-021/ADR-022. const SERVICES: ServerLogService[] = ['backend', 'duckdb-service', 'image-service']; const DEFAULT_LINES = 200; const MAX_LINES = 1000; +// Tailwind classes per level. Info is muted; warn amber; error red — matches +// the access-log middleware's status→level mapping. +const LEVEL_STYLES: Record = { + info: { badge: 'bg-gray-700 text-gray-200', text: 'text-gray-100' }, + warn: { badge: 'bg-amber-500 text-gray-900', text: 'text-amber-300' }, + error: { badge: 'bg-red-600 text-white', text: 'text-red-300' }, +}; + export default function ServerLogsPanel() { const [service, setService] = useState('backend'); const [lines, setLines] = useState(DEFAULT_LINES); - const [log, setLog] = useState([]); + const [entries, setEntries] = useState([]); const [truncated, setTruncated] = useState(false); const [loading, setLoading] = useState(false); const [error, setError] = useState(null); @@ -23,7 +31,7 @@ export default function ServerLogsPanel() { setError(null); try { const res = await api.getServerLogs(service, lines); - setLog(res.lines); + setEntries(res.entries); setTruncated(res.truncated); } catch (err) { setError( @@ -31,7 +39,7 @@ export default function ServerLogsPanel() { ? 'Session expired — reload and log in again.' : `Failed to fetch ${service} logs. Is the service running?`, ); - setLog([]); + setEntries([]); } finally { setLoading(false); } @@ -87,19 +95,37 @@ export default function ServerLogsPanel() {
{error &&

{error}

} {truncated && !error && ( -

Showing the most recent {log.length} lines.

- )} - {!error && log.length === 0 && !loading ? ( -

- No log lines captured yet (rings reset on restart). +

+ Showing the most recent {entries.length} entries.

+ )} + {!error && entries.length === 0 && !loading ? ( +

No log entries captured yet.

) : ( -
-            {log.join('\n')}
-          
+ {entries.map((entry, i) => { + const style = LEVEL_STYLES[entry.level] ?? LEVEL_STYLES.info; + return ( +
+ {entry.ts} + + {entry.level} + + {entry.msg} +
+ ); + })} +
)} diff --git a/homepage/src/services/api.ts b/homepage/src/services/api.ts index 86708ef..f678fee 100644 --- a/homepage/src/services/api.ts +++ b/homepage/src/services/api.ts @@ -12,7 +12,7 @@ import type { import { parseModuleId } from '@highfive/contracts'; export type { TelemetryEntry } from '@highfive/contracts'; -export type { ServerLogService, ServerLogsResponse } from '@highfive/contracts'; +export type { LogEntry, LogLevel, ServerLogService, ServerLogsResponse } from '@highfive/contracts'; const API_BASE_URL = import.meta.env.VITE_API_URL || 'http://localhost:3002/api'; diff --git a/image-service/app.py b/image-service/app.py index 5dc3b01..18d8838 100644 --- a/image-service/app.py +++ b/image-service/app.py @@ -108,9 +108,9 @@ def get_logs(): n = _LOGS_LINES_DEFAULT n = max(1, min(n, _LOGS_LINES_CAP)) - lines, truncated = _get_recent_logs(n) + entries, truncated = _get_recent_logs(n) return jsonify( - {"service": "image-service", "lines": lines, "truncated": truncated} + {"service": "image-service", "entries": entries, "truncated": truncated} ), 200 diff --git a/image-service/services/log_ring.py b/image-service/services/log_ring.py index 670ca94..7a34db9 100644 --- a/image-service/services/log_ring.py +++ b/image-service/services/log_ring.py @@ -1,35 +1,59 @@ -"""In-memory ring of this service's recent stdout/stderr lines (#171). - -A stdout/stderr *tee* installed at startup: every write is appended to a -bounded deque AND passed through to the real stream, so ``docker logs`` / PM2 -still see everything unchanged. Captures ``print(...)`` (which re-resolves -``sys.stdout`` on every call) and anything else written to the wrapped -streams -- i.e. what the container log shows. Same idea as the ESP ``logbuf`` -ring. Read via this service's internal admin-gated ``GET /logs``. - -Caveats (see ADR-021): in-memory, so it resets on process restart (only holds -lines since startup) and is per-process. ``install()`` runs at import time in -``app.py`` (after the import block, before the app serves traffic), so the -Flask/werkzeug log handlers — constructed lazily at ``app.run`` / first request -— bind to the tee. +"""In-memory ring of this service's recent log entries (#171, #178). + +Two ingestion paths feed the same bounded deque of structured entries +(``{"ts", "level", "msg"}``), with no double-capture: + + 1. A stdout/stderr *tee* installed at startup: every write NOT produced by the + structured logger is wrapped as an entry AND passed through to the real + stream, so ``docker logs`` / PM2 still see everything unchanged. Captures + ``print(...)`` and werkzeug's request lines. stdout lines become + ``info``, stderr lines become ``error``. + 2. The structured logger (``log_event``) appends an entry directly, then + writes its formatted human line to the *saved real* stream (bypassing the + tee, so its own output is not re-captured as a duplicate entry). + +Read via this service's internal admin-gated ``GET /logs`` and streamed live +via ``GET /logs/stream``. + +Caveats (see ADR-021): in-memory, so it resets on process restart and is +per-process. On-disk persistence + startup backfill is layered on separately +(ADR-022). ``install()`` runs at import time in ``app.py`` (after the import +block, before the app serves traffic), so the Flask/werkzeug log handlers — +constructed lazily at ``app.run`` / first request — bind to the tee. """ import sys import threading from collections import deque +from datetime import UTC, datetime -_MAX_RING_LINES = 2000 -_ring: "deque[str]" = deque(maxlen=_MAX_RING_LINES) +_MAX_RING_ENTRIES = 2000 +_ring: "deque[dict]" = deque(maxlen=_MAX_RING_ENTRIES) _lock = threading.Lock() _installed = False _tees: "list[_TeeStream]" = [] # installed tee instances (for test carry reset) +# Saved real streams, so the structured logger can write past the tee. +_real_stdout = None +_real_stderr = None + + +def _now_iso() -> str: + # Millisecond precision, 'Z' suffix — matches the LogEntry contract. + return datetime.now(UTC).isoformat(timespec="milliseconds").replace("+00:00", "Z") + + +def _push(level: str, msg: str) -> None: + with _lock: + _ring.append({"ts": _now_iso(), "level": level, "msg": msg}) + class _TeeStream: - """Wraps a real text stream; mirrors complete lines into the ring.""" + """Wraps a real text stream; mirrors complete lines into the ring as entries.""" - def __init__(self, real): + def __init__(self, real, level): self._real = real + self._level = level self._carry = "" def write(self, text): @@ -48,9 +72,8 @@ def _capture(self, text): parts = combined.split("\n") # The last element is the (possibly empty) incomplete tail. self._carry = parts.pop() - if parts: - with _lock: - _ring.extend(parts) + for line in parts: + _push(self._level, line) def flush(self): self._real.flush() @@ -63,25 +86,48 @@ def __getattr__(self, name): def install(): """Wrap sys.stdout/sys.stderr. Idempotent; call once at process start.""" - global _installed + global _installed, _real_stdout, _real_stderr if _installed: return _installed = True - out = _TeeStream(sys.stdout) - err = _TeeStream(sys.stderr) + _real_stdout = sys.stdout + _real_stderr = sys.stderr + out = _TeeStream(sys.stdout, "info") + err = _TeeStream(sys.stderr, "error") _tees.extend((out, err)) sys.stdout = out sys.stderr = err +def log_event(level: str, msg: str) -> None: + """Append a structured entry AND write a formatted human line to the real + stream (bypassing the tee, so it is not re-captured as a duplicate). + + SECURITY: never pass secrets, auth headers, request bodies, or the admin + password — entries are admin-readable and (ADR-022) persisted to disk. + """ + _push(level, msg) + ts = _ring[-1]["ts"] if _ring else _now_iso() + line = f"{ts} {level.upper()} {msg}\n" + real = _real_stderr if level == "error" else _real_stdout + if real is None: + # Tee not installed yet (e.g. very early startup): fall back to the + # current stream — capture is harmless here. + real = sys.stderr if level == "error" else sys.stdout + try: + real.write(line) + except Exception: + pass + + def get_recent(n): - """Return (lines, truncated): the most recent ``n`` lines oldest→newest, + """Return (entries, truncated): the most recent ``n`` entries oldest→newest, and whether the ring held more than were returned.""" with _lock: items = list(_ring) n = max(0, min(n, len(items))) - lines = items[len(items) - n :] if n else [] - return lines, len(items) > len(lines) + entries = items[len(items) - n :] if n else [] + return entries, len(items) > len(entries) def _reset_for_test(): diff --git a/image-service/tests/test_logs.py b/image-service/tests/test_logs.py index 802102d..739eea1 100644 --- a/image-service/tests/test_logs.py +++ b/image-service/tests/test_logs.py @@ -1,4 +1,4 @@ -"""Tests for the internal admin-gated GET /logs server-log tail (#171).""" +"""Tests for the internal admin-gated GET /logs server-log tail (#171, #178).""" import io @@ -7,39 +7,67 @@ VALID_KEY = "hf_dev_key_2026" # dev fallback resolved when HIGHFIVE_API_KEY unset -def test_tee_captures_complete_lines_and_passes_through(): +def _msgs(entries): + return [e["msg"] for e in entries] + + +def test_tee_captures_complete_lines_as_entries_and_passes_through(): log_ring._reset_for_test() sink = io.StringIO() - tee = log_ring._TeeStream(sink) + tee = log_ring._TeeStream(sink, "info") tee.write("alpha\n") tee.write("partial") # no newline yet — buffered, not a line - lines, _ = log_ring.get_recent(10) - assert "alpha" in lines - assert "partial" not in lines + entries, _ = log_ring.get_recent(10) + assert "alpha" in _msgs(entries) + assert "partial" not in _msgs(entries) tee.write(" done\n") - lines, _ = log_ring.get_recent(10) - assert "partial done" in lines + entries, _ = log_ring.get_recent(10) + assert "partial done" in _msgs(entries) + alpha = next(e for e in entries if e["msg"] == "alpha") + assert alpha["level"] == "info" + assert alpha["ts"].endswith("Z") assert sink.getvalue() == "alpha\npartial done\n" +def test_stderr_tee_records_error_level(): + log_ring._reset_for_test() + tee = log_ring._TeeStream(io.StringIO(), "error") + tee.write("boom\n") + entries, _ = log_ring.get_recent(10) + assert entries[-1]["level"] == "error" + assert entries[-1]["msg"] == "boom" + + +def test_log_event_appends_entry_and_writes_through(monkeypatch): + log_ring._reset_for_test() + sink = io.StringIO() + monkeypatch.setattr(log_ring, "_real_stdout", sink) + log_ring.log_event("warn", "POST /upload 200 8ms") + entries, _ = log_ring.get_recent(10) + assert entries[-1]["level"] == "warn" + assert entries[-1]["msg"] == "POST /upload 200 8ms" + assert sink.getvalue().count("POST /upload 200 8ms") == 1 + assert _msgs(entries).count("POST /upload 200 8ms") == 1 + + def test_get_recent_clamps_and_flags_truncation(): log_ring._reset_for_test() for i in range(5): - log_ring._ring.append(f"line {i}") - lines, truncated = log_ring.get_recent(2) - assert lines == ["line 3", "line 4"] + log_ring._push("info", f"line {i}") + entries, truncated = log_ring.get_recent(2) + assert _msgs(entries) == ["line 3", "line 4"] assert truncated is True def test_ring_caps_at_max_and_evicts_oldest(): log_ring._reset_for_test() - total = log_ring._MAX_RING_LINES + 10 + total = log_ring._MAX_RING_ENTRIES + 10 for i in range(total): - log_ring._ring.append(f"L{i}") - lines, truncated = log_ring.get_recent(total) - assert len(lines) == log_ring._MAX_RING_LINES # bounded - assert lines[0] == "L10" # first 10 evicted - assert lines[-1] == f"L{total - 1}" # newest kept + log_ring._push("info", f"L{i}") + entries, truncated = log_ring.get_recent(total) + assert len(entries) == log_ring._MAX_RING_ENTRIES # bounded + assert entries[0]["msg"] == "L10" # first 10 evicted + assert entries[-1]["msg"] == f"L{total - 1}" # newest kept assert truncated is False # asked for >= ring size @@ -51,20 +79,22 @@ def test_logs_requires_admin_key(client): def test_logs_returns_ring_with_valid_key(client): log_ring._reset_for_test() - log_ring._ring.append("image marker line") + log_ring._push("info", "image marker line") resp = client.get("/logs?lines=50", headers={"X-Admin-Key": VALID_KEY}) assert resp.status_code == 200 body = resp.get_json() assert body["service"] == "image-service" - assert "image marker line" in body["lines"] + assert "image marker line" in _msgs(body["entries"]) + entry = body["entries"][-1] + assert set(entry.keys()) == {"ts", "level", "msg"} assert body["truncated"] is False def test_logs_route_honours_lines_and_reports_truncation(client): log_ring._reset_for_test() for i in range(5): - log_ring._ring.append(f"row {i}") + log_ring._push("info", f"row {i}") resp = client.get("/logs?lines=2", headers={"X-Admin-Key": VALID_KEY}) body = resp.get_json() - assert body["lines"] == ["row 3", "row 4"] + assert _msgs(body["entries"]) == ["row 3", "row 4"] assert body["truncated"] is True diff --git a/tests/ui/tests/admin-server-logs.spec.ts b/tests/ui/tests/admin-server-logs.spec.ts index 573fa73..28772f8 100644 --- a/tests/ui/tests/admin-server-logs.spec.ts +++ b/tests/ui/tests/admin-server-logs.spec.ts @@ -31,7 +31,13 @@ test.describe('admin server logs (#171)', () => { expect(resp.ok()).toBeTruthy(); const body = (await resp.json()) as ServerLogsResponse; expect(body.service).toBe(service); - expect(Array.isArray(body.lines)).toBeTruthy(); + expect(Array.isArray(body.entries)).toBeTruthy(); + // Structured wire shape (#178): when present, each entry is { ts, level, msg }. + for (const entry of body.entries) { + expect(typeof entry.ts).toBe('string'); + expect(['info', 'warn', 'error']).toContain(entry.level); + expect(typeof entry.msg).toBe('string'); + } expect(typeof body.truncated).toBe('boolean'); } From 49c46c679e05107971a7e7da655f620e5508c0b0 Mon Sep 17 00:00:00 2001 From: Claude Date: Thu, 18 Jun 2026 21:26:13 +0000 Subject: [PATCH 02/15] =?UTF-8?q?fix:=20address=20Phase=201=20review=20(#1?= =?UTF-8?q?78)=20=E2=80=94=20log=5Fevent=20caller,=20secret-free=20ring?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Senior-review follow-ups (no behavior change beyond the boot banners): - Wire a real production caller for the Python structured logger: both Flask services emit a boot banner via log_event() at import, so the structured ingestion path is exercised in production, not just scaffolding. - Rewrite the Python no-double-capture tests to run in the installed state (tee on sys.stdout), so they actually fail if log_event wrote through the tee instead of the saved real stream. - Keep the dev admin key out of the admin-readable ring: server.ts writes the dev-key convenience lines via the writeStdout bypass (terminal only). - contracts: correct the entries comment — in-memory in Phase 1; on-disk persistence is ADR-022/Phase 2. - backend proxy: also validate `truncated` is a boolean before forwarding. Co-Authored-By: Claude Opus 4.8 (1M context) Claude-Session: https://claude.ai/code/session_01HU7JHMbvtUKRfZY98qeWdK --- backend/src/app.ts | 7 ++++++- backend/src/server.ts | 12 ++++++++---- contracts/src/index.ts | 6 +++--- duckdb-service/app.py | 7 +++++++ duckdb-service/tests/test_logs.py | 20 +++++++++++++++----- image-service/app.py | 7 +++++++ image-service/tests/test_logs.py | 15 +++++++++++++-- 7 files changed, 59 insertions(+), 15 deletions(-) diff --git a/backend/src/app.ts b/backend/src/app.ts index 859810f..9ad8abd 100644 --- a/backend/src/app.ts +++ b/backend/src/app.ts @@ -626,7 +626,12 @@ app.get('/api/admin/logs', requireAdmin, async (req, res) => { // Don't forward a drifted wire shape typed as valid: a service that // changed its /logs envelope should surface as a clear 502, not as // `undefined` fields reaching the UI. - if (!payload || typeof payload.service !== 'string' || !Array.isArray(payload.entries)) { + if ( + !payload || + typeof payload.service !== 'string' || + !Array.isArray(payload.entries) || + typeof payload.truncated !== 'boolean' + ) { res.status(502).json({ error: `malformed logs response from ${service}` }); return; } diff --git a/backend/src/server.ts b/backend/src/server.ts index 728620e..3d919d5 100644 --- a/backend/src/server.ts +++ b/backend/src/server.ts @@ -4,7 +4,7 @@ import { getApiKey } from './auth'; import { duckdbHealth } from './duckdbClient'; import { isProduction } from './env'; import { log } from './log'; -import { installLogRing } from './logRing'; +import { installLogRing, writeStdout } from './logRing'; import { DEFAULT_PORT, resolvePort } from './port'; // Tee stdout/stderr into the in-memory ring so the admin server-logs endpoint @@ -40,9 +40,13 @@ async function bootstrap() { // typos so `"Production"` or `"production "` don't accidentally re-enable // the print on prod (PR #84 senior-review finding). if (!isProduction()) { - log.info(`🔑 Dev admin key: ${getApiKey()}`); - log.info(` Admin login: POST /api/admin/login {"password":""}`); - log.info(` Or machine credential: X-Admin-Key: ${getApiKey()}`); + // Write via the saved original stream (bypassing the ring tee) so the + // dev key reaches the terminal as a developer convenience but is NEVER + // captured into the admin-readable / (ADR-022) disk-persisted ring — + // the ring must not hold secrets even in dev. See log.ts SECURITY note. + writeStdout(`🔑 Dev admin key: ${getApiKey()}\n`); + writeStdout(` Admin login: POST /api/admin/login {"password":""}\n`); + writeStdout(` Or machine credential: X-Admin-Key: ${getApiKey()}\n`); } }); } diff --git a/contracts/src/index.ts b/contracts/src/index.ts index 7d5c113..0af0569 100644 --- a/contracts/src/index.ts +++ b/contracts/src/index.ts @@ -393,9 +393,9 @@ export interface LogEntry { export interface ServerLogsResponse { service: ServerLogService; - // Captured log entries, chronological (oldest→newest), like `tail`. Backed - // by an in-memory ring with on-disk persistence (ADR-022), so this survives - // process restart up to the retention bound (30 days / 100 MB). + // Captured log entries, chronological (oldest→newest), like `tail`. In-memory + // in Phase 1 (resets on process restart); on-disk persistence + rotation + // (survives restart, bounded to 30 days / 100 MB) is ADR-022 / Phase 2. entries: LogEntry[]; // True when the ring held more entries than were returned (clipped to the // requested `lines`, itself capped server-side). Lets the UI show a diff --git a/duckdb-service/app.py b/duckdb-service/app.py index eff98b3..bac6603 100644 --- a/duckdb-service/app.py +++ b/duckdb-service/app.py @@ -13,6 +13,7 @@ from routes.heartbeats import heartbeats_bp from services.backup import run_backup from services.log_ring import install as install_log_ring +from services.log_ring import log_event from services.silence_watcher import check_silence from services.weather_worker import run_weather_fetch @@ -22,6 +23,12 @@ # constructed lazily at app.run, so capture is complete. See services/log_ring.py. install_log_ring() +# Structured boot banner through the logger (#178) — the analogue to the +# backend's server.ts banner. Runs at import under flask run / gunicorn and +# under `python app.py`, so the structured ingestion path has a real +# production caller (not just the tee fallback). Never log secrets here. +log_event("info", "🗄 duckdb-service starting (DuckDB persistence)") + app = Flask(__name__) app.register_blueprint(health_bp) app.register_blueprint(logs_bp) diff --git a/duckdb-service/tests/test_logs.py b/duckdb-service/tests/test_logs.py index 410f2bc..0bd6413 100644 --- a/duckdb-service/tests/test_logs.py +++ b/duckdb-service/tests/test_logs.py @@ -1,6 +1,7 @@ """Tests for the internal admin-gated GET /logs server-log tail (#171, #178).""" import io +import sys from services import log_ring @@ -39,19 +40,28 @@ def test_stderr_tee_records_error_level(): assert entries[-1] == {"ts": entries[-1]["ts"], "level": "error", "msg": "boom"} -def test_log_event_appends_entry_and_writes_through(monkeypatch): +def test_log_event_no_double_capture_in_installed_state(monkeypatch): + # Simulate the *installed* state: sys.stdout is the tee. If log_event wrote + # to sys.stdout instead of the saved real stream, the tee would re-capture + # it and the message would land in the ring twice. The real stream behind + # the tee is the sink, which is also log_event's bypass target — so a + # correct implementation writes the human line once (to the sink) and + # pushes the entry once (directly), never through the tee. log_ring._reset_for_test() sink = io.StringIO() - # log_event writes to the saved real stream; point it at a sink. + tee = log_ring._TeeStream(sink, "info") + monkeypatch.setattr(sys, "stdout", tee) monkeypatch.setattr(log_ring, "_real_stdout", sink) + log_ring.log_event("warn", "GET /modules 200 5ms") + entries, _ = log_ring.get_recent(10) assert entries[-1]["level"] == "warn" assert entries[-1]["msg"] == "GET /modules 200 5ms" - # Human line reached the real stream exactly once... - assert sink.getvalue().count("GET /modules 200 5ms") == 1 - # ...and was not re-captured as a duplicate entry. + # Exactly once in the ring (not re-captured via the tee on sys.stdout)... assert _msgs(entries).count("GET /modules 200 5ms") == 1 + # ...and the human line reached the real stream exactly once. + assert sink.getvalue().count("GET /modules 200 5ms") == 1 def test_get_recent_clamps_and_flags_truncation(): diff --git a/image-service/app.py b/image-service/app.py index 18d8838..0d7bb3c 100644 --- a/image-service/app.py +++ b/image-service/app.py @@ -13,6 +13,7 @@ from services.duckdb import DuckDBService from services.log_ring import get_recent as _get_recent_logs from services.log_ring import install as install_log_ring +from services.log_ring import log_event from services.module_id import ModuleId from services.sidecar import LogSidecarEnvelope from services.upload_pipeline import UploadPipeline, UploadRequest @@ -23,6 +24,12 @@ # constructed lazily at app.run, so capture is complete. See services/log_ring.py. install_log_ring() +# Structured boot banner through the logger (#178) — the analogue to the +# backend's server.ts banner. Runs at import under flask run / gunicorn and +# under `python app.py`, so the structured ingestion path has a real +# production caller (not just the tee fallback). Never log secrets here. +log_event("info", "📷 image-service starting") + app = Flask(__name__) UPLOAD_FOLDER = os.getenv("IMAGE_STORE_PATH", "/data/images") diff --git a/image-service/tests/test_logs.py b/image-service/tests/test_logs.py index 739eea1..76f9878 100644 --- a/image-service/tests/test_logs.py +++ b/image-service/tests/test_logs.py @@ -1,6 +1,7 @@ """Tests for the internal admin-gated GET /logs server-log tail (#171, #178).""" import io +import sys from services import log_ring @@ -38,16 +39,26 @@ def test_stderr_tee_records_error_level(): assert entries[-1]["msg"] == "boom" -def test_log_event_appends_entry_and_writes_through(monkeypatch): +def test_log_event_no_double_capture_in_installed_state(monkeypatch): + # Simulate the *installed* state: sys.stdout is the tee. If log_event wrote + # to sys.stdout instead of the saved real stream, the tee would re-capture + # it and the message would land in the ring twice. The real stream behind + # the tee is the sink, which is also log_event's bypass target — so a + # correct implementation writes the human line once (to the sink) and + # pushes the entry once (directly), never through the tee. log_ring._reset_for_test() sink = io.StringIO() + tee = log_ring._TeeStream(sink, "info") + monkeypatch.setattr(sys, "stdout", tee) monkeypatch.setattr(log_ring, "_real_stdout", sink) + log_ring.log_event("warn", "POST /upload 200 8ms") + entries, _ = log_ring.get_recent(10) assert entries[-1]["level"] == "warn" assert entries[-1]["msg"] == "POST /upload 200 8ms" - assert sink.getvalue().count("POST /upload 200 8ms") == 1 assert _msgs(entries).count("POST /upload 200 8ms") == 1 + assert sink.getvalue().count("POST /upload 200 8ms") == 1 def test_get_recent_clamps_and_flags_truncation(): From 11821259fe1fd0b0af4910c590296ab8507481a9 Mon Sep 17 00:00:00 2001 From: Claude Date: Thu, 18 Jun 2026 21:28:36 +0000 Subject: [PATCH 03/15] test: tighten Phase 1 no-double-capture + truncated-guard tests (#178) MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Re-review follow-ups (test/comment accuracy only, no production change): - no-double-capture test: assert the ring holds exactly one entry after a single log_event in the installed (tee-on-stdout) state — the unambiguous discriminator. The prior raw-substring count could not catch a buggy write through the tee (the re-captured entry holds the formatted line, a different string); comment corrected to say so. - backend proxy: add a case pinning the new `truncated` boolean guard branch (valid service + entries but non-boolean truncated → 502). Co-Authored-By: Claude Opus 4.8 (1M context) Claude-Session: https://claude.ai/code/session_01HU7JHMbvtUKRfZY98qeWdK --- backend/tests/admin-server-logs.test.ts | 15 +++++++++++++++ duckdb-service/tests/test_logs.py | 23 ++++++++++++----------- image-service/tests/test_logs.py | 21 ++++++++++++--------- 3 files changed, 39 insertions(+), 20 deletions(-) diff --git a/backend/tests/admin-server-logs.test.ts b/backend/tests/admin-server-logs.test.ts index 443568a..979944f 100644 --- a/backend/tests/admin-server-logs.test.ts +++ b/backend/tests/admin-server-logs.test.ts @@ -147,4 +147,19 @@ describe('GET /api/admin/logs (#171)', () => { expect(res.status).toBe(502); expect(res.body.error).toMatch(/malformed/i); }); + + it('returns 502 when the upstream omits a boolean truncated flag', async () => { + (globalThis.fetch as unknown as ReturnType).mockResolvedValue({ + ok: true, + status: 200, + // Valid service + entries, but truncated is not a boolean → must 502 + // rather than let `undefined` reach the panel. + json: async () => ({ service: 'image-service', entries: [], truncated: null }), + }); + const res = await request(app) + .get('/api/admin/logs?service=image-service') + .set('X-Admin-Key', KEY); + expect(res.status).toBe(502); + expect(res.body.error).toMatch(/malformed/i); + }); }); diff --git a/duckdb-service/tests/test_logs.py b/duckdb-service/tests/test_logs.py index 0bd6413..16a2151 100644 --- a/duckdb-service/tests/test_logs.py +++ b/duckdb-service/tests/test_logs.py @@ -41,12 +41,15 @@ def test_stderr_tee_records_error_level(): def test_log_event_no_double_capture_in_installed_state(monkeypatch): - # Simulate the *installed* state: sys.stdout is the tee. If log_event wrote - # to sys.stdout instead of the saved real stream, the tee would re-capture - # it and the message would land in the ring twice. The real stream behind - # the tee is the sink, which is also log_event's bypass target — so a - # correct implementation writes the human line once (to the sink) and - # pushes the entry once (directly), never through the tee. + # Simulate the *installed* state: sys.stdout is the tee. A correct log_event + # writes the human line to the saved real stream (the sink) — bypassing the + # tee — and pushes exactly one structured entry directly. A buggy log_event + # that wrote to sys.stdout would route the formatted human line through the + # tee, producing a SECOND ring entry (an info-level line of formatted text). + # The exact-size assertion below is the load-bearing discriminator: it is 1 + # only if the tee never saw the logger's own output. (A substring count on + # the raw msg would NOT catch the bug — the re-captured entry holds the + # formatted line, a different string — so don't rely on that.) log_ring._reset_for_test() sink = io.StringIO() tee = log_ring._TeeStream(sink, "info") @@ -56,11 +59,9 @@ def test_log_event_no_double_capture_in_installed_state(monkeypatch): log_ring.log_event("warn", "GET /modules 200 5ms") entries, _ = log_ring.get_recent(10) - assert entries[-1]["level"] == "warn" - assert entries[-1]["msg"] == "GET /modules 200 5ms" - # Exactly once in the ring (not re-captured via the tee on sys.stdout)... - assert _msgs(entries).count("GET /modules 200 5ms") == 1 - # ...and the human line reached the real stream exactly once. + assert len(entries) == 1 # exactly one entry — not re-captured via the tee + assert entries[0] == {"ts": entries[0]["ts"], "level": "warn", "msg": "GET /modules 200 5ms"} + # The human line reached the real stream exactly once. assert sink.getvalue().count("GET /modules 200 5ms") == 1 diff --git a/image-service/tests/test_logs.py b/image-service/tests/test_logs.py index 76f9878..942ac61 100644 --- a/image-service/tests/test_logs.py +++ b/image-service/tests/test_logs.py @@ -40,12 +40,15 @@ def test_stderr_tee_records_error_level(): def test_log_event_no_double_capture_in_installed_state(monkeypatch): - # Simulate the *installed* state: sys.stdout is the tee. If log_event wrote - # to sys.stdout instead of the saved real stream, the tee would re-capture - # it and the message would land in the ring twice. The real stream behind - # the tee is the sink, which is also log_event's bypass target — so a - # correct implementation writes the human line once (to the sink) and - # pushes the entry once (directly), never through the tee. + # Simulate the *installed* state: sys.stdout is the tee. A correct log_event + # writes the human line to the saved real stream (the sink) — bypassing the + # tee — and pushes exactly one structured entry directly. A buggy log_event + # that wrote to sys.stdout would route the formatted human line through the + # tee, producing a SECOND ring entry (an info-level line of formatted text). + # The exact-size assertion below is the load-bearing discriminator: it is 1 + # only if the tee never saw the logger's own output. (A substring count on + # the raw msg would NOT catch the bug — the re-captured entry holds the + # formatted line, a different string — so don't rely on that.) log_ring._reset_for_test() sink = io.StringIO() tee = log_ring._TeeStream(sink, "info") @@ -55,9 +58,9 @@ def test_log_event_no_double_capture_in_installed_state(monkeypatch): log_ring.log_event("warn", "POST /upload 200 8ms") entries, _ = log_ring.get_recent(10) - assert entries[-1]["level"] == "warn" - assert entries[-1]["msg"] == "POST /upload 200 8ms" - assert _msgs(entries).count("POST /upload 200 8ms") == 1 + assert len(entries) == 1 # exactly one entry — not re-captured via the tee + assert entries[0] == {"ts": entries[0]["ts"], "level": "warn", "msg": "POST /upload 200 8ms"} + # The human line reached the real stream exactly once. assert sink.getvalue().count("POST /upload 200 8ms") == 1 From e70f1291c6cf8c76178be197e841748535e8a22d Mon Sep 17 00:00:00 2001 From: Claude Date: Thu, 18 Jun 2026 22:37:11 +0000 Subject: [PATCH 04/15] feat: per-request access logging into the server-log ring (#178 phase 2) MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Every handled request now emits one structured access entry through the Phase-1 logger, so the admin Server Logs panel fills with real traffic (on success too), visible per service: - backend: new accessLog.ts Express middleware mounted after cookieParser; on res 'finish' emits "method path status ms" via log.* (>=500 error, >=400 warn, else info). - duckdb-service + image-service: symmetric @app.before_request / @app.after_request hooks emitting the same shape via log_event. Security (load-bearing): logs req.path ONLY — never headers, body, or query string — so the X-Admin-Key header, the admin login password, and any ?token=/?key= value cannot reach the admin-readable ring. Tests assert a real request lands as an entry at the right level AND that no secret/query/ header substring appears. Docs: api-reference notes the access entry + path-only redaction; ADR-021's now-stale "dev key appears in service=backend" consequence corrected (Phase 1 moved it to the ring-bypassing writeStdout). Co-Authored-By: Claude Opus 4.8 (1M context) Claude-Session: https://claude.ai/code/session_01HU7JHMbvtUKRfZY98qeWdK --- backend/src/accessLog.ts | 35 ++++++++ backend/src/app.ts | 7 ++ backend/tests/accessLog.test.ts | 80 +++++++++++++++++++ backend/tests/admin-server-logs.test.ts | 5 ++ .../adr-021-admin-server-log-ring.md | 10 ++- docs/api-reference.md | 5 +- duckdb-service/app.py | 36 ++++++++- duckdb-service/tests/test_logs.py | 28 +++++++ image-service/app.py | 33 +++++++- image-service/tests/test_logs.py | 28 +++++++ 10 files changed, 260 insertions(+), 7 deletions(-) create mode 100644 backend/src/accessLog.ts create mode 100644 backend/tests/accessLog.test.ts diff --git a/backend/src/accessLog.ts b/backend/src/accessLog.ts new file mode 100644 index 0000000..f222fbb --- /dev/null +++ b/backend/src/accessLog.ts @@ -0,0 +1,35 @@ +// Express access-log middleware (#178 Phase 2). +// +// Emits exactly one structured entry per handled request through the Phase-1 +// structured logger (`log.*`), so the admin Server Logs panel fills with real +// traffic — on success too, not just error branches. Level tracks status: +// >=500 error, >=400 warn, else info. +// +// SECURITY (load-bearing): logs `method path status durationMs` ONLY. It uses +// `req.path` — never `req.originalUrl`/query string, never headers, never the +// body — so the `X-Admin-Key` header, the `POST /api/admin/login` body +// password, and any `?token=`/`?key=` query value cannot reach the +// admin-readable (and, from Phase 3, disk-persisted) ring. "Path only" is the +// redaction mechanism by construction. + +import type { Request, Response, NextFunction } from 'express'; +import { log } from './log'; + +function levelFor(status: number): 'info' | 'warn' | 'error' { + if (status >= 500) return 'error'; + if (status >= 400) return 'warn'; + return 'info'; +} + +export function accessLog(req: Request, res: Response, next: NextFunction): void { + const start = process.hrtime.bigint(); + // `finish` fires once when the response has been fully flushed — for normal + // requests at completion, for a long-lived SSE stream only at disconnect + // (one entry, large duration; it issues no request, so it cannot loop). + res.once('finish', () => { + const ms = Number(process.hrtime.bigint() - start) / 1e6; + const msg = `${req.method} ${req.path} ${res.statusCode} ${ms.toFixed(1)}ms`; + log[levelFor(res.statusCode)](msg); + }); + next(); +} diff --git a/backend/src/app.ts b/backend/src/app.ts index 9ad8abd..d6e4cb4 100644 --- a/backend/src/app.ts +++ b/backend/src/app.ts @@ -5,6 +5,7 @@ import { tryParseModuleId } from '@highfive/contracts'; import type { ServerLogsResponse } from '@highfive/contracts'; import { db } from './database'; import { verifyApiKey, getApiKey } from './auth'; +import { accessLog } from './accessLog'; import { getRecentEntries } from './logRing'; import { SESSION_COOKIE, @@ -57,6 +58,12 @@ app.use(cors(corsOptions)); app.use(express.json()); app.use(cookieParser()); +// Access logging (#178): one structured entry per request into the admin log +// ring. Mounted here so it wraps every route below (health + public + admin). +// Logs method+path+status+duration only — never headers/body/query — so no +// secret can reach the ring. See accessLog.ts. +app.use(accessLog); + // Health check (public, no auth required) app.get('/api/health', (req, res) => { res.json({ status: 'ok', timestamp: new Date().toISOString() }); diff --git a/backend/tests/accessLog.test.ts b/backend/tests/accessLog.test.ts new file mode 100644 index 0000000..68c1077 --- /dev/null +++ b/backend/tests/accessLog.test.ts @@ -0,0 +1,80 @@ +import { describe, it, expect, beforeEach, afterEach, vi } from 'vitest'; +import request from 'supertest'; + +// Mock db so the app import doesn't trigger real fetches. +vi.mock('../src/database', () => ({ + db: { + listModules: vi.fn().mockResolvedValue([]), + getModuleDetail: vi.fn().mockResolvedValue(null), + }, +})); + +// Use the REAL logRing so access entries actually land in the ring; we read it +// back via getRecentEntries to assert behaviour (CLAUDE.md rule #5 — assert the +// request lands as an entry, not just envelope shape). +import { app } from '../src/app'; +import { getRecentEntries, __resetLogRingForTest } from '../src/logRing'; + +const KEY = 'hf_dev_key_2026'; + +const msgs = () => getRecentEntries(2000).entries.map((e) => e.msg); +const find = (re: RegExp) => getRecentEntries(2000).entries.find((e) => re.test(e.msg)); + +beforeEach(() => { + __resetLogRingForTest(); + vi.stubGlobal('fetch', vi.fn()); +}); + +afterEach(() => { + vi.unstubAllGlobals(); + vi.clearAllMocks(); +}); + +describe('access logging (#178)', () => { + it('emits one info entry for a 2xx request: method path status ms', async () => { + await request(app).get('/api/health'); + const entry = find(/^GET \/api\/health 200 /); + expect(entry).toBeTruthy(); + expect(entry?.level).toBe('info'); + expect(entry?.msg).toMatch(/^GET \/api\/health 200 \d+(\.\d+)?ms$/); + // Exactly one access entry for the single request. + expect(msgs().filter((m) => /^GET \/api\/health 200 /.test(m))).toHaveLength(1); + }); + + it('logs a 4xx as warn', async () => { + // Missing admin credential → 401 from requireAdmin. + await request(app).get('/api/admin/logs?service=backend'); + const entry = find(/^GET \/api\/admin\/logs 401 /); + expect(entry?.level).toBe('warn'); + }); + + it('logs an unmatched route (404) as warn', async () => { + await request(app).get('/api/does-not-exist'); + const entry = find(/^GET \/api\/does-not-exist 404 /); + expect(entry?.level).toBe('warn'); + }); + + it('never logs the query string, admin key, or header names', async () => { + // Path carries a token-ish query param; the entry must not include it. + await request(app) + .get('/api/admin/logs?service=backend&token=topsecret123') + .set('X-Admin-Key', KEY); + const all = msgs().join('\n'); + expect(all).not.toContain('topsecret123'); + expect(all).not.toContain('?'); // no query string at all + expect(all).not.toContain(KEY); + expect(all).not.toContain('X-Admin-Key'); + // The path itself is still logged (without the query). + expect(find(/^GET \/api\/admin\/logs 200 /)).toBeTruthy(); + }); + + it('never logs the admin login password from the request body', async () => { + await request(app) + .post('/api/admin/login') + .send({ password: 'hunter2-should-never-appear' }); + const all = msgs().join('\n'); + expect(all).not.toContain('hunter2-should-never-appear'); + // The login attempt is still logged as an access entry (path + status only). + expect(find(/^POST \/api\/admin\/login \d{3} /)).toBeTruthy(); + }); +}); diff --git a/backend/tests/admin-server-logs.test.ts b/backend/tests/admin-server-logs.test.ts index 979944f..62cda30 100644 --- a/backend/tests/admin-server-logs.test.ts +++ b/backend/tests/admin-server-logs.test.ts @@ -20,6 +20,11 @@ vi.mock('../src/logRing', () => ({ entries: BACKEND_ENTRIES, truncated: false, })), + // The access-log middleware (#178) runs on every request and routes through + // log.ts → these ring writers. Stub them so the mocked module is complete. + pushEntry: vi.fn(), + writeStdout: vi.fn(), + writeStderr: vi.fn(), })); import { app } from '../src/app'; diff --git a/docs/09-architecture-decisions/adr-021-admin-server-log-ring.md b/docs/09-architecture-decisions/adr-021-admin-server-log-ring.md index 23b1bfe..38330e0 100644 --- a/docs/09-architecture-decisions/adr-021-admin-server-log-ring.md +++ b/docs/09-architecture-decisions/adr-021-admin-server-log-ring.md @@ -48,10 +48,12 @@ and exposes it; the backend aggregates. readable unauthenticated. This requires `HIGHFIVE_API_KEY` to resolve to the same value in all three services (dev: shared `.env`; UI + prod compose: set explicitly on each). - **The ring faithfully captures whatever any code prints** — including any credential a log - line chooses to emit (e.g. `server.ts` prints the dev admin key to stdout at startup, so - it appears in `service=backend`). This is bounded by the admin gate on the endpoint and by - that banner being suppressed in production, but it is a reason the endpoint must stay - admin-only and a reason not to `console.log`/`print` secrets. + line chooses to emit. This is bounded by the admin gate on the endpoint, but it is the + reason the endpoint must stay admin-only and a reason not to `console.log`/`print` secrets. + (As of #178 the dev admin-key banner in `server.ts` is written via the ring-bypassing + `writeStdout`, so the key reaches the terminal but **not** the ring; and the per-request + access logs added in #178 log `method path status ms` **path-only** — never headers, body, + or query string — so the `X-Admin-Key` header and login password cannot reach the ring.) ### Alternatives rejected diff --git a/docs/api-reference.md b/docs/api-reference.md index 02906ff..960b2a5 100644 --- a/docs/api-reference.md +++ b/docs/api-reference.md @@ -259,7 +259,10 @@ Tails a service's **own** recent log entries — distinct from §1.5 (per-module ESP telemetry). Each service keeps a ring of structured `{ ts, level, msg }` entries (a stdout/stderr tee plus a structured logger); the backend serves its own ring and proxies to the two Flask services' internal `/logs`, forwarding the -machine credential. `service` must be one of the three names (others, incl. +machine credential. Every handled request adds one **access entry** +(`method path status ms`, level by status: `≥500` error, `≥400` warn, else info) +— logged **path-only**, never headers, body, or query string, so no secret +reaches the ring. `service` must be one of the three names (others, incl. `nginx`, return `400`). `lines` defaults to `200` and is clamped to `[1, 1000]`. Returns `401` without a valid admin credential, `502` if a proxied service is unreachable or returns a drifted envelope. Design + caveats: diff --git a/duckdb-service/app.py b/duckdb-service/app.py index bac6603..2004321 100644 --- a/duckdb-service/app.py +++ b/duckdb-service/app.py @@ -1,5 +1,7 @@ import os -from flask import Flask +import time + +from flask import Flask, g, request from apscheduler.schedulers.background import BackgroundScheduler from db.schema import init_db @@ -30,6 +32,38 @@ log_event("info", "🗄 duckdb-service starting (DuckDB persistence)") app = Flask(__name__) + + +def _status_level(code: int) -> str: + if code >= 500: + return "error" + if code >= 400: + return "warn" + return "info" + + +@app.before_request +def _access_log_start(): + g._access_start = time.perf_counter() + + +@app.after_request +def _access_log_finish(resp): + # One structured access entry per request (#178): "method path status ms". + # path ONLY — never query string, headers, or body — so the X-Admin-Key + # header and any ?token=/?key= value can't reach the admin-readable / + # disk-persisted ring. werkzeug's own request line is still tee-captured; + # this is the canonical, level-tagged entry. + start = g.pop("_access_start", None) + if start is not None: + ms = (time.perf_counter() - start) * 1000.0 + log_event( + _status_level(resp.status_code), + f"{request.method} {request.path} {resp.status_code} {ms:.1f}ms", + ) + return resp + + app.register_blueprint(health_bp) app.register_blueprint(logs_bp) app.register_blueprint(modules_bp) diff --git a/duckdb-service/tests/test_logs.py b/duckdb-service/tests/test_logs.py index 16a2151..dd209be 100644 --- a/duckdb-service/tests/test_logs.py +++ b/duckdb-service/tests/test_logs.py @@ -114,3 +114,31 @@ def test_logs_route_honours_lines_and_reports_truncation(client): body = resp.get_json() assert _msgs(body["entries"]) == ["row 3", "row 4"] assert body["truncated"] is True + + +def test_access_log_emits_entry_per_request(client): + # The @app.after_request hook (#178) emits one structured access entry per + # handled request. A real request must land as an entry (CLAUDE.md rule #5). + log_ring._reset_for_test() + resp = client.get("/health") + assert resp.status_code == 200 + entries, _ = log_ring.get_recent(50) + access = [e for e in entries if e["msg"].startswith("GET /health 200 ")] + assert len(access) == 1 + assert access[0]["level"] == "info" + assert access[0]["msg"].endswith("ms") + + +def test_access_log_redacts_query_and_credentials(client): + # path ONLY: the token-ish query param and the admin key must never appear + # in any entry, and no entry may carry a query string. + log_ring._reset_for_test() + resp = client.get("/logs?lines=5&token=secret123", headers={"X-Admin-Key": VALID_KEY}) + assert resp.status_code == 200 + blob = "\n".join(_msgs(log_ring.get_recent(50)[0])) + assert "secret123" not in blob + assert "?" not in blob + assert VALID_KEY not in blob + assert "X-Admin-Key" not in blob + # The path itself is still logged (without the query). + assert any(e["msg"].startswith("GET /logs 200 ") for e in log_ring.get_recent(50)[0]) diff --git a/image-service/app.py b/image-service/app.py index 0d7bb3c..cd09726 100644 --- a/image-service/app.py +++ b/image-service/app.py @@ -6,7 +6,7 @@ import time import requests as http_requests -from flask import Flask, jsonify, request, send_from_directory +from flask import Flask, g, jsonify, request, send_from_directory from pydantic import ValidationError from services.discord import send_discord_message @@ -32,6 +32,37 @@ app = Flask(__name__) + +def _status_level(code: int) -> str: + if code >= 500: + return "error" + if code >= 400: + return "warn" + return "info" + + +@app.before_request +def _access_log_start(): + g._access_start = time.perf_counter() + + +@app.after_request +def _access_log_finish(resp): + # One structured access entry per request (#178): "method path status ms". + # path ONLY — never query string, headers, or body — so the X-Admin-Key + # header and any ?token=/?key= value can't reach the admin-readable / + # disk-persisted ring. werkzeug's own request line is still tee-captured; + # this is the canonical, level-tagged entry. + start = g.pop("_access_start", None) + if start is not None: + ms = (time.perf_counter() - start) * 1000.0 + log_event( + _status_level(resp.status_code), + f"{request.method} {request.path} {resp.status_code} {ms:.1f}ms", + ) + return resp + + UPLOAD_FOLDER = os.getenv("IMAGE_STORE_PATH", "/data/images") os.makedirs(UPLOAD_FOLDER, exist_ok=True) diff --git a/image-service/tests/test_logs.py b/image-service/tests/test_logs.py index 942ac61..9f32922 100644 --- a/image-service/tests/test_logs.py +++ b/image-service/tests/test_logs.py @@ -112,3 +112,31 @@ def test_logs_route_honours_lines_and_reports_truncation(client): body = resp.get_json() assert _msgs(body["entries"]) == ["row 3", "row 4"] assert body["truncated"] is True + + +def test_access_log_emits_entry_per_request(client): + # The @app.after_request hook (#178) emits one structured access entry per + # handled request. A real request must land as an entry (CLAUDE.md rule #5). + log_ring._reset_for_test() + resp = client.get("/health") + assert resp.status_code == 200 + entries, _ = log_ring.get_recent(50) + access = [e for e in entries if e["msg"].startswith("GET /health 200 ")] + assert len(access) == 1 + assert access[0]["level"] == "info" + assert access[0]["msg"].endswith("ms") + + +def test_access_log_redacts_query_and_credentials(client): + # path ONLY: the token-ish query param and the admin key must never appear + # in any entry, and no entry may carry a query string. + log_ring._reset_for_test() + resp = client.get("/logs?lines=5&token=secret123", headers={"X-Admin-Key": VALID_KEY}) + assert resp.status_code == 200 + blob = "\n".join(_msgs(log_ring.get_recent(50)[0])) + assert "secret123" not in blob + assert "?" not in blob + assert VALID_KEY not in blob + assert "X-Admin-Key" not in blob + # The path itself is still logged (without the query). + assert any(e["msg"].startswith("GET /logs 200 ") for e in log_ring.get_recent(50)[0]) From c6bc51546a925df813637e63994b7ee433d32b5a Mon Sep 17 00:00:00 2001 From: Claude Date: Thu, 18 Jun 2026 22:52:18 +0000 Subject: [PATCH 05/15] feat: disk persistence + rotation + startup backfill for server logs (#178 phase 3) MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Server logs now survive restart, bounded to 30 days / 100 MB (ADR-022, supersedes ADR-021's in-memory consequence). Gated on LOG_DIR (unset = in-memory only, as before). - backend: logRing.ts gains initLogPersistence — appends each entry as JSONL via rotating-file-stream (daily + 50MB rotate; retain <=30 files AND <=100MB total) and backfills the ring from the active file tail at startup. Wired in server.ts before the boot banners. - duckdb-service + image-service: byte-identical log_ring.py init_persistence using stdlib TimedRotatingFileHandler subclassed with a 100MB size-prune sweep; same JSONL + backfill. Distinct LOG_DIR per service (subdirs of the shared duckdb_data volume) so they never collide on one file. - compose (dev + prod): LOG_DIR env + a backend_logs volume; Flask services persist under /data/logs/. - ADR-022 written + indexed; ADR-021 marked superseded-in-part. Docs: auth.md (disk-persisted-secret caveat), docker-compose.md (log volumes), ch.11 (30-day retention/PII + per-process SSE emitter trade-offs). Tests: JSONL append, startup backfill (survives restart), no-op without LOG_DIR, malformed-line skip — backend + both Flask. Gate green: backend 191, duckdb 198, image 74, homepage 161; tsc/ruff/citations clean; compose validates. Co-Authored-By: Claude Opus 4.8 (1M context) Claude-Session: https://claude.ai/code/session_01HU7JHMbvtUKRfZY98qeWdK --- backend/package.json | 3 +- backend/src/logRing.ts | 80 +++++++++++++- backend/src/server.ts | 6 +- backend/tests/logPersistence.test.ts | 82 ++++++++++++++ docker-compose.prod.yml | 14 +++ docker-compose.yml | 13 +++ docs/07-deployment-view/docker-compose.md | 14 +++ docs/08-crosscutting-concepts/auth.md | 20 ++++ docs/09-architecture-decisions/README.md | 3 +- .../adr-021-admin-server-log-ring.md | 8 ++ ...r-022-persistent-structured-server-logs.md | 102 ++++++++++++++++++ docs/11-risks-and-technical-debt/README.md | 12 +++ duckdb-service/app.py | 5 + duckdb-service/services/log_ring.py | 97 ++++++++++++++++- duckdb-service/tests/test_logs.py | 35 ++++++ image-service/app.py | 5 + image-service/services/log_ring.py | 97 ++++++++++++++++- image-service/tests/test_logs.py | 35 ++++++ package-lock.json | 15 ++- 19 files changed, 639 insertions(+), 7 deletions(-) create mode 100644 backend/tests/logPersistence.test.ts create mode 100644 docs/09-architecture-decisions/adr-022-persistent-structured-server-logs.md diff --git a/backend/package.json b/backend/package.json index a1f2bef..849b03d 100644 --- a/backend/package.json +++ b/backend/package.json @@ -22,7 +22,8 @@ "cookie-parser": "^1.4.7", "cors": "^2.8.5", "dotenv": "^17.2.3", - "express": "^4.18.2" + "express": "^4.18.2", + "rotating-file-stream": "^2.1.6" }, "devDependencies": { "@types/cookie-parser": "^1.4.10", diff --git a/backend/src/logRing.ts b/backend/src/logRing.ts index c8ba258..aa74d51 100644 --- a/backend/src/logRing.ts +++ b/backend/src/logRing.ts @@ -21,6 +21,9 @@ // would have one ring per worker). On-disk persistence + startup backfill // (ADR-022) is layered on separately. +import fs from 'node:fs'; +import path from 'node:path'; +import { createStream, type RotatingFileStream } from 'rotating-file-stream'; import type { LogEntry, LogLevel } from '@highfive/contracts'; const MAX_RING_ENTRIES = 2000; @@ -28,6 +31,14 @@ const MAX_RING_ENTRIES = 2000; const ring: LogEntry[] = []; let installed = false; +// On-disk persistence (#178 Phase 2 / ADR-022). Gated on LOG_DIR: when set, each +// entry is also appended as one JSON object per line (JSONL) to a rotating file, +// and the ring is backfilled from that file at startup so history survives a +// restart. When unset (e.g. unit tests), the ring is in-memory only — the +// pre-ADR-022 behaviour. Rotation: daily + 50 MB, retain ≤30 files AND ≤100 MB +// total (rfs prunes oldest past either bound). +let diskStream: RotatingFileStream | null = null; + // The real stream writers, captured before the tee replaces them. The // structured logger writes through these so its output reaches the terminal / // docker logs without being re-captured by the tee. @@ -38,11 +49,61 @@ let originalStderrWrite: ((...a: unknown[]) => boolean) | null = null; // until the next write completes the line. const carry: Record<'out' | 'err', string> = { out: '', err: '' }; +const LOG_FILENAME = 'backend.log'; + function pushEntryInternal(entry: LogEntry): void { ring.push(entry); if (ring.length > MAX_RING_ENTRIES) { ring.splice(0, ring.length - MAX_RING_ENTRIES); } + if (diskStream) { + try { + diskStream.write(`${JSON.stringify(entry)}\n`); + } catch { + // Persistence must never break in-memory logging. + } + } +} + +/** + * Enable on-disk persistence + startup backfill. Idempotent. Pass the log + * directory (typically `process.env.LOG_DIR`); a falsy value is a no-op so the + * ring stays in-memory. Call once, early, before serving traffic. + */ +export function initLogPersistence(dir: string | undefined = process.env.LOG_DIR): void { + if (!dir || diskStream) return; + fs.mkdirSync(dir, { recursive: true }); + // Backfill the ring from the active file's tail BEFORE opening the write + // stream, so a restart shows pre-restart history immediately. + backfillFromDisk(path.join(dir, LOG_FILENAME)); + diskStream = createStream(LOG_FILENAME, { + path: dir, + interval: '1d', // rotate daily + size: '50M', // …and when a file reaches 50 MB + maxFiles: 30, // retain ≤30 rotated files + maxSize: '100M', // …AND ≤100 MB total (prune oldest past either bound) + }); +} + +function backfillFromDisk(file: string): void { + let raw: string; + try { + raw = fs.readFileSync(file, 'utf8'); + } catch { + return; // no prior file → nothing to backfill + } + const lines = raw.split('\n').filter(Boolean); + for (const line of lines.slice(-MAX_RING_ENTRIES)) { + try { + const e = JSON.parse(line) as LogEntry; + if (e && typeof e.ts === 'string' && typeof e.msg === 'string') { + ring.push(e); + } + } catch { + // Skip malformed/partial trailing lines. + } + } + if (ring.length > MAX_RING_ENTRIES) ring.splice(0, ring.length - MAX_RING_ENTRIES); } /** @@ -114,9 +175,26 @@ export function getRecentEntries(n: number): { entries: LogEntry[]; truncated: b return { entries, truncated: ring.length > entries.length }; } -/** Test-only: clear the ring and partial-line carry. */ +/** Test-only: clear the ring and partial-line carry (does not touch disk). */ export function __resetLogRingForTest(): void { ring.length = 0; carry.out = ''; carry.err = ''; } + +/** + * Test-only: flush + close the disk stream and resolve once buffered writes + * have hit the file, so a test can read the file or re-init backfill + * deterministically. (rfs buffers writes; in production a real restart flushes.) + */ +export function __flushDiskForTest(): Promise { + return new Promise((resolve) => { + if (!diskStream) { + resolve(); + return; + } + const s = diskStream; + diskStream = null; + s.end(() => resolve()); + }); +} diff --git a/backend/src/server.ts b/backend/src/server.ts index 3d919d5..839b17b 100644 --- a/backend/src/server.ts +++ b/backend/src/server.ts @@ -4,7 +4,7 @@ import { getApiKey } from './auth'; import { duckdbHealth } from './duckdbClient'; import { isProduction } from './env'; import { log } from './log'; -import { installLogRing, writeStdout } from './logRing'; +import { installLogRing, initLogPersistence, writeStdout } from './logRing'; import { DEFAULT_PORT, resolvePort } from './port'; // Tee stdout/stderr into the in-memory ring so the admin server-logs endpoint @@ -12,6 +12,10 @@ import { DEFAULT_PORT, resolvePort } from './port'; // all real logging is runtime (below + request handlers), so installing here // captures it. Idempotent. See logRing.ts / ADR-021. installLogRing(); +// Enable on-disk persistence + backfill the ring from prior history when +// LOG_DIR is set (compose sets it; unset = in-memory only). Must run before +// the boot banners below so they are persisted too. See ADR-022. +initLogPersistence(); const { port: PORT, warned: portUnsetWarning } = resolvePort(process.env.PORT); if (portUnsetWarning) { diff --git a/backend/tests/logPersistence.test.ts b/backend/tests/logPersistence.test.ts new file mode 100644 index 0000000..a6c1448 --- /dev/null +++ b/backend/tests/logPersistence.test.ts @@ -0,0 +1,82 @@ +import { describe, it, expect, beforeEach, afterEach } from 'vitest'; +import fs from 'node:fs'; +import os from 'node:os'; +import path from 'node:path'; +import { + initLogPersistence, + pushEntry, + getRecentEntries, + __resetLogRingForTest, + __flushDiskForTest, +} from '../src/logRing'; + +// Disk persistence + startup backfill (#178 / ADR-022). Gated on LOG_DIR: these +// tests pass an explicit tmp dir; unit tests that don't enable persistence stay +// in-memory and write nothing. + +let dir: string; + +beforeEach(() => { + __resetLogRingForTest(); + dir = fs.mkdtempSync(path.join(os.tmpdir(), 'hf-logs-')); +}); + +afterEach(async () => { + await __flushDiskForTest(); + __resetLogRingForTest(); + fs.rmSync(dir, { recursive: true, force: true }); +}); + +const entry = (msg: string) => + ({ ts: new Date().toISOString(), level: 'info', msg }) as const; + +describe('log persistence (#178/ADR-022)', () => { + it('appends each entry as a JSONL line to the log file', async () => { + initLogPersistence(dir); + pushEntry(entry('hf-persist alpha')); + pushEntry(entry('hf-persist bravo')); + // Flush buffered writes to disk before reading. + await __flushDiskForTest(); + + const file = path.join(dir, 'backend.log'); + const lines = fs.readFileSync(file, 'utf8').split('\n').filter(Boolean); + expect(lines).toHaveLength(2); + const parsed = lines.map((l) => JSON.parse(l)); + expect(parsed[0].msg).toBe('hf-persist alpha'); + expect(parsed[1].msg).toBe('hf-persist bravo'); + expect(parsed[0]).toHaveProperty('ts'); + expect(parsed[0]).toHaveProperty('level', 'info'); + }); + + it('backfills the ring from a prior file on startup (survives restart)', async () => { + // Simulate a previous process having written history, then exiting. + initLogPersistence(dir); + pushEntry(entry('hf-prior 1')); + pushEntry(entry('hf-prior 2')); + await __flushDiskForTest(); // flush + close == "process exit" + __resetLogRingForTest(); // ring cleared + + // New process: ring is empty until we re-init from disk. + expect(getRecentEntries(10).entries).toHaveLength(0); + initLogPersistence(dir); + const msgs = getRecentEntries(10).entries.map((e) => e.msg); + expect(msgs).toEqual(['hf-prior 1', 'hf-prior 2']); + }); + + it('is a no-op when no dir is provided (in-memory only, no file written)', () => { + initLogPersistence(undefined); + pushEntry(entry('hf-nomem')); + expect(getRecentEntries(10).entries.map((e) => e.msg)).toContain('hf-nomem'); + expect(fs.readdirSync(dir)).toHaveLength(0); // nothing persisted + }); + + it('skips malformed trailing lines during backfill', () => { + const file = path.join(dir, 'backend.log'); + fs.writeFileSync( + file, + `${JSON.stringify(entry('good 1'))}\n${JSON.stringify(entry('good 2'))}\n{partial`, + ); + initLogPersistence(dir); + expect(getRecentEntries(10).entries.map((e) => e.msg)).toEqual(['good 1', 'good 2']); + }); +}); diff --git a/docker-compose.prod.yml b/docker-compose.prod.yml index f418034..16043c4 100644 --- a/docker-compose.prod.yml +++ b/docker-compose.prod.yml @@ -17,6 +17,11 @@ services: - PORT=3001 - DUCKDB_SERVICE_URL=http://duckdb-service:8000 - IMAGE_SERVICE_URL=http://image-service:4444 + # On-disk server-log persistence (#178 / ADR-022): JSONL + daily rotation, + # retained ≤30 files / ≤100 MB, backfilled into the ring on restart. + - LOG_DIR=/var/log/highfive + volumes: + - backend_logs:/var/log/highfive restart: unless-stopped networks: - highfive-network @@ -75,6 +80,9 @@ services: # forwards X-Admin-Key) authenticates against this service's internal # /logs gate. - HIGHFIVE_API_KEY=${HIGHFIVE_API_KEY:?HIGHFIVE_API_KEY must be set in .env.production (see .env.production.example)} + # Server-log persistence (#178 / ADR-022). Distinct subdir of the shared + # duckdb_data volume so the two Flask services don't collide on one file. + - LOG_DIR=/data/logs/image volumes: - duckdb_data:/data restart: unless-stopped @@ -105,6 +113,9 @@ services: - SEED_DATA=false # Must match the backend's key for the #171 server-logs proxy. - HIGHFIVE_API_KEY=${HIGHFIVE_API_KEY:?HIGHFIVE_API_KEY must be set in .env.production (see .env.production.example)} + # Server-log persistence (#178 / ADR-022). Distinct subdir of the shared + # duckdb_data volume so the two Flask services don't collide on one file. + - LOG_DIR=/data/logs/duckdb volumes: - duckdb_data:/data restart: unless-stopped @@ -125,6 +136,9 @@ services: volumes: duckdb_data: + # Backend server logs (#178 / ADR-022). The Flask services persist under + # subdirs of duckdb_data; the backend has no data volume, so it gets its own. + backend_logs: networks: highfive-network: diff --git a/docker-compose.yml b/docker-compose.yml index a063c99..3bbc51e 100644 --- a/docker-compose.yml +++ b/docker-compose.yml @@ -21,6 +21,9 @@ services: PORT: '3002' DUCKDB_SERVICE_URL: http://duckdb-service:8000 IMAGE_SERVICE_URL: http://image-service:4444 + # On-disk server-log persistence (#178 / ADR-022): JSONL + daily rotation, + # retained ≤30 files / ≤100 MB, backfilled into the ring on restart. + LOG_DIR: /var/log/highfive volumes: # Bind-mount backend source for live reload, plus contracts so # type changes flow through. Anonymous volumes shadow the @@ -30,6 +33,7 @@ services: - ./contracts:/app/contracts - /app/node_modules - /app/backend/node_modules + - backend_logs:/var/log/highfive restart: unless-stopped networks: - net @@ -63,6 +67,9 @@ services: - '8000:4444' environment: DUCKDB_PATH: /data/app.duckdb + # Server-log persistence (#178 / ADR-022). Distinct subdir of the shared + # duckdb_data volume so the two Flask services don't collide on one file. + LOG_DIR: /data/logs/image volumes: - duckdb_data:/data restart: unless-stopped @@ -84,6 +91,9 @@ services: - .env environment: SEED_DATA: 'true' + # Server-log persistence (#178 / ADR-022). Distinct subdir of the shared + # duckdb_data volume so the two Flask services don't collide on one file. + LOG_DIR: /data/logs/duckdb # Dev-only: register /firmware.json + /firmware.app.bin proxy to # homepage:5173 so the ESP can fetch OTA artifacts in dev. In # prod, host-nginx serves these directly from homepage static. @@ -117,6 +127,9 @@ services: volumes: duckdb_data: + # Backend server logs (#178 / ADR-022). The Flask services persist under + # subdirs of duckdb_data; the backend has no data volume, so it gets its own. + backend_logs: networks: net: diff --git a/docs/07-deployment-view/docker-compose.md b/docs/07-deployment-view/docker-compose.md index a6f8f6d..d5c10c3 100644 --- a/docs/07-deployment-view/docker-compose.md +++ b/docs/07-deployment-view/docker-compose.md @@ -111,6 +111,20 @@ The DuckDB database is stored in the Docker volume: This volume is shared between the **image service** and the **DuckDB service** to persist the database and images across container restarts. +### Server log persistence (#178 / ADR-022) + +Each service also persists its admin **Server Logs** ring to disk (JSONL, daily +rotation, retained ≤30 files / ≤100 MB), gated on the `LOG_DIR` env var, and +backfills the ring from disk on restart so the panel shows pre-restart history: + +- **backend** → its own `backend_logs` volume at `/var/log/highfive`. +- **duckdb-service** and **image-service** → distinct subdirs of `duckdb_data` + (`/data/logs/duckdb`, `/data/logs/image`) so the two services sharing that + volume never collide on one file. + +`docker compose down -v` clears these along with `duckdb_data`. Unset `LOG_DIR` +to fall back to the in-memory-only ring (pre-ADR-022 behaviour). + During development it may be necessary to reset the database, for example when **primary key conflicts** occur due to previously inserted test data. diff --git a/docs/08-crosscutting-concepts/auth.md b/docs/08-crosscutting-concepts/auth.md index be73341..08bb21b 100644 --- a/docs/08-crosscutting-concepts/auth.md +++ b/docs/08-crosscutting-concepts/auth.md @@ -270,6 +270,26 @@ is being staged): map view are unaffected (the saved geolocation from first boot persists in module config). +## Server logs: secrets must never be logged (ADR-022) + +The admin **Server Logs** panel tails each service's own log ring +(`GET /api/admin/logs`, `requireAdmin`). As of [ADR-022](../09-architecture-decisions/adr-022-persistent-structured-server-logs.md) +that ring is **persisted to disk** (JSONL, 30 days / 100 MB, gated on `LOG_DIR`) and +backfilled on restart — so anything printed to stdout no longer just flashes past in +`docker logs`, it lands on disk for up to a month. That makes "never print secrets" +load-bearing, not advisory. Three controls keep credentials out of the ring: + +- **Access logs are path-only.** The per-request access entry is `method path status ms` + using the request **path only** — never headers, body, or query string — so the + `X-Admin-Key` header, the `POST /api/admin/login` body password, and any `?token=`/`?key=` + query value cannot reach the ring. (`accessLog.ts`; Flask `@app.after_request`.) +- **The dev admin-key banner bypasses the ring.** `server.ts` prints the dev key via the + ring-bypassing `writeStdout`, so it reaches the terminal as a developer convenience but is + not captured into the ring/disk — and the whole block is suppressed in production by + `auth.ts`'s boot guards (see [The secret](#the-secret)). +- **The endpoint stays admin-only.** The ring may still capture whatever other code prints, + so the gate is the backstop — and the reason not to `console.log`/`print` secrets anywhere. + ## Why one secret, two header names See [ADR-003](../09-architecture-decisions/adr-003-shared-api-key-for-admin.md). diff --git a/docs/09-architecture-decisions/README.md b/docs/09-architecture-decisions/README.md index f594ac5..b3da898 100644 --- a/docs/09-architecture-decisions/README.md +++ b/docs/09-architecture-decisions/README.md @@ -29,7 +29,8 @@ and link backwards. | [018](adr-018-captive-portal-wifi-only.md) | Captive portal is Wi-Fi-only; identity / URLs / camera defaulted in firmware | Accepted | | [019](adr-019-admin-session-no-bundle-secret.md) | Admin auth is a server-side session cookie; no secret in the homepage bundle | Accepted | | [020](adr-020-coordinate-generalization.md) | Module coordinates are generalized to ~1 km (2 dp) for every caller | Accepted | -| [021](adr-021-admin-server-log-ring.md) | Admin server logs via an in-process stdout/stderr ring, not the Docker socket | Accepted | +| [021](adr-021-admin-server-log-ring.md) | Admin server logs via an in-process stdout/stderr ring, not the Docker socket | Superseded by [022](adr-022-persistent-structured-server-logs.md) | +| [022](adr-022-persistent-structured-server-logs.md) | Server logs become structured, disk-persisted (30 d / 100 MB), SSE-streamed | Accepted | ## When to create an ADR diff --git a/docs/09-architecture-decisions/adr-021-admin-server-log-ring.md b/docs/09-architecture-decisions/adr-021-admin-server-log-ring.md index 38330e0..d500a54 100644 --- a/docs/09-architecture-decisions/adr-021-admin-server-log-ring.md +++ b/docs/09-architecture-decisions/adr-021-admin-server-log-ring.md @@ -7,6 +7,14 @@ Accepted ([#171](https://github.com/schutera/highfive/issues/171)). Spun out of server-side visibility the per-module telemetry sidecars (`GET /api/modules/:id/logs`) do not provide. +**Superseded in part by [ADR-022](adr-022-persistent-structured-server-logs.md) +([#178](https://github.com/schutera/highfive/issues/178)):** the ring is no longer +in-memory-only and the wire shape is no longer raw `string[]`. ADR-022 makes entries +structured (`{ ts, level, msg }`), persists them to disk (30 days / 100 MB, surviving +restart), and adds SSE streaming. The endpoint design, the admin gate, the per-service +ring + backend-proxy topology, and the "never `print`/`console.log` secrets" rule below +all carry forward unchanged. + ## Context There was no way to read the services' **own** process logs (backend, diff --git a/docs/09-architecture-decisions/adr-022-persistent-structured-server-logs.md b/docs/09-architecture-decisions/adr-022-persistent-structured-server-logs.md new file mode 100644 index 0000000..e2dd0a2 --- /dev/null +++ b/docs/09-architecture-decisions/adr-022-persistent-structured-server-logs.md @@ -0,0 +1,102 @@ +# ADR-022: Server logs become structured, disk-persisted, and SSE-streamed + +## Status + +Accepted ([#178](https://github.com/schutera/highfive/issues/178)). Supersedes the +in-memory-only and raw-`string[]` consequences of +[ADR-021](adr-021-admin-server-log-ring.md); builds directly on its per-service ring + +backend-proxy topology and its admin gate (which are unchanged). Delivered in phases: +structured entries + access logging + disk persistence (this ADR's core), then SSE live +tail. Auth model: [ADR-019](adr-019-admin-session-no-bundle-secret.md) / +[ADR-003](adr-003-shared-api-key-for-admin.md). + +## Context + +ADR-021 shipped the admin **Server Logs** panel and a per-service ring, but the data +source was nearly empty: the services barely wrote to stdout (boot banners + error-branch +`console.error`/`print`), the ring was **in-memory only** (reset on every restart), and +each line was an unstructured `string`. The panel worked; it just had ~2 lines to show, +none of which survived a deploy. Operators wanted a genuine live console: structured, +timestamped, filterable lines; per-request access logs (so traffic is visible on success +too); history that survives a restart; and a live "tail -f" without manual refresh. + +Three forces shaped the decision: + +- **Structure must be pinned once.** Per CLAUDE.md / ADR-004 the wire type crosses the + backend ↔ homepage boundary, so it must live in `@highfive/contracts`, not be re-declared + per service. +- **Persistence raises the credential stakes.** A secret printed to stdout no longer just + flashes past in `docker logs`; it lands in the ring **and on disk for up to 30 days**. + The "never print secrets" rule becomes load-bearing rather than advisory. +- **Retention must be bounded two ways.** Disk is finite and access logs accumulate, so + the store is capped by **both** age (30 days) and total size (100 MB) — whichever bites + first prunes oldest. + +## Decision + +Replace the raw line ring with a ring of structured **`LogEntry { ts, level, msg }`** +(`ts` ISO 8601, `level` ∈ `info|warn|error`), defined once in +[`contracts/src/index.ts`](../../contracts/src/index.ts) and used by every consumer. +`ServerLogsResponse` carries `entries: LogEntry[]` (was `lines: string[]`). + +Each service feeds its ring through **two no-double-capture paths**: the existing +stdout/stderr tee (stray `print`/`console.*` → `info`/`error` entries) and a **structured +logger** (`backend/src/log.ts` `log.*`; Flask `log_ring.log_event`) that pushes an entry +directly and writes a human line to the *saved original* stream so the tee can't re-capture +it. A per-request **access-log** middleware (Express `accessLog`; Flask +`@app.after_request`) emits one `method path status ms` entry per request, level by status +(`≥500` error, `≥400` warn, else info) — logged **path-only**, never headers, body, or +query string, so no credential can reach the ring. + +**Persistence** is gated on a `LOG_DIR` env var (set in compose; unset = in-memory only, as +ADR-021). When set, each entry is also appended as one JSON object per line (**JSONL**) to a +rotating file, and at startup the ring is **backfilled** from the file's tail so the panel +shows pre-restart history immediately. Rotation keeps **≤30 daily files AND ≤100 MB total** +(prune oldest past either bound). The backend uses the `rotating-file-stream` npm package +(both bounds native via `maxFiles` + `maxSize`); the Flask services use stdlib +`TimedRotatingFileHandler` subclassed with a 100 MB size-prune sweep after each rollover. +The backend writes to its own `backend_logs` volume; the two Flask services write to +**distinct subdirs** of the shared `duckdb_data` volume (`/data/logs/duckdb`, +`/data/logs/image`) so they never collide on one file. + +Live tail is **SSE** (`GET /api/admin/logs/stream`, `requireAdmin`): the backend streams its +own ring's emitter and pipes the Flask services' internal `/logs/stream`; the REST +`GET /api/admin/logs` stays for the initial backfill. (SSE lands in the streaming phase of +#178; the contract and ring emitter are designed for it here.) + +## Consequences + +### Positive + +- The panel is a real operations console: structured, color-coded, timestamped lines that + fill with live traffic and **survive restarts** (history from disk). +- One wire type in `contracts/` — drift is a compile error (ADR-004), and the backend proxy + rejects a drifted Flask envelope (`502`) rather than leaking `undefined` to the UI. +- Retention is bounded on both axes, so the store can't grow without limit. +- `LOG_DIR`-gating keeps unit tests and any "in-memory is fine" deployment on the old path + with zero file I/O. + +### Negative + +- **Secrets-on-disk risk is real now.** Anything printed to stdout persists up to 30 days. + Mitigations: access logs are path-only; the dev admin-key banner is written via the + ring-bypassing `writeStdout` (terminal only, not the ring/disk) and is suppressed in prod + by `auth.ts`'s boot guards; the endpoint stays admin-only. See + [auth.md](../08-crosscutting-concepts/auth.md). +- **Access logs are closer to an audit log** than ADR-021's "recent tail" — 30 days of + request paths. Paths must carry no personal data (they don't today: MACs and module ids, + no PII). Tracked in [chapter 11](../11-risks-and-technical-debt/README.md). +- **The SSE emitter is per-process.** A future multi-worker backend (gunicorn/PM2 cluster) + would stream only the serving worker's live entries; history via the shared disk file is + still complete. Revisit if/when workers multiply. +- New runtime dependency (`rotating-file-stream`) on the Node side. + +### Forbidden + +- **Never `console.log` / `print` a secret** (API key, admin password, session token, auth + header). It now lands on disk, not just in a transient stream. Use `writeStdout`/the + original stream for terminal-only dev hints that must stay out of the ring. +- **Never log request bodies, headers, or query strings** in the access path — `method`, + `path`, `status`, `duration` only. +- **Never re-declare `LogEntry` / `ServerLogsResponse` as a service-local interface** — the + type lives only in `@highfive/contracts`. diff --git a/docs/11-risks-and-technical-debt/README.md b/docs/11-risks-and-technical-debt/README.md index fecbab0..0399253 100644 --- a/docs/11-risks-and-technical-debt/README.md +++ b/docs/11-risks-and-technical-debt/README.md @@ -78,6 +78,18 @@ fixed in commit `778c9b1`. Don't reintroduce them. / data-protection notice does not mention this; if HiveHive ever reaches an audience that warrants a real GDPR posture, this flow needs to surface there. Tracked here, not as a bug. +- **Server logs are persisted for up to 30 days (#178, [ADR-022](../09-architecture-decisions/adr-022-persistent-structured-server-logs.md)).** + The admin server-log ring is now written to disk (JSONL, 30 days / 100 MB, + gated on `LOG_DIR`) and backfilled on restart. Two standing obligations: (1) the + per-request access logs are closer to an audit log than ADR-021's "recent tail" — they + must stay **path-only** (no headers/body/query) and request paths must carry no PII + (today: MACs + module ids, no personal data); (2) **no secret may ever be `print`/`console.log`-ed**, + since it would now persist on disk for a month, not just flash past in `docker logs`. + Both are enforced in code + tests, recorded here as a load-bearing invariant. +- **The server-log SSE emitter is per-process (#178 / ADR-022).** Live "tail -f" streams + only the serving worker's in-process entries. Single-process today, so complete. A future + multi-worker backend (gunicorn/PM2 cluster) would stream only one worker's live entries; + history via the shared on-disk file stays complete. Revisit if/when workers multiply. ## Lessons learned diff --git a/duckdb-service/app.py b/duckdb-service/app.py index 2004321..17c8d3a 100644 --- a/duckdb-service/app.py +++ b/duckdb-service/app.py @@ -14,6 +14,7 @@ from routes.progress import progress_bp from routes.heartbeats import heartbeats_bp from services.backup import run_backup +from services.log_ring import init_persistence as init_log_persistence from services.log_ring import install as install_log_ring from services.log_ring import log_event from services.silence_watcher import check_silence @@ -24,6 +25,10 @@ # print() re-resolves sys.stdout per call and Flask/werkzeug log handlers are # constructed lazily at app.run, so capture is complete. See services/log_ring.py. install_log_ring() +# Enable on-disk persistence + backfill the ring from prior history when LOG_DIR +# is set (compose sets it; unset = in-memory only). Before the banner so it is +# persisted too. See ADR-022. +init_log_persistence() # Structured boot banner through the logger (#178) — the analogue to the # backend's server.ts banner. Runs at import under flask run / gunicorn and diff --git a/duckdb-service/services/log_ring.py b/duckdb-service/services/log_ring.py index 7a34db9..9d3153c 100644 --- a/duckdb-service/services/log_ring.py +++ b/duckdb-service/services/log_ring.py @@ -22,10 +22,15 @@ constructed lazily at ``app.run`` / first request — bind to the tee. """ +import glob +import json +import logging +import os import sys import threading from collections import deque from datetime import UTC, datetime +from logging.handlers import TimedRotatingFileHandler _MAX_RING_ENTRIES = 2000 _ring: "deque[dict]" = deque(maxlen=_MAX_RING_ENTRIES) @@ -37,6 +42,14 @@ _real_stdout = None _real_stderr = None +# On-disk persistence (#178 / ADR-022). Gated on LOG_DIR: when set, each entry is +# also appended as one JSON object per line (JSONL) to a rotating file, and the +# ring is backfilled from that file at startup so history survives a restart. +# Rotation: daily, retain ≤30 files AND ≤100 MB total (prune oldest past either). +_LOG_FILENAME = "service.log" # each service has its OWN LOG_DIR (no collision) +_MAX_TOTAL_BYTES = 100 * 1024 * 1024 +_disk_logger: "logging.Logger | None" = None + def _now_iso() -> str: # Millisecond precision, 'Z' suffix — matches the LogEntry contract. @@ -44,8 +57,83 @@ def _now_iso() -> str: def _push(level: str, msg: str) -> None: + entry = {"ts": _now_iso(), "level": level, "msg": msg} with _lock: - _ring.append({"ts": _now_iso(), "level": level, "msg": msg}) + _ring.append(entry) + if _disk_logger is not None: + try: + _disk_logger.info(json.dumps(entry, ensure_ascii=False)) + except Exception: + # Persistence must never break in-memory logging. + pass + + +def _prune_by_size(log_path: str) -> None: + """Delete oldest rotated files until the total of * is ≤ 100 MB. + Never deletes the active file.""" + files = sorted(glob.glob(log_path + "*"), key=lambda p: os.path.getmtime(p)) + total = sum(os.path.getsize(p) for p in files if os.path.exists(p)) + for p in files: + if total <= _MAX_TOTAL_BYTES: + break + if p == log_path: + continue # keep the active file + try: + total -= os.path.getsize(p) + os.remove(p) + except OSError: + pass + + +class _PruningTimedHandler(TimedRotatingFileHandler): + """Daily rotation + backupCount, with a 100 MB total-size sweep after each + rollover so both retention bounds hold.""" + + def doRollover(self): # noqa: N802 (stdlib name) + super().doRollover() + _prune_by_size(self.baseFilename) + + +def _backfill_from_disk(log_path: str) -> None: + """Load the tail of the active file into the ring so a restart shows + pre-restart history. Skips malformed/partial lines.""" + try: + with open(log_path, encoding="utf-8") as fh: + lines = [ln for ln in fh.read().split("\n") if ln] + except OSError: + return + for ln in lines[-_MAX_RING_ENTRIES:]: + try: + entry = json.loads(ln) + except (ValueError, TypeError): + continue + if isinstance(entry, dict) and "ts" in entry and "msg" in entry: + with _lock: + _ring.append(entry) + + +def init_persistence(log_dir: "str | None" = None) -> None: + """Enable on-disk persistence + startup backfill. Idempotent. Reads LOG_DIR + when no dir is passed; a falsy value is a no-op (ring stays in-memory).""" + global _disk_logger + if _disk_logger is not None: + return + log_dir = log_dir if log_dir is not None else os.getenv("LOG_DIR") + if not log_dir: + return + os.makedirs(log_dir, exist_ok=True) + log_path = os.path.join(log_dir, _LOG_FILENAME) + _backfill_from_disk(log_path) # before opening the handler + logger = logging.getLogger("hf_log_persistence") + logger.setLevel(logging.INFO) + logger.propagate = False # don't reach root/stdout → no double-capture via the tee + handler = _PruningTimedHandler( + log_path, when="midnight", backupCount=30, encoding="utf-8" + ) + handler.setFormatter(logging.Formatter("%(message)s")) # raw JSON line, no prefix + logger.addHandler(handler) + _disk_logger = logger + _prune_by_size(log_path) class _TeeStream: @@ -131,9 +219,16 @@ def get_recent(n): def _reset_for_test(): + global _disk_logger with _lock: _ring.clear() # Also clear any installed tee's partial-line carry so a half-line from a # previous test can't bleed into the next. for tee in _tees: tee._carry = "" + # Tear down disk persistence so a later test can re-init against a fresh dir. + if _disk_logger is not None: + for h in list(_disk_logger.handlers): + h.close() + _disk_logger.removeHandler(h) + _disk_logger = None diff --git a/duckdb-service/tests/test_logs.py b/duckdb-service/tests/test_logs.py index dd209be..e055b48 100644 --- a/duckdb-service/tests/test_logs.py +++ b/duckdb-service/tests/test_logs.py @@ -1,6 +1,7 @@ """Tests for the internal admin-gated GET /logs server-log tail (#171, #178).""" import io +import json import sys from services import log_ring @@ -142,3 +143,37 @@ def test_access_log_redacts_query_and_credentials(client): assert "X-Admin-Key" not in blob # The path itself is still logged (without the query). assert any(e["msg"].startswith("GET /logs 200 ") for e in log_ring.get_recent(50)[0]) + + +def test_persistence_writes_jsonl(tmp_path): + log_ring._reset_for_test() + log_ring.init_persistence(str(tmp_path)) + log_ring._push("info", "persist alpha") + log_ring._push("warn", "persist bravo") + log_ring._reset_for_test() # closes the handler, flushing to disk + + lines = (tmp_path / "service.log").read_text(encoding="utf-8").splitlines() + parsed = [json.loads(ln) for ln in lines if ln] + assert [p["msg"] for p in parsed] == ["persist alpha", "persist bravo"] + assert parsed[0]["level"] == "info" and parsed[1]["level"] == "warn" + assert set(parsed[0].keys()) == {"ts", "level", "msg"} + + +def test_persistence_backfills_ring_on_restart(tmp_path): + log_ring._reset_for_test() + log_ring.init_persistence(str(tmp_path)) + log_ring._push("info", "prior 1") + log_ring._push("info", "prior 2") + log_ring._reset_for_test() # ring cleared + handler closed == "process exit" + assert log_ring.get_recent(10)[0] == [] + + log_ring.init_persistence(str(tmp_path)) # "restart" + assert _msgs(log_ring.get_recent(10)[0]) == ["prior 1", "prior 2"] + + +def test_persistence_is_noop_without_dir(tmp_path): + log_ring._reset_for_test() + log_ring.init_persistence(None) # no LOG_DIR → in-memory only + log_ring._push("info", "in-memory only") + assert "in-memory only" in _msgs(log_ring.get_recent(10)[0]) + assert list(tmp_path.iterdir()) == [] diff --git a/image-service/app.py b/image-service/app.py index cd09726..80b93d9 100644 --- a/image-service/app.py +++ b/image-service/app.py @@ -12,6 +12,7 @@ from services.discord import send_discord_message from services.duckdb import DuckDBService from services.log_ring import get_recent as _get_recent_logs +from services.log_ring import init_persistence as init_log_persistence from services.log_ring import install as install_log_ring from services.log_ring import log_event from services.module_id import ModuleId @@ -23,6 +24,10 @@ # print() re-resolves sys.stdout per call and Flask/werkzeug log handlers are # constructed lazily at app.run, so capture is complete. See services/log_ring.py. install_log_ring() +# Enable on-disk persistence + backfill the ring from prior history when LOG_DIR +# is set (compose sets it; unset = in-memory only). Before the banner so it is +# persisted too. See ADR-022. +init_log_persistence() # Structured boot banner through the logger (#178) — the analogue to the # backend's server.ts banner. Runs at import under flask run / gunicorn and diff --git a/image-service/services/log_ring.py b/image-service/services/log_ring.py index 7a34db9..9d3153c 100644 --- a/image-service/services/log_ring.py +++ b/image-service/services/log_ring.py @@ -22,10 +22,15 @@ constructed lazily at ``app.run`` / first request — bind to the tee. """ +import glob +import json +import logging +import os import sys import threading from collections import deque from datetime import UTC, datetime +from logging.handlers import TimedRotatingFileHandler _MAX_RING_ENTRIES = 2000 _ring: "deque[dict]" = deque(maxlen=_MAX_RING_ENTRIES) @@ -37,6 +42,14 @@ _real_stdout = None _real_stderr = None +# On-disk persistence (#178 / ADR-022). Gated on LOG_DIR: when set, each entry is +# also appended as one JSON object per line (JSONL) to a rotating file, and the +# ring is backfilled from that file at startup so history survives a restart. +# Rotation: daily, retain ≤30 files AND ≤100 MB total (prune oldest past either). +_LOG_FILENAME = "service.log" # each service has its OWN LOG_DIR (no collision) +_MAX_TOTAL_BYTES = 100 * 1024 * 1024 +_disk_logger: "logging.Logger | None" = None + def _now_iso() -> str: # Millisecond precision, 'Z' suffix — matches the LogEntry contract. @@ -44,8 +57,83 @@ def _now_iso() -> str: def _push(level: str, msg: str) -> None: + entry = {"ts": _now_iso(), "level": level, "msg": msg} with _lock: - _ring.append({"ts": _now_iso(), "level": level, "msg": msg}) + _ring.append(entry) + if _disk_logger is not None: + try: + _disk_logger.info(json.dumps(entry, ensure_ascii=False)) + except Exception: + # Persistence must never break in-memory logging. + pass + + +def _prune_by_size(log_path: str) -> None: + """Delete oldest rotated files until the total of * is ≤ 100 MB. + Never deletes the active file.""" + files = sorted(glob.glob(log_path + "*"), key=lambda p: os.path.getmtime(p)) + total = sum(os.path.getsize(p) for p in files if os.path.exists(p)) + for p in files: + if total <= _MAX_TOTAL_BYTES: + break + if p == log_path: + continue # keep the active file + try: + total -= os.path.getsize(p) + os.remove(p) + except OSError: + pass + + +class _PruningTimedHandler(TimedRotatingFileHandler): + """Daily rotation + backupCount, with a 100 MB total-size sweep after each + rollover so both retention bounds hold.""" + + def doRollover(self): # noqa: N802 (stdlib name) + super().doRollover() + _prune_by_size(self.baseFilename) + + +def _backfill_from_disk(log_path: str) -> None: + """Load the tail of the active file into the ring so a restart shows + pre-restart history. Skips malformed/partial lines.""" + try: + with open(log_path, encoding="utf-8") as fh: + lines = [ln for ln in fh.read().split("\n") if ln] + except OSError: + return + for ln in lines[-_MAX_RING_ENTRIES:]: + try: + entry = json.loads(ln) + except (ValueError, TypeError): + continue + if isinstance(entry, dict) and "ts" in entry and "msg" in entry: + with _lock: + _ring.append(entry) + + +def init_persistence(log_dir: "str | None" = None) -> None: + """Enable on-disk persistence + startup backfill. Idempotent. Reads LOG_DIR + when no dir is passed; a falsy value is a no-op (ring stays in-memory).""" + global _disk_logger + if _disk_logger is not None: + return + log_dir = log_dir if log_dir is not None else os.getenv("LOG_DIR") + if not log_dir: + return + os.makedirs(log_dir, exist_ok=True) + log_path = os.path.join(log_dir, _LOG_FILENAME) + _backfill_from_disk(log_path) # before opening the handler + logger = logging.getLogger("hf_log_persistence") + logger.setLevel(logging.INFO) + logger.propagate = False # don't reach root/stdout → no double-capture via the tee + handler = _PruningTimedHandler( + log_path, when="midnight", backupCount=30, encoding="utf-8" + ) + handler.setFormatter(logging.Formatter("%(message)s")) # raw JSON line, no prefix + logger.addHandler(handler) + _disk_logger = logger + _prune_by_size(log_path) class _TeeStream: @@ -131,9 +219,16 @@ def get_recent(n): def _reset_for_test(): + global _disk_logger with _lock: _ring.clear() # Also clear any installed tee's partial-line carry so a half-line from a # previous test can't bleed into the next. for tee in _tees: tee._carry = "" + # Tear down disk persistence so a later test can re-init against a fresh dir. + if _disk_logger is not None: + for h in list(_disk_logger.handlers): + h.close() + _disk_logger.removeHandler(h) + _disk_logger = None diff --git a/image-service/tests/test_logs.py b/image-service/tests/test_logs.py index 9f32922..8eab0dd 100644 --- a/image-service/tests/test_logs.py +++ b/image-service/tests/test_logs.py @@ -1,6 +1,7 @@ """Tests for the internal admin-gated GET /logs server-log tail (#171, #178).""" import io +import json import sys from services import log_ring @@ -140,3 +141,37 @@ def test_access_log_redacts_query_and_credentials(client): assert "X-Admin-Key" not in blob # The path itself is still logged (without the query). assert any(e["msg"].startswith("GET /logs 200 ") for e in log_ring.get_recent(50)[0]) + + +def test_persistence_writes_jsonl(tmp_path): + log_ring._reset_for_test() + log_ring.init_persistence(str(tmp_path)) + log_ring._push("info", "persist alpha") + log_ring._push("warn", "persist bravo") + log_ring._reset_for_test() # closes the handler, flushing to disk + + lines = (tmp_path / "service.log").read_text(encoding="utf-8").splitlines() + parsed = [json.loads(ln) for ln in lines if ln] + assert [p["msg"] for p in parsed] == ["persist alpha", "persist bravo"] + assert parsed[0]["level"] == "info" and parsed[1]["level"] == "warn" + assert set(parsed[0].keys()) == {"ts", "level", "msg"} + + +def test_persistence_backfills_ring_on_restart(tmp_path): + log_ring._reset_for_test() + log_ring.init_persistence(str(tmp_path)) + log_ring._push("info", "prior 1") + log_ring._push("info", "prior 2") + log_ring._reset_for_test() # ring cleared + handler closed == "process exit" + assert log_ring.get_recent(10)[0] == [] + + log_ring.init_persistence(str(tmp_path)) # "restart" + assert _msgs(log_ring.get_recent(10)[0]) == ["prior 1", "prior 2"] + + +def test_persistence_is_noop_without_dir(tmp_path): + log_ring._reset_for_test() + log_ring.init_persistence(None) # no LOG_DIR → in-memory only + log_ring._push("info", "in-memory only") + assert "in-memory only" in _msgs(log_ring.get_recent(10)[0]) + assert list(tmp_path.iterdir()) == [] diff --git a/package-lock.json b/package-lock.json index 59e4ccc..45301cc 100644 --- a/package-lock.json +++ b/package-lock.json @@ -31,7 +31,8 @@ "cookie-parser": "^1.4.7", "cors": "^2.8.5", "dotenv": "^17.2.3", - "express": "^4.18.2" + "express": "^4.18.2", + "rotating-file-stream": "^2.1.6" }, "devDependencies": { "@types/cookie-parser": "^1.4.10", @@ -48,6 +49,18 @@ "node": ">=22.12.0" } }, + "backend/node_modules/rotating-file-stream": { + "version": "2.1.6", + "resolved": "https://registry.npmjs.org/rotating-file-stream/-/rotating-file-stream-2.1.6.tgz", + "integrity": "sha512-qS0ndAlDu80MMXeRonqGMXslF0FErzcUSbcXhus3asRG4cvCS79hc5f7s0x4bPAsH6wAwyHVIeARg69VUe3JmQ==", + "license": "MIT", + "engines": { + "node": ">=10.0" + }, + "funding": { + "url": "https://www.blockchain.com/btc/address/12p1p5q7sK75tPyuesZmssiMYr4TKzpSCN" + } + }, "contracts": { "name": "@highfive/contracts", "version": "0.0.0", From 88d22efd39859dd06e26ad251603553d8bad3410 Mon Sep 17 00:00:00 2001 From: Claude Date: Fri, 19 Jun 2026 16:55:18 +0000 Subject: [PATCH 06/15] feat: SSE live tail for the admin Server Logs panel (#178 phase 4) MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit The panel now streams new entries live after the REST backfill — "tail -f" with follow-mode auto-scroll — across all three services. - Ring pub/sub at the existing choke points: backend logRing.ts emits from pushEntryInternal to a Set of subscribers (subscribeEntries); Flask _push fans entries out to per-connection bounded queues (subscribe/unsubscribe), drop-on-full so the logger never blocks. - Backend GET /api/admin/logs/stream (requireAdmin): streams its own ring for `backend`; for the Flask services it awaits the upstream fetch first (connect failure still 502s) then pipes their /logs/stream through. Sets X-Accel-Buffering: no + a 25s keepalive; cleans up on client close. New logStream.ts helper keeps the route thin. - Flask /logs/stream (both services, X-Admin-Key gated): stream_with_context generator yielding one LogEntry per data: event + keepalives, unsubscribe in finally on disconnect. - Frontend: api.streamServerLogs() -> EventSource(withCredentials); panel does REST backfill then opens the stream, appends live entries (capped), follow mode with "Jump to latest", and a live/connecting indicator. EventSource stub added to test-setup for jsdom. - Docs: host-nginx proxy_buffering off block, api-reference + api-contracts SSE endpoint. No wire-shape change — the SSE event payload is one LogEntry. Tests: ring subscribe/unsubscribe; backend SSE endpoint over a real ephemeral server (backend ring) + mocked ReadableStream pipe + 502 on connect-fail; Flask subscribe/route stream; component live-append + close-on-unmount/service-change. Gate green: backend 197, homepage 164, duckdb 201, image 77; tsc/ruff/build/ check-citations clean. (Full live-stream E2E is the Phase 5 Playwright spec.) Co-Authored-By: Claude Opus 4.8 (1M context) Claude-Session: https://claude.ai/code/session_01HU7JHMbvtUKRfZY98qeWdK --- backend/src/app.ts | 48 +++++++ backend/src/logRing.ts | 27 +++- backend/src/logStream.ts | 50 +++++++ backend/tests/logRing.test.ts | 16 +++ backend/tests/logStream.test.ts | 130 ++++++++++++++++++ .../production-deployment.md | 16 +++ .../08-crosscutting-concepts/api-contracts.md | 8 +- docs/api-reference.md | 26 ++++ duckdb-service/routes/logs.py | 36 ++++- duckdb-service/services/log_ring.py | 29 ++++ duckdb-service/tests/test_logs.py | 43 ++++++ homepage/src/__tests__/AdminPage.test.tsx | 8 ++ .../src/__tests__/ServerLogsPanel.test.tsx | 78 ++++++++++- homepage/src/components/ServerLogsPanel.tsx | 107 ++++++++++++-- homepage/src/services/api.ts | 12 ++ homepage/src/test-setup.ts | 21 +++ image-service/app.py | 43 +++++- image-service/services/log_ring.py | 29 ++++ image-service/tests/test_logs.py | 43 ++++++ 19 files changed, 749 insertions(+), 21 deletions(-) create mode 100644 backend/src/logStream.ts create mode 100644 backend/tests/logStream.test.ts diff --git a/backend/src/app.ts b/backend/src/app.ts index d6e4cb4..1d094fa 100644 --- a/backend/src/app.ts +++ b/backend/src/app.ts @@ -1,3 +1,4 @@ +import { Readable } from 'node:stream'; import express from 'express'; import cors from 'cors'; import cookieParser from 'cookie-parser'; @@ -7,6 +8,7 @@ import { db } from './database'; import { verifyApiKey, getApiKey } from './auth'; import { accessLog } from './accessLog'; import { getRecentEntries } from './logRing'; +import { streamBackendRing, writeSseHeaders } from './logStream'; import { SESSION_COOKIE, issueSessionToken, @@ -648,3 +650,49 @@ app.get('/api/admin/logs', requireAdmin, async (req, res) => { res.status(502).json({ error: `${service} unreachable` }); } }); + +// SSE live tail (#178 Phase 4). One `LogEntry` JSON per `data:` event. The panel +// fetches GET /api/admin/logs once for backfill, then opens this for live tail. +// `backend` streams its own ring; the two Flask services are piped from their +// internal `/logs/stream` (X-Admin-Key forwarded). See ADR-022 / logStream.ts. +app.get('/api/admin/logs/stream', requireAdmin, async (req, res) => { + const service = String(req.query.service ?? ''); + if (!(LOG_SERVICES as readonly string[]).includes(service)) { + res.status(400).json({ + error: `invalid service; expected one of: ${LOG_SERVICES.join(', ')}`, + }); + return; + } + + if (service === 'backend') { + writeSseHeaders(res); + const cleanup = streamBackendRing(res); + req.on('close', cleanup); + return; + } + + // Proxy the Flask service's SSE stream. Connect FIRST so a failure still + // surfaces as 502 before we commit to a 200 event-stream response. + const base = service === 'duckdb-service' ? DUCKDB_URL : IMAGE_SERVICE_URL; + const controller = new AbortController(); + req.on('close', () => controller.abort()); + try { + const upstream = await fetch(`${base}/logs/stream`, { + headers: { 'X-Admin-Key': getApiKey() }, + signal: controller.signal, + }); + if (!upstream.ok || !upstream.body) { + res.status(502).json({ error: `Failed to open ${service} log stream` }); + return; + } + writeSseHeaders(res); + // Pipe the upstream SSE bytes straight through (Flask emits the same + // `data:`/keepalive framing and its own keepalives). + Readable.fromWeb(upstream.body as Parameters[0]).pipe(res); + } catch (error) { + if (controller.signal.aborted) return; // client went away mid-connect + console.error('[GET /api/admin/logs/stream]', { service, error: String(error) }); + if (!res.headersSent) res.status(502).json({ error: `${service} unreachable` }); + else res.end(); + } +}); diff --git a/backend/src/logRing.ts b/backend/src/logRing.ts index aa74d51..759f281 100644 --- a/backend/src/logRing.ts +++ b/backend/src/logRing.ts @@ -51,6 +51,11 @@ const carry: Record<'out' | 'err', string> = { out: '', err: '' }; const LOG_FILENAME = 'backend.log'; +// Live subscribers for SSE (#178 Phase 4). Every entry — from the tee or the +// structured logger — flows through pushEntryInternal, so emitting here is the +// single broadcast point. A Set (not EventEmitter) avoids MaxListeners warnings. +const subscribers = new Set<(entry: LogEntry) => void>(); + function pushEntryInternal(entry: LogEntry): void { ring.push(entry); if (ring.length > MAX_RING_ENTRIES) { @@ -63,6 +68,25 @@ function pushEntryInternal(entry: LogEntry): void { // Persistence must never break in-memory logging. } } + for (const cb of subscribers) { + try { + cb(entry); + } catch { + // A broken subscriber must never break logging or other subscribers. + } + } +} + +/** + * Subscribe to live entries (SSE live tail). The callback fires for every new + * entry from any ingestion path. Returns an unsubscribe function — call it on + * client disconnect. + */ +export function subscribeEntries(cb: (entry: LogEntry) => void): () => void { + subscribers.add(cb); + return () => { + subscribers.delete(cb); + }; } /** @@ -175,11 +199,12 @@ export function getRecentEntries(n: number): { entries: LogEntry[]; truncated: b return { entries, truncated: ring.length > entries.length }; } -/** Test-only: clear the ring and partial-line carry (does not touch disk). */ +/** Test-only: clear the ring, partial-line carry, and subscribers (not disk). */ export function __resetLogRingForTest(): void { ring.length = 0; carry.out = ''; carry.err = ''; + subscribers.clear(); } /** diff --git a/backend/src/logStream.ts b/backend/src/logStream.ts new file mode 100644 index 0000000..fc1f8f6 --- /dev/null +++ b/backend/src/logStream.ts @@ -0,0 +1,50 @@ +// SSE helpers for the admin log live-tail (#178 Phase 4). +// +// Two sources feed the `GET /api/admin/logs/stream` route: +// - the backend's own ring (subscribe to logRing and write each entry), and +// - a Flask service's internal `/logs/stream` (piped through by the route). +// This module owns the SSE header contract + the backend-ring streamer so the +// route in app.ts stays thin and this stays unit-testable. + +import type { Response } from 'express'; +import type { LogEntry } from '@highfive/contracts'; +import { subscribeEntries } from './logRing'; + +const KEEPALIVE_MS = 25_000; + +/** + * Write the SSE response headers. `X-Accel-Buffering: no` + `no-transform` stop + * nginx/proxies from buffering the stream (the host-nginx config also sets + * `proxy_buffering off` — this header is the safety net). See ADR-022. + */ +export function writeSseHeaders(res: Response): void { + res.writeHead(200, { + 'Content-Type': 'text/event-stream', + 'Cache-Control': 'no-cache, no-transform', + Connection: 'keep-alive', + 'X-Accel-Buffering': 'no', + }); + res.flushHeaders?.(); +} + +/** + * Stream the backend's own ring to `res` as SSE until the client disconnects. + * Subscribes for live entries and sends a periodic keepalive comment. Returns a + * cleanup function (also wired to `req`'s close); calling it is idempotent. + */ +export function streamBackendRing(res: Response): () => void { + const unsubscribe = subscribeEntries((entry: LogEntry) => { + res.write(`data: ${JSON.stringify(entry)}\n\n`); + }); + const keepalive = setInterval(() => { + res.write(': ping\n\n'); + }, KEEPALIVE_MS); + let done = false; + return () => { + if (done) return; + done = true; + clearInterval(keepalive); + unsubscribe(); + res.end(); + }; +} diff --git a/backend/tests/logRing.test.ts b/backend/tests/logRing.test.ts index b2a35a4..59f78f1 100644 --- a/backend/tests/logRing.test.ts +++ b/backend/tests/logRing.test.ts @@ -3,6 +3,7 @@ import { installLogRing, getRecentEntries, pushEntry, + subscribeEntries, __resetLogRingForTest, } from '../src/logRing'; @@ -75,4 +76,19 @@ describe('logRing (#171/#178)', () => { expect(all).toContain(`hf-cap ${N - 1}`); // newest kept expect(all).not.toContain('hf-cap 0'); // oldest evicted }); + + it('subscribeEntries fires for every new entry and stops after unsubscribe', () => { + // Filter to our own markers — the tee broadcasts ALL stdout, including any + // stray test-runner output, so strict equality must ignore that noise. + const seen: string[] = []; + const unsub = subscribeEntries((e) => { + if (e.msg.startsWith('hf-sub-')) seen.push(e.msg); + }); + pushEntry({ ts: '2026-06-18T00:00:00.000Z', level: 'info', msg: 'hf-sub-1' }); + process.stdout.write('hf-sub-2\n'); // tee path also broadcasts + expect(seen).toEqual(['hf-sub-1', 'hf-sub-2']); + unsub(); + pushEntry({ ts: '2026-06-18T00:00:01.000Z', level: 'info', msg: 'hf-sub-3' }); + expect(seen).toEqual(['hf-sub-1', 'hf-sub-2']); // no further deliveries + }); }); diff --git a/backend/tests/logStream.test.ts b/backend/tests/logStream.test.ts new file mode 100644 index 0000000..d52a0d6 --- /dev/null +++ b/backend/tests/logStream.test.ts @@ -0,0 +1,130 @@ +import { describe, it, expect, beforeEach, afterEach, vi } from 'vitest'; +import http from 'node:http'; +import request from 'supertest'; + +// Mock db so importing the app doesn't trigger real fetches. +vi.mock('../src/database', () => ({ + db: { listModules: vi.fn().mockResolvedValue([]), getModuleDetail: vi.fn().mockResolvedValue(null) }, +})); + +import { app } from '../src/app'; +import { log } from '../src/log'; +import { __resetLogRingForTest } from '../src/logRing'; + +const KEY = 'hf_dev_key_2026'; + +beforeEach(() => { + __resetLogRingForTest(); +}); + +afterEach(() => { + vi.unstubAllGlobals(); + vi.clearAllMocks(); +}); + +describe('GET /api/admin/logs/stream (#178 Phase 4)', () => { + it('returns 401 without an admin credential', async () => { + const res = await request(app).get('/api/admin/logs/stream?service=backend'); + expect(res.status).toBe(401); + }); + + it('returns 400 on a non-allow-listed service', async () => { + const res = await request(app).get('/api/admin/logs/stream?service=nginx').set('X-Admin-Key', KEY); + expect(res.status).toBe(400); + }); + + it('streams the backend ring as SSE: a new log entry arrives as a data event', async () => { + const server = http.createServer(app); + await new Promise((resolve) => server.listen(0, resolve)); + const { port } = server.address() as { port: number }; + + const received = await new Promise((resolve, reject) => { + const req = http.get( + { port, path: '/api/admin/logs/stream?service=backend', headers: { 'X-Admin-Key': KEY } }, + (res) => { + expect(res.statusCode).toBe(200); + expect(res.headers['content-type']).toMatch(/text\/event-stream/); + expect(res.headers['x-accel-buffering']).toBe('no'); + let buf = ''; + res.on('data', (chunk) => { + buf += chunk.toString(); + const line = buf.split('\n').find((l) => l.startsWith('data: ')); + if (line) { + req.destroy(); + resolve(line.slice('data: '.length)); + } + }); + // Emit an entry once we're connected. + setImmediate(() => log.info('sse-backend-entry')); + }, + ); + req.on('error', (e) => { + // destroy() triggers ECONNRESET — ignore once we already resolved. + if (!/aborted|ECONNRESET|socket hang up/i.test(String(e))) reject(e); + }); + setTimeout(() => reject(new Error('timed out waiting for SSE data')), 4000); + }); + + const entry = JSON.parse(received); + expect(entry.msg).toBe('sse-backend-entry'); + expect(entry.level).toBe('info'); + expect(entry.ts).toMatch(/^\d{4}-\d{2}-\d{2}T.*Z$/); + await new Promise((resolve) => server.close(() => resolve())); + }); + + it('pipes a Flask service SSE stream through to the client', async () => { + vi.stubGlobal( + 'fetch', + vi.fn().mockResolvedValue({ + ok: true, + status: 200, + body: new ReadableStream({ + start(c) { + c.enqueue( + new TextEncoder().encode( + 'data: {"ts":"2026-06-18T00:00:00.000Z","level":"warn","msg":"db-stream-entry"}\n\n', + ), + ); + c.close(); + }, + }), + }), + ); + + const server = http.createServer(app); + await new Promise((resolve) => server.listen(0, resolve)); + const { port } = server.address() as { port: number }; + + const body = await new Promise((resolve, reject) => { + http.get( + { + port, + path: '/api/admin/logs/stream?service=duckdb-service', + headers: { 'X-Admin-Key': KEY }, + }, + (res) => { + expect(res.statusCode).toBe(200); + let buf = ''; + res.on('data', (c) => (buf += c.toString())); + res.on('end', () => resolve(buf)); + }, + ).on('error', reject); + setTimeout(() => reject(new Error('timed out')), 4000); + }); + + expect(body).toContain('db-stream-entry'); + const fetchArg = (globalThis.fetch as unknown as ReturnType).mock.calls[0]; + expect(String(fetchArg[0])).toMatch(/\/logs\/stream$/); + expect((fetchArg[1] as { headers: Record }).headers['X-Admin-Key']).toBe(KEY); + await new Promise((resolve) => server.close(() => resolve())); + }); + + it('returns 502 when the upstream Flask stream cannot be opened', async () => { + vi.stubGlobal('fetch', vi.fn().mockRejectedValue(new Error('ECONNREFUSED'))); + const res = await request(app) + .get('/api/admin/logs/stream?service=image-service') + .set('X-Admin-Key', KEY); + expect(res.status).toBe(502); + expect(res.body.error).toMatch(/unreachable/i); + }); +}); diff --git a/docs/07-deployment-view/production-deployment.md b/docs/07-deployment-view/production-deployment.md index d422e3d..78aab8c 100644 --- a/docs/07-deployment-view/production-deployment.md +++ b/docs/07-deployment-view/production-deployment.md @@ -333,6 +333,22 @@ server { add_header Strict-Transport-Security "max-age=31536000; includeSubDomains" always; + # SSE live tail of the admin Server Logs (#178 / ADR-022). Must come BEFORE + # the catch-all `location /` so nginx routes it here. Buffering off so events + # reach the browser as they happen, not in one chunk at disconnect. The + # backend also sets `X-Accel-Buffering: no` as a safety net, but pin it here. + location /api/admin/logs/stream { + proxy_pass http://127.0.0.1:3001/api/admin/logs/stream; + proxy_http_version 1.1; + proxy_set_header Host $host; + proxy_set_header X-Real-IP $remote_addr; + proxy_set_header X-Forwarded-For $proxy_add_x_forwarded_for; + proxy_set_header X-Forwarded-Proto $scheme; + proxy_buffering off; + proxy_read_timeout 1h; # long-lived stream + proxy_set_header X-Accel-Buffering no; + } + location / { proxy_pass http://127.0.0.1:3001/; proxy_http_version 1.1; diff --git a/docs/08-crosscutting-concepts/api-contracts.md b/docs/08-crosscutting-concepts/api-contracts.md index c0e1d99..bb132a5 100644 --- a/docs/08-crosscutting-concepts/api-contracts.md +++ b/docs/08-crosscutting-concepts/api-contracts.md @@ -362,7 +362,13 @@ each entry `ts`/`level`/`msg`), so the backend proxies the JSON through verbatim and only the `backend` branch constructs it locally. The proxy branch rejects a drifted envelope (no `entries` array) with `502` rather than letting `undefined` fields reach the UI. `nginx` is intentionally not a `ServerLogService` (no app -ring). Design + caveats: [ADR-021](../09-architecture-decisions/adr-021-admin-server-log-ring.md). +ring). Design + caveats: [ADR-021](../09-architecture-decisions/adr-021-admin-server-log-ring.md) +/ [ADR-022](../09-architecture-decisions/adr-022-persistent-structured-server-logs.md). + +The live tail `GET /api/admin/logs/stream` (SSE) reuses the **same** `LogEntry` +shape: each `data:` event payload is one `LogEntry` JSON. No separate wire type — +the REST array and the SSE event are the same element, so the panel appends stream +entries to the backfilled list without a transform. ## `ImageUploadsPage` — admin gallery pagination diff --git a/docs/api-reference.md b/docs/api-reference.md index 960b2a5..0818fb7 100644 --- a/docs/api-reference.md +++ b/docs/api-reference.md @@ -284,6 +284,32 @@ unreachable or returns a drifted envelope. Design + caveats: when the ring held more than were returned. The TypeScript contract is `LogEntry` / `ServerLogsResponse` in [`contracts/src/index.ts`](../contracts/src/index.ts). +### Live tail (SSE) + +``` +GET /api/admin/logs/stream?service=backend|duckdb-service|image-service +Headers: Cookie: hf_admin_session=… # or X-Admin-Key: +Accept: text/event-stream +``` + +Server-Sent Events live tail (#178 / ADR-022). After the REST `GET /api/admin/logs` +backfill, the panel opens this for "tail -f": each new log entry arrives as one +`data:` event whose payload is a single `LogEntry` JSON (`{ ts, level, msg }`); +`: ping` comments keep the connection alive. `service` validation, the admin gate, +and the cross-service `X-Admin-Key` proxy match the REST endpoint (`backend` +streams its own ring; the Flask services are piped from their internal +`/logs/stream`). The response sets `X-Accel-Buffering: no`; the host-nginx vhost +must also set `proxy_buffering off` for this location (see +[production-deployment.md](07-deployment-view/production-deployment.md)). + +``` +: connected + +data: {"ts":"2026-06-18T20:42:56.004Z","level":"info","msg":"POST /heartbeat 200 3ms"} + +: ping +``` + ## 1.5 User location hint (dashboard map) ``` diff --git a/duckdb-service/routes/logs.py b/duckdb-service/routes/logs.py index 21eea8c..510cb7d 100644 --- a/duckdb-service/routes/logs.py +++ b/duckdb-service/routes/logs.py @@ -8,11 +8,13 @@ """ import hmac +import json import os +from queue import Empty -from flask import Blueprint, jsonify, request +from flask import Blueprint, Response, jsonify, request, stream_with_context -from services.log_ring import get_recent +from services.log_ring import get_recent, subscribe, unsubscribe logs_bp = Blueprint("logs", __name__) @@ -45,3 +47,33 @@ def get_logs(): return jsonify( {"service": SERVICE_NAME, "entries": entries, "truncated": truncated} ), 200 + + +@logs_bp.get("/logs/stream") +def stream_logs(): + """SSE live tail (#178 Phase 4). One LogEntry JSON per `data:` event. The + backend's GET /api/admin/logs/stream?service=duckdb-service pipes this + through, forwarding X-Admin-Key. REST /logs stays for the initial backfill.""" + provided = request.headers.get("X-Admin-Key", "") + if not hmac.compare_digest(provided, _resolve_key()): + return jsonify({"error": "unauthorized"}), 401 + + def gen(): + q = subscribe() + try: + yield ": connected\n\n" # flush headers immediately + while True: + try: + entry = q.get(timeout=25) + except Empty: + yield ": ping\n\n" # keepalive + continue + yield f"data: {json.dumps(entry, ensure_ascii=False)}\n\n" + finally: + unsubscribe(q) + + return Response( + stream_with_context(gen()), + mimetype="text/event-stream", + headers={"Cache-Control": "no-cache", "X-Accel-Buffering": "no"}, + ) diff --git a/duckdb-service/services/log_ring.py b/duckdb-service/services/log_ring.py index 9d3153c..98560be 100644 --- a/duckdb-service/services/log_ring.py +++ b/duckdb-service/services/log_ring.py @@ -26,6 +26,7 @@ import json import logging import os +import queue import sys import threading from collections import deque @@ -50,6 +51,12 @@ _MAX_TOTAL_BYTES = 100 * 1024 * 1024 _disk_logger: "logging.Logger | None" = None +# Live SSE subscribers (#178 Phase 4). Each /logs/stream connection registers a +# bounded queue; _push fans every entry out to all of them (drop on full, never +# block the logger). Guarded by _lock alongside the ring. +_SUBSCRIBER_MAXSIZE = 1000 +_subscribers: "set[queue.Queue]" = set() + def _now_iso() -> str: # Millisecond precision, 'Z' suffix — matches the LogEntry contract. @@ -60,12 +67,33 @@ def _push(level: str, msg: str) -> None: entry = {"ts": _now_iso(), "level": level, "msg": msg} with _lock: _ring.append(entry) + subs = list(_subscribers) if _disk_logger is not None: try: _disk_logger.info(json.dumps(entry, ensure_ascii=False)) except Exception: # Persistence must never break in-memory logging. pass + for q in subs: + try: + q.put_nowait(entry) + except queue.Full: + # Slow/stuck subscriber — drop rather than block the logger. + pass + + +def subscribe() -> "queue.Queue": + """Register a live-tail subscriber (SSE). Returns a bounded queue the caller + drains; pair with unsubscribe() in a finally so a disconnect cleans up.""" + q: queue.Queue = queue.Queue(maxsize=_SUBSCRIBER_MAXSIZE) + with _lock: + _subscribers.add(q) + return q + + +def unsubscribe(q: "queue.Queue") -> None: + with _lock: + _subscribers.discard(q) def _prune_by_size(log_path: str) -> None: @@ -222,6 +250,7 @@ def _reset_for_test(): global _disk_logger with _lock: _ring.clear() + _subscribers.clear() # Also clear any installed tee's partial-line carry so a half-line from a # previous test can't bleed into the next. for tee in _tees: diff --git a/duckdb-service/tests/test_logs.py b/duckdb-service/tests/test_logs.py index e055b48..6e6d77c 100644 --- a/duckdb-service/tests/test_logs.py +++ b/duckdb-service/tests/test_logs.py @@ -3,6 +3,9 @@ import io import json import sys +from queue import Empty + +import pytest from services import log_ring @@ -177,3 +180,43 @@ def test_persistence_is_noop_without_dir(tmp_path): log_ring._push("info", "in-memory only") assert "in-memory only" in _msgs(log_ring.get_recent(10)[0]) assert list(tmp_path.iterdir()) == [] + + +# --- SSE live tail (#178 Phase 4) --- + + +def test_subscribe_receives_push_then_unsubscribe_stops(): + log_ring._reset_for_test() + q = log_ring.subscribe() + log_ring._push("warn", "sub-entry") + got = q.get_nowait() + assert got["msg"] == "sub-entry" and got["level"] == "warn" + log_ring.unsubscribe(q) + log_ring._push("info", "after-unsub") + with pytest.raises(Empty): + q.get_nowait() + + +def test_stream_requires_admin_key(client): + assert client.get("/logs/stream").status_code == 401 + assert client.get("/logs/stream", headers={"X-Admin-Key": "wrong"}).status_code == 401 + + +def test_stream_emits_pushed_entry_as_sse(client): + log_ring._reset_for_test() + resp = client.get("/logs/stream", headers={"X-Admin-Key": VALID_KEY}) + assert resp.status_code == 200 + assert resp.mimetype == "text/event-stream" + assert resp.headers.get("X-Accel-Buffering") == "no" + + it = iter(resp.response) + first = next(it) # ": connected" — also registers the subscriber + assert b"connected" in (first if isinstance(first, bytes) else first.encode()) + + log_ring._push("error", "stream-entry") + chunk = next(it) + text = chunk.decode() if isinstance(chunk, bytes) else chunk + assert text.startswith("data: ") + payload = json.loads(text[len("data: ") :].strip()) + assert payload["msg"] == "stream-entry" and payload["level"] == "error" + it.close() diff --git a/homepage/src/__tests__/AdminPage.test.tsx b/homepage/src/__tests__/AdminPage.test.tsx index d48d4a3..83d52c2 100644 --- a/homepage/src/__tests__/AdminPage.test.tsx +++ b/homepage/src/__tests__/AdminPage.test.tsx @@ -19,6 +19,14 @@ const mockApi = vi.hoisted(() => ({ getImageUrl: vi.fn((f: string) => `/api/images/${f}`), deleteModule: vi.fn(), deleteImage: vi.fn(), + // ServerLogsPanel (#178) backfills then opens an SSE stream; stub both so the + // embedded panel neither errors nor leaves an unhandled rejection. + getServerLogs: vi.fn().mockResolvedValue({ service: 'backend', entries: [], truncated: false }), + streamServerLogs: vi.fn(() => ({ + addEventListener: vi.fn(), + removeEventListener: vi.fn(), + close: vi.fn(), + })), })); vi.mock('../services/api', () => ({ api: mockApi })); diff --git a/homepage/src/__tests__/ServerLogsPanel.test.tsx b/homepage/src/__tests__/ServerLogsPanel.test.tsx index 783b0c3..37708ee 100644 --- a/homepage/src/__tests__/ServerLogsPanel.test.tsx +++ b/homepage/src/__tests__/ServerLogsPanel.test.tsx @@ -1,13 +1,34 @@ import { describe, it, expect, vi, beforeEach, afterEach } from 'vitest'; -import { render, screen, waitFor, fireEvent } from '@testing-library/react'; -import type { ServerLogsResponse } from '@highfive/contracts'; +import { render, screen, waitFor, fireEvent, act } from '@testing-library/react'; +import type { LogEntry, ServerLogsResponse } from '@highfive/contracts'; import ServerLogsPanel from '../components/ServerLogsPanel'; import { api } from '../services/api'; // Pins the #171/#178 admin server-logs view against the real ServerLogsResponse // wire shape (CLAUDE.md rule 3 — realistic fixture, not a guessed object). -// api.getServerLogs is the single boundary, spied so no real fetch fires. +// api.getServerLogs (REST backfill) and api.streamServerLogs (SSE live tail) are +// the two boundaries, spied so no real fetch / EventSource fires. + +// A controllable fake EventSource: records listeners so a test can emit events. +interface FakeES { + addEventListener: ReturnType; + removeEventListener: ReturnType; + close: ReturnType; + emit: (type: string, e: unknown) => void; +} +function makeFakeES(): FakeES { + const listeners: Record void)[]> = {}; + return { + addEventListener: vi.fn((type: string, cb: (e: unknown) => void) => { + (listeners[type] ??= []).push(cb); + }), + removeEventListener: vi.fn(), + close: vi.fn(), + emit: (type, e) => (listeners[type] ?? []).forEach((cb) => cb(e)), + }; +} +let createdStreams: FakeES[] = []; const backendLogs: ServerLogsResponse = { service: 'backend', @@ -32,7 +53,13 @@ const duckdbLogs: ServerLogsResponse = { }; beforeEach(() => { + createdStreams = []; vi.spyOn(api, 'getServerLogs').mockResolvedValue(backendLogs); + vi.spyOn(api, 'streamServerLogs').mockImplementation(() => { + const f = makeFakeES(); + createdStreams.push(f); + return f as unknown as EventSource; + }); }); afterEach(() => { @@ -78,8 +105,8 @@ describe('ServerLogsPanel (#171/#178)', () => { await waitFor(() => expect(api.getServerLogs).toHaveBeenCalledWith('duckdb-service', 200)); const out = await screen.findByTestId('server-logs-output'); expect(out.textContent).toContain('[heartbeat] mac=aabbccddeeff'); - // truncated → the "showing most recent N" hint renders. - expect(screen.getByText(/Showing the most recent/)).toBeTruthy(); + // truncated → the backfill hint renders. + expect(screen.getByText(/Backfilled the most recent/)).toBeTruthy(); }); it('shows an error message when the fetch fails', async () => { @@ -87,4 +114,45 @@ describe('ServerLogsPanel (#171/#178)', () => { render(); expect(await screen.findByText(/Failed to fetch backend logs/)).toBeTruthy(); }); + + it('appends entries that arrive over the SSE stream', async () => { + render(); + await screen.findByText(/HighFive Backend API listening/); + // The stream opens after the REST backfill resolves. + await waitFor(() => expect(createdStreams).toHaveLength(1)); + + const liveEntry: LogEntry = { + ts: '2026-06-18T21:00:00.000Z', + level: 'warn', + msg: 'POST /api/admin/login 401 2ms', + }; + act(() => createdStreams[0].emit('message', { data: JSON.stringify(liveEntry) })); + + const row = await screen.findByText('POST /api/admin/login 401 2ms'); + expect(row).toBeTruthy(); + // It rendered as a warn-level entry (3 rows now: 2 backfill + 1 live). + expect(screen.getAllByTestId('server-log-entry')).toHaveLength(3); + }); + + it('closes the stream on unmount', async () => { + const { unmount } = render(); + await waitFor(() => expect(createdStreams).toHaveLength(1)); + unmount(); + expect(createdStreams[0].close).toHaveBeenCalled(); + }); + + it('closes the old stream and opens a new one when the service changes', async () => { + (api.getServerLogs as ReturnType).mockImplementation(async (service: string) => + service === 'duckdb-service' ? duckdbLogs : backendLogs, + ); + render(); + await waitFor(() => expect(createdStreams).toHaveLength(1)); + + fireEvent.change(screen.getByTestId('log-service-select'), { + target: { value: 'duckdb-service' }, + }); + + await waitFor(() => expect(createdStreams).toHaveLength(2)); + expect(createdStreams[0].close).toHaveBeenCalled(); + }); }); diff --git a/homepage/src/components/ServerLogsPanel.tsx b/homepage/src/components/ServerLogsPanel.tsx index 603e153..af6cd74 100644 --- a/homepage/src/components/ServerLogsPanel.tsx +++ b/homepage/src/components/ServerLogsPanel.tsx @@ -1,14 +1,16 @@ -import { useState, useEffect, useCallback } from 'react'; +import { useState, useEffect, useCallback, useRef } from 'react'; import { api } from '../services/api'; import type { LogEntry, LogLevel, ServerLogService } from '../services/api'; // Admin-only server-log viewer (#171, #178). Tails a service's own recent -// structured log entries via GET /api/admin/logs (the app-level ring in each -// service). English-only, like the rest of AdminPage (operator-facing, no i18n -// consumer in this page — see AdminPage's existing note). See ADR-021/ADR-022. +// structured log entries: a REST backfill (GET /api/admin/logs) followed by an +// SSE live tail (GET /api/admin/logs/stream). English-only, like the rest of +// AdminPage (operator-facing). See ADR-021/ADR-022. const SERVICES: ServerLogService[] = ['backend', 'duckdb-service', 'image-service']; const DEFAULT_LINES = 200; const MAX_LINES = 1000; +// Cap the in-memory live list so a long-lived stream can't grow unbounded. +const MAX_LIVE_ENTRIES = 2000; // Tailwind classes per level. Info is muted; warn amber; error red — matches // the access-log middleware's status→level mapping. @@ -25,6 +27,12 @@ export default function ServerLogsPanel() { const [truncated, setTruncated] = useState(false); const [loading, setLoading] = useState(false); const [error, setError] = useState(null); + const [live, setLive] = useState(false); + // Follow mode: auto-scroll to the newest entry until the user scrolls up. + const [follow, setFollow] = useState(true); + + const scrollRef = useRef(null); + const esRef = useRef(null); const load = useCallback(async () => { setLoading(true); @@ -45,15 +53,83 @@ export default function ServerLogsPanel() { } }, [service, lines]); - // Reload whenever the selected service changes (and on mount). + // On mount and whenever the service changes: close any prior stream, REST + // backfill, then open the SSE live tail and append entries as they arrive. useEffect(() => { - load(); + let cancelled = false; + esRef.current?.close(); + esRef.current = null; + setLive(false); + setFollow(true); + + load().then(() => { + if (cancelled) return; + const es = api.streamServerLogs(service); + es.addEventListener('open', () => setLive(true)); + es.addEventListener('error', () => setLive(false)); + es.addEventListener('message', (e: MessageEvent) => { + try { + const entry = JSON.parse(e.data) as LogEntry; + setEntries((prev) => { + const next = [...prev, entry]; + return next.length > MAX_LIVE_ENTRIES + ? next.slice(next.length - MAX_LIVE_ENTRIES) + : next; + }); + } catch { + // Ignore a malformed SSE payload rather than break the stream. + } + }); + esRef.current = es; + }); + + return () => { + cancelled = true; + esRef.current?.close(); + esRef.current = null; + }; }, [service]); // eslint-disable-line react-hooks/exhaustive-deps + // Auto-scroll to the bottom on new entries while following. + useEffect(() => { + if (follow && scrollRef.current) { + scrollRef.current.scrollTop = scrollRef.current.scrollHeight; + } + }, [entries, follow]); + + // Pause follow when the user scrolls up; resume when they reach the bottom. + const onScroll = () => { + const el = scrollRef.current; + if (!el) return; + const nearBottom = el.scrollHeight - el.scrollTop - el.clientHeight < 40; + setFollow(nearBottom); + }; + + const jumpToLatest = () => { + setFollow(true); + if (scrollRef.current) scrollRef.current.scrollTop = scrollRef.current.scrollHeight; + }; + return (
-

Server Logs

+
+

Server Logs

+ + + {live ? 'Live' : 'Connecting…'} + +
-
+
{error &&

{error}

} {truncated && !error && (

- Showing the most recent {entries.length} entries. + Backfilled the most recent {entries.length} entries; new entries stream in live.

)} {!error && entries.length === 0 && !loading ? (

No log entries captured yet.

) : (
{entries.map((entry, i) => { const style = LEVEL_STYLES[entry.level] ?? LEVEL_STYLES.info; @@ -127,6 +205,15 @@ export default function ServerLogsPanel() { })}
)} + {!follow && entries.length > 0 && ( + + )}
); diff --git a/homepage/src/services/api.ts b/homepage/src/services/api.ts index f678fee..cbf6f3e 100644 --- a/homepage/src/services/api.ts +++ b/homepage/src/services/api.ts @@ -227,6 +227,18 @@ class ApiService { return response.json(); } + /** + * Admin-only: open an SSE live-tail of a service's structured log entries + * (#178 / ADR-022). Maps to `GET /api/admin/logs/stream?service=…`; the admin + * session cookie rides along via `withCredentials`. Pair with the REST + * `getServerLogs` backfill — the caller appends each `message` `LogEntry` and + * MUST `close()` the source on unmount / service change. + */ + streamServerLogs(service: ServerLogService): EventSource { + const url = `${this.baseUrl}/admin/logs/stream?service=${encodeURIComponent(service)}`; + return new EventSource(url, { withCredentials: true }); + } + /** * Bucketed image-upload activity for the dashboard weather chart. * Maps to `GET /api/modules/:id/activity` on the backend, which in diff --git a/homepage/src/test-setup.ts b/homepage/src/test-setup.ts index ac889fc..465378e 100644 --- a/homepage/src/test-setup.ts +++ b/homepage/src/test-setup.ts @@ -88,3 +88,24 @@ if (!globalThis.fetch || !(globalThis.fetch as { _isStub?: boolean })._isStub) { (stubFetch as unknown as { _isStub: boolean })._isStub = true; globalThis.fetch = stubFetch; } + +// jsdom doesn't implement EventSource (#178 SSE live tail) — provide a minimal +// class stub so SSE code can construct one without throwing. Component tests +// typically mock `api.streamServerLogs` to inject a controllable fake instead. +if (typeof globalThis.EventSource === 'undefined') { + class MockEventSource { + url: string; + withCredentials: boolean; + onmessage: ((e: MessageEvent) => void) | null = null; + onerror: ((e: Event) => void) | null = null; + onopen: ((e: Event) => void) | null = null; + constructor(url: string, init?: { withCredentials?: boolean }) { + this.url = url; + this.withCredentials = init?.withCredentials ?? false; + } + addEventListener = vi.fn(); + removeEventListener = vi.fn(); + close = vi.fn(); + } + globalThis.EventSource = MockEventSource as unknown as typeof EventSource; +} diff --git a/image-service/app.py b/image-service/app.py index 80b93d9..7f5093f 100644 --- a/image-service/app.py +++ b/image-service/app.py @@ -4,9 +4,18 @@ import os import random import time +from queue import Empty import requests as http_requests -from flask import Flask, g, jsonify, request, send_from_directory +from flask import ( + Flask, + Response, + g, + jsonify, + request, + send_from_directory, + stream_with_context, +) from pydantic import ValidationError from services.discord import send_discord_message @@ -14,7 +23,7 @@ from services.log_ring import get_recent as _get_recent_logs from services.log_ring import init_persistence as init_log_persistence from services.log_ring import install as install_log_ring -from services.log_ring import log_event +from services.log_ring import log_event, subscribe, unsubscribe from services.module_id import ModuleId from services.sidecar import LogSidecarEnvelope from services.upload_pipeline import UploadPipeline, UploadRequest @@ -157,6 +166,36 @@ def get_logs(): ), 200 +@app.get("/logs/stream") +def stream_logs(): + """SSE live tail (#178 Phase 4). One LogEntry JSON per `data:` event. The + backend's GET /api/admin/logs/stream?service=image-service pipes this + through, forwarding X-Admin-Key. REST /logs stays for the initial backfill.""" + provided = request.headers.get("X-Admin-Key", "") + if not hmac.compare_digest(provided, _logs_resolve_key()): + return jsonify({"error": "unauthorized"}), 401 + + def gen(): + q = subscribe() + try: + yield ": connected\n\n" # flush headers immediately + while True: + try: + entry = q.get(timeout=25) + except Empty: + yield ": ping\n\n" # keepalive + continue + yield f"data: {json.dumps(entry, ensure_ascii=False)}\n\n" + finally: + unsubscribe(q) + + return Response( + stream_with_context(gen()), + mimetype="text/event-stream", + headers={"Cache-Control": "no-cache", "X-Accel-Buffering": "no"}, + ) + + @app.post("/upload") def upload_image(): mac = request.form.get("mac") or request.args.get("mac") diff --git a/image-service/services/log_ring.py b/image-service/services/log_ring.py index 9d3153c..98560be 100644 --- a/image-service/services/log_ring.py +++ b/image-service/services/log_ring.py @@ -26,6 +26,7 @@ import json import logging import os +import queue import sys import threading from collections import deque @@ -50,6 +51,12 @@ _MAX_TOTAL_BYTES = 100 * 1024 * 1024 _disk_logger: "logging.Logger | None" = None +# Live SSE subscribers (#178 Phase 4). Each /logs/stream connection registers a +# bounded queue; _push fans every entry out to all of them (drop on full, never +# block the logger). Guarded by _lock alongside the ring. +_SUBSCRIBER_MAXSIZE = 1000 +_subscribers: "set[queue.Queue]" = set() + def _now_iso() -> str: # Millisecond precision, 'Z' suffix — matches the LogEntry contract. @@ -60,12 +67,33 @@ def _push(level: str, msg: str) -> None: entry = {"ts": _now_iso(), "level": level, "msg": msg} with _lock: _ring.append(entry) + subs = list(_subscribers) if _disk_logger is not None: try: _disk_logger.info(json.dumps(entry, ensure_ascii=False)) except Exception: # Persistence must never break in-memory logging. pass + for q in subs: + try: + q.put_nowait(entry) + except queue.Full: + # Slow/stuck subscriber — drop rather than block the logger. + pass + + +def subscribe() -> "queue.Queue": + """Register a live-tail subscriber (SSE). Returns a bounded queue the caller + drains; pair with unsubscribe() in a finally so a disconnect cleans up.""" + q: queue.Queue = queue.Queue(maxsize=_SUBSCRIBER_MAXSIZE) + with _lock: + _subscribers.add(q) + return q + + +def unsubscribe(q: "queue.Queue") -> None: + with _lock: + _subscribers.discard(q) def _prune_by_size(log_path: str) -> None: @@ -222,6 +250,7 @@ def _reset_for_test(): global _disk_logger with _lock: _ring.clear() + _subscribers.clear() # Also clear any installed tee's partial-line carry so a half-line from a # previous test can't bleed into the next. for tee in _tees: diff --git a/image-service/tests/test_logs.py b/image-service/tests/test_logs.py index 8eab0dd..554a68d 100644 --- a/image-service/tests/test_logs.py +++ b/image-service/tests/test_logs.py @@ -3,6 +3,9 @@ import io import json import sys +from queue import Empty + +import pytest from services import log_ring @@ -175,3 +178,43 @@ def test_persistence_is_noop_without_dir(tmp_path): log_ring._push("info", "in-memory only") assert "in-memory only" in _msgs(log_ring.get_recent(10)[0]) assert list(tmp_path.iterdir()) == [] + + +# --- SSE live tail (#178 Phase 4) --- + + +def test_subscribe_receives_push_then_unsubscribe_stops(): + log_ring._reset_for_test() + q = log_ring.subscribe() + log_ring._push("warn", "sub-entry") + got = q.get_nowait() + assert got["msg"] == "sub-entry" and got["level"] == "warn" + log_ring.unsubscribe(q) + log_ring._push("info", "after-unsub") + with pytest.raises(Empty): + q.get_nowait() + + +def test_stream_requires_admin_key(client): + assert client.get("/logs/stream").status_code == 401 + assert client.get("/logs/stream", headers={"X-Admin-Key": "wrong"}).status_code == 401 + + +def test_stream_emits_pushed_entry_as_sse(client): + log_ring._reset_for_test() + resp = client.get("/logs/stream", headers={"X-Admin-Key": VALID_KEY}) + assert resp.status_code == 200 + assert resp.mimetype == "text/event-stream" + assert resp.headers.get("X-Accel-Buffering") == "no" + + it = iter(resp.response) + first = next(it) # ": connected" — also registers the subscriber + assert b"connected" in (first if isinstance(first, bytes) else first.encode()) + + log_ring._push("error", "stream-entry") + chunk = next(it) + text = chunk.decode() if isinstance(chunk, bytes) else chunk + assert text.startswith("data: ") + payload = json.loads(text[len("data: ") :].strip()) + assert payload["msg"] == "stream-entry" and payload["level"] == "error" + it.close() From 2614120664e2d28699f5851edb035e4946e5a404 Mon Sep 17 00:00:00 2001 From: Claude Date: Fri, 19 Jun 2026 20:43:49 +0000 Subject: [PATCH 07/15] feat: search, level filter, and .log export for Server Logs panel (#178 phase 5) MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Makes the live console usable at volume — all client-side, no endpoint or contract changes. - Panel: a search box (case-insensitive substring over msg) + three level checkboxes drive a derived visibleEntries; live-append, follow-mode, and "Jump to latest" keep working off the full buffer. "Download .log" builds a client-side blob and downloads it — all loaded entries when unfiltered, the filtered view when a search/level filter is active (per the agreed scope); button disables when nothing would export. Adds a "No entries match the filter" state. - Component tests: search narrows, level toggle hides, no-match disables export, and export blob content for both scopes (Blob-constructor spy since jsdom Blob has no .text()). - Playwright (tests/ui/admin-server-logs.spec.ts): the end-to-end proof — wait for the live indicator, fire a uniquely-pathed backend request, assert its access-log entry STREAMS into the panel with no refresh, search-filter to it, and assert a real .log download with structured lines. UI stack is in-memory (no LOG_DIR) which is fine for a live assertion. - Docs: api-reference notes the panel's live indicator + search/filter/export. Gate: homepage 169 (0 errors), tsc + build clean, UI spec typechecks, check-citations clean; backend/Flask unchanged from phase 4 (197/201/77). Playwright runs under the existing `make test-ui` CI job. Co-Authored-By: Claude Opus 4.8 (1M context) Claude-Session: https://claude.ai/code/session_01HU7JHMbvtUKRfZY98qeWdK --- docs/api-reference.md | 5 ++ .../src/__tests__/ServerLogsPanel.test.tsx | 75 ++++++++++++++++++ homepage/src/components/ServerLogsPanel.tsx | 77 ++++++++++++++++++- tests/ui/tests/admin-server-logs.spec.ts | 42 ++++++++++ 4 files changed, 197 insertions(+), 2 deletions(-) diff --git a/docs/api-reference.md b/docs/api-reference.md index 0818fb7..57ce460 100644 --- a/docs/api-reference.md +++ b/docs/api-reference.md @@ -310,6 +310,11 @@ data: {"ts":"2026-06-18T20:42:56.004Z","level":"info","msg":"POST /heartbeat 200 : ping ``` +The admin **Server Logs** panel consumes both: a live indicator reflects the SSE +connection, follow-mode auto-scrolls (pausing when you scroll up), and the loaded +entries can be searched, filtered by level, and exported to a plain `.log` — +all client-side, no extra endpoints. + ## 1.5 User location hint (dashboard map) ``` diff --git a/homepage/src/__tests__/ServerLogsPanel.test.tsx b/homepage/src/__tests__/ServerLogsPanel.test.tsx index 37708ee..d4880e2 100644 --- a/homepage/src/__tests__/ServerLogsPanel.test.tsx +++ b/homepage/src/__tests__/ServerLogsPanel.test.tsx @@ -155,4 +155,79 @@ describe('ServerLogsPanel (#171/#178)', () => { await waitFor(() => expect(createdStreams).toHaveLength(2)); expect(createdStreams[0].close).toHaveBeenCalled(); }); + + it('filters entries by the search query', async () => { + render(); + await screen.findByText('GET /api/modules 200 4ms'); + expect(screen.getAllByTestId('server-log-entry')).toHaveLength(2); + + fireEvent.change(screen.getByTestId('logs-search'), { target: { value: 'modules' } }); + + const rows = screen.getAllByTestId('server-log-entry'); + expect(rows).toHaveLength(1); + expect(rows[0].textContent).toContain('GET /api/modules 200 4ms'); + }); + + it('hides a level when its checkbox is unchecked', async () => { + (api.getServerLogs as ReturnType).mockResolvedValue(duckdbLogs); + render(); + await screen.findByText('GET /health 500 12ms'); + expect(screen.getAllByTestId('server-log-entry')).toHaveLength(2); + + fireEvent.click(screen.getByTestId('logs-level-info')); + + const rows = screen.getAllByTestId('server-log-entry'); + expect(rows).toHaveLength(1); + expect(rows[0].getAttribute('data-level')).toBe('error'); + }); + + it('shows a no-match message and disables download when the filter matches nothing', async () => { + render(); + await screen.findByText('GET /api/modules 200 4ms'); + + fireEvent.change(screen.getByTestId('logs-search'), { target: { value: 'zzz-no-such-line' } }); + + expect(screen.getByTestId('logs-no-match')).toBeTruthy(); + expect(screen.queryByTestId('server-logs-output')).toBeNull(); + expect((screen.getByTestId('logs-download') as HTMLButtonElement).disabled).toBe(true); + }); + + describe('Download .log export', () => { + // jsdom's Blob has no .text(); capture the text via the Blob constructor. + let exported: string; + beforeEach(() => { + exported = ''; + vi.spyOn(globalThis, 'Blob').mockImplementation( + (parts?: BlobPart[]) => { + exported = (parts ?? []).join(''); + return {} as Blob; + }, + ); + (URL as unknown as { createObjectURL: unknown }).createObjectURL = vi.fn(() => 'blob:mock'); + (URL as unknown as { revokeObjectURL: unknown }).revokeObjectURL = vi.fn(); + }); + + it('exports all loaded entries as lines when unfiltered', async () => { + render(); + await screen.findByText('GET /api/modules 200 4ms'); + + fireEvent.click(screen.getByTestId('logs-download')); + + expect(exported).toContain( + '2026-06-18T20:42:55.000Z INFO 🐝 HighFive Backend API listening on port 3002 (all interfaces)', + ); + expect(exported).toContain('2026-06-18T20:42:56.000Z INFO GET /api/modules 200 4ms'); + }); + + it('exports only the filtered view when a search is active', async () => { + render(); + await screen.findByText('GET /api/modules 200 4ms'); + + fireEvent.change(screen.getByTestId('logs-search'), { target: { value: 'modules' } }); + fireEvent.click(screen.getByTestId('logs-download')); + + expect(exported).toContain('GET /api/modules 200 4ms'); + expect(exported).not.toContain('HighFive Backend API listening'); + }); + }); }); diff --git a/homepage/src/components/ServerLogsPanel.tsx b/homepage/src/components/ServerLogsPanel.tsx index af6cd74..e9b724f 100644 --- a/homepage/src/components/ServerLogsPanel.tsx +++ b/homepage/src/components/ServerLogsPanel.tsx @@ -30,10 +30,25 @@ export default function ServerLogsPanel() { const [live, setLive] = useState(false); // Follow mode: auto-scroll to the newest entry until the user scrolls up. const [follow, setFollow] = useState(true); + // Search + level filters (#178 Phase 5). Default: show everything. + const [query, setQuery] = useState(''); + const [levelsOn, setLevelsOn] = useState>({ + info: true, + warn: true, + error: true, + }); const scrollRef = useRef(null); const esRef = useRef(null); + // Entries actually rendered: the full buffer narrowed by the level toggles + // and a case-insensitive substring search over the message. + const needle = query.trim().toLowerCase(); + const filterActive = needle !== '' || !(levelsOn.info && levelsOn.warn && levelsOn.error); + const visibleEntries = entries.filter( + (e) => levelsOn[e.level] && (needle === '' || e.msg.toLowerCase().includes(needle)), + ); + const load = useCallback(async () => { setLoading(true); setError(null); @@ -110,6 +125,26 @@ export default function ServerLogsPanel() { if (scrollRef.current) scrollRef.current.scrollTop = scrollRef.current.scrollHeight; }; + const toggleLevel = (level: LogLevel) => + setLevelsOn((prev) => ({ ...prev, [level]: !prev[level] })); + + // Download the loaded entries as a plain `.log`. When a filter is active, the + // file matches what's on screen (the filtered view); unfiltered, it's the full + // loaded buffer. Built client-side — no endpoint. (#178 Phase 5) + const toExport = filterActive ? visibleEntries : entries; + const downloadLog = () => { + if (toExport.length === 0) return; + const text = `${toExport.map((e) => `${e.ts} ${e.level.toUpperCase()} ${e.msg}`).join('\n')}\n`; + const url = URL.createObjectURL(new Blob([text], { type: 'text/plain' })); + const a = document.createElement('a'); + a.href = url; + a.download = `${service}-logs.log`; + document.body.appendChild(a); + a.click(); + a.remove(); + URL.revokeObjectURL(url); + }; + return (
@@ -168,6 +203,40 @@ export default function ServerLogsPanel() {
+
+ setQuery(e.target.value)} + placeholder="Search messages…" + aria-label="Search log messages" + className="flex-1 min-w-[12rem] border border-gray-300 rounded-lg px-3 py-1.5 text-sm focus:ring-2 focus:ring-amber-500 focus:border-amber-500 outline-none" + /> +
+ {(['info', 'warn', 'error'] as LogLevel[]).map((lvl) => ( + + ))} +
+ +
+
{error &&

{error}

} {truncated && !error && ( @@ -177,6 +246,10 @@ export default function ServerLogsPanel() { )} {!error && entries.length === 0 && !loading ? (

No log entries captured yet.

+ ) : !error && visibleEntries.length === 0 ? ( +

+ No entries match the filter. +

) : (
- {entries.map((entry, i) => { + {visibleEntries.map((entry, i) => { const style = LEVEL_STYLES[entry.level] ?? LEVEL_STYLES.info; return (
)} - {!follow && entries.length > 0 && ( + {!follow && visibleEntries.length > 0 && (