Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
4 changes: 4 additions & 0 deletions .env.example
Original file line number Diff line number Diff line change
Expand Up @@ -42,6 +42,10 @@ AUTH_OIDC_CLIENT_SECRET=""
# /api/metrics is Prometheus text. Without a token it is only readable via
# localhost/127.0.0.1 requests; set this for remote scraping.
# METRICS_BEARER_TOKEN="replace-with-random-secret"
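# When APP_LOG_DIR is set, server-side structured logs are also appended as
# JSON Lines files in that directory (stdout/stderr logging is unaffected).
# Docker Compose bind-mounts the host directory APP_LOG_HOST_DIR at
# /var/log/life-ustc inside the container, so keep APP_LOG_DIR at that path
# when running under Compose.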
# APP_LOG_DIR="/var/log/life-ustc"
# APP_LOG_HOST_DIR="./logs/app-dev"

# Dev-only defaults
# UPLOAD_TOTAL_QUOTA_MB="1024"
Expand Down
1 change: 1 addition & 0 deletions .gitignore
Original file line number Diff line number Diff line change
Expand Up @@ -55,4 +55,5 @@ ENV/

.cache
/screenshots
/logs
/src/generated
2 changes: 2 additions & 0 deletions docker-compose.dev.yml
Original file line number Diff line number Diff line change
Expand Up @@ -79,6 +79,7 @@ services:
S3_BUCKET: *minio-dev-bucket
AWS_ENDPOINT_URL_S3: *minio-internal-endpoint
CHOKIDAR_USEPOLLING: "1"
APP_LOG_DIR: ${APP_LOG_DIR:-/var/log/life-ustc}
ports:
- "127.0.0.1:3000:3000"
healthcheck:
Expand All @@ -89,6 +90,7 @@ services:
start_period: 20s
volumes:
- ./:/usr/src/app:Z
- ${APP_LOG_HOST_DIR:-./logs/app-dev}:/var/log/life-ustc:Z
- bun-cache:/home/bun/.bun/install/cache
depends_on:
postgres:
Expand Down
3 changes: 3 additions & 0 deletions docker-compose.prod.yml
Original file line number Diff line number Diff line change
Expand Up @@ -16,6 +16,9 @@ services:
AWS_ACCESS_KEY_ID: ${AWS_ACCESS_KEY_ID}
AWS_SECRET_ACCESS_KEY: ${AWS_SECRET_ACCESS_KEY}
AWS_SESSION_TOKEN: ${AWS_SESSION_TOKEN:-}
APP_LOG_DIR: ${APP_LOG_DIR:-/var/log/life-ustc}
volumes:
- ${APP_LOG_HOST_DIR:-./logs/app}:/var/log/life-ustc
restart: unless-stopped
cap_drop:
- ALL
Expand Down
2 changes: 1 addition & 1 deletion docs/features/_audit.json
Original file line number Diff line number Diff line change
Expand Up @@ -79,5 +79,5 @@
"description": "Reserved for future admin description moderation actions."
}
},
"writer": "writeAuditLog in src/lib/audit/write-audit-log.ts writes a record. Calls are fire-and-forget (.catch(() => {})) so audit failures never block the main request."
"writer": "writeAuditLog in src/lib/audit/write-audit-log.ts writes a record and records bounded production-safe audit write metrics by action, status, and duration. Calls are fire-and-forget (.catch(() => {})) so audit failures never block the main request."
}
2 changes: 1 addition & 1 deletion docs/features/mcp.json
Original file line number Diff line number Diff line change
Expand Up @@ -15,7 +15,7 @@
"privacy-safe-summary": "Summary/default outputs may omit repeated low-value nested objects and redact token-bearing URLs or other sensitive strings; full mode is the escape hatch when exact raw values are required.",
"actionable-errors": "Validation and common not-found payloads prefer plain-language messages and may include a hint that points to the next useful tool or query to recover.",
"resource-bound-access-token": "MCP transport requests must present a resource-bound Bearer token for /api/mcp. JWT access tokens minted with the canonical public MCP resource URL as the resource indicator are accepted; opaque tokens minted without a resource indicator are rejected because the server cannot prove MCP audience binding from those token records. Refresh-token grants whose stored grant includes mcp:tools are normalized to that MCP resource URL when clients omit the resource parameter.",
"transport-observability": "MCP transport handling emits production-safe structured request/response logs with JSON-RPC method summaries, tool names, argument keys, auth/origin phase, status, duration, and registered tool count; it never logs bearer tokens, cookies, or tool argument values.",
"transport-observability": "MCP transport handling emits production-safe structured request/response logs and bounded metrics with JSON-RPC method summaries, tool names, argument keys, auth/origin phase, status, duration, registered tool count, and per-tool result status/duration; it never logs bearer tokens, cookies, or tool argument values.",
"flexible-date-inputs": "Date and datetime parameters on MCP tools accept ISO 8601 with timezone offset (2026-05-01T08:00:00+08:00), bare date strings (2026-05-01, treated as UTC midnight for @db.Date columns), or timezone-less datetimes (2026-05-01T08:00:00, interpreted as Asia/Shanghai). Invalid strings produce a descriptive error response rather than a validation rejection.",
"time-override": "Time-sensitive tools (get_my_7days_timeline, get_upcoming_deadlines, get_my_overview, get_next_buses) accept an optional atTime parameter to anchor their internal clock to a caller-supplied moment instead of the server clock, enabling reproducible queries and future-scenario planning."
},
Expand Down
6 changes: 6 additions & 0 deletions docs/features/oauth.json
Original file line number Diff line number Diff line change
Expand Up @@ -128,6 +128,7 @@
"notes": [
"Request body is application/x-www-form-urlencoded.",
"Supports CORS (Access-Control-Allow-Origin: *).",
"OPTIONS returns a 204 CORS preflight response.",
".well-known/openid-configuration already includes device_authorization_endpoint and urn:ietf:params:oauth:grant-type:device_code in grant_types_supported."
]
},
Expand All @@ -139,6 +140,11 @@
"The device polls this endpoint with grant_type=urn:ietf:params:oauth:grant-type:device_code and device_code to obtain an access token.",
"During polling, returns authorization_pending, slow_down, expired_token, access_denied, invalid_scope, or invalid_grant error codes based on state."
]
},
{
"path": "/api/auth/oauth2/token",
"method": "GET",
"returns": "{ error: \"invalid_request\" }"
}
]
},
Expand Down
12 changes: 11 additions & 1 deletion docs/features/openapi.json
Original file line number Diff line number Diff line change
Expand Up @@ -10,7 +10,8 @@
"aligned-with-web": "The goal of REST / OpenAPI is to expose capabilities aligned with the web, not to create an additional product layer.",
"no-developer-model": "Public documentation exists to help callers understand the existing product capability boundaries, not to introduce a separate developer-specific business model.",
"session-and-bearer": "Current user-side business REST endpoints support both in-site session from the request cookie and OAuth bearer tokens; bearer-token access is no longer limited to MCP.",
"rest-mcp-consistent": "The same abstract endpoint currently maintains the same core fields, state information, and action results in both REST and MCP; differences mainly appear in interaction form and serialization level."
"rest-mcp-consistent": "The same abstract endpoint currently maintains the same core fields, state information, and action results in both REST and MCP; differences mainly appear in interaction form and serialization level.",
"rest-observability": "REST API handling records production-safe structured request-start/request-finish logs and bounded in-memory metrics with request ID, method, status, auth mode, duration, and normalized route template; it does not log request bodies, credentials, raw query strings, or high-cardinality resource IDs."
},
"capabilities": {
"api-docs-page": {
Expand Down Expand Up @@ -77,13 +78,22 @@
"method": "POST",
"returns": "{ success: Boolean }",
"notes": ["Sets the user language preference cookie."]
},
{
"path": "/api/readiness",
"method": "GET",
"returns": "{ status, checks, uptimeSeconds }",
"notes": [
"Internal readiness endpoint for dependency checks; readable only from localhost or with the configured readiness/metrics bearer token."
]
}
]
},
"display": {
"fields": [
"Metadata endpoint",
"Locale endpoint",
"Readiness endpoint",
"OAuth/provider catch-all endpoints",
"Webhook endpoint via Better Auth catch-all"
]
Expand Down
1 change: 1 addition & 0 deletions docs/features/upload.json
Original file line number Diff line number Diff line change
Expand Up @@ -11,6 +11,7 @@
"permission-and-quota": "Permission and quota checks are required before upload.",
"three-step-upload": "Web comment attachment uploads follow a three-step flow — create upload session on-site → browser direct upload to S3-compatible pre-signed URL generated by AWS SDK → confirm completion on-site; page CSP must allow the signed storage origin actually used by the upload backend, including AWS S3 virtual-hosted / path-style addresses or compatible backend addresses via custom endpoint configuration.",
"s3-compatible-backend": "The storage backend operates with AWS SDK S3-compatible configuration; defaults to AWS S3 but allows connecting to compatible backends via custom endpoint.",
"storage-observability": "Shared S3 helpers record production-safe bounded metrics for storage operation status and duration; metrics identify operation type but do not include bucket names, object keys, signed URLs, filenames, or user IDs.",
"e2e-local-s3": "E2E tests use a local S3-compatible test endpoint by default; external storage is only used when an available S3 environment is explicitly provided.",
"download-auth-required": "Downloads must not bypass authorization by obtaining a direct link.",
"download-signed-url": "/api/uploads/[id]/download only returns a redirect to a short-lived signed download URL, not a stable, long-lived direct object link."
Expand Down
1 change: 1 addition & 0 deletions docs/index.md
Original file line number Diff line number Diff line change
Expand Up @@ -30,6 +30,7 @@
- [docs/features/openapi.json](features/openapi.json) - OpenAPI feature surface.
- [docs/features/mcp.json](features/mcp.json) - MCP feature surface.
- [docs/features/security.json](features/security.json) - security and permission expectations.
- [docs/observability.md](observability.md) - production logs, metrics, readiness, alerts, and dashboard guidance.

## Verification

Expand Down
109 changes: 109 additions & 0 deletions docs/observability.md
Original file line number Diff line number Diff line change
@@ -0,0 +1,109 @@
# Observability

Life@USTC exposes production-safe structured logs and bounded Prometheus-style
runtime metrics. Logs and metrics must not include bearer tokens, cookies, OAuth
codes, request bodies, raw query strings, upload object keys, signed URLs, or
high-cardinality resource IDs.

## Log Storage

Structured app logs are always emitted to the app process stdout/stderr. When
`APP_LOG_DIR` is configured, server-side logs are also appended as JSON Lines to
`APP_LOG_DIR/app-YYYY-MM-DD.log`.

Docker Compose sets `APP_LOG_DIR=/var/log/life-ustc` and mounts
`APP_LOG_HOST_DIR` there:

- production default: `./logs/app`
- Docker dev default: `./logs/app-dev`

Create the host directory before starting Compose and make it writable by the
container user. If Docker auto-creates the bind mount as root and the app cannot
write to it, stdout/stderr logging still works and the app emits one
`app.log_file_write_failed` message.

Caddy access logs remain separate from the app log file.

## Request Tracing

- Caddy access logs remain the edge source of truth for every HTTP request.
- Next.js REST routes propagate `x-request-id` and `x-request-start-ms`.
- REST route logs use normalized route templates such as `/api/todos/:id`.
- MCP transport logs include JSON-RPC method summaries, tool names, argument
keys, status, duration, and registered tool count.

## Metrics

Runtime metrics are exposed as Prometheus text at `/api/metrics`. Without a
token the endpoint is readable only from localhost; remote scrapers must present
`METRICS_BEARER_TOKEN` as a bearer token.

REST:

- `life_ustc_api_requests_started_total{method,route}`
- `life_ustc_api_requests_total{auth_mode,method,route,status}`
- `life_ustc_api_request_duration_ms_count{method,route}`
- `life_ustc_api_request_duration_ms_sum{method,route}`
- `life_ustc_api_errors_total{method,route,status}`

MCP:

- `life_ustc_mcp_http_requests_total{method,phase,status}`
- `life_ustc_mcp_http_request_duration_ms_count{method,phase}`
- `life_ustc_mcp_http_request_duration_ms_sum{method,phase}`
- `life_ustc_mcp_jsonrpc_requests_total{rpc_method}`
- `life_ustc_mcp_tool_calls_total{tool}`
- `life_ustc_mcp_tool_call_results_total{status,tool}`
- `life_ustc_mcp_tool_call_duration_ms_count{tool}`
- `life_ustc_mcp_tool_call_duration_ms_sum{tool}`

OAuth, audit, and storage:

- `life_ustc_oauth_token_requests_total{grant_type,has_resource,status}`
- `life_ustc_oauth_token_request_duration_ms_count{grant_type,has_resource}`
- `life_ustc_oauth_token_request_duration_ms_sum{grant_type,has_resource}`
- `life_ustc_audit_writes_total{action,status}`
- `life_ustc_audit_write_duration_ms_count{action}`
- `life_ustc_audit_write_duration_ms_sum{action}`
- `life_ustc_storage_operations_total{operation,status}`
- `life_ustc_storage_operation_duration_ms_count{operation}`
- `life_ustc_storage_operation_duration_ms_sum{operation}`

## Readiness

`/api/readiness` reports internal dependency status: DB reachability, storage
configuration, and process uptime. It is readable only from localhost or by
presenting `READINESS_BEARER_TOKEN` or `METRICS_BEARER_TOKEN` as a bearer token.

Use readiness for operator diagnostics. Keep Docker health checks shallow so a
transient dependency issue does not restart an otherwise healthy process.

## Alerts

Recommended critical alerts:

- Public blackbox probe failure for `https://life-ustc.tiankaima.dev`.
- App container unhealthy or restarting.
- Sustained REST 5xx rate from `life_ustc_api_errors_total`.
- Caddy upstream `502` or `504` spike.
- Database readiness failure.
- Disk usage above 85%.

Recommended warning alerts:

- REST or MCP latency regression, using average latency derived as
  `*_duration_ms_sum` divided by `*_duration_ms_count`.
- OAuth token failure spike.
- MCP auth rejection spike.
- Storage operation error spike.
- Audit write error spike.
- Memory pressure or sustained swap growth.

## Dashboards

At minimum, Grafana should show:

- REST request rate, status class, top routes, and average latency.
- MCP HTTP phases, JSON-RPC methods, tool calls, tool failures, and latency.
- OAuth token requests by grant type/status.
- Audit write success/error counts.
- Storage operation success/error counts and latency.
- Host CPU, memory, swap, disk, and container restart state.
4 changes: 3 additions & 1 deletion src/app/api/admin/comments/[id]/route.ts
Original file line number Diff line number Diff line change
Expand Up @@ -9,6 +9,7 @@ import {
import { adminModerateCommentRequestSchema } from "@/lib/api/schemas/request-schemas";
import { fireAuditLog } from "@/lib/audit/write-audit-log";
import { prisma } from "@/lib/db/prisma";
import { observedApiRoute } from "@/lib/log/api-observability";

export const dynamic = "force-dynamic";

Expand All @@ -19,7 +20,7 @@ export const dynamic = "force-dynamic";
* @response adminModeratedCommentResponseSchema
* @response 400:openApiErrorSchema
*/
export async function PATCH(
async function patchRoute(
request: Request,
{ params }: { params: Promise<{ id: string }> },
) {
Expand Down Expand Up @@ -69,3 +70,4 @@ export async function PATCH(
return jsonResponse({ comment: updated });
});
}
export const PATCH = observedApiRoute(patchRoute);
4 changes: 3 additions & 1 deletion src/app/api/admin/comments/route.ts
Original file line number Diff line number Diff line change
Expand Up @@ -7,6 +7,7 @@ import {
} from "@/lib/api/helpers";
import { adminCommentsQuerySchema } from "@/lib/api/schemas/request-schemas";
import { prisma } from "@/lib/db/prisma";
import { observedApiRoute } from "@/lib/log/api-observability";

export const dynamic = "force-dynamic";

Expand All @@ -18,7 +19,7 @@ const STATUS_FILTERS = ["active", "softbanned", "deleted"] as const;
* @response adminCommentsResponseSchema
* @response 400:openApiErrorSchema
*/
export async function GET(request: Request) {
async function getRoute(request: Request) {
return withAdminRoute("Failed to fetch moderation queue", async () => {
const searchParams = getRequestSearchParams(request);
const parsed = parseRouteQuery(
Expand Down Expand Up @@ -111,3 +112,4 @@ export async function GET(request: Request) {
);
});
}
export const GET = observedApiRoute(getRoute);
4 changes: 3 additions & 1 deletion src/app/api/admin/descriptions/route.ts
Original file line number Diff line number Diff line change
Expand Up @@ -6,6 +6,7 @@ import {
} from "@/lib/api/helpers";
import { adminDescriptionsQuerySchema } from "@/lib/api/schemas/request-schemas";
import { prisma } from "@/lib/db/prisma";
import { observedApiRoute } from "@/lib/log/api-observability";
import { ilike } from "@/lib/query-helpers";

export const dynamic = "force-dynamic";
Expand All @@ -16,7 +17,7 @@ export const dynamic = "force-dynamic";
* @response adminDescriptionsResponseSchema
* @response 400:openApiErrorSchema
*/
export async function GET(request: Request) {
async function getRoute(request: Request) {
return withAdminRoute(
"Failed to fetch descriptions moderation queue",
async () => {
Expand Down Expand Up @@ -164,3 +165,4 @@ export async function GET(request: Request) {
},
);
}
export const GET = observedApiRoute(getRoute);
4 changes: 3 additions & 1 deletion src/app/api/admin/homeworks/[id]/route.ts
Original file line number Diff line number Diff line change
Expand Up @@ -5,6 +5,7 @@ import {
parseResourceIdParam,
} from "@/lib/api/helpers";
import { prisma } from "@/lib/db/prisma";
import { observedApiRoute } from "@/lib/log/api-observability";

export const dynamic = "force-dynamic";

Expand All @@ -14,7 +15,7 @@ export const dynamic = "force-dynamic";
* @response successResponseSchema
* @response 404:openApiErrorSchema
*/
export async function DELETE(
async function deleteRoute(
_request: Request,
{ params }: { params: Promise<{ id: string }> },
) {
Expand Down Expand Up @@ -61,3 +62,4 @@ export async function DELETE(
return jsonResponse({ success: true });
});
}
export const DELETE = observedApiRoute(deleteRoute);
4 changes: 3 additions & 1 deletion src/app/api/admin/homeworks/route.ts
Original file line number Diff line number Diff line change
Expand Up @@ -6,6 +6,7 @@ import {
} from "@/lib/api/helpers";
import { adminHomeworksQuerySchema } from "@/lib/api/schemas/request-schemas";
import { prisma } from "@/lib/db/prisma";
import { observedApiRoute } from "@/lib/log/api-observability";
import { ilike } from "@/lib/query-helpers";

export const dynamic = "force-dynamic";
Expand All @@ -16,7 +17,7 @@ export const dynamic = "force-dynamic";
* @response adminHomeworksResponseSchema
* @response 400:openApiErrorSchema
*/
export async function GET(request: Request) {
async function getRoute(request: Request) {
return withAdminRoute(
"Failed to fetch homework moderation queue",
async () => {
Expand Down Expand Up @@ -105,3 +106,4 @@ export async function GET(request: Request) {
},
);
}
export const GET = observedApiRoute(getRoute);
4 changes: 3 additions & 1 deletion src/app/api/admin/suspensions/[id]/route.ts
Original file line number Diff line number Diff line change
Expand Up @@ -6,6 +6,7 @@ import {
} from "@/lib/api/helpers";
import { fireAuditLog } from "@/lib/audit/write-audit-log";
import { prisma } from "@/lib/db/prisma";
import { observedApiRoute } from "@/lib/log/api-observability";

export const dynamic = "force-dynamic";

Expand All @@ -15,7 +16,7 @@ export const dynamic = "force-dynamic";
* @response adminSuspensionResponseSchema
* @response 404:openApiErrorSchema
*/
export async function PATCH(
async function patchRoute(
_request: Request,
{ params }: { params: Promise<{ id: string }> },
) {
Expand Down Expand Up @@ -53,3 +54,4 @@ export async function PATCH(
return jsonResponse({ suspension });
});
}
export const PATCH = observedApiRoute(patchRoute);
Loading
Loading