From 5b8f4fb71d7d298b578916da8f8cb611e8843aa8 Mon Sep 17 00:00:00 2001 From: vivek1504 Date: Fri, 29 May 2026 00:12:36 +0530 Subject: [PATCH 1/2] removed pino pretty --- src/utils/logger.ts | 10 ---------- 1 file changed, 10 deletions(-) diff --git a/src/utils/logger.ts b/src/utils/logger.ts index 6ddf478..d06e993 100644 --- a/src/utils/logger.ts +++ b/src/utils/logger.ts @@ -18,16 +18,6 @@ const baseOptions: LoggerOptions = { timestamp: pino.stdTimeFunctions.isoTime, }; -if (!isProd) { - baseOptions.transport = { - target: "pino-pretty", - options: { - colorize: true, - translateTime: "SYS:HH:MM:ss.l", - ignore: "pid,hostname", - }, - }; -} export const logger = pino(baseOptions); From 6d4ec067fc0361d30c4911ed38a95f39301481b1 Mon Sep 17 00:00:00 2001 From: vivek1504 Date: Sat, 30 May 2026 09:38:07 +0530 Subject: [PATCH 2/2] added monitoring using prometheus --- package-lock.json | 38 +++++++++++++ package.json | 1 + src/app.ts | 50 +++++++++++++++- src/deploy/pipeline.ts | 23 +++++--- src/routes/deploy.ts | 17 +++++- src/routes/invoke.test.ts | 1 - src/routes/invoke.ts | 14 ++++- src/runtime/cleanup.ts | 4 ++ src/runtime/protocol.ts | 5 ++ src/runtime/scheduler.ts | 3 + src/runtime/test.store.ts | 17 ------ src/runtime/transport.ts | 11 +++- src/runtime/vm-manager.ts | 72 +++++++++++++++-------- src/server.ts | 9 +++ src/utils/logger.ts | 12 ++-- src/utils/metrics.ts | 117 ++++++++++++++++++++++++++++++++++++++ 16 files changed, 332 insertions(+), 62 deletions(-) delete mode 100644 src/runtime/test.store.ts create mode 100644 src/utils/metrics.ts diff --git a/package-lock.json b/package-lock.json index 61400b0..ac768e1 100644 --- a/package-lock.json +++ b/package-lock.json @@ -22,6 +22,7 @@ "p-queue": "^9.2.0", "pino": "^10.3.1", "pino-http": "^11.0.0", + "prom-client": "^15.1.3", "unzipper": "^0.12.3" }, "devDependencies": { @@ -185,6 +186,15 @@ "url": "https://paulmillr.com/funding/" } }, + "node_modules/@opentelemetry/api": { + "version": "1.9.1", + "resolved": "https://registry.npmjs.org/@opentelemetry/api/-/api-1.9.1.tgz", + "integrity": "sha512-gLyJlPHPZYdAk1JENA9LeHejZe1Ti77/pTeFm/nMXmQH/HFZlcS/O2XJB+L8fkbrNSqhdtlvjBVjxwUYanNH5Q==", + "license": "Apache-2.0", + "engines": { + "node": ">=8.0.0" + } + }, "node_modules/@oxc-project/types": { "version": "0.132.0", "resolved": "https://registry.npmjs.org/@oxc-project/types/-/types-0.132.0.tgz", @@ -903,6 +913,12 @@ "proxy-from-env": "^2.1.0" } }, + "node_modules/bintrees": { + "version": "1.0.2", + "resolved": "https://registry.npmjs.org/bintrees/-/bintrees-1.0.2.tgz", + "integrity": "sha512-VOMgTMwjAaUG580SXn3LacVgjurrbMme7ZZNYGSSV7mmtY6QQRh0Eg3pwIcntQ77DErK1L0NxkbetjcoXzVwKw==", + "license": "MIT" + }, "node_modules/bluebird": { "version": "3.7.2", "resolved": "https://registry.npmjs.org/bluebird/-/bluebird-3.7.2.tgz", @@ -2579,6 +2595,19 @@ ], "license": "MIT" }, + "node_modules/prom-client": { + "version": "15.1.3", + "resolved": "https://registry.npmjs.org/prom-client/-/prom-client-15.1.3.tgz", + "integrity": "sha512-6ZiOBfCywsD4k1BN9IX0uZhF+tJkV8q8llP64G5Hajs4JOeVLPCwpPVcpXy3BwYiUGgyJzsJJQeOIv7+hDSq8g==", + "license": "Apache-2.0", + "dependencies": { + "@opentelemetry/api": "^1.4.0", + "tdigest": "^0.1.1" + }, + "engines": { + "node": "^16 || ^18 || >=20" + } + }, "node_modules/proxy-addr": { "version": "2.0.7", "resolved": "https://registry.npmjs.org/proxy-addr/-/proxy-addr-2.0.7.tgz", @@ -3024,6 +3053,15 @@ "node": ">=8" } }, + "node_modules/tdigest": { + "version": "0.1.2", + "resolved": "https://registry.npmjs.org/tdigest/-/tdigest-0.1.2.tgz", + "integrity": "sha512-+G0LLgjjo9BZX2MfdvPfH+MKLCrxlXSYec5DaPYP1fe6Iyhf0/fSmJ0bFiZ1F8BT6cGXl2LpltQptzjXKWEkKA==", + "license": "MIT", + "dependencies": { + "bintrees": "1.0.2" + } + }, "node_modules/thread-stream": { "version": "4.2.0", "resolved": "https://registry.npmjs.org/thread-stream/-/thread-stream-4.2.0.tgz", diff --git a/package.json b/package.json index 25a7964..a0a2fa9 100644 --- a/package.json +++ b/package.json @@ -27,6 +27,7 @@ "p-queue": "^9.2.0", "pino": "^10.3.1", "pino-http": "^11.0.0", + "prom-client": "^15.1.3", "unzipper": "^0.12.3" }, "devDependencies": { diff --git a/src/app.ts b/src/app.ts index 36f0353..b401c30 100644 --- a/src/app.ts +++ b/src/app.ts @@ -1,13 +1,59 @@ import express from "express"; -import pinoHttp from "pino-http"; +import { pinoHttp } from "pino-http"; +import crypto from "crypto"; import { deployRouter } from "./routes/deploy.js"; import { invokeRouter } from "./routes/invoke.js"; import { httpLoggerOptions } from "./utils/logger.js"; +import { register, httpRequestDuration, httpRequestsTotal } from "./utils/metrics.js"; +import { deployQueue } from "./deploy/queue.js"; export const app = express(); -// @ts-expect-error +app.use((req, _res, next) => { + req.id = req.headers["x-request-id"] || crypto.randomUUID(); + next(); +}); app.use(pinoHttp(httpLoggerOptions)); + +app.use((req, res, next) => { + const start = process.hrtime.bigint(); + + res.on("finish", () => { + const durationSec = + Number(process.hrtime.bigint() - start) / 1_000_000_000; + const route = req.route?.path || req.path || "unknown"; + const labels = { + method: req.method, + route, + status_code: String(res.statusCode), + }; + httpRequestDuration.observe(labels, durationSec); + httpRequestsTotal.inc(labels); + }); + + next(); +}); + app.use(express.json()); app.use("/deploy", deployRouter); app.use("/f", invokeRouter); + +app.get("/metrics", async (_req, res) => { + res.set("Content-Type", register.contentType); + res.end(await register.metrics()); +}); + +app.get("/health", (_req, res) => { + res.json({ status: "ok", uptime: process.uptime() }); +}); + +app.get("/ready", (_req, res) => { + const checks = { + queueAvailable: deployQueue.size < 50, + memoryOk: process.memoryUsage().heapUsed < 500 * 1024 * 1024, + }; + const healthy = Object.values(checks).every(Boolean); + res + .status(healthy ? 200 : 503) + .json({ status: healthy ? "ready" : "not_ready", checks }); +}); diff --git a/src/deploy/pipeline.ts b/src/deploy/pipeline.ts index 5cbcbcc..abdb24d 100644 --- a/src/deploy/pipeline.ts +++ b/src/deploy/pipeline.ts @@ -11,6 +11,7 @@ import fs from "fs"; import crypto from "crypto"; import { getPaths } from "../utils/path.js"; import { pipelineLogger } from "../utils/logger.js"; +import { deployTotal, deployStageDuration } from "../utils/metrics.js"; export async function deployFunction(zipPath: string) { const functionId = crypto.randomBytes(8).toString("hex"); @@ -23,35 +24,34 @@ export async function deployFunction(zipPath: string) { ); try { - // ── Stage 1: Extract zip ────────────────────────────────────── const t0 = performance.now(); await extractZip(zipPath, paths.outputDir); const extractDuration = performance.now() - t0; + deployStageDuration.observe({ stage: "extract" }, extractDuration / 1000); pipelineLogger.info( { functionId, stage: "extract", durationMs: extractDuration }, "zip extraction completed", ); await fs.promises.unlink(zipPath); - // ── Stage 2: Prepare rootfs ─────────────────────────────────── const t1 = performance.now(); const image = await prepareRootfs(functionId); const rootfsDuration = performance.now() - t1; + deployStageDuration.observe({ stage: "rootfs" }, rootfsDuration / 1000); pipelineLogger.info( { functionId, stage: "rootfs", durationMs: rootfsDuration, image }, "rootfs preparation completed", ); - // ── Stage 3: Spawn Firecracker ──────────────────────────────── const t2 = performance.now(); fc = await startFirecrackerProcess(paths.apiSock); const spawnDuration = performance.now() - t2; + deployStageDuration.observe({ stage: "fc-spawn" }, spawnDuration / 1000); pipelineLogger.info( { functionId, stage: "fc-spawn", durationMs: spawnDuration }, "firecracker process spawned", ); - // ── Stage 4: Configure VM ───────────────────────────────────── const t3 = performance.now(); const readyPromise = waitForVMReady(fc); const client = createFcCient(paths.apiSock); @@ -59,29 +59,33 @@ export async function deployFunction(zipPath: string) { const t4 = performance.now(); await configureVM(client, functionId, image); const configureDuration = performance.now() - t4; + deployStageDuration.observe({ stage: "configure-vm" }, configureDuration / 1000); pipelineLogger.info( { functionId, stage: "configure-vm", durationMs: configureDuration }, "VM configured", ); - // ── Stage 5: Wait for VM ready ──────────────────────────────── await readyPromise; const readyDuration = performance.now() - t3; + deployStageDuration.observe({ stage: "vm-ready" }, readyDuration / 1000); pipelineLogger.info( { functionId, stage: "vm-ready", durationMs: readyDuration }, "VM reported READY", ); - // ── Stage 6: Snapshot ───────────────────────────────────────── const t5 = performance.now(); await snapshotVM(client, functionId); const snapshotDuration = performance.now() - t5; + deployStageDuration.observe({ stage: "snapshot" }, snapshotDuration / 1000); pipelineLogger.info( { functionId, stage: "snapshot", durationMs: snapshotDuration }, "VM snapshot created", ); const totalDuration = performance.now() - t0; + deployStageDuration.observe({ stage: "complete" }, totalDuration / 1000); + deployTotal.inc({ status: "success" }); + pipelineLogger.info( { functionId, @@ -104,19 +108,22 @@ export async function deployFunction(zipPath: string) { url: `http://localhost:3000/f/${functionId}`, }; } catch (err) { + deployTotal.inc({ status: "error" }); + pipelineLogger.error( { functionId, err }, "deployment pipeline failed", ); throw err; } finally { - // Always kill the FC process — whether deploy succeeded or failed try { fc!?.kill("SIGKILL"); } catch { } const t6 = performance.now(); await cleanupResources(paths); + const cleanupDuration = performance.now() - t6; + deployStageDuration.observe({ stage: "cleanup" }, cleanupDuration / 1000); pipelineLogger.debug( - { functionId, stage: "cleanup", durationMs: performance.now() - t6 }, + { functionId, stage: "cleanup", durationMs: cleanupDuration }, "post-deploy cleanup completed", ); } diff --git a/src/routes/deploy.ts b/src/routes/deploy.ts index 9b36620..e60747b 100644 --- a/src/routes/deploy.ts +++ b/src/routes/deploy.ts @@ -4,6 +4,10 @@ import { upload } from "../deploy/upload.js"; import { jobs, deployQueue } from "../deploy/queue.js"; import { deployFunction } from "../deploy/pipeline.js"; import { deployLogger } from "../utils/logger.js"; +import { + deployQueueDepth, + deployQueueWaitTime, +} from "../utils/metrics.js"; export const deployRouter = Router(); @@ -24,6 +28,9 @@ deployRouter.post("/", upload.single("code"), async (req, res) => { const jobId = crypto.randomBytes(8).toString("hex"); jobs.set(jobId, { state: "pending" }); + deployQueueDepth.inc(); + const enqueueTime = performance.now(); + deployLogger.info( { jobId, @@ -35,8 +42,14 @@ deployRouter.post("/", upload.single("code"), async (req, res) => { ); deployQueue.add(async () => { + const waitDurationSec = (performance.now() - enqueueTime) / 1000; + deployQueueWaitTime.observe(waitDurationSec); + jobs.set(jobId, { state: "running" }); - deployLogger.info({ jobId }, "deployment job started"); + deployLogger.info( + { jobId, queueWaitMs: waitDurationSec * 1000 }, + "deployment job started", + ); try { const result = await deployFunction(req.file!.path); @@ -55,6 +68,8 @@ deployRouter.post("/", upload.single("code"), async (req, res) => { { jobId, err }, "deployment job failed", ); + } finally { + deployQueueDepth.dec(); } }); diff --git a/src/routes/invoke.test.ts b/src/routes/invoke.test.ts index 37dfab6..7303e86 100644 --- a/src/routes/invoke.test.ts +++ b/src/routes/invoke.test.ts @@ -2,7 +2,6 @@ import { describe, it, expect, vi } from "vitest"; vi.mock("../runtime/scheduler.js", () => ({ enqueueRequest: vi.fn((functionId, task) => { - // Simulate immediate error (no snapshot exists) task.reject(new Error("no snapshot")); }), })); diff --git a/src/routes/invoke.ts b/src/routes/invoke.ts index 6cf437a..0b76b12 100644 --- a/src/routes/invoke.ts +++ b/src/routes/invoke.ts @@ -1,12 +1,14 @@ import { Router } from "express"; import { enqueueRequest } from "../runtime/scheduler.js"; import { runtimeLogger } from "../utils/logger.js"; +import { invocationTotal, invocationTime } from "../utils/metrics.js"; export const invokeRouter = Router(); invokeRouter.use("/:functionId", async (req, res) => { const { functionId } = req.params; const subPath = req.path || "/"; + const start = performance.now(); runtimeLogger.info( { functionId, method: req.method, subPath }, @@ -24,13 +26,21 @@ invokeRouter.use("/:functionId", async (req, res) => { }); }); + const durationSec = (performance.now() - start) / 1000; + invocationTime.observe({ function_id: functionId }, durationSec); + invocationTotal.inc({ function_id: functionId, status: "success" }); + runtimeLogger.info( - { functionId, method: req.method, subPath, statusCode: res.statusCode }, + { functionId, method: req.method, subPath, statusCode: res.statusCode, durationMs: durationSec * 1000 }, "function invocation completed", ); } catch (e: any) { + const durationSec = (performance.now() - start) / 1000; + invocationTime.observe({ function_id: functionId }, durationSec); + invocationTotal.inc({ function_id: functionId, status: "error" }); + runtimeLogger.error( - { functionId, method: req.method, subPath, err: e }, + { functionId, method: req.method, subPath, err: e, durationMs: durationSec * 1000 }, "function invocation failed", ); diff --git a/src/runtime/cleanup.ts b/src/runtime/cleanup.ts index 58bda59..ddcb58b 100644 --- a/src/runtime/cleanup.ts +++ b/src/runtime/cleanup.ts @@ -1,5 +1,6 @@ import fs from "fs"; import { cleanupLogger } from "../utils/logger.js"; +import { vmCleanupTotal, vmCount } from "../utils/metrics.js"; import type { RuntimeFunction, Vm } from "../types/types.js"; @@ -27,6 +28,9 @@ export async function cleanupVm(fn: RuntimeFunction, vm: Vm) { } catch {} fn.vms = fn.vms.filter((v) => v !== vm); + vmCleanupTotal.inc(); + vmCount.dec({ function_id: fn.functionId, state: "ready" }); + cleanupLogger.info( { functionId: fn.functionId, vmId: vm.id, remainingVms: fn.vms.length }, "VM cleanup completed", diff --git a/src/runtime/protocol.ts b/src/runtime/protocol.ts index 8a59f00..91635ce 100644 --- a/src/runtime/protocol.ts +++ b/src/runtime/protocol.ts @@ -1,5 +1,6 @@ import type { Socket } from "net"; import { protocolLogger } from "../utils/logger.js"; +import { vsockErrors } from "../utils/metrics.js"; export function buildPayload(req: any, subPath: string): string { return ( @@ -32,6 +33,7 @@ export function readVsockResponse( const timer = setTimeout(() => { protocolLogger.error({ timeoutMs: timeout }, "function execution timeout"); + vsockErrors.inc({ error_type: "timeout" }); socket.destroy(); reject(new Error("Function timeout")); }, timeout); @@ -64,6 +66,7 @@ export function readVsockResponse( return; } } catch { + vsockErrors.inc({ error_type: "parse_error" }); protocolLogger.error({ rawLine: line }, "invalid JSON received from VM"); } } @@ -72,6 +75,7 @@ export function readVsockResponse( onError = (err) => { clearTimeout(timer); cleanup(); + vsockErrors.inc({ error_type: "connection_error" }); protocolLogger.error({ err }, "vsock read error"); reject(err); }; @@ -79,6 +83,7 @@ export function readVsockResponse( onEnd = () => { clearTimeout(timer); cleanup(); + vsockErrors.inc({ error_type: "connection_closed" }); protocolLogger.error("vsock connection closed before response received"); reject(new Error("Connection closed before response")); }; diff --git a/src/runtime/scheduler.ts b/src/runtime/scheduler.ts index 917a513..15dba4b 100644 --- a/src/runtime/scheduler.ts +++ b/src/runtime/scheduler.ts @@ -2,6 +2,7 @@ import { runtimeStore } from "./store.js"; import { createVm } from "./vm-manager.js"; import { sendRequest } from "./transport.js"; import { schedulerLogger } from "../utils/logger.js"; +import { invocationQueueDepth } from "../utils/metrics.js"; import type { RequestTask, RuntimeFunction } from "../types/types.js"; @@ -22,6 +23,7 @@ export async function enqueueRequest(functionId: string, task: RequestTask) { } fn.queue.push(task); + invocationQueueDepth.inc({ function_id: functionId }); schedulerLogger.debug( { functionId, queueDepth: fn.queue.length }, "request enqueued", @@ -57,6 +59,7 @@ async function processQueue(fn: RuntimeFunction) { const task = fn.queue.shift(); if (!task) return; + invocationQueueDepth.dec({ function_id: fn.functionId }); vm.state = "busy"; schedulerLogger.debug( { functionId: fn.functionId, vmId: vm.id, subPath: task.subPath }, diff --git a/src/runtime/test.store.ts b/src/runtime/test.store.ts deleted file mode 100644 index 4b2881f..0000000 --- a/src/runtime/test.store.ts +++ /dev/null @@ -1,17 +0,0 @@ -import { describe, it, expect, beforeEach } from "vitest"; -import { runtimeStore } from "./store.js"; - -describe("runtimeStore", () => { - beforeEach(() => runtimeStore.reset()); - - it("starts empty", () => { - expect(runtimeStore.functions.size).toBe(0); - }); - - it("reset clears all functions", () => { - runtimeStore.functions.set("test", {} as any); - runtimeStore.reset(); - expect(runtimeStore.functions.size).toBe(0); - }); -}); - diff --git a/src/runtime/transport.ts b/src/runtime/transport.ts index b04b803..f853189 100644 --- a/src/runtime/transport.ts +++ b/src/runtime/transport.ts @@ -1,6 +1,7 @@ import net, { Socket } from "net"; import { buildPayload, readVsockResponse } from "./protocol.js"; import { transportLogger } from "../utils/logger.js"; +import { vsockConnectionTime, vsockErrors } from "../utils/metrics.js"; import type { Vm } from "../types/types.js"; export async function connectVsock( @@ -8,16 +9,19 @@ export async function connectVsock( timeout = 5000, ): Promise { transportLogger.debug({ path, timeoutMs: timeout }, "connecting to vsock"); + const start = performance.now(); return new Promise((resolve, reject) => { - const start = Date.now(); + const connectStart = Date.now(); const tryConnect = () => { const socket = net.createConnection({ path }); socket.once("connect", () => { + const durationSec = (performance.now() - start) / 1000; + vsockConnectionTime.observe(durationSec); transportLogger.debug( - { path, elapsedMs: Date.now() - start }, + { path, elapsedMs: Date.now() - connectStart }, "vsock connected", ); resolve(socket); @@ -26,7 +30,8 @@ export async function connectVsock( socket.once("error", () => { socket.destroy(); - if (Date.now() - start > timeout) { + if (Date.now() - connectStart > timeout) { + vsockErrors.inc({ error_type: "timeout" }); transportLogger.error({ path, timeoutMs: timeout }, "vsock connection timeout"); return reject(new Error("Vsock timeout")); } diff --git a/src/runtime/vm-manager.ts b/src/runtime/vm-manager.ts index 88e3da8..09934c5 100644 --- a/src/runtime/vm-manager.ts +++ b/src/runtime/vm-manager.ts @@ -4,6 +4,7 @@ import axios from "axios"; import path from "path"; import crypto from "crypto"; import { vmManagerLogger } from "../utils/logger.js"; +import { vmCount, vmCreationTime, vmCreationTotal } from "../utils/metrics.js"; import type { RuntimeFunction, Vm } from "../types/types.js"; @@ -14,42 +15,65 @@ export async function createVm( const instanceId = crypto.randomBytes(4).toString("hex"); const apiSock = `/tmp/firecracker-${functionId}-${instanceId}.socket`; const vsock = `/tmp/vsock-${functionId}-${instanceId}.sock`; + const start = performance.now(); vmManagerLogger.info( { functionId, instanceId, apiSock, vsock }, "creating new VM instance", ); - const fc = spawn("firecracker", ["--api-sock", apiSock]); + vmCount.inc({ function_id: functionId, state: "creating" }); - fc.on("error", (err) => { - vmManagerLogger.error({ instanceId, err }, "firecracker process error"); - }); + try { + const fc = spawn("firecracker", ["--api-sock", apiSock]); - fc.on("exit", (code, signal) => { - vmManagerLogger.info({ instanceId, exitCode: code, signal }, "firecracker process exited"); - }); + fc.on("error", (err) => { + vmManagerLogger.error({ instanceId, err }, "firecracker process error"); + }); - await waitForFirecrackerApiSocket(apiSock); + fc.on("exit", (code, signal) => { + vmManagerLogger.info({ instanceId, exitCode: code, signal }, "firecracker process exited"); + }); - const client = createFcClient(apiSock); - await restoreVm(client, functionId, vsock); + await waitForFirecrackerApiSocket(apiSock); - const vm: Vm = { - id: instanceId, - state: "ready", - firecrackerProcess: fc, - apiSock, - vsock, - idleTime: Date.now(), - }; + const client = createFcClient(apiSock); + await restoreVm(client, functionId, vsock); - fn.vms.push(vm); - vmManagerLogger.info( - { functionId, instanceId, totalVms: fn.vms.length }, - "VM instance created and ready", - ); - return vm; + const vm: Vm = { + id: instanceId, + state: "ready", + firecrackerProcess: fc, + apiSock, + vsock, + idleTime: Date.now(), + }; + + fn.vms.push(vm); + + const durationSec = (performance.now() - start) / 1000; + vmCreationTime.observe(durationSec); + vmCreationTotal.inc({ status: "success" }); + vmCount.dec({ function_id: functionId, state: "creating" }); + vmCount.inc({ function_id: functionId, state: "ready" }); + + vmManagerLogger.info( + { functionId, instanceId, totalVms: fn.vms.length, durationMs: durationSec * 1000 }, + "VM instance created and ready", + ); + return vm; + } catch (err) { + const durationSec = (performance.now() - start) / 1000; + vmCreationTime.observe(durationSec); + vmCreationTotal.inc({ status: "error" }); + vmCount.dec({ function_id: functionId, state: "creating" }); + + vmManagerLogger.error( + { functionId, instanceId, err, durationMs: durationSec * 1000 }, + "VM creation failed", + ); + throw err; + } } export async function waitForFirecrackerApiSocket( diff --git a/src/server.ts b/src/server.ts index 89747ca..d5544bf 100644 --- a/src/server.ts +++ b/src/server.ts @@ -3,6 +3,15 @@ import { logger } from "./utils/logger.js"; const PORT = process.env.PORT || 3000; +process.on("uncaughtException", (err) => { + logger.fatal({ err }, "uncaught exception — shutting down"); + process.exit(1); +}); + +process.on("unhandledRejection", (reason) => { + logger.fatal({ err: reason }, "unhandled rejection"); +}); + app.listen(PORT, () => { logger.info({ port: PORT }, `server listening on http://localhost:${PORT}`); }); diff --git a/src/utils/logger.ts b/src/utils/logger.ts index d06e993..bc013ad 100644 --- a/src/utils/logger.ts +++ b/src/utils/logger.ts @@ -2,23 +2,24 @@ import { pino } from "pino"; import type { Options } from "pino-http"; import type { LoggerOptions } from "pino"; -const isProd = process.env.NODE_ENV === "production"; - const baseOptions: LoggerOptions = { level: process.env.LOG_LEVEL || "info", redact: { paths: [ "req.headers.authorization", "req.headers.cookie", + "req.headers['x-api-key']", "password", "token", + "*.password", + "*.token", + "*.secret", ], censor: "[REDACTED]", }, timestamp: pino.stdTimeFunctions.isoTime, }; - export const logger = pino(baseOptions); export const deployLogger = logger.child({ module: "deploy" }); @@ -36,7 +37,10 @@ export const cleanupLogger = logger.child({ module: "cleanup" }); export const httpLoggerOptions: Options = { logger: logger.child({ module: "http" }), autoLogging: { - ignore: (req) => req.url === "/health", + ignore: (req) => + req.url === "/health" || + req.url === "/ready" || + req.url === "/metrics", }, customLogLevel: (_req, res, err) => { if (res.statusCode >= 500 || err) return "error"; diff --git a/src/utils/metrics.ts b/src/utils/metrics.ts new file mode 100644 index 0000000..08bb719 --- /dev/null +++ b/src/utils/metrics.ts @@ -0,0 +1,117 @@ +import { + Registry, + Counter, + Histogram, + Gauge, + collectDefaultMetrics, +} from "prom-client"; + +export const register = new Registry(); + +collectDefaultMetrics({ register }); + +export const httpRequestDuration = new Histogram({ + name: "http_request_duration", + help: "Duration of HTTP requests", + labelNames: ["method", "route", "status_code"], + buckets: [0.01, 0.05, 0.1, 0.25, 0.5, 1, 2.5, 5, 10], + registers: [register], +}); + +export const httpRequestsTotal = new Counter({ + name: "total_http_requests", + help: "Total number of HTTP requests", + labelNames: ["method", "route", "status_code"], + registers: [register], +}); + +export const deployTotal = new Counter({ + name: "total_deployments", + help: "Total deployment attempts", + labelNames: ["status"], + registers: [register], +}); + +export const deployStageDuration = new Histogram({ + name: "deploy_stage_duration", + help: "Duration of each deployment pipeline stage", + labelNames: ["stage"], + buckets: [0.1, 0.5, 1, 2, 5, 10, 30, 60], + registers: [register], +}); + +export const deployQueueDepth = new Gauge({ + name: "deploy_queue_depth", + help: "Current number of jobs in the deploy queue", + registers: [register], +}); + +export const deployQueueWaitTime = new Histogram({ + name: "deploy_queue_wait_time", + help: "Time a deploy job spends waiting in queue before execution", + buckets: [0.1, 0.5, 1, 5, 10, 30, 60], + registers: [register], +}); + +export const vmCount = new Gauge({ + name: "active_vm_count", + help: "Number of currently active VMs", + labelNames: ["function_id", "state"], + registers: [register], +}); + +export const vmCreationTime = new Histogram({ + name: "vm_creation_time", + help: "Time to create and restore a VM from snapshot", + buckets: [0.1, 0.25, 0.5, 1, 2, 5], + registers: [register], +}); + +export const vmCreationTotal = new Counter({ + name: "total_vm_created", + help: "Total VMs created", + labelNames: ["status"], + registers: [register], +}); + +export const vmCleanupTotal = new Counter({ + name: "total_vm_cleanups", + help: "Total VMs cleaned up", + registers: [register], +}); + +export const invocationTotal = new Counter({ + name: "total_invocations", + help: "Total function invocations", + labelNames: ["function_id", "status"], + registers: [register], +}); + +export const invocationTime = new Histogram({ + name: "invocation_time", + help: "End-to-end function invocation time", + labelNames: ["function_id"], + buckets: [0.01, 0.05, 0.1, 0.25, 0.5, 1, 2.5, 5, 10], + registers: [register], +}); + +export const invocationQueueDepth = new Gauge({ + name: "invocation_queue_depth", + help: "Current request queue depth per function", + labelNames: ["function_id"], + registers: [register], +}); + +export const vsockConnectionTime = new Histogram({ + name: "vsock_connection_time", + help: "Time to establish vsock connection", + buckets: [0.01, 0.05, 0.1, 0.25, 0.5, 1, 2], + registers: [register], +}); + +export const vsockErrors = new Counter({ + name: "vsock_errors_total", + help: "Total vsock connection/read errors", + labelNames: ["error_type"], + registers: [register], +});