diff --git a/.github/workflows/build-pr-monitoring.yaml b/.github/workflows/build-pr-monitoring.yaml new file mode 100644 index 0000000..9c9524b --- /dev/null +++ b/.github/workflows/build-pr-monitoring.yaml @@ -0,0 +1,18 @@ +name: "PR Build Check: monitoring" + +on: + pull_request: + paths: + - "monitoring/**" + +jobs: + build: + name: Build monitoring (PR check) + uses: ./.github/workflows/_build-push.yaml + with: + service_name: monitoring + context_path: monitoring + dockerfile_path: monitoring/docker/Dockerfile + image_name: local/monitoring + image_tag: pr-check + push: false diff --git a/.github/workflows/build-preprod-monitoring.yaml b/.github/workflows/build-preprod-monitoring.yaml new file mode 100644 index 0000000..ef32def --- /dev/null +++ b/.github/workflows/build-preprod-monitoring.yaml @@ -0,0 +1,25 @@ +name: "Build & Deploy to Preprod: monitoring" + +on: + push: + branches: + - develop + paths: + - "monitoring/**" + +jobs: + build: + name: Build monitoring (preprod) + uses: ./.github/workflows/_build-push.yaml + with: + service_name: monitoring + context_path: monitoring + dockerfile_path: monitoring/docker/Dockerfile + image_name: harbor.dyingstar-game.space/dyingstar/monitoring + image_tag: develop + chart_name: service-monitoring + trigger_preprod_deploy: true + secrets: + HARBOR_USERNAME: ${{ secrets.HARBOR_USERNAME }} + HARBOR_PASSWORD: ${{ secrets.HARBOR_PASSWORD }} + KUBERNETES_REPO_TOKEN: ${{ secrets.KUBERNETES_REPO_TOKEN }} diff --git a/.github/workflows/build-prod-monitoring.yaml b/.github/workflows/build-prod-monitoring.yaml new file mode 100644 index 0000000..0b0d925 --- /dev/null +++ b/.github/workflows/build-prod-monitoring.yaml @@ -0,0 +1,37 @@ +name: "Build Production Image: monitoring" + +on: + push: + tags: + - "monitoring-v*" + +jobs: + extract-version: + name: Extract version from tag + runs-on: ubuntu-latest + outputs: + version: ${{ steps.tag.outputs.version }} + steps: + - name: Extract version + id: tag + run: | + TAG="${GITHUB_REF_NAME}" + VERSION="${TAG#monitoring-}" + echo "version=${VERSION}" >> "$GITHUB_OUTPUT" + + build: + name: Build monitoring (prod) + needs: extract-version + uses: ./.github/workflows/_build-push.yaml + with: + service_name: monitoring + context_path: monitoring + dockerfile_path: monitoring/docker/Dockerfile + image_name: harbor.dyingstar-game.space/dyingstar/monitoring + image_tag: ${{ needs.extract-version.outputs.version }} + additional_tags: latest + chart_name: service-monitoring + trigger_preprod_deploy: false + secrets: + HARBOR_USERNAME: ${{ secrets.HARBOR_USERNAME }} + HARBOR_PASSWORD: ${{ secrets.HARBOR_PASSWORD }} diff --git a/monitoring/.env.example b/monitoring/.env.example new file mode 100644 index 0000000..4e92c62 --- /dev/null +++ b/monitoring/.env.example @@ -0,0 +1,4 @@ +# Environment variables for local development. +# Copy this file to .env and adjust as needed. +PORT=9300 +RUST_LOG=monitoring=debug diff --git a/monitoring/Cargo.toml b/monitoring/Cargo.toml new file mode 100644 index 0000000..a71db45 --- /dev/null +++ b/monitoring/Cargo.toml @@ -0,0 +1,34 @@ +[package] +name = "monitoring" +version = "0.1.0" +edition = "2021" + +[[bin]] +name = "monitoring" +path = "src/main.rs" + +[dependencies] +# Async runtime +tokio = { version = "1", features = ["full"] } + +# Web framework + WebSocket +axum = { version = "0.8", features = ["ws", "macros"] } +tower = "0.5" +tower-http = { version = "0.6", features = ["trace"] } + +# Serialization +serde = { version = "1", features = ["derive"] } +serde_json = "1" + +# Prometheus metrics +prometheus = { version = "0.13", features = ["process"] } + +# Logging / tracing +tracing = "0.1" +tracing-subscriber = { version = "0.3", features = ["env-filter", "fmt"] } + +# Error handling +anyhow = "1" + +# Environment / config +dotenvy = "0.15" diff --git a/monitoring/docker/Dockerfile b/monitoring/docker/Dockerfile new file mode 100644 index 0000000..ccd91e4 --- /dev/null +++ b/monitoring/docker/Dockerfile @@ -0,0 +1,31 @@ +# ─── Stage 1: build ──────────────────────────────────────────────────────── +FROM rust:1.87-slim AS builder + +RUN apt-get update && apt-get install -y pkg-config libssl-dev && rm -rf /var/lib/apt/lists/* + +WORKDIR /app + +# Cache dependencies separately from source code. +COPY Cargo.toml Cargo.lock* ./ +RUN mkdir -p src && echo 'fn main() {}' > src/main.rs +RUN cargo build --release && rm -rf src + +# Copy real source and rebuild. +COPY src ./src +RUN touch src/main.rs && cargo build --release + +# ─── Stage 2: minimal runtime ────────────────────────────────────────────── +FROM debian:bookworm-slim AS runtime + +RUN apt-get update && apt-get install -y libssl3 ca-certificates && rm -rf /var/lib/apt/lists/* + +RUN useradd -r -u 1001 -g root monitoring +WORKDIR /app + +COPY --from=builder /app/target/release/monitoring /app/monitoring + +USER 1001 + +EXPOSE 9300 + +ENTRYPOINT ["/app/monitoring"] diff --git a/monitoring/src/config.rs b/monitoring/src/config.rs new file mode 100644 index 0000000..d359736 --- /dev/null +++ b/monitoring/src/config.rs @@ -0,0 +1,19 @@ +use std::env; + +/// Application configuration loaded from environment variables. +#[derive(Debug, Clone)] +pub struct Config { + /// Port for the combined HTTP server (WebSocket /ws + metrics /metrics). + pub port: u16, +} + +impl Config { + pub fn from_env() -> Self { + let port = env::var("PORT") + .ok() + .and_then(|v| v.parse().ok()) + .unwrap_or(9300); + + Self { port } + } +} diff --git a/monitoring/src/main.rs b/monitoring/src/main.rs new file mode 100644 index 0000000..f8790df --- /dev/null +++ b/monitoring/src/main.rs @@ -0,0 +1,61 @@ +pub mod config; +pub mod metrics; +pub mod websocket; + +use std::{net::SocketAddr, sync::Arc}; + +use axum::{routing::get, Router}; +use prometheus::{Encoder, TextEncoder}; +use tokio::net::TcpListener; +use tracing::info; + +use crate::{config::Config, metrics::Metrics, websocket::ws_handler}; + +#[tokio::main] +async fn main() -> anyhow::Result<()> { + // Load .env file if present (non-fatal if absent). + let _ = dotenvy::dotenv(); + + // Initialise structured logging. + tracing_subscriber::fmt() + .with_env_filter( + tracing_subscriber::EnvFilter::try_from_default_env() + .unwrap_or_else(|_| "monitoring=debug,tower_http=info".parse().unwrap()), + ) + .init(); + + let config = Arc::new(Config::from_env()); + info!(?config, "Starting monitoring service"); + + // Register all Prometheus metrics. + Metrics::init(); + + // Build the axum router: + // GET /ws — WebSocket endpoint (ds_bridge connects here) + // GET /metrics — Prometheus text endpoint (Prometheus scrapes here) + // GET /health — Simple liveness probe + let app = Router::new() + .route("/ws", get(ws_handler)) + .route("/metrics", get(metrics_handler)) + .route("/health", get(|| async { "ok" })); + + let addr: SocketAddr = format!("0.0.0.0:{}", config.port).parse()?; + let listener = TcpListener::bind(addr).await?; + info!(%addr, "listening"); + + axum::serve(listener, app).await?; + Ok(()) +} + +/// Render all registered Prometheus metrics as text. +async fn metrics_handler() -> impl axum::response::IntoResponse { + let mut buf = Vec::new(); + let encoder = TextEncoder::new(); + let content_type = encoder.format_type().to_owned(); + let metric_families = prometheus::gather(); + encoder.encode(&metric_families, &mut buf).unwrap_or(()); + ( + [(axum::http::header::CONTENT_TYPE, content_type)], + buf, + ) +} diff --git a/monitoring/src/metrics.rs b/monitoring/src/metrics.rs new file mode 100644 index 0000000..791e559 --- /dev/null +++ b/monitoring/src/metrics.rs @@ -0,0 +1,84 @@ +use prometheus::{ + register_gauge, register_int_counter, register_int_counter_vec, + Gauge, IntCounter, IntCounterVec, +}; +use std::sync::OnceLock; + +/// All Prometheus metrics for the monitoring service. +/// Initialised once at startup via `Metrics::init()`. +pub struct Metrics { + /// Current number of players connected to Horizon (gauge). + pub players_connected: Gauge, + /// Total player connections since startup (counter). + pub players_connect_total: IntCounter, + /// Total player disconnections since startup (counter). + pub players_disconnect_total: IntCounter, + /// Number of active Godot game server instances (gauge). + pub godot_servers_active: Gauge, + /// Total items created, by kind (counter, label: kind). + pub items_created_total: IntCounterVec, + /// Total successful player authentications (counter). + pub auth_success_total: IntCounter, +} + +static METRICS: OnceLock = OnceLock::new(); + +impl Metrics { + /// Register all metrics with the default Prometheus registry. + /// Must be called exactly once at startup before the HTTP server starts. + pub fn init() -> &'static Metrics { + METRICS.get_or_init(|| Metrics { + players_connected: register_gauge!( + "horizon_players_connected", + "Current number of players connected to Horizon" + ) + .expect("metric registration failed"), + + players_connect_total: register_int_counter!( + "horizon_players_connect_total", + "Total player connections since service start" + ) + .expect("metric registration failed"), + + players_disconnect_total: register_int_counter!( + "horizon_players_disconnect_total", + "Total player disconnections since service start" + ) + .expect("metric registration failed"), + + godot_servers_active: register_gauge!( + "horizon_godot_servers_active", + "Number of active Godot game server instances registered with Horizon" + ) + .expect("metric registration failed"), + + items_created_total: register_int_counter_vec!( + "horizon_items_created_total", + "Total items created in Horizon, by kind", + &["kind"] + ) + .expect("metric registration failed"), + + auth_success_total: register_int_counter!( + "horizon_auth_success_total", + "Total successful player authentications" + ) + .expect("metric registration failed"), + }) + } + + /// Get the global metrics instance (panics if not yet initialised). + pub fn get() -> &'static Metrics { + METRICS.get().expect("Metrics not initialised — call Metrics::init() first") + } + + /// Reset all metric values to zero (called when Horizon restarts). + pub fn reset(&self) { + self.players_connected.set(0.0); + self.godot_servers_active.set(0.0); + self.players_connect_total.reset(); + self.players_disconnect_total.reset(); + self.auth_success_total.reset(); + self.items_created_total.reset(); + } +} diff --git a/monitoring/src/websocket.rs b/monitoring/src/websocket.rs new file mode 100644 index 0000000..520345e --- /dev/null +++ b/monitoring/src/websocket.rs @@ -0,0 +1,129 @@ +use axum::{ + extract::{ + ws::{Message, WebSocket}, + WebSocketUpgrade, + }, + response::IntoResponse, +}; +use serde::Deserialize; +use tracing::{debug, info, warn}; + +use crate::metrics::Metrics; + +/// Event envelope sent by ds_bridge to external services. +/// Must match the BridgeEventEnvelope struct in ds_bridge/src/config.rs. +#[derive(Debug, Deserialize)] +struct BridgeEventEnvelope { + event_type: String, + namespace: Option, + name: String, + // payload fields we don't inspect — keep as raw JSON + #[allow(dead_code)] + payload: serde_json::Value, +} + +/// Axum handler — upgrades an HTTP connection to WebSocket. +pub async fn ws_handler(ws: WebSocketUpgrade) -> impl IntoResponse { + ws.on_upgrade(handle_socket) +} + +/// Process messages on an accepted WebSocket connection. +async fn handle_socket(mut socket: WebSocket) { + debug!("ds_bridge connected"); + + while let Some(msg) = socket.recv().await { + match msg { + Ok(Message::Text(text)) => { + handle_envelope(text.as_str()); + } + Ok(Message::Ping(data)) => { + // Axum auto-replies to pings; send explicit Pong anyway. + let _ = socket.send(Message::Pong(data)).await; + } + Ok(Message::Close(_)) | Err(_) => { + debug!("ds_bridge disconnected"); + break; + } + _ => {} // Binary / Pong — ignore + } + } +} + +/// Dispatch a raw JSON envelope string to the appropriate metric update. +fn handle_envelope(raw: &str) { + let env: BridgeEventEnvelope = match serde_json::from_str(raw) { + Ok(e) => e, + Err(e) => { + warn!("failed to parse envelope: {} — raw: {}", e, raw); + return; + } + }; + + debug!( + event_type = %env.event_type, + namespace = ?env.namespace, + name = %env.name, + "received event" + ); + + let m = Metrics::get(); + + match (env.event_type.as_str(), env.namespace.as_deref(), env.name.as_str()) { + // ── Bridge lifecycle ────────────────────────────────────────────────── + ("bridge", _, "connected") => { + let is_reconnection = env.payload + .get("is_reconnection") + .and_then(|v| v.as_bool()) + .unwrap_or(false); + if !is_reconnection { + info!("Horizon restarted — resetting all metrics"); + m.reset(); + } else { + debug!("Horizon reconnected (reconnection=true) — keeping metrics"); + } + } + + // ── Core events ────────────────────────────────────────────────────── + ("core", _, "player_connected") => { + m.players_connected.inc(); + m.players_connect_total.inc(); + } + ("core", _, "player_disconnected") => { + // Clamp at 0 — in case a disconnect arrives before a reconnect. + let current = m.players_connected.get(); + m.players_connected.set((current - 1.0).max(0.0)); + m.players_disconnect_total.inc(); + } + + // ── Generic props ───────────────────────────────────────────────────── + ("plugin", Some("genericprops"), "create_object") => { + m.items_created_total.with_label_values(&["generic"]).inc(); + } + ("plugin", Some("genericprops"), "create_object_from_gameserver") => { + m.items_created_total.with_label_values(&["gameserver"]).inc(); + } + + // ── Authentication ──────────────────────────────────────────────────── + ("plugin", Some("ds_player_authentication"), "player_authenticated") => { + m.auth_success_total.inc(); + } + + // ── Godot game servers ──────────────────────────────────────────────── + ("plugin", Some("ds_game_server"), "server_registered") => { + m.godot_servers_active.inc(); + } + ("plugin", Some("ds_game_server"), "server_unregistered") => { + let current = m.godot_servers_active.get(); + m.godot_servers_active.set((current - 1.0).max(0.0)); + } + + _ => { + debug!( + event_type = %env.event_type, + namespace = ?env.namespace, + name = %env.name, + "unhandled event — ignoring" + ); + } + } +}