From 989d4aa7525e9b985352b6cb8530009ed12b7936 Mon Sep 17 00:00:00 2001 From: Chong Yang Date: Wed, 6 May 2026 16:04:13 +0930 Subject: [PATCH 1/4] feat(metrics): TECH-6381 break workflow executions down by org_slug Replace the two queries (status counts + per-org error breakdown) with a single combined query that groups by status AND organization.slug. Adds executionsByStatusAndOrgSlug to WorkflowStats so the prometheus collector can label keeperhub_workflow_executions_total by org_slug, following the same convention as the errors gauge (anonymous workflows bucket under '_anonymous' so per-org sums match the global totals). totalSuccess/totalError/etc and errorByOrgSlug are now derived from the combined query, keeping the existing errors gauge wiring unchanged. --- lib/metrics/db-metrics.ts | 83 +++++++++++++++++++++------------------ 1 file changed, 44 insertions(+), 39 deletions(-) diff --git a/lib/metrics/db-metrics.ts b/lib/metrics/db-metrics.ts index 69b8f2178..17f2087b0 100644 --- a/lib/metrics/db-metrics.ts +++ b/lib/metrics/db-metrics.ts @@ -49,7 +49,7 @@ const WORKFLOW_DURATION_BUCKETS = [ const STEP_DURATION_BUCKETS = [50, 100, 250, 500, 1000, 2000, 5000]; export type WorkflowStats = { - // Total executions by status + // Total executions by status (sum across all orgs) totalSuccess: number; totalError: number; totalRunning: number; @@ -60,6 +60,15 @@ export type WorkflowStats = { // under ANONYMOUS_ORG_SLUG so the sum across this map matches totalError. errorByOrgSlug: Record; + // Per-(status, org_slug) execution counts. Personal/anonymous workflows + // are bucketed under ANONYMOUS_ORG_SLUG so the sum of counts for a given + // status across all orgs matches the corresponding total* above. + executionsByStatusAndOrgSlug: Array<{ + status: string; + orgSlug: string; + count: number; + }>; + // Duration histogram data (count of executions in each bucket) durationBuckets: number[]; durationSum: number; @@ -74,15 +83,6 @@ export type WorkflowStats = { */ export async function getWorkflowStatsFromDb(): Promise { try { - // Query execution counts by status - const statusCounts = await db - .select({ - status: workflowExecutions.status, - count: count(), - }) - .from(workflowExecutions) - .groupBy(workflowExecutions.status); - const stats: WorkflowStats = { totalSuccess: 0, totalError: 0, @@ -90,27 +90,54 @@ export async function getWorkflowStatsFromDb(): Promise { totalPending: 0, totalCancelled: 0, errorByOrgSlug: {}, + executionsByStatusAndOrgSlug: [], durationBuckets: new Array(WORKFLOW_DURATION_BUCKETS.length + 1).fill(0), durationSum: 0, durationCount: 0, }; - for (const row of statusCounts) { + // Per-(status, org_slug) execution breakdown: JOIN workflows + organization, + // LEFT JOIN so anonymous workflows still contribute (under ANONYMOUS_ORG_SLUG). + // GROUP BY uses the organization.slug column reference (not the COALESCE + // expression): Drizzle would otherwise bind ANONYMOUS_ORG_SLUG as separate + // parameters in SELECT and GROUP BY clauses, and Postgres rejects the query + // because the two COALESCE expressions are not textually identical. Postgres + // groups all NULL slugs into one group (NULLs are equal in GROUP BY), and + // the SELECT-side COALESCE renders that group as ANONYMOUS_ORG_SLUG. + const breakdown = await db + .select({ + status: workflowExecutions.status, + orgSlug: sql`COALESCE(${organization.slug}, ${ANONYMOUS_ORG_SLUG})`, + count: count(), + }) + .from(workflowExecutions) + .innerJoin(workflows, eq(workflowExecutions.workflowId, workflows.id)) + .leftJoin(organization, eq(workflows.organizationId, organization.id)) + .groupBy(workflowExecutions.status, organization.slug); + + for (const row of breakdown) { + const c = Number(row.count) || 0; + stats.executionsByStatusAndOrgSlug.push({ + status: row.status, + orgSlug: row.orgSlug, + count: c, + }); switch (row.status) { case "success": - stats.totalSuccess = row.count; + stats.totalSuccess += c; break; case "error": - stats.totalError = row.count; + stats.totalError += c; + stats.errorByOrgSlug[row.orgSlug] = c; break; case "running": - stats.totalRunning = row.count; + stats.totalRunning += c; break; case "pending": - stats.totalPending = row.count; + stats.totalPending += c; break; case "cancelled": - stats.totalCancelled = row.count; + stats.totalCancelled += c; break; default: // Ignore unknown status values @@ -118,29 +145,6 @@ export async function getWorkflowStatsFromDb(): Promise { } } - // Per-org error breakdown: JOIN workflows + organization, LEFT JOIN so - // anonymous workflows still contribute (under ANONYMOUS_ORG_SLUG). - // GROUP BY uses the column reference (not the COALESCE expression): - // Drizzle would otherwise bind ANONYMOUS_ORG_SLUG as separate parameters - // in SELECT and GROUP BY clauses, and Postgres rejects the query because - // the two COALESCE expressions are not textually identical. Postgres - // groups all NULL slugs into one group (NULLs are equal in GROUP BY), - // and the SELECT-side COALESCE renders that group as ANONYMOUS_ORG_SLUG. - const errorByOrg = await db - .select({ - orgSlug: sql`COALESCE(${organization.slug}, ${ANONYMOUS_ORG_SLUG})`, - count: count(), - }) - .from(workflowExecutions) - .innerJoin(workflows, eq(workflowExecutions.workflowId, workflows.id)) - .leftJoin(organization, eq(workflows.organizationId, organization.id)) - .where(eq(workflowExecutions.status, "error")) - .groupBy(organization.slug); - - for (const row of errorByOrg) { - stats.errorByOrgSlug[row.orgSlug] = Number(row.count) || 0; - } - // Query duration histogram data for completed executions // Build bucket counts using SQL CASE statements for efficiency const durationQuery = await db @@ -193,6 +197,7 @@ export async function getWorkflowStatsFromDb(): Promise { totalPending: 0, totalCancelled: 0, errorByOrgSlug: {}, + executionsByStatusAndOrgSlug: [], durationBuckets: new Array(WORKFLOW_DURATION_BUCKETS.length + 1).fill(0), durationSum: 0, durationCount: 0, From a967febe9000b33e6bdc322ebd7917470a6748fb Mon Sep 17 00:00:00 2001 From: Chong Yang Date: Wed, 6 May 2026 16:04:19 +0930 Subject: [PATCH 2/4] feat(metrics): TECH-6381 expose executions gauge with org_slug label Add org_slug to keeperhub_workflow_executions_total so dashboards and alerts can scope the success rate to managed clients (the errors gauge already had this label; the executions gauge didn't, which made it impossible to compute a system-only success rate). Reset before populating so series for orgs that drop to zero in a given status clear out instead of going stale -- same pattern as the errors gauge. --- lib/metrics/collectors/prometheus.ts | 36 ++++++++++++---------------- 1 file changed, 15 insertions(+), 21 deletions(-) diff --git a/lib/metrics/collectors/prometheus.ts b/lib/metrics/collectors/prometheus.ts index 927478611..6fcf13865 100644 --- a/lib/metrics/collectors/prometheus.ts +++ b/lib/metrics/collectors/prometheus.ts @@ -94,12 +94,14 @@ function getOrCreateGauge( // All metrics are GAUGES (point-in-time snapshots). Use max() aggregation across pods. // For rate/delta queries, use PromQL delta() function: max(delta(metric[1h])) -// Workflow execution counts by status +// Workflow execution counts by status and org_slug. Personal/anonymous +// workflows are emitted under org_slug="_anonymous" so the sum across +// org_slug for a given status equals the global per-status total. const workflowExecutionsTotal = getOrCreateGauge( dbRegistry, "keeperhub_workflow_executions_total", - "Total workflow executions by status (all-time)", - ["status"] + "Total workflow executions by status, broken down by org_slug (all-time)", + ["status", "org_slug"] ); // Workflow errors total (convenience gauge for alerting). Labeled by org_slug @@ -1176,24 +1178,16 @@ export async function updateDbMetrics(): Promise { getBillingStatsFromDb(), ]); - // Update workflow execution counts by status (gauges - point-in-time snapshots) - workflowExecutionsTotal.set( - { status: "success" }, - workflowStats.totalSuccess - ); - workflowExecutionsTotal.set({ status: "error" }, workflowStats.totalError); - workflowExecutionsTotal.set( - { status: "running" }, - workflowStats.totalRunning - ); - workflowExecutionsTotal.set( - { status: "pending" }, - workflowStats.totalPending - ); - workflowExecutionsTotal.set( - { status: "cancelled" }, - workflowStats.totalCancelled - ); + // Update workflow execution counts per (status, org_slug). Reset before + // populating so series for orgs that no longer have executions in a given + // status clear out instead of going stale. + workflowExecutionsTotal.reset(); + for (const row of workflowStats.executionsByStatusAndOrgSlug) { + workflowExecutionsTotal.set( + { status: row.status, org_slug: row.orgSlug }, + row.count + ); + } // Update workflow errors total per org_slug (convenience gauge for // alerting). Reset before populating so series for orgs that no longer From 70fbdb2d0edd5c7f45ecab6459e5157d75a2713e Mon Sep 17 00:00:00 2001 From: Chong Yang Date: Wed, 6 May 2026 16:04:24 +0930 Subject: [PATCH 3/4] docs(metrics): TECH-6381 note org_slug label on workflow executions/errors Document the convention so dashboard authors know they can scope these gauges by managed-client org_slug, and that '_anonymous' is reserved for personal workflows. --- lib/metrics/METRICS_REFERENCE.md | 2 ++ 1 file changed, 2 insertions(+) diff --git a/lib/metrics/METRICS_REFERENCE.md b/lib/metrics/METRICS_REFERENCE.md index 5a6f11670..12b85fe4c 100644 --- a/lib/metrics/METRICS_REFERENCE.md +++ b/lib/metrics/METRICS_REFERENCE.md @@ -301,6 +301,8 @@ max by (role) (keeperhub_org_members_by_role{...}) sum by (status) (keeperhub_workflow_executions_total{...}) ``` +`keeperhub_workflow_executions_total` and `keeperhub_workflow_execution_errors_total` are also labeled by `org_slug` so dashboards/alerts can scope to managed clients. Personal/anonymous workflows are emitted under `org_slug="_anonymous"` so the sum across `org_slug` for a given status equals the unfiltered per-status total. + **Metrics requiring `max()` aggregation:** | Category | Metrics | From fd045a77f9b460c331b702c341787de19a11f6c9 Mon Sep 17 00:00:00 2001 From: Chong Yang Date: Wed, 6 May 2026 16:11:11 +0930 Subject: [PATCH 4/4] docs(metrics): TECH-6381 update aggregation guidance for org_slug-labeled gauges The existing 'use max by (label)' guidance was correct when status was the only label and pods were the only repetition source. With org_slug now a real partition dimension on workflow_executions_total and workflow_execution_errors_total, max by (status) returns the busiest single org instead of the total -- a silent regression for any panel using that pattern. Document the corrected pattern (sum-of-max) so dashboard authors get the total across orgs while still deduping pods, and update the delta() examples to match. --- lib/metrics/METRICS_REFERENCE.md | 59 +++++++++++++++++++++----------- 1 file changed, 39 insertions(+), 20 deletions(-) diff --git a/lib/metrics/METRICS_REFERENCE.md b/lib/metrics/METRICS_REFERENCE.md index 12b85fe4c..559be1da0 100644 --- a/lib/metrics/METRICS_REFERENCE.md +++ b/lib/metrics/METRICS_REFERENCE.md @@ -290,18 +290,29 @@ max(keeperhub_user_total{cluster="prod", namespace="keeperhub"}) sum(keeperhub_user_total{cluster="prod", namespace="keeperhub"}) ``` -**For labeled gauges, use `max by (label)`:** +**For labeled gauges where the label is a *replication* dimension (same value across pods), use `max by (label)`:** ```promql # CORRECT -max by (status) (keeperhub_workflow_executions_total{...}) max by (role) (keeperhub_org_members_by_role{...}) +``` + +**For labeled gauges where the label is a *partition* dimension (different values per series that should be summed), combine `max` (to dedupe across pods) with `sum` (to aggregate across partitions):** -# WRONG +```promql +# CORRECT - sum across org_slug, dedupe across pods +sum by (status) ( + max by (status, org_slug) (keeperhub_workflow_executions_total{...}) +) + +# WRONG - returns max-across-orgs, NOT total +max by (status) (keeperhub_workflow_executions_total{...}) + +# WRONG - double-counts across pods sum by (status) (keeperhub_workflow_executions_total{...}) ``` -`keeperhub_workflow_executions_total` and `keeperhub_workflow_execution_errors_total` are also labeled by `org_slug` so dashboards/alerts can scope to managed clients. Personal/anonymous workflows are emitted under `org_slug="_anonymous"` so the sum across `org_slug` for a given status equals the unfiltered per-status total. +`keeperhub_workflow_executions_total` and `keeperhub_workflow_execution_errors_total` are labeled by `org_slug` so dashboards/alerts can scope to managed clients. Personal/anonymous workflows are emitted under `org_slug="_anonymous"` so the sum across `org_slug` for a given status equals the unfiltered per-status total. To filter to managed clients, add `org_slug=~"techops-services|ajna"` (or the inverse `!~` for user workflows). **Metrics requiring `max()` aggregation:** @@ -333,22 +344,30 @@ For rate and change-over-time queries on DB-sourced gauges, use PromQL's `delta( **PromQL examples:** ```promql -# Point-in-time snapshots (use max() for multi-pod) -max(keeperhub_workflow_executions_total{status="error"}) -max(keeperhub_workflow_execution_errors_total) - -# Errors added in the last hour -max(delta(keeperhub_workflow_execution_errors_total[1h])) - -# Errors per minute (rate) -max(delta(keeperhub_workflow_execution_errors_total[1h])) / 60 - -# Executions in last 30 minutes by status -max by (status) (delta(keeperhub_workflow_executions_total[30m])) - -# Error rate percentage over last hour -100 * max(delta(keeperhub_workflow_execution_errors_total[1h])) - / clamp_min(max(delta(keeperhub_workflow_executions_total[1h])), 1) +# Total errors across all orgs (sum across org_slug, dedupe across pods) +sum(max by (org_slug) (keeperhub_workflow_execution_errors_total)) + +# Total successful executions across all orgs +sum(max by (status, org_slug) (keeperhub_workflow_executions_total{status="success"})) + +# Errors added in the last hour, summed across orgs +sum(max by (org_slug) (delta(keeperhub_workflow_execution_errors_total[1h]))) + +# Executions in last 30 minutes by status, summed across orgs +sum by (status) ( + max by (status, org_slug) (delta(keeperhub_workflow_executions_total[30m])) +) + +# Error rate over last hour, scoped to managed orgs +100 * sum(max by (org_slug) ( + delta(keeperhub_workflow_execution_errors_total{org_slug=~"techops-services|ajna"}[1h]) + )) + / clamp_min( + sum(max by (status, org_slug) ( + delta(keeperhub_workflow_executions_total{org_slug=~"techops-services|ajna"}[1h]) + )), + 1 + ) ``` **delta() vs offset:**