Merged
59 changes: 40 additions & 19 deletions lib/metrics/METRICS_REFERENCE.md
@@ -290,17 +290,30 @@ max(keeperhub_user_total{cluster="prod", namespace="keeperhub"})
sum(keeperhub_user_total{cluster="prod", namespace="keeperhub"})
```

**For labeled gauges, use `max by (label)`:**
**For labeled gauges where the label is a *replication* dimension (same value across pods), use `max by (label)`:**

```promql
# CORRECT
max by (status) (keeperhub_workflow_executions_total{...})
max by (role) (keeperhub_org_members_by_role{...})
```

**For labeled gauges where the label is a *partition* dimension (different values per series that should be summed), combine `max` (to dedupe across pods) with `sum` (to aggregate across partitions):**

```promql
# CORRECT - sum across org_slug, dedupe across pods
sum by (status) (
  max by (status, org_slug) (keeperhub_workflow_executions_total{...})
)

# WRONG - returns max-across-orgs, NOT total
max by (status) (keeperhub_workflow_executions_total{...})

# WRONG
# WRONG - double-counts across pods
sum by (status) (keeperhub_workflow_executions_total{...})
```

`keeperhub_workflow_executions_total` and `keeperhub_workflow_execution_errors_total` are labeled by `org_slug` so dashboards/alerts can scope to managed clients. Personal/anonymous workflows are emitted under `org_slug="_anonymous"` so the sum across `org_slug` for a given status equals the unfiltered per-status total. To filter to managed clients, add `org_slug=~"techops-services|ajna"` (or the inverse `!~` for user workflows).
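The labeling invariant above can be sketched in TypeScript (a minimal illustration; the `ExecutionRow` shape and the sample values are hypothetical, not the exporter's actual types):

```typescript
// Hypothetical row shape mirroring the per-(status, org_slug) breakdown.
type ExecutionRow = { status: string; orgSlug: string; count: number };

// Sum counts per status across every org_slug, including "_anonymous".
function totalsByStatus(rows: ExecutionRow[]): Record<string, number> {
  const totals: Record<string, number> = {};
  for (const row of rows) {
    totals[row.status] = (totals[row.status] ?? 0) + row.count;
  }
  return totals;
}

// Because personal workflows are bucketed under "_anonymous", no execution
// falls outside the label set, so the per-status sum equals the global total.
const rows: ExecutionRow[] = [
  { status: "error", orgSlug: "techops-services", count: 3 },
  { status: "error", orgSlug: "_anonymous", count: 2 },
  { status: "success", orgSlug: "ajna", count: 10 },
];
// totalsByStatus(rows) => { error: 5, success: 10 }
```

This is the same reason `sum by (status) (max by (status, org_slug) (...))` reproduces the unfiltered per-status totals.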

**Metrics requiring `max()` aggregation:**

| Category | Metrics |
@@ -331,22 +344,30 @@ For rate and change-over-time queries on DB-sourced gauges, use PromQL's `delta()`
**PromQL examples:**

```promql
# Point-in-time snapshots (use max() for multi-pod)
max(keeperhub_workflow_executions_total{status="error"})
max(keeperhub_workflow_execution_errors_total)

# Errors added in the last hour
max(delta(keeperhub_workflow_execution_errors_total[1h]))

# Errors per minute (rate)
max(delta(keeperhub_workflow_execution_errors_total[1h])) / 60

# Executions in last 30 minutes by status
max by (status) (delta(keeperhub_workflow_executions_total[30m]))

# Error rate percentage over last hour
100 * max(delta(keeperhub_workflow_execution_errors_total[1h]))
/ clamp_min(max(delta(keeperhub_workflow_executions_total[1h])), 1)
# Total errors across all orgs (sum across org_slug, dedupe across pods)
sum(max by (org_slug) (keeperhub_workflow_execution_errors_total))

# Total successful executions across all orgs
sum(max by (status, org_slug) (keeperhub_workflow_executions_total{status="success"}))

# Errors added in the last hour, summed across orgs
sum(max by (org_slug) (delta(keeperhub_workflow_execution_errors_total[1h])))

# Executions in last 30 minutes by status, summed across orgs
sum by (status) (
  max by (status, org_slug) (delta(keeperhub_workflow_executions_total[30m]))
)

# Error rate over last hour, scoped to managed orgs
100 * sum(max by (org_slug) (
  delta(keeperhub_workflow_execution_errors_total{org_slug=~"techops-services|ajna"}[1h])
))
/ clamp_min(
  sum(max by (status, org_slug) (
    delta(keeperhub_workflow_executions_total{org_slug=~"techops-services|ajna"}[1h])
  )),
  1
)
```

**delta() vs offset:**
36 changes: 15 additions & 21 deletions lib/metrics/collectors/prometheus.ts
@@ -94,12 +94,14 @@ function getOrCreateGauge(
// All metrics are GAUGES (point-in-time snapshots). Use max() aggregation across pods.
// For rate/delta queries, use PromQL delta() function: max(delta(metric[1h]))

// Workflow execution counts by status
// Workflow execution counts by status and org_slug. Personal/anonymous
// workflows are emitted under org_slug="_anonymous" so the sum across
// org_slug for a given status equals the global per-status total.
const workflowExecutionsTotal = getOrCreateGauge(
  dbRegistry,
  "keeperhub_workflow_executions_total",
  "Total workflow executions by status (all-time)",
  ["status"]
  "Total workflow executions by status, broken down by org_slug (all-time)",
  ["status", "org_slug"]
);

// Workflow errors total (convenience gauge for alerting). Labeled by org_slug
@@ -1176,24 +1178,16 @@ export async function updateDbMetrics(): Promise<void> {
getBillingStatsFromDb(),
]);

// Update workflow execution counts by status (gauges - point-in-time snapshots)
workflowExecutionsTotal.set(
  { status: "success" },
  workflowStats.totalSuccess
);
workflowExecutionsTotal.set({ status: "error" }, workflowStats.totalError);
workflowExecutionsTotal.set(
  { status: "running" },
  workflowStats.totalRunning
);
workflowExecutionsTotal.set(
  { status: "pending" },
  workflowStats.totalPending
);
workflowExecutionsTotal.set(
  { status: "cancelled" },
  workflowStats.totalCancelled
);
// Update workflow execution counts per (status, org_slug). Reset before
// populating so series for orgs that no longer have executions in a given
// status clear out instead of going stale.
workflowExecutionsTotal.reset();
for (const row of workflowStats.executionsByStatusAndOrgSlug) {
  workflowExecutionsTotal.set(
    { status: row.status, org_slug: row.orgSlug },
    row.count
  );
}
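The reset-before-populate pattern can be illustrated with a toy gauge (a simplified stand-in for prom-client's `Gauge`, modelling only labeled `set()`/`reset()`; the org names are made up):

```typescript
// Toy labeled gauge; a stand-in for prom-client's Gauge, not its real API.
class ToyGauge {
  private series = new Map<string, number>();

  set(labels: Record<string, string>, value: number): void {
    this.series.set(JSON.stringify(labels), value);
  }

  reset(): void {
    this.series.clear();
  }

  liveSeries(): number {
    return this.series.size;
  }
}

const gauge = new ToyGauge();
gauge.set({ status: "error", org_slug: "acme" }, 5);

// Next refresh: "acme" has no error rows anymore. Without reset() its old
// series would linger at 5 and report stale data; with reset() it disappears.
gauge.reset();
gauge.set({ status: "error", org_slug: "_anonymous" }, 2);
// gauge.liveSeries() === 1
```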

// Update workflow errors total per org_slug (convenience gauge for
// alerting). Reset before populating so series for orgs that no longer
83 changes: 44 additions & 39 deletions lib/metrics/db-metrics.ts
@@ -49,7 +49,7 @@ const WORKFLOW_DURATION_BUCKETS = [
const STEP_DURATION_BUCKETS = [50, 100, 250, 500, 1000, 2000, 5000];

export type WorkflowStats = {
  // Total executions by status
  // Total executions by status (sum across all orgs)
  totalSuccess: number;
  totalError: number;
  totalRunning: number;
@@ -60,6 +60,15 @@ export type WorkflowStats = {
  // under ANONYMOUS_ORG_SLUG so the sum across this map matches totalError.
  errorByOrgSlug: Record<string, number>;

  // Per-(status, org_slug) execution counts. Personal/anonymous workflows
  // are bucketed under ANONYMOUS_ORG_SLUG so the sum of counts for a given
  // status across all orgs matches the corresponding total* above.
  executionsByStatusAndOrgSlug: Array<{
    status: string;
    orgSlug: string;
    count: number;
  }>;

  // Duration histogram data (count of executions in each bucket)
  durationBuckets: number[];
  durationSum: number;
@@ -74,73 +83,68 @@ export type WorkflowStats = {
*/
export async function getWorkflowStatsFromDb(): Promise<WorkflowStats> {
try {
// Query execution counts by status
const statusCounts = await db
  .select({
    status: workflowExecutions.status,
    count: count(),
  })
  .from(workflowExecutions)
  .groupBy(workflowExecutions.status);

const stats: WorkflowStats = {
  totalSuccess: 0,
  totalError: 0,
  totalRunning: 0,
  totalPending: 0,
  totalCancelled: 0,
  errorByOrgSlug: {},
  executionsByStatusAndOrgSlug: [],
  durationBuckets: new Array(WORKFLOW_DURATION_BUCKETS.length + 1).fill(0),
  durationSum: 0,
  durationCount: 0,
};

for (const row of statusCounts) {
// Per-(status, org_slug) execution breakdown: JOIN workflows + organization,
// LEFT JOIN so anonymous workflows still contribute (under ANONYMOUS_ORG_SLUG).
// GROUP BY uses the organization.slug column reference (not the COALESCE
// expression): Drizzle would otherwise bind ANONYMOUS_ORG_SLUG as separate
// parameters in SELECT and GROUP BY clauses, and Postgres rejects the query
// because the two COALESCE expressions are not textually identical. Postgres
// groups all NULL slugs into one group (NULLs are equal in GROUP BY), and
// the SELECT-side COALESCE renders that group as ANONYMOUS_ORG_SLUG.
const breakdown = await db
  .select({
    status: workflowExecutions.status,
    orgSlug: sql<string>`COALESCE(${organization.slug}, ${ANONYMOUS_ORG_SLUG})`,
    count: count(),
  })
  .from(workflowExecutions)
  .innerJoin(workflows, eq(workflowExecutions.workflowId, workflows.id))
  .leftJoin(organization, eq(workflows.organizationId, organization.id))
  .groupBy(workflowExecutions.status, organization.slug);

for (const row of breakdown) {
  const c = Number(row.count) || 0;
  stats.executionsByStatusAndOrgSlug.push({
    status: row.status,
    orgSlug: row.orgSlug,
    count: c,
  });
  switch (row.status) {
    case "success":
      stats.totalSuccess = row.count;
      stats.totalSuccess += c;
      break;
    case "error":
      stats.totalError = row.count;
      stats.totalError += c;
      stats.errorByOrgSlug[row.orgSlug] = c;
      break;
    case "running":
      stats.totalRunning = row.count;
      stats.totalRunning += c;
      break;
    case "pending":
      stats.totalPending = row.count;
      stats.totalPending += c;
      break;
    case "cancelled":
      stats.totalCancelled = row.count;
      stats.totalCancelled += c;
      break;
    default:
      // Ignore unknown status values
      break;
  }
}
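The NULL-grouping behaviour the comment above relies on can be sketched with an in-memory analogue (the function and separator are illustrative; the `"_anonymous"` constant stands in for `ANONYMOUS_ORG_SLUG`):

```typescript
// In-memory analogue of the COALESCE + GROUP BY behaviour: Postgres groups
// all NULL slugs into one group, and the SELECT-side COALESCE renders that
// group under the anonymous bucket.
const ANON = "_anonymous"; // stand-in for ANONYMOUS_ORG_SLUG

type Execution = { status: string; orgSlug: string | null };

function groupByStatusAndSlug(
  rows: Execution[]
): Array<{ status: string; orgSlug: string; count: number }> {
  const groups = new Map<string, number>();
  for (const row of rows) {
    // NULL slug -> anonymous bucket, mirroring COALESCE(slug, ANON).
    const key = `${row.status}\u0000${row.orgSlug ?? ANON}`;
    groups.set(key, (groups.get(key) ?? 0) + 1);
  }
  return [...groups.entries()].map(([key, count]) => {
    const [status, orgSlug] = key.split("\u0000");
    return { status, orgSlug, count };
  });
}
```

All NULL-slug executions of a given status land in a single `"_anonymous"` group, which is exactly how the SQL keeps the per-status sums equal to the global totals.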

// Per-org error breakdown: JOIN workflows + organization, LEFT JOIN so
// anonymous workflows still contribute (under ANONYMOUS_ORG_SLUG).
// GROUP BY uses the column reference (not the COALESCE expression):
// Drizzle would otherwise bind ANONYMOUS_ORG_SLUG as separate parameters
// in SELECT and GROUP BY clauses, and Postgres rejects the query because
// the two COALESCE expressions are not textually identical. Postgres
// groups all NULL slugs into one group (NULLs are equal in GROUP BY),
// and the SELECT-side COALESCE renders that group as ANONYMOUS_ORG_SLUG.
const errorByOrg = await db
  .select({
    orgSlug: sql<string>`COALESCE(${organization.slug}, ${ANONYMOUS_ORG_SLUG})`,
    count: count(),
  })
  .from(workflowExecutions)
  .innerJoin(workflows, eq(workflowExecutions.workflowId, workflows.id))
  .leftJoin(organization, eq(workflows.organizationId, organization.id))
  .where(eq(workflowExecutions.status, "error"))
  .groupBy(organization.slug);

for (const row of errorByOrg) {
  stats.errorByOrgSlug[row.orgSlug] = Number(row.count) || 0;
}

// Query duration histogram data for completed executions
// Build bucket counts using SQL CASE statements for efficiency
const durationQuery = await db
@@ -193,6 +197,7 @@ export async function getWorkflowStatsFromDb(): Promise<WorkflowStats> {
  totalPending: 0,
  totalCancelled: 0,
  errorByOrgSlug: {},
  executionsByStatusAndOrgSlug: [],
  durationBuckets: new Array(WORKFLOW_DURATION_BUCKETS.length + 1).fill(0),
  durationSum: 0,
  durationCount: 0,