From 989d4aa7525e9b985352b6cb8530009ed12b7936 Mon Sep 17 00:00:00 2001
From: Chong Yang <chong@techops.services>
Date: Wed, 6 May 2026 16:04:13 +0930
Subject: [PATCH 1/4] feat(metrics): TECH-6381 break workflow executions down
 by org_slug

Replace the two queries (status counts + per-org error breakdown) with
a single combined query that groups by status AND organization.slug.
Adds executionsByStatusAndOrgSlug to WorkflowStats so the prometheus
collector can label keeperhub_workflow_executions_total by org_slug,
following the same convention as the errors gauge (anonymous workflows
bucket under '_anonymous' so per-org sums match the global totals).

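totalSuccess/totalError/etc and errorByOrgSlug are now derived from the
combined query, keeping the existing errors gauge wiring unchanged. Note
that the combined query INNER JOINs workflows, which the removed
status-count query did not, so the total* values now only count
executions that have a matching workflows row.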
---
 lib/metrics/db-metrics.ts | 83 +++++++++++++++++++++------------------
 1 file changed, 44 insertions(+), 39 deletions(-)
diff --git a/lib/metrics/db-metrics.ts b/lib/metrics/db-metrics.ts
index 69b8f2178..17f2087b0 100644
--- a/lib/metrics/db-metrics.ts
+++ b/lib/metrics/db-metrics.ts
@@ -49,7 +49,7 @@ const WORKFLOW_DURATION_BUCKETS = [
 const STEP_DURATION_BUCKETS = [50, 100, 250, 500, 1000, 2000, 5000];
 
 export type WorkflowStats = {
-  // Total executions by status
+  // Total executions by status (sum across all orgs)
   totalSuccess: number;
   totalError: number;
   totalRunning: number;
@@ -60,6 +60,15 @@ export type WorkflowStats = {
   // under ANONYMOUS_ORG_SLUG so the sum across this map matches totalError.
   errorByOrgSlug: Record<string, number>;
 
+  // Per-(status, org_slug) execution counts. Personal/anonymous workflows
+  // are bucketed under ANONYMOUS_ORG_SLUG so the sum of counts for a given
+  // status across all orgs matches the corresponding total* above.
+  executionsByStatusAndOrgSlug: Array<{
+    status: string;
+    orgSlug: string;
+    count: number;
+  }>;
+
   // Duration histogram data (count of executions in each bucket)
   durationBuckets: number[];
   durationSum: number;
@@ -74,15 +83,6 @@ export type WorkflowStats = {
  */
 export async function getWorkflowStatsFromDb(): Promise<WorkflowStats> {
   try {
-    // Query execution counts by status
-    const statusCounts = await db
-      .select({
-        status: workflowExecutions.status,
-        count: count(),
-      })
-      .from(workflowExecutions)
-      .groupBy(workflowExecutions.status);
-
     const stats: WorkflowStats = {
       totalSuccess: 0,
       totalError: 0,
@@ -90,27 +90,54 @@ export async function getWorkflowStatsFromDb(): Promise<WorkflowStats> {
       totalPending: 0,
       totalCancelled: 0,
       errorByOrgSlug: {},
+      executionsByStatusAndOrgSlug: [],
       durationBuckets: new Array(WORKFLOW_DURATION_BUCKETS.length + 1).fill(0),
       durationSum: 0,
       durationCount: 0,
     };
 
-    for (const row of statusCounts) {
+    // Per-(status, org_slug) execution breakdown: JOIN workflows + organization,
+    // LEFT JOIN so anonymous workflows still contribute (under ANONYMOUS_ORG_SLUG).
+    // GROUP BY uses the organization.slug column reference (not the COALESCE
+    // expression): Drizzle would otherwise bind ANONYMOUS_ORG_SLUG as separate
+    // parameters in SELECT and GROUP BY clauses, and Postgres rejects the query
+    // because the two COALESCE expressions are not textually identical. Postgres
+    // groups all NULL slugs into one group (NULLs are equal in GROUP BY), and
+    // the SELECT-side COALESCE renders that group as ANONYMOUS_ORG_SLUG.
+    const breakdown = await db
+      .select({
+        status: workflowExecutions.status,
+        orgSlug: sql<string>`COALESCE(${organization.slug}, ${ANONYMOUS_ORG_SLUG})`,
+        count: count(),
+      })
+      .from(workflowExecutions)
+      .innerJoin(workflows, eq(workflowExecutions.workflowId, workflows.id))
+      .leftJoin(organization, eq(workflows.organizationId, organization.id))
+      .groupBy(workflowExecutions.status, organization.slug);
+
+    for (const row of breakdown) {
+      const c = Number(row.count) || 0;
+      stats.executionsByStatusAndOrgSlug.push({
+        status: row.status,
+        orgSlug: row.orgSlug,
+        count: c,
+      });
       switch (row.status) {
         case "success":
-          stats.totalSuccess = row.count;
+          stats.totalSuccess += c;
           break;
         case "error":
-          stats.totalError = row.count;
+          stats.totalError += c;
+          stats.errorByOrgSlug[row.orgSlug] = c;
           break;
         case "running":
-          stats.totalRunning = row.count;
+          stats.totalRunning += c;
           break;
         case "pending":
-          stats.totalPending = row.count;
+          stats.totalPending += c;
           break;
         case "cancelled":
-          stats.totalCancelled = row.count;
+          stats.totalCancelled += c;
           break;
         default:
           // Ignore unknown status values
@@ -118,29 +145,6 @@ export async function getWorkflowStatsFromDb(): Promise<WorkflowStats> {
       }
     }
 
-    // Per-org error breakdown: JOIN workflows + organization, LEFT JOIN so
-    // anonymous workflows still contribute (under ANONYMOUS_ORG_SLUG).
-    // GROUP BY uses the column reference (not the COALESCE expression):
-    // Drizzle would otherwise bind ANONYMOUS_ORG_SLUG as separate parameters
-    // in SELECT and GROUP BY clauses, and Postgres rejects the query because
-    // the two COALESCE expressions are not textually identical. Postgres
-    // groups all NULL slugs into one group (NULLs are equal in GROUP BY),
-    // and the SELECT-side COALESCE renders that group as ANONYMOUS_ORG_SLUG.
-    const errorByOrg = await db
-      .select({
-        orgSlug: sql<string>`COALESCE(${organization.slug}, ${ANONYMOUS_ORG_SLUG})`,
-        count: count(),
-      })
-      .from(workflowExecutions)
-      .innerJoin(workflows, eq(workflowExecutions.workflowId, workflows.id))
-      .leftJoin(organization, eq(workflows.organizationId, organization.id))
-      .where(eq(workflowExecutions.status, "error"))
-      .groupBy(organization.slug);
-
-    for (const row of errorByOrg) {
-      stats.errorByOrgSlug[row.orgSlug] = Number(row.count) || 0;
-    }
-
     // Query duration histogram data for completed executions
     // Build bucket counts using SQL CASE statements for efficiency
     const durationQuery = await db
@@ -193,6 +197,7 @@ export async function getWorkflowStatsFromDb(): Promise<WorkflowStats> {
       totalPending: 0,
       totalCancelled: 0,
       errorByOrgSlug: {},
+      executionsByStatusAndOrgSlug: [],
       durationBuckets: new Array(WORKFLOW_DURATION_BUCKETS.length + 1).fill(0),
       durationSum: 0,
       durationCount: 0,

From a967febe9000b33e6bdc322ebd7917470a6748fb Mon Sep 17 00:00:00 2001
From: Chong Yang <chong@techops.services>
Date: Wed, 6 May 2026 16:04:19 +0930
Subject: [PATCH 2/4] feat(metrics): TECH-6381 expose executions gauge with
 org_slug label

Add org_slug to keeperhub_workflow_executions_total so dashboards and
alerts can scope the success rate to managed clients (the errors gauge
already had this label; the executions gauge didn't, which made it
impossible to compute a system-only success rate).

Reset before populating so series for orgs that drop to zero in a given
status clear out instead of going stale -- same pattern as the errors
gauge.
---
 lib/metrics/collectors/prometheus.ts | 36 ++++++++++++----------------
 1 file changed, 15 insertions(+), 21 deletions(-)

diff --git a/lib/metrics/collectors/prometheus.ts b/lib/metrics/collectors/prometheus.ts
index 927478611..6fcf13865 100644
--- a/lib/metrics/collectors/prometheus.ts
+++ b/lib/metrics/collectors/prometheus.ts
@@ -94,12 +94,14 @@ function getOrCreateGauge(
 // All metrics are GAUGES (point-in-time snapshots). Use max() aggregation across pods.
 // For rate/delta queries, use PromQL delta() function: max(delta(metric[1h]))
 
-// Workflow execution counts by status
+// Workflow execution counts by status and org_slug. Personal/anonymous
+// workflows are emitted under org_slug="_anonymous" so the sum across
+// org_slug for a given status equals the global per-status total.
 const workflowExecutionsTotal = getOrCreateGauge(
   dbRegistry,
   "keeperhub_workflow_executions_total",
-  "Total workflow executions by status (all-time)",
-  ["status"]
+  "Total workflow executions by status, broken down by org_slug (all-time)",
+  ["status", "org_slug"]
 );
 
 // Workflow errors total (convenience gauge for alerting). Labeled by org_slug
@@ -1176,24 +1178,16 @@ export async function updateDbMetrics(): Promise<void> {
       getBillingStatsFromDb(),
     ]);
 
-    // Update workflow execution counts by status (gauges - point-in-time snapshots)
-    workflowExecutionsTotal.set(
-      { status: "success" },
-      workflowStats.totalSuccess
-    );
-    workflowExecutionsTotal.set({ status: "error" }, workflowStats.totalError);
-    workflowExecutionsTotal.set(
-      { status: "running" },
-      workflowStats.totalRunning
-    );
-    workflowExecutionsTotal.set(
-      { status: "pending" },
-      workflowStats.totalPending
-    );
-    workflowExecutionsTotal.set(
-      { status: "cancelled" },
-      workflowStats.totalCancelled
-    );
+    // Update workflow execution counts per (status, org_slug). Reset before
+    // populating so series for orgs that no longer have executions in a given
+    // status clear out instead of going stale.
+    workflowExecutionsTotal.reset();
+    for (const row of workflowStats.executionsByStatusAndOrgSlug) {
+      workflowExecutionsTotal.set(
+        { status: row.status, org_slug: row.orgSlug },
+        row.count
+      );
+    }
 
     // Update workflow errors total per org_slug (convenience gauge for
     // alerting). Reset before populating so series for orgs that no longer

From 70fbdb2d0edd5c7f45ecab6459e5157d75a2713e Mon Sep 17 00:00:00 2001
From: Chong Yang <chong@techops.services>
Date: Wed, 6 May 2026 16:04:24 +0930
Subject: [PATCH 3/4] docs(metrics): TECH-6381 note org_slug label on workflow
 executions/errors

Document the convention so dashboard authors know they can scope these
gauges by managed-client org_slug, and that '_anonymous' is reserved for
personal workflows.
---
 lib/metrics/METRICS_REFERENCE.md | 2 ++
 1 file changed, 2 insertions(+)

diff --git a/lib/metrics/METRICS_REFERENCE.md b/lib/metrics/METRICS_REFERENCE.md
index 5a6f11670..12b85fe4c 100644
--- a/lib/metrics/METRICS_REFERENCE.md
+++ b/lib/metrics/METRICS_REFERENCE.md
@@ -301,6 +301,8 @@ max by (role) (keeperhub_org_members_by_role{...})
 sum by (status) (keeperhub_workflow_executions_total{...})
 ```
 
+`keeperhub_workflow_executions_total` and `keeperhub_workflow_execution_errors_total` are also labeled by `org_slug` so dashboards/alerts can scope to managed clients. Personal/anonymous workflows are emitted under `org_slug="_anonymous"` so the sum across `org_slug` for a given status equals the unfiltered per-status total.
+
 **Metrics requiring `max()` aggregation:**
 
 | Category | Metrics |

From fd045a77f9b460c331b702c341787de19a11f6c9 Mon Sep 17 00:00:00 2001
From: Chong Yang <chong@techops.services>
Date: Wed, 6 May 2026 16:11:11 +0930
Subject: [PATCH 4/4] docs(metrics): TECH-6381 update aggregation guidance for
 org_slug-labeled gauges

The existing 'use max by (label)' guidance was correct when status was
the only label and pods were the only repetition source. With org_slug
now a real partition dimension on workflow_executions_total and
workflow_execution_errors_total, max by (status) returns the busiest
single org instead of the total -- a silent regression for any panel
using that pattern.

Document the corrected pattern (sum-of-max) so dashboard authors get
the total across orgs while still deduping pods, and update the
delta() examples to match.
---
 lib/metrics/METRICS_REFERENCE.md | 59 +++++++++++++++++++++-----------
 1 file changed, 39 insertions(+), 20 deletions(-)

diff --git a/lib/metrics/METRICS_REFERENCE.md b/lib/metrics/METRICS_REFERENCE.md
index 12b85fe4c..559be1da0 100644
--- a/lib/metrics/METRICS_REFERENCE.md
+++ b/lib/metrics/METRICS_REFERENCE.md
@@ -290,18 +290,29 @@ max(keeperhub_user_total{cluster="prod", namespace="keeperhub"})
 sum(keeperhub_user_total{cluster="prod", namespace="keeperhub"})
 ```
 
-**For labeled gauges, use `max by (label)`:**
+**For labeled gauges where you want one value per label value (each pod reports the same series, so only pods need deduping), use `max by (label)`:**
 
 ```promql
 # CORRECT
-max by (status) (keeperhub_workflow_executions_total{...})
 max by (role) (keeperhub_org_members_by_role{...})
+```
+
+**For labeled gauges where the label is a *partition* dimension (different values per series that should be summed), combine `max` (to dedupe across pods) with `sum` (to aggregate across partitions):**
 
-# WRONG
+```promql
+# CORRECT - sum across org_slug, dedupe across pods
+sum by (status) (
+  max by (status, org_slug) (keeperhub_workflow_executions_total{...})
+)
+
+# WRONG - returns max-across-orgs, NOT total
+max by (status) (keeperhub_workflow_executions_total{...})
+
+# WRONG - double-counts across pods
 sum by (status) (keeperhub_workflow_executions_total{...})
 ```
 
-`keeperhub_workflow_executions_total` and `keeperhub_workflow_execution_errors_total` are also labeled by `org_slug` so dashboards/alerts can scope to managed clients. Personal/anonymous workflows are emitted under `org_slug="_anonymous"` so the sum across `org_slug` for a given status equals the unfiltered per-status total.
+`keeperhub_workflow_executions_total` and `keeperhub_workflow_execution_errors_total` are labeled by `org_slug` so dashboards/alerts can scope to managed clients. Personal/anonymous workflows are emitted under `org_slug="_anonymous"` so the sum across `org_slug` for a given status equals the unfiltered per-status total. To filter to managed clients, add `org_slug=~"techops-services|ajna"` (or the inverse `!~` for user workflows).
 
 **Metrics requiring `max()` aggregation:**
 
@@ -333,22 +344,30 @@ For rate and change-over-time queries on DB-sourced gauges, use PromQL's `delta(
 **PromQL examples:**
 
 ```promql
-# Point-in-time snapshots (use max() for multi-pod)
-max(keeperhub_workflow_executions_total{status="error"})
-max(keeperhub_workflow_execution_errors_total)
-
-# Errors added in the last hour
-max(delta(keeperhub_workflow_execution_errors_total[1h]))
-
-# Errors per minute (rate)
-max(delta(keeperhub_workflow_execution_errors_total[1h])) / 60
-
-# Executions in last 30 minutes by status
-max by (status) (delta(keeperhub_workflow_executions_total[30m]))
-
-# Error rate percentage over last hour
-100 * max(delta(keeperhub_workflow_execution_errors_total[1h]))
-    / clamp_min(max(delta(keeperhub_workflow_executions_total[1h])), 1)
+# Total errors across all orgs (sum across org_slug, dedupe across pods)
+sum(max by (org_slug) (keeperhub_workflow_execution_errors_total))
+
+# Total successful executions across all orgs
+sum(max by (status, org_slug) (keeperhub_workflow_executions_total{status="success"}))
+
+# Errors added in the last hour, summed across orgs
+sum(max by (org_slug) (delta(keeperhub_workflow_execution_errors_total[1h])))
+
+# Executions in last 30 minutes by status, summed across orgs
+sum by (status) (
+  max by (status, org_slug) (delta(keeperhub_workflow_executions_total[30m]))
+)
+
+# Error rate percentage over last hour, scoped to managed orgs (denominator sums every status)
+100 * sum(max by (org_slug) (
+        delta(keeperhub_workflow_execution_errors_total{org_slug=~"techops-services|ajna"}[1h])
+      ))
+    / clamp_min(
+        sum(max by (status, org_slug) (
+          delta(keeperhub_workflow_executions_total{org_slug=~"techops-services|ajna"}[1h])
+        )),
+        1
+      )
 ```
 
 **delta() vs offset:**