From 5705a75b913dfab8557c0cd20b96fe8d589643b3 Mon Sep 17 00:00:00 2001 From: Huy Do Date: Wed, 10 Jun 2026 16:15:01 -0700 Subject: [PATCH 1/3] Update [ghstack-poisoned] --- .../alerts/buildkit-autoscaling-alerts.yaml | 49 +++++++++++++++++++ .../kubernetes/alerts/kustomization.yaml | 1 + 2 files changed, 50 insertions(+) create mode 100644 osdc/modules/monitoring/kubernetes/alerts/buildkit-autoscaling-alerts.yaml diff --git a/osdc/modules/monitoring/kubernetes/alerts/buildkit-autoscaling-alerts.yaml b/osdc/modules/monitoring/kubernetes/alerts/buildkit-autoscaling-alerts.yaml new file mode 100644 index 00000000..56f1a7fa --- /dev/null +++ b/osdc/modules/monitoring/kubernetes/alerts/buildkit-autoscaling-alerts.yaml @@ -0,0 +1,49 @@ +apiVersion: monitoring.coreos.com/v1 +kind: PrometheusRule +metadata: + name: buildkit-autoscaling-alerts + namespace: monitoring + labels: + app.kubernetes.io/part-of: osdc-monitoring +spec: + groups: + - name: buildkit-autoscaling + rules: + # KEDA can't read the scale metric — if it persists past the ScaledObject's + # failureThreshold, KEDA drops to the fixed fallback pool instead of scaling. + - alert: BuildkitKedaScalerErrors + expr: | + sum by (scaledObject) (increase(keda_scaler_errors_total[15m])) > 0 + for: 10m + labels: + severity: warning + team: pytorch-dev-infra + priority: P3 + annotations: + summary: "KEDA can't read the scale metric for {{ $labels.scaledObject }}" + description: "KEDA scaler errors for {{ $labels.scaledObject }} over the last 15m; sustained errors trip the fallback to the fixed BuildKit pool." + + - alert: BuildkitKedaScaledObjectErrors + expr: | + sum by (scaledObject) (increase(keda_scaledobject_errors_total[15m])) > 0 + for: 10m + labels: + severity: warning + team: pytorch-dev-infra + priority: P3 + annotations: + summary: "KEDA ScaledObject {{ $labels.scaledObject }} reconcile errors" + description: "KEDA failed to reconcile ScaledObject {{ $labels.scaledObject }} in the last 15m; autoscaling for that arch may be stale." + + # Builds stuck waiting for a pod — the pool isn't scaling up fast enough. + - alert: BuildkitQueueBacklog + expr: | + haproxy_backend_current_queue{proxy=~"bk_amd64|bk_arm64"} > 0 + for: 15m + labels: + severity: warning + team: pytorch-dev-infra + priority: P3 + annotations: + summary: "BuildKit {{ $labels.proxy }} has builds queued for 15m" + description: "Builds have been waiting in the {{ $labels.proxy }} queue for 15m — the pool isn't scaling up fast enough (or is at max) to meet demand." diff --git a/osdc/modules/monitoring/kubernetes/alerts/kustomization.yaml b/osdc/modules/monitoring/kubernetes/alerts/kustomization.yaml index fbb95f53..40065e5a 100644 --- a/osdc/modules/monitoring/kubernetes/alerts/kustomization.yaml +++ b/osdc/modules/monitoring/kubernetes/alerts/kustomization.yaml @@ -3,6 +3,7 @@ kind: Kustomization resources: - arc-alerts.yaml + - buildkit-autoscaling-alerts.yaml - infrastructure-alerts.yaml - gpu-alerts.yaml - node-compactor-alerts.yaml From bb4b8967ee6603e4ff9ecf33f51cb8e1ec63f50e Mon Sep 17 00:00:00 2001 From: Huy Do Date: Wed, 10 Jun 2026 17:57:36 -0700 Subject: [PATCH 2/3] Update [ghstack-poisoned] --- .../kubernetes/alerts/buildkit-autoscaling-alerts.yaml | 10 ++++++---- 1 file changed, 6 insertions(+), 4 deletions(-) diff --git a/osdc/modules/monitoring/kubernetes/alerts/buildkit-autoscaling-alerts.yaml b/osdc/modules/monitoring/kubernetes/alerts/buildkit-autoscaling-alerts.yaml index 56f1a7fa..e00340c6 100644 --- a/osdc/modules/monitoring/kubernetes/alerts/buildkit-autoscaling-alerts.yaml +++ b/osdc/modules/monitoring/kubernetes/alerts/buildkit-autoscaling-alerts.yaml @@ -35,15 +35,17 @@ spec: summary: "KEDA ScaledObject {{ $labels.scaledObject }} reconcile errors" description: "KEDA failed to reconcile ScaledObject {{ $labels.scaledObject }} in the last 15m; autoscaling for that arch may be stale." - # Builds stuck waiting for a pod — the pool isn't scaling up fast enough. + # A real backlog the pool can't keep up with. The >20 threshold (not >0) + # avoids firing on normal burst churn, where small batches keep the queue + # briefly non-zero but still drain within minutes as pods scale up. - alert: BuildkitQueueBacklog expr: | - haproxy_backend_current_queue{proxy=~"bk_amd64|bk_arm64"} > 0 + haproxy_backend_current_queue{proxy=~"bk_amd64|bk_arm64"} > 20 for: 15m labels: severity: warning team: pytorch-dev-infra priority: P3 annotations: - summary: "BuildKit {{ $labels.proxy }} has builds queued for 15m" - description: "Builds have been waiting in the {{ $labels.proxy }} queue for 15m — the pool isn't scaling up fast enough (or is at max) to meet demand." + summary: "BuildKit {{ $labels.proxy }} backlog: >20 builds queued for 15m" + description: "More than 20 builds have been waiting in the {{ $labels.proxy }} queue for 15m — beyond normal burst churn; the pool isn't scaling up fast enough (or is at max)." From e42a93cac5a962acb181679ef4c4f60209490ed0 Mon Sep 17 00:00:00 2001 From: Huy Do Date: Wed, 10 Jun 2026 18:45:07 -0700 Subject: [PATCH 3/3] Update [ghstack-poisoned] --- .../kubernetes/alerts/buildkit-autoscaling-alerts.yaml | 4 ++-- .../kubernetes/monitors/servicemonitors/buildkit-haproxy.yaml | 2 +- 2 files changed, 3 insertions(+), 3 deletions(-) diff --git a/osdc/modules/monitoring/kubernetes/alerts/buildkit-autoscaling-alerts.yaml b/osdc/modules/monitoring/kubernetes/alerts/buildkit-autoscaling-alerts.yaml index e00340c6..53ac2a2b 100644 --- a/osdc/modules/monitoring/kubernetes/alerts/buildkit-autoscaling-alerts.yaml +++ b/osdc/modules/monitoring/kubernetes/alerts/buildkit-autoscaling-alerts.yaml @@ -13,7 +13,7 @@ spec: # failureThreshold, KEDA drops to the fixed fallback pool instead of scaling. - alert: BuildkitKedaScalerErrors expr: | - sum by (scaledObject) (increase(keda_scaler_errors_total[15m])) > 0 + sum by (scaledObject) (increase(keda_scaler_detail_errors_total[15m])) > 0 for: 10m labels: severity: warning @@ -25,7 +25,7 @@ spec: - alert: BuildkitKedaScaledObjectErrors expr: | - sum by (scaledObject) (increase(keda_scaledobject_errors_total[15m])) > 0 + sum by (scaledObject) (increase(keda_scaled_object_errors_total[15m])) > 0 for: 10m labels: severity: warning diff --git a/osdc/modules/monitoring/kubernetes/monitors/servicemonitors/buildkit-haproxy.yaml b/osdc/modules/monitoring/kubernetes/monitors/servicemonitors/buildkit-haproxy.yaml index d1160507..65674d71 100644 --- a/osdc/modules/monitoring/kubernetes/monitors/servicemonitors/buildkit-haproxy.yaml +++ b/osdc/modules/monitoring/kubernetes/monitors/servicemonitors/buildkit-haproxy.yaml @@ -20,4 +20,4 @@ spec: # Keep only operationally important HAProxy metrics - action: keep sourceLabels: [__name__] - regex: "haproxy_server_status|haproxy_server_current_sessions|haproxy_server_connection_errors_total|haproxy_backend_current_sessions" + regex: "haproxy_server_status|haproxy_server_current_sessions|haproxy_server_connection_errors_total|haproxy_backend_current_sessions|haproxy_backend_current_queue"