diff --git a/osdc/modules/monitoring/kubernetes/alerts/buildkit-autoscaling-alerts.yaml b/osdc/modules/monitoring/kubernetes/alerts/buildkit-autoscaling-alerts.yaml
new file mode 100644
index 00000000..53ac2a2b
--- /dev/null
+++ b/osdc/modules/monitoring/kubernetes/alerts/buildkit-autoscaling-alerts.yaml
@@ -0,0 +1,67 @@
+# Alerting rules for BuildKit autoscaling: two KEDA error-counter alerts and
+# one HAProxy queue-backlog alert. Every rule carries severity=warning,
+# priority=P3 and team=pytorch-dev-infra labels.
+apiVersion: monitoring.coreos.com/v1
+kind: PrometheusRule
+metadata:
+  name: buildkit-autoscaling-alerts
+  namespace: monitoring
+  labels:
+    app.kubernetes.io/part-of: osdc-monitoring
+spec:
+  groups:
+    - name: buildkit-autoscaling
+      rules:
+        # KEDA can't read the scale metric. If that persists past the
+        # ScaledObject's failureThreshold, KEDA drops to the fixed fallback
+        # pool instead of scaling.
+        # NOTE(review): the selector has no label matcher, so every
+        # ScaledObject exporting this counter is evaluated, not only the
+        # BuildKit ones — confirm that is intended.
+        # NOTE(review): a 15m window held for 10m lets one isolated error
+        # fire the alert — confirm that sensitivity is wanted.
+        - alert: BuildkitKedaScalerErrors
+          expr: |
+            sum by (scaledObject) (increase(keda_scaler_detail_errors_total[15m])) > 0
+          for: 10m
+          labels:
+            severity: warning
+            team: pytorch-dev-infra
+            priority: P3
+          annotations:
+            summary: "KEDA can't read the scale metric for {{ $labels.scaledObject }}"
+            description: "KEDA scaler errors for {{ $labels.scaledObject }} over the last 15m; sustained errors trip the fallback to the fixed BuildKit pool."
+
+        # ScaledObject-level errors: fires when the 15m increase of the
+        # per-ScaledObject error counter stays above zero for 10m (the
+        # annotations describe this as a reconcile failure). Like the
+        # scaler alert, the selector is not restricted to BuildKit objects.
+        - alert: BuildkitKedaScaledObjectErrors
+          expr: |
+            sum by (scaledObject) (increase(keda_scaled_object_errors_total[15m])) > 0
+          for: 10m
+          labels:
+            severity: warning
+            team: pytorch-dev-infra
+            priority: P3
+          annotations:
+            summary: "KEDA ScaledObject {{ $labels.scaledObject }} reconcile errors"
+            description: "KEDA failed to reconcile ScaledObject {{ $labels.scaledObject }} in the last 15m; autoscaling for that arch may be stale."
+
+        # A real backlog the pool can't keep up with. The >20 threshold (not >0)
+        # avoids firing on normal burst churn, where small batches keep the queue
+        # briefly non-zero but still drain within minutes as pods scale up.
+        # The buildkit-haproxy ServiceMonitor keeps only allow-listed metric
+        # names, so haproxy_backend_current_queue must stay in that list for
+        # this expression to return data.
+        - alert: BuildkitQueueBacklog
+          expr: |
+            haproxy_backend_current_queue{proxy=~"bk_amd64|bk_arm64"} > 20
+          for: 15m
+          labels:
+            severity: warning
+            team: pytorch-dev-infra
+            priority: P3
+          annotations:
+            summary: "BuildKit {{ $labels.proxy }} backlog: >20 builds queued for 15m"
+            description: "More than 20 builds have been waiting in the {{ $labels.proxy }} queue for 15m — beyond normal burst churn; the pool isn't scaling up fast enough (or is at max)."
diff --git a/osdc/modules/monitoring/kubernetes/alerts/kustomization.yaml b/osdc/modules/monitoring/kubernetes/alerts/kustomization.yaml
index fbb95f53..40065e5a 100644
--- a/osdc/modules/monitoring/kubernetes/alerts/kustomization.yaml
+++ b/osdc/modules/monitoring/kubernetes/alerts/kustomization.yaml
@@ -3,6 +3,7 @@ kind: Kustomization
 
 resources:
   - arc-alerts.yaml
+  - buildkit-autoscaling-alerts.yaml
   - infrastructure-alerts.yaml
   - gpu-alerts.yaml
   - node-compactor-alerts.yaml
diff --git a/osdc/modules/monitoring/kubernetes/monitors/servicemonitors/buildkit-haproxy.yaml b/osdc/modules/monitoring/kubernetes/monitors/servicemonitors/buildkit-haproxy.yaml
index d1160507..65674d71 100644
--- a/osdc/modules/monitoring/kubernetes/monitors/servicemonitors/buildkit-haproxy.yaml
+++ b/osdc/modules/monitoring/kubernetes/monitors/servicemonitors/buildkit-haproxy.yaml
@@ -20,4 +20,4 @@ spec:
         # Keep only operationally important HAProxy metrics
         - action: keep
           sourceLabels: [__name__]
-          regex: "haproxy_server_status|haproxy_server_current_sessions|haproxy_server_connection_errors_total|haproxy_backend_current_sessions"
+          regex: "haproxy_server_status|haproxy_server_current_sessions|haproxy_server_connection_errors_total|haproxy_backend_current_sessions|haproxy_backend_current_queue"