From 3431dab3cb4dae6620a8f788e0fc67b98573ec21 Mon Sep 17 00:00:00 2001 From: Huy Do Date: Thu, 18 Jun 2026 18:36:32 -0700 Subject: [PATCH] Fix recurring dcgm-exporter OOMKill on dense-GPU nodes dcgm-exporter pods OOMKill-loop on multi-GPU nodes, blinding GPU monitoring/alerting there. A single pod reads every GPU on its node and the native DCGM (cgo) memory scales with GPU count, so 512Mi is too low on 8-GPU nodes (g5.48xlarge, g6.48xlarge, p4d.24xlarge). 13 pods on arc-cbr-production have OOMKill history, all on high-GPU-count nodes; the smoke test caught one at 5 restarts. The prior fix (#631) did not cover the densest nodes. Raise the memory limit 512Mi -> 1Gi and GOMEMLIMIT 450MiB -> 768MiB (both must move together). 1Gi is ~0.1% of these hosts' RAM, so node packing and cost are unaffected. --- .../monitoring/kubernetes/dcgm-exporter/daemonset.yaml | 7 +++++-- 1 file changed, 5 insertions(+), 2 deletions(-) diff --git a/osdc/modules/monitoring/kubernetes/dcgm-exporter/daemonset.yaml b/osdc/modules/monitoring/kubernetes/dcgm-exporter/daemonset.yaml index a6e09975..81d7bad0 100644 --- a/osdc/modules/monitoring/kubernetes/dcgm-exporter/daemonset.yaml +++ b/osdc/modules/monitoring/kubernetes/dcgm-exporter/daemonset.yaml @@ -55,15 +55,18 @@ spec: value: "true" - name: DCGM_EXPORTER_LISTEN value: ":9400" + # Soft Go heap ceiling below the hard limit; native DCGM (cgo) + # allocations sit on top and scale with GPU count. - name: GOMEMLIMIT - value: "450MiB" + value: "768MiB" resources: requests: cpu: 100m memory: 256Mi limits: cpu: 200m - memory: 512Mi + # 1Gi: 512Mi OOMKills on 8-GPU nodes (one pod reads every GPU). + memory: 1Gi securityContext: runAsNonRoot: false runAsUser: 0