From 3431dab3cb4dae6620a8f788e0fc67b98573ec21 Mon Sep 17 00:00:00 2001
From: Huy Do <huydo@meta.com>
Date: Thu, 18 Jun 2026 18:36:32 -0700
Subject: [PATCH] Fix recurring dcgm-exporter OOMKill on dense-GPU nodes

dcgm-exporter pods OOMKill-loop on multi-GPU nodes, blinding GPU
monitoring/alerting there. A single pod reads every GPU on its node and
the native DCGM (cgo) memory scales with GPU count, so 512Mi is too low
on 8-GPU nodes (g5/g6 .48xlarge, p4d.24xlarge). 13 pods on
arc-cbr-production have OOMKill history, all on high-GPU-count nodes;
the smoke test caught one at 5 restarts. The prior fix (#631) did not
cover the densest nodes.

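Raise the memory limit 512Mi -> 1Gi and GOMEMLIMIT 450MiB -> 768MiB.
The two must move together: GOMEMLIMIT only caps memory managed by the
Go runtime, so it has to stay below the container limit to leave
headroom for DCGM's native (cgo) allocations. 1Gi is ~0.1% of these
hosts' RAM, so node packing and cost are unaffected.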
---
 .../monitoring/kubernetes/dcgm-exporter/daemonset.yaml     | 7 +++++--
 1 file changed, 5 insertions(+), 2 deletions(-)

diff --git a/osdc/modules/monitoring/kubernetes/dcgm-exporter/daemonset.yaml b/osdc/modules/monitoring/kubernetes/dcgm-exporter/daemonset.yaml
index a6e09975..81d7bad0 100644
--- a/osdc/modules/monitoring/kubernetes/dcgm-exporter/daemonset.yaml
+++ b/osdc/modules/monitoring/kubernetes/dcgm-exporter/daemonset.yaml
@@ -55,15 +55,18 @@ spec:
               value: "true"
             - name: DCGM_EXPORTER_LISTEN
               value: ":9400"
+            # Soft limit on Go-runtime memory, kept below the container limit;
+            # native DCGM (cgo) memory is excluded and scales with GPU count.
             - name: GOMEMLIMIT
-              value: "450MiB"
+              value: "768MiB"
           resources:
             requests:
               cpu: 100m
               memory: 256Mi
             limits:
               cpu: 200m
-              memory: 512Mi
+              # 1Gi: 512Mi was OOMKilled on 8-GPU nodes (one pod reads every GPU).
+              memory: 1Gi
           securityContext:
             runAsNonRoot: false
             runAsUser: 0