From b2365637bd695ec8b74b0a6e94debd313beb37d3 Mon Sep 17 00:00:00 2001 From: Christopher Desiniotis Date: Fri, 26 Jun 2026 09:28:16 -0700 Subject: [PATCH] Decouple OperatorMetrics object from ClusterPolicyController object With this commit, any controller in the GPU Operator that needs to set operator metrics can do so without relying on state from the ClusterPolicy controller object. This will be especially relevant when we integrate the NVIDIA DRA Driver for GPUs since we will be introducing a new CRD that serves as an alternative to the ClusterPolicy CRD. Signed-off-by: Christopher Desiniotis --- cmd/gpu-operator/main.go | 22 ++++++++------ controllers/clusterpolicy_controller.go | 7 +++-- controllers/object_controls_test.go | 2 +- controllers/operator_metrics.go | 4 ++- controllers/state_manager.go | 3 -- controllers/upgrade_controller.go | 38 +++++++++---------------- 6 files changed, 35 insertions(+), 41 deletions(-) diff --git a/cmd/gpu-operator/main.go b/cmd/gpu-operator/main.go index 727c13ed90..9ac5df1072 100644 --- a/cmd/gpu-operator/main.go +++ b/cmd/gpu-operator/main.go @@ -154,11 +154,16 @@ func main() { } ctx := ctrl.SetupSignalHandler() + + setupLog.Info("initializing operator metrics") + operatorMetrics := controllers.InitOperatorMetrics() + if err = (&controllers.ClusterPolicyReconciler{ - Namespace: operatorNamespace, - Client: mgr.GetClient(), - Log: ctrl.Log.WithName("controllers").WithName("ClusterPolicy"), - Scheme: mgr.GetScheme(), + Namespace: operatorNamespace, + Client: mgr.GetClient(), + Log: ctrl.Log.WithName("controllers").WithName("ClusterPolicy"), + Scheme: mgr.GetScheme(), + OperatorMetrics: operatorMetrics, }).SetupWithManager(ctx, mgr); err != nil { setupLog.Error(err, "unable to create controller", "controller", "ClusterPolicy") os.Exit(1) @@ -182,10 +187,11 @@ func main() { clusterUpgradeStateManager = clusterUpgradeStateManager.WithPodDeletionEnabled(gpuPodSpecFilter).WithValidationEnabled("app=nvidia-operator-validator") if err = (&controllers.UpgradeReconciler{ - Client: mgr.GetClient(), - Log: upgradeLogger, - Scheme: mgr.GetScheme(), - StateManager: clusterUpgradeStateManager, + Client: mgr.GetClient(), + Log: upgradeLogger, + Scheme: mgr.GetScheme(), + StateManager: clusterUpgradeStateManager, + OperatorMetrics: operatorMetrics, }).SetupWithManager(ctx, mgr); err != nil { setupLog.Error(err, "unable to create controller", "controller", "Upgrade") os.Exit(1) diff --git a/controllers/clusterpolicy_controller.go b/controllers/clusterpolicy_controller.go index d16d2d445c..bb89047e7c 100644 --- a/controllers/clusterpolicy_controller.go +++ b/controllers/clusterpolicy_controller.go @@ -61,6 +61,7 @@ type ClusterPolicyReconciler struct { Log logr.Logger Scheme *runtime.Scheme Namespace string + OperatorMetrics *OperatorMetrics conditionUpdater conditions.Updater } @@ -126,9 +127,7 @@ func (r *ClusterPolicyReconciler) Reconcile(ctx context.Context, req ctrl.Reques if condErr := r.conditionUpdater.SetConditionsError(ctx, instance, conditions.ReconcileFailed, err.Error()); condErr != nil { r.Log.Error(condErr, "failed to set condition") } - if clusterPolicyCtrl.operatorMetrics != nil { - clusterPolicyCtrl.operatorMetrics.reconciliationStatus.Set(reconciliationStatusClusterPolicyUnavailable) - } + clusterPolicyCtrl.operatorMetrics.reconciliationStatus.Set(reconciliationStatusClusterPolicyUnavailable) return ctrl.Result{}, err } @@ -350,6 +349,8 @@ func (r *ClusterPolicyReconciler) SetupWithManager(ctx context.Context, mgr ctrl return err } + clusterPolicyCtrl.operatorMetrics = r.OperatorMetrics + // initialize condition updater r.conditionUpdater = conditions.NewClusterPolicyUpdater(mgr.GetClient()) diff --git a/controllers/object_controls_test.go b/controllers/object_controls_test.go index e884ffec24..197138c3cd 100644 --- a/controllers/object_controls_test.go +++ b/controllers/object_controls_test.go @@ -544,7 +544,7 @@ func setup() error { scheme: s, } - clusterPolicyController.operatorMetrics = initOperatorMetrics() + clusterPolicyController.operatorMetrics = InitOperatorMetrics() hasNFDLabels, gpuNodeCount, err := clusterPolicyController.labelGPUNodes() if err != nil { diff --git a/controllers/operator_metrics.go b/controllers/operator_metrics.go index e51faae5f0..5f4a7ff291 100644 --- a/controllers/operator_metrics.go +++ b/controllers/operator_metrics.go @@ -66,7 +66,9 @@ const ( operatorMetricsNamespace = "gpu_operator" ) -func initOperatorMetrics() *OperatorMetrics { +// InitOperatorMetrics registers all GPU operator Prometheus metrics with the +// controller-runtime registry and returns the initialised OperatorMetrics. +func InitOperatorMetrics() *OperatorMetrics { m := &OperatorMetrics{ gpuNodesTotal: promcli.NewGauge( promcli.GaugeOpts{ diff --git a/controllers/state_manager.go b/controllers/state_manager.go index 45b890df2d..8c2a943d00 100644 --- a/controllers/state_manager.go +++ b/controllers/state_manager.go @@ -883,9 +883,6 @@ func (n *ClusterPolicyController) init(ctx context.Context, reconciler *ClusterP return fmt.Errorf("error validating clusterpolicy: %w", err) } - n.operatorMetrics = initOperatorMetrics() - n.logger.Info("Operator metrics initialized.") - addState(n, "/opt/gpu-operator/pre-requisites") addState(n, "/opt/gpu-operator/state-operator-metrics") addState(n, "/opt/gpu-operator/state-driver") diff --git a/controllers/upgrade_controller.go b/controllers/upgrade_controller.go index ac20826b32..9ba941ef7e 100644 --- a/controllers/upgrade_controller.go +++ b/controllers/upgrade_controller.go @@ -52,9 +52,10 @@ import ( // UpgradeReconciler reconciles Driver Daemon Sets for upgrade type UpgradeReconciler struct { client.Client - Log logr.Logger - Scheme *runtime.Scheme - StateManager upgrade.ClusterUpgradeStateManager + Log logr.Logger + Scheme *runtime.Scheme + StateManager upgrade.ClusterUpgradeStateManager + OperatorMetrics *OperatorMetrics } const ( @@ -89,9 +90,7 @@ func (r *UpgradeReconciler) Reconcile(ctx context.Context, req ctrl.Request) (ct err := r.Get(ctx, req.NamespacedName, clusterPolicy) if err != nil { reqLogger.Error(err, "Error getting ClusterPolicy object") - if clusterPolicyCtrl.operatorMetrics != nil { - clusterPolicyCtrl.operatorMetrics.reconciliationStatus.Set(reconciliationStatusClusterPolicyUnavailable) - } + r.OperatorMetrics.reconciliationStatus.Set(reconciliationStatusClusterPolicyUnavailable) if apierrors.IsNotFound(err) { // Request object not found, could have been deleted after reconcile request. // Owned objects are automatically garbage collected. For additional cleanup logic use finalizers. @@ -105,26 +104,17 @@ func (r *UpgradeReconciler) Reconcile(ctx context.Context, req ctrl.Request) (ct if clusterPolicy.Spec.SandboxWorkloads.IsEnabled() { reqLogger.V(consts.LogLevelInfo).Info("Advanced driver upgrade policy is not supported when 'sandboxWorkloads.enabled=true'" + "in ClusterPolicy, cleaning up upgrade state and skipping reconciliation") - // disable driver upgrade metrics - if clusterPolicyCtrl.operatorMetrics != nil { - clusterPolicyCtrl.operatorMetrics.driverAutoUpgradeEnabled.Set(driverAutoUpgradeDisabled) - } + r.OperatorMetrics.driverAutoUpgradeEnabled.Set(driverAutoUpgradeDisabled) return ctrl.Result{}, r.removeNodeUpgradeStateLabels(ctx) } if clusterPolicy.Spec.Driver.UpgradePolicy == nil || !clusterPolicy.Spec.Driver.UpgradePolicy.AutoUpgrade { reqLogger.V(consts.LogLevelInfo).Info("Advanced driver upgrade policy is disabled, cleaning up upgrade state and skipping reconciliation") - // disable driver upgrade metrics - if clusterPolicyCtrl.operatorMetrics != nil { - clusterPolicyCtrl.operatorMetrics.driverAutoUpgradeEnabled.Set(driverAutoUpgradeDisabled) - } + r.OperatorMetrics.driverAutoUpgradeEnabled.Set(driverAutoUpgradeDisabled) return ctrl.Result{}, r.removeNodeUpgradeStateLabels(ctx) } - // enable driver upgrade metrics - if clusterPolicyCtrl.operatorMetrics != nil { - clusterPolicyCtrl.operatorMetrics.driverAutoUpgradeEnabled.Set(driverAutoUpgradeEnabled) - } + r.OperatorMetrics.driverAutoUpgradeEnabled.Set(driverAutoUpgradeEnabled) var driverLabel map[string]string @@ -181,13 +171,11 @@ func (r *UpgradeReconciler) Reconcile(ctx context.Context, req ctrl.Request) (ct } // log metrics with the current state - if clusterPolicyCtrl.operatorMetrics != nil { - clusterPolicyCtrl.operatorMetrics.upgradesInProgress.Set(float64(r.StateManager.GetUpgradesInProgress(state))) - clusterPolicyCtrl.operatorMetrics.upgradesDone.Set(float64(r.StateManager.GetUpgradesDone(state))) - clusterPolicyCtrl.operatorMetrics.upgradesAvailable.Set(float64(r.StateManager.GetUpgradesAvailable(state, clusterPolicy.Spec.Driver.UpgradePolicy.MaxParallelUpgrades, maxUnavailable))) - clusterPolicyCtrl.operatorMetrics.upgradesFailed.Set(float64(r.StateManager.GetUpgradesFailed(state))) - clusterPolicyCtrl.operatorMetrics.upgradesPending.Set(float64(r.StateManager.GetUpgradesPending(state))) - } + r.OperatorMetrics.upgradesInProgress.Set(float64(r.StateManager.GetUpgradesInProgress(state))) + r.OperatorMetrics.upgradesDone.Set(float64(r.StateManager.GetUpgradesDone(state))) + r.OperatorMetrics.upgradesAvailable.Set(float64(r.StateManager.GetUpgradesAvailable(state, clusterPolicy.Spec.Driver.UpgradePolicy.MaxParallelUpgrades, maxUnavailable))) + r.OperatorMetrics.upgradesFailed.Set(float64(r.StateManager.GetUpgradesFailed(state))) + r.OperatorMetrics.upgradesPending.Set(float64(r.StateManager.GetUpgradesPending(state))) err = r.StateManager.ApplyState(ctx, state, clusterPolicy.Spec.Driver.UpgradePolicy) if err != nil {