diff --git a/cmd/gpu-operator/main.go b/cmd/gpu-operator/main.go index 727c13ed90..9ac5df1072 100644 --- a/cmd/gpu-operator/main.go +++ b/cmd/gpu-operator/main.go @@ -154,11 +154,16 @@ func main() { } ctx := ctrl.SetupSignalHandler() + + setupLog.Info("initializing operator metrics") + operatorMetrics := controllers.InitOperatorMetrics() + if err = (&controllers.ClusterPolicyReconciler{ - Namespace: operatorNamespace, - Client: mgr.GetClient(), - Log: ctrl.Log.WithName("controllers").WithName("ClusterPolicy"), - Scheme: mgr.GetScheme(), + Namespace: operatorNamespace, + Client: mgr.GetClient(), + Log: ctrl.Log.WithName("controllers").WithName("ClusterPolicy"), + Scheme: mgr.GetScheme(), + OperatorMetrics: operatorMetrics, }).SetupWithManager(ctx, mgr); err != nil { setupLog.Error(err, "unable to create controller", "controller", "ClusterPolicy") os.Exit(1) @@ -182,10 +187,11 @@ func main() { clusterUpgradeStateManager = clusterUpgradeStateManager.WithPodDeletionEnabled(gpuPodSpecFilter).WithValidationEnabled("app=nvidia-operator-validator") if err = (&controllers.UpgradeReconciler{ - Client: mgr.GetClient(), - Log: upgradeLogger, - Scheme: mgr.GetScheme(), - StateManager: clusterUpgradeStateManager, + Client: mgr.GetClient(), + Log: upgradeLogger, + Scheme: mgr.GetScheme(), + StateManager: clusterUpgradeStateManager, + OperatorMetrics: operatorMetrics, }).SetupWithManager(ctx, mgr); err != nil { setupLog.Error(err, "unable to create controller", "controller", "Upgrade") os.Exit(1) diff --git a/controllers/clusterpolicy_controller.go b/controllers/clusterpolicy_controller.go index d16d2d445c..bb89047e7c 100644 --- a/controllers/clusterpolicy_controller.go +++ b/controllers/clusterpolicy_controller.go @@ -61,6 +61,7 @@ type ClusterPolicyReconciler struct { Log logr.Logger Scheme *runtime.Scheme Namespace string + OperatorMetrics *OperatorMetrics conditionUpdater conditions.Updater } @@ -126,9 +127,7 @@ func (r *ClusterPolicyReconciler) Reconcile(ctx context.Context, req ctrl.Reques if condErr := r.conditionUpdater.SetConditionsError(ctx, instance, conditions.ReconcileFailed, err.Error()); condErr != nil { r.Log.Error(condErr, "failed to set condition") } - if clusterPolicyCtrl.operatorMetrics != nil { - clusterPolicyCtrl.operatorMetrics.reconciliationStatus.Set(reconciliationStatusClusterPolicyUnavailable) - } + clusterPolicyCtrl.operatorMetrics.reconciliationStatus.Set(reconciliationStatusClusterPolicyUnavailable) return ctrl.Result{}, err } @@ -350,6 +349,8 @@ func (r *ClusterPolicyReconciler) SetupWithManager(ctx context.Context, mgr ctrl return err } + clusterPolicyCtrl.operatorMetrics = r.OperatorMetrics + // initialize condition updater r.conditionUpdater = conditions.NewClusterPolicyUpdater(mgr.GetClient()) diff --git a/controllers/object_controls_test.go b/controllers/object_controls_test.go index e884ffec24..197138c3cd 100644 --- a/controllers/object_controls_test.go +++ b/controllers/object_controls_test.go @@ -544,7 +544,7 @@ func setup() error { scheme: s, } - clusterPolicyController.operatorMetrics = initOperatorMetrics() + clusterPolicyController.operatorMetrics = InitOperatorMetrics() hasNFDLabels, gpuNodeCount, err := clusterPolicyController.labelGPUNodes() if err != nil { diff --git a/controllers/operator_metrics.go b/controllers/operator_metrics.go index e51faae5f0..5f4a7ff291 100644 --- a/controllers/operator_metrics.go +++ b/controllers/operator_metrics.go @@ -66,7 +66,9 @@ const ( operatorMetricsNamespace = "gpu_operator" ) -func initOperatorMetrics() *OperatorMetrics { +// InitOperatorMetrics registers all GPU operator Prometheus metrics with the +// controller-runtime registry and returns the initialised OperatorMetrics. +func InitOperatorMetrics() *OperatorMetrics { m := &OperatorMetrics{ gpuNodesTotal: promcli.NewGauge( promcli.GaugeOpts{ diff --git a/controllers/state_manager.go b/controllers/state_manager.go index 45b890df2d..8c2a943d00 100644 --- a/controllers/state_manager.go +++ b/controllers/state_manager.go @@ -883,9 +883,6 @@ func (n *ClusterPolicyController) init(ctx context.Context, reconciler *ClusterP return fmt.Errorf("error validating clusterpolicy: %w", err) } - n.operatorMetrics = initOperatorMetrics() - n.logger.Info("Operator metrics initialized.") - addState(n, "/opt/gpu-operator/pre-requisites") addState(n, "/opt/gpu-operator/state-operator-metrics") addState(n, "/opt/gpu-operator/state-driver") diff --git a/controllers/upgrade_controller.go b/controllers/upgrade_controller.go index ac20826b32..9ba941ef7e 100644 --- a/controllers/upgrade_controller.go +++ b/controllers/upgrade_controller.go @@ -52,9 +52,10 @@ import ( // UpgradeReconciler reconciles Driver Daemon Sets for upgrade type UpgradeReconciler struct { client.Client - Log logr.Logger - Scheme *runtime.Scheme - StateManager upgrade.ClusterUpgradeStateManager + Log logr.Logger + Scheme *runtime.Scheme + StateManager upgrade.ClusterUpgradeStateManager + OperatorMetrics *OperatorMetrics } const ( @@ -89,9 +90,7 @@ func (r *UpgradeReconciler) Reconcile(ctx context.Context, req ctrl.Request) (ct err := r.Get(ctx, req.NamespacedName, clusterPolicy) if err != nil { reqLogger.Error(err, "Error getting ClusterPolicy object") - if clusterPolicyCtrl.operatorMetrics != nil { - clusterPolicyCtrl.operatorMetrics.reconciliationStatus.Set(reconciliationStatusClusterPolicyUnavailable) - } + r.OperatorMetrics.reconciliationStatus.Set(reconciliationStatusClusterPolicyUnavailable) if apierrors.IsNotFound(err) { // Request object not found, could have been deleted after reconcile request. // Owned objects are automatically garbage collected. For additional cleanup logic use finalizers. @@ -105,26 +104,17 @@ func (r *UpgradeReconciler) Reconcile(ctx context.Context, req ctrl.Request) (ct if clusterPolicy.Spec.SandboxWorkloads.IsEnabled() { reqLogger.V(consts.LogLevelInfo).Info("Advanced driver upgrade policy is not supported when 'sandboxWorkloads.enabled=true'" + "in ClusterPolicy, cleaning up upgrade state and skipping reconciliation") - // disable driver upgrade metrics - if clusterPolicyCtrl.operatorMetrics != nil { - clusterPolicyCtrl.operatorMetrics.driverAutoUpgradeEnabled.Set(driverAutoUpgradeDisabled) - } + r.OperatorMetrics.driverAutoUpgradeEnabled.Set(driverAutoUpgradeDisabled) return ctrl.Result{}, r.removeNodeUpgradeStateLabels(ctx) } if clusterPolicy.Spec.Driver.UpgradePolicy == nil || !clusterPolicy.Spec.Driver.UpgradePolicy.AutoUpgrade { reqLogger.V(consts.LogLevelInfo).Info("Advanced driver upgrade policy is disabled, cleaning up upgrade state and skipping reconciliation") - // disable driver upgrade metrics - if clusterPolicyCtrl.operatorMetrics != nil { - clusterPolicyCtrl.operatorMetrics.driverAutoUpgradeEnabled.Set(driverAutoUpgradeDisabled) - } + r.OperatorMetrics.driverAutoUpgradeEnabled.Set(driverAutoUpgradeDisabled) return ctrl.Result{}, r.removeNodeUpgradeStateLabels(ctx) } - // enable driver upgrade metrics - if clusterPolicyCtrl.operatorMetrics != nil { - clusterPolicyCtrl.operatorMetrics.driverAutoUpgradeEnabled.Set(driverAutoUpgradeEnabled) - } + r.OperatorMetrics.driverAutoUpgradeEnabled.Set(driverAutoUpgradeEnabled) var driverLabel map[string]string @@ -181,13 +171,11 @@ func (r *UpgradeReconciler) Reconcile(ctx context.Context, req ctrl.Request) (ct } // log metrics with the current state - if clusterPolicyCtrl.operatorMetrics != nil { - clusterPolicyCtrl.operatorMetrics.upgradesInProgress.Set(float64(r.StateManager.GetUpgradesInProgress(state))) - clusterPolicyCtrl.operatorMetrics.upgradesDone.Set(float64(r.StateManager.GetUpgradesDone(state))) - clusterPolicyCtrl.operatorMetrics.upgradesAvailable.Set(float64(r.StateManager.GetUpgradesAvailable(state, clusterPolicy.Spec.Driver.UpgradePolicy.MaxParallelUpgrades, maxUnavailable))) - clusterPolicyCtrl.operatorMetrics.upgradesFailed.Set(float64(r.StateManager.GetUpgradesFailed(state))) - clusterPolicyCtrl.operatorMetrics.upgradesPending.Set(float64(r.StateManager.GetUpgradesPending(state))) - } + r.OperatorMetrics.upgradesInProgress.Set(float64(r.StateManager.GetUpgradesInProgress(state))) + r.OperatorMetrics.upgradesDone.Set(float64(r.StateManager.GetUpgradesDone(state))) + r.OperatorMetrics.upgradesAvailable.Set(float64(r.StateManager.GetUpgradesAvailable(state, clusterPolicy.Spec.Driver.UpgradePolicy.MaxParallelUpgrades, maxUnavailable))) + r.OperatorMetrics.upgradesFailed.Set(float64(r.StateManager.GetUpgradesFailed(state))) + r.OperatorMetrics.upgradesPending.Set(float64(r.StateManager.GetUpgradesPending(state))) err = r.StateManager.ApplyState(ctx, state, clusterPolicy.Spec.Driver.UpgradePolicy) if err != nil {