Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
22 changes: 14 additions & 8 deletions cmd/gpu-operator/main.go
Original file line number Diff line number Diff line change
Expand Up @@ -154,11 +154,16 @@ func main() {
}

ctx := ctrl.SetupSignalHandler()

setupLog.Info("initializing operator metrics")
operatorMetrics := controllers.InitOperatorMetrics()

if err = (&controllers.ClusterPolicyReconciler{
Namespace: operatorNamespace,
Client: mgr.GetClient(),
Log: ctrl.Log.WithName("controllers").WithName("ClusterPolicy"),
Scheme: mgr.GetScheme(),
Namespace: operatorNamespace,
Client: mgr.GetClient(),
Log: ctrl.Log.WithName("controllers").WithName("ClusterPolicy"),
Scheme: mgr.GetScheme(),
OperatorMetrics: operatorMetrics,
}).SetupWithManager(ctx, mgr); err != nil {
setupLog.Error(err, "unable to create controller", "controller", "ClusterPolicy")
os.Exit(1)
Expand All @@ -182,10 +187,11 @@ func main() {
clusterUpgradeStateManager = clusterUpgradeStateManager.WithPodDeletionEnabled(gpuPodSpecFilter).WithValidationEnabled("app=nvidia-operator-validator")

if err = (&controllers.UpgradeReconciler{
Client: mgr.GetClient(),
Log: upgradeLogger,
Scheme: mgr.GetScheme(),
StateManager: clusterUpgradeStateManager,
Client: mgr.GetClient(),
Log: upgradeLogger,
Scheme: mgr.GetScheme(),
StateManager: clusterUpgradeStateManager,
OperatorMetrics: operatorMetrics,
}).SetupWithManager(ctx, mgr); err != nil {
setupLog.Error(err, "unable to create controller", "controller", "Upgrade")
os.Exit(1)
Expand Down
7 changes: 4 additions & 3 deletions controllers/clusterpolicy_controller.go
Original file line number Diff line number Diff line change
Expand Up @@ -61,6 +61,7 @@ type ClusterPolicyReconciler struct {
Log logr.Logger
Scheme *runtime.Scheme
Namespace string
OperatorMetrics *OperatorMetrics
conditionUpdater conditions.Updater
}

Expand Down Expand Up @@ -126,9 +127,7 @@ func (r *ClusterPolicyReconciler) Reconcile(ctx context.Context, req ctrl.Reques
if condErr := r.conditionUpdater.SetConditionsError(ctx, instance, conditions.ReconcileFailed, err.Error()); condErr != nil {
r.Log.Error(condErr, "failed to set condition")
}
if clusterPolicyCtrl.operatorMetrics != nil {
clusterPolicyCtrl.operatorMetrics.reconciliationStatus.Set(reconciliationStatusClusterPolicyUnavailable)
}
clusterPolicyCtrl.operatorMetrics.reconciliationStatus.Set(reconciliationStatusClusterPolicyUnavailable)
return ctrl.Result{}, err
}

Expand Down Expand Up @@ -350,6 +349,8 @@ func (r *ClusterPolicyReconciler) SetupWithManager(ctx context.Context, mgr ctrl
return err
}

clusterPolicyCtrl.operatorMetrics = r.OperatorMetrics

// initialize condition updater
r.conditionUpdater = conditions.NewClusterPolicyUpdater(mgr.GetClient())

Expand Down
2 changes: 1 addition & 1 deletion controllers/object_controls_test.go
Original file line number Diff line number Diff line change
Expand Up @@ -544,7 +544,7 @@ func setup() error {
scheme: s,
}

clusterPolicyController.operatorMetrics = initOperatorMetrics()
clusterPolicyController.operatorMetrics = InitOperatorMetrics()

hasNFDLabels, gpuNodeCount, err := clusterPolicyController.labelGPUNodes()
if err != nil {
Expand Down
4 changes: 3 additions & 1 deletion controllers/operator_metrics.go
Original file line number Diff line number Diff line change
Expand Up @@ -66,7 +66,9 @@ const (
operatorMetricsNamespace = "gpu_operator"
)

func initOperatorMetrics() *OperatorMetrics {
// InitOperatorMetrics registers all GPU operator Prometheus metrics with the
// controller-runtime registry and returns the initialised OperatorMetrics.
func InitOperatorMetrics() *OperatorMetrics {
m := &OperatorMetrics{
gpuNodesTotal: promcli.NewGauge(
promcli.GaugeOpts{
Expand Down
3 changes: 0 additions & 3 deletions controllers/state_manager.go
Original file line number Diff line number Diff line change
Expand Up @@ -883,9 +883,6 @@ func (n *ClusterPolicyController) init(ctx context.Context, reconciler *ClusterP
return fmt.Errorf("error validating clusterpolicy: %w", err)
}

n.operatorMetrics = initOperatorMetrics()
n.logger.Info("Operator metrics initialized.")

addState(n, "/opt/gpu-operator/pre-requisites")
addState(n, "/opt/gpu-operator/state-operator-metrics")
addState(n, "/opt/gpu-operator/state-driver")
Expand Down
38 changes: 13 additions & 25 deletions controllers/upgrade_controller.go
Original file line number Diff line number Diff line change
Expand Up @@ -52,9 +52,10 @@ import (
// UpgradeReconciler reconciles driver DaemonSets for upgrade.
type UpgradeReconciler struct {
client.Client
Log logr.Logger
Scheme *runtime.Scheme
StateManager upgrade.ClusterUpgradeStateManager
Log logr.Logger
Scheme *runtime.Scheme
StateManager upgrade.ClusterUpgradeStateManager
OperatorMetrics *OperatorMetrics
}

const (
Expand Down Expand Up @@ -89,9 +90,7 @@ func (r *UpgradeReconciler) Reconcile(ctx context.Context, req ctrl.Request) (ct
err := r.Get(ctx, req.NamespacedName, clusterPolicy)
if err != nil {
reqLogger.Error(err, "Error getting ClusterPolicy object")
if clusterPolicyCtrl.operatorMetrics != nil {
clusterPolicyCtrl.operatorMetrics.reconciliationStatus.Set(reconciliationStatusClusterPolicyUnavailable)
}
r.OperatorMetrics.reconciliationStatus.Set(reconciliationStatusClusterPolicyUnavailable)
if apierrors.IsNotFound(err) {
// Request object not found, could have been deleted after reconcile request.
// Owned objects are automatically garbage collected. For additional cleanup logic use finalizers.
Expand All @@ -105,26 +104,17 @@ func (r *UpgradeReconciler) Reconcile(ctx context.Context, req ctrl.Request) (ct
if clusterPolicy.Spec.SandboxWorkloads.IsEnabled() {
reqLogger.V(consts.LogLevelInfo).Info("Advanced driver upgrade policy is not supported when 'sandboxWorkloads.enabled=true'" +
"in ClusterPolicy, cleaning up upgrade state and skipping reconciliation")
// disable driver upgrade metrics
if clusterPolicyCtrl.operatorMetrics != nil {
clusterPolicyCtrl.operatorMetrics.driverAutoUpgradeEnabled.Set(driverAutoUpgradeDisabled)
}
r.OperatorMetrics.driverAutoUpgradeEnabled.Set(driverAutoUpgradeDisabled)
return ctrl.Result{}, r.removeNodeUpgradeStateLabels(ctx)
}

if clusterPolicy.Spec.Driver.UpgradePolicy == nil ||
!clusterPolicy.Spec.Driver.UpgradePolicy.AutoUpgrade {
reqLogger.V(consts.LogLevelInfo).Info("Advanced driver upgrade policy is disabled, cleaning up upgrade state and skipping reconciliation")
// disable driver upgrade metrics
if clusterPolicyCtrl.operatorMetrics != nil {
clusterPolicyCtrl.operatorMetrics.driverAutoUpgradeEnabled.Set(driverAutoUpgradeDisabled)
}
r.OperatorMetrics.driverAutoUpgradeEnabled.Set(driverAutoUpgradeDisabled)
return ctrl.Result{}, r.removeNodeUpgradeStateLabels(ctx)
}
// enable driver upgrade metrics
if clusterPolicyCtrl.operatorMetrics != nil {
clusterPolicyCtrl.operatorMetrics.driverAutoUpgradeEnabled.Set(driverAutoUpgradeEnabled)
}
r.OperatorMetrics.driverAutoUpgradeEnabled.Set(driverAutoUpgradeEnabled)

var driverLabel map[string]string

Expand Down Expand Up @@ -181,13 +171,11 @@ func (r *UpgradeReconciler) Reconcile(ctx context.Context, req ctrl.Request) (ct
}

// record upgrade metrics for the current state
if clusterPolicyCtrl.operatorMetrics != nil {
clusterPolicyCtrl.operatorMetrics.upgradesInProgress.Set(float64(r.StateManager.GetUpgradesInProgress(state)))
clusterPolicyCtrl.operatorMetrics.upgradesDone.Set(float64(r.StateManager.GetUpgradesDone(state)))
clusterPolicyCtrl.operatorMetrics.upgradesAvailable.Set(float64(r.StateManager.GetUpgradesAvailable(state, clusterPolicy.Spec.Driver.UpgradePolicy.MaxParallelUpgrades, maxUnavailable)))
clusterPolicyCtrl.operatorMetrics.upgradesFailed.Set(float64(r.StateManager.GetUpgradesFailed(state)))
clusterPolicyCtrl.operatorMetrics.upgradesPending.Set(float64(r.StateManager.GetUpgradesPending(state)))
}
r.OperatorMetrics.upgradesInProgress.Set(float64(r.StateManager.GetUpgradesInProgress(state)))
r.OperatorMetrics.upgradesDone.Set(float64(r.StateManager.GetUpgradesDone(state)))
r.OperatorMetrics.upgradesAvailable.Set(float64(r.StateManager.GetUpgradesAvailable(state, clusterPolicy.Spec.Driver.UpgradePolicy.MaxParallelUpgrades, maxUnavailable)))
r.OperatorMetrics.upgradesFailed.Set(float64(r.StateManager.GetUpgradesFailed(state)))
r.OperatorMetrics.upgradesPending.Set(float64(r.StateManager.GetUpgradesPending(state)))

err = r.StateManager.ApplyState(ctx, state, clusterPolicy.Spec.Driver.UpgradePolicy)
if err != nil {
Expand Down