Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
8 changes: 8 additions & 0 deletions api/nvidia/v1/clusterpolicy_types.go
Original file line number Diff line number Diff line change
Expand Up @@ -2205,6 +2205,14 @@ func (d *DriverSpec) IsVGPULicensingEnabled() bool {
return d.LicensingConfig.ConfigMapName != "" || d.LicensingConfig.SecretName != ""
}

// IsAutoUpgradeEnabled reports whether automatic upgrades are switched on for
// the driver. A DriverSpec without an UpgradePolicy is treated as having auto
// upgrade disabled.
func (d *DriverSpec) IsAutoUpgradeEnabled() bool {
	policy := d.UpgradePolicy
	// Short-circuit on the nil check so AutoUpgrade is only read when a policy exists.
	return policy != nil && policy.AutoUpgrade
}

// IsEnabled returns true if device-plugin is enabled (default) through gpu-operator
func (p *DevicePluginSpec) IsEnabled() bool {
if p.Enabled == nil {
Expand Down
10 changes: 10 additions & 0 deletions cmd/gpu-operator/main.go
Original file line number Diff line number Diff line change
Expand Up @@ -216,6 +216,16 @@ func main() {
setupLog.Error(err, "unable to create controller", "controller", "NVIDIADriver")
os.Exit(1)
}

if err = (&controllers.NodeLabelingReconciler{
Namespace: operatorNamespace,
Client: mgr.GetClient(),
Scheme: mgr.GetScheme(),
Log: ctrl.Log.WithName("controllers").WithName("NodeLabeling"),
}).SetupWithManager(ctx, mgr); err != nil {
setupLog.Error(err, "unable to create controller", "controller", "NodeLabeling")
os.Exit(1)
}
// +kubebuilder:scaffold:builder
if err := mgr.AddHealthzCheck("health", healthz.Ping); err != nil {
setupLog.Error(err, "unable to set up health check")
Expand Down
13 changes: 4 additions & 9 deletions controllers/clusterpolicy_controller.go
Original file line number Diff line number Diff line change
Expand Up @@ -280,9 +280,8 @@ func addWatchNewGPUNode(r *ClusterPolicyReconciler, c controller.Controller, mgr
oldLabels := e.ObjectOld.GetLabels()
nodeName := e.ObjectNew.GetName()

gpuCommonLabelMissing := hasGPULabels(newLabels) && !hasCommonGPULabel(newLabels)
gpuCommonLabelOutdated := !hasGPULabels(newLabels) && hasCommonGPULabel(newLabels)
migManagerLabelMissing := hasMIGCapableGPU(newLabels) && !hasMIGManagerLabel(newLabels)
// Trigger when NodeLabelingReconciler sets gpu.present=true on a new GPU node.
gpuCommonLabelAdded := !hasCommonGPULabel(oldLabels) && hasCommonGPULabel(newLabels)
commonOperandsLabelChanged := hasOperandsDisabled(oldLabels) != hasOperandsDisabled(newLabels)

oldGPUWorkloadConfig, _ := getWorkloadConfig(oldLabels, true)
Expand All @@ -293,19 +292,15 @@ func addWatchNewGPUNode(r *ClusterPolicyReconciler, c controller.Controller, mgr
newOSTreeLabel := newLabels[nfdOSTreeVersionLabelKey]
osTreeLabelChanged := oldOSTreeLabel != newOSTreeLabel

needsUpdate := gpuCommonLabelMissing ||
gpuCommonLabelOutdated ||
migManagerLabelMissing ||
needsUpdate := gpuCommonLabelAdded ||
commonOperandsLabelChanged ||
gpuWorkloadConfigLabelChanged ||
osTreeLabelChanged

if needsUpdate {
r.Log.Info("Node needs an update",
"name", nodeName,
"gpuCommonLabelMissing", gpuCommonLabelMissing,
"gpuCommonLabelOutdated", gpuCommonLabelOutdated,
"migManagerLabelMissing", migManagerLabelMissing,
Comment thread
tariq1890 marked this conversation as resolved.
"gpuCommonLabelAdded", gpuCommonLabelAdded,
"commonOperandsLabelChanged", commonOperandsLabelChanged,
"gpuWorkloadConfigLabelChanged", gpuWorkloadConfigLabelChanged,
"osTreeLabelChanged", osTreeLabelChanged,
Expand Down
Loading
Loading