Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
6 changes: 5 additions & 1 deletion cmd/gpu-operator/main.go
Original file line number Diff line number Diff line change
Expand Up @@ -51,6 +51,7 @@ import (
"github.com/NVIDIA/gpu-operator/controllers/clusterinfo"
"github.com/NVIDIA/gpu-operator/internal/consts"
"github.com/NVIDIA/gpu-operator/internal/info"
"github.com/NVIDIA/gpu-operator/internal/predicates"
// +kubebuilder:scaffold:imports
)

Expand Down Expand Up @@ -184,7 +185,10 @@ func main() {
setupLog.Error(err, "unable to create new ClusterUpdateStateManager", "controller", "Upgrade")
os.Exit(1)
}
clusterUpgradeStateManager = clusterUpgradeStateManager.WithPodDeletionEnabled(gpuPodSpecFilter).WithValidationEnabled("app=nvidia-operator-validator")
clusterUpgradeStateManager = clusterUpgradeStateManager.
WithPodDeletionEnabled(gpuPodSpecFilter).
WithValidationEnabled("app=nvidia-operator-validator").
WithRestartOnlyPredicate(predicates.DriverPodRestartOnly(upgradeLogger))

if err = (&controllers.UpgradeReconciler{
Client: mgr.GetClient(),
Expand Down
6 changes: 3 additions & 3 deletions controllers/object_controls.go
Original file line number Diff line number Diff line change
Expand Up @@ -1064,19 +1064,19 @@ func TransformDriver(obj *appsv1.DaemonSet, config *gpuv1.ClusterPolicySpec, n C
// Set the computed digest in driver-manager initContainer
driverManagerContainer := findContainerByName(obj.Spec.Template.Spec.InitContainers, "k8s-driver-manager")
if driverManagerContainer != nil {
setContainerEnv(driverManagerContainer, "DRIVER_CONFIG_DIGEST", configDigest)
setContainerEnv(driverManagerContainer, driverconfig.DriverConfigDigestEnvName, configDigest)
Comment thread
tariq1890 marked this conversation as resolved.
}

// Set the computed digest in nvidia-driver container
driverContainer := findContainerByName(obj.Spec.Template.Spec.Containers, "nvidia-driver-ctr")
if driverContainer != nil {
setContainerEnv(driverContainer, "DRIVER_CONFIG_DIGEST", configDigest)
setContainerEnv(driverContainer, driverconfig.DriverConfigDigestEnvName, configDigest)
}

// Used by dtk-build-driver to determine if fast path should be used (skip rebuild)
driverToolkitContainer := findContainerByName(obj.Spec.Template.Spec.Containers, "openshift-driver-toolkit-ctr")
if driverToolkitContainer != nil {
setContainerEnv(driverToolkitContainer, "DRIVER_CONFIG_DIGEST", configDigest)
setContainerEnv(driverToolkitContainer, driverconfig.DriverConfigDigestEnvName, configDigest)
}

// set hostNetwork for driver if specified
Expand Down
2 changes: 1 addition & 1 deletion go.mod
Original file line number Diff line number Diff line change
Expand Up @@ -6,7 +6,7 @@ require (
github.com/Masterminds/sprig/v3 v3.3.0
github.com/NVIDIA/go-nvlib v0.11.0
github.com/NVIDIA/k8s-kata-manager v0.2.3
github.com/NVIDIA/k8s-operator-libs v0.0.0-20260505175649-fa6a3643c441
github.com/NVIDIA/k8s-operator-libs v0.0.0-20260629200812-d720f2557494
github.com/NVIDIA/nvidia-container-toolkit v1.19.1
github.com/cyphar/filepath-securejoin v0.7.0
github.com/davecgh/go-spew v1.1.2-0.20180830191138-d8f796af33cc
Expand Down
4 changes: 2 additions & 2 deletions go.sum
Original file line number Diff line number Diff line change
Expand Up @@ -18,8 +18,8 @@ github.com/NVIDIA/go-nvlib v0.11.0 h1:J6c9deWGJ1x4yY7fKg+aOdm2v5+WmCIeCLsuaO3tRt
github.com/NVIDIA/go-nvlib v0.11.0/go.mod h1:uQNH63NoDuSfn/1lixD1D1Hvhko/xdnBHmc4H1mFUlY=
github.com/NVIDIA/k8s-kata-manager v0.2.3 h1:d5+gRFqU5el/fKMXhHUaPY7haj+dbHL4nDsO/q05LBo=
github.com/NVIDIA/k8s-kata-manager v0.2.3/go.mod h1:xx5OUiMsHyKbyX0JjKHqAftvqS8vx00LFn/5EaMdtB4=
github.com/NVIDIA/k8s-operator-libs v0.0.0-20260505175649-fa6a3643c441 h1:U+1f77CBKtvJEL/wzze5mY2+Y3XQ5ZgRK0R2Ru2phz4=
github.com/NVIDIA/k8s-operator-libs v0.0.0-20260505175649-fa6a3643c441/go.mod h1:L+aiCiTKN63AX9SWz/F8pv9Jw9FIfI+dAEr7VA+KowE=
github.com/NVIDIA/k8s-operator-libs v0.0.0-20260629200812-d720f2557494 h1:j+tWK79l9AouBulQps7rxILLhy2fWYcEhH4zgYjth/o=
github.com/NVIDIA/k8s-operator-libs v0.0.0-20260629200812-d720f2557494/go.mod h1:L+aiCiTKN63AX9SWz/F8pv9Jw9FIfI+dAEr7VA+KowE=
github.com/NVIDIA/nvidia-container-toolkit v1.19.1 h1:1sV4ddFrBccqL9Lbzcdu50w2j5FhyNJpN5hXTfCsjps=
github.com/NVIDIA/nvidia-container-toolkit v1.19.1/go.mod h1:yGsZ4s2lMjfE4r8/DMUPVpaFhRGkWvo2H++/Dy84nVc=
github.com/beorn7/perks v1.0.1 h1:VlbKKnNfV8bJzeqoa4cOKqO6bYr3WgKZxO8Z16+hsOM=
Expand Down
32 changes: 32 additions & 0 deletions internal/config/driver_config_digest.go
Original file line number Diff line number Diff line change
Expand Up @@ -22,6 +22,38 @@ import (
corev1 "k8s.io/api/core/v1"
)

// DriverConfigDigestEnvName is the name of the env var the operator sets on the
// driver pod template. Its value is a hash of the install-relevant driver config
// (DriverInstallState).
const DriverConfigDigestEnvName = "DRIVER_CONFIG_DIGEST"

// DriverConfigDigestFromPodSpec looks up the DRIVER_CONFIG_DIGEST env value in a
// driver pod spec.
//
// Init containers are scanned before regular containers, and the first non-empty
// value found is returned. Within a single container only the first env entry
// named DRIVER_CONFIG_DIGEST is considered; if that entry is empty the search
// moves on to the next container. A nil spec, or a spec without any non-empty
// digest, yields "".
func DriverConfigDigestFromPodSpec(spec *corev1.PodSpec) string {
	if spec == nil {
		return ""
	}
	// Ordering of the groups encodes the precedence: init containers first.
	for _, group := range [][]corev1.Container{spec.InitContainers, spec.Containers} {
		for i := range group {
			for _, envVar := range group[i].Env {
				if envVar.Name != DriverConfigDigestEnvName {
					continue
				}
				if envVar.Value != "" {
					return envVar.Value
				}
				// First matching entry is empty: stop inspecting this container
				// and fall through to the next one.
				break
			}
		}
	}
	return ""
}

// DriverInstallState lists all fields that affect driver installation.
// Changes to these fields trigger a driver reinstall.
//
Expand Down
78 changes: 78 additions & 0 deletions internal/config/driver_config_digest_test.go
Original file line number Diff line number Diff line change
Expand Up @@ -309,3 +309,81 @@ func TestExtractVolumes(t *testing.T) {
})
}
}

// containerWithConfigDigest returns a container with the given name. When digest
// is non-empty the container also carries a single DRIVER_CONFIG_DIGEST env entry
// holding it (matching how object_controls.go sets it); otherwise Env is left unset.
func containerWithConfigDigest(name, digest string) corev1.Container {
	if digest == "" {
		return corev1.Container{Name: name}
	}
	return corev1.Container{
		Name: name,
		Env:  []corev1.EnvVar{{Name: DriverConfigDigestEnvName, Value: digest}},
	}
}

// TestDriverConfigDigestFromPodSpec is a table-driven check of
// DriverConfigDigestFromPodSpec: digest lookup on init and main containers,
// init-container precedence, skipping of empty values, and the "" result for
// specs without a digest or a nil spec.
func TestDriverConfigDigestFromPodSpec(t *testing.T) {
	const (
		managerName = "k8s-driver-manager"
		driverName  = "nvidia-driver-ctr"
		toolkitName = "openshift-driver-toolkit-ctr"
	)
	// one wraps a single container in the slice form PodSpec expects.
	one := func(c corev1.Container) []corev1.Container {
		return []corev1.Container{c}
	}

	type testCase struct {
		name     string
		spec     *corev1.PodSpec
		expected string
	}
	cases := []testCase{
		{
			name: "digest on k8s-driver-manager init container",
			spec: &corev1.PodSpec{
				InitContainers: one(containerWithConfigDigest(managerName, "abc123")),
				Containers:     one(containerWithConfigDigest(driverName, "")),
			},
			expected: "abc123",
		},
		{
			name: "digest on nvidia-driver-ctr main container",
			spec: &corev1.PodSpec{
				Containers: one(containerWithConfigDigest(driverName, "def456")),
			},
			expected: "def456",
		},
		{
			name: "digest on OCP openshift-driver-toolkit-ctr",
			spec: &corev1.PodSpec{
				Containers: one(containerWithConfigDigest(toolkitName, "ocp789")),
			},
			expected: "ocp789",
		},
		{
			name: "init container digest takes precedence over main container",
			spec: &corev1.PodSpec{
				InitContainers: one(containerWithConfigDigest(managerName, "init-digest")),
				Containers:     one(containerWithConfigDigest(driverName, "main-digest")),
			},
			expected: "init-digest",
		},
		{
			name: "empty init digest is skipped; main container value used",
			spec: &corev1.PodSpec{
				// Built by hand: the env entry must be present but empty,
				// which containerWithConfigDigest never produces.
				InitContainers: one(corev1.Container{
					Name: managerName,
					Env:  []corev1.EnvVar{{Name: DriverConfigDigestEnvName, Value: ""}},
				}),
				Containers: one(containerWithConfigDigest(driverName, "main-digest")),
			},
			expected: "main-digest",
		},
		{
			name: "no digest anywhere",
			spec: &corev1.PodSpec{
				InitContainers: one(corev1.Container{Name: managerName}),
				Containers:     one(corev1.Container{Name: driverName}),
			},
			expected: "",
		},
		{
			name:     "nil spec",
			spec:     nil,
			expected: "",
		},
	}

	for _, tc := range cases {
		t.Run(tc.name, func(t *testing.T) {
			got := DriverConfigDigestFromPodSpec(tc.spec)
			assert.Equal(t, tc.expected, got)
		})
	}
}
50 changes: 50 additions & 0 deletions internal/predicates/restart_only.go
Original file line number Diff line number Diff line change
@@ -0,0 +1,50 @@
/**
# Copyright (c) NVIDIA CORPORATION. All rights reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
**/

// Package predicates holds predicates the upgrade controller registers on the
// k8s-operator-libs upgrade state manager.
package predicates

import (
"github.com/go-logr/logr"
corev1 "k8s.io/api/core/v1"

"github.com/NVIDIA/k8s-operator-libs/pkg/consts"
"github.com/NVIDIA/k8s-operator-libs/pkg/upgrade"

driverconfig "github.com/NVIDIA/gpu-operator/internal/config"
)

// DriverPodRestartOnly builds the RestartOnlyPredicate used by the upgrade controller.
//
// The returned predicate compares the DRIVER_CONFIG_DIGEST found in the running pod
// spec with the one in the desired DaemonSet template spec. Equal digests mean the
// install-relevant config is unchanged (e.g. only a helm.sh/chart label changed), so
// the out-of-sync driver pod may be restarted in place and the predicate reports true.
// Differing digests, or a digest missing on either side, report false so the node
// takes the full upgrade flow. The predicate never returns a non-nil error; each
// evaluation is logged through log at debug verbosity.
func DriverPodRestartOnly(log logr.Logger) upgrade.RestartOnlyPredicate {
	return func(running, desired *corev1.PodSpec) (bool, error) {
		runningDigest := driverconfig.DriverConfigDigestFromPodSpec(running)
		desiredDigest := driverconfig.DriverConfigDigestFromPodSpec(desired)
		debugLog := log.V(consts.LogLevelDebug)

		switch {
		case desiredDigest == "", runningDigest == "":
			// Without both digests there is nothing to compare; be conservative.
			debugLog.Info("driver config digest missing; taking full upgrade flow",
				"desiredDigest", desiredDigest, "runningDigest", runningDigest)
			return false, nil
		default:
			sameConfig := desiredDigest == runningDigest
			debugLog.Info("evaluated driver config digest for restart-only routing",
				"desiredDigest", desiredDigest, "runningDigest", runningDigest, "restartOnly", sameConfig)
			return sameConfig, nil
		}
	}
}
59 changes: 59 additions & 0 deletions internal/predicates/restart_only_test.go
Original file line number Diff line number Diff line change
@@ -0,0 +1,59 @@
/**
# Copyright (c) NVIDIA CORPORATION. All rights reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
**/

package predicates

import (
"testing"

"github.com/go-logr/logr"
"github.com/stretchr/testify/assert"
corev1 "k8s.io/api/core/v1"

driverconfig "github.com/NVIDIA/gpu-operator/internal/config"
)

// TestDriverPodRestartOnly exercises the predicate returned by DriverPodRestartOnly:
// only matching, non-empty digests on both specs select the restart-only path; every
// other combination (differing, empty, or nil spec) falls back to the full upgrade.
func TestDriverPodRestartOnly(t *testing.T) {
	// withDigest builds a pod spec whose single driver container carries the
	// digest env entry (possibly with an empty value).
	withDigest := func(digest string) *corev1.PodSpec {
		driverCtr := corev1.Container{
			Name: "nvidia-driver-ctr",
			Env:  []corev1.EnvVar{{Name: driverconfig.DriverConfigDigestEnvName, Value: digest}},
		}
		return &corev1.PodSpec{Containers: []corev1.Container{driverCtr}}
	}

	restartOnly := DriverPodRestartOnly(logr.Discard())

	type testCase struct {
		name    string
		running *corev1.PodSpec
		desired *corev1.PodSpec
		want    bool
	}
	cases := []testCase{
		{
			name:    "equal digests -> restart-only",
			running: withDigest("same"),
			desired: withDigest("same"),
			want:    true,
		},
		{
			name:    "differing digests -> full upgrade",
			running: withDigest("old"),
			desired: withDigest("new"),
			want:    false,
		},
		{
			name:    "missing digest on running pod -> full upgrade",
			running: withDigest(""),
			desired: withDigest("new"),
			want:    false,
		},
		{
			name:    "missing digest on desired template -> full upgrade",
			running: withDigest("old"),
			desired: withDigest(""),
			want:    false,
		},
		{
			name:    "nil running spec -> full upgrade",
			running: nil,
			desired: withDigest("x"),
			want:    false,
		},
		{
			name:    "nil desired spec -> full upgrade",
			running: withDigest("x"),
			desired: nil,
			want:    false,
		},
	}

	for _, tc := range cases {
		t.Run(tc.name, func(t *testing.T) {
			restart, err := restartOnly(tc.running, tc.desired)
			assert.NoError(t, err)
			assert.Equal(t, tc.want, restart)
		})
	}
}

Some generated files are not rendered by default. Learn more about how customized files appear on GitHub.

Loading
Loading